Subversion Repositories navi

Rev

Rev 258 | Rev 261 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 szabot 1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
176 muzer 3
#    This file is part of Tsim Apiak.
4
#
5
#    Tsim Apiak is free software: you can redistribute it and/or modify
6
#    it under the terms of the GNU General Public Licence as published by
7
#    the Free Software Foundation, either version 3 of the Licence, or
8
#    (at your option) any later version. 
9
# 
10
#    In addition to this, you must also comply with clause 4 of the
11
#    Apache Licence, version 2.0, concerning attribution. Where there
12
#    is a contradiction between the two licences, the GPL
13
#    takes preference.
14
#
186 szabot 15
#    Tsim Apiak is distributed in the hope that it will be useful,
176 muzer 16
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
#    GNU General Public License for more details.
19
#
20
#    You should have received a copy of the GNU General Public License
21
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.
56 szabot 22
 
66 szabot 23
import dbconnector
103 szabot 24
import parsenum
246 szabot 25
import re
56 szabot 26
 
221 muzer 27
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
28
# Dictionary entries fetched once, when this module is imported. parseword()
# reads the "id", "navi", "infix" and "type" keys of each entry and rewrites
# "navi" in place to its lower-case form while scanning.
wordlist = dbconnector.getnavilist()
65 szabot 29
 
187 muzer 30
 
260 muzer 31
# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! If a word can take
# infixes or other affixes, consider adding it to the main wordlist instead
# (see the commented-out example next to ``wordlist``).
#
# Field order of every entry (parseword() reads them by index):
#   0  original word as typed
#   1  Na'vi root
#   2  0-pos infix
#   3  1-pos infix
#   4  2-pos infix
#   5  prefixes  (returned as "pref")
#   6  suffixes  (returned as "post")
#   7  lenited flag (returned as "len")
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    (u"frato", u"to", u"", u"", u"", [[u"fra"]], (), False),
    (u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (), False),
    (u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (), False),
)

# Candidates for the <1>, <2> and <3> slots of a dictionary "infix" pattern.
# Order is significant: parseword() keeps the first combination that matches.
# Each tuple ends with u"" so that "no infix in this slot" is always tried.
INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
INFIXES2 = (
    u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv",
    u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm",
    u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"",
)
INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")

# Prefixes parseword() strips from the text in front of a matched root; the
# first tuple entry that fits is the one taken.
PREFIXES = (
    u"tsay", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"sa", u"pe",
    u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke",
    u"he", u"px", u"a", u"m",
)

# Adpositions; in this module they are only used as the head of POSTFIXES.
ADPOSITIONS = (
    u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa",
    u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa",
    u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay",
    u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta",
    u"uo", u"wä", u"äo", u"to", u"sì",
)

# Everything parseword() may strip from the text after a matched root, tried
# in this order (adpositions first).
POSTFIXES = ADPOSITIONS + (
    u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl",
    u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu",
    u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e",
    u"o", u"l", u"t", u"y", u"a", u"ä", u"r",
)
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

# (initial, replacement) pairs: when a dictionary form is not found in a
# token, parseword() retries it with a matching initial swapped for its
# replacement and flags the result as lenited.
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)
91 szabot 42
 
56 szabot 43
def _fill_infix_slots(pattern, in1, in2, in3):
    """Return pattern with its <1><2> and <3> slots replaced by the infixes."""
    filled = pattern.replace(u"<1><2>", in1 + in2).replace(u"<3>", in3)
    # Any "lll"/"rrr" run in the result is reduced to a single letter.
    return filled.replace(u"lll", u"l").replace(u"rrr", u"r")


def _match_infixed(pattern, token):
    """Find the first infix combination whose filled-in pattern occurs in token.

    Returns (center, infixes): center is the filled-in pattern, or u"" when
    nothing fits; infixes is [in1, in2, in3] for the last combination that
    matched, or None when none did.
    """
    # Only infixes that occur somewhere in the token are worth combining.
    # u"" is in every string, so each candidate list contains at least u"".
    cand1 = [inf for inf in INFIXES1 if inf in token]
    cand2 = [inf for inf in INFIXES2 if inf in token]
    cand3 = [inf for inf in INFIXES3 if inf in token]
    center = u""
    infixes = None
    for in1 in cand1:
        for in2 in cand2:
            for in3 in cand3:
                filled = _fill_infix_slots(pattern, in1, in2, in3)
                if filled in token:
                    center = filled
                    infixes = [in1, in2, in3]
                    break
            if center != u"":
                return center, infixes
    return center, infixes


def _match_plain(pattern, token):
    """Locate a pattern without infix slots inside token.

    Tries, in order: the pattern as written, the pattern with its initial
    replaced according to LENIT, and the pattern with a final "nga", "po" or
    "tsa" shortened to "ng", "p" or "ts".

    Returns (center, lenited): center is the text found in token (u"" when
    nothing fits) and lenited tells whether a LENIT replacement produced it.
    """
    center = u""
    lenited = False
    if pattern in token:
        center = pattern
    # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
    if center == u"" and (token == u"paya" or pattern != u"pxay"):
        # No early exit on purpose: a later LENIT pair that also fits
        # overrides an earlier one, exactly as the table order dictates.
        for initial, replacement in LENIT:
            if pattern.startswith(initial):
                candidate = replacement + pattern[len(initial):]
                if candidate in token:
                    lenited = True
                    center = candidate
    if center == u"":
        # Likewise, a later ending that fits overrides an earlier one.
        for ending, short in ((u"nga", u"ng"), (u"po", u"p"), (u"tsa", u"ts")):
            if pattern.endswith(ending):
                candidate = pattern[:-len(ending)] + short
                if candidate in token:
                    center = candidate
    return center, lenited


def _strip_prefixes(rest):
    """Peel PREFIXES entries off the end of rest, innermost first.

    Returns (leftover, found). Stripping stops when nothing fits or when the
    first fitting prefix was already taken; leftover is u"" only if rest was
    consumed completely.
    """
    found = []
    while rest != u"":
        for pre in PREFIXES:
            if rest.endswith(pre):
                break
        else:
            break  # no known prefix fits
        if pre in found:
            break  # the same prefix is never accepted twice
        found.append(pre)
        rest = rest[:-len(pre)]
    return rest, found


def _strip_postfixes(rest, navi):
    """Peel POSTFIXES entries off the start of rest, innermost first.

    navi is the dictionary form of the entry being matched; it is needed only
    for the "pey" special case below. Returns (leftover, found) with the same
    stopping rules as _strip_prefixes.
    """
    found = []
    while rest != u"":
        for pos in POSTFIXES:
            if rest.startswith(pos):
                break
        else:
            break  # no known postfix fits
        if pos in found:
            break  # the same postfix is never accepted twice
        if pos == u"ä" and navi == u"pey":
            break  # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
        found.append(pos)
        rest = rest[len(pos):]
    return rest, found


def _match_entry(word, wordin):
    """Try to read the leading tokens of wordin as dictionary entry word.

    Returns (prefixes, postfixes, infixes, lenited) when every space-separated
    part of word["infix"] matches the token at the same position, else None.
    prefixes and postfixes hold one list per matched token.
    """
    patterns = word["infix"].split(u" ")
    if len(wordin) < len(patterns):
        return None
    prefixes = []
    postfixes = []
    infixes = [u"", u"", u""]
    lenited = False
    # patterns is never longer than wordin here, so zip visits every pattern.
    for pattern, token in zip(patterns, wordin):
        if u"<1>" in pattern:
            center, matched = _match_infixed(pattern, token)
            if matched is not None:
                infixes = matched
        else:
            center, softened = _match_plain(pattern, token)
            lenited = lenited or softened
        if center == u"":
            return None
        pieces = token.split(center)
        if len(pieces) != 2:
            # center occurs more than once in the token: ambiguous, give up.
            return None
        before, after = pieces
        leftover, token_prefixes = _strip_prefixes(before)
        if leftover != u"":
            return None
        leftover, token_postfixes = _strip_postfixes(after, word["navi"])
        if leftover != u"":
            return None
        prefixes.append(token_prefixes)
        postfixes.append(token_postfixes)
    return prefixes, postfixes, infixes, lenited


def parseword(wordin):
    """Analyse the word that starts the token list wordin.

    wordin is a non-empty list of tokens; entries whose "infix" pattern
    contains spaces are matched against several leading tokens.

    Returns a dict with the keys
      "word": the matched wordlist entry, or a placeholder with id 0 and
              "navi" set to the first token wrapped in square brackets;
      "pref": per-token lists of stripped prefixes;
      "post": per-token lists of stripped postfixes;
      "inf":  the three infixes found (a tuple for BROKENWORDS hits,
              otherwise a list);
      "len":  whether a LENIT replacement was needed.

    When nothing matches, "pref" and "post" are empty lists, "inf" is three
    empty strings and "len" is False; no data from failed attempts leaks into
    the result, and an empty wordlist no longer raises NameError.

    Side effect: the "navi" value of every wordlist entry visited is
    lower-cased in place.
    """
    # XXX HACK - this is all code to work around bugs that shouldn't exist
    for brokenword in BROKENWORDS:
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            # The last wordlist entry with this root supplies id and type.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {
                "word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype},
                "pref": brokenword[5],
                "post": brokenword[6],
                "len": brokenword[7],
                "inf": (brokenword[2], brokenword[3], brokenword[4]),
            }
    ret = {
        "word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""},
        "pref": [],
        "post": [],
        "inf": [u"", u"", u""],
        "len": False,
    }
    for word in wordlist:
        word["navi"] = word["navi"].lower()
        analysis = _match_entry(word, wordin)
        if analysis is not None:
            ret["word"] = word
            ret["pref"], ret["post"], ret["inf"], ret["len"] = analysis
            break
    return ret
170
 
171
def parsesent(sent):
    """Split a sentence into tokens and analyse them from left to right.

    The text is stripped, lower-cased and has the typographic apostrophe
    replaced by "'"; every character that is not matched by \\w and is not
    one of the two accented letters, the apostrophe or a space is dropped,
    and runs of spaces are collapsed. Each token is offered to
    parsenum.parse() first; when that returns None, parseword() gets the
    remaining tokens.

    Returns the list of result dicts in sentence order. A result whose
    ["word"]["navi"] contains spaces consumes that many tokens.
    """
    cleaned = sent.strip().lower().replace(u"’", u"'")
    # Escaped u"" literals carry the same pattern text as the former ur""
    # ones but are also accepted by Python 3, which rejects the ur prefix.
    cleaned = re.sub(u"[^\\wìä' ]", u"", cleaned)
    cleaned = re.sub(u"\\ +", u" ", cleaned)
    tokens = cleaned.split(u" ")
    ret = []
    position = 0
    # Bounded comparison: if a result ever claims more tokens than are left,
    # the loop ends instead of indexing past the end of the list.
    while position < len(tokens):
        word = parsenum.parse(tokens[position])
        if word is None:
            word = parseword(tokens[position:])
        position += len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret