Subversion Repositories navi

Rev

Rev 289 | Rev 294 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 szabot 1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
176 muzer 3
#    This file is part of Tsim Apiak.
4
#
5
#    Tsim Apiak is free software: you can redistribute it and/or modify
6
#    it under the terms of the GNU General Public Licence as published by
7
#    the Free Software Foundation, either version 3 of the Licence, or
8
#    (at your option) any later version. 
9
# 
10
#    In addition to this, you must also comply with clause 4 of the
11
#    Apache Licence, version 2.0, concerning attribution. Where there
12
#    is a contradiction between the two licences, the GPL
13
#    takes preference.
14
#
186 szabot 15
#    Tsim Apiak is distributed in the hope that it will be useful,
176 muzer 16
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
#    GNU General Public License for more details.
19
#
20
#    You should have received a copy of the GNU General Public License
21
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.
56 szabot 22
 
283 muzer 23
import tsimapiak.dbconnector as dbconnector
24
import tsimapiak.parsenum as parsenum
246 szabot 25
import re
56 szabot 26
 
221 muzer 27
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
263 muzer 28
# Dictionary entries, loaded once when the module is imported.  The code in
# this module reads the "id", "navi", "infix" and "type" keys of each entry.
wordlist = dbconnector.getnavilist()
65 szabot 29
 
283 muzer 30
# Affix tables, loaded once at import time.  Entries are used in this module
# as dicts with a "navi" key; prefix entries are also read for "id" and
# "gloss", and infix entries for "position".
prefixes, infixes, postfixes = dbconnector.getaffixlists()
187 muzer 31
 
276 muzer 32
# XXX HACK - These are words that are either not in Eana Eltu, or that get interpreted wrongly for whatever reason. The latter should be removed from this list when the parser gets more sophisticated. The former should also have an entry in the equivalent array in the translator! If it can take infixes, consider adding it to the main wordlist above (see the examples). The order is - original, Na'vi root, 0-pos infix, 1-pos infix, 2-pos infix, prefixes, suffixes. Things that can take affixes should go in the above list instead.
283 muzer 33
# Field layout of each entry, as consumed by parseword():
#   0: surface form to match, 1: Na'vi root looked up in the word list,
#   2-4: infixes for positions 0, 1 and 2, 5: prefixes, 6: postfixes,
#   7: value returned under the "len" key.
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]],  (()), False),
    (u"soaiä", u"soaia", u"", u"", u"", (), [[(u"ä", None)]], False),
    (u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (), False),
    (u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (), (), False),
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (), False),
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (()), False),
    #(u"kawnga", u"kawng", u"", u"", u"", (()), [[(u"a", None)]], False),
    #(u"kawng", u"kawng", u"", u"", u"", (()), (()), False),
    (u"ka", u"ka", u"", u"", u"", (), (), False),
    (u"uo", u"uo", u"", u"", u"", (), (), False),
    (u"sìk", u"sìk", u"", u"", u"", (), (), False),
    (u"sim", u"sim", u"", u"", u"", (), (), False),  # probably not tsim lenited
)
50
 
51
#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
52
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
53
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
54
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
55
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
56
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62 szabot 57
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
74 szabot 58
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"
56 szabot 59
 
290 muzer 60
# Infix entries added to the database-provided infix list (same dict shape:
# "id", "navi", "gloss", "position").
EXTRAINFIXES = [
    {"id": "-1", "navi": "eiy", "gloss": "LAUD.", "position": 2},
    {"id": "-2", "navi": "eng", "gloss": "PEJ.", "position": 2},
]

# Postfix entries added to the database-provided postfix list.
EXTRAPOSTFIXES = [
    {"id": "-3", "navi": "eyä", "gloss": "GEN."},
]
68
 
284 muzer 69
# (navi, word id) pairs; the id is taken from the first word-list entry whose
# "navi" equals the word.  NOTE(review): the [0] raises IndexError at import
# time if the word list has no such entry - confirm that is intended.
EXTRAADP = (("to", [x["id"] for x in wordlist if x["navi"] == "to"][0]), ("sì", [x["id"] for x in wordlist if x["navi"] == "sì"][0])) # words that act like adpositions but technically aren't
283 muzer 70
 
246 szabot 71
# (initial consonant, lenited replacement) pairs.  The digraphs are listed
# before the single letters they begin with, so a scan that stops at the first
# matching pair (as the prefix lenition loop in this module does) applies
# px -> p rather than p -> f.
LENIT = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))
91 szabot 72
 
289 muzer 73
# Let's lenit the prefixes
def _lenite(text):
    """Return *text* with its initial consonant lenited, or None if none applies.

    Only the first LENIT pair whose consonant starts *text* is used.
    """
    for letter, replacement in LENIT:
        if text.startswith(letter):
            return replacement + text[len(letter):]
    return None


# Built once so each candidate is checked in O(1), not by rescanning the list.
_known_prefix_forms = {x["navi"] for x in prefixes}

extraprefixes = []
for prefix in prefixes:
    new_prefix = _lenite(prefix["navi"])
    # always assume a dictionary word over a lenited prefix
    if new_prefix is not None and new_prefix not in _known_prefix_forms:
        extraprefixes.append({"id": prefix["id"], "navi": new_prefix, "gloss": prefix["gloss"] + ".LENTD"})

# Longest affix first, so that scans over these lists try longer forms before
# shorter ones.
prefixes = sorted(prefixes + extraprefixes, key=lambda x: len(x["navi"]), reverse=True)
infixes = sorted(infixes + EXTRAINFIXES, key=lambda x: len(x["navi"]), reverse=True)
postfixes = sorted(postfixes + EXTRAPOSTFIXES, key=lambda x: len(x["navi"]), reverse=True)
289 muzer 86
 
56 szabot 87
# (dictionary ending, alternate ending) pairs that parseword also accepts for
# a root.  Every pair whose ending matches is tried in this order and the last
# alternate form found in the token wins.
_CONTRACTED_ENDINGS = (
    (u"nga", u"nge"),
    (u"fo", u"fe"),
    (u"po", u"pe"),
    (u"tsa", u"tse"),
    (u"fko", u"fke"),
    (u"tsa'u", u"tse"),
    (u"sa'u", u"se"),
    (u"sa", u"se"),
    (u"sno", u"sne"),
    (u"ayla", u"ayle"),
)


def _infixes_present(position, token):
    """Return the infixes of *position* occurring in *token*, then u"" last."""
    candidates = [x["navi"] for x in infixes if x["position"] == position] + [u""]
    return [navi for navi in candidates if navi in token]


def _find_center(pattern, token):
    """Locate one part of a dictionary entry inside one input token.

    *pattern* is one space-separated part of an entry's "infix" field; it may
    contain the markers <0><1> and <2> where infixes go.

    Returns (center, infixes, lenited):
      center  - the substring of *token* taken to be the root, u"" if none;
      infixes - [pos0, pos1, pos2] when an infix pattern matched, else None;
      lenited - True if a lenited form of *pattern* was found in *token*.
    """
    if u"<0>" in pattern:
        first = _infixes_present(0, token)
        second = _infixes_present(1, token)
        third = _infixes_present(2, token)
        for in1 in first:
            for in2 in second:
                for in3 in third:
                    candidate = (pattern.replace(u"<0><1>", in1 + in2)
                                 .replace(u"<2>", in3)
                                 .replace(u"lll", u"l")
                                 .replace(u"rrr", u"r"))
                    if candidate != u"" and candidate in token:
                        return candidate, [in1, in2, in3], False
        return u"", None, False

    center = pattern if pattern in token else u""
    lenited = False
    # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
    if center == u"" and (token == u"paya" or pattern != u"pxay"):
        # No early exit: every matching LENIT pair is tried, the last hit wins.
        for plain, soft in LENIT:
            if pattern.startswith(plain):
                candidate = soft + pattern[len(plain):]
                if candidate in token:
                    lenited = True
                    center = candidate
    if center == u"":
        for ending, alternate in _CONTRACTED_ENDINGS:
            if pattern.endswith(ending):
                # Slice by the ending's own length (the "ayla" case used to
                # cut one character too few and could never match).
                candidate = pattern[:-len(ending)] + alternate
                if candidate in token:
                    center = candidate
    return center, None, lenited


def _strip_prefixes(remainder, found):
    """Peel known prefixes off the end of *remainder*, recording them in *found*.

    A prefix already in *found* is not taken a second time.  Returns what is
    left of *remainder*; u"" means it was fully explained by prefixes.
    """
    previous = u""
    while remainder != u"" and remainder != previous:
        previous = remainder
        for pre in [x["navi"] for x in prefixes]:
            if remainder.endswith(pre):
                if pre not in found:
                    found.append(pre)
                    remainder = remainder[:-len(pre)]
                break
    return remainder


def _strip_postfixes(remainder, found, rootnavi):
    """Peel postfixes/adpositions off the start of *remainder* into *found*.

    Entries appended to *found* are (navi, id) pairs; the id is None for plain
    postfixes.  Returns what is left of *remainder*.
    """
    if remainder == u"":
        return remainder
    # Built once per call rather than on every pass of the loop below.
    candidates = sorted(
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"], x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP),
        key=lambda x: len(x[0]), reverse=True)
    previous = u""
    while remainder != u"" and remainder != previous:
        previous = remainder
        for pos, posid in candidates:
            if remainder.startswith(pos):
                # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                blocked = (pos, posid) in found or (pos == u"ä" and rootnavi == u"pey")
                if not blocked:
                    found.append((pos, posid))
                    remainder = remainder[len(pos):]
                break
    return remainder


def parseword(wordin):
    """Analyse the word at the start of *wordin* into root and affixes.

    *wordin* is a non-empty list of tokens.  wordin[0] is the word to analyse;
    later tokens are only consulted for dictionary entries whose "infix" field
    has several space-separated parts.

    Returns a dict with the keys:
      "word" - the matching word-list entry, or a placeholder with id 0 and
               navi u"[<token>]" when nothing matches;
      "pref" - per-part lists of prefixes found;
      "post" - per-part lists of (postfix, id) pairs found;
      "inf"  - the three infixes found (u"" where none);
      "len"  - whether a lenited root form was matched.

    NOTE: when no entry matches, "pref"/"post"/"inf"/"len" hold whatever the
    last dictionary entry tried left behind (kept as in the original code).
    """
    for brokenword in BROKENWORDS: # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            # No early exit: the last word-list entry with this navi wins.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {
                "word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype},
                "pref": brokenword[5],
                "post": brokenword[6],
                "len": brokenword[7],
                "inf": (brokenword[2], brokenword[3], brokenword[4]),
            }

    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}}

    # Defaults so that an empty word list yields a clean "not found" result
    # instead of a NameError.
    foundit = False
    foundword = None
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    lenited = False

    for word in wordlist:
        # Kept from the original: normalises the shared word-list entry in place.
        word["navi"] = word["navi"].lower()
        foundprefs = []
        foundposts = []
        foundins = [u"", u"", u""]
        lenited = False
        splitword = word["infix"].split(u" ")
        foundit = len(wordin) >= len(splitword)
        if not foundit:
            continue
        for wor, pattern in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])
            center, ins, softened = _find_center(pattern, token)
            if ins is not None:
                foundins = ins
            lenited = lenited or softened
            if center == u"":
                foundit = False
                break
            # The root must occur exactly once in the token.
            pieces = token.split(center)
            if len(pieces) != 2:
                foundit = False
                break
            pref, posf = pieces
            if _strip_prefixes(pref, foundprefs[wor]) != u"":
                foundit = False
                foundprefs = []
                break
            if _strip_postfixes(posf, foundposts[wor], word["navi"]) != u"":
                foundit = False
                foundposts = []
                break
        if foundit:
            foundword = word
            break

    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    ret["len"] = lenited
    if foundit:
        ret["word"] = foundword
    return ret
242
 
243
def parsesent(sent):
    """Split *sent* into tokens and parse each one.

    The text is lower-cased, the typographic apostrophe is replaced by ',
    every character other than word characters, ', and spaces is removed, and
    runs of spaces are collapsed.  Each token is first offered to
    parsenum.parse(); if that returns None it is passed, together with the
    tokens after it, to parseword().  The number of tokens consumed per result
    is the number of space-separated parts in its word's "navi" value.

    Returns the list of parse results in sentence order; an input with no
    tokens yields an empty list.
    """
    sent = sent.lower().replace(u"’", u"'")
    sent = re.sub(r"[^\wìä' ]", u"", sent)
    # Strip only after punctuation has gone, so that "word !" does not leave
    # an empty trailing token.
    sent = re.sub(r"\ +", u" ", sent).strip()
    tokens = [token for token in sent.split(u" ") if token]
    ret = []
    left = len(tokens)
    # "> 0" guards against a match that claims more tokens than remain.
    while left > 0:
        word = parsenum.parse(tokens[len(tokens) - left])
        if word is None:
            word = parseword(tokens[-left:])
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret