Subversion Repositories navi

Rev

Rev 294 | Rev 297 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 szabot 1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
176 muzer 3
#    This file is part of Tsim Apiak.
4
#
5
#    Tsim Apiak is free software: you can redistribute it and/or modify
6
#    it under the terms of the GNU General Public Licence as published by
7
#    the Free Software Foundation, either version 3 of the Licence, or
8
#    (at your option) any later version. 
9
# 
10
#    In addition to this, you must also comply with clause 4 of the
11
#    Apache Licence, version 2.0, concerning attribution. Where there
12
#    is a contradiction between the two licences, the GPL
13
#    takes preference.
14
#
186 szabot 15
#    Tsim Apiak is distributed in the hope that it will be useful,
176 muzer 16
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
#    GNU General Public License for more details.
19
#
20
#    You should have received a copy of the GNU General Public License
21
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.
56 szabot 22
 
283 muzer 23
import tsimapiak.dbconnector as dbconnector
24
import tsimapiak.parsenum as parsenum
246 szabot 25
import re
56 szabot 26
 
221 muzer 27
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
263 muzer 28
# Dictionary entries, loaded once when the module is imported. The code below
# reads the keys "id", "navi", "infix" and "type" from each entry.
wordlist = dbconnector.getnavilist()
65 szabot 29
 
283 muzer 30
# Affix tables. Entries are read through "id", "navi" and "gloss"; infix
# entries are additionally filtered by their "position" key (0, 1 or 2).
prefixes, infixes, postfixes = dbconnector.getaffixlists()
187 muzer 31
 
276 muzer 32
# XXX HACK - hand-written parses for words that are either not in Eana Eltu, or
# that get interpreted wrongly for whatever reason. The latter should be
# removed from this list when the parser gets more sophisticated. The former
# should also have an entry in the equivalent array in the translator! Things
# that can take affixes should go in the main wordlist instead.
#
# Field order of every entry:
#   0: original (surface) form      4: 2-pos infix
#   1: Na'vi root                   5: prefixes
#   2: 0-pos infix                  6: suffixes
#   3: 1-pos infix                  7: value returned as "len" by parseword
BROKENWORDS = (
    # otherwise parses as sa (tsa-lenited) + mi
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    #(u"to", u"to", u"", u"", u"", (), (), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]], (), False),
    # does not parse, irregular form
    (u"soaiä", u"soaia", u"", u"", u"", (), [[(u"ä", None)]], False),
    #(u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (), False),
    #(u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (), False),
    # otherwise parses as kìm (spin) + ä (genitive)
    (u"kìmä", u"kä", u"", u"ìm", u"", (), (), False),
    # otherwise parses as apxa + -y (genitive)
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (), False),
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (), False),
    #(u"kawnga", u"kawng", u"", u"", u"", (), [[(u"a", None)]], False),
    #(u"kawng", u"kawng", u"", u"", u"", (), (), False),
    #(u"ka", u"ka", u"", u"", u"", (), (), False),
    #(u"uo", u"uo", u"", u"", u"", (), (), False),
    #(u"sìk", u"sìk", u"", u"", u"", (), (), False),
    #(u"sim", u"sim", u"", u"", u"", (), (), False), # probably not tsim lenited
    # otherwise parses as tsa'u abbreviated (special case)
    (u"tse", u"tse", u"", u"", u"", (), (), False),
)
51
 
52
#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
53
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
54
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
55
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
56
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
57
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62 szabot 58
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
74 szabot 59
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"
56 szabot 60
 
290 muzer 61
# Infix forms appended to the infix table loaded from the database.
EXTRAINFIXES = [
    dict(id="-1", navi="eiy", gloss="LAUD.", position=2),
    dict(id="-2", navi="eng", gloss="PEJ.", position=2),
]

# Postfix forms appended to the postfix table loaded from the database.
EXTRAPOSTFIXES = [
    dict(id="-3", navi="eyä", gloss="GEN."),
]
69
 
284 muzer 70
# Words that act like adpositions but technically aren't, each paired with the
# id of the first dictionary entry spelled that way (IndexError if missing).
EXTRAADP = tuple(
    (navi, [entry["id"] for entry in wordlist if entry["navi"] == navi][0])
    for navi in ("to", "sì")
)
283 muzer 71
 
246 szabot 72
# Lenition rules as (initial sound, lenited sound). Digraphs are listed before
# the single letters they start with, so the first matching rule is the
# longest one.
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)
91 szabot 73
 
289 muzer 74
# Build lenited variants of the prefixes. For each prefix only the first
# matching LENIT rule is applied, and a variant is dropped when it collides
# with an existing prefix (always assume a dictionary form over a lenited one).
_known_prefix_forms = [entry["navi"] for entry in prefixes]
extraprefixes = []
for prefix in prefixes:
    for letter, replacement in LENIT:
        if not prefix["navi"].startswith(letter):
            continue
        new_prefix = prefix["navi"].replace(letter, replacement, 1)
        if new_prefix not in _known_prefix_forms:
            extraprefixes.append({
                "id": prefix["id"],
                "navi": new_prefix,
                "gloss": prefix["gloss"] + ".LENTD",
            })
        break


def _navi_length(entry):
    # Sort key: length of the affix's written form.
    return len(entry["navi"])


# Longest affixes first, so that the parser tries the longest match first.
prefixes = sorted(prefixes + extraprefixes, key=_navi_length, reverse=True)
infixes = sorted(infixes + EXTRAINFIXES, key=_navi_length, reverse=True)
postfixes = sorted(postfixes + EXTRAPOSTFIXES, key=_navi_length, reverse=True)
289 muzer 87
 
296 muzer 88
# Build lenited variants of the dictionary words. Every variant keeps the id
# and type of its source entry and is flagged with "lenited": True.
extrawords = []
for word in wordlist:
    for letter, replacement in LENIT:
        if word["navi"].startswith(letter):
            new_word = word["navi"].replace(letter, replacement, 1)
            new_infix = word["infix"].replace(letter, replacement, 1)
            extrawords.append({"id": word["id"], "navi": new_word, "infix": new_infix, "type": word["type"], "lenited": True})
            # Only the first matching rule applies, exactly as for the
            # prefixes. LENIT lists digraphs before single letters, so without
            # this break a word in "px"/"tx"/"kx"/"ts" would also match the
            # single-letter rule and yield a bogus second form ("fx...").
            break
# Longest words first; at equal length a plain entry sorts before a lenited
# one. .get() keeps entries that carry no "lenited" key from raising KeyError.
wordlist = sorted(wordlist + extrawords, key=lambda x: len(x["navi"]) * 2 + (0 if x.get("lenited") else 1), reverse=True)
97
 
56 szabot 98
# Word-final changes tried when the dictionary form itself is not found in the
# token: (dictionary ending, ending to look for instead).
_FINAL_VOWEL_SHIFTS = (
    (u"nga", u"nge"),
    (u"fo", u"fe"),
    (u"po", u"pe"),
    (u"tsa", u"tse"),
    (u"fko", u"fke"),
    (u"sa'u", u"se"),
    (u"sa", u"se"),
    (u"sno", u"sne"),
    (u"ayla", u"ayle"),
)


def _match_infixed(template, token, infix_slots):
    """Find the first infix combination whose filled-in template is in token.

    Returns (center, infixes). center is u"" when nothing usable matched;
    infixes is the last combination whose filled template occurred in the
    token, or None when there was none.
    """
    # Only infixes that occur somewhere in the token can be part of a match.
    usable = [[infix for infix in slot if infix in token] for slot in infix_slots]
    matched = None
    for in1 in usable[0]:
        for in2 in usable[1]:
            for in3 in usable[2]:
                candidate = template.replace(u"<0><1>", in1 + in2).replace(u"<2>", in3).replace(u"lll", u"l").replace(u"rrr", u"r")
                if candidate in token:
                    matched = [in1, in2, in3]
                    if candidate != u"":
                        return candidate, matched
                    # An empty candidate is no real match: give up on this
                    # position-2 loop and carry on with the next combination.
                    break
    return u"", matched


def _match_plain(stem, token):
    """Return the form of stem that occurs in token, or u"" if none does."""
    if stem in token:
        return stem
    center = u""
    for ending, shifted in _FINAL_VOWEL_SHIFTS:
        if stem.endswith(ending):
            candidate = stem[:-len(ending)] + shifted
            if candidate in token:
                # No break: when several rules match, the later one wins.
                center = candidate
    return center


def parseword(wordin):
    """Parse the word at the start of wordin into root, affixes and infixes.

    wordin is a list of word strings; wordin[0] is the word to parse and the
    following items are available for dictionary entries made of several
    words.

    Returns a dict with the keys:
      "word" - the matching dictionary entry, or a placeholder entry with id 0
               and the input wrapped in square brackets when nothing matched
      "pref" - found prefixes, one list per word of the entry
      "post" - found postfixes as (form, id) pairs, one list per word
      "inf"  - the three infixes (positions 0, 1 and 2)
      "len"  - the entry's "lenited" value, False when nothing matched

    Side effect: the "navi" field of every dictionary entry visited is
    lower-cased in place.
    """
    for brokenword in BROKENWORDS: # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            # No break: the last dictionary entry with this root supplies
            # the id and type.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4])}

    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}, "len": False}

    # These tables do not change while one word is parsed, so build them once.
    prefix_forms = [x["navi"] for x in prefixes]
    infix_slots = [[x["navi"] for x in infixes if x["position"] == slot] + [""] for slot in range(3)]

    # Defaults so that an empty dictionary yields the placeholder result
    # instead of a NameError.
    foundit = False
    foundword = None
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]

    for word in wordlist:
        word["navi"] = word["navi"].lower()
        foundit = True
        foundprefs = []
        foundposts = []
        foundins = [u"", u"", u""]
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough input words left for this entry.
            foundit = False
            continue
        for wor, template in enumerate(splitword):
            foundprefs.append([])
            foundposts.append([])
            token = wordin[wor]
            if u"<0>" in template:
                center, matched = _match_infixed(template, token, infix_slots)
                if matched is not None:
                    foundins = matched
            else:
                center = _match_plain(template, token)
            if center == u"":
                foundit = False
                break
            # The root has to occur exactly once; what is left of it must be
            # prefixes, what is right of it postfixes.
            parts = token.split(center)
            if len(parts) != 2:
                foundit = False
                break
            pref, posf = parts
            # Peel known prefixes off the end of the leading part until
            # nothing changes any more; no prefix is accepted twice.
            last = u""
            while last != pref:
                last = pref
                for pre in prefix_forms:
                    if pref != u"" and pref.endswith(pre):
                        if pre not in foundprefs[wor]:
                            foundprefs[wor].append(pre)
                            pref = pref[:-len(pre)]
                        break
            if pref != u"":
                foundit = False
                foundprefs = []
                break
            if posf != u"":
                # Postfixes, adpositions and adposition-like words, longest
                # first. Built here rather than before the word loop because
                # the loop lower-cases dictionary entries as it goes.
                candidates = sorted([(x["navi"], None) for x in postfixes] + [(x["navi"], x["id"]) for x in wordlist if x["type"] == "adp."] + list(EXTRAADP), key=lambda x: len(x[0]), reverse=True)
                last = u""
                while last != posf:
                    last = posf
                    for pos, posid in candidates:
                        if posf != u"" and posf.startswith(pos):
                            # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                            blocked = pos == u"ä" and (word["navi"] == u"pey" or word["navi"] == "fey")
                            if (pos, posid) not in foundposts[wor] and not blocked:
                                foundposts[wor].append((pos, posid))
                                posf = posf[len(pos):]
                            break
            if posf != u"":
                foundit = False
                foundposts = []
                break
        if foundit:
            foundword = word
            break

    # When nothing matched, these hold whatever the last entry tried left
    # behind (unchanged from the previous implementation).
    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    if foundit:
        ret["len"] = foundword.get("lenited", False)
        ret["word"] = foundword
    return ret
240
 
241
def parsesent(sent):
    """Split a sentence into words and parse each of them.

    The sentence is lower-cased, typographic apostrophes are replaced by "'",
    and every character that is not a word character, "ì", "ä", "'" or a space
    is removed. Each remaining word is first offered to parsenum.parse; when
    that returns None it is handed to parseword together with the words that
    follow it, so that dictionary entries spanning several words can match.

    Returns the list of parse results in sentence order. Input without any
    word (empty or punctuation only) yields an empty list.
    """
    sent = sent.strip().lower().replace(u"’", u"'")
    sent = re.sub(r"[^\wìä' ]", u"", sent)
    sent = re.sub(r"\ +", u" ", sent)
    # Removing punctuation can leave spaces at either end ("kaltxì !"); drop
    # the empty tokens instead of parsing them as unknown words.
    tokens = [token for token in sent.split(u" ") if token]
    ret = []
    position = 0
    while position < len(tokens):
        word = parsenum.parse(tokens[position])
        if word is None:
            word = parseword(tokens[position:])
        # A matched entry may span several input words; skip all of them.
        position += len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret