Subversion Repositories navi

Rev

Rev 290 | Rev 296 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 szabot 1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
176 muzer 3
#    This file is part of Tsim Apiak.
4
#
5
#    Tsim Apiak is free software: you can redistribute it and/or modify
6
#    it under the terms of the GNU General Public Licence as published by
7
#    the Free Software Foundation, either version 3 of the Licence, or
8
#    (at your option) any later version. 
9
# 
10
#    In addition to this, you must also comply with clause 4 of the
11
#    Apache Licence, version 2.0, concerning attribution. Where there
12
#    is a contradiction between the two licences, the GPL
13
#    takes preference.
14
#
186 szabot 15
#    Tsim Apiak is distributed in the hope that it will be useful,
176 muzer 16
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
#    GNU General Public License for more details.
19
#
20
#    You should have received a copy of the GNU General Public License
21
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.
56 szabot 22
 
283 muzer 23
import tsimapiak.dbconnector as dbconnector
24
import tsimapiak.parsenum as parsenum
246 szabot 25
import re
56 szabot 26
 
221 muzer 27
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
263 muzer 28
# Dictionary entries as returned by dbconnector.getnavilist(). The code in
# this file reads the keys "id", "navi", "type" and "infix" from each entry;
# "infix" is the word with "<0><1>" and "<2>" placeholders marking where
# infixes may be inserted (see parseword).
wordlist = dbconnector.getnavilist()
65 szabot 29
 
283 muzer 30
# Affix tables as returned by dbconnector.getaffixlists(). The entries are
# dicts: this file reads "navi" from all of them, "id" and "gloss" from the
# prefixes and "position" from the infixes.
prefixes, infixes, postfixes = dbconnector.getaffixlists()
187 muzer 31
 
276 muzer 32
# XXX HACK - These are words that are either not in Eana Eltu, or that get interpreted wrongly for whatever reason. The latter should be removed from this list when the parser gets more sophisticated. The former should also have an entry in the equivalent array in the translator! If it can take infixes, consider adding it to the main wordlist above (see the examples). The fields are, in order: original, Na'vi root, 0-pos infix, 1-pos infix, 2-pos infix, prefixes, suffixes, lenited flag. Things that can take affixes should go in the above list instead.
283 muzer 33
BROKENWORDS = (
34
    (u"sami", u"si", u"", u"am", u"", (()), (()), False),
35
    (u"to", u"to", u"", u"", u"", (()), (()), False),
36
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]],  (()), False),
284 muzer 37
    (u"soaiä", u"soaia", u"", u"", u"", (()), [[(u"ä", None)]], False),
283 muzer 38
    (u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (()), False),
39
    (u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (()), False),
40
    (u"kìmä", u"kä", u"", u"ìm", u"", (()), (()), False),
41
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (()), False),
290 muzer 42
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (()), False),
43
    #(u"kawnga", u"kawng", u"", u"", u"", (()), [[(u"a", None)]], False),
44
    #(u"kawng", u"kawng", u"", u"", u"", (()), (()), False),
283 muzer 45
    (u"ka", u"ka", u"", u"", u"", (()), (()), False),
46
    (u"uo", u"uo", u"", u"", u"", (()), (()), False),
47
    (u"sìk", u"sìk", u"", u"", u"", (()), (()), False),
294 muzer 48
    (u"sim", u"sim", u"", u"", u"", (()), (()), False), # probably not tsim lenited
49
    (u"tse", u"tse", u"", u"", u"", (()), (()), False),
283 muzer 50
)
51
 
52
#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
53
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
54
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
55
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
56
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
57
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62 szabot 58
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
74 szabot 59
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"
56 szabot 60
 
290 muzer 61
# Infixes appended to the infix list loaded from dbconnector (see the
# sorted(infixes + EXTRAINFIXES, ...) merge further down). "position" is the
# placeholder slot the infix may fill: parseword tries position-2 infixes for
# the "<2>" placeholder.
# NOTE(review): the negative string ids are presumably chosen so they cannot
# clash with ids from the database - confirm.
EXTRAINFIXES = [
62
    {"id": "-1", "navi": "eiy", "gloss": "LAUD.", "position": 2},
63
    {"id": "-2", "navi": "eng", "gloss": "PEJ.", "position": 2},
64
]
65
 
66
# Postfixes appended to the postfix list loaded from dbconnector (see the
# sorted(postfixes + EXTRAPOSTFIXES, ...) merge further down).
EXTRAPOSTFIXES = [
67
    {"id": "-3", "navi": "eyä", "gloss": "GEN."},
68
]
69
 
284 muzer 70
def _wordlist_id(navi):
    """Return the id of the first wordlist entry spelled *navi*.

    Raises IndexError when the word is not in the wordlist.
    """
    return [entry["id"] for entry in wordlist if entry["navi"] == navi][0]


# words that act like adpositions but technically aren't, as (navi, id) pairs
EXTRAADP = (
    ("to", _wordlist_id("to")),
    ("sì", _wordlist_id("sì")),
)
283 muzer 71
 
246 szabot 72
# Lenition table as (initial sound, replacement) pairs: a string starting with
# the first element gets that start replaced by the second. Multi-letter
# entries precede their single-letter counterparts, which matters to consumers
# that stop at the first pair that fits.
LENIT = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))
91 szabot 73
 
289 muzer 74
def _longest_first(affixes):
    """Return *affixes* as a new list ordered by descending length of "navi"."""
    return sorted(affixes, key=lambda affix: len(affix["navi"]), reverse=True)


# Add a lenited variant for every prefix whose first sound can lenite.
known_prefix_forms = [entry["navi"] for entry in prefixes]
extraprefixes = []
for prefix in prefixes:
    # Only the first LENIT pair that fits the prefix is considered.
    lenition = next(
        (pair for pair in LENIT if prefix["navi"].startswith(pair[0])), None
    )
    if lenition is None:
        continue
    new_prefix = lenition[1] + prefix["navi"][len(lenition[0]):]
    if new_prefix in known_prefix_forms:
        # always assume a dictionary word over a lenited prefix
        continue
    extraprefixes.append({
        "id": prefix["id"],
        "navi": new_prefix,
        "gloss": prefix["gloss"] + ".LENTD",
    })

# Longest affixes first, so that the longest possible match is tried first.
prefixes = _longest_first(prefixes + extraprefixes)
infixes = _longest_first(infixes + EXTRAINFIXES)
postfixes = _longest_first(postfixes + EXTRAPOSTFIXES)
289 muzer 87
 
56 szabot 88
# Alternate root endings tried when neither the dictionary form nor a lenited
# form of a root occurs in the token, as (dictionary ending, ending searched
# for). They are checked in order and a later hit overrides an earlier one.
_ROOT_ENDING_SHIFTS = (
    (u"nga", u"nge"),
    (u"fo", u"fe"),
    (u"po", u"pe"),
    (u"tsa", u"tse"),
    (u"fko", u"fke"),
    (u"sa'u", u"se"),
    (u"sa", u"se"),
    (u"sno", u"sne"),
    (u"ayla", u"ayle"),
)


def _find_infixed_center(template, token, cache):
    """Fill the infix placeholders of *template* so that it occurs in *token*.

    *cache* maps a token to its three candidate lists (one per infix
    position), so they are built once per token instead of once per
    dictionary word.

    Returns (center, infixes): center is the filled-in template (u"" when
    nothing fits) and infixes is the [pos0, pos1, pos2] combination that
    produced it, or None when no combination occurred in the token.
    """
    if token not in cache:
        # Only infixes that occur somewhere in the token can be part of it;
        # the empty string stands for "no infix in this position".
        cache[token] = [
            [x["navi"] for x in infixes
             if x["position"] == position and x["navi"] in token] + [""]
            for position in (0, 1, 2)
        ]
    firsts, seconds, thirds = cache[token]
    center = u""
    matched = None
    for in1 in firsts:
        for in2 in seconds:
            for in3 in thirds:
                filled = (template.replace(u"<0><1>", in1 + in2)
                          .replace(u"<2>", in3)
                          .replace(u"lll", u"l")
                          .replace(u"rrr", u"r"))
                if filled in token:
                    center = filled
                    matched = [in1, in2, in3]
                    break
            if center != u"":
                break
        if center != u"":
            break
    return center, matched


def _find_plain_center(root, token):
    """Locate a root without infix placeholders inside *token*.

    Tries the root as written, then its lenited forms, then the alternate
    endings from _ROOT_ENDING_SHIFTS.

    Returns (center, lenited): center is the form found in the token (u""
    when none was) and lenited tells whether a lenited form matched.
    """
    center = u""
    lenited = False
    if root in token:
        center = root
    if center == u"" and (token == u"paya" or root != u"pxay"):  # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
        for letter, replacement in LENIT:
            if root.startswith(letter):
                temp = replacement + root[len(letter):]
                if temp in token:
                    lenited = True
                    center = temp
    if center == u"":
        for ending, shifted in _ROOT_ENDING_SHIFTS:
            if root.endswith(ending):
                # Slice by the ending's own length so that the whole ending
                # is replaced, whatever its size.
                temp = root[:-len(ending)] + shifted
                if temp in token:
                    center = temp
    return center, lenited


def _strip_prefixes(pref, forms, found):
    """Peel known prefixes off the end of *pref*, recording them in *found*.

    *forms* are the prefix spellings in the order they should be tried.
    Stripping works inwards-out (from the root towards the start of the
    token) and stops when nothing fits or a prefix would repeat.

    Returns whatever part of *pref* could not be explained.
    """
    last = u""
    while last != pref:
        last = pref
        for pre in forms:
            if pref != u"" and pref.endswith(pre):
                if pre not in found:
                    found.append(pre)
                    pref = pref[:-len(pre)]
                break
    return pref


def _strip_postfixes(posf, forms, found, headword):
    """Peel known postfixes off the start of *posf*, recording them in *found*.

    *forms* are (spelling, id) pairs in the order they should be tried;
    *headword* is the "navi" of the dictionary word being matched.

    Returns whatever part of *posf* could not be explained.
    """
    last = u""
    while last != posf:
        last = posf
        for pos, posid in forms:
            if posf != u"" and posf.startswith(pos):
                # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                if (pos, posid) not in found and (pos != u"ä" or headword != u"pey"):
                    found.append((pos, posid))
                    posf = posf[len(pos):]
                break
    return posf


def parseword(wordin):
    """Parse the word that starts the token list *wordin*.

    *wordin* is a non-empty list of tokens; dictionary words made of several
    space-separated parts are matched against as many leading tokens.

    Returns a dict with the keys:
      "word" - the matching wordlist entry, or a placeholder entry with id 0
               and the first token in square brackets when nothing matched;
      "pref" - per matched token, the list of prefixes found;
      "post" - per matched token, the list of (postfix, id) pairs found
               (id is None for entries from the postfix list);
      "inf"  - the infixes found in positions 0, 1 and 2;
      "len"  - whether a lenited form of the root was matched.
    """
    for brokenword in BROKENWORDS:  # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4])}

    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}}

    # These depend on neither the candidate word nor the stripping progress,
    # so build them once per call rather than inside the loops below.
    prefix_forms = [x["navi"] for x in prefixes]
    postfix_forms = sorted(
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"].lower(), x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP),
        key=lambda x: len(x[0]), reverse=True)
    infix_cache = {}

    # Defaults so that an empty wordlist yields a well-formed "not found"
    # result instead of failing on unbound names.
    foundit = False
    foundword = None
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    lenited = False

    for word in wordlist:
        word["navi"] = word["navi"].lower()
        foundit = True
        foundprefs = []
        foundposts = []
        foundins = [u"", u"", u""]
        lenited = False
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough tokens left for this multi-part word.
            foundit = False
            continue
        for wor, root in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])
            if u"<0>" in root:
                center, matched = _find_infixed_center(root, token, infix_cache)
                if matched is not None:
                    foundins = matched
            else:
                center, was_lenited = _find_plain_center(root, token)
                if was_lenited:
                    lenited = True
            if center == u"":
                foundit = False
                break
            # The root must occur exactly once, leaving one part before it
            # (prefixes) and one after it (postfixes).
            temp = token.split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp
            if _strip_prefixes(pref, prefix_forms, foundprefs[wor]) != u"":
                foundit = False
                foundprefs = []
                break
            if _strip_postfixes(posf, postfix_forms, foundposts[wor], word["navi"]) != u"":
                foundit = False
                foundposts = []
                break
        if foundit:
            foundword = word
            break

    # NOTE: when nothing matched, these hold whatever the last candidate
    # word left behind; only "word" tells a hit from a miss.
    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    ret["len"] = lenited
    if foundit:
        ret["word"] = foundword
    return ret
239
 
240
def parsesent(sent):
    """Split the sentence *sent* into words and parse each of them.

    The text is trimmed, lower-cased and has the typographic apostrophe
    replaced by "'"; every character that is not a word character, "ì", "ä",
    "'" or a space is dropped, and runs of spaces are collapsed before the
    text is split on spaces.

    Each position is first offered to parsenum.parse() as a single token; when
    that returns None, all remaining tokens are handed to parseword(). The
    number of space-separated parts in the result's word["navi"] is the number
    of tokens it consumed.

    Returns the list of parse results in sentence order.
    """
    sent = sent.strip().lower().replace(u"’", u"'")
    sent = re.sub(r"[^\wìä' ]", u"", sent)
    sent = re.sub(r"\ +", u" ", sent)
    tokens = sent.split(u" ")
    ret = []
    left = len(tokens)  # tokens not yet consumed
    # "> 0" rather than plain truthiness: should a result ever claim more
    # tokens than remain, the loop ends instead of running on with a negative
    # counter.
    while left > 0:
        word = parsenum.parse(tokens[-left])
        if word is None:
            word = parseword(tokens[-left:])
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret