Subversion Repositories navi

Rev

Rev 298 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 szabot 1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
176 muzer 3
#    This file is part of Tsim Apiak.
4
#
5
#    Tsim Apiak is free software: you can redistribute it and/or modify
6
#    it under the terms of the GNU General Public Licence as published by
7
#    the Free Software Foundation, either version 3 of the Licence, or
8
#    (at your option) any later version. 
9
# 
10
#    In addition to this, you must also comply with clause 4 of the
11
#    Apache Licence, version 2.0, concerning attribution. Where there
12
#    is a contradiction between the two licences, the GPL
13
#    takes preference.
14
#
186 szabot 15
#    Tsim Apiak is distributed in the hope that it will be useful,
176 muzer 16
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
#    GNU General Public License for more details.
19
#
20
#    You should have received a copy of the GNU General Public License
21
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.
56 szabot 22
 
283 muzer 23
import tsimapiak.dbconnector as dbconnector
24
import tsimapiak.parsenum as parsenum
297 muzer 25
import itertools
246 szabot 26
import re
56 szabot 27
 
221 muzer 28
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
263 muzer 29
# Dictionary entries loaded from the database. Code in this module reads the
# keys "id", "navi", "infix", "type", "lenited" and "orig_navi" of each entry.
wordlist = dbconnector.getnavilist()

# Affix tables loaded from the database, each a list of dicts. Code in this
# module reads "navi" on all of them, "id", "gloss" and "orig_navi" on
# prefixes, and "position" on infixes.
prefixes, infixes, postfixes = dbconnector.getaffixlists()
187 muzer 32
 
276 muzer 33
# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! Words that can take
# affixes or infixes should go in the main wordlist instead (see the
# commented-out example next to the wordlist assignment).
#
# Field order of each entry:
#   0 original (surface) form, 1 Na'vi root, 2 position-0 infix,
#   3 position-1 infix, 4 position-2 infix, 5 prefixes, 6 suffixes,
#   7 value returned by parseword() as "len", 8 value returned as "orig_navi".
BROKENWORDS = (
    ("sami", "si", "", "am", "", (), (), False, "si"),  # otherwise parses as sa (tsa-lenited) + mi
    #(u"to", u"to", u"", u"", u"", (()), (()), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]],  (()), False),
    ("soaiä", "soaia", "", "", "", (), [[("ä", None)]], False, "soaia"),  # does not parse, irregular form
    #(u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (()), False),
    #(u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (()), False),
    ("kìmä", "kä", "", "ìm", "", (), (), False, "kä"),  # otherwise parses as kìm (spin) + ä (genitive)
    ("apxay", "pxay", "", "", "", [[("a", "a")]], (), False, "pxay"),  # otherwise parses as apxa + -y (genitive)
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (()), False),
    #(u"kawnga", u"kawng", u"", u"", u"", (()), [[(u"a", None)]], False),
    #(u"kawng", u"kawng", u"", u"", u"", (()), (()), False),
    #(u"ka", u"ka", u"", u"", u"", (()), (()), False),
    #(u"uo", u"uo", u"", u"", u"", (()), (()), False),
    #(u"sìk", u"sìk", u"", u"", u"", (()), (()), False),
    #(u"sim", u"sim", u"", u"", u"", (()), (()), False), # probably not tsim lenited
    ("tse", "tse", "", "", "", (), (), False, "tse"),  # otherwise parses as tsa'u abbreviated (special case)
    ("por", "po", "", "", "", (), [[("r", None)]], False, "po"),  # otherwise parses as lenited pxor which is unlikely
)
53
 
300 muzer 54
# Words which must not be parsed by the number parser.
BANNEDNUMBERS = {
    "pey",  # more likely dictionary word pey than lenited pxey 3
}
57
 
283 muzer 58
#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
59
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
60
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
61
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
62
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
63
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62 szabot 64
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
74 szabot 65
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"
56 szabot 66
 
290 muzer 67
# Additional infix entries that get merged into the infix list obtained from
# the database. "navi" is the spelling to look for, "orig_navi" the spelling
# it stands for; the ids are negative strings.
EXTRAINFIXES = [
    {"id": "-1", "navi": "eiy", "orig_navi": "ei", "gloss": "LAUD.", "position": 2},
    {"id": "-2", "navi": "eng", "orig_navi": "äng", "gloss": "PEJ.", "position": 2},
]

# Additional postfix entries that get merged into the postfix list obtained
# from the database; same layout as above, without a position.
EXTRAPOSTFIXES = [
    {"id": "-3", "navi": "eyä", "orig_navi": "yä", "gloss": "GEN."},
    {"id": "-4", "navi": "pxì", "orig_navi": "pxì", "gloss": "FRAC."},
]
76
 
284 muzer 77
# Words that act like adpositions but technically aren't, as (word, id) pairs
# where id is the "id" of the first wordlist entry with that "navi". Raises
# IndexError at import time if one of the words is not in wordlist.
EXTRAADP = tuple(
    (navi, [x["id"] for x in wordlist if x["navi"] == navi][0])
    for navi in ("to", "sì")
)
283 muzer 78
 
246 szabot 79
# Lenition table as (initial consonant, replacement) pairs. The loops that use
# it stop at the first pair whose consonant matches, so the two-letter
# consonants have to stay ahead of the one-letter ones they start with.
LENIT = (
    ("px", "p"),
    ("tx", "t"),
    ("kx", "k"),
    ("ts", "s"),
    ("t", "s"),
    ("p", "f"),
    ("k", "h"),
    ("'", ""),
)
91 szabot 80
 
289 muzer 81
# Let's lenit the prefixes: for every prefix whose initial consonant is listed
# in LENIT, register the lenited spelling as an extra prefix that keeps the id
# of the original, marks the gloss with ".LENTD" and remembers the original
# spelling in "orig_navi".
extraprefixes = []
for prefix in prefixes:
    for letter, replacement in LENIT:
        if not prefix["navi"].startswith(letter):
            continue
        new_prefix = prefix["navi"].replace(letter, replacement, 1)
        # an existing prefix always wins over the lenited spelling of another one
        if not any(x["navi"] == new_prefix for x in prefixes):
            extraprefixes.append({
                "id": prefix["id"],
                "navi": new_prefix,
                "gloss": prefix["gloss"] + ".LENTD",
                "orig_navi": prefix["navi"],
            })
        break  # only the first matching LENIT pair applies

# Longest spelling first, so that code iterating these lists meets the longest
# candidate before any shorter one.
prefixes = sorted(prefixes + extraprefixes, key=lambda x: len(x["navi"]), reverse=True)
infixes = sorted(infixes + EXTRAINFIXES, key=lambda x: len(x["navi"]), reverse=True)
postfixes = sorted(postfixes + EXTRAPOSTFIXES, key=lambda x: len(x["navi"]), reverse=True)
289 muzer 94
 
296 muzer 95
# Let's lenit the dictionary: every space-separated word of an entry whose
# initial consonant is listed in LENIT can be lenited, and one extra entry is
# added for each non-empty combination of lenited words.
extrawords = []
for word in wordlist:
    splitword = word["navi"].split(" ")
    splitinfix = word["infix"].split(" ")
    lenitword = {}   # index of a lenitable word -> its lenited "navi" spelling
    lenitinfix = {}  # same index -> its lenited "infix" template
    for i, wor in enumerate(splitword):
        for letter, replacement in LENIT:
            if wor.startswith(letter):
                lenitword[i] = wor.replace(letter, replacement, 1)
                lenitinfix[i] = splitinfix[i].replace(letter, replacement, 1)
                break

    s = list(lenitword)
    for lenits in itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(1, len(s) + 1)):
        # lenits holds the indices that are lenited in this variant
        new_word = " ".join(lenitword[i] if i in lenits else wor for i, wor in enumerate(splitword))
        new_infix = " ".join(lenitinfix[i] if i in lenits else splitinfix[i] for i in range(len(splitword)))
        print(f"Generated lenited {new_word} from {word['navi']}")
        extrawords.append({"id": word["id"], "navi": new_word, "infix": new_infix, "type": word["type"], "lenited": True, "orig_navi": word["navi"]})

# Longest "navi" first; among entries of equal length the non-lenited one
# sorts ahead of the lenited one.
wordlist = sorted(wordlist + extrawords, key=lambda x: len(x["navi"]) * 2 + (0 if x["lenited"] else 1), reverse=True)
128
 
56 szabot 129
# Dictionary endings that have an alternative spelling ending in "e", as
# (dictionary ending, alternative ending) pairs. parseword() tries every pair
# in order, so when several of them match the last one wins.
_ALTERNATIVE_ENDINGS = (
    ("nga", "nge"),
    ("fo", "fe"),
    ("po", "pe"),
    ("tsa", "tse"),
    ("fko", "fke"),
    ("sa'u", "se"),
    ("sa", "se"),
    ("sno", "sne"),
    ("ayla", "ayle"),
)


def parseword(wordin):
    """Parse the word at the start of *wordin* into a root and its affixes.

    Args:
        wordin: non-empty list of word strings. ``wordin[0]`` is the word to
            parse; the following items are only looked at for dictionary
            entries whose "infix" template consists of several words.

    Returns:
        A dict with these keys:

        * ``"word"``: the matching ``wordlist`` entry. For a BROKENWORDS hit
          it is a new dict built from the table; for an unknown word it is a
          new dict with id 0 whose "navi" is the input in square brackets.
        * ``"pref"``: one list per word of the entry, holding
          ``(prefix, orig_navi)`` tuples, the prefix nearest the root first.
        * ``"post"``: one list per word of the entry, holding
          ``(postfix, id)`` tuples in the order they follow the root; ``id``
          is ``None`` for entries of ``postfixes``.
        * ``"inf"``: the infixes found for positions 0, 1 and 2 ("" if none).
        * ``"len"``: the "lenited" value of the matching entry.

    Note:
        The "navi" value of every wordlist entry visited is lower-cased in
        place.
    """
    for brokenword in BROKENWORDS:  # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] != brokenword[0]:
            continue
        tempid = 0
        temptype = ""
        for word in wordlist:  # no break: the last entry with that root wins
            if brokenword[1] == word["navi"]:
                tempid = word["id"]
                temptype = word["type"]
        return {
            "word": {"id": tempid, "navi": brokenword[1], "infix": "", "type": temptype, "orig_navi": brokenword[8]},
            "pref": brokenword[5],
            "post": brokenword[6],
            "len": brokenword[7],
            "inf": (brokenword[2], brokenword[3], brokenword[4]),
        }

    unknown = "[" + wordin[0] + "]"
    ret = {
        "word": {"id": 0, "navi": unknown, "infix": "", "type": "", "orig_navi": unknown},
        "len": False,
        "pref": [],
        "post": [],
        "inf": ["", "", ""],
    }

    # Candidate tables that do not depend on the dictionary entry being
    # tried; built once per call instead of once per entry / per stripped
    # postfix. The empty string stands for "no infix at this position".
    infixes_by_position = [
        [x["navi"] for x in infixes if x["position"] == position] + [""]
        for position in range(3)
    ]
    # Adposition spellings are lower-cased here because the loop below
    # lower-cases wordlist entries only as it reaches them.
    postfix_candidates = sorted(
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"].lower(), x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP),
        key=lambda x: len(x[0]),
        reverse=True,
    )

    foundit = False  # stays False when wordlist is empty
    for word in wordlist:
        word["navi"] = word["navi"].lower()
        splitword = word["infix"].split(" ")
        if len(wordin) < len(splitword):
            # the entry has more words than there is input left
            foundit = False
            continue
        foundit = True
        foundprefs = []
        foundposts = []
        foundins = ["", "", ""]
        for wor, template in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])

            # 1. Find the root (with its infixes filled in) inside the token.
            center = ""
            if "<0>" in template:
                # only infixes that occur somewhere in the token can match
                present = [[inf for inf in options if inf in token] for options in infixes_by_position]
                for in1, in2, in3 in itertools.product(*present):
                    filled = (
                        template.replace("<0><1>", in1 + in2)
                        .replace("<2>", in3)
                        .replace("lll", "l")
                        .replace("rrr", "r")
                    )
                    if filled and filled in token:
                        center = filled
                        foundins = [in1, in2, in3]
                        break
            else:
                if template in token:
                    center = template
                if center == "":
                    for ending, alternative in _ALTERNATIVE_ENDINGS:
                        if template.endswith(ending):
                            temp = template[:-len(ending)] + alternative
                            if temp in token:
                                center = temp
            if center == "":
                foundit = False
                break

            # 2. The root must occur exactly once; what surrounds it has to be
            # made up entirely of known prefixes and postfixes.
            temp = token.split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp

            # 3. Strip prefixes from the root outwards until nothing changes.
            last = ""
            while last != pref:
                last = pref
                for pre in prefixes:
                    if pref == "" or not pref.endswith(pre["navi"]):
                        continue
                    # foundprefs holds tuples, so the tuple has to be compared
                    # for the "same prefix twice" guard to trigger
                    if (pre["navi"], pre["orig_navi"]) not in foundprefs[wor]:
                        foundprefs[wor].append((pre["navi"], pre["orig_navi"]))  # orig_navi only needed here, to handle lenition
                        pref = pref[:-len(pre["navi"])]
                    break
            if pref != "":
                foundit = False
                foundprefs = []
                break

            # 4. Strip postfixes from the root outwards until nothing changes.
            last = ""
            while last != posf:
                last = posf
                for pos, posid in postfix_candidates:
                    if posf == "" or not posf.startswith(pos):
                        continue
                    seen = (pos, posid) in foundposts[wor]
                    # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                    blocked = pos == "ä" and word["orig_navi"] == "pey"
                    if not seen and not blocked:
                        foundposts[wor].append((pos, posid))
                        posf = posf[len(pos):]
                    break
            if posf != "":
                foundit = False
                foundposts = []
                break
        if foundit:
            break

    if foundit:
        ret["pref"] = foundprefs
        ret["post"] = foundposts
        ret["inf"] = foundins
        ret["len"] = word["lenited"]
        ret["word"] = word
    return ret
271
 
272
def parsesent(sent):
    """Split a sentence into words and parse each of them.

    The text is lower-cased, the typographic apostrophe is replaced by "'",
    and every character that is neither a word character, an apostrophe nor
    a space is removed. Each word is offered to ``parsenum.parse`` first
    (unless it is listed in BANNEDNUMBERS) and to ``parseword`` when that
    returns ``None``.

    Args:
        sent: the sentence as a string.

    Returns:
        A list with one parse result per recognised unit, in sentence order.
        A result whose ``["word"]["navi"]`` contains spaces accounts for that
        many input words. Input without any word yields an empty list.
    """
    sent = sent.lower().replace("’", "'")
    sent = re.sub(r"[^\wìä' ]", "", sent)
    # Split only after the punctuation is gone and drop empty pieces, so that
    # leading punctuation ("- oel") no longer produces an empty word.
    words = sent.split()

    ret = []
    position = 0
    while position < len(words):
        current = words[position]
        word = None
        if current not in BANNEDNUMBERS:
            word = parsenum.parse(current)
        if word is None:
            # hand over the rest of the sentence so that multi-word
            # dictionary entries can match
            word = parseword(words[position:])
        position += len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret