#!/usr/bin/python3
# -*- coding: utf-8 -*-
#    This file is part of Tsim Apiak.
#
#    Tsim Apiak is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public Licence as published by
#    the Free Software Foundation, either version 3 of the Licence, or
#    (at your option) any later version.
#
#    In addition to this, you must also comply with clause 4 of the
#    Apache Licence, version 2.0, concerning attribution. Where there
#    is a contradiction between the two licences, the GPL
#    takes preference.
#
#    Tsim Apiak is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.

import itertools
import re

import tsimapiak.dbconnector as dbconnector
import tsimapiak.parsenum as parsenum

#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
wordlist = dbconnector.getnavilist()

prefixes, infixes, postfixes = dbconnector.getaffixlists()

# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated; the former should also
# have an entry in the equivalent array in the translator. Words that can take
# affixes should go in the main wordlist above instead (see the examples).
# The order of each entry is: original word, Na'vi root, 0-pos infix,
# 1-pos infix, 2-pos infix, prefixes, suffixes, lenited flag, original
# dictionary form.
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (()), (()), False, "si"), # otherwise parses as sa (tsa-lenited) + mi
    #(u"to", u"to", u"", u"", u"", (()), (()), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]],  (()), False),
    (u"soaiä", u"soaia", u"", u"", u"", (()), [[(u"ä", None)]], False, "soaia"), # does not parse, irregular form
    #(u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (()), False),
    #(u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (()), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (()), (()), False, "kä"), # otherwise parses as kìm (spin) + ä (genitive)
    (u"apxay", u"pxay", u"", u"", u"", [[(u"a", "a")]], (()), False, "pxay"), # otherwise parses as apxa + -y (genitive)
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (()), False),
    #(u"kawnga", u"kawng", u"", u"", u"", (()), [[(u"a", None)]], False),
    #(u"kawng", u"kawng", u"", u"", u"", (()), (()), False),
    #(u"ka", u"ka", u"", u"", u"", (()), (()), False),
    #(u"uo", u"uo", u"", u"", u"", (()), (()), False),
    #(u"sìk", u"sìk", u"", u"", u"", (()), (()), False),
    #(u"sim", u"sim", u"", u"", u"", (()), (()), False), # probably not tsim lenited
    (u"tse", u"tse", u"", u"", u"", (()), (()), False, "tse"), # otherwise parses as tsa'u abbreviated (special case)
    (u"por", u"po", u"", u"", u"", (()), [[("r", None)]], False, "po"), # otherwise parses as lenited pxor which is unlikely
)

BANNEDNUMBERS = { # words which must not be parsed by the number parser
    "pey" # more likely dictionary word pey than lenited pxey 3
}

#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

EXTRAINFIXES = [
    {"id": "-1", "navi": "eiy", "orig_navi": "ei", "gloss": "LAUD.", "position": 2},
    {"id": "-2", "navi": "eng", "orig_navi": "äng", "gloss": "PEJ.", "position": 2},
]

EXTRAPOSTFIXES = [
    {"id": "-3", "navi": "eyä", "orig_navi": "yä", "gloss": "GEN."},
    {"id": "-4", "navi": "pxì", "orig_navi": "pxì", "gloss": "FRAC."},
]

# words that act like adpositions but technically aren't
EXTRAADP = (("to", [x["id"] for x in wordlist if x["navi"] == "to"][0]),
            ("sì", [x["id"] for x in wordlist if x["navi"] == "sì"][0]))

# Lenition pairs; the digraphs (px, tx, kx, ts) are listed before their
# single-letter counterparts so that they are matched first.
LENIT = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))

# Let's lenit the prefixes
extraprefixes = []
for prefix in prefixes:
    for letter, replacement in LENIT:
        if prefix["navi"].startswith(letter):
            new_prefix = prefix["navi"].replace(letter, replacement, 1)
            if not [x for x in prefixes if x["navi"] == new_prefix]: # prefer an existing prefix over a lenited one
                extraprefixes.append({"id": prefix["id"], "navi": new_prefix, "gloss": prefix["gloss"] + ".LENTD", "orig_navi": prefix["navi"]})
            break

prefixes = sorted(prefixes + extraprefixes, key=lambda x: len(x["navi"]), reverse=True)
infixes = sorted(infixes + EXTRAINFIXES, key=lambda x: len(x["navi"]), reverse=True)
postfixes = sorted(postfixes + EXTRAPOSTFIXES, key=lambda x: len(x["navi"]), reverse=True)
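
# Illustration only (hypothetical affix-table entry, not necessarily in the
# database): a prefix row such as {"id": 42, "navi": "pxe", "gloss": "TRI"}
# begins with "px", so the loop above would add a lenited twin
# {"id": 42, "navi": "pe", "gloss": "TRI.LENTD", "orig_navi": "pxe"},
# unless some other prefix already spells "pe", in which case the variant
# is skipped.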
 
# Let's lenit the dictionary
extrawords = []
for word in wordlist:
    splitword = word["navi"].split(" ")
    splitinfix = word["infix"].split(" ")
    lenitword = {}
    lenitinfix = {}
    for i, wor in enumerate(splitword):
        for letter, replacement in LENIT:
            if wor.startswith(letter):
                lenitword[i] = wor.replace(letter, replacement, 1)
                lenitinfix[i] = splitinfix[i].replace(letter, replacement, 1)
                break

    # every non-empty subset of the lenitable word positions gets its own variant
    s = list(lenitword.keys())
    for lenits in itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(1, len(s)+1)):
        new_word = ""
        new_infix = ""
        for i, wor in enumerate(splitword):
            if i in lenits:
                new_word += lenitword[i]
                new_infix += lenitinfix[i]
            else:
                new_word += wor
                new_infix += splitinfix[i]
            new_word += " "
            new_infix += " "
        print(f"Generated lenited {new_word} from {word['navi']}")
        new_word = new_word[:-1]
        new_infix = new_infix[:-1]
        extrawords.append({"id": word["id"], "navi": new_word, "infix": new_infix, "type": word["type"], "lenited": True, "orig_navi": word["navi"]})

wordlist = sorted(wordlist + extrawords, key=lambda x: len(x["navi"]) * 2 + (0 if x["lenited"] else 1), reverse=True)
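
# Worked example (hypothetical two-word entry, not taken from Eana Eltu): for
# u"txon kato", both parts can lenit ("txon" -> "ton", "kato" -> "hato"), so
# the subsets {0}, {1} and {0, 1} above yield three extra entries, u"ton kato",
# u"txon hato" and u"ton hato", each flagged "lenited": True and keeping
# orig_navi = u"txon kato".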
 
# TODO add reef Na'vi

def parseword(wordin):
    """Try to parse the first word of wordin (the remaining words of the
    sentence) against the word list, returning the matched dictionary entry
    together with any prefixes, infixes and postfixes found."""
    tempid = 0
    temptype = u""
    for brokenword in BROKENWORDS: # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] == brokenword[0]:
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype, "orig_navi": brokenword[8]}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4])}
    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u"", "orig_navi": "[" + wordin[0] + "]"}, "len": False, "pref": [], "post": [], "inf": ["", "", ""]}
    for word in wordlist:
        word["navi"] = word["navi"].lower()
        foundit = True
        foundprefs = []
        foundposts = []
        splitword = word["infix"].split(u" ")
        foundins = [u"", u"", u""]
        if len(wordin) < len(splitword):
            foundit = False
            continue
        for wor in range(len(splitword)):
            if not foundit:
                break
            foundprefs.append([])
            foundposts.append([])
            center = u""
            if u"<0>" in splitword[wor]:
                # try every combination of infixes that occur in the input word
                tempin1 = []
                tempin2 = []
                tempin3 = []
                for in1 in [x["navi"] for x in infixes if x["position"] == 0] + [""]:
                    if in1 in wordin[wor]:
                        tempin1.append(in1)
                for in2 in [x["navi"] for x in infixes if x["position"] == 1] + [""]:
                    if in2 in wordin[wor]:
                        tempin2.append(in2)
                for in3 in [x["navi"] for x in infixes if x["position"] == 2] + [""]:
                    if in3 in wordin[wor]:
                        tempin3.append(in3)
                for in1 in tempin1:
                    for in2 in tempin2:
                        for in3 in tempin3:
                            if splitword[wor].replace(u"<0><1>", in1 + in2).replace(u"<2>", in3).replace(u"lll", u"l").replace(u"rrr", u"r") in wordin[wor]:
                                center = splitword[wor].replace(u"<0><1>", in1 + in2).replace(u"<2>", in3).replace(u"lll", u"l").replace(u"rrr", u"r")
                                foundins = [in1, in2, in3]
                                break
                        if center != u"":
                            break
                    if center != u"":
                        break
            else:
                if splitword[wor] in wordin[wor]:
                    center = splitword[wor]
                if center == u"":
                    # irregular final-vowel changes (mostly pronouns, e.g. nga -> nge- before certain endings)
                    if splitword[wor].endswith(u"nga"):
                        temp = splitword[wor][:-3] + u"nge"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"fo"):
                        temp = splitword[wor][:-2] + u"fe"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"po"):
                        temp = splitword[wor][:-2] + u"pe"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"tsa"):
                        temp = splitword[wor][:-3] + u"tse"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"fko"):
                        temp = splitword[wor][:-3] + u"fke"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"sa'u"):
                        temp = splitword[wor][:-4] + u"se"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"sa"):
                        temp = splitword[wor][:-2] + u"se"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"sno"):
                        temp = splitword[wor][:-3] + u"sne"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"ayla"):
                        temp = splitword[wor][:-4] + u"ayle"
                        if temp in wordin[wor]:
                            center = temp
            if center == u"":
                foundit = False
                break
            temp = wordin[wor].split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp
            last = u""
            while last != pref:
                last = pref
                for pre in prefixes:
                    if pref != u"":
                        if pref.endswith(pre["navi"]):
                            if (pre["navi"], pre["orig_navi"]) in foundprefs[wor]:
                                break
                            foundprefs[wor].append((pre["navi"], pre["orig_navi"])) # only needed here, to handle lenition
                            pref = pref[:-len(pre["navi"])]
                            break
            if pref != u"":
                foundit = False
                foundprefs = []
                break
            last = u""
            while last != posf:
                last = posf
                for pos, posid in sorted([(x["navi"], None) for x in postfixes] + [(x["navi"], x["id"]) for x in wordlist if x["type"] == "adp."] + list(EXTRAADP), key=lambda x: len(x[0]), reverse=True):
                    if posf != u"":
                        if posf.startswith(pos):
                            if (pos, posid) in foundposts[wor]:
                                break
                            if pos != u"ä" or word["orig_navi"] != u"pey": # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                                foundposts[wor].append((pos, posid))
                                posf = posf[len(pos):]
                                break
                            else:
                                break
            if posf != u"":
                foundit = False
                foundposts = []
                break
        if foundit:
            foundword = word
            break
    if foundit:
        ret["pref"] = foundprefs
        ret["post"] = foundposts
        ret["inf"] = foundins
        ret["len"] = word["lenited"]
        ret["word"] = foundword
    return ret
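
# Rough shape of the result (actual values depend on the live word list): for
# a plain verb like u"taron" with infix template u"t<0><1>ar<2>on",
# parseword([u"taron"]) returns its dictionary row under "word" together with
# "pref" [[]], "post" [[]], "inf" ["", "", ""] and "len" False, while an
# unrecognised word comes back with id 0 and its navi wrapped in brackets.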
 
def parsesent(sent):
    """Split a sentence into words and parse each one, trying the number
    parser first (unless the word is in BANNEDNUMBERS) and falling back to
    parseword()."""
    sent = sent.strip().lower().replace(u"’", u"'")
    sent = sent.replace("ù", "u") # Basic support for reef Na'vi
    sent = re.sub(r"[^\wìä' ]", u"", sent)
    sent = re.sub(r"\ +", u" ", sent)
    sent = sent.split(u" ")
    ret = []
    left = len(sent)
    while left:
        word = None
        if sent[len(sent) - left] not in BANNEDNUMBERS:
            word = parsenum.parse(sent[len(sent) - left])
        if word is None:
            word = parseword(sent[-left:])
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret
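

# Minimal usage sketch (assumes the Eana Eltu database behind dbconnector is
# reachable, since the word and affix lists above are loaded at import time);
# the sample sentence is illustrative only.
if __name__ == "__main__":
    for parsed in parsesent(u"oel ngati kameie"):
        print(parsed["word"]["navi"], parsed["inf"], parsed["pref"], parsed["post"])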