Subversion Repositories navi

Rev

Rev 297 | Rev 300 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 szabot 1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
176 muzer 3
#    This file is part of Tsim Apiak.
4
#
5
#    Tsim Apiak is free software: you can redistribute it and/or modify
6
#    it under the terms of the GNU General Public Licence as published by
7
#    the Free Software Foundation, either version 3 of the Licence, or
8
#    (at your option) any later version. 
9
# 
10
#    In addition to this, you must also comply with clause 4 of the
11
#    Apache Licence, version 2.0, concerning attribution. Where there
12
#    is a contradiction between the two licences, the GPL
13
#    takes preference.
14
#
186 szabot 15
#    Tsim Apiak is distributed in the hope that it will be useful,
176 muzer 16
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
#    GNU General Public License for more details.
19
#
20
#    You should have received a copy of the GNU General Public License
21
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.
56 szabot 22
 
283 muzer 23
import tsimapiak.dbconnector as dbconnector
24
import tsimapiak.parsenum as parsenum
297 muzer 25
import itertools
246 szabot 26
import re
56 szabot 27
 
221 muzer 28
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
263 muzer 29
# Dictionary entries, loaded once at import time. The code below reads the
# "id", "navi", "infix", "type", "lenited" and "orig_navi" keys of each entry.
wordlist = dbconnector.getnavilist()

# Affix tables, unpacked in the order getaffixlists() returns them.
(prefixes, infixes, postfixes) = dbconnector.getaffixlists()
187 muzer 32
 
276 muzer 33
# XXX HACK - Words that are either not in Eana Eltu, or that get interpreted
# wrongly for whatever reason. The latter should be removed from this list
# when the parser gets more sophisticated. The former should also have an
# entry in the equivalent array in the translator! If a word can take infixes
# or other affixes, consider adding it to the main wordlist above instead
# (see the examples there).
#
# Field order, as read by parseword():
#   0    original (the token exactly as it appears in the input)
#   1    Na'vi root (also used to look up "id" and "type" in wordlist)
#   2-4  0-pos infix, 1-pos infix, 2-pos infix
#   5    prefixes
#   6    suffixes
#   7    value reported as "len"
#   8    value reported as the word's "orig_navi"
BROKENWORDS = (
    # otherwise parses as sa (tsa-lenited) + mi
    ("sami", "si", "", "am", "", (), (), False, "si"),
    #(u"to", u"to", u"", u"", u"", (()), (()), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]],  (()), False),
    # does not parse, irregular form
    ("soaiä", "soaia", "", "", "", (), [[("ä", None)]], False, "soaia"),
    #(u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (()), False),
    #(u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (()), False),
    # otherwise parses as kìm (spin) + ä (genitive)
    ("kìmä", "kä", "", "ìm", "", (), (), False, "kä"),
    # otherwise parses as apxa + -y (genitive)
    ("apxay", "pxay", "", "", "", [[("a", "a")]], (), False, "pxay"),
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (()), False),
    #(u"kawnga", u"kawng", u"", u"", u"", (()), [[(u"a", None)]], False),
    #(u"kawng", u"kawng", u"", u"", u"", (()), (()), False),
    #(u"ka", u"ka", u"", u"", u"", (()), (()), False),
    #(u"uo", u"uo", u"", u"", u"", (()), (()), False),
    #(u"sìk", u"sìk", u"", u"", u"", (()), (()), False),
    #(u"sim", u"sim", u"", u"", u"", (()), (()), False), # probably not tsim lenited
    # otherwise parses as tsa'u abbreviated (special case)
    ("tse", "tse", "", "", "", (), (), False, "tse"),
)
52
 
53
#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
54
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
55
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
56
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
57
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
58
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62 szabot 59
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
74 szabot 60
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"
56 szabot 61
 
290 muzer 62
# Additional spellings of infixes; merged into the infix table further down.
# "orig_navi" names the infix each spelling stands for.
EXTRAINFIXES = [
    {
        "id": "-1",
        "navi": "eiy",
        "orig_navi": "ei",
        "gloss": "LAUD.",
        "position": 2,
    },
    {
        "id": "-2",
        "navi": "eng",
        "orig_navi": "äng",
        "gloss": "PEJ.",
        "position": 2,
    },
]

# Additional spellings of postfixes; merged into the postfix table further down.
EXTRAPOSTFIXES = [
    {
        "id": "-3",
        "navi": "eyä",
        "orig_navi": "yä",
        "gloss": "GEN.",
    },
]

# Words that act like adpositions but technically aren't, each paired with the
# id of its first dictionary entry (IndexError if the word is not in wordlist).
EXTRAADP = tuple(
    (adp, [x["id"] for x in wordlist if x["navi"] == adp][0])
    for adp in ("to", "sì")
)

# (initial consonant, lenited replacement). The loops that consume this table
# stop at the first rule that matches, so digraphs are listed before the
# single letters they start with.
LENIT = (
    ("px", "p"),
    ("tx", "t"),
    ("kx", "k"),
    ("ts", "s"),
    ("t", "s"),
    ("p", "f"),
    ("k", "h"),
    ("'", ""),
)
91 szabot 74
 
289 muzer 75
# Let's lenit the prefixes: for every prefix that starts with a lenitable
# consonant, register its lenited spelling as an extra prefix.
extraprefixes = []
for prefix in prefixes:
    for letter, replacement in LENIT:
        if not prefix["navi"].startswith(letter):
            continue
        new_prefix = prefix["navi"].replace(letter, replacement, 1)
        # always assume a dictionary word over a lenited prefix
        if all(x["navi"] != new_prefix for x in prefixes):
            extraprefixes.append(
                {
                    "id": prefix["id"],
                    "navi": new_prefix,
                    "gloss": prefix["gloss"] + ".LENTD",
                    "orig_navi": prefix["navi"],
                }
            )
        # Only the first matching LENIT rule is ever applied to a prefix.
        break

# Merge in the extra entries and order every affix table longest-first
# (the sorts are stable, so equal-length affixes keep their relative order).
prefixes = prefixes + extraprefixes
prefixes.sort(key=lambda affix: len(affix["navi"]), reverse=True)
infixes = infixes + EXTRAINFIXES
infixes.sort(key=lambda affix: len(affix["navi"]), reverse=True)
postfixes = postfixes + EXTRAPOSTFIXES
postfixes.sort(key=lambda affix: len(affix["navi"]), reverse=True)
289 muzer 88
 
296 muzer 89
# Let's lenit the dictionary: for every entry, generate one extra entry per
# non-empty combination of its space-separated parts that can be lenited.
extrawords = []
for word in wordlist:
    splitword = word["navi"].split(" ")
    splitinfix = word["infix"].split(" ")

    # Part index -> lenited spelling, for the parts that start with a
    # lenitable consonant (first matching LENIT rule only).
    lenitword = {}
    lenitinfix = {}
    for i, wor in enumerate(splitword):
        rule = next((pair for pair in LENIT if wor.startswith(pair[0])), None)
        if rule is None:
            continue
        letter, replacement = rule
        lenitword[i] = wor.replace(letter, replacement, 1)
        lenitinfix[i] = splitinfix[i].replace(letter, replacement, 1)

    s = list(lenitword)
    for r in range(1, len(s) + 1):
        for lenits in itertools.combinations(s, r):
            word_parts = [
                lenitword[i] if i in lenits else splitword[i]
                for i in range(len(splitword))
            ]
            infix_parts = [
                lenitinfix[i] if i in lenits else splitinfix[i]
                for i in range(len(splitword))
            ]
            # Each part is followed by a separator; the message below is
            # emitted before the trailing separator is trimmed off.
            new_word = "".join(part + " " for part in word_parts)
            new_infix = "".join(part + " " for part in infix_parts)
            print(f"Generated lenited {new_word} from {word['navi']}")
            new_word = new_word[:-1]
            new_infix = new_infix[:-1]
            extrawords.append(
                {
                    "id": word["id"],
                    "navi": new_word,
                    "infix": new_infix,
                    "type": word["type"],
                    "lenited": True,
                    "orig_navi": word["navi"],
                }
            )

# Longest entries first; at equal length an unlenited entry sorts ahead of a
# lenited one.
wordlist = wordlist + extrawords
wordlist.sort(
    key=lambda entry: 2 * len(entry["navi"]) + int(not entry["lenited"]),
    reverse=True,
)
122
 
56 szabot 123
# (ending of the dictionary form, alternative ending also accepted in the
# input). Every pair is tried, in this order, and the last one that matches
# wins.
_ALTERNATE_ENDINGS = (
    ("nga", "nge"),
    ("fo", "fe"),
    ("po", "pe"),
    ("tsa", "tse"),
    ("fko", "fke"),
    ("sa'u", "se"),
    ("sa", "se"),
    ("sno", "sne"),
    ("ayla", "ayle"),
)


def _match_infixed_root(template, token, infix_slots):
    """Find the first filled-in form of an infix template that occurs in token.

    template is an "infix" form containing the <0><1> and <2> placeholders;
    infix_slots holds, per position, the candidate infixes (ending with "").
    Returns (center, [in1, in2, in3]) for the first combination whose
    filled-in form is a substring of token, or ("", None) when none is.
    """
    # Only infixes that occur somewhere in the token can possibly be part of it.
    tempin1, tempin2, tempin3 = (
        [infix for infix in slot if infix in token] for slot in infix_slots
    )
    center = ""
    found = None
    for in1 in tempin1:
        for in2 in tempin2:
            for in3 in tempin3:
                candidate = (
                    template.replace("<0><1>", in1 + in2)
                    .replace("<2>", in3)
                    .replace("lll", "l")
                    .replace("rrr", "r")
                )
                if candidate in token:
                    center = candidate
                    found = [in1, in2, in3]
                    break
            if center != "":
                break
        if center != "":
            break
    return center, found


def _match_plain_root(root, token):
    """Return the form of an uninfixed root that occurs in token, or ""."""
    if root in token:
        return root
    center = ""
    for ending, altered in _ALTERNATE_ENDINGS:
        if root.endswith(ending):
            # Strip exactly the ending that matched before appending the
            # alternative (the "ayla" rule used to strip one character too
            # few, yielding "aayle").
            candidate = root[:-len(ending)] + altered
            if candidate in token:
                center = candidate
    return center


def parseword(wordin):
    """Parse the word (or multi-word expression) at the start of wordin.

    wordin is a non-empty list of tokens; wordin[0] is the token to parse and
    the following tokens are consulted for multi-word dictionary entries.

    Returns a dict with the keys:
      "word": the matching dictionary entry. For an unrecognised token this
              is a placeholder with id 0 and "navi" set to "[token]".
      "pref": per matched part, a list of (navi, orig_navi) prefix tuples.
      "post": per matched part, a list of (navi, id) postfix tuples; id is
              None for true postfixes and the dictionary id for adpositions.
      "inf":  the infixes found for positions 0, 1 and 2.
      "len":  the entry's "lenited" flag (False for unrecognised tokens).
    For a BROKENWORDS hit the values come straight from that table. For an
    unrecognised token "pref", "post" and "inf" hold whatever the last
    dictionary entry tried left behind.

    Side effect: lower-cases the "navi" field of the wordlist entries it
    visits.
    """
    # XXX HACK - this is all code to work around bugs that shouldn't exist
    for brokenword in BROKENWORDS:
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = ""
            # No early exit: the last entry with a matching "navi" wins.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {
                "word": {
                    "id": tempid,
                    "navi": brokenword[1],
                    "infix": "",
                    "type": temptype,
                    "orig_navi": brokenword[8],
                },
                "pref": brokenword[5],
                "post": brokenword[6],
                "len": brokenword[7],
                "inf": (brokenword[2], brokenword[3], brokenword[4]),
            }

    unknown = "[" + wordin[0] + "]"
    ret = {
        "word": {"id": 0, "navi": unknown, "infix": "", "type": "", "orig_navi": unknown},
        "len": False,
    }

    # Loop-invariant lookup tables, built once per call rather than once per
    # candidate entry / stripped suffix.
    infix_slots = [
        [x["navi"] for x in infixes if x["position"] == slot] + [""]
        for slot in (0, 1, 2)
    ]
    postfix_candidates = sorted(
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"], x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP),
        key=lambda x: len(x[0]),
        reverse=True,
    )

    # Defaults so that an empty wordlist yields a well-formed "unknown" result.
    foundit = False
    foundword = None
    foundprefs = []
    foundposts = []
    foundins = ["", "", ""]

    for word in wordlist:
        word["navi"] = word["navi"].lower()
        foundprefs = []
        foundposts = []
        foundins = ["", "", ""]
        splitword = word["infix"].split(" ")
        if len(wordin) < len(splitword):
            # Not enough tokens left for this multi-word entry.
            foundit = False
            continue
        foundit = True
        for wor, template in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])

            # Locate the (possibly infixed) root inside the token.
            if "<0>" in template:
                center, ins = _match_infixed_root(template, token, infix_slots)
                if ins is not None:
                    foundins = ins
            else:
                center = _match_plain_root(template, token)
            if center == "":
                foundit = False
                break

            # The root must occur exactly once; what surrounds it has to be
            # made up entirely of known prefixes and postfixes.
            pieces = token.split(center)
            if len(pieces) != 2:
                foundit = False
                break
            pref, posf = pieces

            # Peel prefixes off the end of pref until nothing changes.
            last = ""
            while last != pref:
                last = pref
                if pref == "":
                    break
                for pre in prefixes:
                    if not pref.endswith(pre["navi"]):
                        continue
                    # foundprefs holds (navi, orig_navi) tuples, so the
                    # duplicate check has to compare tuples as well.
                    entry = (pre["navi"], pre["orig_navi"])  # orig_navi handles lenition
                    if entry not in foundprefs[wor]:
                        foundprefs[wor].append(entry)
                        pref = pref[:-len(pre["navi"])]
                    break
            if pref != "":
                foundit = False
                foundprefs = []
                break

            # Peel postfixes off the start of posf until nothing changes.
            last = ""
            while last != posf:
                last = posf
                if posf == "":
                    break
                for pos, posid in postfix_candidates:
                    if not posf.startswith(pos):
                        continue
                    # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                    if (pos, posid) not in foundposts[wor] and (
                        pos != "ä" or word["orig_navi"] != "pey"
                    ):
                        foundposts[wor].append((pos, posid))
                        posf = posf[len(pos):]
                    break
            if posf != "":
                foundit = False
                foundposts = []
                break

        if foundit:
            foundword = word
            break

    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    if foundit:
        ret["len"] = foundword["lenited"]
        ret["word"] = foundword
    return ret
265
 
266
def parsesent(sent):
    """Parse a sentence into a list of per-word parse results.

    The sentence is stripped, lower-cased, has typographic apostrophes
    replaced by "'", loses every character that is not a word character,
    "ì", "ä", "'" or a space, and is then split on single spaces.

    Each token is first offered to parsenum.parse(); if that returns None the
    remaining tokens are handed to parseword(). A result consumes as many
    tokens as its word's "navi" field has space-separated parts.

    Returns the list of results in sentence order.
    """
    sent = sent.strip().lower().replace("’", "'")
    sent = re.sub(r"[^\wìä' ]", "", sent)
    sent = re.sub(r"\ +", " ", sent)
    tokens = sent.split(" ")
    ret = []
    left = len(tokens)
    # "> 0" rather than plain truthiness: a result that claims more tokens
    # than remain must end the loop instead of driving the count negative.
    while left > 0:
        word = parsenum.parse(tokens[len(tokens) - left])
        if word is None:
            word = parseword(tokens[-left:])
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret