Subversion Repositories navi

Rev

Rev 288 | Rev 290 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 szabot 1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
176 muzer 3
#    This file is part of Tsim Apiak.
4
#
5
#    Tsim Apiak is free software: you can redistribute it and/or modify
6
#    it under the terms of the GNU General Public Licence as published by
7
#    the Free Software Foundation, either version 3 of the Licence, or
8
#    (at your option) any later version. 
9
# 
10
#    In addition to this, you must also comply with clause 4 of the
11
#    Apache Licence, version 2.0, concerning attribution. Where there
12
#    is a contradiction between the two licences, the GPL
13
#    takes preference.
14
#
186 szabot 15
#    Tsim Apiak is distributed in the hope that it will be useful,
176 muzer 16
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
#    GNU General Public License for more details.
19
#
20
#    You should have received a copy of the GNU General Public License
21
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.
56 szabot 22
 
283 muzer 23
import tsimapiak.dbconnector as dbconnector
24
import tsimapiak.parsenum as parsenum
246 szabot 25
import re
56 szabot 26
 
221 muzer 27
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns

# Dictionary entries loaded once at import time.  The parser below reads the
# "id", "navi", "infix" and "type" keys of every entry.
wordlist = dbconnector.getnavilist()

# Affix tables, unpacked in the order prefixes, infixes, postfixes.  The
# parser reads "navi" from all three, "id"/"gloss" from prefixes and
# "position" from infixes.
(prefixes, infixes, postfixes) = dbconnector.getaffixlists()
187 muzer 31
 
276 muzer 32
# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason.  The latter should be removed from
# this list when the parser gets more sophisticated.  The former should also
# have an entry in the equivalent array in the translator!  If a word can take
# infixes or other affixes, consider adding it to the main wordlist above
# instead (see the commented-out examples there).
#
# Field order of every entry:
#   0: the token exactly as typed
#   1: Na'vi root
#   2: position-0 infix
#   3: position-1 infix
#   4: position-2 infix
#   5: prefixes
#   6: suffixes, as (suffix, id) pairs
#   7: lenition flag (parseword returns it under the "len" key)
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]],  (()), False),
    (u"soaiä", u"soaia", u"", u"", u"", (), [[(u"ä", None)]], False),
    (u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (), False),
    (u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (), (), False),
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (), False),
    (u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (), False),
    (u"kawnga", u"kawng", u"", u"", u"", (), [[(u"a", None)]], False),
    (u"kawng", u"kawng", u"", u"", u"", (), (), False),
    (u"ka", u"ka", u"", u"", u"", (), (), False),
    (u"uo", u"uo", u"", u"", u"", (), (), False),
    (u"sìk", u"sìk", u"", u"", u"", (), (), False),
    (u"sim", u"sim", u"", u"", u"", (), (), False),  # probably not tsim lenited
)
50
 
51
#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
52
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
53
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
54
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
55
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
56
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62 szabot 57
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
74 szabot 58
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"
56 szabot 59
 
284 muzer 60
# Words that act like adpositions but technically aren't.  Each one is paired
# with the id of the first wordlist entry spelled that way; a word missing
# from the wordlist raises IndexError while the module is being imported.
EXTRAADP = tuple(
    (name, [entry["id"] for entry in wordlist if entry["navi"] == name][0])
    for name in ("to", "sì")
)

# Lenition table as (unlenited onset, lenited onset) pairs.  Digraphs come
# before the single letters they start with so that they are tried first.
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)


def _lenited_prefix(prefix):
    """Return a lenited copy of *prefix*, or None when it cannot lenite.

    Only the first LENIT pair whose onset starts the prefix is applied.  The
    copy keeps the original id and gets ".LENTD" appended to its gloss.
    """
    form = prefix["navi"]
    for onset, softened in LENIT:
        if form.startswith(onset):
            return {
                "id": prefix["id"],
                "navi": softened + form[len(onset):],
                "gloss": prefix["gloss"] + ".LENTD",
            }
    return None


# Let's lenit the prefixes
extraprefixes = [
    lenited
    for lenited in (_lenited_prefix(prefix) for prefix in prefixes)
    if lenited is not None
]

# Longest forms first, so the parser strips "tsay" before it tries "ay".
prefixes = prefixes + extraprefixes
prefixes.sort(key=lambda x: len(x["navi"]), reverse=True)
73
 
56 szabot 74
# Alternative shortened stem endings the parser also accepts, as
# (ending in the dictionary form, ending to try instead) pairs.
_SHORT_ENDINGS = ((u"nga", u"ng"), (u"fo", u"f"), (u"po", u"p"), (u"tsa", u"ts"))


def _infix_choices(token):
    """Return, per infix position 0-2, the infixes that occur in *token*.

    Each list keeps the order of the ``infixes`` table and always ends with
    the empty string, which stands for "no infix in this position".
    """
    return tuple(
        [x["navi"] for x in infixes if x["position"] == position and x["navi"] in token] + [u""]
        for position in (0, 1, 2)
    )


def _match_infixed_stem(template, token, choices):
    """Find the first infix combination whose filled-in template is in *token*.

    *template* is a dictionary infix pattern containing the ``<0><1>`` and
    ``<2>`` placeholders.  Returns ``(center, [in0, in1, in2])`` on success
    and ``(u"", None)`` when nothing fits.
    """
    first, second, third = choices
    for in1 in first:
        for in2 in second:
            partly = template.replace(u"<0><1>", in1 + in2)
            for in3 in third:
                candidate = partly.replace(u"<2>", in3).replace(u"lll", u"l").replace(u"rrr", u"r")
                if candidate != u"" and candidate in token:
                    return candidate, [in1, in2, in3]
    return u"", None


def _match_plain_stem(stem, token):
    """Locate an infix-less *stem* inside *token*.

    Tries the stem as written, then its lenited forms, then the shortened
    endings.  Returns ``(center, lenited)``; ``center`` is ``u""`` when
    nothing matched.
    """
    center = u""
    lenited = False
    if stem in token:
        center = stem
    if center == u"" and (token == u"paya" or stem != u"pxay"): # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
        # No early exit on purpose: when several lenitions fit, the last
        # entry of LENIT wins, as it always has.
        for onset, softened in LENIT:
            if stem.startswith(onset):
                candidate = softened + stem[len(onset):]
                if candidate in token:
                    lenited = True
                    center = candidate
    if center == u"":
        for ending, shortened in _SHORT_ENDINGS:
            if stem.endswith(ending):
                candidate = stem[:-len(ending)] + shortened
                if candidate in token:
                    center = candidate
    return center, lenited


def _strip_prefixes(lead, forms):
    """Peel known prefixes off the end of *lead*, nearest the stem first.

    *forms* must be ordered longest first.  Stops when nothing matches or when
    the matching prefix has already been used once.  Returns
    ``(found, remainder)``; a non-empty remainder means the parse failed.
    """
    found = []
    while lead != u"":
        for form in forms:
            if lead.endswith(form):
                break
        else:
            break  # nothing fits what is left
        if form in found:
            break
        found.append(form)
        lead = lead[:-len(form)]
    return found, lead


def _strip_postfixes(tail, forms, stem_is_pey):
    """Peel known suffixes/adpositions off the start of *tail*.

    *forms* is a list of ``(text, id)`` pairs ordered longest first.  Stops
    when nothing matches, when the matching pair was already used, or when the
    "pey" + "ä" special case applies.  Returns ``(found, remainder)``; a
    non-empty remainder means the parse failed.
    """
    found = []
    while tail != u"":
        for pos, posid in forms:
            if tail.startswith(pos):
                break
        else:
            break  # nothing fits what is left
        if (pos, posid) in found:
            break
        if pos == u"ä" and stem_is_pey: # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
            break
        found.append((pos, posid))
        tail = tail[len(pos):]
    return found, tail


def parseword(wordin):
    """Parse the dictionary word that starts at ``wordin[0]``.

    ``wordin`` is a non-empty sequence of lower-cased tokens: the word to
    parse followed by the rest of the sentence, so that multi-word dictionary
    entries can consume more than one token.

    Returns a dict with these keys:

    * ``"word"`` - the matching wordlist entry, or a placeholder with id 0
      and ``"navi"`` set to ``"[token]"`` when nothing matched;
    * ``"pref"`` - one list of prefixes per token of the entry;
    * ``"post"`` - one list of ``(suffix, id)`` pairs per token of the entry;
    * ``"inf"``  - the position-0, position-1 and position-2 infixes;
    * ``"len"``  - whether the stem was matched in a lenited form.

    For an unknown word the affix fields are empty (``[]``, ``[]``, three
    empty strings, ``False``) instead of whatever the last dictionary entry
    tried happened to leave behind, and an empty wordlist no longer raises
    ``NameError``.

    Side effect: ``"navi"`` of every wordlist entry that gets scanned is
    lower-cased in place.
    """
    for brokenword in BROKENWORDS: # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            # No break: if several entries share the root, the last one wins.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {
                "word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype},
                "pref": brokenword[5],
                "post": brokenword[6],
                "len": brokenword[7],
                "inf": (brokenword[2], brokenword[3], brokenword[4]),
            }

    # Candidate tables are the same for every dictionary entry, so build them
    # once per call instead of once per entry (or per stripped suffix).
    prefix_forms = [x["navi"] for x in prefixes]
    postfix_forms = sorted(
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"].lower(), x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP),
        key=lambda x: len(x[0]),
        reverse=True,
    )
    choices_by_token = {}

    for word in wordlist:
        word["navi"] = word["navi"].lower()
        stems = word["infix"].split(u" ")
        if len(wordin) < len(stems):
            continue  # not enough tokens left for this multi-word entry
        foundprefs = []
        foundposts = []
        foundins = [u"", u"", u""]
        lenited = False
        for token, stem in zip(wordin, stems):
            if u"<0>" in stem:
                if token not in choices_by_token:
                    choices_by_token[token] = _infix_choices(token)
                center, matched = _match_infixed_stem(stem, token, choices_by_token[token])
                if matched is not None:
                    foundins = matched
            else:
                center, softened = _match_plain_stem(stem, token)
                lenited = lenited or softened
            if center == u"":
                break
            pieces = token.split(center)
            if len(pieces) != 2:
                break  # the stem occurs more than once; ambiguous
            prefs, leftover = _strip_prefixes(pieces[0], prefix_forms)
            if leftover != u"":
                break
            posts, leftover = _strip_postfixes(pieces[1], postfix_forms, word["navi"] == u"pey")
            if leftover != u"":
                break
            foundprefs.append(prefs)
            foundposts.append(posts)
        else:
            # Every token of the entry matched.
            return {"word": word, "pref": foundprefs, "post": foundposts, "inf": foundins, "len": lenited}

    return {
        "word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""},
        "pref": [],
        "post": [],
        "inf": [u"", u"", u""],
        "len": False,
    }
205
 
206
def parsesent(sent):
    """Split *sent* into tokens and parse each one.

    The sentence is lower-cased, typographic apostrophes are normalised to
    ``'``, and every character that is not a word character, ``ì``, ``ä``,
    an apostrophe or a space is dropped.  Each token is first offered to
    ``parsenum.parse``; when that returns ``None`` the token and everything
    after it are handed to ``parseword``, which may consume several tokens
    for a multi-word dictionary entry.

    Returns the list of parse results in sentence order.  Empty input, or
    input made only of whitespace and punctuation, yields ``[]``.
    """
    sent = sent.strip().lower().replace(u"’", u"'")
    # Tabs and newlines separate words too; turn them into spaces before the
    # character filter below would delete them and glue neighbours together.
    sent = re.sub(r"\s+", u" ", sent)
    sent = re.sub(r"[^\wìä' ]", u"", sent)
    # split() without arguments drops the empty tokens that punctuation-only
    # "words" and leading/trailing separators used to leave behind.
    tokens = sent.split()
    ret = []
    position = 0
    # "<" rather than "!=" so that a result claiming more tokens than remain
    # ends the loop instead of running past the end of the sentence.
    while position < len(tokens):
        word = parsenum.parse(tokens[position])
        if word is None:
            word = parseword(tokens[position:])
        position += len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret