Subversion Repositories navi

Rev

Rev 283 | Rev 285 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 szabot 1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
176 muzer 3
#    This file is part of Tsim Apiak.
4
#
5
#    Tsim Apiak is free software: you can redistribute it and/or modify
6
#    it under the terms of the GNU General Public Licence as published by
7
#    the Free Software Foundation, either version 3 of the Licence, or
8
#    (at your option) any later version. 
9
# 
10
#    In addition to this, you must also comply with clause 4 of the
11
#    Apache Licence, version 2.0, concerning attribution. Where there
12
#    is a contradiction between the two licences, the GPL
13
#    takes preference.
14
#
186 szabot 15
#    Tsim Apiak is distributed in the hope that it will be useful,
176 muzer 16
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
#    GNU General Public License for more details.
19
#
20
#    You should have received a copy of the GNU General Public License
21
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.
56 szabot 22
 
283 muzer 23
import tsimapiak.dbconnector as dbconnector
24
import tsimapiak.parsenum as parsenum
246 szabot 25
import re
56 szabot 26
 
221 muzer 27
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
263 muzer 28
# Dictionary entries used by the parser. Code in this module reads the keys
# "id", "navi", "infix" and "type" from each entry.
wordlist = dbconnector.getnavilist()
65 szabot 29
 
283 muzer 30
# Affix tables. Each entry is read through its "navi" key; infix entries are
# additionally filtered by their "position" key (0, 1 or 2) in parseword().
prefixes, infixes, postfixes = dbconnector.getaffixlists()
187 muzer 31
 
276 muzer 32
# XXX HACK - Words that are either not in Eana Eltu, or that get interpreted
# wrongly by the parser for whatever reason.  Entries of the second kind should
# be removed once the parser gets more sophisticated.  Entries of the first
# kind should also have an entry in the equivalent array in the translator!
# Words that can take affixes or infixes belong in the main wordlist instead.
#
# Each entry is an 8-tuple, consumed positionally by parseword():
#   0  original   - the exact token as typed
#   1  root       - the Na'vi root it should resolve to
#   2  infix 0    - 0-position infix
#   3  infix 1    - 1-position infix
#   4  infix 2    - 2-position infix
#   5  prefixes   - returned as "pref" (one list of prefixes per token)
#   6  suffixes   - returned as "post" (one list of (suffix, id) per token)
#   7  lenited    - returned as "len"
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]], (), False),
    (u"soaiä", u"soaia", u"", u"", u"", (), [[(u"ä", None)]], False),
    (u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (), False),
    (u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (), (), False),
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (), False),
    # TODO remember why on earth this is needed; how is awng interpreted as awnga?
    (u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (), False),
    (u"kawnga", u"kawng", u"", u"", u"", (), [[(u"a", None)]], False),
    (u"kawng", u"kawng", u"", u"", u"", (), (), False),
    (u"ka", u"ka", u"", u"", u"", (), (), False),
    (u"uo", u"uo", u"", u"", u"", (), (), False),
    (u"sìk", u"sìk", u"", u"", u"", (), (), False),
    # probably not tsim lenited
    (u"sim", u"sim", u"", u"", u"", (), (), False),
)
50
 
51
#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
52
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
53
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
54
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
55
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
56
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62 szabot 57
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
74 szabot 58
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"
56 szabot 59
 
284 muzer 60
# Words that act like adpositions but technically aren't.  Each item pairs the
# word with the id of the first wordlist entry whose "navi" equals it; a word
# missing from wordlist raises IndexError here, at import time.
EXTRAADP = tuple(
    (navi, [entry["id"] for entry in wordlist if entry["navi"] == navi][0])
    for navi in ("to", "sì")
)
283 muzer 61
 
246 szabot 62
# Lenition table: (unlenited initial, lenited initial).  parseword() walks the
# whole table in this order without stopping at the first hit, so the order of
# the pairs matters.
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)
91 szabot 63
 
56 szabot 64
def _infixed_center(template, token, infix_options):
    """Find the infixed form of *template* that occurs inside *token*.

    *template* is one word of an entry's "infix" pattern containing the
    ``<0><1>`` and ``<2>`` placeholders.  *infix_options* holds three lists of
    candidate infixes (positions 0, 1 and 2), each ending with the empty
    infix.  Returns ``(center, infixes)``: ``center`` is the matched text
    (``u""`` if nothing usable matched) and ``infixes`` is the
    ``[in0, in1, in2]`` combination of the last match, or ``None`` if no
    combination matched at all.
    """
    # Only infixes that actually occur in the token can be part of a match.
    usable = [[infix for infix in options if infix in token]
              for options in infix_options]
    center = u""
    matched = None
    for in1 in usable[0]:
        for in2 in usable[1]:
            for in3 in usable[2]:
                candidate = (template.replace(u"<0><1>", in1 + in2)
                             .replace(u"<2>", in3)
                             .replace(u"lll", u"l")
                             .replace(u"rrr", u"r"))
                if candidate in token:
                    center = candidate
                    matched = [in1, in2, in3]
                    break
            if center != u"":
                break
        if center != u"":
            break
    return center, matched


def _plain_center(root, token):
    """Find an infix-less *root* (or a variant of it) inside *token*.

    Tries, in order: the root itself, its lenited forms (per LENIT), and the
    root with a shortened ending (nga/fo/po/tsa -> ng/f/p/ts).  Returns
    ``(center, lenited)`` where ``center`` is ``u""`` when nothing matched.
    """
    center = u""
    lenited = False
    if root in token:
        center = root
    # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
    if center == u"" and (token == u"paya" or root != u"pxay"):
        # No early exit: a later LENIT pair that also matches wins.
        for unlenited, lenis in LENIT:
            if root.startswith(unlenited):
                candidate = lenis + root[len(unlenited):]
                if candidate in token:
                    lenited = True
                    center = candidate
    if center == u"":
        for ending, shortened in ((u"nga", u"ng"), (u"fo", u"f"),
                                  (u"po", u"p"), (u"tsa", u"ts")):
            if root.endswith(ending):
                candidate = root[:-len(ending)] + shortened
                if candidate in token:
                    center = candidate
    return center, lenited


def _strip_prefixes(pref, prefix_names):
    """Peel known prefixes off the right-hand end of *pref*.

    Each prefix is accepted at most once.  Returns ``(rest, found)``; the
    match succeeded only if ``rest`` is empty.
    """
    found = []
    last = u""
    while last != pref:
        last = pref
        for pre in prefix_names:
            if pref != u"" and pref.endswith(pre):
                if pre not in found:
                    found.append(pre)
                    pref = pref[:-len(pre)]
                break
    return pref, found


def _strip_postfixes(posf, candidates, navi):
    """Peel known suffixes/adpositions off the left-hand end of *posf*.

    *candidates* is a list of ``(suffix, id)`` pairs and *navi* is the
    dictionary word being matched.  Each pair is accepted at most once.
    Returns ``(rest, found)``; the match succeeded only if ``rest`` is empty.
    """
    found = []
    last = u""
    while last != posf:
        last = posf
        for pos, posid in candidates:
            if posf != u"" and posf.startswith(pos):
                # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                if (pos, posid) not in found and (pos != u"ä" or navi != u"pey"):
                    found.append((pos, posid))
                    posf = posf[len(pos):]
                break
    return posf, found


def _match_entry(word, wordin, infix_options, prefix_names, plain_postfixes, extra_adpositions):
    """Try to read the leading tokens of *wordin* as dictionary entry *word*.

    Returns ``(prefixes, postfixes, infixes, lenited)`` on success, with one
    prefix list and one postfix list per token of the entry, or ``None`` if
    the entry does not fit.
    """
    templates = word["infix"].split(u" ")
    if len(wordin) < len(templates):
        return None
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    lenited = False
    for wor, template in enumerate(templates):
        token = wordin[wor]
        if u"<0>" in template:
            center, token_infixes = _infixed_center(template, token, infix_options)
            if token_infixes is not None:
                foundins = token_infixes
        else:
            center, token_lenited = _plain_center(template, token)
            lenited = lenited or token_lenited
        if center == u"":
            return None
        # The root must occur exactly once so the token splits cleanly into
        # "everything before" and "everything after".
        pieces = token.split(center)
        if len(pieces) != 2:
            return None
        pref, posf = pieces
        pref, token_prefs = _strip_prefixes(pref, prefix_names)
        if pref != u"":
            return None
        token_posts = []
        if posf != u"":
            # Built here rather than up front because word["navi"] values in
            # wordlist are lowercased progressively by parseword().
            candidates = (plain_postfixes
                          + [(x["navi"], x["id"]) for x in wordlist if x["type"] == "adp."]
                          + extra_adpositions)
            posf, token_posts = _strip_postfixes(posf, candidates, word["navi"])
            if posf != u"":
                return None
        foundprefs.append(token_prefs)
        foundposts.append(token_posts)
    return foundprefs, foundposts, foundins, lenited


def parseword(wordin):
    """Parse the word at the start of *wordin* into root plus affixes.

    *wordin* is a non-empty list of tokens; the first token is the word to
    parse and the following ones are only consulted for dictionary entries
    that span several words.

    Returns a dict with the keys:
      "word" - the matching wordlist entry, or a placeholder entry with id 0
               and navi ``u"[" + token + u"]"`` when nothing matched
      "pref" - per-token lists of prefixes
      "post" - per-token lists of ``(suffix, id)`` pairs
      "inf"  - the three infixes (positions 0, 1, 2)
      "len"  - whether the root was matched in lenited form

    For an unrecognised word "pref" and "post" are empty lists, "inf" is three
    empty strings and "len" is False.

    Side effect: the "navi" value of every wordlist entry visited is
    lowercased in place.
    """
    # XXX HACK - this is all code to work around bugs that shouldn't exist
    for brokenword in BROKENWORDS:
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4])}

    # Candidate lists that do not depend on the entry being tried.
    infix_options = [[x["navi"] for x in infixes if x["position"] == position] + [""]
                     for position in range(3)]
    prefix_names = [x["navi"] for x in prefixes]
    plain_postfixes = [(x["navi"], None) for x in postfixes]
    extra_adpositions = list(EXTRAADP)

    for word in wordlist:
        word["navi"] = word["navi"].lower()
        match = _match_entry(word, wordin, infix_options, prefix_names,
                             plain_postfixes, extra_adpositions)
        if match is not None:
            foundprefs, foundposts, foundins, lenited = match
            return {"word": word, "pref": foundprefs, "post": foundposts, "inf": foundins, "len": lenited}

    # Nothing matched (or wordlist is empty): report a clean "unknown" result
    # instead of leftovers from whichever entry happened to be tried last.
    return {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}, "pref": [], "post": [], "inf": [u"", u"", u""], "len": False}
195
 
196
def parsesent(sent):
    """Parse a sentence into a list of parsed words.

    The sentence is lowercased, typographic apostrophes are normalised to
    ``'`` and every character that is not a word character, ``ì``, ``ä``,
    an apostrophe or a space is removed.  Each remaining token is first
    offered to ``parsenum.parse``; if that returns ``None`` the token (together
    with the tokens after it, for multi-word entries) is handed to
    ``parseword``.

    Returns a list with one result per recognised word; a result that spans
    several tokens consumes all of them.  A sentence with no tokens yields an
    empty list.
    """
    sent = sent.strip().lower().replace(u"’", u"'")
    sent = re.sub(r"[^\wìä' ]", u"", sent)
    # Removing punctuation can leave leading, trailing or doubled spaces
    # (e.g. "kaltxì !"); split() without an argument drops the empty tokens
    # those would otherwise produce.
    tokens = sent.split()
    ret = []
    left = len(tokens)
    # "> 0" rather than truthiness: a result claiming more words than remain
    # must end the loop instead of running past the end of the token list.
    while left > 0:
        word = parsenum.parse(tokens[len(tokens) - left])
        if word is None:
            word = parseword(tokens[-left:])
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret