Subversion Repositories navi

Rev

Rev 279 | Rev 283 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 szabot 1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
176 muzer 3
#    This file is part of Tsim Apiak.
4
#
5
#    Tsim Apiak is free software: you can redistribute it and/or modify
6
#    it under the terms of the GNU General Public Licence as published by
7
#    the Free Software Foundation, either version 3 of the Licence, or
8
#    (at your option) any later version. 
9
# 
10
#    In addition to this, you must also comply with clause 4 of the
11
#    Apache Licence, version 2.0, concerning attribution. Where there
12
#    is a contradiction between the two licences, the GPL
13
#    takes preference.
14
#
186 szabot 15
#    Tsim Apiak is distributed in the hope that it will be useful,
176 muzer 16
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
#    GNU General Public License for more details.
19
#
20
#    You should have received a copy of the GNU General Public License
21
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.
56 szabot 22
 
66 szabot 23
import dbconnector
103 szabot 24
import parsenum
246 szabot 25
import re
56 szabot 26
 
221 muzer 27
# XXX HACK - extra proper nouns. Disabled variant, kept as an example of how
# entries that are missing from the database can be spliced into the list:
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
# Dictionary entries, fetched once when the module is imported. parseword()
# reads the "id", "navi", "infix" and "type" keys of every entry and
# lower-cases "navi" in place.
wordlist = dbconnector.getnavilist()
65 szabot 29
 
187 muzer 30
 
280 muzer 31
# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! If a word can take
# infixes or other affixes, consider adding it to the main wordlist above
# instead (see the commented-out examples there).
#
# Field order of every entry (as consumed by parseword):
#   0 original (surface) form, 1 Na'vi root,
#   2 0-pos infix, 3 1-pos infix, 4 2-pos infix,
#   5 prefixes, 6 suffixes, 7 lenited flag.
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    (u"frato", u"to", u"", u"", u"", [[u"fra"]], (), False),
    (u"soaiä", u"soaia", u"", u"", u"", (), [[u"ä"]], False),
    (u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (), False),
    (u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (), (), False),
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (), False),
    (u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (), False),
    (u"kawnga", u"kawng", u"", u"", u"", (), [[u"a"]], False),
    (u"kawng", u"kawng", u"", u"", u"", (), (), False),
    (u"ka", u"ka", u"", u"", u"", (), (), False),
    (u"uo", u"uo", u"", u"", u"", (), (), False),
    (u"sìk", u"sìk", u"", u"", u"", (), (), False),
    (u"sim", u"sim", u"", u"", u"", (), (), False),  # probably not tsim lenited
)

# Candidate fillers for the <1>, <2> and <3> placeholders of an entry's
# "infix" template. Order matters: parseword takes the first combination that
# fits, and the empty string (no infix) is deliberately tried last.
INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
INFIXES2 = (
    u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv",
    u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm",
    u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"",
)
INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")

# Affixes peeled off around the matched stem. parseword uses the first entry
# that fits, so the order of these tuples is significant.
PREFIXES = (
    u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa",
    u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì",
    u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k",
)
ADPOSITIONS = (
    u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa",
    u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw",
    u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre",
    u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na",
    u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì",
)
POSTFIXES = ADPOSITIONS + (
    u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay",
    u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä",
    u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at",
    u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r",
)

# Earlier regex-based prefix matching, kept for reference only (unused):
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

# (initial sound, lenited replacement) pairs tried by parseword when the
# dictionary form itself is not found in the input word.
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)
91 szabot 43
 
56 szabot 44
# (dictionary ending, shortened ending) pairs tried when neither the plain
# nor the lenited dictionary form occurs in the input word.
_SHORT_ENDINGS = ((u"nga", u"ng"), (u"fo", u"f"), (u"po", u"p"), (u"tsa", u"ts"))


def _match_infixed_stem(template, token):
    """Fill the infix placeholders of *template* so that it occurs in *token*.

    Returns ``(stem, [in1, in2, in3])`` for the first fitting combination, in
    the order of INFIXES1/2/3, or ``(u"", None)`` when nothing fits.
    """
    # Only infixes that occur somewhere in the token can possibly be part of it.
    options1 = [infix for infix in INFIXES1 if infix in token]
    options2 = [infix for infix in INFIXES2 if infix in token]
    options3 = [infix for infix in INFIXES3 if infix in token]
    for in1 in options1:
        for in2 in options2:
            for in3 in options3:
                stem = template.replace(u"<1><2>", in1 + in2)
                stem = stem.replace(u"<3>", in3)
                # Collapse the tripled consonants an infix can produce.
                stem = stem.replace(u"lll", u"l").replace(u"rrr", u"r")
                if stem and stem in token:
                    return stem, [in1, in2, in3]
    return u"", None


def _match_plain_stem(template, token):
    """Locate an infix-free *template* inside *token*.

    Tries the dictionary form, then its lenited variants, then a few
    shortened endings. Returns ``(stem, lenited)``; *stem* is ``u""`` when
    nothing matched.
    """
    stem = template if template in token else u""
    lenited = False
    # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable
    # without hardcoding?
    if stem == u"" and (token == u"paya" or template != u"pxay"):
        # No early exit: when several lenitions fit, the last one wins.
        for hard, soft in LENIT:
            if template.startswith(hard):
                candidate = soft + template[len(hard):]
                if candidate in token:
                    lenited = True
                    stem = candidate
    if stem == u"":
        # Same here: every ending is tried and the last hit wins.
        for ending, short in _SHORT_ENDINGS:
            if template.endswith(ending):
                candidate = template[:-len(ending)] + short
                if candidate in token:
                    stem = candidate
    return stem, lenited


def _peel_affixes(rest, affixes, from_end, forbidden=()):
    """Strip known *affixes* off one side of *rest*, one at a time.

    With *from_end* true the affixes are removed from the end of *rest*
    (used for prefixes, working outwards from the stem), otherwise from its
    start (used for postfixes). Peeling stops at the first affix that was
    already taken or that is listed in *forbidden*.
    Returns ``(found_affixes, unparsed_remainder)``.
    """
    found = []
    while rest:
        for affix in affixes:
            if rest.endswith(affix) if from_end else rest.startswith(affix):
                break
        else:
            break  # no known affix fits
        if affix in found or affix in forbidden:
            break
        found.append(affix)
        rest = rest[:-len(affix)] if from_end else rest[len(affix):]
    return found, rest


def _match_entry(templates, tokens, navi):
    """Try to analyse the leading *tokens* as the entry given by *templates*.

    *templates* is the entry's "infix" field split on spaces, one template
    per word; *navi* is the entry's lower-cased "navi" field. Returns
    ``(prefixes, postfixes, infixes, lenited)`` on success, ``None`` otherwise.
    """
    prefixes = []
    postfixes = []
    infixes = [u"", u"", u""]
    lenited = False
    for template, token in zip(templates, tokens):
        if u"<1>" in template:
            stem, found = _match_infixed_stem(template, token)
            if found is not None:
                infixes = found
        else:
            stem, softened = _match_plain_stem(template, token)
            lenited = lenited or softened
        if stem == u"":
            return None
        pieces = token.split(stem)
        if len(pieces) != 2:
            # The stem occurs more than once in the token; give up.
            return None
        before, after = pieces
        found_pre, before = _peel_affixes(before, PREFIXES, True)
        if before != u"":
            return None
        # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
        forbidden = (u"ä",) if navi == u"pey" else ()
        found_post, after = _peel_affixes(after, POSTFIXES, False, forbidden)
        if after != u"":
            return None
        prefixes.append(found_pre)
        postfixes.append(found_post)
    return prefixes, postfixes, infixes, lenited


def parseword(wordin):
    """Analyse the word(s) at the start of *wordin*.

    *wordin* is a non-empty list of lower-case tokens; multi-word entries may
    consume more than the first token. Returns a dict with the keys

    * ``"word"`` - the matching wordlist entry, or a placeholder entry with
      id 0 and navi ``u"[<token>]"`` when nothing matches,
    * ``"pref"`` / ``"post"`` - per-token lists of prefixes / postfixes,
    * ``"inf"``  - the three infixes found,
    * ``"len"``  - whether a lenited form was matched.

    For an unknown word the analysis fields are empty (``[]``, ``[]``,
    three empty strings, ``False``).
    """
    # XXX HACK - this is all code to work around bugs that shouldn't exist
    for brokenword in BROKENWORDS:
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {
                "word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype},
                "pref": brokenword[5],
                "post": brokenword[6],
                "len": brokenword[7],
                "inf": (brokenword[2], brokenword[3], brokenword[4]),
            }

    # Result for an unknown word. Built up front so that an empty wordlist or
    # a failed search never exposes unbound or half-filled analysis state.
    ret = {
        "word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""},
        "pref": [],
        "post": [],
        "inf": [u"", u"", u""],
        "len": False,
    }
    for word in wordlist:
        # Normalises the shared wordlist entry in place (kept from the
        # original implementation; the returned entry carries this form).
        word["navi"] = word["navi"].lower()
        templates = word["infix"].split(u" ")
        if len(wordin) < len(templates):
            continue
        analysis = _match_entry(templates, wordin, word["navi"])
        if analysis is not None:
            ret["word"] = word
            ret["pref"], ret["post"], ret["inf"], ret["len"] = analysis
            break
    return ret
175
 
176
def parsesent(sent):
    """Split the sentence *sent* into words and analyse each of them.

    The text is trimmed, lower-cased, typographic apostrophes are replaced
    by ``'``, every character that is not a word character, ``ì``, ``ä``,
    an apostrophe or a space is dropped, and runs of spaces are collapsed.
    Each token is first offered to ``parsenum.parse``; if that returns
    ``None`` the remaining tokens are handed to ``parseword``.

    Returns the list of per-word result dicts, in sentence order.
    """
    sent = sent.strip().lower().replace(u"’", u"'")
    # Same patterns as the former ur"..." literals, spelled with escaped
    # backslashes because the "ur" prefix only exists in Python 2.
    sent = re.sub(u"[^\\wìä' ]", u"", sent)
    sent = re.sub(u"\\ +", u" ", sent)
    tokens = sent.split(u" ")
    ret = []
    left = len(tokens)  # number of tokens still to be analysed
    # "> 0" rather than plain truthiness: a result claiming more words than
    # are left would otherwise drive the counter negative and index past the
    # end of the token list.
    while left > 0:
        word = parsenum.parse(tokens[len(tokens) - left])
        if word is None:
            word = parseword(tokens[-left:])
        # A multi-word entry consumes as many tokens as its "navi" has words.
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret