Subversion Repositories navi

Rev

Rev 213 | Rev 221 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 szabot 1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
176 muzer 3
#    This file is part of Tsim Apiak.
4
#
5
#    Tsim Apiak is free software: you can redistribute it and/or modify
6
#    it under the terms of the GNU General Public Licence as published by
7
#    the Free Software Foundation, either version 3 of the Licence, or
8
#    (at your option) any later version. 
9
# 
10
#    In addition to this, you must also comply with clause 4 of the
11
#    Apache Licence, version 2.0, concerning attribution. Where there
12
#    is a contradiction between the two licences, the GPL
13
#    takes preference.
14
#
186 szabot 15
#    Tsim Apiak is distributed in the hope that it will be useful,
176 muzer 16
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
#    GNU General Public License for more details.
19
#
20
#    You should have received a copy of the GNU General Public License
21
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.
56 szabot 22
 
23
import re
66 szabot 24
import dbconnector
103 szabot 25
import parsenum
56 szabot 26
 
216 muzer 27
# Dictionary entries searched by parseword(): the list returned by
# dbconnector.getnavilist(), with two hard-coded entries (both id 0)
# placed before and after it.
# XXX HACK - extra proper nouns
wordlist = (
    [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}]
    + dbconnector.getnavilist()
    + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}]
)
65 szabot 28
 
187 muzer 29
 
216 muzer 30
# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! If it can take
# infixes, consider adding it to the main wordlist above (see the examples).
# Things that can take affixes should go in the above list instead.
#
# Field order of each entry (eight fields, as read by parseword()):
#   0: original (the token as typed), 1: Na'vi root,
#   2: 0-pos infix, 3: 1-pos infix, 4: 2-pos infix,
#   5: prefixes, 6: suffixes, 7: lenited flag (returned as "len").
brokenwords = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    (u"poltxe", u"plltxe", u"", u"ol", u"", (), (), False),
)

# Infix candidates for the <1>, <2> and <3> slots of a dictionary "infix"
# pattern. parseword() tries them in the order given and keeps the first
# combination that fits, so the order of the entries is significant; the
# empty string (no infix) comes last.
infixes1 = (u"awn", u"eyk", u"us", u"äp", u"")
infixes2 = (
    u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv",
    u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm",
    u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"",
)
infixes3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")

# Prefixes that parseword() strips from the end of whatever precedes the
# matched root. The first entry that matches is used, so order matters.
prefixes = (
    u"tsay", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"sa", u"pe",
    u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke",
    u"he", u"a",
)

# Adpositions; they are also accepted as postfixes (see below).
adpositions = (
    u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa",
    u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw",
    u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre",
    u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na",
    u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì",
)

# Postfixes that parseword() strips from the start of whatever follows the
# matched root. The first entry that matches is used, so order matters.
postfixes = adpositions + (
    u"tsyìp", u"eyä", u"ìri", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti",
    u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke",
    u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r",
)

# Commented-out regex prefix patterns (not used by the code below):
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

# Lenition table: (initial sound, its lenited replacement). parseword()
# applies every pair whose first element starts the dictionary form.
lenit = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)
41
 
56 szabot 42
def _strip_prefixes(pref, found):
    """Peel known prefixes off the end of pref, recording them in found.

    Returns what is left of pref; an empty string means everything in front
    of the root was accounted for. A prefix already present in found is not
    stripped a second time, which ends the search.
    """
    stripped = True
    while pref and stripped:
        stripped = False
        for pre in prefixes:
            if pref.endswith(pre):
                if pre not in found:
                    found.append(pre)
                    pref = pref[:-len(pre)]
                    stripped = True
                # Only the first matching table entry is ever considered.
                break
    return pref


def _strip_postfixes(posf, found, navi):
    """Peel known postfixes off the start of posf, recording them in found.

    Returns what is left of posf; an empty string means everything after
    the root was accounted for. navi is the dictionary form of the word
    being tested (needed for the "pey" special case).
    """
    stripped = True
    while posf and stripped:
        stripped = False
        for pos in postfixes:
            if posf.startswith(pos):
                # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                if pos not in found and (pos != u"ä" or navi != u"pey"):
                    found.append(pos)
                    posf = posf[len(pos):]
                    stripped = True
                # Only the first matching table entry is ever considered.
                break
    return posf


def parseword(wordin):
    """Identify the dictionary word at the start of a list of tokens.

    wordin is a sequence of word tokens (parsesent() passes the not yet
    consumed tail of the sentence). A multi-word dictionary entry is matched
    against as many leading tokens as its "infix" pattern has words.

    Returns a dict with the keys:
      "word": the matching wordlist entry, or a placeholder with id 0 and
              "navi" set to the first token in square brackets if nothing
              matched;
      "pref": per-token lists of the prefixes found;
      "post": per-token lists of the postfixes found;
      "inf":  the three infixes found (positions <1>, <2>, <3>);
      "len":  True if a lenited form of the root was matched.
    For a token listed in brokenwords, "pref", "post", "inf" and "len" are
    taken straight from that table (tuples rather than lists). When no word
    matches, these four keys hold whatever was collected for the last
    wordlist entry tried and carry no meaning.
    """
    # XXX HACK - this is all code to work around bugs that shouldn't exist
    for brokenword in brokenwords:
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            # No break: if several entries share the root, the last one wins.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4])}

    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}}

    # Defaults so the result is well-formed even if wordlist were empty.
    foundit = False
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    lenited = False

    for word in wordlist:
        foundit = True
        foundprefs = []
        foundposts = []
        lenited = False
        foundins = [u"", u"", u""]
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough tokens left for this (multi-word) entry.
            foundit = False
            continue
        for wor, pattern in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])
            center = u""
            if u"<1>" in pattern:
                # Verb-like entry: only infixes that occur somewhere in the
                # token are worth trying.
                tempin1 = [in1 for in1 in infixes1 if in1 in token]
                tempin2 = [in2 for in2 in infixes2 if in2 in token]
                tempin3 = [in3 for in3 in infixes3 if in3 in token]
                for in1 in tempin1:
                    for in2 in tempin2:
                        for in3 in tempin3:
                            candidate = pattern.replace(u"<1><2>", in1 + in2).replace(u"<3>", in3)
                            if candidate in token:
                                center = candidate
                                foundins = [in1, in2, in3]
                                break
                        if center != u"":
                            break
                    if center != u"":
                        break
            else:
                if pattern in token:
                    center = pattern
                # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
                if center == u"" and (token == u"paya" or pattern != u"pxay"):
                    # No break: when several lenition rules apply, the last
                    # one that yields a match is kept.
                    for i in lenit:
                        if pattern.startswith(i[0]):
                            temp = i[1] + pattern[len(i[0]):]
                            if temp in token:
                                lenited = True
                                center = temp
                if center == u"":
                    # Shortened forms: final "nga" -> "ng", final "po" -> "p".
                    if pattern.endswith(u"nga"):
                        temp = pattern[:-3] + u"ng"
                        if temp in token:
                            center = temp
                    if pattern.endswith(u"po"):
                        temp = pattern[:-2] + u"p"
                        if temp in token:
                            center = temp
            if center == u"":
                foundit = False
                break
            # The root must occur exactly once in the token.
            temp = token.split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp
            # Everything around the root must be explained by known affixes.
            if _strip_prefixes(pref, foundprefs[wor]) != u"":
                foundit = False
                break
            if _strip_postfixes(posf, foundposts[wor], word["navi"]) != u"":
                foundit = False
                break
        if foundit:
            ret["word"] = word
            break

    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    ret["len"] = lenited
    return ret
162
 
163
def parsesent(sent):
    """Split a sentence into words and parse each of them.

    The sentence is stripped, lower-cased and has the typographic
    apostrophe replaced by the ASCII one; every character other than a
    regex word character, "ì", "ä", an apostrophe or a space is removed and
    runs of spaces are collapsed. Each remaining token is first offered to
    parsenum.parse(); if that returns None, parseword() is called with all
    tokens that are still unconsumed.

    Returns a list with one result dict per recognised word. A result whose
    ["word"]["navi"] contains several space-separated words consumes that
    many tokens.
    """
    sent = sent.strip().lower().replace(u"’", u"'")
    # Same pattern text as the former ur"..." literals, spelled with escaped
    # backslashes because the "ur" prefix is Python-2-only syntax.
    sent = re.sub(u"[^\\wìä' ]", u"", sent)
    sent = re.sub(u"\\ +", u" ", sent)
    sent = sent.split(u" ")
    ret = []
    left = len(sent)  # number of tokens not yet consumed
    # "> 0" rather than truthiness: stop cleanly if a result ever claims
    # more tokens than remain, instead of indexing past the end.
    while left > 0:
        word = parsenum.parse(sent[len(sent) - left])
        if word is None:
            word = parseword(sent[-left:])
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret