Subversion Repositories navi

Rev

Rev 187 | Rev 191 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 szabot 1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
176 muzer 3
#    This file is part of Tsim Apiak.
4
#
5
#    Tsim Apiak is free software: you can redistribute it and/or modify
6
#    it under the terms of the GNU General Public Licence as published by
7
#    the Free Software Foundation, either version 3 of the Licence, or
8
#    (at your option) any later version. 
9
# 
10
#    In addition to this, you must also comply with clause 4 of the
11
#    Apache Licence, version 2.0, concerning attribution. Where there
12
#    is a contradiction between the two licences, the GPL
13
#    takes preference.
14
#
186 szabot 15
#    Tsim Apiak is distributed in the hope that it will be useful,
176 muzer 16
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
#    GNU General Public License for more details.
19
#
20
#    You should have received a copy of the GNU General Public License
21
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.
56 szabot 22
 
23
import re
66 szabot 24
import dbconnector
103 szabot 25
import parsenum
56 szabot 26
 
65 szabot 27
# Dictionary entries as returned by dbconnector.getnavilist(). parseword()
# reads the "navi", "infix", "id" and "partOfSpeech" keys of each entry.
wordlist = dbconnector.getnavilist()

# These are words that are either not in Eana Eltu, or that get interpreted
# wrongly for whatever reason. The latter should be removed from this list
# when the parser gets more sophisticated. The former should also have an
# entry in the equivalent array in the translator!
#
# Every entry has exactly eight fields, in this order:
#   original, Na'vi root, 0-pos infix, 1-pos infix, 2-pos infix,
#   prefixes, suffixes, lenited
# parseword() reads fields 5, 6 and 7 by index, so a shorter entry would
# raise IndexError as soon as it matched.
brokenwords = (
    (u"sami", u"si", u"", u"am", u"", None, None, False),
    (u"to", u"to", u"", u"", u"", None, None, False),
    (u"poltxe", u"plltxe", u"", u"ol", u"", None, None, False),
)

# Candidate fillers for the <1>, <2> and <3> infix slots of a dictionary
# entry's "infix" pattern. parseword() keeps the first combination that
# matches, so the order of each tuple is significant. The empty string
# stands for "slot left empty".
infixes1 = (u"awn", u"eyk", u"us", u"äp", u"")
infixes2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
infixes3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")

# Affixes that parseword() strips from either side of the matched root. The
# first entry that fits is taken, so the order of each tuple is significant.
prefixes = (u"tsay", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"a")
adpositions = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
postfixes = adpositions + (u"tsyìp", u"eyä", u"ìri", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")

# Earlier regex-based prefix matching, kept for reference only.
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

# Lenition table: (initial sound, lenited form). parseword() replaces a
# leading occurrence of the first element with the second when the
# unlenited root is not found in the input token.
lenit = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))
41
 
56 szabot 42
def _match_center(pattern, token):
    """Locate one dictionary pattern inside one input token.

    ``pattern`` is a single space-separated piece of an entry's "infix"
    field and ``token`` is the input word it is compared against.

    Returns a tuple ``(center, infixes, lenited)``:
      * ``center``  - the substring of ``token`` that realises ``pattern``,
                      or u"" when nothing matched;
      * ``infixes`` - ``[in1, in2, in3]`` when ``pattern`` has infix slots
                      and a combination matched, otherwise ``None``;
      * ``lenited`` - True when the match needed the lenited form.
    """
    if u"<1>" in pattern:
        # Only infixes that occur somewhere in the token can possibly fit.
        cand1 = [inf for inf in infixes1 if inf in token]
        cand2 = [inf for inf in infixes2 if inf in token]
        cand3 = [inf for inf in infixes3 if inf in token]
        for in1 in cand1:
            for in2 in cand2:
                for in3 in cand3:
                    filled = pattern.replace(u"<1><2>", in1 + in2).replace(u"<3>", in3)
                    # The first non-empty combination found wins.
                    if filled != u"" and filled in token:
                        return filled, [in1, in2, in3], False
        return u"", None, False

    if pattern != u"" and pattern in token:
        return pattern, None, False

    # Retry with the initial consonant lenited. There is deliberately no
    # early exit: when several table rows apply, the last one that matches
    # is kept.
    center = u""
    for hard, soft in lenit:
        if pattern.startswith(hard):
            softened = soft + pattern[len(hard):]
            if softened in token:
                center = softened
    if center != u"":
        return center, None, True

    # Shortened stems: a final "nga" may appear as "ng", a final "po" as "p".
    if pattern.endswith(u"nga"):
        shortened = pattern[:-3] + u"ng"
        if shortened in token:
            center = shortened
    if pattern.endswith(u"po"):
        shortened = pattern[:-2] + u"p"
        if shortened in token:
            center = shortened
    return center, None, False


def _strip_affixes(rest, affixes, from_end):
    """Peel known affixes off ``rest`` until nothing more can be removed.

    ``affixes`` is tried in order and the first entry that fits is removed;
    it is taken from the end of ``rest`` when ``from_end`` is true (used for
    prefixes, which are peeled starting next to the root) and from the start
    otherwise (suffixes).

    Returns ``(found, remainder)``: the affixes in the order they were
    removed, and whatever text could not be explained.
    """
    found = []
    previous = u""
    while previous != rest:
        previous = rest
        for affix in affixes:
            if rest == u"":
                break
            if from_end:
                fits = rest.endswith(affix)
            else:
                fits = rest.startswith(affix)
            if not fits:
                continue
            # Each affix is accepted once per word. Meeting it a second time
            # stops the search and leaves the rest unexplained.
            if affix not in found:
                found.append(affix)
                if from_end:
                    rest = rest[:-len(affix)]
                else:
                    rest = rest[len(affix):]
            break
    return found, rest


def parseword(wordin):
    """Analyse the dictionary word that starts at the front of ``wordin``.

    ``wordin`` is a non-empty list of tokens (parsesent() passes the rest of
    the sentence); a multi-word dictionary entry may consume several of them.

    Returns a dict with the keys:
      * "word" - the matching entry of ``wordlist``; for an override from
                 ``brokenwords`` or an unknown word, a small dict with the
                 keys "id", "navi", "infix" and "type". An unknown word gets
                 id 0 and its first token wrapped in square brackets.
      * "pref" - one list of stripped prefixes per consumed token
      * "post" - one list of stripped suffixes per consumed token
      * "inf"  - the three infixes found (u"" for an empty slot)
      * "len"  - True when the word was matched in lenited form
    For ``brokenwords`` overrides, "pref", "post" and "len" are taken
    verbatim from the table entry.
    """
    # Hand-written overrides take precedence over the generic matcher.
    for brokenword in brokenwords:
        # wordin is a list of tokens, so compare its first token.
        if wordin[0] != brokenword[0]:
            continue
        tempid = 0
        temptype = u""
        # Look the root up in the dictionary to borrow its id and part of
        # speech; when several entries share the root the last one is used.
        for word in wordlist:
            if word["navi"] == brokenword[1]:
                tempid = word["id"]
                temptype = word["partOfSpeech"]
        extras = brokenword[5:]
        if len(extras) >= 3:
            pref, post, lenited = extras[0], extras[1], extras[2]
        else:
            # Tolerate short entries: no affix information, and the last
            # field (if there is one) is the lenition flag.
            pref, post = None, None
            if extras:
                lenited = extras[-1]
            else:
                lenited = False
        return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": pref, "post": post, "len": lenited, "inf": (brokenword[2], brokenword[3], brokenword[4])}

    for word in wordlist:
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough tokens left for this (multi-word) entry.
            continue
        foundprefs = []
        foundposts = []
        foundins = [u"", u"", u""]
        lenited = False
        for token, pattern in zip(wordin, splitword):
            center, infs, was_lenited = _match_center(pattern, token)
            if center == u"":
                break
            if infs is not None:
                foundins = infs
            lenited = lenited or was_lenited
            # The root must occur exactly once so that it splits the token
            # into an unambiguous prefix part and suffix part.
            pieces = token.split(center)
            if len(pieces) != 2:
                break
            prefs, leftover = _strip_affixes(pieces[0], prefixes, True)
            if leftover != u"":
                break
            posts, leftover = _strip_affixes(pieces[1], postfixes, False)
            if leftover != u"":
                break
            foundprefs.append(prefs)
            foundposts.append(posts)
        else:
            # Every piece of the entry matched: the first such entry wins.
            return {"word": word, "pref": foundprefs, "post": foundposts, "inf": foundins, "len": lenited}

    # Unknown word: flag it with brackets and report a clean, empty analysis
    # for the single token it consumes, not the leftovers of the last
    # dictionary entry that was tried.
    return {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}, "pref": [[]], "post": [[]], "inf": [u"", u"", u""], "len": False}
161
 
162
def parsesent(sent):
101 szabot 163
    sent = sent.strip().lower().replace(u"’", u"'")
100 szabot 164
    sent = re.sub(ur"[^\wìä' ]",u"",sent)
89 szabot 165
    sent = re.sub(ur"\ +",u" ",sent)
166
    sent = sent.split(u" ")
77 szabot 167
    ret = []
168
    left = len(sent)
169
    while left:
108 szabot 170
        word = parsenum.parse(sent[len(sent)-left])
103 szabot 171
        if word == None:
172
            word = parseword(sent[-left:])
78 szabot 173
        left -= len(word["word"]["navi"].split(" "))
77 szabot 174
        ret.append(word)
136 muzer 175
    return ret