WebSVN - navi - Rev 188 - /tsimapiak/parse.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# This file is part of Tsim Apiak.
#
# Tsim Apiak is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public Licence as published by
# the Free Software Foundation, either version 3 of the Licence, or
# (at your option) any later version.
#
# In addition to this, you must also comply with clause 4 of the
# Apache Licence, version 2.0, concerning attribution. Where there
# is a contradiction between the two licences, the GPL
# takes preference.
#
# Tsim Apiak is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.

import re
import dbconnector
import parsenum

wordlist = dbconnector.getnavilist()

brokenwords = ((u"sami", u"si", u"", u"am", u"", None, None, False), (u"to", u"to", u"", u"", u"", False), (u"poltxe", u"plltxe", u"", u"ol", u"", None, None, False)) # These are words that are either not in Eana Eltu, or that get interpreted wrongly for whatever reason. The latter should be removed from this list when the parser gets more sophisticated. The former should also have an entry in the equivalent array in the translator! The order is - original, Na'vi root, 0-pos infix, 1-pos infix, 2-pos infix, prefixes, suffixes
infixes1 = (u"awn", u"eyk", u"us", u"äp", u"")
infixes2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
infixes3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
prefixes = (u"tsay", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"a")
adpositions = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
postfixes = adpositions + (u"tsyìp", u"eyä", u"ìri", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

lenit = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))

def parseword(wordin):
tempid = 0
temptype = u""
for brokenword in brokenwords:
if wordin == brokenword[0]:
for word in wordlist:
if wordin == word["navi"]:
tempid = word["id"]
temptype = word["partOfSpeech"]
return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4]) }
ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}}
for word in wordlist:
foundit = True
foundprefs = []
foundposts = []
lenited = False
splitword = word["infix"].split(u" ")
foundins = [u"", u"", u""]
if len(wordin) < len(splitword):
foundit = False
next
for wor in range(len(splitword)):
if not foundit:
break
foundprefs.append([])
foundposts.append([])
center = u""
pre = []
post = []
if u"<1>" in splitword[wor]:
tempin1 = []
tempin2 = []
tempin3 = []
for in1 in infixes1:
if in1 in wordin[wor]:
tempin1.append(in1)
for in2 in infixes2:
if in2 in wordin[wor]:
tempin2.append(in2)
for in3 in infixes3:
if in3 in wordin[wor]:
tempin3.append(in3)
for in1 in tempin1:
for in2 in tempin2:
for in3 in tempin3:
if splitword[wor].replace(u"<1><2>",in1+in2).replace(u"<3>",in3) in wordin[wor]:
center = splitword[wor].replace(u"<1><2>",in1+in2).replace(u"<3>",in3)
foundins = [in1, in2, in3]
break
if center != u"": break
if center != u"": break
else:
if splitword[wor] in wordin[wor]:
center = splitword[wor]
if center == u"":
for i in lenit:
temp = u""
if splitword[wor].startswith(i[0]):
temp = i[1] + splitword[wor][len(i[0]):]
if temp in wordin[wor]:
lenited = True
center = temp
if center == u"":
if splitword[wor].endswith(u"nga"):
temp = splitword[wor][:-3] + u"ng"
if temp in wordin[wor]:
center = temp
if splitword[wor].endswith(u"po"):
temp = splitword[wor][:-2] + u"p"
if temp in wordin[wor]:
center = temp
if center == u"":
foundit = False
break
temp = wordin[wor].split(center)
if len(temp) != 2:
foundit = False
break
pref, posf = temp
last = u""
while last != pref:
last = pref
for pre in prefixes:
if pref != u"":
if pref.endswith(pre):
if pre in foundprefs[wor]:
break
foundprefs[wor].append(pre)
pref = pref[:-len(pre)]
break
if pref != u"":
foundit = False
break
last = u""
while last != posf:
last = posf
for pos in postfixes:
if posf != u"":
if posf.startswith(pos):
if pos in foundposts[wor]:
break
foundposts[wor].append(pos)
posf = posf[len(pos):]
break
if posf != u"":
foundit = False
break
if foundit == True:
foundword = word
break
ret["pref"] = foundprefs
ret["post"] = foundposts
ret["inf"] = foundins
ret["len"] = lenited
if foundit == True:
ret["word"] = foundword
return ret

def parsesent(sent):
sent = sent.strip().lower().replace(u"’", u"'")
sent = re.sub(ur"[^\wìä' ]",u"",sent)
sent = re.sub(ur"\ +",u" ",sent)
sent = sent.split(u" ")
ret = []
left = len(sent)
while left:
word = parsenum.parse(sent[len(sent)-left])
if word == None:
word = parseword(sent[-left:])
left -= len(word["word"]["navi"].split(" "))
ret.append(word)
return ret

Subversion Repositories navi

(root)/tsimapiak/parse.py @ 209 - Rev 188