# WebSVN - navi - Rev 295 - /tsimapiak/parse.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# This file is part of Tsim Apiak.
#
# Tsim Apiak is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public Licence as published by
# the Free Software Foundation, either version 3 of the Licence, or
# (at your option) any later version.
#
# In addition to this, you must also comply with clause 4 of the
# Apache Licence, version 2.0, concerning attribution. Where there
# is a contradiction between the two licences, the GPL
# takes preference.
#
# Tsim Apiak is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.

import tsimapiak.dbconnector as dbconnector
import tsimapiak.parsenum as parsenum
import re

# Alternative definition that adds hard-coded proper nouns around the database list; kept for reference only.
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
# Dictionary entries supplied by the database layer. The code in this module
# reads the "id", "navi", "infix" and "type" keys of each entry.
wordlist = dbconnector.getnavilist()

# Affix tables supplied by the database layer. This module reads "navi" from
# every entry, "id" and "gloss" from prefixes, and "position" from infixes.
prefixes, infixes, postfixes = dbconnector.getaffixlists()

# XXX HACK - Words that are either missing from Eana Eltu or that the parser
# currently interprets wrongly. Entries of the second kind should be removed
# once the parser gets more sophisticated. Entries of the first kind also need
# an entry in the equivalent array in the translator! Anything that can take
# affixes or infixes belongs in the main wordlist instead (see the
# commented-out wordlist examples near the top of the file).
#
# Each entry has eight fields, in this order:
#   0  the word as typed
#   1  the Na'vi root
#   2  position-0 infix
#   3  position-1 infix
#   4  position-2 infix
#   5  prefixes (empty tuple, or a list holding one list of prefixes)
#   6  suffixes (empty tuple, or a list holding one list of (suffix, id) pairs)
#   7  lenition flag (returned by parseword under the "len" key)
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]], (()), False),
    (u"soaiä", u"soaia", u"", u"", u"", (), [[(u"ä", None)]], False),
    (u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (), False),
    (u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (), (), False),
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (), False),
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (()), False),
    #(u"kawnga", u"kawng", u"", u"", u"", (()), [[(u"a", None)]], False),
    #(u"kawng", u"kawng", u"", u"", u"", (()), (()), False),
    (u"ka", u"ka", u"", u"", u"", (), (), False),
    (u"uo", u"uo", u"", u"", u"", (), (), False),
    (u"sìk", u"sìk", u"", u"", u"", (), (), False),
    (u"sim", u"sim", u"", u"", u"", (), (), False),  # probably not tsim lenited
    (u"tse", u"tse", u"", u"", u"", (), (), False),
)

#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

# Infixes appended to the database-provided infix list further down in this
# module. "position" selects which infix slot the entry is tried in.
EXTRAINFIXES = [
    dict(id="-1", navi="eiy", gloss="LAUD.", position=2),
    dict(id="-2", navi="eng", gloss="PEJ.", position=2),
]

# Postfixes appended to the database-provided postfix list further down in
# this module.
EXTRAPOSTFIXES = [
    dict(id="-3", navi="eyä", gloss="GEN."),
]

# words that act like adpositions but technically aren't
# Each item is (navi, id), with the id taken from the first wordlist entry
# whose "navi" equals the word. A word missing from wordlist raises IndexError
# when this module is loaded.
EXTRAADP = tuple(
    (navi, [entry["id"] for entry in wordlist if entry["navi"] == navi][0])
    for navi in ("to", "sì")
)

# Lenition table as (initial sound, lenited form) pairs. Two-letter sounds are
# listed before the single letters they start with, so scanning in order tries
# the longer match first.
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)

# Let's lenit the prefixes
def _lenite_once(text):
    """Return text with its leading sound lenited, or None if no LENIT pair applies.

    Only the first LENIT pair whose left-hand side starts the text is used.
    """
    for hard, soft in LENIT:
        if text.startswith(hard):
            return soft + text[len(hard):]
    return None


def _longest_first(affixes):
    """Return the affix entries sorted by descending length of their "navi" text."""
    return sorted(affixes, key=lambda affix: len(affix["navi"]), reverse=True)


_dictionary_prefixes = set(entry["navi"] for entry in prefixes)

extraprefixes = []
for prefix in prefixes:
    new_prefix = _lenite_once(prefix["navi"])
    # always assume a dictionary word over a lenited prefix
    if new_prefix is not None and new_prefix not in _dictionary_prefixes:
        extraprefixes.append({"id": prefix["id"], "navi": new_prefix, "gloss": prefix["gloss"] + ".LENTD"})

# Longest affixes first, so that the parser tries the longest match before
# any shorter affix it contains.
prefixes = _longest_first(prefixes + extraprefixes)
infixes = _longest_first(infixes + EXTRAINFIXES)
postfixes = _longest_first(postfixes + EXTRAPOSTFIXES)

# (dictionary ending, alternate ending) pairs tried, in this order, when a root
# is found neither verbatim nor lenited in the typed word. Every pair is
# tried, so a later matching pair overrides an earlier one.
_ALTERNATE_ENDINGS = (
    (u"nga", u"nge"),
    (u"fo", u"fe"),
    (u"po", u"pe"),
    (u"tsa", u"tse"),
    (u"fko", u"fke"),
    (u"sa'u", u"se"),
    (u"sa", u"se"),
    (u"sno", u"sne"),
    (u"ayla", u"ayle"),
)


def _find_infixed_center(pattern, typed, infix_choices):
    """Locate a root that carries infix markers inside the typed word.

    pattern is a dictionary "infix" part containing the <0><1> and <2>
    markers, and infix_choices holds the candidate infixes for positions 0, 1
    and 2 (each list ending with the empty string). Returns
    (center, [in0, in1, in2]) for the first combination whose filled-in
    pattern occurs in typed, or (u"", None) if none does.
    """
    # Only infixes that occur somewhere in the typed word can be part of it.
    present = [[infix for infix in choices if infix in typed] for choices in infix_choices]
    for in1 in present[0]:
        for in2 in present[1]:
            for in3 in present[2]:
                candidate = pattern.replace(u"<0><1>", in1 + in2).replace(u"<2>", in3).replace(u"lll", u"l").replace(u"rrr", u"r")
                if candidate != u"" and candidate in typed:
                    return candidate, [in1, in2, in3]
    return u"", None


def _find_plain_center(root, typed):
    """Locate a root without infix markers inside the typed word.

    Tries the root verbatim, then its lenited forms, then the alternate
    endings. Returns (center, lenited); center is u"" when nothing matched.
    """
    if root in typed:
        return root, False
    center = u""
    lenited = False
    if typed == u"paya" or root != u"pxay":  # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
        for letter, replacement in LENIT:
            if root.startswith(letter):
                temp = replacement + root[len(letter):]
                if temp in typed:
                    lenited = True
                    center = temp
    if center == u"":
        for ending, alternate in _ALTERNATE_ENDINGS:
            if root.endswith(ending):
                # Strip the whole ending before adding the alternate one.
                temp = root[:-len(ending)] + alternate
                if temp in typed:
                    center = temp
    return center, lenited


def _strip_prefixes(remaining, prefixnames):
    """Peel known prefixes off the end of remaining, nearest the root first.

    Returns (found, leftover). Each prefix is accepted at most once, and
    leftover is non-empty when the text could not be consumed completely.
    """
    found = []
    last = u""
    while last != remaining:
        last = remaining
        for pre in prefixnames:
            if remaining != u"" and remaining.endswith(pre):
                if pre not in found:
                    found.append(pre)
                    remaining = remaining[:-len(pre)]
                break
    return found, remaining


def _strip_postfixes(remaining, candidates, rootnavi):
    """Peel known postfixes and adpositions off the start of remaining.

    candidates is a list of (text, id) pairs, longest text first. Returns
    (found, leftover), where found holds the accepted (text, id) pairs in
    order. Each pair is accepted at most once, and leftover is non-empty when
    the text could not be consumed completely.
    """
    found = []
    last = u""
    while last != remaining:
        last = remaining
        for pos, posid in candidates:
            if remaining == u"" or not remaining.startswith(pos):
                continue
            # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
            blocked = (pos, posid) in found or (pos == u"ä" and rootnavi == u"pey")
            if not blocked:
                found.append((pos, posid))
                remaining = remaining[len(pos):]
            break
    return found, remaining


def _match_entry(word, parts, wordin, prefixnames, infix_choices, postfix_candidates):
    """Try to read the leading words of wordin as one dictionary entry.

    parts is the entry's "infix" pattern split on spaces; the caller
    guarantees that wordin has at least that many words. Returns a dict with
    the "pref", "post", "inf" and "len" results, or None if the entry does
    not fit.
    """
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    lenited = False
    for typed, part in zip(wordin, parts):
        if u"<0>" in part:
            center, ins = _find_infixed_center(part, typed, infix_choices)
            if ins is not None:
                foundins = ins
        else:
            center, part_lenited = _find_plain_center(part, typed)
            lenited = lenited or part_lenited
        if center == u"":
            return None
        # The root must occur exactly once so that the word splits cleanly
        # into a prefix side and a postfix side.
        pieces = typed.split(center)
        if len(pieces) != 2:
            return None
        prefs, leftover = _strip_prefixes(pieces[0], prefixnames)
        if leftover != u"":
            return None
        posts, leftover = _strip_postfixes(pieces[1], postfix_candidates, word["navi"])
        if leftover != u"":
            return None
        foundprefs.append(prefs)
        foundposts.append(posts)
    return {"pref": foundprefs, "post": foundposts, "inf": foundins, "len": lenited}


def parseword(wordin):
    """Parse the word (or multi-word entry) at the start of wordin.

    wordin is a non-empty list of lower-cased words; only as many leading
    words as the matched dictionary entry spans are examined.

    Returns a dict with these keys:
      "word" - the matched wordlist entry. For an unknown word it is a
               placeholder with id 0 and the typed word in square brackets.
      "pref" - one list of prefixes per word of the entry
      "post" - one list of (postfix, id) pairs per word of the entry
      "inf"  - the three infixes found, for positions 0, 1 and 2
      "len"  - True if the root was matched in lenited form

    For an unknown word the affix fields are empty ([[]], [[]], three empty
    strings and False) rather than leftovers from a failed match attempt.

    As a side effect, the "navi" of each wordlist entry visited is
    lower-cased in place.
    """
    for brokenword in BROKENWORDS:  # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4]) }

    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}}

    # These tables do not depend on the entry being tried, so build them once
    # per call instead of inside the matching loops.
    prefixnames = [x["navi"] for x in prefixes]
    infix_choices = [[x["navi"] for x in infixes if x["position"] == position] + [u""] for position in (0, 1, 2)]
    # Adposition text is lower-cased here because the wordlist entries are
    # only lower-cased one by one as the loop below reaches them.
    postfix_candidates = sorted(
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"].lower(), x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP),
        key=lambda x: len(x[0]), reverse=True)

    for word in wordlist:
        word["navi"] = word["navi"].lower()
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            continue  # the entry spans more words than are left
        match = _match_entry(word, splitword, wordin, prefixnames, infix_choices, postfix_candidates)
        if match is not None:
            ret.update(match)
            ret["word"] = word
            return ret

    # Nothing matched (or wordlist is empty): report a clean unknown word.
    ret["pref"] = [[]]
    ret["post"] = [[]]
    ret["inf"] = [u"", u"", u""]
    ret["len"] = False
    return ret

def parsesent(sent):
    """Split a sentence into words and parse each of them.

    The sentence is lower-cased, typographic apostrophes are replaced by
    plain ones, every character other than word characters, ì, ä, the
    apostrophe and the space is removed, and runs of spaces are collapsed.
    Each word is first offered to parsenum.parse; if that returns None, the
    remaining words are handed to parseword, which may consume several words
    for a multi-word dictionary entry.

    Returns the list of parsed word dicts in sentence order. An empty or
    punctuation-only sentence gives an empty list.
    """
    sent = sent.lower().replace(u"’", u"'")
    sent = re.sub(r"[^\wìä' ]", u"", sent)
    sent = re.sub(r"\ +", u" ", sent)
    # Strip after removing punctuation: dropping a detached punctuation mark
    # can leave a space at either end, which would become an empty word.
    sent = sent.strip()
    sent = sent.split(u" ") if sent else []
    ret = []
    left = len(sent)
    while left > 0:
        word = parsenum.parse(sent[len(sent) - left])
        if word is None:
            word = parseword(sent[-left:])
        # A parsed entry may span several words; skip all of them.
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret

# Subversion Repositories navi
#
# (root)/tsimapiak/parse.py - Rev 295