# WebSVN - navi - Rev 296 - /tsimapiak/parse.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# This file is part of Tsim Apiak.
#
# Tsim Apiak is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public Licence as published by
# the Free Software Foundation, either version 3 of the Licence, or
# (at your option) any later version.
#
# In addition to this, you must also comply with clause 4 of the
# Apache Licence, version 2.0, concerning attribution. Where there
# is a contradiction between the two licences, the GPL
# takes preference.
#
# Tsim Apiak is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.

import tsimapiak.dbconnector as dbconnector
import tsimapiak.parsenum as parsenum
import re

# Alternative kept for reference: the same list padded with extra proper nouns.
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
# Dictionary entries from the database.  The code in this module reads the
# "id", "navi", "infix", "type" and "lenited" keys of every entry.
wordlist = dbconnector.getnavilist()

# Affix tables from the database.  This module reads "navi" on every affix,
# "id" and "gloss" on prefixes, and "position" on infixes.
prefixes, infixes, postfixes = dbconnector.getaffixlists()

# XXX HACK - Words that are either missing from Eana Eltu or that the parser
# gets wrong for whatever reason.  Entries of the second kind should be
# removed once the parser gets more sophisticated; entries of the first kind
# also need a matching entry in the equivalent array in the translator.
# Anything that can take affixes or infixes belongs in the main wordlist
# instead (see the commented-out example next to its definition).
#
# Field order: original form, Na'vi root, 0-pos infix, 1-pos infix,
# 2-pos infix, prefixes, suffixes, and a final flag that parseword() returns
# as the "len" value of the result.
BROKENWORDS = (
    # otherwise parses as sa (tsa-lenited) + mi
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    #(u"to", u"to", u"", u"", u"", (()), (()), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]], (()), False),
    # does not parse, irregular form
    (u"soaiä", u"soaia", u"", u"", u"", (), [[(u"ä", None)]], False),
    #(u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (()), False),
    #(u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (()), False),
    # otherwise parses as kìm (spin) + ä (genitive)
    (u"kìmä", u"kä", u"", u"ìm", u"", (), (), False),
    # otherwise parses as apxa + -y (genitive)
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (), False),
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (()), False),
    #(u"kawnga", u"kawng", u"", u"", u"", (()), [[(u"a", None)]], False),
    #(u"kawng", u"kawng", u"", u"", u"", (()), (()), False),
    #(u"ka", u"ka", u"", u"", u"", (()), (()), False),
    #(u"uo", u"uo", u"", u"", u"", (()), (()), False),
    #(u"sìk", u"sìk", u"", u"", u"", (()), (()), False),
    #(u"sim", u"sim", u"", u"", u"", (()), (()), False), # probably not tsim lenited
    # otherwise parses as tsa'u abbreviated (special case)
    (u"tse", u"tse", u"", u"", u"", (), (), False),
)

#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

# Additional position-2 infix forms; they are appended to the infix table
# loaded from the database before it is sorted.
EXTRAINFIXES = [
    dict(id="-1", navi="eiy", gloss="LAUD.", position=2),
    dict(id="-2", navi="eng", gloss="PEJ.", position=2),
]

# Additional suffix form; appended to the database's postfix table before it
# is sorted.
EXTRAPOSTFIXES = [
    dict(id="-3", navi="eyä", gloss="GEN."),
]

# Words that act like adpositions but technically aren't.  Each is stored as
# a (navi, id) pair, where id belongs to the first dictionary entry with that
# exact "navi" value; a word missing from the dictionary raises IndexError
# when the module is loaded.
EXTRAADP = tuple(
    (navi, [entry["id"] for entry in wordlist if entry["navi"] == navi][0])
    for navi in ("to", "sì")
)

# Lenition table as (initial, lenited initial) pairs.  The two-letter
# initials come before the single letters they begin with, so a scan that
# stops at the first match picks the digraph rule.
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)

# Build the lenited variant of every prefix that starts with a lenitable
# sound.  A variant is only added when no database prefix already has that
# spelling - always assume a dictionary word over a lenited prefix.
extraprefixes = []
_known_prefix_forms = set(entry["navi"] for entry in prefixes)
for prefix in prefixes:
    form = prefix["navi"]
    for letter, replacement in LENIT:
        if not form.startswith(letter):
            continue
        new_prefix = form.replace(letter, replacement, 1)
        if new_prefix not in _known_prefix_forms:
            extraprefixes.append({
                "id": prefix["id"],
                "navi": new_prefix,
                "gloss": prefix["gloss"] + ".LENTD",
            })
        # only the first LENIT rule that fits is applied
        break


def _navi_length(entry):
    # Sort key: length of the affix's spelling.
    return len(entry["navi"])


# Merge in the extra forms and order every affix table longest-first.
prefixes = prefixes + extraprefixes
prefixes.sort(key=_navi_length, reverse=True)
infixes = infixes + EXTRAINFIXES
infixes.sort(key=_navi_length, reverse=True)
postfixes = postfixes + EXTRAPOSTFIXES
postfixes.sort(key=_navi_length, reverse=True)

# Let's lenit the dictionary: every word that starts with a lenitable sound
# gets one extra entry spelled with the lenited initial and flagged
# "lenited": True.  It shares the id and type of the word it came from.
extrawords = []
for word in wordlist:
    for letter, replacement in LENIT:
        if word["navi"].startswith(letter):
            new_word = word["navi"].replace(letter, replacement, 1)
            new_infix = word["infix"].replace(letter, replacement, 1)
            extrawords.append({"id": word["id"], "navi": new_word, "infix": new_infix, "type": word["type"], "lenited": True})
            # Stop at the first rule that fits, as the prefix loop does.
            # Without this a word in "px", "tx", "kx" or "ts" would also
            # match the single-letter rule further down LENIT and gain a
            # second, bogus form (e.g. "tsa" -> "sa" and also "ssa").
            break
# Longest spelling first; at equal length the unlenited entry sorts ahead of
# a lenited one.
wordlist = sorted(wordlist + extrawords, key=lambda x: len(x["navi"]) * 2 + (0 if x["lenited"] else 1), reverse=True)

# Word endings whose final vowel turns into "e" in some inflected forms, as
# (dictionary ending, inflected ending) pairs.  Every pair is tried in order
# and the last one that fits wins.
_VOWEL_SHIFTS = (
    (u"nga", u"nge"),
    (u"fo", u"fe"),
    (u"po", u"pe"),
    (u"tsa", u"tse"),
    (u"fko", u"fke"),
    (u"sa'u", u"se"),
    (u"sa", u"se"),
    (u"sno", u"sne"),
    (u"ayla", u"ayle"),
)


def _fill_infixes(pattern, token, slots):
    """Return the first infixed spelling of *pattern* contained in *token*.

    *pattern* carries the "<0><1>" and "<2>" placeholders; *slots* holds the
    candidate infixes for positions 0, 1 and 2.  Combinations are tried in
    slot order, using only candidates that occur somewhere in *token*.
    Returns ``(spelling, [in0, in1, in2])``, or ``(u"", None)`` when no
    combination fits.
    """
    usable = [[affix for affix in slot if affix in token] for slot in slots]
    for in1 in usable[0]:
        for in2 in usable[1]:
            for in3 in usable[2]:
                filled = (pattern.replace(u"<0><1>", in1 + in2)
                          .replace(u"<2>", in3)
                          .replace(u"lll", u"l")
                          .replace(u"rrr", u"r"))
                if filled in token:
                    return filled, [in1, in2, in3]
    return u"", None


def parseword(wordin):
    """Identify the dictionary word at the start of *wordin*.

    *wordin* is a non-empty list of lower-case tokens: the word to parse
    followed by the rest of the sentence, so that dictionary entries made of
    several space-separated parts can match consecutive tokens.

    Returns a dict with the keys
      "word" - the matching dictionary entry, or a placeholder with id 0 and
               the first token in square brackets when nothing matched;
      "pref" - per matched part, the list of prefixes stripped from it;
      "post" - per matched part, the list of (suffix, id) pairs stripped
               from it (id is None for entries of the postfix table);
      "inf"  - the position 0, 1 and 2 infixes that were found;
      "len"  - the entry's "lenited" flag (False when nothing matched).

    Tokens listed in BROKENWORDS bypass the matching and return the values
    recorded in that table.

    NOTE(review): when nothing matches, "pref", "post" and "inf" hold
    whatever the last dictionary entry tried left behind; this is kept as it
    was.

    Side effect: the "navi" value of every dictionary entry visited is
    lower-cased in place.
    """
    for brokenword in BROKENWORDS: # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] != brokenword[0]:
            continue
        tempid = 0
        temptype = u""
        # no break: the last dictionary entry spelled like the root wins
        for word in wordlist:
            if brokenword[1] == word["navi"]:
                tempid = word["id"]
                temptype = word["type"]
        return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4]) }

    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}, "len": False}

    # Candidate lists are the same for every dictionary entry, so they are
    # built once here instead of inside the loops below.
    infix_slots = [[x["navi"] for x in infixes if x["position"] == slot] + [""] for slot in range(3)]
    prefix_forms = [x["navi"] for x in prefixes]
    # Suffix candidates, longest first: the postfix table, dictionary
    # adpositions and the adposition-like extras.  Adpositions are
    # lower-cased here because the loop below lower-cases entries in place
    # as it reaches them.
    postfix_forms = sorted(
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"].lower(), x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP),
        key=lambda x: len(x[0]), reverse=True)

    foundword = None
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    for word in wordlist:
        word["navi"] = word["navi"].lower()
        foundit = True
        foundprefs = []
        foundposts = []
        foundins = [u"", u"", u""]
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # not enough tokens left for this multi-part entry
            continue
        for wor, pattern in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])

            # 1. Locate the (possibly infixed or vowel-shifted) stem.
            center = u""
            if u"<0>" in pattern:
                center, ins = _fill_infixes(pattern, token, infix_slots)
                if ins is not None:
                    foundins = ins
            elif pattern in token:
                center = pattern
            else:
                for ending, shifted in _VOWEL_SHIFTS:
                    if pattern.endswith(ending):
                        # Strip the whole ending.  The "ayla" rule used to
                        # strip only three of its four letters, building
                        # "aayle", which never matched.
                        temp = pattern[:-len(ending)] + shifted
                        if temp in token:
                            center = temp
            if center == u"":
                foundit = False
                break

            # 2. The stem must occur exactly once; whatever surrounds it has
            #    to be fully explained by affixes.
            temp = token.split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp

            # 3. Peel prefixes off the right-hand end until nothing changes.
            last = u""
            while last != pref:
                last = pref
                for pre in prefix_forms:
                    if pref != u"" and pref.endswith(pre):
                        if pre not in foundprefs[wor]:
                            foundprefs[wor].append(pre)
                            pref = pref[:-len(pre)]
                        break
            if pref != u"":
                foundit = False
                foundprefs = []
                break

            # 4. Peel suffixes off the left-hand end the same way.
            last = u""
            while last != posf:
                last = posf
                for pos, posid in postfix_forms:
                    if posf != u"" and posf.startswith(pos):
                        # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                        blocked = pos == u"ä" and (word["navi"] == u"pey" or word["navi"] == "fey")
                        if (pos, posid) not in foundposts[wor] and not blocked:
                            foundposts[wor].append((pos, posid))
                            posf = posf[len(pos):]
                        break
            if posf != u"":
                foundit = False
                foundposts = []
                break
        if foundit:
            foundword = word
            break

    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    if foundword is not None:
        ret["len"] = foundword["lenited"]
        ret["word"] = foundword
    return ret

def parsesent(sent):
    """Parse a sentence into a list of word results.

    The text is lower-cased, typographic apostrophes are replaced by "'",
    and every character that is not a word character, "ì", "ä", "'" or
    whitespace is dropped.  The remainder is split on whitespace.

    Each token is first offered to parsenum.parse(); when that returns None
    the token, together with the tokens after it, goes to parseword().  A
    result whose word["navi"] contains several space-separated parts consumes
    that many tokens.

    Returns the results in sentence order; a sentence with no usable
    characters gives an empty list.
    """
    sent = sent.lower().replace(u"’", u"'")
    # Whitespace is kept so tabs and newlines still separate words; they used
    # to be deleted outright, gluing the neighbouring words together.
    sent = re.sub(r"[^\wìä'\s]", u"", sent)
    # split() without an argument drops empty tokens, so punctuation standing
    # alone at either end of the sentence no longer becomes an empty word.
    words = sent.split()
    ret = []
    position = 0
    while position < len(words):
        word = parsenum.parse(words[position])
        if word is None:
            word = parseword(words[position:])
        # a multi-part entry consumes one token per part
        position += len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret

# Subversion Repositories navi
#
# (root)/tsimapiak/parse.py @ 146 - Rev 296