Subversion Repositories navi

Rev

Rev 294 | Rev 297 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

#!/usr/bin/python
# -*- coding: utf-8 -*-
#    This file is part of Tsim Apiak.
#
#    Tsim Apiak is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public Licence as published by
#    the Free Software Foundation, either version 3 of the Licence, or
#    (at your option) any later version.
#
#    In addition to this, you must also comply with clause 4 of the
#    Apache Licence, version 2.0, concerning attribution. Where there
#    is a contradiction between the two licences, the GPL
#    takes preference.
#
#    Tsim Apiak is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.

import tsimapiak.dbconnector as dbconnector
import tsimapiak.parsenum as parsenum
import re

#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
# Dictionary entries as returned by dbconnector.getnavilist(). The code below
# reads the "id", "navi", "infix", "type" and "lenited" keys of every entry.
wordlist = dbconnector.getnavilist()

# Affix tables as returned by dbconnector.getaffixlists(). Entries are read
# through their "navi" key; prefixes additionally through "id" and "gloss",
# infixes additionally through "position".
prefixes, infixes, postfixes = dbconnector.getaffixlists()

# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! Words that can take
# affixes should go in the main wordlist above instead (see the commented-out
# example there).
#
# Field order of each entry, as consumed by parseword():
#   0: original surface form (compared against the input token)
#   1: Na'vi root (looked up in wordlist to obtain id and type)
#   2: 0-pos infix
#   3: 1-pos infix
#   4: 2-pos infix
#   5: prefixes
#   6: suffixes
#   7: value returned under the "len" key of the parse result
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (()), (()), False), # otherwise parses as sa (tsa-lenited) + mi
    #(u"to", u"to", u"", u"", u"", (()), (()), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]],  (()), False),
    (u"soaiä", u"soaia", u"", u"", u"", (()), [[(u"ä", None)]], False), # does not parse, irregular form
    #(u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (()), False),
    #(u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (()), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (()), (()), False), # otherwise parses as kìm (spin) + ä (genitive)
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (()), False), # otherwise parses as apxa + -y (genitive)
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (()), False),
    #(u"kawnga", u"kawng", u"", u"", u"", (()), [[(u"a", None)]], False),
    #(u"kawng", u"kawng", u"", u"", u"", (()), (()), False),
    #(u"ka", u"ka", u"", u"", u"", (()), (()), False),
    #(u"uo", u"uo", u"", u"", u"", (()), (()), False),
    #(u"sìk", u"sìk", u"", u"", u"", (()), (()), False),
    #(u"sim", u"sim", u"", u"", u"", (()), (()), False), # probably not tsim lenited
    (u"tse", u"tse", u"", u"", u"", (()), (()), False), # otherwise parses as tsa'u abbreviated (special case)
)

#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

# Additional infixes merged into the database-provided infix list further
# down. They use the same keys as the database entries ("id", "navi", "gloss",
# "position").
# NOTE(review): the negative string ids are presumably chosen so they cannot
# collide with database ids - confirm against dbconnector.
EXTRAINFIXES = [
    {"id": "-1", "navi": "eiy", "gloss": "LAUD.", "position": 2},
    {"id": "-2", "navi": "eng", "gloss": "PEJ.", "position": 2},
]

# Additional postfix merged into the database-provided postfix list further
# down.
EXTRAPOSTFIXES = [
    {"id": "-3", "navi": "eyä", "gloss": "GEN."},
]

# Words that act like adpositions but technically aren't. Each one is paired
# with the id of the first dictionary entry spelled exactly like it.
EXTRAADP = tuple(
    (navi, [entry["id"] for entry in wordlist if entry["navi"] == navi][0])
    for navi in ("to", "sì")
)

# Lenition rules as (initial sound, lenited form). The digraphs are listed
# before the single letters they start with, so they are tried first.
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)

# Add a lenited spelling for every prefix that begins with a lenitable sound.
# Only the first rule of LENIT that applies is used for a given prefix.
extraprefixes = []
for prefix in prefixes:
    navi = prefix["navi"]
    rule = next((pair for pair in LENIT if navi.startswith(pair[0])), None)
    if rule is None:
        continue
    lenited = navi.replace(rule[0], rule[1], 1)
    # always assume a dictionary word over a lenited prefix
    if any(other["navi"] == lenited for other in prefixes):
        continue
    extraprefixes.append({
        "id": prefix["id"],
        "navi": lenited,
        "gloss": prefix["gloss"] + ".LENTD",
    })

# Longest affixes first, so that matching code walking these lists in order
# meets the longest candidate before any shorter one.
prefixes = prefixes + extraprefixes
prefixes.sort(key=lambda entry: len(entry["navi"]), reverse=True)
infixes = infixes + EXTRAINFIXES
infixes.sort(key=lambda entry: len(entry["navi"]), reverse=True)
postfixes = postfixes + EXTRAPOSTFIXES
postfixes.sort(key=lambda entry: len(entry["navi"]), reverse=True)

# Let's lenit the dictionary: every word that begins with a lenitable sound
# gets an extra entry with the lenited spelling, sharing the id and type of
# the original and flagged with "lenited": True.
extrawords = []
for word in wordlist:
    for letter, replacement in LENIT:
        if word["navi"].startswith(letter):
            new_word = word["navi"].replace(letter, replacement, 1)
            new_infix = word["infix"].replace(letter, replacement, 1)
            extrawords.append({"id": word["id"], "navi": new_word, "infix": new_infix, "type": word["type"], "lenited": True})
            # Only the first matching rule applies (same as for the prefixes).
            # Without this, a word starting with a digraph such as "tx" would
            # also match the single-letter rule "t" and produce a bogus
            # second entry ("sx...").
            break
# Longest words first; among words of equal length the unlenited entry sorts
# ahead of the lenited one.
wordlist = sorted(wordlist + extrawords, key=lambda x: len(x["navi"]) * 2 + (0 if x["lenited"] else 1), reverse=True)

# Alternate spellings of a dictionary form that are also accepted as the core
# of a token: (ending of the dictionary form, characters to cut, replacement).
# All applicable rows are tried in order and the last one that matches wins.
_ALTERNATE_STEMS = (
    (u"nga", 3, u"nge"),
    (u"fo", 2, u"fe"),
    (u"po", 2, u"pe"),
    (u"tsa", 3, u"tse"),
    (u"fko", 3, u"fke"),
    (u"sa'u", 4, u"se"),
    (u"sa", 2, u"se"),
    (u"sno", 3, u"sne"),
    (u"ayla", 4, u"ayle"),
)


def _fill_infixes(pattern, in1, in2, in3):
    """Return *pattern* with its infix placeholders replaced by the given infixes."""
    filled = pattern.replace(u"<0><1>", in1 + in2).replace(u"<2>", in3)
    return filled.replace(u"lll", u"l").replace(u"rrr", u"r")


def _match_infixed(pattern, token, slots, current):
    """Find the first infix combination whose filled-in pattern occurs in *token*.

    *slots* holds the candidate infixes for positions 0, 1 and 2. Returns
    ``(center, infixes)``; when nothing matches, center is empty and
    *current* is handed back unchanged.
    """
    # Only infixes that occur somewhere in the token can be part of a match.
    candidates = [[infix for infix in slot if infix in token] for slot in slots]
    center = u""
    found = current
    for in1 in candidates[0]:
        for in2 in candidates[1]:
            for in3 in candidates[2]:
                filled = _fill_infixes(pattern, in1, in2, in3)
                if filled in token:
                    center = filled
                    found = [in1, in2, in3]
                    break
            if center != u"":
                break
        if center != u"":
            break
    return center, found


def _match_plain(form, token):
    """Return *form*, or one of its alternate spellings, if it occurs in *token*; else u""."""
    center = form if form in token else u""
    if center == u"":
        for ending, cut, replacement in _ALTERNATE_STEMS:
            if form.endswith(ending):
                alternate = form[:-cut] + replacement
                if alternate in token:
                    center = alternate
    return center


def _strip_prefixes(pref, names, found):
    """Peel known prefixes off the end of *pref*, recording them in *found*.

    Returns whatever could not be explained (u"" on success).
    """
    last = u""
    while last != pref:
        last = pref
        for pre in names:
            if pref != u"" and pref.endswith(pre):
                # A prefix is accepted only once per word part.
                if pre not in found:
                    found.append(pre)
                    pref = pref[:-len(pre)]
                break
    return pref


def _strip_postfixes(posf, candidates, found, root):
    """Peel known postfixes off the start of *posf*, recording them in *found*.

    *candidates* is a list of ``(text, id)`` pairs, longest text first.
    Returns whatever could not be explained (u"" on success).
    """
    last = u""
    while last != posf:
        last = posf
        for pos, posid in candidates:
            if posf != u"" and posf.startswith(pos):
                # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                blocked = pos == u"ä" and (root == u"pey" or root == "fey")
                # A postfix is accepted only once per word part.
                if (pos, posid) not in found and not blocked:
                    found.append((pos, posid))
                    posf = posf[len(pos):]
                break
    return posf


def parseword(wordin):
    """Parse the word at the front of *wordin* into root, affixes and infixes.

    *wordin* is a list of tokens. Dictionary entries whose "infix" field
    contains spaces are matched against that many leading tokens; all other
    entries only look at ``wordin[0]``.

    Returns a dict with the keys:
      "word" - the matching dictionary entry. For an unknown word this is a
               placeholder with id 0 and the token wrapped in square
               brackets as "navi".
      "pref" - one list of prefixes per matched token.
      "post" - one list of (postfix, id) pairs per matched token; id is None
               for entries of the postfix table.
      "inf"  - the infixes found for positions 0, 1 and 2.
      "len"  - the "lenited" flag of the matched entry (False when unknown).

    NOTE: for an unknown word "pref", "post" and "inf" hold whatever the last
    dictionary entry tried left behind; callers should not rely on them.
    """
    for brokenword in BROKENWORDS: # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            # No early exit: if several entries share the root spelling, the
            # last one in wordlist supplies id and type.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4]) }

    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}, "len": False}

    # Lookup tables that do not change while one word is being parsed.
    infix_slots = [[x["navi"] for x in infixes if x["position"] == position] + [u""] for position in (0, 1, 2)]
    prefix_names = [x["navi"] for x in prefixes]
    # Adposition spellings are lower-cased here because the loop below
    # lower-cases the dictionary entries they come from as it goes along.
    postfix_candidates = sorted(
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"].lower(), x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP),
        key=lambda x: len(x[0]), reverse=True)

    # Defined up front so that an empty wordlist yields the "unknown" result.
    foundit = False
    foundword = None
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]

    for word in wordlist:
        word["navi"] = word["navi"].lower()
        foundit = True
        foundprefs = []
        foundposts = []
        foundins = [u"", u"", u""]
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough tokens left for this multi-word entry.
            foundit = False
            continue
        for wor, pattern in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])
            if u"<0>" in pattern:
                center, foundins = _match_infixed(pattern, token, infix_slots, foundins)
            else:
                center = _match_plain(pattern, token)
            if center == u"":
                foundit = False
                break
            # The core must occur exactly once so that the token splits
            # cleanly into "prefix part" and "postfix part".
            temp = token.split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp
            if _strip_prefixes(pref, prefix_names, foundprefs[wor]) != u"":
                foundit = False
                foundprefs = []
                break
            if _strip_postfixes(posf, postfix_candidates, foundposts[wor], word["navi"]) != u"":
                foundit = False
                foundposts = []
                break
        if foundit:
            foundword = word
            break

    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    if foundit:
        ret["len"] = foundword["lenited"]
        ret["word"] = foundword
    return ret

def parsesent(sent):
    """Split the sentence *sent* into words and parse each of them.

    The text is lower-cased, typographic apostrophes are turned into plain
    ones, every character that is neither a word character, an apostrophe
    nor a space is dropped, and runs of spaces are collapsed. Each token is
    first offered to parsenum.parse(); if that returns None the remaining
    tokens are handed to parseword().

    Returns a list with one parse result per recognised word. A result may
    cover several tokens: as many as its ``["word"]["navi"]`` has
    space-separated parts.
    """
    sent = sent.strip().lower().replace(u"’", u"'")
    sent = re.sub(r"[^\wìä' ]", u"", sent)
    # Strip again after the clean-up: removing leading or trailing
    # punctuation (as in "- kaltxì") can expose spaces at the edges, which
    # would otherwise become empty tokens.
    sent = re.sub(r"\ +", u" ", sent).strip()
    tokens = sent.split(u" ")
    ret = []
    position = 0
    # Stop once all tokens are consumed, even if the last result claims more
    # tokens than were left.
    while position < len(tokens):
        word = parsenum.parse(tokens[position])
        if word is None:
            word = parseword(tokens[position:])
        position += len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret