Subversion Repositories navi

Compare Revisions

Ignore whitespace Rev 48 → Rev 174

/tsimapiak/parse.py
0,0 → 1,131
#!/usr/bin/python
# -*- coding: utf-8 -*-
 
import re
import dbconnector
import parsenum
 
# Full word list loaded from the database once at import time;
# parseword() scans it linearly on every call.
wordlist = dbconnector.getnavilist()

# Verb infix candidates for slots <1>, <2> and <3>.  The empty string means
# "no infix in this slot" and must stay last so real infixes are tried first.
infixes1 = (u"awn", u"eyk", u"us", u"äp", u"")
infixes2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
infixes3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
# Prefixes ordered longest-first so the greedy stripping in parseword()
# prefers the longest match (e.g. "tsay" before "say" before "ay" before "a").
prefixes = (u"tsay", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"a")
adpositions = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
# Postfixes are all adpositions plus case/derivational endings, again
# roughly longest-first.
postfixes = adpositions + (u"tsyìp", u"eyä", u"ìri", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

# Lenition table: (plain onset, lenited onset) pairs, tried in order.
lenit = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))
 
def parseword(wordin):
    """Try to match the words in wordin against the dictionary.

    wordin is a list of sentence words; the candidate dictionary entry is
    matched starting at wordin[0] (multi-word entries consume several
    items).  Returns a dict with keys:
      "word" - the matched dictionary entry, or a placeholder with id 0
               and navi "[<first word>]" when nothing matched;
      "pref"/"post" - per-word lists of stripped prefixes/postfixes;
      "inf"  - the three infixes found (empty strings when absent);
      "len"  - True when the match required lenition.
    """
    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}}
    for word in wordlist:
        foundit = True
        foundprefs = []
        foundposts = []
        lenited = False
        splitword = word["infix"].split(u" ")
        foundins = [u"", u"", u""]
        if len(wordin) < len(splitword):
            # Candidate entry has more words than remain in the sentence.
            # BUG FIX: this was a bare `next`, which is just an expression
            # statement (a no-op) in Python -- `continue` is what skips to
            # the next candidate.
            foundit = False
            continue
        for wor in range(len(splitword)):
            if not foundit:
                break
            foundprefs.append([])
            foundposts.append([])
            center = u""
            # Verb entries carry infix markers; try every infix combination
            # until the expanded stem occurs inside the sentence word.
            if u"<1>" in splitword[wor]:
                for in1 in infixes1:
                    for in2 in infixes2:
                        for in3 in infixes3:
                            if splitword[wor].replace(u"<1><2>", in1 + in2).replace(u"<3>", in3) in wordin[wor]:
                                center = splitword[wor].replace(u"<1><2>", in1 + in2).replace(u"<3>", in3)
                                foundins = [in1, in2, in3]
                                break
                        if center != u"":
                            break
                    if center != u"":
                        break
            else:
                if splitword[wor] in wordin[wor]:
                    center = splitword[wor]
            if center == u"":
                # No direct match: retry with a lenited initial consonant.
                for i in lenit:
                    if splitword[wor].startswith(i[0]):
                        temp = i[1] + splitword[wor][len(i[0]):]
                        if temp in wordin[wor]:
                            lenited = True
                            center = temp
            if center == u"":
                # Special-case contracted pronoun stems: nga- -> ng-, po- -> p-.
                if splitword[wor].endswith(u"nga"):
                    temp = splitword[wor][:-3] + u"ng"
                    if temp in wordin[wor]:
                        center = temp
                if splitword[wor].endswith(u"po"):
                    temp = splitword[wor][:-2] + u"p"
                    if temp in wordin[wor]:
                        center = temp
            if center == u"":
                foundit = False
                break
            # The stem must occur exactly once; what is left on either side
            # has to be fully explained by known prefixes/postfixes.
            temp = wordin[wor].split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp
            # Repeatedly strip known prefixes from the right end of the
            # leading fragment until nothing more comes off.
            last = u""
            while last != pref:
                last = pref
                for pre in prefixes:
                    if pref != u"":
                        if pref.endswith(pre):
                            if pre in foundprefs[wor]:
                                break
                            foundprefs[wor].append(pre)
                            pref = pref[:-len(pre)]
                            break
            if pref != u"":
                foundit = False
                break
            # Likewise strip known postfixes from the left end of the
            # trailing fragment.
            last = u""
            while last != posf:
                last = posf
                for pos in postfixes:
                    if posf != u"":
                        if posf.startswith(pos):
                            if pos in foundposts[wor]:
                                break
                            foundposts[wor].append(pos)
                            posf = posf[len(pos):]
                            break
            if posf != u"":
                foundit = False
                break
        if foundit == True:
            foundword = word
            break
    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    ret["len"] = lenited
    if foundit == True:
        ret["word"] = foundword
    return ret
 
def parsesent(sent):
sent = sent.strip().lower().replace(u"’", u"'")
sent = re.sub(ur"[^\wìä' ]",u"",sent)
sent = re.sub(ur"\ +",u" ",sent)
sent = sent.split(u" ")
ret = []
left = len(sent)
while left:
word = parsenum.parse(sent[len(sent)-left])
if word == None:
word = parseword(sent[-left:])
left -= len(word["word"]["navi"].split(" "))
ret.append(word)
return ret
/tsimapiak/translate.py
0,0 → 1,43
# -*- coding: utf-8 -*-
import parse
import dbconnector
 
# Gloss tables: (Na'vi affix, gloss tag) pairs in the same order as the
# matching tuples in parse.py, so every affix parse.py can find has a tag.
infixes1 = ((u"awn", u"P.PART"), (u"eyk", u"CAUS"), (u"us", u"A.PART"), (u"äp", u"REFL."))
infixes2 = ((u"ìyev", u"FUT.SUBJ"), (u"iyev", u"FUT.SUBJ"), (u"ìmìy", u"REC.PAST.REC.FUT"), (u"arm", u"IMPF.PAST"), (u"asy", u"FUT.D"), (u"ilv", u"PRES.PER.SUBJ"), (u"ìmv", u"REC.PAST.SUBJ"), (u"imv", u"PAST.SUBJ"), (u"ìrm", u"IMPF.REC.PAST"), (u"irv", u"PRES.IMPF.SUBJ"), (u"ìsy", u"IMM.FUT.D"), (u"aly", u"PERF.FUT"), (u"ary", u"IMPF.FUT"), (u"ìly", u"PERF.IMM.FUT"), (u"ìry", u"IMPF.IMM.FUT"), (u"ìlm", u"PERF.REC.PAST"), (u"alm", u"PERF.PAST"), (u"am", u"PAST."), (u"ay", u"FUT."), (u"er", u"IMPF."), (u"ìm", u"REC.PAST"), (u"iv", u"SUBJ."), (u"ìy", u"IMM.FUT"), (u"ol", u"PERF."))
infixes3 = ((u"äng", u"PEJ."), (u"ats", u"INFR."), (u"eiy", u"LAUD."), (u"ei", u"LAUD."), (u"uy", u"HON."))
prefixes = ((u"tsay", u"those"), (u"say", u"those-LENTD"), (u"fay", u"these"), (u"fra", u"every"), (u"pxe", u"TRI."), (u"fne", u"type"), (u"tsa", u"that"), (u"sa", u"that-LENTD"), (u"pe", u"what"), (u"fe", u"what-LENTD"), (u"le", u"ADJD."), (u"nì", u"ADVD."), (u"sä", u"INSTD."), (u"tì", u"NOUND."), (u"sì", u"NOUND.-LENTD"), (u"ay", u"PL."), (u"me", u"DU."), (u"fì", u"this"), (u"ke", u"not"), (u"he", u"not-LENTD"), (u"a", u"ADJ.POST"))
adpositions = ((u"mungwrr", u"except"), (u"kxamlä", u"through"), (u"pximaw", u"right.after"), (u"pxisre", u"right.before"), (u"tafkip", u"from.up.among"), (u"nemfa", u"into.inside"), (u"takip", u"from among"), (u"mìkam", u"between"), (u"teri", u"about.concerning"), (u"fkip", u"up.among"), (u"luke", u"without"), (u"pxel", u"like.as"), (u"pxaw", u"around"), (u"rofa", u"beside.alongside"), (u"ìlä", u"by.via.following"), (u"fpi", u"for.the.sake/benefit.of"), (u"ftu", u"from.direction"), (u"kip", u"among"), (u"lok", u"close.to"), (u"maw", u"after.time"), (u"sre", u"before.time"), (u"sìn", u"on.onto"), (u"vay", u"up.to"), (u"eo", u"before.in.front.of"), (u"fa", u"with.by.means.of"), (u"hu", u"with.accompaniment"), (u"io", u"above"), (u"ka", u"across"), (u"mì", u"in.on"), (u"na", u"like.as"), (u"ne", u"to.towards"), (u"ro", u"at.locative"), (u"ta", u"from"), (u"uo", u"behind"), (u"wä", u"against.opposition"), (u"äo", u"below"), (u"to", u"than"), (u"sì", u"and"))
# Postfix glosses cover all adpositions plus the case/derivation endings.
postfixes = adpositions + ((u"tsyìp", u"DIM."), (u"eyä", u"GEN."), (u"ìri", u"TOP."), (u"ìl", u"ERG."), (u"it", u"ACC"), (u"lo", u"MULT."), (u"ri", u"TOP."), (u"ru", u"DAT."), (u"ti", u"ACC."), (u"ur", u"DAT."), (u"ve", u"ORD."), (u"yä", u"GEN."), (u"ya", u"VOC."), (u"tu", u"OBJD."), (u"vi", u"PART."), (u"yu", u"AGENTD."), (u"an", u"MASC."), (u"ng", u"INCL."), (u"ke", u"not"), (u"e", u"FEM."), (u"o", u"INDEF."), (u"l", u"ERG."), (u"t", u"ACC."), (u"y", u"GEN."), (u"a", u"ADJ.PRE"), (u"ä", u"GEN."), (u"r", u"DAT."))
 
def translatesent(sent, lang):
    """Parse a Na'vi sentence and attach a gloss to every word.

    Each word dict from parse.parsesent() gains a "translated" entry:
    the database translation (or the raw Na'vi form for unknown words)
    followed by "-TAG" glosses for every infix, prefix, postfix and for
    lenition.  Returns the annotated list.
    """
    sent = parse.parsesent(sent)
    for word in sent:
        if word["word"]["id"] != 0:
            word["translated"] = dbconnector.translate(word["word"]["id"], lang)
        else:
            # Unknown word: fall back to the Na'vi form itself.
            word["translated"] = word["word"]["navi"]
        # Gloss the three infix slots from their respective tables.
        for slot, table in ((0, infixes1), (1, infixes2), (2, infixes3)):
            found = word["inf"][slot]
            if found != u"":
                for navi, gloss in table:
                    if navi == found:
                        word["translated"] += '-' + gloss
        # pref/post hold one affix list per sentence word of the match,
        # hence the nested iteration.
        for perword in word["pref"]:
            for navf in perword:
                for navi, gloss in prefixes:
                    if navi == navf:
                        word["translated"] += '-' + gloss
        for perword in word["post"]:
            for navf in perword:
                for navi, gloss in postfixes:
                    if navi == navf:
                        word["translated"] += '-' + gloss
        if word["len"]:
            word["translated"] += '-' + 'LENTD'
    return sent
/tsimapiak/parsenum.py
1,81 → 1,129
#!/usr/bin/python
# -*- coding: utf-8 -*-
 
import re
 
# Stand-alone digit words 0-7 (Na'vi numbers are octal).
num = [u"kew",
       u"'aw",
       u"mune",
       u"pxey",
       u"tsìng",
       u"mrr",
       u"pukap",
       u"kinä"]

# Final-digit ("remainder") forms 1-7 used after a power-of-eight word.
rem = [u"aw",
       u"mun",
       u"pey",
       u"sìng",
       u"mrr",
       u"fu",
       u"hin"]

# Multiplier prefixes for the powers of eight; the empty string is the
# implicit "one" (e.g. plain "zam" = 1 * 64).
base = [u"",
        u"me",
        u"pxe",
        u"tsì",
        u"mrr",
        u"pu",
        u"ki"]


# Anchored pattern with one capture group per power of eight
# (zazam=8^4, vozam=8^3, zam=8^2, vo(l)=8^1) and one for the final digit.
# Because base contains u"", each group also matches a bare power word.
numre = \
    u"^(?:(" + "|".join(base) + u")zazam??)?" + \
    u"(?:(" + "|".join(base) + u")vozam??)?" + \
    u"(?:(" + "|".join(base) + u")zam??)?" + \
    u"(?:(" + "|".join(base) + u")vo(?:l(?=a|))?)?" + \
    u"((?:" + "|".join(rem) + u")|" + \
    u"(?:" + "|".join(num) + u"))?$"
numre = re.compile(numre)
 
def parse(numin):
    """Parse a Na'vi number word (regex-based, old API).

    Returns a (value, octal_reading) tuple: the actual value (Na'vi
    numbers are base 8) and the octal digit string read as a decimal
    int.  Returns None for non-unicode, empty or unmatchable input.
    """
    if not isinstance(numin, unicode):
        return None
    if numin == u"":
        return None
    # Normalise common accented-vowel substitutions before matching.
    numin = numin.replace(u"í", u"ì").replace(u"á", u"ä")
    # Was wrapped in a bare `except:`; test the match explicitly instead
    # so unrelated errors are no longer silently swallowed.
    mat = numre.match(numin)
    if mat is None:
        return None
    mat = mat.groups()
    numout = 0  # real value (powers of 8)
    numoct = 0  # octal digit reading (powers of 10)
    # Unmatched groups are None; list.index then raises ValueError and the
    # digit is simply treated as absent.
    try:
        numout += rem.index(mat[4]) + 1
        numoct += rem.index(mat[4]) + 1
    except ValueError:
        try:
            numout += num.index(mat[4])
            numoct += num.index(mat[4])
        except ValueError:
            pass
    try:
        numout += (base.index(mat[3]) + 1) * 8
        numoct += (base.index(mat[3]) + 1) * 10
    except ValueError:
        pass
    try:
        numout += (base.index(mat[2]) + 1) * 8**2
        numoct += (base.index(mat[2]) + 1) * 10**2
    except ValueError:
        pass
    try:
        numout += (base.index(mat[1]) + 1) * 8**3
        numoct += (base.index(mat[1]) + 1) * 10**3
    except ValueError:
        pass
    try:
        numout += (base.index(mat[0]) + 1) * 8**4
        numoct += (base.index(mat[0]) + 1) * 10**4
    except ValueError:
        pass
    return numout, numoct
 
 
if __name__ == "__main__":
    # Manual smoke test: mrr-vol-aw = 5*8 + 1.
    print parse(u"mrrvolaw")
#!/usr/bin/python
# -*- coding: utf-8 -*-
 
# Stand-alone digit words 0-7 (Na'vi numbers are octal).
num = [u"kew",
       u"'aw",
       u"mune",
       u"pxey",
       u"tsìng",
       u"mrr",
       u"pukap",
       u"kinä"]

# Ordinal stems of the digit words, used before the -ve ending.
numord = [u"kew",
          u"'aw",
          u"mu",
          u"pxey",
          u"tsì",
          u"mrr",
          u"pu",
          u"ki"]

# Final-digit ("remainder") forms 1-7 used after a power-of-eight word.
rem = [u"aw",
       u"mun",
       u"pey",
       u"sìng",
       u"mrr",
       u"fu",
       u"hin"]

# Ordinal stems of the remainder forms, used before the -ve ending.
remord = [u"aw",
          u"mu",
          u"pey",
          u"sì",
          u"mrr",
          u"fu",
          u"hi"]

# Multiplier prefixes for the powers of eight; the empty string is the
# implicit "one" (e.g. plain "zam" = 1 * 64).
base = [u"",
        u"me",
        u"pxe",
        u"tsì",
        u"mrr",
        u"pu",
        u"ki"]
 
def parse(numin):
    """Parse a Na'vi number word into the standard word-result dict.

    Returns a dict shaped like parseword()'s result, with extra "dec"
    (actual value; Na'vi numbers are base 8) and "oct" (the octal digit
    string read as a decimal int) keys, or None when numin is not a
    number word.  Recognises the attributive a- prefix/suffix and the
    ordinal -ve ending.
    """
    # Guard: parsesent() can hand us an empty token, and numin[0] below
    # would raise IndexError on it (the old revision had this check).
    if numin == u"":
        return None
    # A doubled m never occurs inside a well-formed number word.
    if u"mm" in numin:
        return None
    # The attributive a- cannot appear on both sides at once.
    if (numin[0] == u"a") and (numin[len(numin) - 1] == u"a"):
        return None
    prefs = []
    posts = []
    outoct = 0
    outdec = 0
    ret = {"word": {"id": 0, "navi": u"", "infix": u"", "type": u""}, "pref": [prefs], "post": [posts], "inf": [u"", u"", u""], "len": False, "dec": outdec, "oct": outoct}
    # Strip attributive a- (either side) and the ordinal ending -ve.
    if numin[0] == u"a":
        prefs.append(u"a")
        numin = numin[1:]
    if numin[len(numin) - 1] == u"a":
        posts.append(u"a")
        numin = numin[:-1]
    if numin[-2:] == u"ve":
        posts.append(u"ve")
        numin = numin[:-2]
    # Single-digit numbers: the whole remaining word must match; ordinals
    # use the short stems and get a trailing dot in the display form.
    for n in range(len(num)):
        if u"ve" in posts:
            if numin == numord[n]:
                outoct = n
                outdec = n
                ret["word"]["navi"] = unicode(outdec) + u"."
                ret["dec"] = outdec
                ret["oct"] = outoct
                return ret
        else:
            if numin == num[n]:
                outoct = n
                outdec = n
                ret["word"]["navi"] = unicode(outdec)
                ret["dec"] = outdec
                ret["oct"] = outoct
                return ret
    # Multi-digit numbers: consume the powers of eight from the left.
    # "dec" accumulates the real value (weights 8**k); "oct" accumulates
    # the octal digit string read as a decimal number (weights 10**k).
    for n in range(len(base)):
        if numin.startswith(base[n] + u"zazam"):
            outoct += (n + 1) * (10 ** 4)
            outdec += (n + 1) * (8 ** 4)
            numin = numin[len(base[n]) + 5:]
    for n in range(len(base)):
        if numin.startswith(base[n] + u"vozam"):
            outoct += (n + 1) * (10 ** 3)
            outdec += (n + 1) * (8 ** 3)
            numin = numin[len(base[n]) + 5:]
    for n in range(len(base)):
        if numin.startswith(base[n] + u"zam"):
            outoct += (n + 1) * (10 ** 2)
            outdec += (n + 1) * (8 ** 2)
            numin = numin[len(base[n]) + 3:]
    for n in range(len(base)):
        # "vol" is used before a vowel, plain "vo" otherwise.
        if numin.startswith(base[n] + u"vol"):
            outoct += (n + 1) * 10
            outdec += (n + 1) * 8
            numin = numin[len(base[n]) + 3:]
        if numin.startswith(base[n] + u"vo"):
            outoct += (n + 1) * 10
            outdec += (n + 1) * 8
            numin = numin[len(base[n]) + 2:]
    # Trailing remainder digit 1-7 (ordinal stems when -ve was stripped).
    for n in range(len(rem)):
        if u"ve" in posts:
            if numin == remord[n]:
                outoct += n + 1
                outdec += n + 1
                numin = u""
        else:
            if numin == rem[n]:
                outoct += n + 1
                outdec += n + 1
                numin = u""
    # Only a fully-consumed word is a valid number.
    if numin == u"":
        ret["word"]["navi"] = unicode(outdec) if not u"ve" in posts else unicode(outdec) + u"."
        ret["dec"] = outdec
        ret["oct"] = outoct
        return ret
    else:
        return None
 
if __name__ == "__main__":
    # Manual smoke test: me-vol-aw-ve = ordinal of 2*8 + 1.
    print parse(u"mevolawve")
/tsimapiak/dbconnector.py
9,44 → 9,24
current = u""
db = tornado.database.Connection("127.0.0.1", "navi", user="navi", password="navi")
for row in db.query("""
SELECT *, CHAR_LENGTH(navi) AS NL
SELECT *
FROM `metaWords`
ORDER BY NL DESC"""):
if row["partOfSpeech"] in (u"v.", u"vin.", u"vtr."):
current = unicode(row["ipa"])
current = current.replace(ur"ɛ",ur"e").replace(ur".",ur"").replace(ur"ɾ",ur"r") \
.replace(ur"ɪ",ur"ì").replace(ur"ˈ",ur"").replace(ur"'",ur"x") \
.replace(ur"ŋ",ur"ng").replace(ur"j",ur"y").replace(ur"ʔ",ur"'") \
.replace(ur"æ",ur"ä").replace(ur"ˌ",ur"").replace(ur"\t{ts}",ur"ts") \
.replace(ur"ṛ",ur"rr").replace(ur"ḷ",ur"ll").replace(ur"k̚",ur"k ") \
.replace(ur"p̚",ur"p ").replace(ur"t̚",ur"t ").replace(ur"'̚",ur"' ") \
.replace(u"\\",ur"").replace(ur"(",ur"").replace(ur")",ur"") \
.replace(ur"[",ur"").replace(ur"]",ur"").replace(ur" "," ") \
.strip()
current = re.sub(ur" or.*","",current)
current = re.sub(ur"z(.*)engk(.*)e",ur"z\1enk\2e",current)
current = re.sub(ur"t(.*)ì(m|n)\ ",ur"t\1ìng ",current)
current = current.split(ur"$cdot$")
if len(current) == 3:
current = current[0] + u"<0><1>" + current[1] + u"<2>" + current[2]
elif len(current) == 2:
current = current[0] + u"<0><1><2>" + current[1]
else:
current = u"<0><1><2>" + current[0]
WHERE partOfSpeech <> 'num.' AND partOfSpeech <> "prefix"
ORDER BY CHAR_LENGTH(navi) DESC"""):
if row["infixes"]:
ret.append({"id": row["id"], "navi": row["navi"], "infix": row["infixes"].lower(), "type": row["partOfSpeech"]})
else:
current = unicode(row["navi"])
ret.append([row["id"], row["navi"], current, row["partOfSpeech"]])
ret.append({"id": row["id"], "navi": row["navi"], "infix": row["navi"].lower(), "type": row["partOfSpeech"]})
db.close()
return ret
 
def getnavi(word):
ret = []
def translate(wid,language):
db = tornado.database.Connection("127.0.0.1", "navi", user="navi", password="navi")
for row in db.query("""
SELECT *
FROM `metaWords`
WHERE navi = ?""",word):
ret.append([row["id"],row["navi"], row["infix"], row["partOfSpeech"]])
FROM `localizedWords`
WHERE id = %s AND languageCode = %s""",wid,language):
ret = row["localized"]
break
db.close()
return ret
return ret
/webapp/static/favicon.ico
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
/webapp/static/favicon.ico
Property changes:
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: webapp/templates/translate.html
===================================================================
--- webapp/templates/translate.html (nonexistent)
+++ webapp/templates/translate.html (revision 174)
@@ -0,0 +1,68 @@
+{% extends "base.html" %}
+
+{% block title %}Translator{% end %}
+
+{% block body %}
+Na'vi sentence:
+
+
+
+
+
+
+
+
+
+
+
+{% if out %}
+
+
+ Words
+ Translated
+ Parts
+ Data
+
+{% for wor in out %}
+
+ {{ wor["word"]["navi"] }}
+ {{ wor["translated"] }}
+ Infixes:
+ {{ u", ".join(wor["inf"]) }}
+
+
+ Prefixes:
+ {{ u"; ".join(u", ".join(x) for x in wor["pref"]) }}
+
+
+ Postfixes:
+ {{ u"; ".join(u", ".join(x) for x in wor["post"]) }}
+
+
+ Lenited:
+ {{ str(wor["len"]) }}
+
+{% end %}
+
+{% end %}
+

This program uses Eana Eltu for the list of words and infix positions (but nothing else), created by Tuiq and Taronyu. Thanks also go to the rest of the Learn Na'vi community!

+
+{% if lang != "eng" %}
+
+{% end %}
+{% end %}
Index: webapp/templates/parse.html
===================================================================
--- webapp/templates/parse.html (nonexistent)
+++ webapp/templates/parse.html (revision 174)
@@ -0,0 +1,43 @@
+{% extends "base.html" %}
+
+{% block title %}Sentence parser{% end %}
+
+{% block body %}
+Na'vi sentence:
+
+
+
+
+{% if out %}
+
+
+ Words
+ Parts
+ Data
+
+{% for wor in out %}
+
+ {{ wor["word"]["navi"] }}
+ Infixes:
+ {{ u", ".join(wor["inf"]) }}
+
+
+ Prefixes:
+ {{ u"; ".join(u", ".join(x) for x in wor["pref"]) }}
+
+
+ Postfixes:
+ {{ u"; ".join(u", ".join(x) for x in wor["post"]) }}
+
+
+ Lenited:
+ {{ str(wor["len"]) }}
+
+{% end %}
+
+{% end %}
+

This program uses Eana Eltu for the list of words and infix positions (but nothing else), created by Tuiq and Taronyu. Thanks also go to the rest of the Learn Na'vi community!

+
+{% end %}
Index: webapp/templates/base.html
===================================================================
--- webapp/templates/base.html (revision 48)
+++ webapp/templates/base.html (revision 174)
@@ -1,6 +1,7 @@
Tsim Apiak - {% block title %}Title{% end %}
+