Subversion Repositories: navi

Compare Revisions

Rev 48 → Rev 133

/tsimapiak/parsenum.py
12,6 → 12,15
u"pukap",
u"kinä"]
 
numord = [u"kewve",
u"'awve",
u"muve",
u"pxeyve",
u"tsìve",
u"mrrve",
u"puve",
u"kive"]
 
rem = [u"aw",
u"mun",
u"pey",
20,6 → 29,14
u"fu",
u"hin"]
 
remord = [u"awve",
u"muve",
u"peyve",
u"sìve",
u"mrrve",
u"fuve",
u"hive"]
 
base = [u"",
u"me",
u"pxe",
30,20 → 47,17
 
 
numre = \
-u"^(?:(" + "|".join(base) + u")zazam??)?" + \
-u"(?:(" + "|".join(base) + u")vozam??)?" + \
-u"(?:(" + "|".join(base) + u")zam??)?" + \
-u"(?:(" + "|".join(base) + u")vo(?:l(?=a|))?)?" + \
-u"((?:" + "|".join(rem) + u")|" + \
-u"(?:" + "|".join(num) + u"))?$"
+u"^(a?)(?:(?:(" + "|".join(base) + u")zaza(?=m)(?:ve(?=$))?)?" + \
+u"(?:(" + "|".join(base) + u")voza(?=m)(?:ve(?=$))?)?" + \
+u"(?:(" + "|".join(base) + u")za(?=m)(?:ve(?=$))?)?" + \
+u"(?:(" + "|".join(base) + u")(?:vol|vo(?=a|$))(?:ve(?=$))?)?" + \
+u"(?:" + "|".join(remord + rem) + u"))|" + \
+u"(?:" + "|".join(numord + num) + u")?(a?)$"
numre = re.compile(numre)
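# Editorial note on the pattern: each optional group matches one octal
# place -- zazam marks the 8**4 digit, vozam 8**3, zam 8**2, vo(l) 8**1 --
# and the final alternation matches the units digit from rem/num (with
# remord/numord added in this revision for ordinal forms).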
 
def parse(numin):
    if type(numin) != unicode:
        return None  # assumed guard body; this line is not shown in the hunk
    if numin in (u"a", u"aa", u"ave", u"avea", u"ve", u"vea"):
        return None
    if numin == u"":
        return None
    numin = numin.replace(u"í", u"ì").replace(u"á", u"ä")
    try:
        mat = numre.match(numin).groups()
    except:
51,31 → 65,43
    numout = 0
    numoct = 0
    try:
-        numout += rem.index(mat[4]) + 1
-        numoct += rem.index(mat[4]) + 1
+        numout += rem.index(mat[5]) + 1
+        numoct += rem.index(mat[5]) + 1
    except:
        try:
-            numout += num.index(mat[4])
-            numoct += num.index(mat[4])
+            numout += num.index(mat[5])
+            numoct += num.index(mat[5])
        except: pass
    try:
-        numout += (base.index(mat[3]) + 1) * 8
-        numoct += (base.index(mat[3]) + 1) * 10
+        numout += (base.index(mat[4]) + 1) * 8
+        numoct += (base.index(mat[4]) + 1) * 10
    except: pass
    try:
-        numout += (base.index(mat[2]) + 1) * 8**2
-        numoct += (base.index(mat[2]) + 1) * 10**2
+        numout += (base.index(mat[3]) + 1) * 8**2
+        numoct += (base.index(mat[3]) + 1) * 10**2
    except: pass
    try:
-        numout += (base.index(mat[1]) + 1) * 8**3
-        numoct += (base.index(mat[1]) + 1) * 10**3
+        numout += (base.index(mat[2]) + 1) * 8**3
+        numoct += (base.index(mat[2]) + 1) * 10**3
    except: pass
    try:
-        numout += (base.index(mat[0]) + 1) * 8**4
-        numoct += (base.index(mat[0]) + 1) * 10**4
+        numout += (base.index(mat[1]) + 1) * 8**4
+        numoct += (base.index(mat[1]) + 1) * 10**4
    except: pass
-    return numout, numoct
+    retnum = unicode(numout)
+    if mat[6] != u"":
+        retnum += u"."
+    prefs = []
+    posts = []
+    if mat[0] != u"":
+        prefs.append(mat[0])
+    if mat[6] != u"":
+        posts.append(mat[6])
+    if mat[7] != u"":
+        posts.append(mat[7])
+    return {"word": {"id": 0, "navi": retnum, "infix": u"", "type": u""}, "pref": [prefs], "post": [posts], "inf": [u"", u"", u""], "len": False, "dec": numout, "oct": numoct}
+    #return numout, numoct
 
 
if __name__ == "__main__":
    print parse(u"mrrvolaw")
    print parse(u"mrrvolawvea")
/tsimapiak/parse.py
0,0 → 1,131
#!/usr/bin/python
# -*- coding: utf-8 -*-
 
import re
import dbconnector
import parsenum
 
wordlist = dbconnector.getnavilist()
 
infixes1 = (u"awn", u"eyk", u"us", u"äp", u"")
infixes2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
infixes3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
prefixes = (u"tsay", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"pe", u"le", u"nì", u"sä", u"tì", u"ay", u"me", u"fì", u"ke", u"a")
adpositions = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìla", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to")
postfixes = adpositions + (u"tsyìp", u"eyä", u"ìri", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
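# Editorial note: the tuples above are ordered longest-first so that
# parseword() strips greedily -- prefixes are peeled from the stem-adjacent
# end of the leading fragment, postfixes from the front of the trailing
# fragment, until nothing is left over.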
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"
 
lenit = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))
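# Editorial sketch of lenition as applied below: when the plain stem is not
# found in the input, each pair is tried in turn, e.g. u"pxaw" -> u"paw",
# u"tsmuk" -> u"smuk", u"kelku" -> u"helku".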
 
def parseword(wordin):
    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}}
    for word in wordlist:
        foundit = True
        foundprefs = []
        foundposts = []
        lenited = False
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            foundit = False
            continue
        for wor in range(len(splitword)):
            if not foundit:
                break
            foundprefs.append([])
            foundposts.append([])
            center = u""
            foundins = [u"", u"", u""]
            pre = []
            post = []
            # try every infix combination until one reproduces part of the input
            if u"<1>" in splitword[wor]:
                for in1 in infixes1:
                    for in2 in infixes2:
                        for in3 in infixes3:
                            if splitword[wor].replace(u"<1><2>", in1 + in2).replace(u"<3>", in3) in wordin[wor]:
                                center = splitword[wor].replace(u"<1><2>", in1 + in2).replace(u"<3>", in3)
                                foundins = [in1, in2, in3]
                                break
                        if center != u"": break
                    if center != u"": break
            else:
                if splitword[wor] in wordin[wor]:
                    center = splitword[wor]
            # no match yet: try the lenited form of the stem
            if center == u"":
                for i in lenit:
                    temp = u""
                    if splitword[wor].startswith(i[0]):
                        temp = i[1] + splitword[wor][len(i[0]):]
                    if temp in wordin[wor]:
                        lenited = True
                        center = temp
            # special-case the shortened pronoun forms nga -> ng and po -> p
            if center == u"":
                if splitword[wor].endswith(u"nga"):
                    temp = splitword[wor][:-3] + u"ng"
                    if temp in wordin[wor]:
                        center = temp
                if splitword[wor].endswith(u"po"):
                    temp = splitword[wor][:-3] + u"p"
                    if temp in wordin[wor]:
                        center = temp
            if center == u"":
                foundit = False
                break
            temp = wordin[wor].split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp
            # peel known prefixes off the end of the leading fragment
            last = u""
            while last != pref:
                last = pref
                for pre in prefixes:
                    if pref != u"":
                        if pref.endswith(pre):
                            if pre in foundprefs[wor]:
                                break
                            foundprefs[wor].append(pre)
                            pref = pref[:-len(pre)]
                            break
            if pref != u"":
                foundit = False
                break
            # peel known postfixes off the front of the trailing fragment
            last = u""
            while last != posf:
                last = posf
                for pos in postfixes:
                    if posf != u"":
                        if posf.startswith(pos):
                            if pos in foundposts[wor]:
                                break
                            foundposts[wor].append(pos)
                            posf = posf[len(pos):]
                            break
            if posf != u"":
                foundit = False
                break
        if foundit == True:
            foundword = word
            break
    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    ret["len"] = lenited
    if foundit == True:
        ret["word"] = foundword
    return ret
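# Editorial sketch of the shape parseword() returns (field values here are
# hypothetical):
# {"word": {"id": 0, "navi": u"taron", "infix": u"t<1><2>ar<3>on", "type": u"vtr."},
#  "pref": [[]], "post": [[u"ti"]], "inf": [u"", u"er", u""], "len": False}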
 
def parsesent(sent):
    sent = sent.strip().lower().replace(u"’", u"'")
    sent = re.sub(ur"[^\wìä' ]", u"", sent)
    sent = re.sub(ur"\ +", u" ", sent)
    sent = sent.split(u" ")
    ret = []
    left = len(sent)
    while left:
        word = parsenum.parse(sent[len(sent) - left])
        if word == None:
            word = parseword(sent[-left:])
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret
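A minimal usage sketch (editorial, not part of the revision; it assumes the MySQL database behind dbconnector is reachable and the words are in the list):

for entry in parsesent(u"oel ngati kameie"):
    # e.g. kameie should come back with u"ei" in the second infix slot
    print entry["word"]["navi"], entry["inf"], entry["pref"], entry["post"]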
/tsimapiak/dbconnector.py
9,33 → 9,14
current = u""
db = tornado.database.Connection("127.0.0.1", "navi", user="navi", password="navi")
for row in db.query("""
SELECT *, CHAR_LENGTH(navi) AS NL
SELECT *
FROM `metaWords`
ORDER BY NL DESC"""):
if row["partOfSpeech"] in (u"v.", u"vin.", u"vtr."):
current = unicode(row["ipa"])
current = current.replace(ur"ɛ",ur"e").replace(ur".",ur"").replace(ur"ɾ",ur"r") \
.replace(ur"ɪ",ur"ì").replace(ur"ˈ",ur"").replace(ur"'",ur"x") \
.replace(ur"ŋ",ur"ng").replace(ur"j",ur"y").replace(ur"ʔ",ur"'") \
.replace(ur"æ",ur"ä").replace(ur"ˌ",ur"").replace(ur"\t{ts}",ur"ts") \
.replace(ur"ṛ",ur"rr").replace(ur"ḷ",ur"ll").replace(ur"k̚",ur"k ") \
.replace(ur"p̚",ur"p ").replace(ur"t̚",ur"t ").replace(ur"'̚",ur"' ") \
.replace(u"\\",ur"").replace(ur"(",ur"").replace(ur")",ur"") \
.replace(ur"[",ur"").replace(ur"]",ur"").replace(ur" "," ") \
.strip()
current = re.sub(ur" or.*","",current)
current = re.sub(ur"z(.*)engk(.*)e",ur"z\1enk\2e",current)
current = re.sub(ur"t(.*)ì(m|n)\ ",ur"t\1ìng ",current)
current = current.split(ur"$cdot$")
if len(current) == 3:
current = current[0] + u"<0><1>" + current[1] + u"<2>" + current[2]
elif len(current) == 2:
current = current[0] + u"<0><1><2>" + current[1]
else:
current = u"<0><1><2>" + current[0]
WHERE partOfSpeech <> 'num.' AND partOfSpeech <> "prefix"
ORDER BY CHAR_LENGTH(navi) DESC"""):
if row["infixes"]:
ret.append({"id": row["id"], "navi": row["navi"], "infix": row["infixes"].lower(), "type": row["partOfSpeech"]})
else:
current = unicode(row["navi"])
ret.append([row["id"], row["navi"], current, row["partOfSpeech"]])
ret.append({"id": row["id"], "navi": row["navi"], "infix": row["navi"].lower(), "type": row["partOfSpeech"]})
db.close()
return ret
 
46,7 → 27,9
         SELECT *
         FROM `metaWords`
         WHERE navi = ?""", word):
-        ret.append([row["id"], row["navi"], row["infix"], row["partOfSpeech"]])
+        if row["infixes"]:
+            ret.append({"id": row["id"], "navi": row["navi"], "infix": row["infixes"].lower(), "type": row["partOfSpeech"]})
+        else:
+            ret.append({"id": row["id"], "navi": row["navi"], "infix": row["navi"].lower(), "type": row["partOfSpeech"]})
     db.close()
     return ret
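For reference, a sketch of the row dictionaries both query loops now emit (editorial; the example values are hypothetical):

# {"id": 42, "navi": u"taron", "infix": u"t<1><2>ar<3>on", "type": u"vtr."}
# When a word has no recorded infix positions, "infix" falls back to the
# lowercased navi form so parseword() can still substring-match it.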
/webapp/static/favicon.ico
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: /webapp/static/favicon.ico
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: webapp/templates/parse.html
===================================================================
--- webapp/templates/parse.html (nonexistent)
+++ webapp/templates/parse.html (revision 133)
@@ -0,0 +1,43 @@
+{% extends "base.html" %}
+
+{% block title %}Sentence parser{% end %}
+
+{% block body %}
+Na'vi sentence:
+
+
+
+
+{% if out %}
+
+
+ Words
+ Parts
+ Data
+
+{% for wor in out %}
+
+ {{ wor["word"]["navi"] }}
+ Infixes:
+ {{ u", ".join(wor["inf"]) }}
+
+
+ Prefixes:
+ {{ u"; ".join(u", ".join(x) for x in wor["pref"]) }}
+
+
+ Postfixes:
+ {{ u"; ".join(u", ".join(x) for x in wor["post"]) }}
+
+
+ Lenited:
+ {{ str(wor["len"]) }}
+
+{% end %}
+
+{% end %}
+
+This program uses Eana Eltu for the list of words and infix positions (but nothing else), created by Tuiq and Taronyu. Thanks also go to the rest of the Learn Na'vi community!
+
+{% end %}
Index: webapp/templates/base.html
===================================================================
--- webapp/templates/base.html (revision 48)
+++ webapp/templates/base.html (revision 133)
@@ -1,6 +1,7 @@
Tsim Apiak - {% block title %}Title{% end %}
+