# NOTE(review): stray diff hunk header ("2,85 -> 2,118") left behind by an
# unresolved merge — converted to a comment so the file parses; safe to delete.
# -*- coding: utf-8 -*- |
|
import re |
from dbconnector import getnavilist |
import dbconnector |
import parsenum |
|
# Shared dictionary word list, fetched once at import time.  (The bad merge
# had left two fetches in place, performing a redundant database query.)
wordlist = dbconnector.getnavilist()

# Pre-first-position infixes, still referenced by the legacy parsefix().
infixes0 = [ u"awn", u"eyk", u"us", u"äp" ]

# Infix tables used by parseword(); each ends with u"" so "no infix in this
# slot" is also tried.  The dead pre-rename infixes1/infixes2 lists from the
# merge were removed here; one of them carried a missing comma (u"imv" u"ìrm"
# silently concatenated to a single bogus infix u"imvìrm").
# NOTE(review): the numbering shifted relative to the legacy scheme (new
# infixes1 == old infixes0, and so on); parsefix() still uses the old
# numbering and would read the wrong tables — confirm parsefix() is dead code.
infixes1 = (u"awn", u"eyk", u"us", u"äp", u"")
infixes2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
infixes3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")

# Prefixes are listed so longer forms precede the shorter ones they contain
# (u"tsay" before u"ay", u"fne" before u"e"-like endings), since parseword()
# takes the first endswith() match in tuple order.
prefixes = (u"tsay", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"pe", u"le", u"nì", u"sä", u"tì", u"ay", u"me", u"fì", u"ke", u"a")

# Adpositions can attach to the word as suffixes, so they are folded into
# the postfix table below.
adpositions = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìla", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to")
postfixes = adpositions + (u"tsyìp", u"eyä", u"ìri", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"
|
# Returns array with Word,Infix 0,Infix 1,Infix 2,Case,Gender,Number suffixes,Inclusive,Indefinite,Vocative (suffix),Plural,Adposition,Adject pre,Adject suff,am/ay/tu/vi/yu,adverbial,nominalise,sä,fne,lenited? |
def parsefix(original):
    """Legacy infix parser (apparently superseded by parseword()).

    Returns a 20-element list: [word, infix0, infix1, infix2] followed by 16
    empty slots reserved for the case/gender/number/... fields listed in the
    comment above.  On failure the first element is the original word wrapped
    in square brackets and all other fields are empty.

    NOTE(review): this function had been truncated by a bad merge; the tail
    below was reconstructed from the displaced fragment found further down
    the file.  It also still indexes the pre-rename infixes0/1/2 tables, so
    with the current infixes1/2/3 tuples it will mis-classify infixes —
    confirm it is dead code (parsesent() only calls parseword()).
    """
    fail = [u"[" + original + u"]"] + [u""] * 19
    realword = u""
    infix0 = u""
    infix1 = u""
    infix2 = u""
    infix01 = u""
    infix_1 = u""
    infix_2 = u""
    # Find a dictionary entry whose infix template can match the input:
    # template spaces and the <1><2>/<3> infix slots become wildcards.
    for eachword in wordlist:
        regex = re.sub(u" ", u"[^ ]* [^ ]*", eachword["infix"])
        regex = re.sub(u"^", u"[^ ]*", regex)
        regex = re.sub(u"$", u"[^ ]*", regex)
        regex = re.sub(u"<1><2>", u"[^ ]*", regex)
        regex = re.sub(u"<3>", u"[^ ]*", regex)
        if re.match(regex, original):
            realword = eachword["infix"]
            break
    if realword == u"":
        return fail
    if not re.search(u"<", realword):
        # No infix slots at all: the bare word matched.
        return [realword] + [u""] * 19
    end = re.sub(u".*<3>", u"", realword)
    # Extract the text sitting in the <1><2> slot and in the <3> slot.
    # (u"\\1" is byte-identical to the old Python-2-only ur"\1" literal.)
    infix01 = re.sub(u".*?" + re.sub(u"<1><2>", u"([^ ]*)", re.sub(u"<3>", u"[^ ]*", realword)) + u".*?", u"\\1", original)
    infix_2 = re.sub(u".*?" + re.sub(u"<3>", u"([^ ]*)", re.sub(u"<1><2>", u"[^ ]*", realword)) + u".*?", u"\\1", original)
    # Split the <1><2> material into a position-0 infix and the rest.
    for eachinfix in infixes0:
        if infix01.startswith(eachinfix):
            infix0 = eachinfix
            infix_1 = infix01[len(eachinfix):]
            break
    else:
        infix0 = u""
        infix_1 = infix01
    # Position-1 infix.
    gotinfix1 = False
    for eachinfix in infixes1:
        if infix_1.startswith(eachinfix):
            infix1 = eachinfix
            infix_1 = infix_1[len(eachinfix):]
            if infix_1 != u"":
                if re.search(u"<1><2><3>", realword):
                    # Slots are adjacent: the leftover belongs to slot <3>.
                    infix_2 = infix_1
            gotinfix1 = True
            break
    if not gotinfix1:
        if re.search(u"<1><2><3>", realword):
            if infix_1 == u"":
                infix_2 = infix_1
                infix1 = u""
        elif infix_1 == u"":
            infix1 = u""
        else:
            return fail
    # Position-2 infix.
    gotinfix2 = False
    for eachinfix in infixes2:
        if infix_2.startswith(eachinfix):
            infix2 = infix_2[:len(eachinfix)]
            # NOTE(review): the `- 1` below looks like an off-by-one kept
            # from the merged fragment — confirm against project history.
            infix_2 = infix_2[len(eachinfix) - 1:]
            gotinfix2 = True
            break
    if not gotinfix2 or infix_2 != u"":
        if infix_2.startswith(end):
            # NOTE(review): `suffixes` is computed but never used — kept
            # verbatim from the recovered fragment.
            suffixes = infix2[len(end) - 1:] + end
        elif infix_2 == u"":
            infix2 = u""
        else:
            return fail
    return [realword, infix0, infix1, infix2] + [u""] * 16
# Lenition mappings: (plain onset, lenited onset) pairs, tried in order by
# parseword().  The digraphs u"px"/u"tx"/u"kx"/u"ts" come before the single
# letters u"p"/u"t"/u"k" so startswith() matches them first; the glottal
# stop u"'" lenites to nothing.
lenit = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))
|
def parseword(wordin):
    """Match the word(s) at the head of wordin against the dictionary.

    wordin is a list of sentence words; a multi-word dictionary entry
    consumes several elements.  Returns a dict with keys:
      word -- the matched dictionary entry (or a bracketed placeholder)
      pref -- per-word-part lists of recognised prefixes
      post -- per-word-part lists of recognised postfixes/adpositions
      inf  -- the three infixes found in the last examined word part
      len  -- True if the match required a lenited form
    """
    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}}
    # Pre-initialise so the result keys are well-defined even when the word
    # list is empty or every entry is skipped (the original could raise
    # NameError on ret["inf"] in that case).
    foundit = False
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    lenited = False
    for word in wordlist:
        foundit = True
        foundprefs = []
        foundposts = []
        lenited = False
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough sentence words left for this entry.  (The original
            # had a bare `next` here — a no-op expression, not `continue` —
            # and relied on the foundit guard below to skip the entry.)
            foundit = False
            continue
        for wor in range(len(splitword)):
            if not foundit:
                break
            foundprefs.append([])
            foundposts.append([])
            center = u""
            foundins = [u"", u"", u""]
            if u"<1>" in splitword[wor]:
                # Try every combination of infixes in the <1><2>/<3> slots.
                for in1 in infixes1:
                    for in2 in infixes2:
                        for in3 in infixes3:
                            candidate = splitword[wor].replace(u"<1><2>", in1 + in2).replace(u"<3>", in3)
                            if candidate in wordin[wor]:
                                center = candidate
                                foundins = [in1, in2, in3]
                                break
                        if center != u"":
                            break
                    if center != u"":
                        break
            else:
                # Merge artifact removed here: a displaced chunk of the old
                # parsefix() had been spliced into this branch.
                if splitword[wor] in wordin[wor]:
                    center = splitword[wor]
            if center == u"":
                # Try lenited initial consonants.
                for plain, soft in lenit:
                    temp = u""
                    if splitword[wor].startswith(plain):
                        temp = soft + splitword[wor][len(plain):]
                    # The temp != u"" guard matters: u"" is a substring of
                    # everything, so the unguarded test spuriously set
                    # `lenited` whenever a pair did not apply.
                    if temp != u"" and temp in wordin[wor]:
                        lenited = True
                        center = temp
            if center == u"":
                # Clipped pronoun forms: ...nga -> ...ng, ...po -> ...p.
                if splitword[wor].endswith(u"nga"):
                    temp = splitword[wor][:-3] + u"ng"
                    if temp in wordin[wor]:
                        center = temp
                if splitword[wor].endswith(u"po"):
                    # Was [:-3]: an off-by-one copied from the three-letter
                    # "nga" case that chopped one character too many.
                    temp = splitword[wor][:-2] + u"p"
                    if temp in wordin[wor]:
                        center = temp
            if center == u"":
                foundit = False
                break
            # The stem must occur exactly once inside the sentence word.
            temp = wordin[wor].split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp
            # Strip recognised prefixes from the right end of the leading
            # material; anything left over disqualifies this entry.
            for pre in prefixes:
                if pref != u"":
                    if pref.endswith(pre):
                        foundprefs[wor].append(pre)
                        pref = pref[:-len(pre)]
            if pref != u"":
                foundit = False
                break
            # Likewise strip postfixes/adpositions from the trailing material.
            for pos in postfixes:
                if posf != u"":
                    if posf.startswith(pos):
                        foundposts[wor].append(pos)
                        posf = posf[len(pos):]
            if posf != u"":
                foundit = False
                break
        if foundit:
            foundword = word
            break
    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    ret["len"] = lenited
    if foundit:
        ret["word"] = foundword
    return ret
|
def parsesent(sent):
    """Parse a whole sentence into a list of parsed-word dictionaries.

    The sentence is lower-cased, curly apostrophes are normalised to u"'",
    everything except word characters, ì, ä, apostrophe and space is
    stripped, and runs of spaces collapse to one.  Each remaining word is
    tried as a number first (parsenum), then as a dictionary word
    (parseword); multi-word matches consume several tokens at once.
    """
    sent = sent.strip().lower().replace(u"’", u"'")
    # The doubled backslashes are byte-identical to the old Python-2-only
    # ur"..." raw-unicode literals.
    sent = re.sub(u"[^\\wìä' ]", u"", sent)
    sent = re.sub(u"\\ +", u" ", sent)
    sent = sent.split(u" ")
    ret = []
    left = len(sent)
    while left > 0:
        # sent[-left] is the next unconsumed word.
        word = parsenum.parse(sent[-left])
        if word is None:
            word = parseword(sent[-left:])
        # A dictionary entry may span several words; consume them all.
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret