WebSVN - navi - Blame - Rev 259 - /tsimapiak/parse.py

Rev	Author	Line No.	Line
56	szabot	1	#!/usr/bin/python
		2	# -- coding: utf-8 --
176	muzer	3	# This file is part of Tsim Apiak.
		4	#
		5	# Tsim Apiak is free software: you can redistribute it and/or modify
		6	# it under the terms of the GNU General Public Licence as published by
		7	# the Free Software Foundation, either version 3 of the Licence, or
		8	# (at your option) any later version.
		9	#
		10	# In addition to this, you must also comply with clause 4 of the
		11	# Apache Licence, version 2.0, concerning attribution. Where there
		12	# is a contradiction between the two licences, the GPL
		13	# takes preference.
		14	#
186	szabot	15	# Tsim Apiak is distributed in the hope that it will be useful,
176	muzer	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
		17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		18	# GNU General Public License for more details.
		19	#
		20	# You should have received a copy of the GNU General Public License
		21	# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.
56	szabot	22
66	szabot	23	import dbconnector
103	szabot	24	import parsenum
246	szabot	25	import re
56	szabot	26
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
# Dictionary entries searched by parseword(). Each entry is read there as a
# dict with "id", "navi", "infix" and "type" keys (see the sample entries in
# the commented-out line above).
wordlist = dbconnector.getnavilist()
65	szabot	29
187	muzer	30
# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! If a word can take
# infixes or other affixes, consider adding it to the main wordlist above
# instead (see the examples there).
#
# Field order of each entry:
#   0: original (the token as written)
#   1: Na'vi root
#   2: 0-pos infix
#   3: 1-pos infix
#   4: 2-pos infix
#   5: prefixes
#   6: suffixes
#   7: lenition flag (returned by parseword() under the "len" key)
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    (u"frato", u"to", u"", u"", u"", [[u"fra"]], (), False),
)

# Infix candidates for the three slots of an infix pattern. Each tuple ends
# with the empty string, which stands for "no infix in this slot".
INFIXES1 = (
    u"awn", u"eyk", u"us", u"äp",
    u"",
)
INFIXES2 = (
    u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv",
    u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm",
    u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol",
    u"",
)
INFIXES3 = (
    u"äng", u"ats", u"eiy", u"ei", u"uy",
    u"",
)

# Prefixes are tried in this order; the first one that matches is taken.
PREFIXES = (
    u"tsay", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"sa",
    u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay",
    u"me", u"fì", u"ke", u"he", u"px", u"a", u"m",
)

ADPOSITIONS = (
    u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa",
    u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel",
    u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip",
    u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo",
    u"fa", u"hu", u"io", u"ka", u"mì", u"na",
    u"ne", u"ro", u"ta", u"uo", u"wä", u"äo",
    u"to", u"sì",
)

# Everything that may follow a root: the adpositions first, then the
# remaining suffixes. As with PREFIXES, the first match in this order wins.
POSTFIXES = ADPOSITIONS + (
    u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä",
    u"ari", u"ay", u"ìl", u"it", u"lo", u"ri",
    u"ru", u"ti", u"ur", u"ve", u"yä", u"ya",
    u"tu", u"vi", u"yu", u"an", u"ng", u"ke",
    u"al", u"at", u"ar", u"ey", u"e", u"o",
    u"l", u"t", u"y", u"a", u"ä", u"r",
)

# Retired regex-based prefix matching, kept for reference only:
#prefixesn = ur"(?P<npr>(?:(?:fì\|tsa)?(?:me\|pxe\|ay\|fra)?\|(?:fay)?\|(?:tsay)?)(?:fne)?(?:tì\|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì\|sä\|tì\|rä'ä \|ke )?)"

# Lenition pairs: (initial sound, its lenited form).
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)
91	szabot	42
def parseword(wordin):
    """Identify the dictionary word at the start of a list of tokens.

    wordin is a non-empty sequence of token strings; wordin[0] is the token
    to analyse and the following tokens are consulted when a dictionary
    entry's "infix" pattern spans several space-separated words.

    Returns a dict with the keys:
      "word" - the matching wordlist entry, or a placeholder entry with id 0
               and "navi" set to the first token in square brackets when
               nothing matched
      "pref" - one list of prefixes per matched token
      "post" - one list of suffixes per matched token
      "inf"  - the three infixes found (0-pos, 1-pos, 2-pos)
      "len"  - True when a lenited form of the entry was matched

    When nothing matches, the affix fields are returned empty ("pref" and
    "post" as [], "inf" as three empty strings, "len" False) instead of
    whatever was left over from the last dictionary entry tried. An empty
    wordlist therefore also yields the placeholder rather than a NameError.

    NOTE(review): the nesting below was reconstructed from a listing that
    had lost its indentation - confirm against the repository copy.
    """
    # XXX HACK - this is all code to work around bugs that shouldn't exist
    for brokenword in BROKENWORDS:
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            # Borrow id and type from the dictionary entry of the root.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {
                "word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype},
                "pref": brokenword[5],
                "post": brokenword[6],
                "len": brokenword[7],
                "inf": (brokenword[2], brokenword[3], brokenword[4]),
            }

    for word in wordlist:
        # NOTE: this lower-cases the shared wordlist entry in place; the
        # entry itself is what gets returned on a match.
        word["navi"] = word["navi"].lower()
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough tokens left for this (multi-word) entry.
            continue

        foundit = True
        foundprefs = []
        foundposts = []
        foundins = [u"", u"", u""]
        lenited = False

        for wor, pattern in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])
            center = u""

            if u"<1>" in pattern:
                # Only infixes that occur somewhere in the token can be part
                # of it; u"" always qualifies and means "slot left empty".
                tempin1 = [in1 for in1 in INFIXES1 if in1 in token]
                tempin2 = [in2 for in2 in INFIXES2 if in2 in token]
                tempin3 = [in3 for in3 in INFIXES3 if in3 in token]
                for in1 in tempin1:
                    for in2 in tempin2:
                        for in3 in tempin3:
                            candidate = pattern.replace(u"<1><2>", in1 + in2).replace(u"<3>", in3).replace(u"lll", u"l").replace(u"rrr", u"r")
                            if candidate in token:
                                center = candidate
                                foundins = [in1, in2, in3]
                                break
                        if center != u"":
                            break
                    if center != u"":
                        break
            else:
                if pattern in token:
                    center = pattern
                # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
                if center == u"" and (token == u"paya" or pattern != u"pxay"):
                    for i in LENIT:
                        if pattern.startswith(i[0]):
                            temp = i[1] + pattern[len(i[0]):]
                            if temp in token:
                                lenited = True
                                center = temp
                if center == u"":
                    # Roots whose final vowel is dropped in the token.
                    if pattern.endswith(u"nga"):
                        temp = pattern[:-3] + u"ng"
                        if temp in token:
                            center = temp
                    if pattern.endswith(u"po"):
                        temp = pattern[:-2] + u"p"
                        if temp in token:
                            center = temp
                    if pattern.endswith(u"tsa"):
                        temp = pattern[:-3] + u"ts"
                        if temp in token:
                            center = temp

            if center == u"":
                foundit = False
                break
            temp = token.split(center)
            if len(temp) != 2:
                # The root must occur exactly once in the token.
                foundit = False
                break
            pref, posf = temp

            # Peel known prefixes off the end of what precedes the root,
            # until nothing changes any more.
            last = u""
            while last != pref:
                last = pref
                for pre in PREFIXES:
                    if pref != u"" and pref.endswith(pre):
                        if pre not in foundprefs[wor]:
                            foundprefs[wor].append(pre)
                            pref = pref[:-len(pre)]
                        break
            if pref != u"":
                foundit = False
                break

            # Likewise peel known suffixes off the start of what follows.
            last = u""
            while last != posf:
                last = posf
                for pos in POSTFIXES:
                    if posf != u"" and posf.startswith(pos):
                        if pos in foundposts[wor]:
                            break
                        # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                        if pos != u"ä" or word["navi"] != u"pey":
                            foundposts[wor].append(pos)
                            posf = posf[len(pos):]
                        break
            if posf != u"":
                foundit = False
                break

        if foundit:
            return {
                "word": word,
                "pref": foundprefs,
                "post": foundposts,
                "inf": foundins,
                "len": lenited,
            }

    # Nothing matched: bracketed placeholder with clean, empty affix data.
    return {
        "word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""},
        "pref": [],
        "post": [],
        "inf": [u"", u"", u""],
        "len": False,
    }
		170
		171	def parsesent(sent):
101	szabot	172	sent = sent.strip().lower().replace(u"’", u"'")
246	szabot	173	sent = re.sub(ur"[^\wìä' ]", u"", sent)
		174	sent = re.sub(ur"\ +", u" ", sent)
89	szabot	175	sent = sent.split(u" ")
77	szabot	176	ret = []
		177	left = len(sent)
		178	while left:
246	szabot	179	word = parsenum.parse(sent[len(sent) - left])
103	szabot	180	if word == None:
		181	word = parseword(sent[-left:])
78	szabot	182	left -= len(word["word"]["navi"].split(" "))
77	szabot	183	ret.append(word)
136	muzer	184	return ret

Subversion Repositories navi

(root)/tsimapiak/parse.py - Rev 259