WebSVN - navi - Blame - Rev 280 - /tsimapiak/parse.py

Rev	Author	Line No.	Line
56	szabot	1	#!/usr/bin/python
		2	# -- coding: utf-8 --
176	muzer	3	# This file is part of Tsim Apiak.
		4	#
		5	# Tsim Apiak is free software: you can redistribute it and/or modify
		6	# it under the terms of the GNU General Public Licence as published by
		7	# the Free Software Foundation, either version 3 of the Licence, or
		8	# (at your option) any later version.
		9	#
		10	# In addition to this, you must also comply with clause 4 of the
		11	# Apache Licence, version 2.0, concerning attribution. Where there
		12	# is a contradiction between the two licences, the GPL
		13	# takes preference.
		14	#
186	szabot	15	# Tsim Apiak is distributed in the hope that it will be useful,
176	muzer	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
		17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		18	# GNU General Public License for more details.
		19	#
		20	# You should have received a copy of the GNU General Public License
		21	# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.
56	szabot	22
66	szabot	23	import dbconnector
103	szabot	24	import parsenum
246	szabot	25	import re
56	szabot	26
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
# Dictionary entries, fetched once when this module is imported. parseword()
# reads the "id", "navi", "infix" and "type" keys of every entry and rewrites
# "navi" in lower case as it scans the list.
wordlist = dbconnector.getnavilist()
65	szabot	29
187	muzer	30
# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this table when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! If a word can take
# affixes or infixes, consider adding it to the main word list instead (see
# the commented-out example next to ``wordlist``).
#
# Field order, as consumed by parseword():
#   [0] original token, [1] Na'vi root,
#   [2] 0-pos infix, [3] 1-pos infix, [4] 2-pos infix,
#   [5] prefixes, [6] suffixes, [7] lenited flag
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    (u"frato", u"to", u"", u"", u"", [[u"fra"]], (), False),
    (u"soaiä", u"soaia", u"", u"", u"", (), [[u"ä"]], False),
    (u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (), False),
    (u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (), (), False),
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (), False),
    (u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (), False),
    (u"kawnga", u"kawng", u"", u"", u"", (), [[u"a"]], False),
    (u"kawng", u"kawng", u"", u"", u"", (), (), False),
    (u"ka", u"ka", u"", u"", u"", (), (), False),
    (u"uo", u"uo", u"", u"", u"", (), (), False),
    (u"sìk", u"sìk", u"", u"", u"", (), (), False),
    (u"sim", u"sim", u"", u"", u"", (), (), False),  # probably not tsim lenited
)

# Infix candidates for the <1>, <2> and <3> slots of a dictionary "infix"
# pattern. Each table ends with the empty string so that "no infix" is always
# a candidate; parseword() tries the entries in the order given here.
INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
INFIXES2 = (
    u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv",
    u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm",
    u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"",
)
INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")

# Prefixes are peeled off the end of whatever precedes the root; the first
# entry that matches wins, so the order of this table matters.
PREFIXES = (
    u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa",
    u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì",
    u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k",
)
ADPOSITIONS = (
    u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa",
    u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw",
    u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre",
    u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na",
    u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì",
)
# Postfixes are peeled off the start of whatever follows the root; every
# adposition is tried before the remaining suffixes.
POSTFIXES = ADPOSITIONS + (
    u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay",
    u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä",
    u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at",
    u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r",
)
#prefixesn = ur"(?P<npr>(?:(?:fì\|tsa)?(?:me\|pxe\|ay\|fra)?\|(?:fay)?\|(?:tsay)?)(?:fne)?(?:tì\|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì\|sä\|tì\|rä'ä \|ke )?)"

# (unlenited onset, lenited onset) pairs, tried in this order by parseword().
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)
91	szabot	43
def _match_infixes(pattern, token):
    """Fill the infix slots of *pattern* so that the result occurs in *token*.

    *pattern* is one space-separated part of a dictionary entry's "infix"
    form, carrying the markers ``<1><2>`` and ``<3>``. Combinations are tried
    in table order and the first hit wins.

    Returns ``(center, [in1, in2, in3])`` on success and ``(u"", None)`` when
    no combination matches.
    """
    # Only infixes that literally occur in the token can be part of a match;
    # the empty infix always qualifies.
    firsts = [infix for infix in INFIXES1 if infix in token]
    seconds = [infix for infix in INFIXES2 if infix in token]
    thirds = [infix for infix in INFIXES3 if infix in token]
    for in1 in firsts:
        for in2 in seconds:
            for in3 in thirds:
                candidate = pattern.replace(u"<1><2>", in1 + in2).replace(u"<3>", in3)
                # Collapse the triple consonants an infix can create.
                candidate = candidate.replace(u"lll", u"l").replace(u"rrr", u"r")
                if candidate != u"" and candidate in token:
                    return candidate, [in1, in2, in3]
    return u"", None


def _strip_prefixes(pref, found):
    """Peel known prefixes off the end of *pref*, appending them to *found*.

    Stops when nothing is left, when no prefix matches, or when the matching
    prefix was already seen. Returns whatever could not be explained.
    """
    while pref != u"":
        for pre in PREFIXES:
            if pref.endswith(pre):
                break
        else:
            return pref
        if pre in found:
            return pref
        found.append(pre)
        pref = pref[:-len(pre)]
    return pref


def _strip_postfixes(posf, found, navi):
    """Peel known postfixes off the start of *posf*, appending them to *found*.

    Stops when nothing is left, when no postfix matches, or when the matching
    postfix was already seen. *navi* is the dictionary form of the candidate
    word. Returns whatever could not be explained.
    """
    while posf != u"":
        for pos in POSTFIXES:
            if posf.startswith(pos):
                break
        else:
            return posf
        if pos in found:
            return posf
        if pos == u"ä" and navi == u"pey": # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
            return posf
        found.append(pos)
        posf = posf[len(pos):]
    return posf


def parseword(wordin):
    """Match the start of a token list against the dictionary.

    *wordin* is a non-empty sequence of tokens; ``wordin[0]`` is the word
    being parsed and later tokens are only consulted for dictionary entries
    whose "infix" form spans several space-separated parts.

    Returns a dict with the keys:
      "word" - the matching entry from ``wordlist``, or a placeholder with
               id 0 and the token wrapped in square brackets if none matched
      "pref" - one list of prefixes per part of the entry
      "post" - one list of postfixes per part of the entry
      "inf"  - the three infixes that were found
      "len"  - whether a lenited form of the entry was matched

    NOTE(review): when no entry matches, "pref", "post", "inf" and "len" hold
    whatever the last candidate examined left behind. They are only meaningful
    when a dictionary entry was matched.
    """
    head = wordin[0]
    # XXX HACK - this is all code to work around bugs that shouldn't exist
    for broken in BROKENWORDS:
        if head != broken[0]:
            continue
        tempid = 0
        temptype = u""
        for word in wordlist:
            if broken[1] == word["navi"]:
                tempid = word["id"]
                temptype = word["type"]
        return {
            "word": {"id": tempid, "navi": broken[1], "infix": u"", "type": temptype},
            "pref": broken[5],
            "post": broken[6],
            "len": broken[7],
            "inf": (broken[2], broken[3], broken[4]),
        }

    ret = {"word": {"id": 0, "navi": u"[" + head + u"]", "infix": u"", "type": u""}}

    # Defaults for an empty word list; without them the assignments after the
    # loop would fail on unbound names.
    foundit = False
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    lenited = False
    foundword = None

    for word in wordlist:
        word["navi"] = word["navi"].lower()
        foundit = True
        foundprefs = []
        foundposts = []
        lenited = False
        foundins = [u"", u"", u""]
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough tokens left for this multi-part entry.
            foundit = False
            continue
        for wor, pattern in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])
            center = u""
            if u"<1>" in pattern:
                center, infixes = _match_infixes(pattern, token)
                if infixes is not None:
                    foundins = infixes
            else:
                if pattern in token:
                    center = pattern
                # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
                if center == u"" and (token == u"paya" or pattern != u"pxay"):
                    for hard, soft in LENIT:
                        if pattern.startswith(hard):
                            softened = soft + pattern[len(hard):]
                            if softened in token:
                                lenited = True
                                center = softened
                # Entries ending in nga/fo/po/tsa are also accepted without
                # their final vowel.
                if center == u"" and pattern.endswith((u"nga", u"fo", u"po", u"tsa")):
                    shortened = pattern[:-1]
                    if shortened in token:
                        center = shortened
            if center == u"":
                foundit = False
                break
            # The root must occur exactly once so the token splits cleanly
            # into "before" and "after".
            pieces = token.split(center)
            if len(pieces) != 2:
                foundit = False
                break
            pref, posf = pieces
            if _strip_prefixes(pref, foundprefs[wor]) != u"":
                foundit = False
                foundprefs = []
                break
            if _strip_postfixes(posf, foundposts[wor], word["navi"]) != u"":
                foundit = False
                foundposts = []
                break
        if foundit:
            foundword = word
            break

    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    ret["len"] = lenited
    if foundit:
        ret["word"] = foundword
    return ret
		175
		176	def parsesent(sent):
101	szabot	177	sent = sent.strip().lower().replace(u"’", u"'")
246	szabot	178	sent = re.sub(ur"[^\wìä' ]", u"", sent)
		179	sent = re.sub(ur"\ +", u" ", sent)
89	szabot	180	sent = sent.split(u" ")
77	szabot	181	ret = []
		182	left = len(sent)
		183	while left:
246	szabot	184	word = parsenum.parse(sent[len(sent) - left])
103	szabot	185	if word == None:
		186	word = parseword(sent[-left:])
78	szabot	187	left -= len(word["word"]["navi"].split(" "))
77	szabot	188	ret.append(word)
136	muzer	189	return ret

Subversion Repositories navi

(root)/tsimapiak/parse.py @ 280 - Rev 280