WebSVN - navi - Blame - Rev 284 - /tsimapiak/parse.py

Rev	Author	Line No.	Line
56	szabot	1	#!/usr/bin/python
		2	# -- coding: utf-8 --
176	muzer	3	# This file is part of Tsim Apiak.
		4	#
		5	# Tsim Apiak is free software: you can redistribute it and/or modify
		6	# it under the terms of the GNU General Public Licence as published by
		7	# the Free Software Foundation, either version 3 of the Licence, or
		8	# (at your option) any later version.
		9	#
		10	# In addition to this, you must also comply with clause 4 of the
		11	# Apache Licence, version 2.0, concerning attribution. Where there
		12	# is a contradiction between the two licences, the GPL
		13	# takes preference.
		14	#
186	szabot	15	# Tsim Apiak is distributed in the hope that it will be useful,
176	muzer	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
		17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		18	# GNU General Public License for more details.
		19	#
		20	# You should have received a copy of the GNU General Public License
		21	# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.
56	szabot	22
283	muzer	23	import tsimapiak.dbconnector as dbconnector
		24	import tsimapiak.parsenum as parsenum
246	szabot	25	import re
56	szabot	26
221	muzer	27	#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
# Dictionary entries, loaded once when this module is imported. parseword()
# reads the "id", "navi", "infix" and "type" keys of every entry.
wordlist = dbconnector.getnavilist()

# Affix tables, also loaded once at import time. Every entry is read through
# its "navi" key; infix entries are additionally filtered on "position"
# (0, 1 or 2).
prefixes, infixes, postfixes = dbconnector.getaffixlists()
187	muzer	31
# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! If a word can take
# infixes or other affixes, consider adding it to the main wordlist above
# instead (see the commented-out example there).
#
# Field order of each entry, as read by parseword():
#   0    the token exactly as it appears in the input
#   1    the Na'vi root
#   2-4  the position-0, position-1 and position-2 infixes
#   5    prefixes (returned as "pref")
#   6    suffixes, as (suffix, id) pairs (returned as "post")
#   7    lenition flag (returned as "len")
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (()), (()), False),
    (u"to", u"to", u"", u"", u"", (()), (()), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]], (()), False),
    (u"soaiä", u"soaia", u"", u"", u"", (()), [[(u"ä", None)]], False),
    (u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (()), False),
    (u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (()), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (()), (()), False),
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (()), False),
    (u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (()), False), # TODO remember why on earth this is needed; how is awng interpreted as awnga?
    (u"kawnga", u"kawng", u"", u"", u"", (()), [[(u"a", None)]], False),
    (u"kawng", u"kawng", u"", u"", u"", (()), (()), False),
    (u"ka", u"ka", u"", u"", u"", (()), (()), False),
    (u"uo", u"uo", u"", u"", u"", (()), (()), False),
    (u"sìk", u"sìk", u"", u"", u"", (()), (()), False) # probably not tsim lenited
)
		50
		51	#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
		52	#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
		53	#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
		54	#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
		55	#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
		56	#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62	szabot	57	#prefixesn = ur"(?P<npr>(?:(?:fì\|tsa)?(?:me\|pxe\|ay\|fra)?\|(?:fay)?\|(?:tsay)?)(?:fne)?(?:tì\|sä)?"
74	szabot	58	#prefixesv = ur"(?P<vpr>(?:nì\|sä\|tì\|rä'ä \|ke )?)"
56	szabot	59
def _dictionary_id(navi):
    """Return the id of the first ``wordlist`` entry whose "navi" is *navi*.

    Raises IndexError (the exception the previous ``[...][0]`` lookup raised)
    when the dictionary has no such entry, but with a message naming the word.
    """
    for entry in wordlist:
        if entry["navi"] == navi:
            return entry["id"]
    raise IndexError("no dictionary entry for %r" % (navi,))


# Words that act like adpositions but technically aren't, as (navi, id) pairs.
# Written as unicode literals like every other Na'vi string in this module, so
# that they compare and concatenate cleanly with unicode input.
EXTRAADP = tuple((navi, _dictionary_id(navi)) for navi in (u"to", u"sì"))
283	muzer	61
# Lenition table: each pair is (initial consonant, lenited form).
# parseword() tries the pairs in exactly this order, so the order is part of
# the data and must be kept.
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)
91	szabot	63
def parseword(wordin):
    """Identify the dictionary word that starts the token list *wordin*.

    *wordin* is a non-empty list of tokens (parsesent() passes the not yet
    consumed tail of a sentence). Dictionary entries whose "infix" template
    has several space-separated parts are matched against that many leading
    tokens.

    Returns a dict with these keys:

    "word"  the matching ``wordlist`` entry itself (not a copy), or a
            placeholder ``{"id": 0, "navi": "[token]", "infix": "", "type": ""}``
            when nothing matches
    "pref"  for each matched part, the list of prefixes stripped from it
            (innermost prefix last in the token, first in the list)
    "post"  for each matched part, the list of ``(postfix, id)`` pairs
            stripped from it; ``id`` is None for plain postfixes
    "inf"   the position-0, position-1 and position-2 infixes
    "len"   True when a part was matched in lenited form

    NOTE: when no entry matches, "pref", "post", "inf" and "len" hold whatever
    the attempt on the *last* dictionary entry left behind. That long-standing
    behaviour is kept as is because callers may depend on the shape.
    """
    first = wordin[0]

    # XXX HACK - this is all code to work around bugs that shouldn't exist
    for brokenword in BROKENWORDS:
        if first != brokenword[0]:
            continue
        tempid = 0
        temptype = u""
        # No break on purpose: the last entry spelled like the root wins.
        for word in wordlist:
            if brokenword[1] == word["navi"]:
                tempid = word["id"]
                temptype = word["type"]
        return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4])}

    ret = {"word": {"id": 0, "navi": u"[" + first + u"]", "infix": u"", "type": u""}}

    # Candidate affixes depend only on the module tables, so build them once
    # per call instead of once per dictionary entry / per stripped affix.
    infix_choices = [
        [x["navi"] for x in infixes if x["position"] == position] + [u""]
        for position in (0, 1, 2)
    ]
    prefix_choices = [x["navi"] for x in prefixes]
    # Adposition spellings are lower-cased here because the main loop below
    # lower-cases dictionary entries only as it reaches them.
    postfix_choices = (
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"].lower(), x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP)
    )

    # Defaults so that an empty dictionary yields a "not found" result
    # instead of an UnboundLocalError.
    foundit = False
    foundword = None
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    lenited = False

    for word in wordlist:
        # Normalises the shared dictionary entry in place (idempotent).
        word["navi"] = word["navi"].lower()
        foundit = True
        foundprefs = []
        foundposts = []
        foundins = [u"", u"", u""]
        lenited = False
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough tokens left for a multi-part entry.
            foundit = False
            continue

        for wor, part in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])
            center = u""

            if u"<0>" in part:
                # Template with infix slots: only infixes that occur somewhere
                # in the token are worth trying ("" means "slot left empty").
                tempin1 = [i for i in infix_choices[0] if i in token]
                tempin2 = [i for i in infix_choices[1] if i in token]
                tempin3 = [i for i in infix_choices[2] if i in token]
                for in1 in tempin1:
                    for in2 in tempin2:
                        for in3 in tempin3:
                            # The lll/rrr replacements presumably collapse a
                            # tripled l/r produced by inserting an infix -
                            # TODO confirm.
                            candidate = part.replace(u"<0><1>", in1 + in2).replace(u"<2>", in3).replace(u"lll", u"l").replace(u"rrr", u"r")
                            if candidate in token:
                                center = candidate
                                foundins = [in1, in2, in3]
                                break
                        if center != u"":
                            break
                    if center != u"":
                        break
            else:
                if part in token:
                    center = part
                if center == u"" and (token == u"paya" or part != u"pxay"):  # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
                    # Try every lenited spelling; a later pair may override an
                    # earlier one (no break, as before).
                    for i in LENIT:
                        if part.startswith(i[0]):
                            temp = i[1] + part[len(i[0]):]
                            if temp in token:
                                lenited = True
                                center = temp
                if center == u"":
                    # Roots that may appear without their final vowel.
                    for ending, kept in ((u"nga", u"ng"), (u"fo", u"f"), (u"po", u"p"), (u"tsa", u"ts")):
                        if part.endswith(ending):
                            temp = part[:-len(ending)] + kept
                            if temp in token:
                                center = temp

            if center == u"":
                foundit = False
                break

            # The root must occur exactly once; what surrounds it has to be
            # explained entirely by known prefixes and postfixes.
            temp = token.split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp

            # Peel prefixes off the right-hand end of `pref`, one per pass,
            # until a pass changes nothing. A prefix is accepted only once.
            last = u""
            while last != pref:
                last = pref
                for pre in prefix_choices:
                    if pref != u"" and pref.endswith(pre):
                        if pre not in foundprefs[wor]:
                            foundprefs[wor].append(pre)
                            pref = pref[:-len(pre)]
                        break
            if pref != u"":
                foundit = False
                foundprefs = []
                break

            # Same procedure for postfixes, from the left-hand end of `posf`.
            last = u""
            while last != posf:
                last = posf
                for pos, posid in postfix_choices:
                    if posf != u"" and posf.startswith(pos):
                        seen = (pos, posid) in foundposts[wor]
                        # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                        peya = pos == u"ä" and word["navi"] == u"pey"
                        if not seen and not peya:
                            foundposts[wor].append((pos, posid))
                            posf = posf[len(pos):]
                        break
            if posf != u"":
                foundit = False
                foundposts = []
                break

        if foundit:
            foundword = word
            break

    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    ret["len"] = lenited
    if foundit:
        ret["word"] = foundword
    return ret
		195
		196	def parsesent(sent):
101	szabot	197	sent = sent.strip().lower().replace(u"’", u"'")
283	muzer	198	sent = re.sub(r"[^\wìä' ]", u"", sent)
		199	sent = re.sub(r"\ +", u" ", sent)
89	szabot	200	sent = sent.split(u" ")
77	szabot	201	ret = []
		202	left = len(sent)
		203	while left:
246	szabot	204	word = parsenum.parse(sent[len(sent) - left])
103	szabot	205	if word == None:
		206	word = parseword(sent[-left:])
78	szabot	207	left -= len(word["word"]["navi"].split(" "))
77	szabot	208	ret.append(word)
136	muzer	209	return ret

Subversion Repositories navi

(root)/tsimapiak/parse.py @ 284 - Rev 284