WebSVN - navi - Blame - Rev 289 - /tsimapiak/parse.py

Rev	Author	Line No.	Line
56	szabot	1	#!/usr/bin/python
		2	# -- coding: utf-8 --
176	muzer	3	# This file is part of Tsim Apiak.
		4	#
		5	# Tsim Apiak is free software: you can redistribute it and/or modify
		6	# it under the terms of the GNU General Public Licence as published by
		7	# the Free Software Foundation, either version 3 of the Licence, or
		8	# (at your option) any later version.
		9	#
		10	# In addition to this, you must also comply with clause 4 of the
		11	# Apache Licence, version 2.0, concerning attribution. Where there
		12	# is a contradiction between the two licences, the GPL
		13	# takes preference.
		14	#
186	szabot	15	# Tsim Apiak is distributed in the hope that it will be useful,
176	muzer	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
		17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		18	# GNU General Public License for more details.
		19	#
		20	# You should have received a copy of the GNU General Public License
		21	# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.
56	szabot	22
283	muzer	23	import tsimapiak.dbconnector as dbconnector
		24	import tsimapiak.parsenum as parsenum
246	szabot	25	import re
56	szabot	26
# XXX HACK - extra proper nouns used to be spliced around the dictionary
# like this (kept as an example of the expected entry shape):
# wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}]

# Dictionary entries; the parser reads the "id", "navi", "infix" and "type"
# keys of each one.
wordlist = dbconnector.getnavilist()

# Affix tables. The parser reads "navi" from all three, plus "id"/"gloss"
# from prefixes and "position" (0, 1 or 2) from infixes.
prefixes, infixes, postfixes = dbconnector.getaffixlists()
187	muzer	31
# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! If a word can take
# infixes or other affixes, consider adding it to the main wordlist above
# instead (see the examples there).
#
# Each entry has eight fields, in this order:
#   0  the surface form exactly as typed
#   1  the Na'vi root it is reported as
#   2  position-0 infix
#   3  position-1 infix
#   4  position-2 infix
#   5  prefixes  - () for none, otherwise one list of prefix strings per word
#   6  suffixes  - () for none, otherwise one list of (suffix, id) pairs per word
#   7  lenition flag, returned unchanged as the "len" field of the parse result
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    # (u"frato", u"to", u"", u"", u"", [[u"fra"]], (), False),
    (u"soaiä", u"soaia", u"", u"", u"", (), [[(u"ä", None)]], False),
    (u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (), False),
    (u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (), (), False),
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (), False),
    (u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (), False),
    (u"kawnga", u"kawng", u"", u"", u"", (), [[(u"a", None)]], False),
    (u"kawng", u"kawng", u"", u"", u"", (), (), False),
    (u"ka", u"ka", u"", u"", u"", (), (), False),
    (u"uo", u"uo", u"", u"", u"", (), (), False),
    (u"sìk", u"sìk", u"", u"", u"", (), (), False),
    (u"sim", u"sim", u"", u"", u"", (), (), False),  # probably not tsim lenited
)
		50
		51	#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
		52	#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
		53	#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
		54	#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
		55	#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
		56	#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62	szabot	57	#prefixesn = ur"(?P<npr>(?:(?:fì\|tsa)?(?:me\|pxe\|ay\|fra)?\|(?:fay)?\|(?:tsay)?)(?:fne)?(?:tì\|sä)?"
74	szabot	58	#prefixesv = ur"(?P<vpr>(?:nì\|sä\|tì\|rä'ä \|ke )?)"
56	szabot	59
def _extra_adposition(navi):
    """Return ``(navi, id)`` for the first dictionary entry spelled *navi*.

    Raises IndexError (the same exception type the original indexing
    expression produced) when the dictionary has no such entry, but with a
    message that names the missing word.
    """
    for entry in wordlist:
        if entry["navi"] == navi:
            return (navi, entry["id"])
    raise IndexError("word %r is missing from the dictionary" % (navi,))


# Words that act like adpositions but technically aren't. Each item is a
# (suffix, dictionary id) pair, the same shape used for adposition suffixes.
EXTRAADP = (_extra_adposition("to"), _extra_adposition("sì"))
283	muzer	61
# Lenition rules as (unlenited onset, lenited onset) pairs. Order matters:
# the digraphs come before the single letters they start with, and users of
# this table take the first rule whose onset matches.
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)


def _lenite_prefix(prefix):
    """Return the lenited twin of a prefix entry, or None if it cannot lenite.

    The twin keeps the original "id", swaps the leading consonant according
    to the first matching LENIT rule and gets ".LENTD" appended to its gloss.
    """
    navi = prefix["navi"]
    for hard, soft in LENIT:
        if navi.startswith(hard):
            return {
                "id": prefix["id"],
                "navi": soft + navi[len(hard):],
                "gloss": prefix["gloss"] + ".LENTD",
            }
    return None


# Let's lenit the prefixes
extraprefixes = [twin for twin in map(_lenite_prefix, prefixes) if twin is not None]

# Longest first, so that a longer prefix is tried before any shorter one.
prefixes = sorted(prefixes + extraprefixes, key=lambda x: len(x["navi"]), reverse=True)
		73
# Word-final syllables that may lose their vowel before a suffix, as
# (dictionary ending, shortened ending) pairs.
_ELIDED_ENDINGS = ((u"nga", u"ng"), (u"fo", u"f"), (u"po", u"p"), (u"tsa", u"ts"))


def _find_infixed_center(part, token, infix_slots):
    """Fill the infix markers of *part* so that the result occurs in *token*.

    *infix_slots* holds the candidate infixes for positions 0, 1 and 2 (each
    list includes the empty string). Combinations are tried in list order,
    position 0 outermost. Returns ``(center, [in0, in1, in2])`` for the first
    hit, or ``(u"", None)`` when nothing fits.
    """
    # Only infixes that occur somewhere in the token can possibly fit.
    usable = [[form for form in slot if form in token] for slot in infix_slots]
    for in1 in usable[0]:
        for in2 in usable[1]:
            for in3 in usable[2]:
                candidate = (part.replace(u"<0><1>", in1 + in2)
                             .replace(u"<2>", in3)
                             .replace(u"lll", u"l")
                             .replace(u"rrr", u"r"))
                if candidate and candidate in token:
                    return candidate, [in1, in2, in3]
    return u"", None


def _strip_prefixes(text, forms):
    """Peel known prefixes off the END of *text* (the part before the root).

    *forms* must be ordered longest first. Peeling stops when nothing
    matches or when the matching prefix was already used once. Returns
    ``(found prefixes, unexplained remainder)``.
    """
    found = []
    previous = None
    while text and previous != text:
        previous = text
        for form in forms:
            if text.endswith(form):
                if form not in found:
                    found.append(form)
                    # Not text[:-len(form)]: that would wipe the whole
                    # remainder if an empty prefix ever got into the table.
                    text = text[:len(text) - len(form)]
                break
    return found, text


def _strip_postfixes(text, candidates, protect_a_umlaut):
    """Peel known suffixes off the START of *text* (the part after the root).

    *candidates* are ``(suffix, id)`` pairs ordered longest first. Peeling
    stops when nothing matches or when the matching pair was already used.
    Returns ``(found pairs, unexplained remainder)``.
    """
    found = []
    previous = None
    while text and previous != text:
        previous = text
        for candidate in candidates:
            form = candidate[0]
            if text.startswith(form):
                # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                blocked = protect_a_umlaut and form == u"ä"
                if candidate not in found and not blocked:
                    found.append(candidate)
                    text = text[len(form):]
                break
    return found, text


def _match_entry(word, wordin, infix_slots, prefix_forms, postfix_candidates):
    """Try to read the leading tokens of *wordin* as dictionary entry *word*.

    Returns ``(prefixes, postfixes, infixes, lenited)`` on success and None
    otherwise. The first two are lists with one inner list per word of the
    entry.
    """
    parts = word["infix"].split(u" ")
    if len(wordin) < len(parts):
        return None
    found_prefs = []
    found_posts = []
    found_infixes = [u"", u"", u""]
    lenited = False
    for index, part in enumerate(parts):
        token = wordin[index]
        center = u""
        if u"<0>" in part:
            center, used = _find_infixed_center(part, token, infix_slots)
            if used is not None:
                found_infixes = used
        elif part in token:
            center = part
        # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
        if center == u"" and (token == u"paya" or part != u"pxay"):
            for hard, soft in LENIT:
                if part.startswith(hard):
                    softened = soft + part[len(hard):]
                    if softened in token:
                        lenited = True
                        center = softened
        if center == u"":
            for ending, shortened in _ELIDED_ENDINGS:
                if part.endswith(ending):
                    clipped = part[:-len(ending)] + shortened
                    if clipped in token:
                        center = clipped
        if center == u"":
            return None
        pieces = token.split(center)
        if len(pieces) != 2:
            # The root occurs more than once in the token; give up on it.
            return None
        before, after = pieces
        prefs, before = _strip_prefixes(before, prefix_forms)
        if before != u"":
            return None
        posts, after = _strip_postfixes(after, postfix_candidates, word["navi"] == u"pey")
        if after != u"":
            return None
        found_prefs.append(prefs)
        found_posts.append(posts)
    return found_prefs, found_posts, found_infixes, lenited


def parseword(wordin):
    """Identify the dictionary word at the start of the token list *wordin*.

    *wordin* is a list of lower-case tokens. ``wordin[0]`` starts the word,
    and a multi-word dictionary entry also consumes the tokens that follow.
    Entries of ``wordlist`` are tried in order and the first one that
    explains its tokens completely (root plus known affixes) wins.

    Returns a dict with these keys:
      "word" - the matching ``wordlist`` entry itself, or a placeholder
               ``{"id": 0, "navi": "[token]", "infix": "", "type": ""}``
               when nothing matches
      "pref" - prefixes found, one list per word of the entry
      "post" - ``(suffix, id)`` pairs found, one list per word of the entry
      "inf"  - the position 0, 1 and 2 infixes
      "len"  - whether a lenited form of the root was matched

    For an unknown word the affix fields are empty ("pref" and "post" are
    ``[[]]``), never leftovers from an unrelated dictionary entry. As a side
    effect the "navi" field of every visited ``wordlist`` entry is
    lower-cased in place.
    """
    for brokenword in BROKENWORDS: # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            # No early exit: if several entries share the root, the last one wins.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4])}

    # These tables do not depend on the entry being tried, so build them
    # once per call rather than once per loop iteration.
    infix_slots = [[x["navi"] for x in infixes if x["position"] == slot] + [u""] for slot in range(3)]
    prefix_forms = [x["navi"] for x in prefixes]
    postfix_candidates = sorted(
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"].lower(), x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP),
        key=lambda x: len(x[0]), reverse=True)

    for word in wordlist:
        word["navi"] = word["navi"].lower()
        match = _match_entry(word, wordin, infix_slots, prefix_forms, postfix_candidates)
        if match is not None:
            foundprefs, foundposts, foundins, lenited = match
            return {"word": word, "pref": foundprefs, "post": foundposts, "inf": foundins, "len": lenited}

    return {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}, "pref": [[]], "post": [[]], "inf": [u"", u"", u""], "len": False}
		205
		206	def parsesent(sent):
101	szabot	207	sent = sent.strip().lower().replace(u"’", u"'")
283	muzer	208	sent = re.sub(r"[^\wìä' ]", u"", sent)
		209	sent = re.sub(r"\ +", u" ", sent)
89	szabot	210	sent = sent.split(u" ")
77	szabot	211	ret = []
		212	left = len(sent)
		213	while left:
246	szabot	214	word = parsenum.parse(sent[len(sent) - left])
103	szabot	215	if word == None:
		216	word = parseword(sent[-left:])
78	szabot	217	left -= len(word["word"]["navi"].split(" "))
77	szabot	218	ret.append(word)
136	muzer	219	return ret

Subversion Repositories navi

(root)/tsimapiak/parse.py @ 301 - Rev 289