WebSVN - navi - Blame - Rev 296 - /tsimapiak/parse.py

Rev	Author	Line No.	Line
56	szabot	1	#!/usr/bin/python
		2	# -- coding: utf-8 --
176	muzer	3	# This file is part of Tsim Apiak.
		4	#
		5	# Tsim Apiak is free software: you can redistribute it and/or modify
		6	# it under the terms of the GNU General Public Licence as published by
		7	# the Free Software Foundation, either version 3 of the Licence, or
		8	# (at your option) any later version.
		9	#
		10	# In addition to this, you must also comply with clause 4 of the
		11	# Apache Licence, version 2.0, concerning attribution. Where there
		12	# is a contradiction between the two licences, the GPL
		13	# takes preference.
		14	#
186	szabot	15	# Tsim Apiak is distributed in the hope that it will be useful,
176	muzer	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
		17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		18	# GNU General Public License for more details.
		19	#
		20	# You should have received a copy of the GNU General Public License
		21	# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.
56	szabot	22
283	muzer	23	import tsimapiak.dbconnector as dbconnector
		24	import tsimapiak.parsenum as parsenum
246	szabot	25	import re
56	szabot	26
221	muzer	27	#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
# Dictionary entries loaded from the database. The code in this module reads
# the keys "id", "navi", "infix", "type" and "lenited" from each entry.
wordlist = dbconnector.getnavilist()

# Affix tables loaded from the database. This module reads "id", "navi" and
# "gloss" from prefix entries, "navi" and "position" from infix entries and
# "navi" from postfix entries.
prefixes, infixes, postfixes = dbconnector.getaffixlists()
187	muzer	31
# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! If it can take
# infixes, consider adding it to the main wordlist above (see the examples).
# Things that can take affixes should go in the above list instead.
#
# Each entry is an 8-tuple, in this order:
#   0: original (the exact token that triggers the override)
#   1: Na'vi root (looked up in wordlist to obtain its id and type)
#   2: 0-pos infix
#   3: 1-pos infix
#   4: 2-pos infix
#   5: prefixes
#   6: suffixes
#   7: value returned by parseword() under the "len" key
# Note that (()) is simply an empty tuple.
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (()), (()), False), # otherwise parses as sa (tsa-lenited) + mi
    #(u"to", u"to", u"", u"", u"", (()), (()), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]], (()), False),
    (u"soaiä", u"soaia", u"", u"", u"", (()), [[(u"ä", None)]], False), # does not parse, irregular form
    #(u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (()), False),
    #(u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (()), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (()), (()), False), # otherwise parses as kìm (spin) + ä (genitive)
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], (()), False), # otherwise parses as apxa + -y (genitive)
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (()), False),
    #(u"kawnga", u"kawng", u"", u"", u"", (()), [[(u"a", None)]], False),
    #(u"kawng", u"kawng", u"", u"", u"", (()), (()), False),
    #(u"ka", u"ka", u"", u"", u"", (()), (()), False),
    #(u"uo", u"uo", u"", u"", u"", (()), (()), False),
    #(u"sìk", u"sìk", u"", u"", u"", (()), (()), False),
    #(u"sim", u"sim", u"", u"", u"", (()), (()), False), # probably not tsim lenited
    (u"tse", u"tse", u"", u"", u"", (()), (()), False), # otherwise parses as tsa'u abbreviated (special case)
)
		51
		52	#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
		53	#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
		54	#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
		55	#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
		56	#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
		57	#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62	szabot	58	#prefixesn = ur"(?P<npr>(?:(?:fì\|tsa)?(?:me\|pxe\|ay\|fra)?\|(?:fay)?\|(?:tsay)?)(?:fne)?(?:tì\|sä)?"
74	szabot	59	#prefixesv = ur"(?P<vpr>(?:nì\|sä\|tì\|rä'ä \|ke )?)"
56	szabot	60
# Extra infix forms that are merged into the database infix list further down
# in this module. They use the same keys as the database entries.
EXTRAINFIXES = [
    {"id": "-1", "navi": "eiy", "gloss": "LAUD.", "position": 2},
    {"id": "-2", "navi": "eng", "gloss": "PEJ.", "position": 2},
]

# Extra postfix forms that are merged into the database postfix list further
# down in this module.
EXTRAPOSTFIXES = [
    {"id": "-3", "navi": "eyä", "gloss": "GEN."},
]

# (form, dictionary id) pairs. The id is taken from the first wordlist entry
# whose "navi" equals the form, so importing this module raises IndexError if
# either "to" or "sì" is missing from the dictionary.
EXTRAADP = (("to", [x["id"] for x in wordlist if x["navi"] == "to"][0]), ("sì", [x["id"] for x in wordlist if x["navi"] == "sì"][0])) # words that act like adpositions but technically aren't

# Lenition table: (initial consonant, lenited replacement). The digraphs are
# listed before the single letters they start with, so code that scans this
# table with startswith() meets the digraph first.
LENIT = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))
91	szabot	73
# Build lenited variants of the prefixes. A variant is only added when its
# spelling is not already an existing prefix, and only the first matching
# LENIT rule is applied to each prefix.
known_prefix_forms = set(x["navi"] for x in prefixes)
extraprefixes = []
for prefix in prefixes:
    for letter, replacement in LENIT:
        if not prefix["navi"].startswith(letter):
            continue
        new_prefix = prefix["navi"].replace(letter, replacement, 1)
        if new_prefix not in known_prefix_forms: # always assume a dictionary word over a lenited prefix
            extraprefixes.append({
                "id": prefix["id"],
                "navi": new_prefix,
                "gloss": prefix["gloss"] + ".LENTD",
            })
        break

# Merge in the extra forms and order every affix table longest-first
# (the sort is stable, so equal-length entries keep their relative order).
prefixes = prefixes + extraprefixes
prefixes.sort(key=lambda x: len(x["navi"]), reverse=True)
infixes = infixes + EXTRAINFIXES
infixes.sort(key=lambda x: len(x["navi"]), reverse=True)
postfixes = postfixes + EXTRAPOSTFIXES
postfixes.sort(key=lambda x: len(x["navi"]), reverse=True)
289	muzer	87
# Let's lenit the dictionary
# For every entry whose "navi" form starts with a lenitable consonant, add a
# copy with the lenited spelling (same id and type) flagged "lenited": True.
extrawords = []
for word in wordlist:
    for letter, replacement in LENIT:
        if word["navi"].startswith(letter):
            new_word = word["navi"].replace(letter, replacement, 1)
            new_infix = word["infix"].replace(letter, replacement, 1)
            extrawords.append({"id": word["id"], "navi": new_word, "infix": new_infix, "type": word["type"], "lenited": True})
            # Only the first matching rule applies, as in the prefix loop.
            # Without this, a word starting with a digraph such as "px" also
            # matched the later single-letter rule ("p" -> "f") and produced
            # a bogus second entry ("fx...").
            break
# Longest words first; among words of equal length, non-lenited entries are
# tried before lenited ones.
wordlist = sorted(wordlist + extrawords, key=lambda x: len(x["navi"]) * 2 + (0 if x["lenited"] else 1), reverse=True)
		97
# Alternative spellings of dictionary-word endings that parseword() also
# accepts for entries without infix markers: (dictionary ending, accepted
# ending). Every row is tried in order and the last row that matches wins.
_ENDING_CHANGES = (
    (u"nga", u"nge"),
    (u"fo", u"fe"),
    (u"po", u"pe"),
    (u"tsa", u"tse"),
    (u"fko", u"fke"),
    (u"sa'u", u"se"),
    (u"sa", u"se"),
    (u"sno", u"sne"),
    (u"ayla", u"ayle"),
)


def _find_center(pattern, token, infix_options):
    """Find the part of *token* that realises the dictionary *pattern*.

    pattern is one space-separated piece of an entry's "infix" field and
    infix_options holds the candidate infix strings for positions 0, 1 and 2.
    Returns (center, infixes): center is the matched substring (u"" when
    nothing matched) and infixes is the [pos0, pos1, pos2] combination that
    produced it, or None when no infix combination was selected.
    """
    center = u""
    chosen = None
    if u"<0>" in pattern:
        # Only infixes that occur somewhere in the token can be part of it.
        present = [[inf for inf in options if inf in token] for options in infix_options]
        for in1 in present[0]:
            for in2 in present[1]:
                for in3 in present[2]:
                    candidate = pattern.replace(u"<0><1>", in1 + in2).replace(u"<2>", in3).replace(u"lll", u"l").replace(u"rrr", u"r")
                    if candidate in token:
                        center = candidate
                        chosen = [in1, in2, in3]
                        break
                if center != u"":
                    break
            if center != u"":
                break
    else:
        if pattern in token:
            center = pattern
        if center == u"":
            for ending, changed in _ENDING_CHANGES:
                if pattern.endswith(ending):
                    # Slice by the real ending length; the "ayla" rule used
                    # to drop only three characters and built "...aayle".
                    temp = pattern[:-len(ending)] + changed
                    if temp in token:
                        center = temp
    return center, chosen


def _strip_prefixes(pref, prefix_forms, found):
    """Peel known prefixes off the end of *pref*, recording them in *found*.

    Each prefix is accepted at most once. Returns what is left of pref;
    u"" means the whole string was explained by prefixes.
    """
    while pref:
        before = pref
        for pre in prefix_forms:
            if not pref.endswith(pre):
                continue
            if pre not in found:
                found.append(pre)
                pref = pref[:-len(pre)]
            break
        if pref == before:
            break
    return pref


def _strip_postfixes(posf, postfix_forms, found, navi):
    """Peel known postfixes off the start of *posf*, recording them in *found*.

    postfix_forms is a sequence of (form, id) pairs and the same pairs are
    appended to found. Each pair is accepted at most once. navi is the
    dictionary word being tried. Returns what is left of posf.
    """
    while posf:
        before = posf
        for pos, posid in postfix_forms:
            if not posf.startswith(pos):
                continue
            if (pos, posid) not in found:
                if pos != u"ä" or (navi != u"pey" and navi != "fey"): # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                    found.append((pos, posid))
                    posf = posf[len(pos):]
            break
        if posf == before:
            break
    return posf


def parseword(wordin):
    """Parse the word at the start of *wordin* into root, affixes and infixes.

    wordin is a non-empty sequence of word tokens. wordin[0] is the word to
    parse; the following tokens are only consulted when a dictionary entry
    consists of several space-separated words.

    Returns a dict with the keys:
      "word": the matching wordlist entry. For unknown words this is a
              placeholder with id 0 and the token wrapped in square brackets.
      "pref": per entry word, the list of prefixes found.
      "post": per entry word, the list of (postfix, id) pairs found; id is
              None for plain postfixes and the dictionary id for adpositions.
      "inf":  the infixes found for positions 0, 1 and 2.
      "len":  the entry's "lenited" flag (False for unknown words).
    For unknown words "pref", "post" and "inf" hold whatever was left over
    from the last dictionary entry tried, as before.

    Side effect: lower-cases the "navi" field of the wordlist entries it
    visits.
    """
    for brokenword in BROKENWORDS: # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4]) }

    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}, "len": False}

    # These tables do not depend on the entry being tried, so build them once
    # per call instead of once per entry (or once per stripping pass).
    infix_options = [[x["navi"] for x in infixes if x["position"] == position] + [""] for position in (0, 1, 2)]
    prefix_forms = [x["navi"] for x in prefixes]
    # Adposition spellings are lower-cased here because the loop below
    # lower-cases the dictionary entries as it goes.
    postfix_forms = sorted(
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"].lower(), x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP),
        key=lambda x: len(x[0]), reverse=True)

    # Defaults so that an empty wordlist yields the "unknown word" result
    # instead of a NameError.
    foundit = False
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    foundword = None

    for word in wordlist:
        word["navi"] = word["navi"].lower()
        foundit = True
        foundprefs = []
        foundposts = []
        foundins = [u"", u"", u""]
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough tokens left for this multi-word entry.
            foundit = False
            continue
        for wor, pattern in enumerate(splitword):
            foundprefs.append([])
            foundposts.append([])
            center, chosen = _find_center(pattern, wordin[wor], infix_options)
            if chosen is not None:
                foundins = chosen
            if center == u"":
                foundit = False
                break
            # The root must occur exactly once: what precedes it has to be
            # made of prefixes, what follows it of postfixes.
            temp = wordin[wor].split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp
            if _strip_prefixes(pref, prefix_forms, foundprefs[wor]) != u"":
                foundit = False
                foundprefs = []
                break
            if _strip_postfixes(posf, postfix_forms, foundposts[wor], word["navi"]) != u"":
                foundit = False
                foundposts = []
                break
        if foundit:
            foundword = word
            break

    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    if foundit:
        ret["len"] = foundword["lenited"]
        ret["word"] = foundword
    return ret
		240
		241	def parsesent(sent):
101	szabot	242	sent = sent.strip().lower().replace(u"’", u"'")
283	muzer	243	sent = re.sub(r"[^\wìä' ]", u"", sent)
		244	sent = re.sub(r"\ +", u" ", sent)
89	szabot	245	sent = sent.split(u" ")
77	szabot	246	ret = []
		247	left = len(sent)
		248	while left:
246	szabot	249	word = parsenum.parse(sent[len(sent) - left])
103	szabot	250	if word == None:
		251	word = parseword(sent[-left:])
78	szabot	252	left -= len(word["word"]["navi"].split(" "))
77	szabot	253	ret.append(word)
136	muzer	254	return ret

Subversion Repositories navi

(root)/tsimapiak/parse.py @ 301 - Rev 296