Rev 297 | Rev 300 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
56 | szabot | 1 | #!/usr/bin/python |
2 | # -*- coding: utf-8 -*- |
||
176 | muzer | 3 | # This file is part of Tsim Apiak. |
4 | # |
||
5 | # Tsim Apiak is free software: you can redistribute it and/or modify |
||
6 | # it under the terms of the GNU General Public Licence as published by |
||
7 | # the Free Software Foundation, either version 3 of the Licence, or |
||
8 | # (at your option) any later version. |
||
9 | # |
||
10 | # In addition to this, you must also comply with clause 4 of the |
||
11 | # Apache Licence, version 2.0, concerning attribution. Where there |
||
12 | # is a contradiction between the two licences, the GPL |
||
13 | # takes preference. |
||
14 | # |
||
186 | szabot | 15 | # Tsim Apiak is distributed in the hope that it will be useful, |
176 | muzer | 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
18 | # GNU General Public License for more details. |
||
19 | # |
||
20 | # You should have received a copy of the GNU General Public License |
||
21 | # along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>. |
||
56 | szabot | 22 | |
283 | muzer | 23 | import tsimapiak.dbconnector as dbconnector |
24 | import tsimapiak.parsenum as parsenum |
||
297 | muzer | 25 | import itertools |
246 | szabot | 26 | import re |
56 | szabot | 27 | |
# XXX HACK - extra proper nouns (tawtute, na'vi) used to be injected into the
# wordlist here; they now live in Eana Eltu itself.
wordlist = dbconnector.getnavilist()

prefixes, infixes, postfixes = dbconnector.getaffixlists()

# XXX HACK - Words that are either not in Eana Eltu, or that get interpreted
# wrongly for whatever reason.  The latter should be removed from this list
# when the parser gets more sophisticated; the former should also have an
# entry in the equivalent array in the translator!  If a word can take
# affixes, it belongs in the main wordlist instead.
# Entry layout: (surface form, Na'vi root, pos-0 infix, pos-1 infix,
#                pos-2 infix, prefixes, suffixes, lenited?, original root)
BROKENWORDS = (
    # otherwise parses as sa (tsa- lenited) + mi
    (u"sami", u"si", u"", u"am", u"", (), (), False, "si"),
    # irregular form, does not parse at all
    (u"soaiä", u"soaia", u"", u"", u"", (), [[(u"ä", None)]], False, "soaia"),
    # otherwise parses as kìm (spin) + ä (genitive)
    (u"kìmä", u"kä", u"", u"ìm", u"", (), (), False, "kä"),
    # otherwise parses as apxa + -y (genitive)
    (u"apxay", u"pxay", u"", u"", u"", [[(u"a", "a")]], (), False, "pxay"),
    # otherwise parses as tsa'u abbreviated (special case)
    (u"tse", u"tse", u"", u"", u"", (), (), False, "tse"),
)
52 | |||
# Historical hand-maintained affix tables and regexes, kept for reference
# only; the live affix data now comes from dbconnector.getaffixlists() above.
#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

# Irregular infix spellings that Eana Eltu does not carry; merged into the
# infix table below.  Negative ids mark synthetic entries; "orig_navi" is the
# canonical spelling the variant stands for.
EXTRAINFIXES = [
    {"id": "-1", "navi": "eiy", "orig_navi": "ei", "gloss": "LAUD.", "position": 2},
    {"id": "-2", "navi": "eng", "orig_navi": "äng", "gloss": "PEJ.", "position": 2},
]

# Likewise for postfixes: -eyä as an irregular realisation of the genitive -yä.
EXTRAPOSTFIXES = [
    {"id": "-3", "navi": "eyä", "orig_navi": "yä", "gloss": "GEN."},
]

EXTRAADP = (("to", [x["id"] for x in wordlist if x["navi"] == "to"][0]), ("sì", [x["id"] for x in wordlist if x["navi"] == "sì"][0])) # words that act like adpositions but technically aren't

# Lenition table: each fortis onset and the lenis form it weakens to.
# Order matters: digraphs (px, tx, kx, ts) must be tried before t/p/k.
LENIT = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))
91 | szabot | 74 | |
# Generate lenited variants of the prefixes so that lenition applied to a
# prefixed word can still be recognised.
extraprefixes = []
for prefix in prefixes:
    for letter, replacement in LENIT:
        if not prefix["navi"].startswith(letter):
            continue
        new_prefix = prefix["navi"].replace(letter, replacement, 1)
        # always assume a dictionary word over a lenited prefix
        if not any(x["navi"] == new_prefix for x in prefixes):
            extraprefixes.append({"id": prefix["id"], "navi": new_prefix, "gloss": prefix["gloss"] + ".LENTD", "orig_navi": prefix["navi"]})
        break

# Longest-first ordering so greedy affix stripping prefers the longest match.
def _affix_length(affix):
    return len(affix["navi"])

prefixes = sorted(prefixes + extraprefixes, key=_affix_length, reverse=True)
infixes = sorted(infixes + EXTRAINFIXES, key=_affix_length, reverse=True)
postfixes = sorted(postfixes + EXTRAPOSTFIXES, key=_affix_length, reverse=True)
||
289 | muzer | 88 | |
# Generate every lenited variant of every dictionary word.  A multi-word
# entry may have any subset of its parts lenited, so all combinations of the
# lenitable positions are produced.
extrawords = []
for word in wordlist:
    parts = word["navi"].split(" ")
    infix_parts = word["infix"].split(" ")
    lenited_part = {}
    lenited_infix = {}
    # Record, per position, the lenited spelling of any part that starts
    # with a lenitable onset.
    for idx, part in enumerate(parts):
        for letter, replacement in LENIT:
            if part.startswith(letter):
                lenited_part[idx] = part.replace(letter, replacement, 1)
                lenited_infix[idx] = infix_parts[idx].replace(letter, replacement, 1)
                break

    # Every non-empty subset of the lenitable positions yields one variant.
    positions = list(lenited_part.keys())
    subsets = itertools.chain.from_iterable(
        itertools.combinations(positions, size) for size in range(1, len(positions) + 1))
    for chosen in subsets:
        new_word = ""
        new_infix = ""
        for idx, part in enumerate(parts):
            if idx in chosen:
                new_word += lenited_part[idx]
                new_infix += lenited_infix[idx]
            else:
                new_word += part
                new_infix += infix_parts[idx]
            new_word += " "
            new_infix += " "
        print(f"Generated lenited {new_word} from {word['navi']}")
        # Trim the trailing separator appended after the final part.
        new_word = new_word[:-1]
        new_infix = new_infix[:-1]
        extrawords.append({"id": word["id"], "navi": new_word, "infix": new_infix, "type": word["type"], "lenited": True, "orig_navi": word["navi"]})

# Longest words first; among equal lengths, unlenited entries win (the +1).
wordlist = sorted(wordlist + extrawords, key=lambda x: len(x["navi"]) * 2 + (0 if x["lenited"] else 1), reverse=True)
122 | |||
def parseword(wordin):
    """Analyse the first word of wordin (the remaining sentence words).

    Returns a dict with keys "word" (the matched dictionary entry), "pref",
    "post" and "inf" (the prefixes, postfixes and three infix-slot values
    stripped off, per word part) and "len" (True when a lenited form was
    matched).  Unrecognised words come back with id 0 and navi "[word]".
    """
    tempid = 0
    temptype = u""
    # XXX HACK - this is all code to work around bugs that shouldn't exist:
    # hard-coded analyses for words the general algorithm gets wrong.
    for brokenword in BROKENWORDS:
        if wordin[0] == brokenword[0]:
            # Look up id/type of the hard-coded root, then return the
            # pre-canned affix data verbatim.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype, "orig_navi": brokenword[8]}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4]) }
    # Fallback result: the word echoed back in brackets, id 0.
    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u"", "orig_navi": "[" + wordin[0] + "]"}, "len": False}
    for word in wordlist:
        # NOTE(review): this lowercases (and so mutates) the shared wordlist
        # entries in place on every call.
        word["navi"] = word["navi"].lower()
        foundit = True
        foundprefs = []
        foundposts = []
        # Multi-word entries are matched part by part against the input.
        splitword = word["infix"].split(u" ")
        foundins = [u"", u"", u""]
        if len(wordin) < len(splitword):
            foundit = False
            continue
        for wor in range(len(splitword)):
            if not foundit:
                break
            foundprefs.append([])
            foundposts.append([])
            center = u""
            if u"<0>" in splitword[wor]:
                # Verb template containing infix slots <0><1>...<2>.  First
                # collect, per slot, every infix that occurs anywhere in the
                # input word (plus the empty infix)...
                tempin1 = []
                tempin2 = []
                tempin3 = []
                for in1 in [x["navi"] for x in infixes if x["position"] == 0] + [""]:
                    if in1 in wordin[wor]:
                        tempin1.append(in1)
                for in2 in [x["navi"] for x in infixes if x["position"] == 1] + [""]:
                    if in2 in wordin[wor]:
                        tempin2.append(in2)
                for in3 in [x["navi"] for x in infixes if x["position"] == 2] + [""]:
                    if in3 in wordin[wor]:
                        tempin3.append(in3)
                # ...then try every combination until the filled-in template
                # is a substring of the input word.  The lll/rrr collapse
                # presumably handles ll/rr pseudovowel roots — TODO confirm.
                for in1 in tempin1:
                    for in2 in tempin2:
                        for in3 in tempin3:
                            if splitword[wor].replace(u"<0><1>", in1 + in2).replace(u"<2>", in3).replace(u"lll", u"l").replace(u"rrr", u"r") in wordin[wor]:
                                center = splitword[wor].replace(u"<0><1>", in1 + in2).replace(u"<2>", in3).replace(u"lll", u"l").replace(u"rrr", u"r")
                                foundins = [in1, in2, in3]
                                break
                        if center != u"":
                            break
                    if center != u"":
                        break
            else:
                if splitword[wor] in wordin[wor]:
                    center = splitword[wor]
            if center == u"":
                # Irregular combining stems: pronouns and similar words swap
                # their final vowel before certain endings (e.g. nga -> nge-),
                # so retry with the altered stem.
                if splitword[wor].endswith(u"nga"):
                    temp = splitword[wor][:-3] + u"nge"
                    if temp in wordin[wor]:
                        center = temp
                if splitword[wor].endswith(u"fo"):
                    temp = splitword[wor][:-2] + u"fe"
                    if temp in wordin[wor]:
                        center = temp
                if splitword[wor].endswith(u"po"):
                    temp = splitword[wor][:-2] + u"pe"
                    if temp in wordin[wor]:
                        center = temp
                if splitword[wor].endswith(u"tsa"):
                    temp = splitword[wor][:-3] + u"tse"
                    if temp in wordin[wor]:
                        center = temp
                if splitword[wor].endswith(u"fko"):
                    temp = splitword[wor][:-3] + u"fke"
                    if temp in wordin[wor]:
                        center = temp
                if splitword[wor].endswith(u"sa'u"):
                    temp = splitword[wor][:-4] + u"se"
                    if temp in wordin[wor]:
                        center = temp
                if splitword[wor].endswith(u"sa"):
                    temp = splitword[wor][:-2] + u"se"
                    if temp in wordin[wor]:
                        center = temp
                if splitword[wor].endswith(u"sno"):
                    temp = splitword[wor][:-3] + u"sne"
                    if temp in wordin[wor]:
                        center = temp
                if splitword[wor].endswith(u"ayla"):
                    temp = splitword[wor][:-3] + u"ayle"
                    if temp in wordin[wor]:
                        center = temp
            if center == u"":
                foundit = False
                break
            # Whatever surrounds the matched core must be exactly one prefix
            # string and one suffix string.
            temp = wordin[wor].split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp
            # Strip known prefixes from the end of the prefix string until no
            # more progress is made; each prefix may be taken only once.
            last = u""
            while last != pref:
                last = pref
                for pre in prefixes:
                    if pref != u"":
                        if pref.endswith(pre["navi"]):
                            if pre["navi"] in foundprefs[wor]:
                                break
                            foundprefs[wor].append((pre["navi"], pre["orig_navi"])) # only needed here, to handle lenition
                            pref = pref[:-len(pre["navi"])]
                            break
            if pref != u"":
                foundit = False
                foundprefs = []
                break
            # Strip postfixes (and adposition words, longest first) from the
            # front of the suffix string in the same fixed-point manner.
            last = u""
            while last != posf:
                last = posf
                for pos, posid in sorted([(x["navi"], None) for x in postfixes] + [(x["navi"], x["id"]) for x in wordlist if x["type"] == "adp."] + list(EXTRAADP), key=lambda x: len(x[0]), reverse=True):
                    if posf != u"":
                        if posf.startswith(pos):
                            if (pos, posid) in foundposts[wor]:
                                break
                            if pos != u"ä" or word["orig_navi"] != u"pey": # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                                foundposts[wor].append((pos, posid))
                                posf = posf[len(pos):]
                                break
                            else:
                                break
            if posf != u"":
                foundit = False
                foundposts = []
                break
        if foundit == True:
            foundword = word
            break
    # NOTE(review): on a failed search these carry the affixes of the last
    # wordlist candidate tried, not empty lists — presumably intentional for
    # diagnostics; verify against callers.
    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    if foundit == True:
        ret["len"] = word["lenited"]
        ret["word"] = foundword
    return ret
265 | |||
def parsesent(sent):
    """Parse a whole sentence string into a list of analysed words.

    Normalises the input (lowercase, typographic apostrophes, stripping
    everything but word characters/ì/ä/apostrophe/space), then walks the
    sentence left to right.  Each position is first tried as a number via
    parsenum.parse; otherwise parseword analyses it (consuming more than one
    token when a multi-word dictionary entry matches).
    """
    sent = sent.strip().lower().replace(u"’", u"'")
    sent = re.sub(r"[^\wìä' ]", u"", sent)
    sent = re.sub(r"\ +", u" ", sent)
    sent = sent.split(u" ")
    ret = []
    left = len(sent)
    while left:
        word = parsenum.parse(sent[len(sent) - left])
        if word is None:  # fixed: identity comparison with None (was ==)
            word = parseword(sent[-left:])
        # Multi-word matches consume several sentence tokens at once.
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret