#!/usr/bin/python3
# -*- coding: utf-8 -*-
#    This file is part of Tsim Apiak.
#
#    Tsim Apiak is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public Licence as published by
#    the Free Software Foundation, either version 3 of the Licence, or
#    (at your option) any later version.
#
#    In addition to this, you must also comply with clause 4 of the
#    Apache Licence, version 2.0, concerning attribution. Where there
#    is a contradiction between the two licences, the GPL
#    takes preference.
#
#    Tsim Apiak is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with Tsim Apiak.  If not, see <http://www.gnu.org/licenses/>.

import itertools
import re

import tsimapiak.dbconnector as dbconnector
import tsimapiak.parsenum as parsenum

#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
wordlist = dbconnector.getnavilist()

prefixes, infixes, postfixes = dbconnector.getaffixlists()

# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated; the former should also
# have an entry in the equivalent array in the translator. Words that can take
# affixes should go in the main wordlist above instead (see the examples).
# The order of each entry is: original word, Na'vi root, 0-pos infix,
# 1-pos infix, 2-pos infix, prefixes, suffixes, lenited flag, original
# dictionary form.
BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", (()), (()), False, "si"), # otherwise parses as sa (tsa-lenited) + mi
    #(u"to", u"to", u"", u"", u"", (()), (()), False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]],  (()), False),
    (u"soaiä", u"soaia", u"", u"", u"", (()), [[(u"ä", None)]], False, "soaia"), # does not parse, irregular form
    #(u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], (()), False),
    #(u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], (()), False),
    (u"kìmä", u"kä", u"", u"ìm", u"", (()), (()), False, "kä"), # otherwise parses as kìm (spin) + ä (genitive)
    (u"apxay", u"pxay", u"", u"", u"", [[(u"a", "a")]], (()), False, "pxay"), # otherwise parses as apxa + -y (genitive)
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (()), False),
    #(u"kawnga", u"kawng", u"", u"", u"", (()), [[(u"a", None)]], False),
    #(u"kawng", u"kawng", u"", u"", u"", (()), (()), False),
    #(u"ka", u"ka", u"", u"", u"", (()), (()), False),
    #(u"uo", u"uo", u"", u"", u"", (()), (()), False),
    #(u"sìk", u"sìk", u"", u"", u"", (()), (()), False),
    #(u"sim", u"sim", u"", u"", u"", (()), (()), False), # probably not tsim lenited
    (u"tse", u"tse", u"", u"", u"", (()), (()), False, "tse"), # otherwise parses as tsa'u abbreviated (special case)
    (u"por", u"po", u"", u"", u"", (()), [[("r", None)]], False, "po"), # otherwise parses as lenited pxor which is unlikely
)

BANNEDNUMBERS = { # words which must not be parsed by the number parser
    "pey" # more likely dictionary word pey than lenited pxey 3
}

#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

EXTRAINFIXES = [
    {"id": "-1", "navi": "eiy", "orig_navi": "ei", "gloss": "LAUD.", "position": 2},
    {"id": "-2", "navi": "eng", "orig_navi": "äng", "gloss": "PEJ.", "position": 2},
]

EXTRAPOSTFIXES = [
    {"id": "-3", "navi": "eyä", "orig_navi": "yä", "gloss": "GEN."},
    {"id": "-4", "navi": "pxì", "orig_navi": "pxì", "gloss": "FRAC."},
]

# words that act like adpositions but technically aren't
EXTRAADP = (("to", [x["id"] for x in wordlist if x["navi"] == "to"][0]),
            ("sì", [x["id"] for x in wordlist if x["navi"] == "sì"][0]))

# Lenition pairs; the digraphs (px, tx, kx, ts) are listed before their
# single-letter counterparts so that they are matched first.
LENIT = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))

# Let's lenit the prefixes
extraprefixes = []
for prefix in prefixes:
    for letter, replacement in LENIT:
        if prefix["navi"].startswith(letter):
            new_prefix = prefix["navi"].replace(letter, replacement, 1)
            if not [x for x in prefixes if x["navi"] == new_prefix]: # prefer an existing prefix over a lenited one
                extraprefixes.append({"id": prefix["id"], "navi": new_prefix, "gloss": prefix["gloss"] + ".LENTD", "orig_navi": prefix["navi"]})
            break

prefixes = sorted(prefixes + extraprefixes, key=lambda x: len(x["navi"]), reverse=True)
infixes = sorted(infixes + EXTRAINFIXES, key=lambda x: len(x["navi"]), reverse=True)
postfixes = sorted(postfixes + EXTRAPOSTFIXES, key=lambda x: len(x["navi"]), reverse=True)
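
# Illustration only (hypothetical affix-table entry, not necessarily in the
# database): a prefix row such as {"id": 42, "navi": "pxe", "gloss": "TRI"}
# begins with "px", so the loop above would add a lenited twin
# {"id": 42, "navi": "pe", "gloss": "TRI.LENTD", "orig_navi": "pxe"},
# unless some other prefix already spells "pe", in which case the variant
# is skipped.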
 
# Let's lenit the dictionary
extrawords = []
for word in wordlist:
    splitword = word["navi"].split(" ")
    splitinfix = word["infix"].split(" ")
    lenitword = {}
    lenitinfix = {}
    for i, wor in enumerate(splitword):
        for letter, replacement in LENIT:
            if wor.startswith(letter):
                lenitword[i] = wor.replace(letter, replacement, 1)
                lenitinfix[i] = splitinfix[i].replace(letter, replacement, 1)
                break

    # every non-empty subset of the lenitable word positions gets its own variant
    s = list(lenitword.keys())
    for lenits in itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(1, len(s)+1)):
        new_word = ""
        new_infix = ""
        for i, wor in enumerate(splitword):
            if i in lenits:
                new_word += lenitword[i]
                new_infix += lenitinfix[i]
            else:
                new_word += wor
                new_infix += splitinfix[i]
            new_word += " "
            new_infix += " "
        print(f"Generated lenited {new_word} from {word['navi']}")
        new_word = new_word[:-1]
        new_infix = new_infix[:-1]
        extrawords.append({"id": word["id"], "navi": new_word, "infix": new_infix, "type": word["type"], "lenited": True, "orig_navi": word["navi"]})

wordlist = sorted(wordlist + extrawords, key=lambda x: len(x["navi"]) * 2 + (0 if x["lenited"] else 1), reverse=True)
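
# Worked example (hypothetical two-word entry, not taken from Eana Eltu): for
# u"txon kato", both parts can lenit ("txon" -> "ton", "kato" -> "hato"), so
# the subsets {0}, {1} and {0, 1} above yield three extra entries, u"ton kato",
# u"txon hato" and u"ton hato", each flagged "lenited": True and keeping
# orig_navi = u"txon kato".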
 
# TODO add reef Na'vi

def parseword(wordin):
    """Try to parse the first word of wordin (the remaining words of the
    sentence) against the word list, returning the matched dictionary entry
    together with any prefixes, infixes and postfixes found."""
    tempid = 0
    temptype = u""
    for brokenword in BROKENWORDS: # XXX HACK - this is all code to work around bugs that shouldn't exist
        if wordin[0] == brokenword[0]:
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype, "orig_navi": brokenword[8]}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4])}
    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u"", "orig_navi": "[" + wordin[0] + "]"}, "len": False, "pref": [], "post": [], "inf": ["", "", ""]}
    for word in wordlist:
        word["navi"] = word["navi"].lower()
        foundit = True
        foundprefs = []
        foundposts = []
        splitword = word["infix"].split(u" ")
        foundins = [u"", u"", u""]
        if len(wordin) < len(splitword):
            foundit = False
            continue
        for wor in range(len(splitword)):
            if not foundit:
                break
            foundprefs.append([])
            foundposts.append([])
            center = u""
            if u"<0>" in splitword[wor]:
                # try every combination of infixes that occur in the input word
                tempin1 = []
                tempin2 = []
                tempin3 = []
                for in1 in [x["navi"] for x in infixes if x["position"] == 0] + [""]:
                    if in1 in wordin[wor]:
                        tempin1.append(in1)
                for in2 in [x["navi"] for x in infixes if x["position"] == 1] + [""]:
                    if in2 in wordin[wor]:
                        tempin2.append(in2)
                for in3 in [x["navi"] for x in infixes if x["position"] == 2] + [""]:
                    if in3 in wordin[wor]:
                        tempin3.append(in3)
                for in1 in tempin1:
                    for in2 in tempin2:
                        for in3 in tempin3:
                            if splitword[wor].replace(u"<0><1>", in1 + in2).replace(u"<2>", in3).replace(u"lll", u"l").replace(u"rrr", u"r") in wordin[wor]:
                                center = splitword[wor].replace(u"<0><1>", in1 + in2).replace(u"<2>", in3).replace(u"lll", u"l").replace(u"rrr", u"r")
                                foundins = [in1, in2, in3]
                                break
                        if center != u"":
                            break
                    if center != u"":
                        break
            else:
                if splitword[wor] in wordin[wor]:
                    center = splitword[wor]
                if center == u"":
                    # irregular final-vowel changes (mostly pronouns, e.g. nga -> nge- before certain endings)
                    if splitword[wor].endswith(u"nga"):
                        temp = splitword[wor][:-3] + u"nge"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"fo"):
                        temp = splitword[wor][:-2] + u"fe"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"po"):
                        temp = splitword[wor][:-2] + u"pe"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"tsa"):
                        temp = splitword[wor][:-3] + u"tse"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"fko"):
                        temp = splitword[wor][:-3] + u"fke"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"sa'u"):
                        temp = splitword[wor][:-4] + u"se"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"sa"):
                        temp = splitword[wor][:-2] + u"se"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"sno"):
                        temp = splitword[wor][:-3] + u"sne"
                        if temp in wordin[wor]:
                            center = temp
                    if splitword[wor].endswith(u"ayla"):
                        temp = splitword[wor][:-4] + u"ayle"
                        if temp in wordin[wor]:
                            center = temp
            if center == u"":
                foundit = False
                break
            temp = wordin[wor].split(center)
            if len(temp) != 2:
                foundit = False
                break
            pref, posf = temp
            last = u""
            while last != pref:
                last = pref
                for pre in prefixes:
                    if pref != u"":
                        if pref.endswith(pre["navi"]):
                            if (pre["navi"], pre["orig_navi"]) in foundprefs[wor]:
                                break
                            foundprefs[wor].append((pre["navi"], pre["orig_navi"])) # only needed here, to handle lenition
                            pref = pref[:-len(pre["navi"])]
                            break
            if pref != u"":
                foundit = False
                foundprefs = []
                break
            last = u""
            while last != posf:
                last = posf
                for pos, posid in sorted([(x["navi"], None) for x in postfixes] + [(x["navi"], x["id"]) for x in wordlist if x["type"] == "adp."] + list(EXTRAADP), key=lambda x: len(x[0]), reverse=True):
                    if posf != u"":
                        if posf.startswith(pos):
                            if (pos, posid) in foundposts[wor]:
                                break
                            if pos != u"ä" or word["orig_navi"] != u"pey": # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                                foundposts[wor].append((pos, posid))
                                posf = posf[len(pos):]
                                break
                            else:
                                break
            if posf != u"":
                foundit = False
                foundposts = []
                break
        if foundit:
            foundword = word
            break
    if foundit:
        ret["pref"] = foundprefs
        ret["post"] = foundposts
        ret["inf"] = foundins
        ret["len"] = word["lenited"]
        ret["word"] = foundword
    return ret
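
# Rough shape of the result (actual values depend on the live word list): for
# a plain verb like u"taron" with infix template u"t<0><1>ar<2>on",
# parseword([u"taron"]) returns its dictionary row under "word" together with
# "pref" [[]], "post" [[]], "inf" ["", "", ""] and "len" False, while an
# unrecognised word comes back with id 0 and its navi wrapped in brackets.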
 
def parsesent(sent):
    """Split a sentence into words and parse each one, trying the number
    parser first (unless the word is in BANNEDNUMBERS) and falling back to
    parseword()."""
    sent = sent.strip().lower().replace(u"’", u"'")
    sent = sent.replace("ù", "u") # Basic support for reef Na'vi
    sent = re.sub(r"[^\wìä' ]", u"", sent)
    sent = re.sub(r"\ +", u" ", sent)
    sent = sent.split(u" ")
    ret = []
    left = len(sent)
    while left:
        word = None
        if sent[len(sent) - left] not in BANNEDNUMBERS:
            word = parsenum.parse(sent[len(sent) - left])
        if word is None:
            word = parseword(sent[-left:])
        left -= len(word["word"]["navi"].split(" "))
        ret.append(word)
    return ret
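

# Minimal usage sketch (assumes the Eana Eltu database behind dbconnector is
# reachable, since the word and affix lists above are loaded at import time);
# the sample sentence is illustrative only.
if __name__ == "__main__":
    for parsed in parsesent(u"oel ngati kameie"):
        print(parsed["word"]["navi"], parsed["inf"], parsed["pref"], parsed["post"])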