WebSVN - navi - Blame - Rev 294 - /tsimapiak/parse.py

Rev	Author	Line No.	Line
56	szabot	1	#!/usr/bin/python
		2	# -- coding: utf-8 --
176	muzer	3	# This file is part of Tsim Apiak.
		4	#
		5	# Tsim Apiak is free software: you can redistribute it and/or modify
		6	# it under the terms of the GNU General Public Licence as published by
		7	# the Free Software Foundation, either version 3 of the Licence, or
		8	# (at your option) any later version.
		9	#
		10	# In addition to this, you must also comply with clause 4 of the
		11	# Apache Licence, version 2.0, concerning attribution. Where there
		12	# is a contradiction between the two licences, the GPL
		13	# takes preference.
		14	#
186	szabot	15	# Tsim Apiak is distributed in the hope that it will be useful,
176	muzer	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
		17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		18	# GNU General Public License for more details.
		19	#
		20	# You should have received a copy of the GNU General Public License
		21	# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.
56	szabot	22
283	muzer	23	import tsimapiak.dbconnector as dbconnector
		24	import tsimapiak.parsenum as parsenum
246	szabot	25	import re
56	szabot	26
221	muzer	27	#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
wordlist = dbconnector.getnavilist()

prefixes, infixes, postfixes = dbconnector.getaffixlists()

# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! If a word can take
# infixes or other affixes, consider adding it to the main wordlist above
# instead (see the commented-out example there).
#
# Field order of each entry:
#   original form, Na'vi root, 0-pos infix, 1-pos infix, 2-pos infix,
#   prefixes, suffixes, and a final flag that parseword() returns as "len".
_NO_AFFIXES = ()

BROKENWORDS = (
    (u"sami", u"si", u"", u"am", u"", _NO_AFFIXES, _NO_AFFIXES, False),
    (u"to", u"to", u"", u"", u"", _NO_AFFIXES, _NO_AFFIXES, False),
    #(u"frato", u"to", u"", u"", u"", [[u"fra"]], (), False),
    (u"soaiä", u"soaia", u"", u"", u"", _NO_AFFIXES, [[(u"ä", None)]], False),
    (u"mengenga", u"ngenga", u"", u"", u"", [[u"me"]], _NO_AFFIXES, False),
    (u"pxengenga", u"ngenga", u"", u"", u"", [[u"pxe"]], _NO_AFFIXES, False),
    (u"kìmä", u"kä", u"", u"ìm", u"", _NO_AFFIXES, _NO_AFFIXES, False),
    (u"apxay", u"pxay", u"", u"", u"", [[u"a"]], _NO_AFFIXES, False),
    #(u"akawng", u"kawng", u"", u"", u"", [[u"a"]], (), False),
    #(u"kawnga", u"kawng", u"", u"", u"", (), [[(u"a", None)]], False),
    #(u"kawng", u"kawng", u"", u"", u"", (), (), False),
    (u"ka", u"ka", u"", u"", u"", _NO_AFFIXES, _NO_AFFIXES, False),
    (u"uo", u"uo", u"", u"", u"", _NO_AFFIXES, _NO_AFFIXES, False),
    (u"sìk", u"sìk", u"", u"", u"", _NO_AFFIXES, _NO_AFFIXES, False),
    (u"sim", u"sim", u"", u"", u"", _NO_AFFIXES, _NO_AFFIXES, False),  # probably not tsim lenited
    (u"tse", u"tse", u"", u"", u"", _NO_AFFIXES, _NO_AFFIXES, False),
)
		51
		52	#INFIXES1 = (u"awn", u"eyk", u"us", u"äp", u"")
		53	#INFIXES2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
		54	#INFIXES3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
		55	#PREFIXES = (u"tsay", u"fray", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"kel", u"lek", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"px", u"a", u"m", u"k")
		56	#ADPOSITIONS = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
		57	#POSTFIXES = ADPOSITIONS + (u"tsyìp", u"eyä", u"ìri", u"aru", u"ati", u"ayä", u"ari", u"ay", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"al", u"at", u"ar", u"ey", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62	szabot	58	#prefixesn = ur"(?P<npr>(?:(?:fì\|tsa)?(?:me\|pxe\|ay\|fra)?\|(?:fay)?\|(?:tsay)?)(?:fne)?(?:tì\|sä)?"
74	szabot	59	#prefixesv = ur"(?P<vpr>(?:nì\|sä\|tì\|rä'ä \|ke )?)"
56	szabot	60
# Infixes appended to the list loaded from the database (see the sort below
# the lenition loop). Ids are negative strings.
EXTRAINFIXES = [
    dict(id="-1", navi="eiy", gloss="LAUD.", position=2),
    dict(id="-2", navi="eng", gloss="PEJ.", position=2),
]

# Postfixes appended to the list loaded from the database.
EXTRAPOSTFIXES = [
    dict(id="-3", navi="eyä", gloss="GEN."),
]
		69
# Words that act like adpositions but technically aren't, paired with the id
# of their first entry in wordlist. Raises IndexError at import time if one
# of them is missing from wordlist.
EXTRAADP = tuple(
    (adp, [x["id"] for x in wordlist if x["navi"] == adp][0])
    for adp in ("to", "sì")
)
283	muzer	71
# Lenition table: (initial sound, lenited replacement). Digraphs come before
# the single letters they start with, so that code scanning this table in
# order meets "px"/"tx"/"kx"/"ts" before "p"/"t"/"k".
LENIT = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)
91	szabot	73
# Let's lenit the prefixes: for every prefix that starts with a lenitable
# sound, register its lenited spelling as an extra prefix with the same id
# and a ".LENTD" gloss suffix.
_known_prefix_forms = set(x["navi"] for x in prefixes)
extraprefixes = []
for prefix in prefixes:
    for letter, replacement in LENIT:
        if not prefix["navi"].startswith(letter):
            continue
        # Only the first matching LENIT entry is applied to a prefix.
        new_prefix = replacement + prefix["navi"][len(letter):]
        # always assume a dictionary word over a lenited prefix
        if new_prefix not in _known_prefix_forms:
            extraprefixes.append({
                "id": prefix["id"],
                "navi": new_prefix,
                "gloss": prefix["gloss"] + ".LENTD",
            })
        break


def _navi_length(affix):
    """Sort key: length of the affix's Na'vi spelling."""
    return len(affix["navi"])


# Longest affixes first, so the matching loops try them before shorter ones.
prefixes = prefixes + extraprefixes
prefixes.sort(key=_navi_length, reverse=True)
infixes = infixes + EXTRAINFIXES
infixes.sort(key=_navi_length, reverse=True)
postfixes = postfixes + EXTRAPOSTFIXES
postfixes.sort(key=_navi_length, reverse=True)
289	muzer	87
# Stem endings and the altered ending that parseword() also tries when the
# dictionary form of a non-infixing word is not found in the input.
# Entries are tried top to bottom and a later hit replaces an earlier one.
_VOWEL_SHIFTS = (
    (u"nga", u"nge"),
    (u"fo", u"fe"),
    (u"po", u"pe"),
    (u"tsa", u"tse"),
    (u"fko", u"fke"),
    (u"sa'u", u"se"),
    (u"sa", u"se"),
    (u"sno", u"sne"),
    (u"ayla", u"ayle"),
)


def _find_infixed_center(template, token, slot_infixes):
    """Fill the infix markers of *template* so that the result occurs in *token*.

    *slot_infixes* holds three lists (positions 0, 1 and 2) of candidate
    infix strings, each ending with the empty string.

    Returns ``(center, infixes)``: the filled-in template and the
    ``[in0, in1, in2]`` list that produced it, or ``(u"", None)`` when no
    combination occurs in *token*.
    """
    # Only infixes that occur somewhere in the token can possibly be used.
    present = [[inf for inf in slot if inf in token] for slot in slot_infixes]
    center = u""
    found = None
    for in1 in present[0]:
        for in2 in present[1]:
            for in3 in present[2]:
                candidate = (template.replace(u"<0><1>", in1 + in2)
                             .replace(u"<2>", in3)
                             .replace(u"lll", u"l")
                             .replace(u"rrr", u"r"))
                if candidate in token:
                    center = candidate
                    found = [in1, in2, in3]
                    break
            if center != u"":
                return center, found
    return center, found


def _find_plain_center(stem, token):
    """Locate a non-infixing *stem* (or an accepted variant of it) in *token*.

    Tries the stem itself, then its lenited spellings, then the
    ``_VOWEL_SHIFTS`` alternations. Returns ``(center, lenited)`` where
    *center* is the matched spelling (``u""`` if none) and *lenited* tells
    whether a lenited spelling matched.
    """
    center = u""
    lenited = False
    if stem in token:
        center = stem
    # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
    if center == u"" and (token == u"paya" or stem != u"pxay"):
        for hard, soft in LENIT:
            if stem.startswith(hard):
                candidate = soft + stem[len(hard):]
                if candidate in token:
                    lenited = True
                    center = candidate
    if center == u"":
        for ending, shifted in _VOWEL_SHIFTS:
            if stem.endswith(ending):
                # Strip the whole ending before appending the shifted one
                # (the "ayla" case used to strip one character too few).
                candidate = stem[:-len(ending)] + shifted
                if candidate in token:
                    center = candidate
    return center, lenited


def _strip_prefixes(pref, prefix_forms):
    """Peel known prefixes off the right-hand end of *pref*.

    Returns ``(rest, found)``: what is left of *pref* (``u""`` when it was
    consumed completely) and the prefixes removed, innermost first. A prefix
    is never taken twice.
    """
    found = []
    last = u""
    while last != pref:
        last = pref
        if pref == u"":
            break
        for pre in prefix_forms:
            if not pref.endswith(pre):
                continue
            if pre not in found:
                found.append(pre)
                pref = pref[:-len(pre)]
            break
    return pref, found


def _strip_postfixes(posf, candidates, root):
    """Peel known postfixes off the left-hand end of *posf*.

    *candidates* is a sequence of ``(postfix, id)`` pairs, longest first.
    Returns ``(rest, found)`` like ``_strip_prefixes``; *found* holds the
    ``(postfix, id)`` pairs in the order they were removed.
    """
    found = []
    last = u""
    while last != posf:
        last = posf
        if posf == u"":
            break
        for pos, posid in candidates:
            if not posf.startswith(pos):
                continue
            entry = (pos, posid)
            # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
            if entry not in found and (pos != u"ä" or root != u"pey"):
                found.append(entry)
                posf = posf[len(pos):]
            break
    return posf, found


def parseword(wordin):
    """Parse the word that starts at the front of *wordin*.

    *wordin* is a list of lower-cased tokens; dictionary entries whose
    "infix" field contains spaces are matched against several consecutive
    tokens.

    Returns a dict with the keys:
      "word" - the matching wordlist entry, or a placeholder with id 0 and
               navi "[token]" when nothing matched;
      "pref" - per matched token, the list of prefixes found;
      "post" - per matched token, the list of (postfix, id) pairs found;
      "inf"  - the three infixes (a list, or a tuple for BROKENWORDS hits);
      "len"  - whether a lenited spelling was matched.
    When nothing matches, the affix fields hold whatever the attempt on the
    last wordlist entry left behind.

    Side effect: lower-cases the "navi" field of the wordlist entries it
    visits.
    """
    # XXX HACK - this is all code to work around bugs that shouldn't exist
    for brokenword in BROKENWORDS:
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {
                "word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype},
                "pref": brokenword[5],
                "post": brokenword[6],
                "len": brokenword[7],
                "inf": (brokenword[2], brokenword[3], brokenword[4]),
            }

    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}}

    # Lookup tables that do not depend on the entry being tried; build them
    # once per call instead of once per inner-loop iteration.
    slot_infixes = [
        [x["navi"] for x in infixes if x["position"] == position] + [u""]
        for position in (0, 1, 2)
    ]
    prefix_forms = [x["navi"] for x in prefixes]
    # Adposition spellings are lower-cased here because the loop below
    # lower-cases wordlist entries as it goes.
    postfix_candidates = sorted(
        [(x["navi"], None) for x in postfixes]
        + [(x["navi"].lower(), x["id"]) for x in wordlist if x["type"] == "adp."]
        + list(EXTRAADP),
        key=lambda x: len(x[0]),
        reverse=True,
    )

    # Defaults so that an empty wordlist yields an "unknown word" result
    # instead of a NameError.
    foundit = False
    foundword = None
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    lenited = False

    for word in wordlist:
        word["navi"] = word["navi"].lower()
        foundit = True
        foundprefs = []
        foundposts = []
        lenited = False
        foundins = [u"", u"", u""]
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough tokens left for this multi-word entry.
            foundit = False
            continue
        for wor, stem in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])
            if u"<0>" in stem:
                center, ins = _find_infixed_center(stem, token, slot_infixes)
                if ins is not None:
                    foundins = ins
            else:
                center, part_lenited = _find_plain_center(stem, token)
                if part_lenited:
                    lenited = True
            if center == u"":
                foundit = False
                break
            # The stem must occur exactly once; what surrounds it has to be
            # explained entirely by prefixes and postfixes.
            pieces = token.split(center)
            if len(pieces) != 2:
                foundit = False
                break
            pref, posf = pieces
            pref, foundprefs[wor] = _strip_prefixes(pref, prefix_forms)
            if pref != u"":
                foundit = False
                foundprefs = []
                break
            posf, foundposts[wor] = _strip_postfixes(posf, postfix_candidates, word["navi"])
            if posf != u"":
                foundit = False
                foundposts = []
                break
        if foundit:
            foundword = word
            break

    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    ret["len"] = lenited
    if foundit:
        ret["word"] = foundword
    return ret
		239
		240	def parsesent(sent):
101	szabot	241	sent = sent.strip().lower().replace(u"’", u"'")
283	muzer	242	sent = re.sub(r"[^\wìä' ]", u"", sent)
		243	sent = re.sub(r"\ +", u" ", sent)
89	szabot	244	sent = sent.split(u" ")
77	szabot	245	ret = []
		246	left = len(sent)
		247	while left:
246	szabot	248	word = parsenum.parse(sent[len(sent) - left])
103	szabot	249	if word == None:
		250	word = parseword(sent[-left:])
78	szabot	251	left -= len(word["word"]["navi"].split(" "))
77	szabot	252	ret.append(word)
136	muzer	253	return ret

Subversion Repositories navi

(root)/tsimapiak/parse.py - Rev 294