WebSVN - navi - Blame - Rev 221 - /tsimapiak/parse.py

Rev	Author	Line No.	Line
56	szabot	1	#!/usr/bin/python
		2	# -- coding: utf-8 --
176	muzer	3	# This file is part of Tsim Apiak.
		4	#
		5	# Tsim Apiak is free software: you can redistribute it and/or modify
		6	# it under the terms of the GNU General Public Licence as published by
		7	# the Free Software Foundation, either version 3 of the Licence, or
		8	# (at your option) any later version.
		9	#
		10	# In addition to this, you must also comply with clause 4 of the
		11	# Apache Licence, version 2.0, concerning attribution. Where there
		12	# is a contradiction between the two licences, the GPL
		13	# takes preference.
		14	#
186	szabot	15	# Tsim Apiak is distributed in the hope that it will be useful,
176	muzer	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
		17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		18	# GNU General Public License for more details.
		19	#
		20	# You should have received a copy of the GNU General Public License
		21	# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.
56	szabot	22
		23	import re
66	szabot	24	import dbconnector
103	szabot	25	import parsenum
56	szabot	26
# Dictionary entries loaded from the database. parseword() reads the "id",
# "navi", "infix" and "type" keys of each entry.
# Disabled example of wrapping the database list with extra proper nouns:
#wordlist = [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}] + dbconnector.getnavilist() + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}] # XXX HACK - extra proper nouns
wordlist = dbconnector.getnavilist()


# XXX HACK - These are words that are either not in Eana Eltu, or that get
# interpreted wrongly for whatever reason. The latter should be removed from
# this list when the parser gets more sophisticated. The former should also
# have an entry in the equivalent array in the translator! If a word can take
# infixes or other affixes, consider adding it to the main wordlist above
# instead (see the disabled example).
# Field order: original form, Na'vi root, 0-pos infix, 1-pos infix,
# 2-pos infix, prefixes, suffixes, lenited flag.
brokenwords = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    (u"poltxe", u"plltxe", u"", u"ol", u"", (), (), False),
)

# Infix candidates for the <1>, <2> and <3> slots of a verb pattern. Each
# tuple ends with the empty string, i.e. "no infix in this slot".
infixes1 = (u"awn", u"eyk", u"us", u"äp", u"")
infixes2 = (
    u"ìyev", u"iyev", u"ìmìy",
    u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy",
    u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm",
    u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol",
    u"",
)
infixes3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")

# Affix tables. parseword() takes the first entry that matches, so the order
# of the entries is significant; they are listed longest first.
prefixes = (
    u"tsay",
    u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa",
    u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me",
    u"fì", u"ke", u"he",
    u"a",
)
adpositions = (
    u"mungwrr",
    u"kxamlä", u"pximaw", u"pxisre", u"tafkip",
    u"nemfa", u"takip", u"mìkam",
    u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa",
    u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay",
    u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta",
    u"uo", u"wä", u"äo", u"to", u"sì",
)
# Everything that may follow a root: the adpositions, then the other suffixes.
postfixes = adpositions + (
    u"tsyìp",
    u"eyä", u"ìri",
    u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya",
    u"tu", u"vi", u"yu", u"an", u"ng", u"ke",
    u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r",
)
# Older regex-based prefix matching, kept for reference only:
#prefixesn = ur"(?P<npr>(?:(?:fì\|tsa)?(?:me\|pxe\|ay\|fra)?\|(?:fay)?\|(?:tsay)?)(?:fne)?(?:tì\|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì\|sä\|tì\|rä'ä \|ke )?)"

# Lenition pairs: (initial sound in the dictionary form, its lenited form).
lenit = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)
		42
def _matchinfixes(pattern, token):
    """Fill the <1><2> and <3> slots of pattern so that it occurs in token.

    Returns (center, [in1, in2, in3]) for the first combination that fits,
    or (u"", None) when no combination does.
    """
    # Only infixes that occur somewhere in the token can be part of a match.
    # The empty infix is a substring of everything, so it is always a candidate.
    cand1 = [inf for inf in infixes1 if inf in token]
    cand2 = [inf for inf in infixes2 if inf in token]
    cand3 = [inf for inf in infixes3 if inf in token]
    for in1 in cand1:
        for in2 in cand2:
            for in3 in cand3:
                filled = pattern.replace(u"<1><2>", in1 + in2).replace(u"<3>", in3)
                if filled and filled in token:
                    return filled, [in1, in2, in3]
    return u"", None


def _findcenter(pattern, token):
    """Find the form of a dictionary pattern that occurs inside token.

    Returns (center, infixes, lenited). center is u"" when nothing matched,
    infixes is None unless pattern carries infix slots and matched, and
    lenited tells whether a lenited form of pattern was found in token.
    """
    if u"<1>" in pattern:
        center, infixes = _matchinfixes(pattern, token)
        return center, infixes, False

    center = pattern if pattern in token else u""
    lenited = False
    # XXX HACK - workaround to fix pay being lenited pxay. Maybe fixable without hardcoding?
    if center == u"" and (token == u"paya" or pattern != u"pxay"):
        # Deliberately no break: a later pair that also fits overrides an earlier one.
        for hard, soft in lenit:
            if pattern.startswith(hard):
                candidate = soft + pattern[len(hard):]
                if candidate in token:
                    lenited = True
                    center = candidate
    if center == u"":
        # Shortened stems: a final "nga" may appear as "ng", a final "po" as "p".
        if pattern.endswith(u"nga"):
            candidate = pattern[:-3] + u"ng"
            if candidate in token:
                center = candidate
        if pattern.endswith(u"po"):
            candidate = pattern[:-2] + u"p"
            if candidate in token:
                center = candidate
    return center, None, lenited


def _stripprefixes(pref):
    """Peel known prefixes off the end of pref; return (found, remainder)."""
    found = []
    last = u""
    while last != pref:
        last = pref
        for pre in prefixes:
            if pref != u"" and pref.endswith(pre):
                # A prefix is accepted only once; a repeat stops the stripping.
                if pre not in found:
                    found.append(pre)
                    pref = pref[:-len(pre)]
                break
    return found, pref


def _strippostfixes(posf, navi):
    """Peel known suffixes off the start of posf; return (found, remainder).

    navi is the dictionary form of the entry being tried; it is only needed
    for the "pey" special case below.
    """
    found = []
    last = u""
    while last != posf:
        last = posf
        for pos in postfixes:
            if posf != u"" and posf.startswith(pos):
                # XXX HACK - fix for peyä. THIS SHOULD NOT BE HERE!
                if pos not in found and (pos != u"ä" or navi != u"pey"):
                    found.append(pos)
                    posf = posf[len(pos):]
                break
    return found, posf


def parseword(wordin):
    """Identify the dictionary entry at the start of a list of tokens.

    wordin is a non-empty list of word strings; entries whose "infix" field
    contains several space-separated parts are matched against as many
    leading tokens.

    Returns a dict with the keys:
      "word" - the matching wordlist entry, or a placeholder entry with id 0
               and "navi" set to the first token in square brackets.
      "pref" - per matched token, the list of prefixes stripped from it.
      "post" - per matched token, the list of suffixes stripped from it.
      "inf"  - the three infixes found, [slot 1, slot 2, slot 3].
      "len"  - True when a lenited form was matched.
    For entries of brokenwords, "pref", "post" and "inf" are the tuples
    stored in that table rather than lists.

    NOTE(review): when no entry matches, "pref"/"post"/"inf"/"len" hold
    whatever the attempt on the last wordlist entry left behind. This is kept
    as-is because callers may depend on the shape - confirm before changing.
    """
    # XXX HACK - this is all code to work around bugs that shouldn't exist
    for brokenword in brokenwords:
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            # Borrow id and type from the dictionary entry of the root, if any.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {"word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype}, "pref": brokenword[5], "post": brokenword[6], "len": brokenword[7], "inf": (brokenword[2], brokenword[3], brokenword[4]) }

    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}}

    # Defaults so that an empty wordlist gives "not found" instead of a NameError.
    foundit = False
    foundword = None
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    lenited = False

    for word in wordlist:
        foundit = True
        foundprefs = []
        foundposts = []
        foundins = [u"", u"", u""]
        lenited = False
        splitword = word["infix"].split(u" ")
        if len(wordin) < len(splitword):
            # Not enough tokens left for this multi-part entry.
            foundit = False
            continue
        for wor, pattern in enumerate(splitword):
            token = wordin[wor]
            foundprefs.append([])
            foundposts.append([])
            center, infixes, waslenited = _findcenter(pattern, token)
            if infixes is not None:
                foundins = infixes
            if waslenited:
                lenited = True
            if center == u"":
                foundit = False
                break
            # The root has to occur exactly once, so that the token splits
            # into one leading and one trailing remainder.
            pieces = token.split(center)
            if len(pieces) != 2:
                foundit = False
                break
            pref, posf = pieces
            foundprefs[wor], pref = _stripprefixes(pref)
            if pref != u"":
                foundit = False
                break
            foundposts[wor], posf = _strippostfixes(posf, word["navi"])
            if posf != u"":
                foundit = False
                break
        if foundit:
            foundword = word
            break

    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    ret["len"] = lenited
    if foundit:
        ret["word"] = foundword
    return ret
		163
		164	def parsesent(sent):
101	szabot	165	sent = sent.strip().lower().replace(u"’", u"'")
100	szabot	166	sent = re.sub(ur"[^\wìä' ]",u"",sent)
89	szabot	167	sent = re.sub(ur"\ +",u" ",sent)
		168	sent = sent.split(u" ")
77	szabot	169	ret = []
		170	left = len(sent)
		171	while left:
108	szabot	172	word = parsenum.parse(sent[len(sent)-left])
103	szabot	173	if word == None:
		174	word = parseword(sent[-left:])
78	szabot	175	left -= len(word["word"]["navi"].split(" "))
77	szabot	176	ret.append(word)
136	muzer	177	return ret

Subversion Repositories navi

(root)/tsimapiak/parse.py @ 260 - Rev 221