WebSVN - navi - Blame - Rev 210 - /tsimapiak/parse.py

Rev	Author	Line No.	Line
56	szabot	1	#!/usr/bin/python
		2	# -- coding: utf-8 --
176	muzer	3	# This file is part of Tsim Apiak.
		4	#
		5	# Tsim Apiak is free software: you can redistribute it and/or modify
		6	# it under the terms of the GNU General Public Licence as published by
		7	# the Free Software Foundation, either version 3 of the Licence, or
		8	# (at your option) any later version.
		9	#
		10	# In addition to this, you must also comply with clause 4 of the
		11	# Apache Licence, version 2.0, concerning attribution. Where there
		12	# is a contradiction between the two licences, the GPL
		13	# takes preference.
		14	#
186	szabot	15	# Tsim Apiak is distributed in the hope that it will be useful,
176	muzer	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
		17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		18	# GNU General Public License for more details.
		19	#
		20	# You should have received a copy of the GNU General Public License
		21	# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.
56	szabot	22
		23	import re
66	szabot	24	import dbconnector
103	szabot	25	import parsenum
56	szabot	26
# Dictionary used by parseword(): the list returned by
# dbconnector.getnavilist() with one hard-coded noun entry placed in front of
# it and another one placed behind it.
wordlist = (
    [{"id": 0, "navi": u"tawtute", "infix": u"tawtute", "type": u"n."}]
    + dbconnector.getnavilist()
    + [{"id": 0, "navi": u"na'vi", "infix": u"na'vi", "type": u"n."}]
)
65	szabot	28
187	muzer	29
# Hand-maintained overrides for words that are either missing from Eana Eltu
# or that the parser currently interprets wrongly. Entries of the second kind
# should be removed once the parser gets more sophisticated; entries of the
# first kind also need a counterpart in the equivalent array in the
# translator. If a word can take infixes, consider adding it to the main
# wordlist above instead (see the examples there).
#
# Field order: original form, Na'vi root, 0-pos infix, 1-pos infix,
# 2-pos infix, prefixes, suffixes, and a final boolean that parseword()
# hands back under the "len" key.
brokenwords = (
    (u"sami", u"si", u"", u"am", u"", (), (), False),
    (u"to", u"to", u"", u"", u"", (), (), False),
    (u"poltxe", u"plltxe", u"", u"ol", u"", (), (), False),
)

# Infix candidates for the <1>, <2> and <3> slots of a dictionary entry.
# parseword() tries them in tuple order; every tuple ends with the empty
# string.
infixes1 = (u"awn", u"eyk", u"us", u"äp", u"")
infixes2 = (
    u"ìyev", u"iyev", u"ìmìy",
    u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy",
    u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm",
    u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol",
    u"",
)
infixes3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")

# Affixes that parseword() peels off around the matched stem. Order matters:
# the first entry (in tuple order) that fits is the one that gets removed.
prefixes = (
    u"tsay",
    u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa",
    u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì",
    u"ay", u"me", u"fì", u"ke", u"he",
    u"a",
)
adpositions = (
    u"mungwrr",
    u"kxamlä", u"pximaw", u"pxisre", u"tafkip",
    u"nemfa", u"takip", u"mìkam",
    u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa",
    u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay",
    u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta",
    u"uo", u"wä", u"äo", u"to", u"sì",
)
# Every adposition can also be attached as a suffix, so the suffix table
# starts with them.
postfixes = adpositions + (
    u"tsyìp",
    u"eyä", u"ìri",
    u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya",
    u"tu", u"vi", u"yu", u"an", u"ng", u"ke",
    u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r",
)

# Earlier regex-based prefix matching, kept for reference only (not used).
#prefixesn = ur"(?P<npr>(?:(?:fì|tsa)?(?:me|pxe|ay|fra)?|(?:fay)?|(?:tsay)?)(?:fne)?(?:tì|sä)?"
#prefixesv = ur"(?P<vpr>(?:nì|sä|tì|rä'ä |ke )?)"

# Lenition table as (unlenited onset, lenited onset) pairs; parseword()
# swaps a leading first element for the second one.
lenit = (
    (u"px", u"p"),
    (u"tx", u"t"),
    (u"kx", u"k"),
    (u"ts", u"s"),
    (u"t", u"s"),
    (u"p", u"f"),
    (u"k", u"h"),
    (u"'", u""),
)
		41
def _find_infixed_form(pattern, token):
    """Return the first infixed spelling of *pattern* that occurs in *token*.

    *pattern* is one space-separated piece of a dictionary entry's "infix"
    field and contains the placeholders "<1><2>" and "<3>". Only infixes
    that occur somewhere in *token* are tried, in table order.

    Returns a ``(form, [in1, in2, in3])`` pair, or ``(u"", None)`` when no
    combination of infixes produces a substring of *token*.
    """
    in1_candidates = [inf for inf in infixes1 if inf in token]
    in2_candidates = [inf for inf in infixes2 if inf in token]
    in3_candidates = [inf for inf in infixes3 if inf in token]
    for in1 in in1_candidates:
        for in2 in in2_candidates:
            for in3 in in3_candidates:
                form = pattern.replace(u"<1><2>", in1 + in2).replace(u"<3>", in3)
                if form in token:
                    return form, [in1, in2, in3]
    return u"", None


def _find_plain_form(piece, token):
    """Locate an infix-less dictionary *piece* inside *token*.

    Tries, in this order: the piece as written, the piece with its onset
    lenited according to ``lenit``, and the shortened stems "...ng" (for
    pieces ending in "nga") and "...p" (for pieces ending in "po").

    Returns ``(center, lenited)`` where *center* is the substring of *token*
    that was recognised (``u""`` if none) and *lenited* tells whether a
    lenited spelling was accepted.
    """
    center = u""
    lenited = False
    if piece in token:
        center = piece
    if center == u"" and (token == u"paya" or piece != u"pxay"):  # XXX - maybe fixable without hardcoding?
        # No early exit on purpose: a later pair of the table may replace
        # the stem found by an earlier one.
        for unlenited_onset, lenited_onset in lenit:
            if piece.startswith(unlenited_onset):
                candidate = lenited_onset + piece[len(unlenited_onset):]
                if candidate in token:
                    lenited = True
                    center = candidate
    if center == u"":
        if piece.endswith(u"nga"):
            candidate = piece[:-3] + u"ng"
            if candidate in token:
                center = candidate
        if piece.endswith(u"po"):
            candidate = piece[:-2] + u"p"
            if candidate in token:
                center = candidate
    return center, lenited


def _strip_prefixes(pref, found):
    """Peel known prefixes off the right-hand end of *pref*.

    Each removed prefix is appended to *found* (so the list ends up ordered
    from the stem outwards). Stripping stops when nothing fits or when the
    prefix that fits has already been recorded. Returns what is left of
    *pref*; an empty result means the whole string was explained.
    """
    while pref != u"":
        pre = next((p for p in prefixes if pref.endswith(p)), None)
        if pre is None or pre in found:
            break
        found.append(pre)
        pref = pref[:-len(pre)]
    return pref


def _strip_postfixes(posf, found, navi):
    """Peel known suffixes off the left-hand end of *posf*.

    Works like ``_strip_prefixes`` but from the stem outwards to the right.
    The suffix "ä" is never taken off the entry whose "navi" field is "pey"
    (presumably so that "peyä" is not analysed as pey + ä -- confirm).
    Returns what is left of *posf*.
    """
    while posf != u"":
        pos = next((p for p in postfixes if posf.startswith(p)), None)
        if pos is None or pos in found:
            break
        if pos == u"ä" and navi == u"pey":
            break
        found.append(pos)
        posf = posf[len(pos):]
    return posf


def _match_entry(word, wordin):
    """Try to analyse the leading tokens of *wordin* as dictionary entry *word*.

    Returns ``(foundit, foundprefs, foundposts, foundins, lenited)``. The
    affix lists hold one sub-list per token examined so far, including the
    token on which a failed attempt gave up.
    """
    foundprefs = []
    foundposts = []
    foundins = [u"", u"", u""]
    lenited = False
    splitword = word["infix"].split(u" ")
    if len(wordin) < len(splitword):
        # Not enough tokens left for a multi-word entry.
        return False, foundprefs, foundposts, foundins, lenited

    for wor, piece in enumerate(splitword):
        token = wordin[wor]
        prefs_here = []
        posts_here = []
        foundprefs.append(prefs_here)
        foundposts.append(posts_here)

        if u"<1>" in piece:
            center, infixes_used = _find_infixed_form(piece, token)
            if infixes_used is not None:
                foundins = infixes_used
        else:
            center, was_lenited = _find_plain_form(piece, token)
            if was_lenited:
                lenited = True
        if center == u"":
            return False, foundprefs, foundposts, foundins, lenited

        # The stem has to occur exactly once, otherwise the split into
        # "before" and "after" is ambiguous.
        around = token.split(center)
        if len(around) != 2:
            return False, foundprefs, foundposts, foundins, lenited
        pref, posf = around

        if _strip_prefixes(pref, prefs_here) != u"":
            return False, foundprefs, foundposts, foundins, lenited
        if _strip_postfixes(posf, posts_here, word["navi"]) != u"":
            return False, foundprefs, foundposts, foundins, lenited

    return True, foundprefs, foundposts, foundins, lenited


def parseword(wordin):
    """Analyse the word that starts at ``wordin[0]``.

    *wordin* is a list of tokens (the not yet parsed rest of a sentence);
    dictionary entries that consist of several words may use more than the
    first token.

    Returns a dict with the keys

    * ``"word"`` -- the matching dictionary entry, or a placeholder entry
      with id 0 whose "navi" field is the first token in square brackets
      when nothing matched;
    * ``"pref"`` / ``"post"`` -- prefixes / suffixes found, one list per
      token of the entry;
    * ``"inf"`` -- the infixes found for positions 0, 1 and 2;
    * ``"len"`` -- whether a lenited spelling was recognised.

    Tokens listed in ``brokenwords`` bypass the analysis and return the
    hard-coded values from that table.

    NOTE: when nothing matches, "pref", "post", "inf" and "len" still carry
    whatever the attempt on the last dictionary entry left behind. This is
    the historical behaviour and is kept for callers that rely on the shape.
    """
    for brokenword in brokenwords:
        if wordin[0] == brokenword[0]:
            tempid = 0
            temptype = u""
            # Look up id and type of the root; the last matching entry wins.
            for word in wordlist:
                if brokenword[1] == word["navi"]:
                    tempid = word["id"]
                    temptype = word["type"]
            return {
                "word": {"id": tempid, "navi": brokenword[1], "infix": u"", "type": temptype},
                "pref": brokenword[5],
                "post": brokenword[6],
                "len": brokenword[7],
                "inf": (brokenword[2], brokenword[3], brokenword[4]),
            }

    ret = {"word": {"id": 0, "navi": u"[" + wordin[0] + u"]", "infix": u"", "type": u""}}
    # Defaults only matter if wordlist is empty.
    foundprefs, foundposts, foundins, lenited = [], [], [u"", u"", u""], False
    for word in wordlist:
        foundit, foundprefs, foundposts, foundins, lenited = _match_entry(word, wordin)
        if foundit:
            ret["word"] = word
            break
    ret["pref"] = foundprefs
    ret["post"] = foundposts
    ret["inf"] = foundins
    ret["len"] = lenited
    return ret
		162
		163	def parsesent(sent):
101	szabot	164	sent = sent.strip().lower().replace(u"’", u"'")
100	szabot	165	sent = re.sub(ur"[^\wìä' ]",u"",sent)
89	szabot	166	sent = re.sub(ur"\ +",u" ",sent)
		167	sent = sent.split(u" ")
77	szabot	168	ret = []
		169	left = len(sent)
		170	while left:
108	szabot	171	word = parsenum.parse(sent[len(sent)-left])
103	szabot	172	if word == None:
		173	word = parseword(sent[-left:])
78	szabot	174	left -= len(word["word"]["navi"].split(" "))
77	szabot	175	ret.append(word)
136	muzer	176	return ret

Subversion Repositories navi

(root)/tsimapiak/parse.py - Rev 210