WebSVN - navi - Blame - Rev 190 - /tsimapiak/parse.py

Rev	Author	Line No.	Line
56	szabot	1	#!/usr/bin/python
		2	# -- coding: utf-8 --
176	muzer	3	# This file is part of Tsim Apiak.
		4	#
		5	# Tsim Apiak is free software: you can redistribute it and/or modify
		6	# it under the terms of the GNU General Public Licence as published by
		7	# the Free Software Foundation, either version 3 of the Licence, or
		8	# (at your option) any later version.
		9	#
		10	# In addition to this, you must also comply with clause 4 of the
		11	# Apache Licence, version 2.0, concerning attribution. Where there
		12	# is a contradiction between the two licences, the GPL
		13	# takes preference.
		14	#
186	szabot	15	# Tsim Apiak is distributed in the hope that it will be useful,
176	muzer	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
		17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		18	# GNU General Public License for more details.
		19	#
		20	# You should have received a copy of the GNU General Public License
		21	# along with Tsim Apiak. If not, see <http://www.gnu.org/licenses/>.
56	szabot	22
		23	import re
66	szabot	24	import dbconnector
103	szabot	25	import parsenum
56	szabot	26
# Dictionary entries, fetched once when this module is imported.
# parseword() iterates over this and reads the "navi", "id", "partOfSpeech"
# and "infix" keys of every entry.
# NOTE(review): presumably a list of dict-like database rows - confirm
# against dbconnector.getnavilist().
wordlist = dbconnector.getnavilist()
		28
187	muzer	29
		30	brokenwords = ((u"sami", u"si", u"", u"am", u"", None, None, False), (u"to", u"to", u"", u"", u"", False), (u"poltxe", u"plltxe", u"", u"ol", u"", None, None, False)) # These are words that are either not in Eana Eltu, or that get interpreted wrongly for whatever reason. The latter should be removed from this list when the parser gets more sophisticated. The former should also have an entry in the equivalent array in the translator! The order is - original, Na'vi root, 0-pos infix, 1-pos infix, 2-pos infix, prefixes, suffixes
94	szabot	31	infixes1 = (u"awn", u"eyk", u"us", u"äp", u"")
		32	infixes2 = (u"ìyev", u"iyev", u"ìmìy", u"arm", u"asy", u"ilv", u"ìmv", u"imv", u"ìrm", u"irv", u"ìsy", u"aly", u"ary", u"ìly", u"ìry", u"ìlm", u"alm", u"am", u"ay", u"er", u"ìm", u"iv", u"ìy", u"ol", u"")
		33	infixes3 = (u"äng", u"ats", u"eiy", u"ei", u"uy", u"")
156	muzer	34	prefixes = (u"tsay", u"say", u"fay", u"fra", u"pxe", u"fne", u"tsa", u"sa", u"pe", u"fe", u"le", u"nì", u"sä", u"tì", u"sì", u"ay", u"me", u"fì", u"ke", u"he", u"a")
171	muzer	35	adpositions = (u"mungwrr", u"kxamlä", u"pximaw", u"pxisre", u"tafkip", u"nemfa", u"takip", u"mìkam", u"teri", u"fkip", u"luke", u"pxel", u"pxaw", u"rofa", u"ìlä", u"fpi", u"ftu", u"kip", u"lok", u"maw", u"sre", u"sìn", u"vay", u"eo", u"fa", u"hu", u"io", u"ka", u"mì", u"na", u"ne", u"ro", u"ta", u"uo", u"wä", u"äo", u"to", u"sì")
121	szabot	36	postfixes = adpositions + (u"tsyìp", u"eyä", u"ìri", u"ìl", u"it", u"lo", u"ri", u"ru", u"ti", u"ur", u"ve", u"yä", u"ya", u"tu", u"vi", u"yu", u"an", u"ng", u"ke", u"e", u"o", u"l", u"t", u"y", u"a", u"ä", u"r")
62	szabot	37	#prefixesn = ur"(?P<npr>(?:(?:fì\|tsa)?(?:me\|pxe\|ay\|fra)?\|(?:fay)?\|(?:tsay)?)(?:fne)?(?:tì\|sä)?"
74	szabot	38	#prefixesv = ur"(?P<vpr>(?:nì\|sä\|tì\|rä'ä \|ke )?)"
56	szabot	39
91	szabot	40	lenit = ((u"px", u"p"), (u"tx", u"t"), (u"kx", u"k"), (u"ts", u"s"), (u"t", u"s"), (u"p", u"f"), (u"k", u"h"), (u"'", u""))
		41
def _strip_affixes(text, affixes, found, from_end):
    """Peel known affixes off one side of *text*, recording them in *found*.

    On each pass the first entry of *affixes* that matches the relevant end
    of *text* is removed and appended to *found*. Stripping stops when the
    text is used up, when nothing matches, or when the matching affix was
    already recorded (each affix is accepted at most once per token).

    text     -- the material left of (from_end=True) or right of
                (from_end=False) the word's centre
    affixes  -- candidate affixes, tried in order
    found    -- list that receives the stripped affixes (mutated in place)
    from_end -- True to strip from the end of text, False from the start

    Returns whatever could not be stripped; u"" means full success.
    """
    previous = u""
    while text != u"" and previous != text:
        previous = text
        for affix in affixes:
            if from_end:
                hit = text.endswith(affix)
            else:
                hit = text.startswith(affix)
            if not hit:
                continue
            if affix not in found:
                found.append(affix)
                if from_end:
                    text = text[:-len(affix)]
                else:
                    text = text[len(affix):]
            # Only the first matching affix is considered on each pass.
            break
    return text


def _find_center(pattern, token):
    """Locate the (possibly inflected) dictionary form *pattern* in *token*.

    Tries, in order: the pattern with infixes filled in (or the plain
    pattern when it has no <1> slot), the lenited pattern, and finally the
    shortened forms of patterns ending in "nga" / "po".

    Returns (center, infixes, lenited): center is the substring of token
    that was recognised (u"" if none), infixes is [in1, in2, in3] when the
    infix search matched and None otherwise, and lenited tells whether a
    lenited form was recognised.
    """
    center = u""
    infixes = None
    lenited = False

    if u"<1>" in pattern:
        # Only infixes that occur somewhere in the token can be part of it.
        cand1 = [infix for infix in infixes1 if infix in token]
        cand2 = [infix for infix in infixes2 if infix in token]
        cand3 = [infix for infix in infixes3 if infix in token]
        for in1 in cand1:
            for in2 in cand2:
                for in3 in cand3:
                    candidate = pattern.replace(u"<1><2>", in1 + in2).replace(u"<3>", in3)
                    if candidate in token:
                        center = candidate
                        infixes = [in1, in2, in3]
                        break
                if center != u"":
                    break
            if center != u"":
                break
    elif pattern in token:
        center = pattern

    if center == u"":
        # Retry with the initial consonant lenited. There is deliberately no
        # early exit: when several rules apply, the last one listed wins.
        for hard, soft in lenit:
            if pattern.startswith(hard):
                softened = soft + pattern[len(hard):]
                if softened in token:
                    lenited = True
                    center = softened

    if center == u"":
        # Forms that drop their final vowel: "...nga" -> "...ng" and
        # "...po" -> "...p".
        if pattern.endswith(u"nga"):
            shortened = pattern[:-3] + u"ng"
            if shortened in token:
                center = shortened
        if pattern.endswith(u"po"):
            shortened = pattern[:-2] + u"p"
            if shortened in token:
                center = shortened

    return center, infixes, lenited


def _match_entry(word, wordin):
    """Try to read the leading tokens of *wordin* as dictionary entry *word*.

    word["infix"] may consist of several space-separated parts; part k is
    matched against wordin[k].

    Returns None when the entry does not match, otherwise a tuple
    (prefixes_per_token, postfixes_per_token, infixes, lenited).
    """
    patterns = word["infix"].split(u" ")
    if len(wordin) < len(patterns):
        return None

    all_prefs = []
    all_posts = []
    infixes = [u"", u"", u""]
    lenited = False
    for pattern, token in zip(patterns, wordin):
        center, found_infixes, was_lenited = _find_center(pattern, token)
        if found_infixes is not None:
            infixes = found_infixes
        lenited = lenited or was_lenited
        if center == u"":
            return None

        # The centre must occur exactly once, leaving one (possibly empty)
        # chunk on either side.
        pieces = token.split(center)
        if len(pieces) != 2:
            return None
        before, after = pieces

        token_prefs = []
        token_posts = []
        all_prefs.append(token_prefs)
        all_posts.append(token_posts)
        # Everything around the centre has to be explained by known affixes.
        if _strip_affixes(before, prefixes, token_prefs, True) != u"":
            return None
        if _strip_affixes(after, postfixes, token_posts, False) != u"":
            return None
    return all_prefs, all_posts, infixes, lenited


def parseword(wordin):
    """Parse the word that starts at the front of the token list *wordin*.

    wordin is the list of remaining sentence tokens. A multi-word dictionary
    entry may use more than the first token; len(result["word"]["navi"]
    .split(" ")) tells the caller how many tokens the match covers.

    Returns a dict with the keys:
      "word" -- the matching dictionary entry, or a placeholder with id 0
                and navi u"[<first token>]" when nothing matches
      "pref" -- per token, the list of prefixes that were stripped
      "post" -- per token, the list of suffixes that were stripped
      "inf"  -- the three infixes (positions 0, 1 and 2)
      "len"  -- whether a lenited form was recognised
    For words listed in brokenwords these values are taken from the table.

    Raises IndexError if wordin is empty.
    """
    first = wordin[0]
    for broken in brokenwords:
        if len(broken) < 8:
            # Malformed override row: leave the word to the regular matcher
            # rather than fail on a missing field.
            continue
        # wordin is normally a token list, so compare its first token; the
        # second test keeps a bare-string argument working as well.
        if first == broken[0] or wordin == broken[0]:
            rootid = 0
            roottype = u""
            # Look up the root that is being returned, not the surface form.
            for word in wordlist:
                if word["navi"] == broken[1]:
                    rootid = word["id"]
                    roottype = word["partOfSpeech"]
            return {
                "word": {"id": rootid, "navi": broken[1], "infix": u"", "type": roottype},
                "pref": broken[5],
                "post": broken[6],
                "len": broken[7],
                "inf": (broken[2], broken[3], broken[4]),
            }

    # Result for an unknown word: bracketed placeholder and empty affix data
    # with the same shape as a single-token match.
    ret = {
        "word": {"id": 0, "navi": u"[" + first + u"]", "infix": u"", "type": u""},
        "pref": [[]],
        "post": [[]],
        "inf": [u"", u"", u""],
        "len": False,
    }
    for word in wordlist:
        match = _match_entry(word, wordin)
        if match is not None:
            ret["word"] = word
            ret["pref"], ret["post"], ret["inf"], ret["len"] = match
            break
    return ret
		161
		162	def parsesent(sent):
101	szabot	163	sent = sent.strip().lower().replace(u"’", u"'")
100	szabot	164	sent = re.sub(ur"[^\wìä' ]",u"",sent)
89	szabot	165	sent = re.sub(ur"\ +",u" ",sent)
		166	sent = sent.split(u" ")
77	szabot	167	ret = []
		168	left = len(sent)
		169	while left:
108	szabot	170	word = parsenum.parse(sent[len(sent)-left])
103	szabot	171	if word == None:
		172	word = parseword(sent[-left:])
78	szabot	173	left -= len(word["word"]["navi"].split(" "))
77	szabot	174	ret.append(word)
136	muzer	175	return ret

Subversion Repositories navi

(root)/tsimapiak/parse.py @ 301 - Rev 190