Spaces:

NILC-ICMC-USP
/

Portparser.v2

Running

App Files Files Community

Portparser.v2 / src /postproc /postprocess.py

NILC-ICMC-USP

Update src/postproc/postprocess.py

ac78309 verified 3 months ago

raw

history blame contribute delete

24.3 kB

	#################################################
	### Post Processing Program to Portparser.v3
	#################################################
	#
	# (c) Lucelene Lopes 2025
	#
	##################
	# main function: posprocFix()
	# It performs the correction of some lemmas and morphological
	# features from the input file and saves it in the output file.
	# The options are:
	# -h or -help to print out the options
	# -o or -output to inform (next) the output file
	# -l or -lemma to perform only the lemma corrections
	# -f or -feats to perform only the features corrections
	# -q or -quiet to not generate the changes report (.rep.tsv)
	##################
	import sys, os
	import lexikon
	from conlluFile import conlluFile
	# Module-level lexicon instance shared by the whole script; posprocFix()
	# queries it through lex.pget(form, upos).
	lex = lexikon.UDlexPT()

	#################################################
	### Function CMD line arguments capture
	#################################################
	def parseOptions(arguments):
	    """Parse the command line of the post-processing program.

	    Parameters:
	        arguments: the full argument vector (as in sys.argv); position 0 is
	            the program name and is skipped.

	    Returns:
	        [output_file, input_file, doLemma, doFeats, quiet], or None when
	        help was requested or the command line cannot be used (unknown
	        option, "-o" without a value, input file not found).

	    Notes:
	        - doLemma and doFeats both become True when neither -l nor -f is
	          given.
	        - The first argument that does not start with "-" is taken as the
	          input file and ends the parsing.
	        - The returned names are the parsed ones; they are no longer
	          overwritten with fixed argv positions, so option order is free and
	          a short command line does not raise IndexError.
	    """
	    output_file, input_file, doLemma, doFeats, quiet = "", "", False, False, False
	    i = 1
	    while i < len(arguments):
	        arg = arguments[i]
	        if arg[:1] == "-":
	            # help - shows the usage text, nothing is executed
	            if arg in ("-h", "-help"):
	                print("Opções:\n-h ajuda\n-o arquivo de saída",
	                      "\n-l executa apenas correção de lemma",
	                      "\n-f executa apenas correção de features",
	                      "\n-q não salva relatório (correção Quieta)",
	                      "\nExemplo de utilização:",
	                      "\n python3 postproc.py -o yyy.conllu xxx.conllu",
	                      "\nBusca as sentenças no arquivo 'xxx.conllu',",
	                      " corrige lemmas e features e salva as",
	                      " sentenças no arquivo 'yyy.conllu''",
	                      sep="")
	                return None
	            # run the lemma correction
	            elif arg in ("-l", "-lemma"):
	                doLemma = True
	                i += 1
	            # run the features correction
	            elif arg in ("-f", "-feats"):
	                doFeats = True
	                i += 1
	            # quiet mode (no report)
	            elif arg in ("-q", "-quiet"):
	                quiet = True
	                i += 1
	            # output file - its name is the next argument
	            elif arg in ("-o", "-output"):
	                if i + 1 >= len(arguments):
	                    print("A opção {} requer o nome do arquivo de saída".format(arg))
	                    return None
	                output_file = arguments[i + 1]
	                i += 2
	            # unknown option - stop instead of looping forever on it
	            else:
	                print("Opção {} desconhecida, use -h para ajuda".format(arg))
	                return None
	        # input file - last parameter (no -i before it)
	        else:
	            if os.path.isfile(arg):
	                input_file = arg
	                break
	            print("O arquivo {} não foi encontrado, por favor execute novamente".format(arg))
	            return None
	    if not doLemma and not doFeats:
	        doLemma, doFeats = True, True
	    return [output_file, input_file, doLemma, doFeats, quiet]

	#################################################
	### Function - read usual abbreviations
	#################################################
	def getUsualAbbr(path="./src/postproc/usAbbr.tsv"):
	    """Read the table of usual abbreviations.

	    Parameters:
	        path: tab-separated file to read; defaults to the table shipped with
	            the post-processor.

	    Returns:
	        A list with one [column 0, column 2, column 3, column 4] entry for
	        every row whose second column is "abbr". Lines starting with "#" and
	        rows with fewer than five columns are skipped.
	    """
	    abbr = []
	    # the context manager closes the file even if reading fails
	    with open(path, "r") as infile:
	        for line in infile:
	            if line.startswith("#"):
	                continue
	            # strip only the line terminator, so a last line without a
	            # newline keeps its final character
	            buf = line.rstrip("\r\n").split("\t")
	            if len(buf) >= 5 and buf[1] == "abbr":
	                abbr.append([buf[0], buf[2], buf[3], buf[4]])
	    return abbr

	#################################################
	### Function - Check if a word is in the abbreviation list
	#################################################
	def isAbbr(listAbbr, form):
	    """Tell whether `form` is the first field of any entry in `listAbbr`."""
	    return any(form == entry[0] for entry in listAbbr)

	#################################################
	### Function - get the info of a word found in an abbreviation list
	#################################################
	def isWithin(listAbbr, form):
	    """Return fields 1, 2 and 3 of the first entry whose field 0 is `form`.

	    When no entry matches, the result is (None, None, None).
	    """
	    found = next((entry for entry in listAbbr if form == entry[0]), None)
	    if found is None:
	        return None, None, None
	    return found[1], found[2], found[3]

	#################################################
	### Function - Print a frequency list
	#################################################
	def print_reps(repfile, accName, acc):
	    """Print the table of correction counters.

	    Parameters:
	        repfile: open text file receiving the separator line and the table.
	        accName: counter names, in the same order as `acc`.
	        acc: counter values.

	    Every counter line is written both to `repfile` and to standard output.
	    The value is right-aligned in six columns; the previous spec "{:6>}"
	    was read as fill character "6" with no width, so it never aligned.
	    """
	    print("\n==========================================================\n", file=repfile)
	    for name, count in zip(accName, acc):
	        line = "{:8} - fixed: {:>6}".format(name, count)
	        print(line, file=repfile)
	        print(line)

	#################################################
	### Function - fix upper letters in compound words
	#################################################
	def fixCompoundUpper(form, lemma, upos, feats):
	    """Normalise the lemma of a compound (hyphenated) word.

	    For PROPN, SYM, X and PUNCT the form itself is returned as lemma and the
	    features are reset to "_". For every other tag the lemma is lower-cased
	    and the features are returned untouched (no feature fix is done yet).

	    Returns:
	        The tuple (upos, lemma, feats).
	    """
	    keepsForm = upos in ("PROPN", "SYM", "X", "PUNCT")
	    if keepsForm:
	        return upos, form, "_"
	    return upos, lemma.lower(), feats

	#################################################
	### Function - assemble feats
	#################################################
	def featsFull(feat, abbr=False, extpos="", voicepass=False, prontype="", verbform="", numtype=""):
	    """Assemble a FEATS string with the requested features enforced.

	    Parameters:
	        feat: input features, "_" or "Name=Value" items separated by "|".
	        abbr: when True "Abbr=Yes" is ensured, when False it is removed.
	        extpos: value wanted for ExtPos; "" removes every ExtPos feature.
	        voicepass: when True "Voice=Pass" is ensured, when False it is
	            removed.
	        prontype, verbform, numtype: value wanted for PronType, VerbForm and
	            NumType; "" removes the feature, None leaves it as it is.

	    Returns:
	        The features sorted case-insensitively and joined with "|", or "_"
	        when no feature is left.

	    The separator is the plain "|" character; the previous "\\|" literal was
	    a two-character string, so the input was never split and the output
	    ended with a stray backslash.
	    """
	    # disassemble the string (empty items are dropped)
	    feats = [] if feat == "_" else [f for f in feat.split("|") if f]

	    def toggle(flag, wanted):
	        # add or remove a fixed "Name=Value" flag
	        if wanted and flag not in feats:
	            feats.append(flag)
	        elif not wanted and flag in feats:
	            feats.remove(flag)

	    def force(name, value):
	        # keep at most the wanted value of feature `name`
	        if value is None:
	            return
	        target = name + "=" + value
	        feats[:] = [f for f in feats if not f.startswith(name + "=") or f == target]
	        if value != "" and target not in feats:
	            feats.append(target)

	    toggle("Abbr=Yes", abbr)
	    force("ExtPos", extpos)
	    toggle("Voice=Pass", voicepass)
	    force("PronType", prontype)
	    force("VerbForm", verbform)
	    force("NumType", numtype)

	    # assemble the string
	    if not feats:
	        return "_"
	    feats.sort(key=str.lower)
	    return "|".join(feats)

	#################################################
	### Function - locate the fixed heads in the sentence
	#################################################
	def locateExtPos(tks):
	    """Collect, without repetition, column 6 of every token whose column 7 is "fixed".

	    The values are returned in order of first appearance.
	    """
	    heads = []
	    for token in tks:
	        if token[7] != "fixed":
	            continue
	        head = token[6]
	        if head not in heads:
	            heads.append(head)
	    return heads

	#################################################
	### Function - check options separating lemma and features
	#################################################
	def sepLEMMA_FEATS(options):
	    """Split lexicon options into their distinct lemmas and distinct features.

	    Field 0 of each option goes to the first list and field 2 to the second,
	    each value kept once, in order of first appearance.

	    Returns:
	        The pair (lemmas, features).
	    """
	    lemmas, featSets = [], []
	    for entry in options:
	        for value, seen in ((entry[0], lemmas), (entry[2], featSets)):
	            if value not in seen:
	                seen.append(value)
	    return lemmas, featSets

	#################################################
	### Main Function - Postprocess fix of UPOS, LEMMA and FEATS
	#################################################
	# Tag groups, each one with its own correction policy
	_OUT_OF_LEX_TAGS = ("PROPN", "PUNCT", "SYM", "X")    # correct arbitrarily
	_CLOSED_TAGS = ("ADP", "ADV", "CCONJ", "SCONJ")      # correct if unique in lex, erase feats
	_PRON_DET_TAGS = ("DET", "PRON")                     # correct if unique in lex, require 'PronType'
	_OPEN_TAGS = ("ADJ", "INTJ", "NOUN", "NUM")          # correct if unique in lex
	_VERB_TAGS = ("AUX", "VERB")                         # correct if unique in lex, require 'VerbForm'
	_ORDINAL_SIGNS_MASC = ("º", "°", "o")
	_ORDINAL_SIGNS_FEM = ("ª", "a")
	_ORDINAL_SIGNS_NEUT = (".",)
	# ExtPos implied by the relation of a token that other tokens attach to as 'fixed'
	_EXTPOS_BY_DEPREL = {"cc": "CCONJ", "advmod": "ADV", "case": "ADP", "mark": "SCONJ"}


	def _extPosFor(tk, fixeds):
	    """Return the ExtPos of a token listed in `fixeds`, or "" for any other token."""
	    if tk[0] not in fixeds:
	        return ""
	    # relations outside the table fall back to the token's own UPOS
	    return _EXTPOS_BY_DEPREL.get(tk[7], tk[3])


	def _numericFeats(form, upos):
	    """Return the features of a non-alphabetic ADJ/NOUN/NUM form, judged by its last character."""
	    if upos == "NOUN":
	        return "_"
	    last = form[-1:]
	    if last in _ORDINAL_SIGNS_MASC:
	        return "Gender=Masc|NumType=Ord"
	    if last in _ORDINAL_SIGNS_FEM:
	        return "Gender=Fem|NumType=Ord"
	    if last in _ORDINAL_SIGNS_NEUT:
	        return "NumType=Ord"
	    return "NumType=Card" if upos == "NUM" else "_"


	def _lexiconFix(tk, fixeds, abbr, outOfLex, unique, ambiguous):
	    """Choose lemma and features of a token from its lexicon options.

	    `outOfLex` is a (base features, featsFull keyword arguments) pair used
	    when the lexicon has no option; `unique` and `ambiguous` are the
	    featsFull keyword arguments for one option and for several options.
	    """
	    options = lex.pget(tk[1].lower(), tk[3])
	    opLEMMA, opFEATS = sepLEMMA_FEATS(options)
	    extpos = _extPosFor(tk, fixeds)
	    if len(options) == 0:    # out of the lex
	        baseFeats, kwargs = outOfLex
	        return tk[3], tk[2].lower(), featsFull(baseFeats, abbr, extpos=extpos, **kwargs)
	    if len(options) == 1:    # unambiguous in the lex
	        return tk[3], options[0][0], featsFull(options[0][2], abbr, extpos=extpos, **unique)
	    # ambiguous in the lex - use only what all options agree on
	    lem = opLEMMA[0] if len(opLEMMA) == 1 else tk[2].lower()
	    baseFeats = opFEATS[0] if len(opFEATS) == 1 else tk[5]
	    return tk[3], lem, featsFull(baseFeats, abbr, extpos=extpos, **ambiguous)


	def _proposeFix(tk, fixeds, usualAbbr):
	    """Return the corrected (UPOS, LEMMA, FEATS) proposed for one token."""
	    form, lemma, upos, feats = tk[1], tk[2], tk[3], tk[5]
	    lowered = form.lower()
	    # out of lexikon tokens: the lemma is the form itself
	    if upos in _OUT_OF_LEX_TAGS:
	        keepForeign = (upos == "X") and ("Foreign=Yes" in feats)
	        return upos, form, ("Foreign=Yes" if keepForeign else "_")
	    # compound words: only the lemma is fixed
	    if "-" in form:
	        return fixCompoundUpper(form, lemma, upos, feats)
	    # known abbreviations
	    if upos in ("ADP", "NOUN") and isAbbr(usualAbbr, lowered):
	        return isWithin(usualAbbr, lowered)
	    # numerical NUM, ADJ, NOUN
	    if upos in ("ADJ", "NOUN", "NUM") and not form.isalpha():
	        return upos, form, _numericFeats(form, upos)
	    abbrMarked = "Abbr=Yes" in feats
	    # closed tags - ADP, ADV, CCONJ, SCONJ
	    if upos in _CLOSED_TAGS:
	        abbr = abbrMarked and (lowered != lemma)
	        return _lexiconFix(tk, fixeds, abbr, ("_", {}), {}, {})
	    # Pron and Det tags - PRON, DET
	    if upos in _PRON_DET_TAGS:
	        abbr = abbrMarked and ((lowered != lemma) or ("/" in form) or ("." in form))
	        if "PronType=" in feats:
	            idx = feats.index("PronType=") + 9
	            prontype = feats[idx:idx + 3]
	        else:
	            prontype = "Dem" if upos == "PRON" else "Art"
	        return _lexiconFix(tk, fixeds, abbr, (feats, {"prontype": prontype}),
	                           {"prontype": None}, {"prontype": prontype})
	    # Open tags - ADJ, INTJ, NOUN, NUM
	    if upos in _OPEN_TAGS:
	        abbr = abbrMarked and ((lowered != lemma) or ("/" in form) or ("." in form))
	        verbform = "Part" if ("VerbForm=Part" in feats and upos == "ADJ") else ""
	        if "NumType=Ord" in feats and upos in ("ADJ", "NUM"):
	            numtype = "Ord"
	        elif "NumType=Card" in feats and upos == "NUM":
	            numtype = "Card"
	        else:
	            numtype = ""
	        untouched = {"verbform": None, "numtype": None}
	        return _lexiconFix(tk, fixeds, abbr,
	                           (feats, {"verbform": verbform, "numtype": numtype}),
	                           untouched, untouched)
	    # Verb tags - AUX, VERB
	    if upos in _VERB_TAGS:
	        abbr = abbrMarked and (lowered != lemma)
	        for candidate in ("Inf", "Ger", "Part", "Fin"):
	            if "VerbForm=" + candidate in feats:
	                verbform = candidate
	                break
	        else:
	            # no VerbForm given: guess from the ending of the form
	            verbform = "Inf" if form[-1:].lower() == "r" else "Fin"
	        voicepass = "Voice=Pass" in feats
	        fromLex = {"verbform": None, "voicepass": voicepass}
	        return _lexiconFix(tk, fixeds, abbr,
	                           (feats, {"verbform": verbform, "voicepass": voicepass}),
	                           fromLex, fromLex)
	    # any other tag: leave the token as it is instead of reusing the
	    # values computed for the previous token
	    return upos, lemma, feats


	def posprocFix():
	    """Fix UPOS, LEMMA and FEATS of a CoNLL-U file according to the command line.

	    Without command-line arguments the input is 'yyy.conllu' and the output
	    is 'xxx.conllu'; otherwise parseOptions() decides. The corrected
	    sentences are written to the output file and, unless quiet mode is on,
	    every change is logged to '<output>.rep.tsv' followed by the counters.

	    Token columns used: 0 ID, 1 FORM, 2 LEMMA, 3 UPOS, 5 FEATS, 6 and 7 the
	    head id and relation read by locateExtPos().

	    Fixes over the previous version:
	        - quiet mode no longer fails on the first change (the report file is
	          only written when it exists);
	        - lemma and feature corrections honour the -l and -f options;
	        - tokens with a tag outside the known groups are left unchanged;
	        - ordinal features are joined with "|" (was the literal "\\|");
	        - the files are closed even when an error interrupts the run.
	    """
	    if len(sys.argv) == 1:
	        # output file, input file, do lemmas, do features, run quiet (false)
	        arguments = ["xxx.conllu", "yyy.conllu", True, True, False]
	        print("Assumindo default: 'yyy.conllu' como arquivo de entrada, 'xxx.conllu' como arquivo de saída, e executando correção de lemas e features.")
	    else:
	        arguments = parseOptions(sys.argv)
	    if arguments is None:
	        return
	    if arguments[0] == "":
	        print("Assumindo 'xxx.conllu' como arquivo de saída")
	        arguments[0] = 'xxx.conllu'
	    if not os.path.isfile(arguments[1]):
	        print(arguments[1], "Arquivo de entrada inválido - por favor corrija e tente novamente")
	        return
	    outName, inName, doLemma, doFeats, quiet = arguments[:5]

	    outfile = open(outName, "w")
	    repfile = None
	    try:
	        if not quiet:
	            repfile = open(outName + ".rep.tsv", "w")

	        def report(*fields):
	            # one tab-separated line per change, only when a report is kept
	            if repfile is not None:
	                print(*fields, sep="\t", file=repfile)

	        base = conlluFile(inName)
	        # counters
	        accName = ["Pchanged", "Lchanged", "Fchanged"]
	        acc = [0] * len(accName)
	        # usual Abbr (read from .tsv with "form", "kind", "UPOS", "LEMMA", "FEATS")
	        usualAbbr = getUsualAbbr()
	        # main loop
	        for i in range(base.getS()):
	            b = base.getSentByIndex(i)
	            fixeds = locateExtPos(b[4])
	            for tk in b[4]:
	                # level down contracted tokens info, but ID and FORM
	                if "-" in tk[0]:
	                    for col in range(2, 10):
	                        tk[col] = "_"
	                    continue
	                pos, lem, feat = _proposeFix(tk, fixeds, usualAbbr)
	                # do reports and change
	                if pos != tk[3]:
	                    report(b[0], tk[0], tk[1], tk[3], "UPOS", tk[3], pos)
	                    acc[accName.index("Pchanged")] += 1
	                    tk[3] = pos
	                if doLemma and lem != tk[2]:
	                    report(b[0], tk[0], tk[1], tk[3], "LEMMA", tk[2], lem)
	                    acc[accName.index("Lchanged")] += 1
	                    tk[2] = lem
	                if doFeats and feat != tk[5]:
	                    # NOTE(review): changes carrying ExtPos are applied but
	                    # neither reported nor counted - nesting reconstructed
	                    # from a source without indentation, please confirm
	                    if "ExtPos=" not in feat:
	                        report(b[0], tk[0], tk[1], tk[3], "FEATS", tk[5], feat)
	                        acc[accName.index("Fchanged")] += 1
	                    tk[5] = feat
	        if repfile is not None:
	            print_reps(repfile, accName, acc)
	        base.printNoHeader(outfile)
	    finally:
	        if repfile is not None:
	            repfile.close()
	        outfile.close()

	# Script entry point: runs the post-processing as soon as this file is
	# executed (or imported), reading its options from sys.argv.
	posprocFix()