# Source: Portparser.v2 / src/postproc/postprocess.py
# Repository owner: NILC-ICMC-USP
# Last commit: "Update src/postproc/postprocess.py" (ac78309, verified)
#################################################
### Post Processing Program to Portparser.v3
#################################################
#
# (c) Lucelene Lopes 2025
#
##################
# main function: posprocFix()
# It performs the correction of some lemmas and morphological
# features from the input file and saves it in the output file.
# The options are:
# -h or -help to print out the options
# -o or -output to inform (next) the output file
# -l or -lemma to perform only the lemma corrections
# -f or -feats to perform only the features corrections
# -q or -quiet to not generate the changes report (.rep.tsv)
##################
import sys, os
import lexikon
from conlluFile import conlluFile
# Module-wide lexicon instance; the token-fixing code below queries it
# through lex.pget(form, upos).
lex = lexikon.UDlexPT()
#################################################
### Function CMD line arguments capture
#################################################
def parseOptions(arguments):
    """Parse the command-line arguments of the post-processing script.

    Parameters:
        arguments: the raw argument vector (same layout as ``sys.argv``);
            item 0 is the program name and is ignored.

    Returns:
        ``[output_file, input_file, doLemma, doFeats, quiet]`` on success, or
        ``None`` when nothing should be executed (help requested, invalid or
        incomplete option, input file not found). ``output_file`` and
        ``input_file`` are ``""`` when they were not given.

    Recognised options: ``-h``/``-help``, ``-l``/``-lemma``, ``-f``/``-feats``,
    ``-q``/``-quiet`` and ``-o``/``-output`` followed by the output file name.
    The first argument that does not start with ``-`` is taken as the input
    file and ends the parsing.
    """
    # Defaults: doLemma and doFeats are switched on by their options; when
    # neither is requested both are enabled at the end.
    output_file, input_file, doLemma, doFeats, quiet = "", "", False, False, False
    i = 1
    while i < len(arguments):
        arg = arguments[i]
        if arg.startswith("-"):
            # help - show the usage text, nothing is executed
            if arg in ("-h", "-help"):
                print("Opções:\n-h ajuda\n-o arquivo de saída",
                      "\n-l executa apenas correção de lemma",
                      "\n-f executa apenas correção de features",
                      "\n-q não salva relatório (correção Quieta)",
                      "\nExemplo de utilização:",
                      "\n python3 postproc.py -o yyy.conllu xxx.conllu",
                      "\nBusca as sentenças no arquivo 'xxx.conllu',",
                      " corrige lemmas e features e salva as",
                      " sentenças no arquivo 'yyy.conllu''",
                      sep="")
                return None
            # lemma correction
            elif arg in ("-l", "-lemma"):
                doLemma = True
                i += 1
            # features correction
            elif arg in ("-f", "-feats"):
                doFeats = True
                i += 1
            # quiet mode (no report)
            elif arg in ("-q", "-quiet"):
                quiet = True
                i += 1
            # output file (the name is the next argument)
            elif arg in ("-o", "-output"):
                if i + 1 >= len(arguments):
                    # the option was given without its value
                    print("A opção {} requer o nome do arquivo de saída, por favor execute novamente".format(arg))
                    return None
                output_file = arguments[i + 1]
                i += 2
            # unknown option: stop instead of looping forever on it
            else:
                print("Opção {} inválida, por favor execute novamente".format(arg))
                return None
        # input file - last parameter (no option letter before it)
        else:
            if os.path.isfile(arg):
                input_file = arg
                break
            else:
                print("O arquivo {} não foi encontrado, por favor execute novamente".format(arg))
                return None
    if not doLemma and not doFeats:
        doLemma, doFeats = True, True
    # The parsed values are returned as they are: they must not be replaced by
    # fixed positions of the argument vector, which are only right for the
    # exact call "prog -o OUT IN".
    return [output_file, input_file, doLemma, doFeats, quiet]
#################################################
### Function - read usual abbreviations
#################################################
def getUsualAbbr(path="./src/postproc/usAbbr.tsv"):
    """Read the table of usual abbreviations from a tab-separated file.

    Parameters:
        path: location of the .tsv file; defaults to the file shipped with
            the post-processor (relative to the current working directory).

    Returns:
        A list with one ``[col0, col2, col3, col4]`` entry for every row whose
        second column is ``"abbr"``, in file order.

    Lines starting with ``#`` and blank lines are skipped. A row marked as
    ``abbr`` that has fewer than five columns raises ``IndexError``.
    """
    abbr = []
    # NOTE(review): the file is read as UTF-8 - confirm the encoding of usAbbr.tsv.
    with open(path, "r", encoding="utf-8") as infile:
        for line in infile:
            # Strip only the line terminator, so a last line without a
            # trailing newline keeps its final character.
            line = line.rstrip("\n")
            if not line or line.startswith("#"):
                continue
            buf = line.split("\t")
            if len(buf) > 1 and buf[1] == "abbr":
                abbr.append([buf[0], buf[2], buf[3], buf[4]])
    return abbr
#################################################
### Function - Check if a word form is in the abbreviation list
#################################################
def isAbbr(listAbbr, form):
    """Tell whether *form* equals the first field of some entry of *listAbbr*.

    Parameters:
        listAbbr: sequence of entries; only item 0 of each entry is compared.
        form: the value to look for.

    Returns:
        ``True`` if an entry matches, ``False`` otherwise.
    """
    return any(form == entry[0] for entry in listAbbr)
#################################################
### Function - Get the information stored for a word form in the abbreviation list
#################################################
def isWithin(listAbbr, form):
    """Fetch the data stored for *form* in the abbreviation list.

    Parameters:
        listAbbr: sequence of entries with at least four items each.
        form: the value compared against item 0 of every entry.

    Returns:
        Items 1, 2 and 3 of the first matching entry, as a 3-tuple, or
        ``(None, None, None)`` when no entry matches.
    """
    hit = next((entry for entry in listAbbr if form == entry[0]), None)
    if hit is None:
        return None, None, None
    return hit[1], hit[2], hit[3]
#################################################
### Function - Print a frequency list
#################################################
def print_reps(repfile, accName, acc):
    """Write the summary of the counters to the report file and to stdout.

    Parameters:
        repfile: writable text stream that receives a separator line followed
            by one line per counter.
        accName: counter names, parallel to *acc*.
        acc: counter values.

    Every counter line is also echoed on standard output; the separator is
    written to *repfile* only.
    """
    print("\n==========================================================\n", file=repfile)
    for name, count in zip(accName, acc):
        # ">6" right-aligns the count in six columns. The former spec "6>"
        # was read as fill character '6' with no width, so nothing was aligned.
        line = "{:8} - fixed: {:>6}".format(name, count)
        print(line, file=repfile)
        print(line)
#################################################
### Function - fix upper-case letters in compound words
#################################################
def fixCompoundUpper(form, lemma, upos, feats):
    """Normalise the lemma of a compound (hyphenated) word.

    Parameters:
        form: the word form.
        lemma: the current lemma.
        upos: the UPOS tag.
        feats: the current feature string.

    Returns:
        A ``(upos, lemma, feats)`` tuple. For PROPN, SYM, X and PUNCT the form
        itself is used as lemma and the features become ``"_"``; for any other
        tag the lemma is lower-cased and the features are returned untouched.
    """
    if upos in ("PROPN", "SYM", "X", "PUNCT"):
        return upos, form, "_"
    # A finer, per-segment treatment of the upper-case letters of the lemma
    # and any handling of the features are not implemented yet.
    return upos, lemma.lower(), feats
#################################################
### Function - assemble feats
#################################################
def _setFlagFeature(feats, flag, wanted):
    """Return the feature list with *flag* present exactly when *wanted* is true."""
    if wanted:
        if flag not in feats:
            feats.append(flag)
        return feats
    return [f for f in feats if f != flag]


def _setValuedFeature(feats, name, value):
    """Return the feature list where ``name=value`` is the only *name* feature.

    An empty *value* removes every ``name=...`` feature that has a value.
    """
    prefix = name + "="
    wanted = prefix + value
    kept = [f for f in feats if not f.startswith(prefix) or f == wanted]
    if value != "" and wanted not in kept:
        kept.append(wanted)
    return kept


def featsFull(feat, abbr=False, extpos="", voicepass=False, prontype="", verbform="", numtype=""):
    """Assemble a feature string, forcing the value of some specific features.

    Parameters:
        feat: the starting feature string, ``|``-separated; ``"_"`` or an
            empty string mean "no features".
        abbr: when true ``Abbr=Yes`` is ensured, otherwise it is removed.
        extpos: value required for ``ExtPos``; ``""`` removes ``ExtPos``.
        voicepass: when true ``Voice=Pass`` is ensured, otherwise it is removed.
        prontype: value required for ``PronType``; ``""`` removes it and
            ``None`` leaves whatever *feat* carries untouched.
        verbform: same convention as *prontype*, for ``VerbForm``.
        numtype: same convention as *prontype*, for ``NumType``.

    Returns:
        The features sorted case-insensitively and joined by ``|``, or ``"_"``
        when none is left.
    """
    # Disassemble the string; empty pieces are dropped so that "" or a stray
    # "|" cannot yield an empty feature in the result.
    if feat == "_":
        feats = []
    else:
        feats = [f for f in feat.split("|") if f]
    feats = _setFlagFeature(feats, "Abbr=Yes", abbr)
    feats = _setValuedFeature(feats, "ExtPos", extpos)
    feats = _setFlagFeature(feats, "Voice=Pass", voicepass)
    # For these three, None means "do not touch the feature at all".
    for name, value in (("PronType", prontype), ("VerbForm", verbform), ("NumType", numtype)):
        if value is not None:
            feats = _setValuedFeature(feats, name, value)
    # Assemble the string.
    if not feats:
        return "_"
    feats.sort(key=str.lower)
    return "|".join(feats)
#################################################
### Function - locate the fixed heads in the sentence
#################################################
def locateExtPos(tks):
    """Collect the heads of the tokens attached by the ``fixed`` relation.

    Parameters:
        tks: sequence of tokens; item 6 of a token is read as its head and
            item 7 as its dependency relation.

    Returns:
        The distinct heads of the ``fixed`` tokens, in order of first
        appearance.
    """
    heads = []
    for token in tks:
        if token[7] != "fixed":
            continue
        head = token[6]
        if head in heads:
            continue
        heads.append(head)
    return heads
#################################################
### Function - check options separating lemma and features
#################################################
def sepLEMMA_FEATS(options):
    """Split a list of options into their distinct lemmas and features.

    Parameters:
        options: sequence of entries; item 0 of an entry is collected as
            lemma and item 2 as feature string.

    Returns:
        A pair ``(lemmas, feats)`` of lists without duplicates, each in order
        of first appearance.
    """
    lemmas, featStrings = [], []
    for option in options:
        lemma = option[0]
        if lemma not in lemmas:
            lemmas.append(lemma)
        featString = option[2]
        if featString not in featStrings:
            featStrings.append(featString)
    return lemmas, featStrings
#################################################
### Main Function - Postprocess fix of UPOS, LEMMA and FEATS
#################################################
# UPOS groups; each one has its own correction strategy in _fixToken().
_LEX_OUT_OF_TAGS = ("PROPN", "PUNCT", "SYM", "X")      # corrected arbitrarily
_LEX_CLOSE_TAGS = ("ADP", "ADV", "CCONJ", "SCONJ")     # lexicon lookup, features erased when out of the lexicon
_LEX_PRON_DET_TAGS = ("DET", "PRON")                   # lexicon lookup, 'PronType' required
_LEX_OPEN_TAGS = ("ADJ", "INTJ", "NOUN", "NUM")        # lexicon lookup
_LEX_VERB_TAGS = ("AUX", "VERB")                       # lexicon lookup, 'VerbForm' required
# Last characters that mark a numeric form as an ordinal.
_ORDINAL_SIGNS_MASC = ("º", "°", "o")
_ORDINAL_SIGNS_FEM = ("ª", "a")
_ORDINAL_SIGNS_NEUT = (".",)
# ExtPos value implied by the dependency relation of the head of a fixed expression.
_DEPREL_EXTPOS = {"cc": "CCONJ", "advmod": "ADV", "case": "ADP", "mark": "SCONJ"}


def _extPosFor(tk, fixeds):
    """Return the ExtPos value for a token, or "" if it heads no fixed expression."""
    if tk[0] not in fixeds:
        return ""
    # Relations outside the table fall back to the token's own UPOS.
    return _DEPREL_EXTPOS.get(tk[7], tk[3])


def _ordinalFeats(form, fallback):
    """Return the features implied by the last character of a numeric form."""
    last = form[-1:]  # slice, so an empty form cannot raise
    if last in _ORDINAL_SIGNS_MASC:
        return "Gender=Masc|NumType=Ord"
    if last in _ORDINAL_SIGNS_FEM:
        return "Gender=Fem|NumType=Ord"
    if last in _ORDINAL_SIGNS_NEUT:
        return "NumType=Ord"
    return fallback


def _lexiconFix(tk, abbr, extpos, missFeats, missKw, uniqueKw, ambigKw):
    """Correct lemma and features of a token according to the lexicon.

    The three keyword dictionaries are the extra arguments handed to
    featsFull() when the form is, respectively, absent from the lexicon,
    unambiguous in it, or ambiguous in it. *missFeats* is the starting
    feature string used when the form is absent.
    """
    # NOTE(review): lex.pget() is assumed to return a sequence of entries with
    # the lemma at index 0 and the feature string at index 2 - confirm in lexikon.
    options = lex.pget(tk[1].lower(), tk[3])
    if len(options) == 0:  # out of the lexicon
        return tk[3], tk[2].lower(), featsFull(missFeats, abbr, extpos=extpos, **missKw)
    if len(options) == 1:  # unambiguous in the lexicon
        return tk[3], options[0][0], featsFull(options[0][2], abbr, extpos=extpos, **uniqueKw)
    # ambiguous in the lexicon: trust only what all the options agree on
    opLEMMA, opFEATS = sepLEMMA_FEATS(options)
    lem = opLEMMA[0] if len(opLEMMA) == 1 else tk[2].lower()
    start = opFEATS[0] if len(opFEATS) == 1 else tk[5]
    return tk[3], lem, featsFull(start, abbr, extpos=extpos, **ambigKw)


def _fixToken(tk, fixeds, usualAbbr):
    """Compute the corrected (UPOS, LEMMA, FEATS) of one token without changing it."""
    form, lemma, upos, feats = tk[1], tk[2], tk[3], tk[5]
    # out of lexicon tokens: the form is the lemma
    if upos in _LEX_OUT_OF_TAGS:
        if upos == "X" and "Foreign=Yes" in feats:
            return upos, form, "Foreign=Yes"
        return upos, form, "_"
    # compound words: only the lemma is fixed
    if "-" in form:
        return fixCompoundUpper(form, lemma, upos, feats)
    # known abbreviations: everything comes from the abbreviation table
    if upos in ("ADP", "NOUN") and isAbbr(usualAbbr, form.lower()):
        return isWithin(usualAbbr, form.lower())
    # numerical NUM, ADJ, NOUN
    if upos in ("ADJ", "NOUN", "NUM") and not form.isalpha():
        if upos == "NOUN":
            return upos, form, "_"
        return upos, form, _ordinalFeats(form, "_" if upos == "ADJ" else "NumType=Card")
    hasAbbr = "Abbr=Yes" in feats
    formDiffers = form.lower() != lemma
    # closed tags - ADP, ADV, CCONJ, SCONJ
    if upos in _LEX_CLOSE_TAGS:
        extpos = _extPosFor(tk, fixeds)
        return _lexiconFix(tk, hasAbbr and formDiffers, extpos, "_", {}, {}, {})
    # PRON, DET
    if upos in _LEX_PRON_DET_TAGS:
        extpos = _extPosFor(tk, fixeds)
        abbr = hasAbbr and (formDiffers or ("/" in form) or ("." in form))
        if "PronType=" in feats:
            # keep the three-letter value the token already has
            start = feats.index("PronType=") + 9
            prontype = feats[start:start + 3]
        elif upos == "PRON":
            prontype = "Dem"
        else:
            prontype = "Art"
        return _lexiconFix(tk, abbr, extpos, feats,
                           {"prontype": prontype}, {"prontype": None}, {"prontype": prontype})
    # open tags - ADJ, INTJ, NOUN, NUM
    if upos in _LEX_OPEN_TAGS:
        extpos = _extPosFor(tk, fixeds)
        abbr = hasAbbr and (formDiffers or ("/" in form) or ("." in form))
        verbform = "Part" if ("VerbForm=Part" in feats and upos == "ADJ") else ""
        if "NumType=Ord" in feats and upos in ("ADJ", "NUM"):
            numtype = "Ord"
        elif "NumType=Card" in feats and upos == "NUM":
            numtype = "Card"
        else:
            numtype = ""
        untouched = {"verbform": None, "numtype": None}
        return _lexiconFix(tk, abbr, extpos, feats,
                           {"verbform": verbform, "numtype": numtype}, untouched, untouched)
    # verb tags - AUX, VERB
    if upos in _LEX_VERB_TAGS:
        extpos = _extPosFor(tk, fixeds)
        for candidate in ("Inf", "Ger", "Part", "Fin"):
            if "VerbForm=" + candidate in feats:
                verbform = candidate
                break
        else:
            # no VerbForm at all: guess from the ending of the form
            verbform = "Inf" if form[-1:].lower() == "r" else "Fin"
        voicepass = "Voice=Pass" in feats
        lexKw = {"verbform": None, "voicepass": voicepass}
        return _lexiconFix(tk, hasAbbr and formDiffers, extpos, feats,
                           {"verbform": verbform, "voicepass": voicepass}, lexKw, lexKw)
    # Any other tag is left as it is, so that the values computed for the
    # previous token are never applied to this one.
    return upos, lemma, feats


def _report(repfile, sentId, tk, column, old, new):
    """Write one change line to the report, unless running quiet (repfile is None)."""
    if repfile is not None:
        print(sentId, tk[0], tk[1], tk[3], column, old, new, sep="\t", file=repfile)


def posprocFix():
    """Fix UPOS, LEMMA and FEATS of a CoNLL-U file (script entry point).

    The options are taken from ``sys.argv`` through parseOptions(); without
    any argument the input is 'yyy.conllu' and the output 'xxx.conllu'.
    Nothing is returned: the corrected sentences are written to the output
    file and, unless the quiet option is set, every change is listed in
    ``<output>.rep.tsv`` followed by the change counters.

    Per token:
      * contracted tokens (ID containing "-") keep only ID and FORM;
      * the new values come from _fixToken();
      * lemma changes are applied only when lemma correction is enabled and
        feature changes only when features correction is enabled.
    """
    if len(sys.argv) == 1:
        # output file, input file, do lemmas, do features, run quiet (False)
        arguments = ["xxx.conllu", "yyy.conllu", True, True, False]
        print("Assumindo default: 'yyy.conllu' como arquivo de entrada, 'xxx.conllu' como arquivo de saída, e executando correção de lemas e features.")
    else:
        arguments = parseOptions(sys.argv)
    if arguments is None:
        return
    if arguments[0] == "":
        print("Assumindo 'xxx.conllu' como arquivo de saída")
        arguments[0] = 'xxx.conllu'
    if not os.path.isfile(arguments[1]):
        print(arguments[1], "Arquivo de entrada inválido - por favor corrija e tente novamente")
        return
    output_file, input_file, doLemma, doFeats, quiet = arguments
    # The input is loaded before any output file is created, so a failure
    # while reading cannot leave a truncated output behind.
    base = conlluFile(input_file)
    # usual Abbr (read from .tsv with "form", "kind", "UPOS", "LEMMA", "FEATS")
    usualAbbr = getUsualAbbr()
    # counters
    accName = ["Pchanged", "Lchanged", "Fchanged"]
    acc = [0] * len(accName)
    # In quiet mode there is no report file at all; _report() then does nothing.
    repfile = None if quiet else open(output_file + ".rep.tsv", "w")
    try:
        # main loop
        for i in range(base.getS()):
            b = base.getSentByIndex(i)
            # NOTE(review): b[0] is used as the sentence identifier of the
            # report and b[4] as the token list - confirm in conlluFile.
            fixeds = locateExtPos(b[4])
            for tk in b[4]:
                # level down contracted tokens info, but ID and FORM
                if "-" in tk[0]:
                    for col in range(2, 10):
                        tk[col] = "_"
                    continue
                pos, lem, feat = _fixToken(tk, fixeds, usualAbbr)
                # do reports and change
                # NOTE(review): UPOS only changes through the abbreviation
                # table and is applied whatever options are set - confirm.
                if pos != tk[3]:
                    _report(repfile, b[0], tk, "UPOS", tk[3], pos)
                    acc[accName.index("Pchanged")] += 1
                    tk[3] = pos
                if doLemma and lem != tk[2]:
                    _report(repfile, b[0], tk, "LEMMA", tk[2], lem)
                    acc[accName.index("Lchanged")] += 1
                    tk[2] = lem
                if doFeats and feat != tk[5]:
                    # NOTE(review): changes that carry an ExtPos are applied
                    # but neither reported nor counted - confirm this is the
                    # intended nesting.
                    if "ExtPos=" not in feat:
                        _report(repfile, b[0], tk, "FEATS", tk[5], feat)
                        acc[accName.index("Fchanged")] += 1
                    tk[5] = feat
        if repfile is not None:
            print_reps(repfile, accName, acc)
    finally:
        if repfile is not None:
            repfile.close()
    with open(output_file, "w") as outfile:
        base.printNoHeader(outfile)
# Script entry point: runs the whole post-processing when this file is executed.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module triggers the run as well - confirm that this is intended.
posprocFix()