#################################################
### Post Processing Program to Portparser.v3
#################################################
#
# (c) Lucelene Lopes 2025
#
##################
# main function: posprocFix()
#    It performs the correction of some lemmas and morphological
#       features from the input file and saves it in the output file.
#    The options are:
#       -h or -help    to print out the options
#       -o or -output  to inform (next) the output file
#       -l or -lemma   to perform only the lemma corrections
#       -f or -feats   to perform only the features corrections
#       -q or -quiet   to not generate the changes report (.rep.tsv)
##################
import sys
import os
import lexikon
from conlluFile import conlluFile

lex = lexikon.UDlexPT()

# UPOS groups, each handled by a different correction strategy in _fixToken()
_LEX_OUT_OF_TAGS = ("PROPN", "PUNCT", "SYM", "X")       # never looked up in the lexicon
_LEX_CLOSE_TAGS = ("ADP", "ADV", "CCONJ", "SCONJ")      # lexicon lookup, features rebuilt
_LEX_PRON_DET_TAGS = ("DET", "PRON")                    # lexicon lookup, PronType required
_LEX_OPEN_TAGS = ("ADJ", "INTJ", "NOUN", "NUM")         # lexicon lookup, VerbForm/NumType kept
_LEX_VERB_TAGS = ("AUX", "VERB")                        # lexicon lookup, VerbForm required

# last character of a numeric form -> ordinal marker
_ORDINAL_SIGNS_MASC = ("º", "°", "o")
_ORDINAL_SIGNS_FEM = ("ª", "a")
_ORDINAL_SIGNS_NEUT = (".",)

# deprel of the head of a fixed expression -> ExtPos value
_EXTPOS_BY_DEPREL = {"cc": "CCONJ", "advmod": "ADV", "case": "ADP", "mark": "SCONJ"}


#################################################
### Function CMD line arguments capture
#################################################
def parseOptions(arguments):
    """Parse the command line (``sys.argv``-like list, program name first).

    Returns ``[output_file, input_file, doLemma, doFeats, quiet]``, or
    ``None`` when help was requested or the command line is invalid (a
    message is printed in that case).  When neither ``-l`` nor ``-f`` is
    given, both corrections are enabled.  The input file is the first
    argument without a leading dash; parsing stops there.
    """
    output_file, input_file, doLemma, doFeats, quiet = "", "", False, False, False
    i = 1
    while i < len(arguments):
        arg = arguments[i]
        if arg.startswith("-"):
            # help - show the options, nothing is executed
            if arg in ("-h", "-help"):
                print("Opções:\n-h ajuda\n-o arquivo de saída",
                      "\n-l executa apenas correção de lemma",
                      "\n-f executa apenas correção de features",
                      "\n-q não salva relatório (correção Quieta)",
                      "\nExemplo de utilização:",
                      "\n python3 postproc.py -o yyy.conllu xxx.conllu",
                      "\nBusca as sentenças no arquivo 'xxx.conllu',",
                      " corrige lemmas e features e salva as",
                      " sentenças no arquivo 'yyy.conllu''",
                      sep="")
                return None
            # lemma correction
            elif arg in ("-l", "-lemma"):
                doLemma = True
                i += 1
            # features correction
            elif arg in ("-f", "-feats"):
                doFeats = True
                i += 1
            # quiet mode (no report)
            elif arg in ("-q", "-quiet"):
                quiet = True
                i += 1
            # output file (its name is the next argument)
            elif arg in ("-o", "-output"):
                if i + 1 >= len(arguments):
                    print("A opção {} requer o nome do arquivo de saída, por favor execute novamente".format(arg))
                    return None
                output_file = arguments[i + 1]
                i += 2
            # unknown option - without this branch the loop would never advance
            else:
                print("Opção {} inválida, por favor execute novamente".format(arg))
                return None
        # input file - last parameter (no -i before it)
        else:
            if os.path.isfile(arg):
                input_file = arg
                break
            print("O arquivo {} não foi encontrado, por favor execute novamente".format(arg))
            return None
    if not doLemma and not doFeats:
        doLemma, doFeats = True, True
    return [output_file, input_file, doLemma, doFeats, quiet]


#################################################
### Function - read usual abbreviations
#################################################
def getUsualAbbr():
    """Read the usual abbreviations from ``./src/postproc/usAbbr.tsv``.

    Lines starting with ``#`` are skipped.  For every tab-separated line
    whose second field is ``abbr`` the fields 0, 2, 3 and 4 are kept (the
    caller treats them as form, UPOS, lemma and feats).  Lines with fewer
    than five fields are ignored.
    """
    abbr = []
    with open("./src/postproc/usAbbr.tsv", "r") as infile:
        for line in infile:
            if line.startswith("#"):
                continue
            buf = line.rstrip("\n").split("\t")
            if len(buf) >= 5 and buf[1] == "abbr":
                abbr.append([buf[0], buf[2], buf[3], buf[4]])
    return abbr


#################################################
### Function - Check if word is in the abbreviation
#################################################
def isAbbr(listAbbr, form):
    """Return True when ``form`` equals the first field of an entry of ``listAbbr``."""
    return any(form == a[0] for a in listAbbr)


#################################################
### Function - get info word is in an abbreviation list
#################################################
def isWithin(listAbbr, form):
    """Return fields 1, 2, 3 of the first entry whose first field is ``form``.

    Returns ``(None, None, None)`` when no entry matches.
    """
    for a in listAbbr:
        if form == a[0]:
            return a[1], a[2], a[3]
    return None, None, None


#################################################
### Function - Print a frequency list
#################################################
def print_reps(repfile, accName, acc):
    """Write the counters to ``repfile`` and echo them on standard output."""
    print("\n==========================================================\n", file=repfile)
    for name, count in zip(accName, acc):
        line = "{:8} - fixed: {:>6}".format(name, count)
        print(line, file=repfile)
        print(line)


#################################################
### Function - fix upper letters in compound words
#################################################
def fixCompoundUpper(form, lemma, upos, feats):
    """Return ``(upos, lemma, feats)`` for a compound (hyphenated) word.

    For PROPN, SYM, X and PUNCT the form itself becomes the lemma and the
    features are erased; for any other tag the lemma is lower-cased and
    the features are left untouched (feature handling is not implemented).
    """
    if upos in ("PROPN", "SYM", "X", "PUNCT"):
        return upos, form, "_"
    return upos, lemma.lower(), feats


#################################################
### Function - assemble feats
#################################################
def _setFlag(feats, flag, wanted):
    """Make sure ``flag`` is present in ``feats`` iff ``wanted`` is true."""
    if wanted:
        if flag not in feats:
            feats.append(flag)
        return feats
    return [f for f in feats if f != flag]


def _setValued(feats, name, value):
    """Force feature ``name`` to ``value``.

    ``None`` leaves the feature untouched, ``""`` removes every
    ``name=...`` entry, any other value replaces them by ``name=value``.
    """
    if value is None:
        return feats
    prefix = name + "="
    target = prefix + value
    if value != "" and target not in feats:
        feats.append(target)
    return [f for f in feats if not f.startswith(prefix) or f == target]


def featsFull(feat, abbr=False, extpos="", voicepass=False, prontype="", verbform="", numtype=""):
    """Rebuild a FEATS string, forcing some features to the given values.

    feat      - original FEATS string ("_" or "A=x|B=y")
    abbr      - whether "Abbr=Yes" must be present
    extpos    - required ExtPos value ("" removes any ExtPos)
    voicepass - whether "Voice=Pass" must be present
    prontype, verbform, numtype - required value; "" removes the feature,
                None keeps whatever ``feat`` already has

    Returns the features joined by "|" in case-insensitive alphabetical
    order, or "_" when no feature is left.
    """
    # disassemble the string (empty pieces are dropped so "" behaves like "_")
    feats = [] if feat == "_" else [f for f in feat.split("|") if f]
    feats = _setFlag(feats, "Abbr=Yes", abbr)
    feats = _setValued(feats, "ExtPos", extpos)
    feats = _setFlag(feats, "Voice=Pass", voicepass)
    feats = _setValued(feats, "PronType", prontype)
    feats = _setValued(feats, "VerbForm", verbform)
    feats = _setValued(feats, "NumType", numtype)
    # assemble the string
    if not feats:
        return "_"
    return "|".join(sorted(feats, key=str.lower))


#################################################
### Function - locate the fixed heads in the sentence
#################################################
def locateExtPos(tks):
    """Return the distinct values of column 6 of the tokens whose column 7 is "fixed"."""
    fixeds = []
    for tk in tks:
        if tk[7] == "fixed" and tk[6] not in fixeds:
            fixeds.append(tk[6])
    return fixeds


#################################################
### Function - check options separating lemma and features
#################################################
def sepLEMMA_FEATS(options):
    """Return the distinct lemmas (field 0) and feats (field 2) of ``options``, in order."""
    opLEMMA = []
    opFEATS = []
    for o in options:
        if o[0] not in opLEMMA:
            opLEMMA.append(o[0])
        if o[2] not in opFEATS:
            opFEATS.append(o[2])
    return opLEMMA, opFEATS


#################################################
### Helpers of the main function
#################################################
def _extPosFor(tk, fixeds):
    """Return the ExtPos value of ``tk``, or "" when it heads no fixed expression."""
    if tk[0] not in fixeds:
        return ""
    # the deprel decides the external tag; otherwise the token's own UPOS is used
    return _EXTPOS_BY_DEPREL.get(tk[7], tk[3])


def _ordinalFeats(form, fallback):
    """Return the feats implied by the last character of a numeric form."""
    last = form[-1:]
    if last in _ORDINAL_SIGNS_MASC:
        return "Gender=Masc|NumType=Ord"
    if last in _ORDINAL_SIGNS_FEM:
        return "Gender=Fem|NumType=Ord"
    if last in _ORDINAL_SIGNS_NEUT:
        return "NumType=Ord"
    return fallback


def _resolveFromLex(tk, abbr, extpos, missFeats, missKw, uniqueKw, ambigKw):
    """Correct lemma and feats of ``tk`` using ``lex.pget(form, upos)``.

    missFeats/missKw - base feats and extra featsFull() arguments used
                       when the lexicon has no entry
    uniqueKw         - extra featsFull() arguments when it has exactly one
    ambigKw          - extra featsFull() arguments when it has several;
                       lemma/feats are taken from the lexicon only if all
                       entries agree, otherwise the token's own are kept
    """
    options = lex.pget(tk[1].lower(), tk[3])
    if len(options) == 0:       # out of the lex
        return tk[3], tk[2].lower(), featsFull(missFeats, abbr, extpos=extpos, **missKw)
    if len(options) == 1:       # unambiguous in the lex
        return tk[3], options[0][0], featsFull(options[0][2], abbr, extpos=extpos, **uniqueKw)
    # ambiguous in the lex
    opLEMMA, opFEATS = sepLEMMA_FEATS(options)
    lem = opLEMMA[0] if len(opLEMMA) == 1 else tk[2].lower()
    base = opFEATS[0] if len(opFEATS) == 1 else tk[5]
    return tk[3], lem, featsFull(base, abbr, extpos=extpos, **ambigKw)


def _fixToken(tk, fixeds, usualAbbr):
    """Return the corrected ``(upos, lemma, feats)`` of one token (not applied here)."""
    form, lemma, upos, feats = tk[1], tk[2], tk[3], tk[5]
    lowered = form.lower()
    # out of lexikon tokens: the form is the lemma, only Foreign=Yes survives on X
    if upos in _LEX_OUT_OF_TAGS:
        if upos == "X" and "Foreign=Yes" in feats:
            return upos, form, "Foreign=Yes"
        return upos, form, "_"
    # compound words: fix only the lemma
    if "-" in form:
        return fixCompoundUpper(form, lemma, upos, feats)
    # known abbreviations
    if upos in ("ADP", "NOUN") and isAbbr(usualAbbr, lowered):
        return isWithin(usualAbbr, lowered)
    # numerical NUM, ADJ, NOUN
    if upos in ("ADJ", "NOUN", "NUM") and not form.isalpha():
        if upos == "NOUN":
            return upos, form, "_"
        if upos == "ADJ":
            return upos, form, _ordinalFeats(form, "_")
        return upos, form, _ordinalFeats(form, "NumType=Card")
    # closed tags - ADP, ADV, CCONJ, SCONJ
    if upos in _LEX_CLOSE_TAGS:
        abbr = ("Abbr=Yes" in feats) and (lowered != lemma)
        return _resolveFromLex(tk, abbr, _extPosFor(tk, fixeds), "_", {}, {}, {})
    # Pron and Det tags - PRON, DET
    if upos in _LEX_PRON_DET_TAGS:
        abbr = ("Abbr=Yes" in feats) and ((lowered != lemma) or ("/" in form) or ("." in form))
        if "PronType=" in feats:
            idx = feats.index("PronType=") + 9
            prontype = feats[idx:idx + 3]   # the three characters after "PronType="
        elif upos == "PRON":
            prontype = "Dem"
        else:
            prontype = "Art"
        return _resolveFromLex(tk, abbr, _extPosFor(tk, fixeds), feats,
                               {"prontype": prontype},
                               {"prontype": None},
                               {"prontype": prontype})
    # Open tags - ADJ, INTJ, NOUN, NUM
    if upos in _LEX_OPEN_TAGS:
        abbr = ("Abbr=Yes" in feats) and ((lowered != lemma) or ("/" in form) or ("." in form))
        verbform = "Part" if ("VerbForm=Part" in feats and upos == "ADJ") else ""
        if "NumType=Ord" in feats and upos in ("ADJ", "NUM"):
            numtype = "Ord"
        elif "NumType=Card" in feats and upos == "NUM":
            numtype = "Card"
        else:
            numtype = ""
        keep = {"verbform": None, "numtype": None}
        return _resolveFromLex(tk, abbr, _extPosFor(tk, fixeds), feats,
                               {"verbform": verbform, "numtype": numtype},
                               keep, keep)
    # Verb tags - AUX, VERB
    if upos in _LEX_VERB_TAGS:
        abbr = ("Abbr=Yes" in feats) and (lowered != lemma)
        for candidate in ("Inf", "Ger", "Part", "Fin"):
            if "VerbForm=" + candidate in feats:
                verbform = candidate
                break
        else:
            # no VerbForm predicted: guess from the ending of the form
            verbform = "Inf" if form[-1:].lower() == "r" else "Fin"
        voicepass = "Voice=Pass" in feats
        keep = {"verbform": None, "voicepass": voicepass}
        return _resolveFromLex(tk, abbr, _extPosFor(tk, fixeds), feats,
                               {"verbform": verbform, "voicepass": voicepass},
                               keep, keep)
    # any other tag: leave the token as it is (never reuse a previous token's values)
    return upos, lemma, feats


def _report(repfile, sent_id, tk, column, old, new):
    """Write one change line to the report, unless running quiet (``repfile`` is None)."""
    if repfile is not None:
        print(sent_id, tk[0], tk[1], tk[3], column, old, new, sep="\t", file=repfile)


#################################################
### Main Function - Postprocess fix of UPOS, LEMMA and FEATS
#################################################
def posprocFix():
    """Read the input CoNLL-U file, correct UPOS/LEMMA/FEATS and save the result.

    Options come from ``sys.argv`` (see parseOptions); with no argument the
    defaults 'yyy.conllu' (input) and 'xxx.conllu' (output) are assumed.
    Unless running quiet, every change is listed in '<output>.rep.tsv'
    followed by the totals.  Lemma changes are applied only when lemma
    correction is enabled and feature changes only when feature correction
    is enabled; UPOS changes (known abbreviations) are always applied.
    """
    if len(sys.argv) == 1:
        # output file, input file, do lemmas, do features, run quiet(false)
        arguments = ["xxx.conllu", "yyy.conllu", True, True, False]
        print("Assumindo default: 'yyy.conllu' como arquivo de entrada, 'xxx.conllu' como arquivo de saída, e executando correção de lemas e features.")
    else:
        arguments = parseOptions(sys.argv)
    if arguments is None:
        return
    if arguments[0] == "":
        print("Assumindo 'xxx.conllu' como arquivo de saída")
        arguments[0] = 'xxx.conllu'
    output_file, input_file, doLemma, doFeats, quiet = arguments
    if not os.path.isfile(input_file):
        print(input_file, "Arquivo de entrada inválido - por favor corrija e tente novamente")
        return

    # counters
    accName = ["Pchanged", "Lchanged", "Fchanged"]
    acc = [0] * len(accName)
    with open(output_file, "w") as outfile:
        # no report file at all in quiet mode
        repfile = None if quiet else open(output_file + ".rep.tsv", "w")
        try:
            base = conlluFile(input_file)
            # usual Abbr (read from .tsv with "form", "kind", "UPOS", "LEMMA", "FEATS")
            usualAbbr = getUsualAbbr()
            # main loop
            for i in range(base.getS()):
                b = base.getSentByIndex(i)
                fixeds = locateExtPos(b[4])
                for tk in b[4]:
                    # level down contracted tokens info, but ID and FORM
                    if "-" in tk[0]:
                        for col in range(2, 10):
                            tk[col] = "_"
                        continue
                    pos, lem, feat = _fixToken(tk, fixeds, usualAbbr)
                    # do reports and change
                    if pos != tk[3]:
                        _report(repfile, b[0], tk, "UPOS", tk[3], pos)
                        acc[accName.index("Pchanged")] += 1
                        tk[3] = pos
                    if doLemma and lem != tk[2]:
                        _report(repfile, b[0], tk, "LEMMA", tk[2], lem)
                        acc[accName.index("Lchanged")] += 1
                        tk[2] = lem
                    if doFeats and feat != tk[5]:
                        # changes that involve ExtPos are applied and counted but not listed
                        if "ExtPos=" not in feat:
                            _report(repfile, b[0], tk, "FEATS", tk[5], feat)
                        acc[accName.index("Fchanged")] += 1
                        tk[5] = feat
            if repfile is not None:
                print_reps(repfile, accName, acc)
        finally:
            if repfile is not None:
                repfile.close()
        base.printNoHeader(outfile)


posprocFix()