Spaces:

NILC-ICMC-USP
/

Portparser.v2

Running

File size: 24,272 Bytes

#################################################
### Post Processing Program to Portparser.v3
#################################################
#
#  (c) Lucelene Lopes 2025
#
##################
#  main function: posprocFix()
#    It performs the correction of some lemmas and morphological
#    features from the input file and saves it in the output file.
#    The options are:
#        -h or -help       to print out the options
#        -o or -output     to inform (next) the output file
#        -l or -lemma      to perform only the lemma corrections
#        -f or -feats      to perform only the features corrections
#        -q or -quiet      to not generate the changes report (.rep.tsv)
##################
import sys, os
import lexikon
from conlluFile import conlluFile
# Module-level lexicon instance; posprocFix queries it through
# lex.pget(<lower-cased form>, <UPOS tag>).
lex = lexikon.UDlexPT()

#################################################
### Function CMD line arguments capture
#################################################
def parseOptions(arguments):
    """Parse the command line into the option list consumed by posprocFix.

    Args:
        arguments: raw argument vector in ``sys.argv`` layout; index 0 (the
            program name) is ignored.

    Returns:
        ``[output_file, input_file, doLemma, doFeats, quiet]`` on success.
        ``output_file`` and ``input_file`` are ``""`` when not supplied.
        ``None`` when help was requested or the command line is invalid
        (a message is printed in both cases).
    """
    output_file, input_file = "", ""
    # doLemma/doFeats start False; if neither is requested both become True
    doLemma, doFeats, quiet = False, False, False
    i = 1
    while i < len(arguments):
        arg = arguments[i]
        if arg.startswith("-"):
            # help - only shows the usage text, nothing is executed
            if arg in ("-h", "-help"):
                print("Opções:\n-h ajuda\n-o arquivo de saída",
                      "\n-l executa apenas correção de lemma",
                      "\n-f executa apenas correção de features",
                      "\n-q não salva relatório (correção Quieta)",
                      "\nExemplo de utilização:",
                      "\n   python3 postproc.py -o yyy.conllu xxx.conllu",
                      "\nBusca as sentenças no arquivo 'xxx.conllu',",
                      "  corrige lemmas e features e salva as",
                      "  sentenças no arquivo 'yyy.conllu''",
                      sep="")
                return None
            # lemma correction
            elif arg in ("-l", "-lemma"):
                doLemma = True
                i += 1
            # features correction
            elif arg in ("-f", "-feats"):
                doFeats = True
                i += 1
            # quiet mode (no report)
            elif arg in ("-q", "-quiet"):
                quiet = True
                i += 1
            # output file (its name is the next argument)
            elif arg in ("-o", "-output"):
                if i + 1 >= len(arguments):
                    print("A opção {} requer o nome do arquivo de saída, por favor execute novamente".format(arg))
                    return None
                output_file = arguments[i+1]
                i += 2
            # unknown option: stop instead of looping forever on it
            else:
                print("Opção {} desconhecida, por favor execute novamente".format(arg))
                return None
        # input file - first argument that is not an option; parsing stops here
        else:
            if os.path.isfile(arg):
                input_file = arg
                break
            print("O arquivo {} não foi encontrado, por favor execute novamente".format(arg))
            return None
    if not doLemma and not doFeats:
        doLemma, doFeats = True, True
    return [output_file, input_file, doLemma, doFeats, quiet]

#################################################
### Function - read usual abbreviations
#################################################
def getUsualAbbr(path="./src/postproc/usAbbr.tsv"):
    """Load the usual abbreviations from a tab-separated file.

    Lines starting with ``#`` are skipped.  Only rows whose second column is
    ``"abbr"`` are kept; rows with fewer than five columns (including blank
    lines) are ignored instead of raising.

    Args:
        path: location of the .tsv file; defaults to the path the script has
            always used.

    Returns:
        A list of ``[col0, col2, col3, col4]`` lists, one per kept row, in
        file order.
    """
    abbr = []
    # explicit encoding so the result does not depend on the platform locale
    with open(path, "r", encoding="utf-8") as infile:
        for line in infile:
            if line.startswith("#"):
                continue
            # strip only the line terminator: the last line may have none
            buf = line.rstrip("\n").split("\t")
            if len(buf) >= 5 and buf[1] == "abbr":
                abbr.append([buf[0], buf[2], buf[3], buf[4]])
    return abbr

#################################################
### Function - Check if a word is in the abbreviation list
#################################################
def isAbbr(listAbbr, form):
    """Return True when some entry of *listAbbr* has *form* as its first item."""
    return any(form == entry[0] for entry in listAbbr)

#################################################
### Function - get the info stored for a word in an abbreviation list
#################################################
def isWithin(listAbbr, form):
    """Return items 1..3 of the first entry whose first item equals *form*.

    When no entry matches, the result is ``(None, None, None)``.
    """
    candidates = (entry for entry in listAbbr if form == entry[0])
    hit = next(candidates, None)
    if hit is None:
        return None, None, None
    return hit[1], hit[2], hit[3]

#################################################
### Function - Print a frequency list
#################################################
def print_reps(repfile, accName, acc):
    """Write the change counters to the report file and echo them on stdout.

    Args:
        repfile: writable text stream receiving the separator and the counters.
        accName: counter labels.
        acc: counter values, parallel to *accName*.
    """
    print("\n==========================================================\n", file=repfile)
    for name, count in zip(accName, acc):
        # "{:>6}" right-aligns in 6 columns; the former "{:6>}" was read as
        # fill "6" + align ">" with no width, so nothing was ever padded
        line = "{:8} - fixed: {:>6}".format(name, count)
        print(line, file=repfile)
        print(line)

#################################################
### Function - fix upper letters in compound words
#################################################
def fixCompoundUpper(form, lemma, upos, feats):
    """Normalise the lemma of a compound word.

    For PROPN, SYM, X and PUNCT the form itself becomes the lemma and the
    features are reset to "_".  For every other tag the lemma is lower-cased
    and the features are handed back untouched (no feature correction is
    implemented for compounds yet).

    Returns:
        The tuple ``(upos, lemma, feats)``.
    """
    keepsForm = ("PROPN", "SYM", "X", "PUNCT")
    if upos not in keepsForm:
        return upos, lemma.lower(), feats
    return upos, form, "_"

#################################################
### Function - assemble feats
#################################################
def featsFull(feat, abbr=False, extpos="", voicepass=False, prontype="", verbform="", numtype=""):
    """Rebuild a FEATS string, forcing some features on or off.

    Args:
        feat: the current features joined by "|", or "_" for none.  An empty
            string and empty segments (e.g. a trailing "|") are treated as no
            feature, so they can no longer leak into the result.
        abbr: True forces ``Abbr=Yes`` in, False forces it out.
        extpos: value of ``ExtPos=``; "" removes every ``ExtPos=<value>``.
        voicepass: True forces ``Voice=Pass`` in, False forces it out.
        prontype, verbform, numtype: value of the respective feature; ""
            removes the feature, ``None`` leaves whatever *feat* holds.

    Returns:
        The features sorted case-insensitively and joined by "|", or "_"
        when none is left.
    """
    feats = [] if feat == "_" else [f for f in feat.split("|") if f]

    def setFlag(flag, wanted):
        # add or drop one fixed "Name=Value" item
        if wanted:
            if flag not in feats:
                feats.append(flag)
        else:
            feats[:] = [f for f in feats if f != flag]

    def setValued(name, value):
        # keep exactly "name=value" (added if missing and value is not "")
        # and drop every other "name=..." item
        prefix = name + "="
        keep = prefix + value
        if value != "" and keep not in feats:
            feats.append(keep)
        feats[:] = [f for f in feats if not f.startswith(prefix) or f == keep]

    setFlag("Abbr=Yes", abbr)
    setValued("ExtPos", extpos)
    setFlag("Voice=Pass", voicepass)
    if prontype is not None:
        setValued("PronType", prontype)
    if verbform is not None:
        setValued("VerbForm", verbform)
    if numtype is not None:
        setValued("NumType", numtype)

    if not feats:
        return "_"
    return "|".join(sorted(feats, key=str.lower))

#################################################
### Function - locate the fixed heads in the sentence
#################################################
def locateExtPos(tks):
    """Collect, without repetition, item 6 of every token whose item 7 is "fixed".

    The values are returned in order of first appearance.
    """
    heads = []
    for head in (tk[6] for tk in tks if tk[7] == "fixed"):
        if head in heads:
            continue
        heads.append(head)
    return heads

#################################################
### Function - check options separating lemma and features
#################################################
def sepLEMMA_FEATS(options):
    """Split lexicon options into their distinct lemmas and distinct features.

    Item 0 of each option is taken as the lemma and item 2 as the features;
    both result lists keep first-appearance order.

    Returns:
        The pair ``(lemmas, feats)``.
    """
    distinct = {0: [], 2: []}
    for option in options:
        for column, seen in distinct.items():
            value = option[column]
            if value not in seen:
                seen.append(value)
    return distinct[0], distinct[2]

#################################################
### Main Function - Postprocess fix of UPOS, LEMMA and FEATS
#################################################
def _extposFor(tk, fixeds):
    """Return the ExtPos value for *tk*, or "" when its ID is not in *fixeds*."""
    if tk[0] not in fixeds:
        return ""
    # the dependency relation decides the external POS; any other relation
    # falls back to the token's own UPOS
    deprelToPos = {"cc": "CCONJ", "advmod": "ADV", "case": "ADP", "mark": "SCONJ"}
    return deprelToPos.get(tk[7], tk[3])


def _lexiconFix(tk, options, abbr, extpos, missFeats, missKw, uniqueKw, ambigKw):
    """Choose (lemma, feats) for *tk* out of its lexicon *options*.

    *missFeats*/*missKw* are the base features and extra featsFull keywords
    used when the lexicon has no entry; *uniqueKw* and *ambigKw* are the extra
    keywords for the one-entry and several-entries cases.
    """
    if len(options) == 0:        # out of the lex
        return tk[2].lower(), featsFull(missFeats, abbr, extpos=extpos, **missKw)
    if len(options) == 1:        # unambiguous in the lex
        return options[0][0], featsFull(options[0][2], abbr, extpos=extpos, **uniqueKw)
    # ambiguous in the lex - only use what all the options agree on
    opLEMMA, opFEATS = sepLEMMA_FEATS(options)
    lem = opLEMMA[0] if (len(opLEMMA) == 1) else tk[2].lower()
    base = opFEATS[0] if (len(opFEATS) == 1) else tk[5]
    return lem, featsFull(base, abbr, extpos=extpos, **ambigKw)


def posprocFix():
    """Post-process a CoNLL-U file, fixing UPOS, LEMMA and FEATS.

    Options come from ``sys.argv`` (see parseOptions); with no argument at
    all, 'yyy.conllu' is read and 'xxx.conllu' is written.  Each token is
    handled by the first matching rule:

      * range tokens ("-" in the ID): every column but ID and FORM becomes "_";
      * PROPN / PUNCT / SYM / X: lemma = form, features erased (X keeps
        Foreign=Yes);
      * compound words ("-" in the form): see fixCompoundUpper;
      * known abbreviations tagged ADP or NOUN: values from the abbreviation
        list;
      * non-alphabetic ADJ / NOUN / NUM: lemma = form, ordinal/cardinal
        features derived from the last character;
      * every other listed tag: lemma and features checked against the
        lexicon.

    A token whose tag matches no rule is left untouched.  LEMMA is rewritten
    only when lemma correction is enabled and FEATS only when feature
    correction is enabled; a UPOS change is always applied.  Unless quiet
    mode is on, every change is logged to '<output>.rep.tsv'.
    """
    lexOutOfTags   = ["PROPN", "PUNCT", "SYM", "X"]    # correct arbitrarily
    lexCloseTags   = ["ADP", "ADV", "CCONJ", "SCONJ"]  # correct if unique in lex, erase feats (features are impossible)
    lexPronDetTags = ["DET", "PRON"]                   # correct if unique in lex, require 'PronType', erase impossible features
    lexOpenTags    = ["ADJ", "INTJ", "NOUN", "NUM"]    # correct if unique in lex, erase impossible features
    lexVerbTags    = ["AUX", "VERB"]                   # correct if unique in lex, require 'VerbForm', erase impossible features
    ordinalsignsFem = ['ª', 'a']
    ordinalsignsMasc = ['º', '°', 'o']
    ordinalsignsNeut = ['.']

    if (len(sys.argv) == 1):
        arguments = ["xxx.conllu", "yyy.conllu", True, True, False] # output file, input file, do lemmas, do features, run quiet(false)
        print("Assumindo default: 'yyy.conllu' como arquivo de entrada, 'xxx.conllu' como arquivo de saída, e executando correção de lemas e features.")
    else:
        arguments = parseOptions(sys.argv)
    if arguments is None:
        return
    if (arguments[0] == ""):
        print("Assumindo 'xxx.conllu' como arquivo de saída")
        arguments[0] = 'xxx.conllu'
    if not os.path.isfile(arguments[1]):
        print(arguments[1], "Arquivo de entrada inválido - por favor corrija e tente novamente")
        return
    doLemma, doFeats, quiet = arguments[2], arguments[3], arguments[4]

    outfile = open(arguments[0], "w")
    repfile = None          # stays None in quiet mode: no report is written
    try:
        if not quiet:
            repfile = open(arguments[0]+".rep.tsv", "w")
        base = conlluFile(arguments[1])
        # counters
        accName = ["Pchanged", "Lchanged", "Fchanged"]
        acc = [0]*len(accName)
        # usual Abbr (read from .tsv with "form", "kind", "UPOS", "LEMMA", "FEATS")
        usualAbbr = getUsualAbbr()
        # main loop
        for i in range(base.getS()):
            b = base.getSentByIndex(i)
            fixeds = locateExtPos(b[4])
            for tk in b[4]:
                # level down contracted tokens info, but ID and FORM
                if ("-" in tk[0]):
                    for col in range(2, 10):
                        tk[col] = "_"
                    continue
                # start from the current values so a tag that matches no rule
                # is left alone instead of inheriting the previous token's fix
                pos, lem, feat = tk[3], tk[2], tk[5]
                # fix out of lexikon tokens
                if (tk[3] in lexOutOfTags):
                    lem = tk[1]
                    if (tk[3] == "X") and ("Foreign=Yes" in tk[5]):
                        feat = "Foreign=Yes"
                    else:
                        feat = "_"
                # fix only lemma in compound words
                elif ("-" in tk[1]):
                    pos, lem, feat = fixCompoundUpper(tk[1], tk[2], tk[3], tk[5])
                # fix known abbreviations
                elif (isAbbr(usualAbbr, tk[1].lower())) and (tk[3] in ["ADP", "NOUN"]):
                    pos, lem, feat = isWithin(usualAbbr, tk[1].lower())
                # fix numerical NUM, ADJ, NOUN
                elif (tk[3] in ["ADJ", "NOUN", "NUM"]) and (not tk[1].isalpha()):
                    lem = tk[1]
                    if (tk[3] == "NOUN"):
                        feat = "_"
                    elif (tk[1][-1] in ordinalsignsMasc):
                        feat = "Gender=Masc|NumType=Ord"
                    elif (tk[1][-1] in ordinalsignsFem):
                        feat = "Gender=Fem|NumType=Ord"
                    elif (tk[1][-1] in ordinalsignsNeut):
                        feat = "NumType=Ord"
                    elif (tk[3] == "NUM"):
                        feat = "NumType=Card"
                    else:
                        feat = "_"
                # fix closed tags - ADP, ADV, CCONJ, SCONJ
                elif (tk[3] in lexCloseTags):
                    options = lex.pget(tk[1].lower(), tk[3])
                    abbr = ("Abbr=Yes" in tk[5]) and (tk[1].lower() != tk[2])
                    extpos = _extposFor(tk, fixeds)
                    lem, feat = _lexiconFix(tk, options, abbr, extpos, "_", {}, {}, {})
                # fix Pron and Det tags - PRON, DET
                elif (tk[3] in lexPronDetTags):
                    options = lex.pget(tk[1].lower(), tk[3])
                    abbr = ("Abbr=Yes" in tk[5]) and ((tk[1].lower() != tk[2]) or ("/" in tk[1]) or ("." in tk[1]))
                    extpos = _extposFor(tk, fixeds)
                    if ("PronType" in tk[5]):
                        # keep the three letters that follow "PronType="
                        idx = tk[5].index("PronType=")+9
                        prontype = tk[5][idx:idx+3]
                    elif (tk[3] == "PRON"):
                        prontype = "Dem"
                    else:
                        prontype = "Art"
                    lem, feat = _lexiconFix(tk, options, abbr, extpos, tk[5],
                                            {"prontype": prontype},
                                            {"prontype": None},
                                            {"prontype": prontype})
                # fix Open tags - ADJ, INTJ, NOUN, NUM
                elif (tk[3] in lexOpenTags):
                    options = lex.pget(tk[1].lower(), tk[3])
                    abbr = ("Abbr=Yes" in tk[5]) and ((tk[1].lower() != tk[2]) or ("/" in tk[1]) or ("." in tk[1]))
                    extpos = _extposFor(tk, fixeds)
                    if ("VerbForm=Part" in tk[5]) and (tk[3] == "ADJ"):
                        verbform = "Part"
                    else:
                        verbform = ""
                    if ("NumType=Ord" in tk[5]) and (tk[3] in ["ADJ", "NUM"]):
                        numtype = "Ord"
                    elif ("NumType=Card" in tk[5]) and (tk[3] == "NUM"):
                        numtype = "Card"
                    else:
                        numtype = ""
                    lem, feat = _lexiconFix(tk, options, abbr, extpos, tk[5],
                                            {"verbform": verbform, "numtype": numtype},
                                            {"verbform": None, "numtype": None},
                                            {"verbform": None, "numtype": None})
                # fix Verb tags - AUX, VERB
                elif (tk[3] in lexVerbTags):
                    options = lex.pget(tk[1].lower(), tk[3])
                    abbr = ("Abbr=Yes" in tk[5]) and (tk[1].lower() != tk[2])
                    extpos = _extposFor(tk, fixeds)
                    verbform = next((vf for vf in ("Inf", "Ger", "Part", "Fin")
                                     if "VerbForm="+vf in tk[5]), None)
                    if verbform is None:
                        # no VerbForm given: guess from the last letter of the form
                        verbform = "Inf" if (tk[1][-1].lower() == "r") else "Fin"
                    voicepass = ("Voice=Pass" in tk[5])
                    lem, feat = _lexiconFix(tk, options, abbr, extpos, tk[5],
                                            {"verbform": verbform, "voicepass": voicepass},
                                            {"verbform": None, "voicepass": voicepass},
                                            {"verbform": None, "voicepass": voicepass})
                # do reports and change
                if (pos != tk[3]):
                    if repfile is not None:
                        print(b[0], tk[0], tk[1], tk[3], "UPOS", tk[3], pos, sep="\t", file=repfile)
                    acc[accName.index("Pchanged")] += 1
                    tk[3] = pos
                if doLemma and (lem != tk[2]):
                    if repfile is not None:
                        print(b[0], tk[0], tk[1], tk[3], "LEMMA", tk[2], lem, sep="\t", file=repfile)
                    acc[accName.index("Lchanged")] += 1
                    tk[2] = lem
                if doFeats and (feat != tk[5]):
                    # changes that carry an ExtPos are applied but not reported/counted
                    if ("ExtPos=" not in feat):
                        if repfile is not None:
                            print(b[0], tk[0], tk[1], tk[3], "FEATS", tk[5], feat, sep="\t", file=repfile)
                        acc[accName.index("Fchanged")] += 1
                    tk[5] = feat
        if repfile is not None:
            print_reps(repfile, accName, acc)
        base.printNoHeader(outfile)
    finally:
        # close both files even when the processing above fails
        if repfile is not None:
            repfile.close()
        outfile.close()

# Script entry point: the post-processing runs as soon as this file is
# executed (there is no __main__ guard, so importing the module runs it too).
posprocFix()