Spaces:

NILC-ICMC-USP
/

Portparser.v2

Running

File size: 15,175 Bytes

ec63fa6

# conlluFile.py - a Python 3 module to handle CoNLL-U files (.conllu) in a data structure (base)
#
# (c) Lucelene Lopes 2023
#
# member functions:
#    conlluFile - the constructor from an input conllu file (name) - default no name creates an empty base
#                                                                  - default considering contracted word (skipAg=True if not)
# Accessors
#    getBase(self):              # return the whole base
#    getHeader(self):            # return in a single string the initial lines of the conllu
#    getS(self):                 # return the number of sentences
#    getT(self):                 # return the number of tokens (ignoring contracted words)
#    getSandT(self):             # return the number of sentences and tokens
#    getSentByID(self, SID):     # return a sentence by its SID (string) - return the string "none" if absent
#    getSentByIndex(self, ind):  # return a sentence by its index (int) - return the string "none" if absent
#    getSentInd(self, SID):      # return the index (int) of the sentence with SID (string) - return -1 if absent
#    getSentID(self, ind):       # return the SID (string) for the sentence indexed by ind - return -1 if absent
#    isSIDin(self, SID):         # return True if the SID is in the base
#    isINDin(self, ind):         # return True if the index (int) is in the base
#    isSentTagged(self, ind):    # return True if the sentence indexed by ind (int) has a non empty tag (b[5])
#    numberSentSize(self, size): # return how many sentences in the base have this size
#    sentSizeRange(self):        # return the smallest and largest sentence size within the base
#    getAllSIDs(self):           # return the list with all sentence IDs
# Mutators
#    addToBase(self, name):          # add a conllu file (name) to the base considering contracted word or not (skipAg)
#    removeSentInd(self, s):         # remove the sentence with index s (int) from base
#    removeSentSID(self, s):         # remove the sentence with SID s (string) from base
#    tagTokenAtSID(self,s,t,tag):    # sets tag (string) for s is the SID (string), t is the token id (string)
#    tagTokenAtSent(self,s,t,tag):   # sets tag (string) for s is the sentence index (int), t is the token id (string)
#    tagSent(self,s,tag):            # sets tag (string) for s is the sentence index (int)
#    setSentTags(self):              # set the sentence tags (additional info) based on the tokens tags (additional info)
#    sortBase(self):                 # sort the base according to SID
# Prints
#    printSent(self, ind, outfile, nodeprel=False): # prints out a sentence by its index (int) in an outfile with all 10 fields
#    printHeaderToo(self, outfile, nodeprel=False): # prints out the file header followed by the whole base in an outfile
#    printNoHeader(self, outfile, nodeprel=False):  # prints out the whole base in an outfile, without the file header
#
# Sentence structure
    ## b[0] SID - sentence ID
    ## b[1] TEXT - text of the sentence
    ## b[2] number of tokens (not including the contracted word lines)
    ## b[3] lines of the header (including, but not limited to, the '# sent_id =' and '# text' lines)
    ## b[4] token lines (including contracted word lines)
    ##   each token line has 10 elements of the CoNLL-U format, plus one place holder for information
    ## b[5] status of change (a place holder for information)


class conlluFile:
    """In-memory store (the "base") of sentences read from CoNLL-U files.

    Every sentence is kept as a 6-element list ``b``:
        b[0] SID   - sentence ID (text after "# sent_id = ")
        b[1] TEXT  - sentence text (text after "# text = ", "" if absent)
        b[2] number of tokens (contracted word lines are not counted)
        b[3] header lines of the sentence (the "#" lines, "# sent_id" first)
        b[4] token lines; each is the list of CoNLL-U columns plus one
             trailing place holder (index 10 for a 10-column line)
        b[5] sentence tag / status of change (initially "")

    Instance variables:
        self.base    - the list of sentences
        self.header  - the lines found before the first "# sent_id" line of
                       the most recently read file, as a single string
        self.s       - the total number of sentences
        self.t       - the total number of tokens (no contracted word lines)
    """

    def __init__(self, name="", skipAg=False):
        """Create a base, empty or loaded from the CoNLL-U file ``name``.

        name   - path of the input file; "" (default) creates an empty base
        skipAg - when True, contracted word lines (IDs such as "2-3") are
                 not stored; when False (default) they are stored but still
                 not counted as tokens

        A sentence whose SID is already in the base is skipped and reported
        with a "Duplicated SID:" message. The base is sorted by SID.
        """
        self.base = []
        self.header, self.s, self.t = "", 0, 0
        if name != "":
            self._readFile(name, skipAg, reportEach=True)
        self.base.sort()

    def _readFile(self, name, skipAg, reportEach):
        """Parse the file ``name`` into the base; return how many sentences were skipped as duplicates."""
        duplicates = 0
        headerLines = []
        inHeader = True           # True until the first "# sent_id" line is met
        sent = None               # sentence being built, None between sentences
        seen = {b[0] for b in self.base}   # O(1) duplicate detection
        # CoNLL-U files are UTF-8 by specification; do not depend on the locale.
        with open(name, "r", encoding="utf-8") as infile:
            for raw in infile:
                # strip only the line terminator, so a last line without a
                # newline does not lose its final character
                line = raw.rstrip("\n")
                if sent is None:
                    if line.startswith("# sent_id = "):
                        inHeader = False
                        sent = [line[12:], "", 0, [line], [], ""]
                    elif inHeader:
                        headerLines.append(raw)
                    # NOTE: after the file header, lines found between
                    # sentences before a "# sent_id" line are ignored
                elif line == "":
                    duplicates += self._store(sent, seen, reportEach)
                    sent = None
                elif line[0] == "#":
                    sent[3].append(line)
                    if line.startswith("# text = "):
                        sent[1] = line[9:]
                else:
                    fields = self._parseToken(line)
                    contracted = "-" in fields[0]
                    if not contracted:
                        sent[2] += 1
                    if not (skipAg and contracted):
                        sent[4].append(fields)
        # last sentence of a file that does not end with a blank line
        if sent is not None:
            duplicates += self._store(sent, seen, reportEach)
        # NOTE: the stored header is always the one of the last file read
        self.header = "".join(headerLines)
        return duplicates

    @staticmethod
    def _parseToken(line):
        """Split a token line into its columns and append the tag place holder."""
        # CoNLL-U columns are tab separated (forms and lemmas may hold spaces);
        # fall back to whitespace splitting only for lines without any tab.
        fields = line.split("\t") if "\t" in line else line.split()
        # a bracketed PoS tag such as "[NOUN]" is stored without the brackets
        # (raises IndexError for lines with fewer than 4 columns)
        if fields[3].startswith("["):
            fields[3] = fields[3][1:-1]
        fields.append("")  # holder for token change status (information place holder)
        return fields

    def _store(self, sent, seen, reportEach):
        """Append ``sent`` to the base unless its SID is known; return 1 if it was a duplicate, else 0."""
        if sent[0] in seen:
            if reportEach:
                print("Duplicated SID:", sent[0])
            return 1
        seen.add(sent[0])
        self.base.append(sent)
        self.s += 1
        self.t += sent[2]
        return 0

    def addToBase(self, name, skipAg=False):
        """Add the sentences of the CoNLL-U file ``name`` to the base.

        skipAg has the same meaning as in the constructor. Sentences whose
        SID is already in the base are skipped; their count is printed as
        "Already existent: N". The base is not re-sorted (see sortBase), and
        the stored header is replaced by the header of the added file.
        """
        newAcc = self._readFile(name, skipAg, reportEach=False)
        print("Already existent:", newAcc)

    def removeSentInd(self, s):
        """Remove the sentence with index s (int) from the base.

        Raises IndexError, leaving the counters untouched, if s is out of range.
        """
        removed = self.base.pop(s)
        self.s -= 1
        self.t -= removed[2]

    def removeSentSID(self, s):
        """Remove the sentence with SID s (string) from the base; print a warning if it is absent."""
        ind = self.getSentInd(s)
        if ind == -1:
            print("Trying to remove an absent SID")
        else:
            self.removeSentInd(ind)

    def getBase(self):
        """Return the whole base (the list of sentences itself, not a copy)."""
        return self.base

    def getHeader(self):
        """Return in a single string the initial lines of the conllu."""
        return self.header

    def getS(self):
        """Return the number of sentences."""
        return self.s

    def getT(self):
        """Return the number of tokens (ignoring contracted words)."""
        return self.t

    def getSandT(self):
        """Return the number of sentences and of tokens (ignoring contracted words)."""
        return self.s, self.t

    def getSentByID(self, SID):
        """Return the sentence with this SID (string), or the string "none" if absent."""
        for b in self.base:
            if b[0] == SID:
                return b
        return "none"

    def getSentByIndex(self, ind):
        """Return the sentence at index ind (int), or the string "none" if absent.

        Negative indices are treated as absent, so the -1 returned by
        getSentInd for an unknown SID never selects the last sentence.
        """
        if 0 <= ind < self.s:
            return self.base[ind]
        return "none"

    def getSentInd(self, SID):
        """Return the index (int) of the sentence with this SID (string), or -1 if absent."""
        for i, b in enumerate(self.base):
            if b[0] == SID:
                return i
        return -1

    def getSentID(self, ind):
        """Return the SID (string) of the sentence at index ind (int), or -1 if absent."""
        if 0 <= ind < self.s:
            return self.base[ind][0]
        return -1

    def isSIDin(self, SID):
        """Return True if the SID (string) is in the base."""
        return any(b[0] == SID for b in self.base)

    def isINDin(self, ind):
        """Return True if the index (int) is in the base."""
        return 0 <= ind < self.s

    def isSentTagged(self, ind):
        """Return True if the sentence at index ind (int) has a non empty tag (b[5])."""
        return self.base[ind][5] != ""

    def numberSentSize(self, size):
        """Return how many sentences in the base have this size (number of tokens)."""
        return sum(1 for b in self.base if b[2] == size)

    def sentSizeRange(self):
        """Return the smallest and largest sentence size within the base ((0, 0) if the base is empty)."""
        if not self.base:
            return 0, 0
        sizes = [b[2] for b in self.base]
        return min(sizes), max(sizes)

    def getAllSIDs(self):
        """Return the sorted list with all sentence IDs."""
        return sorted(b[0] for b in self.base)

    def tagTokenAtSID(self, s, t, tag):
        """Set tag (string) on token id t (string) of the sentence with SID s (string).

        Does nothing if the SID is not in the base.
        """
        ind = self.getSentInd(s)
        if ind == -1:
            return
        self.tagTokenAtSent(ind, t, tag)

    def tagTokenAtSent(self, s, t, tag):
        """Set tag (string) on token id t (string) of the sentence at index s (int)."""
        for tk in self.base[s][4]:
            if t == tk[0]:
                tk[10] = tag

    def tagSent(self, s, tag):
        """Set tag (string) on the sentence at index s (int)."""
        self.base[s][5] = tag

    def setSentTags(self):
        """Set each sentence tag from its token tags.

        A sentence ends up with the smallest (string comparison) value among
        its current tag and the non empty tags of its tokens.
        """
        for b in self.base:
            for tk in b[4]:
                if tk[10] != "" and (b[5] == "" or tk[10] < b[5]):
                    b[5] = tk[10]

    def sortBase(self):
        """Sort the base according to SID."""
        self.base.sort()

    def printSent(self, ind, outfile, nodeprel=False):
        """Print the sentence at index ind (int) to outfile with all 10 fields.

        When nodeprel is True the HEAD, DEPREL and DEPS columns are printed
        as "_". A blank line follows the sentence.
        """
        sent = self.base[ind]
        for line in sent[3]:
            print(line, file=outfile)
        for tk in sent[4]:
            columns = tk[:9] + [tk[9]]
            if nodeprel:
                columns[6:9] = ["_", "_", "_"]
            print(*columns, sep="\t", file=outfile)
        print(file=outfile)

    def printHeaderToo(self, outfile, nodeprel=False):
        """Print the file header followed by the whole base to outfile."""
        print(self.header, end="", file=outfile)
        for ind in range(self.s):
            self.printSent(ind, outfile, nodeprel)

    def printNoHeader(self, outfile, nodeprel=False):
        """Print the whole base to outfile, without the file header."""
        for ind in range(self.s):
            self.printSent(ind, outfile, nodeprel)


def usageExample(name):
    """Usage example: print the share of tokens tagged as PUNCT in a .conllu file.

    name - path of the CoNLL-U file to open
    """
    # Open a .conllu file with "name"
    base = conlluFile(name)
    # Get the number of sentences and tokens
    s, t = base.getSandT()
    # to count all tokens tagged with PUNCT PoS tag
    total_PUNCT = 0
    # get all sentences, one after another
    for i in range(s):
        b = base.getSentByIndex(i)
        # get all token, one after another (tk[3] is the PoS tag column)
        for tk in b[4]:
            if tk[3] == "PUNCT":
                total_PUNCT += 1
    # say the percentage of PUNCT in the base (0.0 for a base without tokens,
    # which would otherwise divide by zero)
    percent = round(total_PUNCT * 100 / t, 2) if t else 0.0
    print("Tokens tagged as PUNCT are {} out of {} ({}%)".format(total_PUNCT, t, percent))