# conlluFile.py - a Python 3 package to handle CoNLL-U files (.conllu) in a data structure (base)
#
# (c) Lucelene Lopes 2023
#
# member functions:
#   conlluFile - the constructor from an input conllu file (name) - default no name creates an empty base
#                - by default contracted word lines are considered (skipAg=True skips them)
# Accessors
#   getBase(self): # return the whole base
#   getHeader(self): # return in a single string the initial lines of the conllu
#   getS(self): # return the number of sentences
#   getT(self): # return the number of tokens (ignoring contracted words)
#   getSandT(self): # return the number of sentences and tokens
#   getSentByID(self, SID): # return a sentence by its SID (string) - return none if absent
#   getSentByIndex(self, ind): # return a sentence by its index (int) - return none if absent
#   getSentInd(self, SID): # return the index (int) of the sentence with SID (string) - return -1 if absent
#   getSentID(self, ind): # return the SID (string) for the sentence indexed by ind - return -1 if absent
#   isSIDin(self, SID): # return True if the SID is in the base
#   isINDin(self, ind): # return True if the index (int) is in the base
#   isSentTagged(self, ind): # return True if the sentence indexed by ind (int) has a non-empty tag (b[5])
#   numberSentSize(self, size): # return how many sentences in the base have this size
#   sentSizeRange(self): # return the smallest and largest sentence size within the base
#   getAllSIDs(self): # return the list with all sentence IDs
# Mutators
#   addToBase(self, name, skipAg=False): # add a conllu file (name) to the base, considering contracted words or not (skipAg)
#   removeSentInd(self, s): # remove the sentence indexed by s (int) from base
#   removeSentSID(self, s): # remove the sentence with SID s (string) from base
#   tagTokenAtSID(self, s, t, tag): # sets tag (string) at the sentence with SID s (string), token id t (string)
#   tagTokenAtSent(self, s, t, tag): # sets tag (string) at the sentence indexed by s (int), token id t (string)
#   tagSent(self, s, tag): # sets tag (string) at the sentence indexed by s (int)
#   setSentTags(self): # set the sentence tags (additional info) based on the tokens tags (additional info)
#   sortBase(self): # sort the base according to SID
# Prints
#   printSent(self, ind, outfile, nodeprel=False): # prints out a sentence by its index (int) in an outfile with all 10 fields
#   printHeaderToo(self, outfile, nodeprel=False): # prints out the whole base in an outfile with the header
#   printNoHeader(self, outfile, nodeprel=False): # prints out the whole base in an outfile without the header
#
# Sentence structure
## b[0] SID - sentence ID
## b[1] TEXT - text of the sentence
## b[2] number of tokens (not including the contracted word lines)
## b[3] lines of the header (including, but not limited to, the '# sent_id =' and '# text' lines)
## b[4] token lines (including contracted word lines)
##      each token line has 10 elements of the CoNLL-U format, plus one place holder for information
## b[5] status of change (a place holder for information)
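# Illustrative (assumed) sketch of one base entry b for a two-token sentence;
# the SID, text, and annotations below are hypothetical, not taken from an actual corpus:
#   ["example-1",                                        # b[0] SID
#    "Bom dia",                                          # b[1] TEXT
#    2,                                                  # b[2] number of tokens
#    ["# sent_id = example-1", "# text = Bom dia"],      # b[3] header lines
#    [["1", "Bom", "bom", "ADJ", "_", "_", "2", "amod", "_", "_", ""],
#     ["2", "dia", "dia", "NOUN", "_", "_", "0", "root", "_", "_", ""]],   # b[4] token lines (10 CoNLL-U fields + tag holder)
#    ""]                                                 # b[5] sentence tag (initially empty)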
class conlluFile:
    def __init__(self, name="", skipAg=False): # create a base from an input conllu file (name) considering contracted word or not (skipAg)
        # Instance variables:
        # self.base - the whole base
        # self.header - the first lines before the actual sentences
        # self.s - the total number of sentences
        # self.t - the total number of tokens
        self.base = []
        if (name == ""):
            self.header, self.s, self.t = "", 0, 0
        else:
            infile = open(name, "r")
            self.s, self.t = 0, 0
            SID = "HEADER"
            self.header = ""
            for line in infile:
                if ((SID == "HEADER") and (line[:12] != "# sent_id = ")):
                    self.header += line
                elif (((SID == "") or (SID == "HEADER")) and (line[:12] == "# sent_id = ")):
                    SID = line[12:-1]
                    dumpHead = []
                    dumpHead.append(line[:-1])
                    logiS = []
                    TEXT = ""
                    tk = 0
                elif ((SID != "") and (line[:-1] != "")):
                    if (line[0] == "#"):
                        dumpHead.append(line[:-1])
                        if (line[:9] == "# text = "):
                            TEXT = line[9:-1]
                    else:
                        buf = line[:-1].split("\t")
                        if (buf[3][0] == "["):
                            buf[3] = buf[3][1:-1]
                        buf.append("") # holder for token change status (information place holder)
                        if (skipAg):
                            if ("-" not in buf[0]):
                                tk += 1
                                logiS.append(buf)
                        else:
                            logiS.append(buf)
                            if ("-" not in buf[0]):
                                tk += 1
                elif ((SID != "") and (line[:-1] == "")):
                    if not (self.isSIDin(SID)):
                        self.base.append([SID, TEXT, tk, dumpHead, logiS, ""])
                        ## b[0] SID
                        ## b[1] TEXT
                        ## b[2] number of tokens (not including contracted words)
                        ## b[3] lines of the header
                        ## b[4] tokens (including contracted words)
                        ## b[5] status of change (initially empty)
                        self.s += 1
                        self.t += tk
                    else:
                        print("Duplicated SID:", SID)
                    SID = ""
            if (SID != ""):
                if not (self.isSIDin(SID)):
                    self.base.append([SID, TEXT, tk, dumpHead, logiS, ""])
                    self.s += 1
                    self.t += tk
                else:
                    print("Duplicated SID:", SID)
            infile.close()
            self.base.sort()
    def addToBase(self, name, skipAg=False): # add a conllu file (name) to the base considering contracted word or not (skipAg)
        newAcc = 0
        infile = open(name, "r")
        SID = "HEADER"
        self.header = ""
        for line in infile:
            if ((SID == "HEADER") and (line[:12] != "# sent_id = ")):
                self.header += line
            elif (((SID == "") or (SID == "HEADER")) and (line[:12] == "# sent_id = ")):
                SID = line[12:-1]
                dumpHead = []
                dumpHead.append(line[:-1])
                logiS = []
                TEXT = ""
                tk = 0
            elif ((SID != "") and (line[:-1] != "")):
                if (line[0] == "#"):
                    dumpHead.append(line[:-1])
                    if (line[:9] == "# text = "):
                        TEXT = line[9:-1]
                else:
                    buf = line[:-1].split("\t")
                    if (buf[3][0] == "["):
                        buf[3] = buf[3][1:-1]
                    buf.append("") # holder for token change status
                    if (skipAg):
                        if ("-" not in buf[0]):
                            tk += 1
                            logiS.append(buf)
                    else:
                        logiS.append(buf)
                        if ("-" not in buf[0]):
                            tk += 1
            elif ((SID != "") and (line[:-1] == "")):
                if not (self.isSIDin(SID)):
                    self.base.append([SID, TEXT, tk, dumpHead, logiS, ""])
                    ## b[0] SID
                    ## b[1] TEXT
                    ## b[2] number of tokens (not including contracted words)
                    ## b[3] lines of the header
                    ## b[4] tokens (including contracted words)
                    ## b[5] status of change (initially empty)
                    self.s += 1
                    self.t += tk
                else:
                    newAcc += 1
                SID = ""
        if (SID != ""):
            if not (self.isSIDin(SID)):
                self.base.append([SID, TEXT, tk, dumpHead, logiS, ""])
                self.s += 1
                self.t += tk
            else:
                newAcc += 1
        infile.close()
        print("Already existent:", newAcc)
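    # Illustrative (assumed) usage of addToBase: merging a second corpus into an existing base,
    # where "first.conllu" and "second.conllu" are hypothetical file names:
    #   base = conlluFile("first.conllu")
    #   base.addToBase("second.conllu")
    #   base.sortBase()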
    def removeSentInd(self, s): # remove the sentence indexed by s (int) from base
        self.s -= 1
        self.t -= self.base[s][2]
        del self.base[s]
    def removeSentSID(self, s): # remove the sentence with SID s (string) from base
        ind = self.getSentInd(s)
        if (ind != -1):
            self.s -= 1
            self.t -= self.base[ind][2]
            del self.base[ind]
        else:
            input("Trying to remove an absent SID")
    def getBase(self): # return the whole base
        return self.base
    def getHeader(self): # return in a single string the initial lines of the conllu
        return self.header
    def getS(self): # return the number of sentences
        return self.s
    def getT(self): # return the number of tokens (ignoring contracted words)
        return self.t
    def getSandT(self): # return the number of sentences and tokens (ignoring contracted words)
        return self.s, self.t
    def getSentByID(self, SID): # return a sentence by its SID (string) - return none if absent
        for b in self.base:
            if (b[0] == SID):
                return b
        return "none"
    def getSentByIndex(self, ind): # return a sentence by its index (int) - return none if absent
        if (ind < self.s):
            return self.base[ind]
        else:
            return "none"
    def getSentInd(self, SID): # return the index (int) of the sentence with SID (string) - return -1 if absent
        for i in range(len(self.base)):
            if (self.base[i][0] == SID):
                return i
        return -1
    def getSentID(self, ind): # return the SID (string) for the sentence indexed by ind - return -1 if absent
        if (ind < self.s):
            return self.base[ind][0]
        else:
            return -1
    def isSIDin(self, SID): # return True if the SID (string) is in the base
        for b in self.base:
            if (b[0] == SID):
                return True
        return False
    def isINDin(self, ind): # return True if the index (int) is in the base
        return (ind < self.s)
    def isSentTagged(self, ind): # return True if the sentence indexed by ind (int) has a non empty tag (b[5])
        return (self.base[ind][5] != "")
    def numberSentSize(self, size): # return how many sentences have this size
        ans = 0
        for b in self.base:
            if (b[2] == size):
                ans += 1
        return ans
    def sentSizeRange(self): # return the smallest and largest sentence size within the base
        smallest, largest = self.base[0][2], self.base[0][2]
        for b in self.base:
            if (b[2] < smallest):
                smallest = b[2]
            if (b[2] > largest):
                largest = b[2]
        return smallest, largest
    def getAllSIDs(self): # return the list with all sentence IDs
        ans = []
        for b in self.base:
            ans.append(b[0])
        ans.sort()
        return ans
    def tagTokenAtSID(self, s, t, tag): # sets tag (string) at the sentence with SID s (string), token id t (string)
        ind = self.getSentInd(s)
        for tk in self.base[ind][4]:
            if (t == tk[0]):
                tk[10] = tag
    def tagTokenAtSent(self, s, t, tag): # sets tag (string) at the sentence indexed by s (int), token id t (string)
        for tk in self.base[s][4]:
            if (t == tk[0]):
                tk[10] = tag
    def tagSent(self, s, tag): # sets tag (string) at the sentence indexed by s (int)
        self.base[s][5] = tag
    def setSentTags(self): # set the sentence tags (additional info) based on the tokens tags (additional info)
        for b in self.base:
            for tk in b[4]:
                if (tk[10] != ""):
                    if (b[5] == ""):
                        b[5] = tk[10]
                    else:
                        if (tk[10] < b[5]):
                            b[5] = tk[10]
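    # Illustrative (assumed) example of setSentTags: for a sentence that starts with b[5] == ""
    # and whose token tags (tk[10]) are "", "B", "A", "", setSentTags ends with b[5] == "A",
    # i.e. the lexicographically smallest non-empty token tag becomes the sentence tag.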
    def sortBase(self): # sort the base according to SID
        self.base.sort()
    def printSent(self, ind, outfile, nodeprel=False): # prints out a sentence by its index (int) in an outfile with all 10 fields
        for line in self.base[ind][3]:
            print(line, file=outfile)
        for tk in self.base[ind][4]:
            if nodeprel:
                print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], "_", "_", "_", tk[9], sep="\t", file=outfile)
            else:
                print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], tk[6], tk[7], tk[8], tk[9], sep="\t", file=outfile)
        print(file=outfile)
    def printHeaderToo(self, outfile, nodeprel=False): # prints out the whole base in an outfile with header
        print(self.header, end="", file=outfile)
        for ind in range(self.s):
            for line in self.base[ind][3]:
                print(line, file=outfile)
            for tk in self.base[ind][4]:
                if nodeprel:
                    print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], "_", "_", "_", tk[9], sep="\t", file=outfile)
                else:
                    print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], tk[6], tk[7], tk[8], tk[9], sep="\t", file=outfile)
            print(file=outfile)
    def printNoHeader(self, outfile, nodeprel=False): # prints out the whole base in an outfile without header
        for ind in range(self.s):
            for line in self.base[ind][3]:
                print(line, file=outfile)
            for tk in self.base[ind][4]:
                if nodeprel:
                    print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], "_", "_", "_", tk[9], sep="\t", file=outfile)
                else:
                    print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], tk[6], tk[7], tk[8], tk[9], sep="\t", file=outfile)
            print(file=outfile)
def usageExample(name):
    # Open a .conllu file with "name"
    base = conlluFile(name)
    # Get the number of sentences and tokens
    s, t = base.getSandT()
    # to count all tokens tagged with PUNCT PoS tag
    total_PUNCT = 0
    # get all sentences, one after another
    for i in range(s):
        b = base.getSentByIndex(i)
        # get all tokens, one after another
        for tk in b[4]:
            if (tk[3] == "PUNCT"):
                total_PUNCT += 1
    # report the percentage of PUNCT in the base
    print("Tokens tagged as PUNCT are {} out of {} ({}%)".format(total_PUNCT, t, round(total_PUNCT*100/t, 2)))
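
# Minimal command-line sketch (an assumption, not part of the original package):
# the .conllu file name is taken from the first command-line argument, e.g.
# "python3 conlluFile.py some.conllu" with "some.conllu" being a hypothetical file.
if __name__ == "__main__":
    import sys
    if (len(sys.argv) > 1):
        usageExample(sys.argv[1])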