# conlluFile.py - a Python 3 package to handle CoNLL-U files (.conllu) as a data structure (base)
#
# (c) Lucelene Lopes 2023
#
# member functions:
# conlluFile - the constructor from an input conllu file (name) - the default (no name) creates an empty base
#              - contracted word lines are kept by default (skipAg=True drops them)
# Accessors
# getBase(self): # return the whole base
# getHeader(self): # return in a single string the initial lines of the conllu
# getS(self): # return the number of sentences
# getT(self): # return the number of tokens (ignoring contracted words)
# getSandT(self): # return the number of sentences and tokens
# getSentByID(self, SID): # return a sentence by its SID (string) - return "none" if absent
# getSentByIndex(self, ind): # return a sentence by its index (int) - return "none" if absent
# getSentInd(self, SID): # return the index (int) of the sentence with SID (string) - return -1 if absent
# getSentID(self, ind): # return the SID (string) for the sentence indexed by ind - return -1 if absent
# isSIDin(self, SID): # return True if the SID is in the base
# isINDin(self, ind): # return True if the index (int) is in the base
# isSentTagged(self, ind): # return True if the sentence indexed by ind (int) has a non-empty tag (b[5])
# numberSentSize(self, size): # return how many sentences in the base have this size
# sentSizeRange(self): # return the smallest and largest sentence size within the base
# getAllSIDs(self): # return the list with all sentence IDs
# Mutators
# addToBase(self, name, skipAg=False): # add a conllu file (name) to the base, dropping contracted word lines if skipAg=True
# removeSentInd(self, s): # remove the sentence indexed by s (int) from the base
# removeSentSID(self, s): # remove the sentence with SID s (string) from base
# tagTokenAtSID(self,s,t,tag): # set tag (string) on token t (token id, string) of the sentence with SID s (string)
# tagTokenAtSent(self,s,t,tag): # set tag (string) on token t (token id, string) of the sentence indexed by s (int)
# tagSent(self,s,tag): # set tag (string) on the sentence indexed by s (int)
# setSentTags(self): # set the sentence tags (additional info) based on the tokens tags (additional info)
# sortBase(self): # sort the base according to SID
# Prints
# printSent(self, ind, outfile, nodeprel=False): # prints out a sentence by its index (int) in an outfile with all 10 fields
# printHeaderToo(self, outfile, nodeprel=False): # prints out the whole base in an outfile, including the header
# printNoHeader(self, outfile, nodeprel=False): # prints out the whole base in an outfile, without the header
#
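# Quick use (a minimal sketch; "corpus.conllu" and "copy.conllu" are hypothetical file names):
#   base = conlluFile("corpus.conllu")
#   print(base.getS(), "sentences,", base.getT(), "tokens")
#   base.printHeaderToo(open("copy.conllu", "w", encoding="utf-8"))
#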
# Sentence structure
## b[0] SID - sentence ID
## b[1] TEXT - text of the sentence
## b[2] number of tokens (not including the contracted word lines)
## b[3] lines of the header (including, but not limited to, the '# sent_id =' and '# text' lines)
## b[4] token lines (including contracted word lines)
##      each token line has the 10 fields of the CoNLL-U format, plus one placeholder for information (the token tag)
## b[5] status of change (a place holder for information)
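##
## Illustration - a hypothetical entry for a two-token sentence (values made up for this example,
## not taken from any real corpus):
## ["s1", "Sim.", 2,
##  ["# sent_id = s1", "# text = Sim."],
##  [["1", "Sim", "sim", "ADV", "_", "_", "0", "root", "_", "SpaceAfter=No", ""],
##   ["2", ".", ".", "PUNCT", "_", "_", "1", "punct", "_", "_", ""]],
##  ""]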
class conlluFile:
    def __init__(self, name="", skipAg=False): # create a base from an input conllu file (name), dropping contracted word lines if skipAg=True
# Instance variables:
# self.base - the whole base
# self.header - the first lines before the actual sentences
# self.s - the total number of sentences
# self.t - the total number of tokens
self.base = []
if (name == ""):
self.header, self.s, self.t = "", 0, 0
else:
            infile = open(name, "r", encoding="utf-8") # CoNLL-U files are UTF-8 encoded
self.s, self.t = 0, 0
SID = "HEADER"
self.header = ""
for line in infile:
if ((SID == "HEADER") and (line[:12] != "# sent_id = ")):
self.header += line
elif (((SID == "") or SID == "HEADER") and (line[:12] == "# sent_id = ")):
SID = line[12:-1]
dumpHead = []
dumpHead.append(line[:-1])
logiS = []
TEXT = ""
tk = 0
elif ((SID != "") and (line[:-1] != "")):
if (line[0] == "#"):
dumpHead.append(line[:-1])
if (line[:9] == "# text = "):
TEXT = line[9:-1]
else:
                        buf = line[:-1].split("\t") # CoNLL-U fields are tab separated
                        if (buf[3][0] == "["):      # strip square brackets occasionally found around the UPOS tag
                            buf[3] = buf[3][1:-1]
                        buf.append("")              # placeholder for token change status (information placeholder)
if (skipAg):
if ("-" not in buf[0]):
tk += 1
logiS.append(buf)
else:
logiS.append(buf)
if ("-" not in buf[0]):
tk += 1
elif ((SID != "") and (line[:-1] == "")):
if not (self.isSIDin(SID)):
self.base.append([SID,TEXT,tk,dumpHead,logiS,""])
## b[0] SID
## b[1] TEXT
## b[2] number of tokens (not including contracted words)
## b[3] lines of the header
## b[4] tokens (including contracted words)
## b[5] status of change (initially empty)
self.s += 1
self.t += tk
else:
print("Duplicated SID:", SID)
SID = ""
if (SID != ""):
if not (self.isSIDin(SID)):
                    self.base.append([SID,TEXT,tk,dumpHead,logiS,""]) # include the empty change status (b[5]) as in the other appends
self.s += 1
self.t += tk
else:
print("Duplicated SID:", SID)
infile.close()
self.base.sort()
    def addToBase(self, name, skipAg=False): # add a conllu file (name) to the base, dropping contracted word lines if skipAg=True
newAcc = 0
        infile = open(name, "r", encoding="utf-8") # CoNLL-U files are UTF-8 encoded
SID = "HEADER"
        self.header = "" # note: the base header is replaced by the added file's header lines
for line in infile:
if ((SID == "HEADER") and (line[:12] != "# sent_id = ")):
self.header += line
elif (((SID == "") or SID == "HEADER") and (line[:12] == "# sent_id = ")):
SID = line[12:-1]
dumpHead = []
dumpHead.append(line[:-1])
logiS = []
TEXT = ""
tk = 0
elif ((SID != "") and (line[:-1] != "")):
if (line[0] == "#"):
dumpHead.append(line[:-1])
if (line[:9] == "# text = "):
TEXT = line[9:-1]
else:
buf = line[:-1].split("\t")
if (buf[3][0] == "["):
buf[3] = buf[3][1:-1]
                    buf.append("") # placeholder for token change status
if (skipAg):
if ("-" not in buf[0]):
tk += 1
logiS.append(buf)
else:
logiS.append(buf)
if ("-" not in buf[0]):
tk += 1
elif ((SID != "") and (line[:-1] == "")):
if not (self.isSIDin(SID)):
self.base.append([SID,TEXT,tk,dumpHead,logiS,""])
## b[0] SID
## b[1] TEXT
## b[2] number of tokens (not including contracted words)
## b[3] lines of the header
## b[4] tokens (including contracted words)
## b[5] status of change (initially empty)
self.s += 1
self.t += tk
else:
newAcc += 1
SID = ""
if (SID != ""):
if not (self.isSIDin(SID)):
self.base.append([SID,TEXT,tk,dumpHead,logiS,""])
self.s += 1
self.t += tk
else:
newAcc += 1
infile.close()
print("Already existent:", newAcc)
    def removeSentInd(self, s): # remove the sentence indexed by s (int) from the base
        self.s -= 1
        self.t -= self.base[s][2]
        del self.base[s] # delete by index (list.remove searches by value and would fail here)
    def removeSentSID(self, s): # remove the sentence with SID s (string) from the base
        ind = self.getSentInd(s)
        if (ind != -1):
            self.s -= 1
            self.t -= self.base[ind][2]
            del self.base[ind]
        else:
            input("Trying to remove an absent SID")
def getBase(self): # return the whole base
return self.base
def getHeader(self): # return in a single string the initial lines of the conllu
return self.header
def getS(self): # return the number of sentences
return self.s
def getT(self): # return the number of tokens (ignoring contracted words)
return self.t
def getSandT(self): # return the number of sentences and tokens (ignoring contracted words)
return self.s, self.t
    def getSentByID(self, SID): # return a sentence by its SID (string) - return "none" if absent
for b in self.base:
if (b[0] == SID):
return b
return "none"
    def getSentByIndex(self, ind): # return a sentence by its index (int) - return "none" if absent
        if (0 <= ind < self.s):
            return self.base[ind]
        else:
            return "none"
def getSentInd(self, SID): # return the index (int) of the sentence with SID (string) - return -1 if absent
for i in range(len(self.base)):
if (self.base[i][0] == SID):
return i
return -1
    def getSentID(self, ind): # return the SID (string) for the sentence indexed by ind - return -1 if absent
        if (0 <= ind < self.s):
            return self.base[ind][0]
        else:
            return -1
def isSIDin(self, SID): # return True if the SID (string) is in the base
for b in self.base:
if (b[0] == SID):
return True
return False
    def isINDin(self, ind): # return True if the index (int) is in the base
        return (0 <= ind < self.s)
def isSentTagged(self, ind): # return True if the sentence indexed by ind (int) has a non empty tag (b[5])
return (self.base[ind][5] != "")
def numberSentSize(self, size): # return how many sentences have this size
ans = 0
for b in self.base:
if (b[2] == size):
ans += 1
return ans
def sentSizeRange(self): # return the smallest and largest sentence size within the base
smallest, largest = self.base[0][2], self.base[0][2]
for b in self.base:
if (b[2] < smallest):
smallest = b[2]
if (b[2] > largest):
largest = b[2]
return smallest, largest
def getAllSIDs(self): # return the list with all sentence IDs
ans = []
for b in self.base:
ans.append(b[0])
ans.sort()
return ans
    def tagTokenAtSID(self, s, t, tag): # set tag (string) on token t (token id, string) of the sentence with SID s (string)
        ind = self.getSentInd(s)
        if (ind == -1): # absent SID - nothing to tag
            return
        for tk in self.base[ind][4]:
            if (t == tk[0]):
                tk[10] = tag
    def tagTokenAtSent(self, s, t, tag): # set tag (string) on token t (token id, string) of the sentence indexed by s (int)
for tk in self.base[s][4]:
if (t == tk[0]):
tk[10] = tag
    def tagSent(self, s, tag): # set tag (string) on the sentence indexed by s (int)
self.base[s][5] = tag
    def setSentTags(self): # set the sentence tags (additional info) based on the tokens tags (additional info)
        for b in self.base:
            for tk in b[4]:
                if (tk[10] != ""):
                    # keep the lexicographically smallest non-empty token tag as the sentence tag
                    if (b[5] == "") or (tk[10] < b[5]):
                        b[5] = tk[10]
    def sortBase(self): # sort the base according to SID
self.base.sort()
    def printSent(self, ind, outfile, nodeprel=False): # prints out a sentence by its index (int) in an outfile with all 10 fields
for line in self.base[ind][3]:
print(line, file=outfile)
for tk in self.base[ind][4]:
if nodeprel:
print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], "_", "_", "_", tk[9], sep="\t", file=outfile)
else:
print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], tk[6], tk[7], tk[8], tk[9], sep="\t", file=outfile)
print(file=outfile)
def printHeaderToo(self, outfile, nodeprel=False): # prints out the whole base in an outfile with header
print(self.header, end="", file=outfile)
for ind in range(self.s):
for line in self.base[ind][3]:
print(line, file=outfile)
for tk in self.base[ind][4]:
if nodeprel:
print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], "_", "_", "_", tk[9], sep="\t", file=outfile)
else:
print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], tk[6], tk[7], tk[8], tk[9], sep="\t", file=outfile)
print(file=outfile)
def printNoHeader(self, outfile, nodeprel=False): # prints out the whole base in an outfile without header
for ind in range(self.s):
for line in self.base[ind][3]:
print(line, file=outfile)
for tk in self.base[ind][4]:
if nodeprel:
print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], "_", "_", "_", tk[9], sep="\t", file=outfile)
else:
print(tk[0], tk[1], tk[2], tk[3], tk[4], tk[5], tk[6], tk[7], tk[8], tk[9], sep="\t", file=outfile)
print(file=outfile)
def usageExample(name):
    # Open a .conllu file named "name"
    base = conlluFile(name)
    # Get the number of sentences and tokens
    s, t = base.getSandT()
    # Count all tokens tagged with the PUNCT PoS tag
    total_PUNCT = 0
    # Get all sentences, one after another
    for i in range(s):
        b = base.getSentByIndex(i)
        # Get all tokens, one after another
        for tk in b[4]:
            if (tk[3] == "PUNCT"):
                total_PUNCT += 1
    # Report the percentage of PUNCT tokens in the base
    print("Tokens tagged as PUNCT are {} out of {} ({}%)".format(total_PUNCT, t, round(total_PUNCT*100/t, 2)))