File size: 15,175 Bytes
ec63fa6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
# conlluFile.py - a Python 3 module to handle CoNLL-U files (.conllu) as an in-memory data structure (base)
#
# (c) Lucelene Lopes 2023
#
# member functions:
#    conlluFile - the constructor from an input conllu file (name) - default no name creates an empty base
#                                                                  - default considering contracted word (skipAg=True if not)
# Accessors
#    getBase(self):              # return the whole base
#    getHeader(self):            # return in a single string the initial lines of the conllu
#    getS(self):                 # return the number of sentences
#    getT(self):                 # return the number of tokens (ignoring contracted words)
#    getSandT(self):             # return the number of sentences and tokens
#    getSentByID(self, SID):     # return a sentence by its SID (string) - return the string "none" if absent
#    getSentByIndex(self, ind):  # return a sentence by its index (int) - return the string "none" if absent
#    getSentInd(self, SID):      # return the index (int) of the sentence with SID (string) - return -1 if absent
#    getSentID(self, ind):       # return the SID (string) for the sentence indexed by ind - return -1 if absent
#    isSIDin(self, SID):         # return True if the SID is in the base
#    isINDin(self, ind):         # return True if the index (int) is in the base
#    isSentTagged(self, ind):    # return True if the sentence indexed by ind (int) has a non empty tag (b[5])
#    numberSentSize(self, size): # return how many sentences in the base have this size
#    sentSizeRange(self):        # return the smallest and largest sentence size within the base
#    getAllSIDs(self):           # return the list with all sentence IDs
# Mutators
#    addToBase(self, name):          # add a conllu file (name) to the base considering contracted word or not (skipAg)
#    removeSentInd(self, s):         # remove the sentence with index s (int) from base
#    removeSentSID(self, s):         # remove the sentence with SID s (string) from base
#    tagTokenAtSID(self,s,t,tag):    # set tag (string) on the token with id t (string) of the sentence with SID s (string)
#    tagTokenAtSent(self,s,t,tag):   # set tag (string) on the token with id t (string) of the sentence at index s (int)
#    tagSent(self,s,tag):            # set tag (string) on the sentence at index s (int)
#    setSentTags(self):              # set the sentence tags (additional info) based on the tokens tags (additional info)
#    sortBase(self):                 # sort the base according to SID
# Prints
#    printSent(self, ind, outfile, nodeprel=False): # prints out a sentence by its index (int) in an outfile with all 10 fields
#    printHeaderToo(self, outfile, nodeprel=False): # prints out the file header followed by the whole base in an outfile
#    printNoHeader(self, outfile, nodeprel=False):  # prints out the whole base in an outfile, without the file header
#
# Sentence structure
    ## b[0] SID - sentence ID
    ## b[1] TEXT - text of the sentence
    ## b[2] number of tokens (not including the contracted word lines)
    ## b[3] lines of the header (including, but not limited to, the '# sent_id =' and '# text' lines)
    ## b[4] token lines (including contracted word lines)
    ##   each token line has 10 elements of the CoNLL-U format, plus one place holder for information
    ## b[5] status of change (a place holder for information)


class conlluFile:
    """In-memory base of sentences read from CoNLL-U (.conllu) files.

    Instance variables:
        self.base   - list of sentences; each sentence is a list b with
                        b[0] SID (text after '# sent_id = ')
                        b[1] TEXT (text after '# text = ', "" if absent)
                        b[2] number of tokens (lines whose ID has no '-')
                        b[3] comment lines of the sentence, '# sent_id' first
                        b[4] token lines, each a list of the tab-separated
                             fields plus one trailing "" tag place holder
                             (index 10 for a regular 10-column line)
                        b[5] sentence tag place holder (initially "")
        self.header - lines found before the first '# sent_id = ' line
        self.s      - total number of sentences in the base
        self.t      - total number of tokens in the base
    """

    def __init__(self, name="", skipAg=False):
        """Create a base from the conllu file `name` (empty base if name is "").

        When skipAg is True the contracted word lines (ID containing '-') are
        not stored. Sentences whose SID is already in the base are skipped and
        reported with a "Duplicated SID:" message. The base ends up sorted by SID.
        """
        self.base = []
        self.header, self.s, self.t = "", 0, 0
        if name != "":
            for SID in self._loadFile(name, skipAg):
                print("Duplicated SID:", SID)
        self.sortBase()

    def _loadFile(self, name, skipAg):
        """Parse file `name` into the base; return the list of skipped (duplicated) SIDs.

        self.header is replaced by the lines preceding the first sentence of
        this file; self.s and self.t are increased for every stored sentence.
        """
        headerLines = []
        duplicated = []
        known = {b[0] for b in self.base}   # O(1) duplicate detection
        inHeader = True                     # True until the first '# sent_id = ' line
        sent = None                         # sentence being read (None between sentences)
        # CoNLL-U files are UTF-8 encoded, so do not depend on the locale default.
        with open(name, "r", encoding="utf-8") as infile:
            for raw in infile:
                # rstrip (not [:-1]) so a last line without newline is not truncated
                line = raw.rstrip("\n")
                if sent is None:
                    if line.startswith("# sent_id = "):
                        inHeader = False
                        sent = [line[12:], "", 0, [line], [], ""]
                    elif inHeader:
                        headerLines.append(raw)
                    # lines between sentences that are not '# sent_id' are ignored
                elif line.strip() == "":
                    # blank line: end of the current sentence
                    self._addSentence(sent, known, duplicated)
                    sent = None
                elif line[0] == "#":
                    sent[3].append(line)
                    if line.startswith("# text = "):
                        sent[1] = line[9:]
                else:
                    # fields are tab separated (forms and lemmas may hold spaces)
                    fields = line.split("\t")
                    if len(fields) > 3 and fields[3].startswith("["):
                        fields[3] = fields[3][1:-1]   # drop the enclosing brackets
                    fields.append("")                 # token tag place holder
                    contracted = "-" in fields[0]
                    if not contracted:
                        sent[2] += 1
                    if not (skipAg and contracted):
                        sent[4].append(fields)
        if sent is not None:
            # file ended without a blank line after the last sentence
            self._addSentence(sent, known, duplicated)
        self.header = "".join(headerLines)
        return duplicated

    def _addSentence(self, sent, known, duplicated):
        """Store `sent` in the base unless its SID is in `known`; record duplicates."""
        if sent[0] in known:
            duplicated.append(sent[0])
        else:
            known.add(sent[0])
            self.base.append(sent)
            self.s += 1
            self.t += sent[2]

    def addToBase(self, name, skipAg=False):
        """Add the sentences of the conllu file `name` to the base.

        skipAg works as in the constructor. Sentences whose SID is already in
        the base are skipped and their count is printed. The stored header is
        replaced by the header of the added file and the base is not re-sorted.
        """
        duplicated = self._loadFile(name, skipAg)
        print("Already existent:", len(duplicated))

    def removeSentInd(self, s):
        """Remove the sentence at index s (int) from the base.

        Raises IndexError (leaving the counters untouched) if s is out of range.
        """
        removed = self.base.pop(s)   # pop first so a bad index changes nothing
        self.s -= 1
        self.t -= removed[2]

    def removeSentSID(self, s):
        """Remove the sentence with SID s (string); print a message if absent."""
        ind = self.getSentInd(s)
        if ind == -1:
            print("Trying to remove an absent SID")
        else:
            self.removeSentInd(ind)

    def getBase(self):
        """Return the whole base (the list of sentences)."""
        return self.base

    def getHeader(self):
        """Return in a single string the initial lines of the conllu."""
        return self.header

    def getS(self):
        """Return the number of sentences."""
        return self.s

    def getT(self):
        """Return the number of tokens (ignoring contracted words)."""
        return self.t

    def getSandT(self):
        """Return the number of sentences and tokens (ignoring contracted words)."""
        return self.s, self.t

    def getSentByID(self, SID):
        """Return the sentence with this SID (string), or the string "none" if absent."""
        for b in self.base:
            if b[0] == SID:
                return b
        return "none"

    def getSentByIndex(self, ind):
        """Return the sentence at index ind (int), or the string "none" if absent."""
        if self.isINDin(ind):
            return self.base[ind]
        return "none"

    def getSentInd(self, SID):
        """Return the index (int) of the sentence with this SID, or -1 if absent."""
        for i, b in enumerate(self.base):
            if b[0] == SID:
                return i
        return -1

    def getSentID(self, ind):
        """Return the SID (string) of the sentence at index ind, or -1 if absent."""
        if self.isINDin(ind):
            return self.base[ind][0]
        return -1

    def isSIDin(self, SID):
        """Return True if the SID (string) is in the base."""
        return any(b[0] == SID for b in self.base)

    def isINDin(self, ind):
        """Return True if the index (int) is in the base (negative indexes are not)."""
        return 0 <= ind < self.s

    def isSentTagged(self, ind):
        """Return True if the sentence at index ind (int) has a non empty tag (b[5])."""
        return self.base[ind][5] != ""

    def numberSentSize(self, size):
        """Return how many sentences in the base have this size (number of tokens)."""
        return sum(1 for b in self.base if b[2] == size)

    def sentSizeRange(self):
        """Return the smallest and largest sentence size in the base ((0, 0) if empty)."""
        if not self.base:
            return 0, 0
        sizes = [b[2] for b in self.base]
        return min(sizes), max(sizes)

    def getAllSIDs(self):
        """Return the sorted list with all sentence IDs."""
        return sorted(b[0] for b in self.base)

    def tagTokenAtSID(self, s, t, tag):
        """Set tag (string) on the tokens with id t (string) of the sentence with SID s.

        Nothing is changed when the SID is absent.
        """
        ind = self.getSentInd(s)
        if ind != -1:   # -1 would otherwise address the last sentence
            self.tagTokenAtSent(ind, t, tag)

    def tagTokenAtSent(self, s, t, tag):
        """Set tag (string) on the tokens with id t (string) of the sentence at index s (int)."""
        for tk in self.base[s][4]:
            if t == tk[0]:
                tk[10] = tag

    def tagSent(self, s, tag):
        """Set tag (string) on the sentence at index s (int)."""
        self.base[s][5] = tag

    def setSentTags(self):
        """Set each sentence tag to the smallest non empty tag among itself and its tokens."""
        for b in self.base:
            for tk in b[4]:
                if tk[10] != "" and (b[5] == "" or tk[10] < b[5]):
                    b[5] = tk[10]

    def sortBase(self):
        """Sort the base according to SID."""
        self.base.sort(key=lambda b: b[0])

    def printSent(self, ind, outfile, nodeprel=False):
        """Print the sentence at index ind (int) to outfile with all 10 fields.

        With nodeprel=True the fields 6, 7 and 8 are printed as "_". The
        sentence is followed by an empty line.
        """
        sent = self.base[ind]
        for line in sent[3]:
            print(line, file=outfile)
        for tk in sent[4]:
            if nodeprel:
                cols = tk[:6] + ["_", "_", "_", tk[9]]
            else:
                cols = tk[:10]   # the trailing tag place holder is not printed
            print(*cols, sep="\t", file=outfile)
        print(file=outfile)

    def printHeaderToo(self, outfile, nodeprel=False):
        """Print the header followed by the whole base to outfile."""
        print(self.header, end="", file=outfile)
        self.printNoHeader(outfile, nodeprel)

    def printNoHeader(self, outfile, nodeprel=False):
        """Print the whole base to outfile without the header."""
        for ind in range(self.s):
            self.printSent(ind, outfile, nodeprel)


def usageExample(name):
    """Print how many tokens of the conllu file `name` are tagged as PUNCT.

    Shows the typical use of conlluFile: load a file, walk through every
    sentence and every token line, and report a count with its percentage
    over the total number of tokens.
    """
    # Open a .conllu file with "name"
    base = conlluFile(name)
    # Get the number of sentences and tokens
    s, t = base.getSandT()
    # to count all tokens tagged with PUNCT PoS tag
    total_PUNCT = 0
    # get all sentences, one after another
    for i in range(s):
        b = base.getSentByIndex(i)
        # get all tokens, one after another (tk[3] is the fourth field of the line)
        for tk in b[4]:
            if (tk[3] == "PUNCT"):
                total_PUNCT += 1
    # percentage of PUNCT over all tokens; an empty base reports 0.0 instead of dividing by zero
    percent = round(total_PUNCT*100/t, 2) if t else 0.0
    # say the percentage of PUNCT in the base
    print("Tokens tagged as PUNCT are {} out of {} ({}%)".format(total_PUNCT, t, percent))