aaljabari commited on
Commit
fa45496
·
verified ·
1 Parent(s): 2126b32

Create IBO_to_XML.py

Browse files
Files changed (1) hide show
  1. IBO_to_XML.py +135 -0
IBO_to_XML.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # By Wasim Khatib
2
+ # Version 2.0
3
+ # This function takes a list of annotated entities in this format: [["صرح","O"],
4
+ # ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
5
+ # ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
6
+ # ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
7
+ # It then returns the XML text in this format: صرح <OCC> رئيس <ORG> نقابة العاملين </ORG> </OCC> يوم في <ORG>
8
+ # جامعة <LOC> بيرزيت </LOC> </ORG> ان غدا هو <DATE> يوم الخميس </DATE>
9
+ # This function assumes the input is correct: each entity tag must start with B- or I- and must not be empty.
10
+ # I- tags that do not have a preceding B- tag are ignored.
11
+ import numpy as np
12
+
13
+
14
def IBO_to_XML(temp):
    """Render IOB-tagged tokens as text with inline XML-style entity tags.

    Args:
        temp: Sequence of ``[word, tags]`` pairs, where ``tags`` is a
            whitespace-separated string of ``O``, ``B-TYPE`` and ``I-TYPE``
            labels. The pairs are normalised with ``sortTags`` first; the
            position of a label inside the tag string is then used as the
            nesting level of the entity it belongs to.

    Returns:
        A single space-separated string in which each entity is wrapped in
        ``<TYPE> ... </TYPE>``. Leading and trailing whitespace is stripped.

    Labels that do not consist of exactly two ``-``-separated parts are
    ignored, and so are ``I-`` labels on the first word. An ``I-`` label with
    no matching open entity closes the open levels from its position onwards
    and leaves the word outside that entity.
    """
    pieces = []
    # open_types[k] is the entity type currently open at nesting level k,
    # or "" when nothing is open at that level.
    open_types = [""]

    for word_position, entity in enumerate(sortTags(temp)):
        for level, tag in enumerate(str(entity[1]).split()):
            if level >= len(open_types):
                open_types.append("")

            if tag == "O":
                # "O" ends every open entity; on the first word there is
                # nothing to close, so it is skipped as in the original logic.
                if word_position != 0:
                    _close_levels(open_types, pieces, 0, len(open_types))
                continue

            parts = tag.split("-")
            if len(parts) != 2:
                continue
            prefix, entity_type = parts

            if prefix == "B":
                # A new entity at this level ends whatever was open here AND
                # anything nested deeper; closing deeper levels first keeps
                # the emitted markup properly nested.
                _close_levels(open_types, pieces, level, len(open_types))
                open_types[level] = entity_type
                pieces.append("<" + entity_type + ">")
            elif prefix == "I" and word_position != 0:
                # Find the open level (at or below this position) that the
                # continuation belongs to; levels skipped on the way are
                # closed. With no match, everything from here down is closed.
                match = len(open_types)
                for candidate in range(level, len(open_types)):
                    if open_types[candidate] == entity_type:
                        match = candidate
                        break
                _close_levels(open_types, pieces, level, match)

        pieces.append(str(entity[0]))

    # Close whatever is still open at the end of the input, innermost first.
    _close_levels(open_types, pieces, 0, len(open_types))
    return " ".join(pieces).strip()


def _close_levels(open_types, pieces, first, last):
    """Emit closing tags for open levels in ``[first, last)``, innermost first."""
    for level in range(last - 1, first - 1, -1):
        if open_types[level] != "":
            pieces.append("</" + str(open_types[level]) + ">")
            open_types[level] = ""
76
+
77
+
78
def sortTags(entities):
    """Normalise the tag string of every token so nesting levels line up.

    For each ``[word, tags]`` pair (``tags`` is a whitespace-separated string
    of ``O`` / ``B-TYPE`` / ``I-TYPE`` labels) the tags are:

    1. padded with extra ``I-TYPE`` labels when the previous token carries
       more labels of that type than this one (only for types this token
       already continues with an ``I-`` label);
    2. sorted in reverse lexicographic order, which puts ``O`` first, then
       ``I-`` labels, then ``B-`` labels;
    3. reordered so that ``I-`` labels follow the order of the matching
       labels on the previous token, unless either token carries ``O``.

    Args:
        entities: Sequence of ``[word, tags]`` pairs (lists or tuples).

    Returns:
        A new list of lists with the same words and the normalised tag
        strings. The input sequence and its pairs are left unmodified.
    """
    normalised = []
    previous_tags = None  # normalised tags of the previous token
    for entity in entities:
        tags = entity[1].split()
        if previous_tags is not None:
            _pad_missing_continuations(tags, previous_tags)
        tags.sort(reverse=True)
        if previous_tags is not None:
            tags = _align_with_previous(tags, previous_tags)
        # Copy the pair so the caller's data is never modified and tuple
        # pairs are accepted as well.
        updated = list(entity)
        updated[1] = " ".join(tags)
        normalised.append(updated)
        previous_tags = tags
    return normalised


def _count_type(tags, entity_type):
    """Count the B-/I- labels in ``tags`` whose type is exactly ``entity_type``."""
    return sum(
        1 for tag in tags
        if tag[:2] in ("B-", "I-") and tag[2:] == entity_type
    )


def _pad_missing_continuations(tags, previous_tags):
    """Append ``I-TYPE`` labels so ``tags`` has as many of a type as ``previous_tags``."""
    handled = set()
    # Iterate over a snapshot: the list is extended while padding.
    for tag in list(tags):
        entity_type = tag[2:]
        if tag[:2] != "I-" or entity_type in handled:
            continue
        handled.add(entity_type)
        missing = _count_type(previous_tags, entity_type) - _count_type(tags, entity_type)
        if missing > 0:
            tags.extend(["I-" + entity_type] * missing)


def _align_with_previous(tags, previous_tags):
    """Order the ``I-`` labels of ``tags`` like their counterparts in ``previous_tags``."""
    aligned = []
    if "O" not in tags and "O" not in previous_tags:
        for previous_tag in previous_tags:
            for position, tag in enumerate(tags):
                # I- labels are sorted ahead of B- labels, so the first B-
                # label means no continuation is left to match.
                if tag[:2] == "B-":
                    break
                if tag[:2] == "I-" and tag[2:] == previous_tag[2:]:
                    aligned.append(tags.pop(position))
                    break
    # Unmatched labels (orphan I- labels and new B- labels) keep their order.
    return aligned + tags