#!/usr/bin/env python3
"""Update annotations to match raw (not tokenized) positions.

Usage: ./update_annotations.py path_to_data
"""

import itertools
import os
import re
import sys

# Characters ignored when comparing tokenized text with raw text: square
# brackets, ASCII and U+2010 hyphens, the numero sign and any whitespace.
# The same pattern is used both for matching and for restoring the span, so
# every character skipped during matching is also counted in the end offset.
_MISSED_RE = re.compile(r"[\[\]‐\s\-№]")


def _read_tok_annotations(ann_path):
    """Parse the tab-separated annotation file at *ann_path*.

    Each non-blank line is split on tabs; the second field is split on
    single spaces into ``type start end``.  Returns a list of dicts with
    the keys ``row``, ``text``, ``type``, ``start`` and ``end``.

    Raises:
        IndexError, ValueError: if a non-blank line lacks the expected
            fields or its offsets are not integers.
    """
    annotations = []
    with open(ann_path, "r", encoding="utf-8") as ann_f:
        for line in ann_f:
            if not line.strip():
                # Blank lines carry no annotation; skip instead of crashing.
                continue
            row = line.split("\t")
            kind, start, end = row[1].split(' ')[:3]
            annotations.append({
                "row": row,
                "text": row[2].strip("\n"),
                "type": kind,
                "start": int(start),
                "end": int(end),
            })
    return annotations


def _realign(ann, content, filename):
    """Move *ann*'s offsets from tokenized positions to positions in *content*.

    Starting at the tokenized start offset and stepping backwards one
    character at a time, look for the first position where the raw text,
    with ignored characters removed, begins with the annotation text
    (also with ignored characters removed).  On success ``start``, ``end``
    and ``text`` of *ann* are overwritten in place.  If position 0 is passed
    without a match, a message is printed and *ann* is left unchanged.
    """
    tokenized = ann["row"][2]
    needle = _MISSED_RE.sub("", tokenized)
    window = len(tokenized) + 3  # to be sure

    for back in itertools.count():
        new_start = ann["start"] - back
        if new_start < 0:
            print("Token {0} not found in file {1}!".format(tokenized.strip(), filename))
            return

        original_text = content[new_start:new_start + window]
        if not _MISSED_RE.sub("", original_text).startswith(needle):
            continue

        # Restore the ignored characters: every one that falls inside the
        # current span pushes the end one position further; the first one
        # beyond the span ends the scan.
        span = len(needle)
        for offset, char in enumerate(original_text):
            if _MISSED_RE.match(char):
                if offset < span:
                    span += 1
                else:
                    break

        new_end = new_start + span
        ann["start"] = new_start
        ann["end"] = new_end
        ann["text"] = content[new_start:new_end].strip()
        return


def _write_annotations(ann_path, annotations):
    """Write *annotations* to *ann_path*, one tab-separated line each."""
    with open(ann_path, "w", encoding="utf-8") as ann_f:
        for ann in annotations:
            ann_f.write("{0}\t{1} {2} {3}\t{4}\n".format(
                ann["row"][0], ann["type"], ann["start"], ann["end"], ann["text"]))


def update_annotations_in_file(path: str, filename: str) -> None:
    """Rewrite the annotations of one document against its raw text.

    Reads ``<filename>.tok.ann`` and ``<filename>.txt`` from directory
    *path* and writes ``<filename>.ann`` there.  Does nothing when the
    ``.tok.ann`` file does not exist.

    Annotations that cannot be located in the raw text are reported on
    stdout and written out with their tokenized offsets and text.

    Args:
        path: Directory containing the document files.
        filename: Document name without any extension.
    """
    # os.path.join keeps this working on every platform; a hard-coded
    # backslash only forms a valid path on Windows.
    ann_path = os.path.join(path, filename + ".tok.ann")
    if not os.path.isfile(ann_path):
        return

    # first read annotations from tokenized file
    annotations = _read_tok_annotations(ann_path)

    # assume that the tokenized file has a structure where the position of a
    # word is always >= its position in the original file, because of
    # additional spaces and linebreaks
    with open(os.path.join(path, filename + ".txt"), "r", encoding="utf-8") as txt_f:
        content = txt_f.read()

    for ann in annotations:
        _realign(ann, content, filename)

    # and save result
    _write_annotations(os.path.join(path, filename + ".ann"), annotations)


def main() -> None:
    """Process every eligible ``.txt`` file in the directory given in argv."""
    if len(sys.argv) < 2:
        print("./update_annotations.py path_to_data")
        return

    data_dir = sys.argv[1]
    for entry in os.listdir(data_dir):
        filename, file_extension = os.path.splitext(entry)
        if file_extension != ".txt":
            continue
        # Names whose stem has a second extension of exactly three characters
        # including the dot (e.g. "doc.ab.txt") are skipped.
        # NOTE(review): presumably this excludes derived files - confirm.
        _, second_extension = os.path.splitext(filename)
        if len(second_extension) != 3:
            update_annotations_in_file(data_dir, filename)


if __name__ == '__main__':
    main()