import argparse
import os

import stanza
from mitie import named_entity_extractor
from sklearn.metrics import classification_report
from tqdm import tqdm

from ner_utils import read_data_to_iob, read_train_test_split


def process_with_stanza(model_path: str, data_x: list[str]) -> list[list[str]]:
    """
    Process `data_x` with Stanza pipeline. Data must be pre tokenized.

    Stanza tags starting with 'E' / 'S' are rewritten to start with 'I' / 'B'
    so the output uses plain IOB prefixes only.

    :param model_path: path to stanza model file
    :param data_x: list of tokenized data set to run evaluation on
    :returns: labels in iob format. 2d array corresponding to data_x structure.
        (we preserve split by documents)
    """
    print('Initializing Stanza pipeline')
    stanza.download('uk')
    # The 'ner' processor has to be requested explicitly: without it the
    # pipeline never loads `ner_model_path` and tokens carry no NER tag,
    # which makes the tag conversion below fail on a missing value.
    nlp = stanza.Pipeline('uk', processors='tokenize,pos,lemma,ner',
                          ner_model_path=model_path,
                          ner_forward_charlm_path="",
                          ner_backward_charlm_path="",
                          tokenize_pretokenized=True)

    y_res_stanza = []
    for x in tqdm(data_x, total=len(data_x)):
        doc = nlp(x)
        iob = []
        for t in doc.iter_tokens():
            token_str = t.ner
            if token_str.startswith('E'):
                # entity-final tag -> inside tag
                token_str = 'I' + token_str[1:]
            elif token_str.startswith('S'):
                # single-token entity tag -> begin tag
                token_str = 'B' + token_str[1:]
            iob.append(token_str)
        y_res_stanza.append(iob)

    print(f'Stanza total set length: {len(y_res_stanza)}')
    return y_res_stanza


def _entities_to_iob(n_tokens: int, entities) -> list[str]:
    """
    Convert entity spans into one IOB label per token.

    :param n_tokens: number of tokens in the document
    :param entities: iterable of tuples whose first item is a range-like object
        (`.start` / `.stop`, stop exclusive) and whose second item is the tag;
        any further items (e.g. a score) are ignored
    :return: list of exactly `n_tokens` labels ('O', 'B-<tag>' or 'I-<tag>')
    """
    # Start from all-'O' so the result always lines up with the tokens,
    # whatever the entity list looks like (empty, unsorted, overlapping).
    labels = ['O'] * n_tokens
    for rng, tag, *_ in entities:
        start = max(rng.start, 0)
        stop = min(rng.stop, n_tokens)
        if start >= stop:
            # empty or fully out-of-bounds span: nothing to label
            continue
        labels[start] = 'B-' + tag
        for i in range(start + 1, stop):
            labels[i] = 'I-' + tag
    return labels


def process_with_mitie(model_path: str, data_x: list[str]) -> list[list[str]]:
    """
    Process data in `data_x` using Mitie model provided via `model_path`.

    Each document is split on whitespace before being passed to the model.

    :param model_path: path to a file with Mitie model
    :param data_x: data samples to run model on
    :return: iob representation of model output. List items correspond to
        documents in data_x; every inner list has one label per token.
    """
    print('Loading MITIE model')
    model = named_entity_extractor(model_path)

    print('Processing documents with Mitie model')
    y_res = []
    for x in tqdm(data_x, total=len(data_x)):
        tokens = x.split()
        entities = model.extract_entities(tokens)  # (range, tag, score)
        y_res.append(_entities_to_iob(len(tokens), entities))

    return y_res


def print_report(y_gt: list[list[str]], y: list[list[str]], report_name: str = ''):
    """
    Print classification report using sklearn classification_report.
    Input parameters will be converted to a flat structure prior to
    calculating a report.

    :param y_gt: ground truth labels
    :param y: calculated labels
    :param report_name: Name to be printed in the header of the console output
    """
    y_gt_flat = [label for doc_labels in y_gt for label in doc_labels]
    y_flat = [label for doc_labels in y for label in doc_labels]

    print(f'Classification report: {report_name}')
    print(classification_report(y_gt_flat, y_flat))


def main() -> None:
    """
    Command line entry point: evaluate the requested models on the test split
    and print one classification report per model.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate NER models on TEST data set from "train-test-split.txt". '
                    'Supported models: Mitie, Stanza')
    parser.add_argument('--mitie', type=str, help='Path to trained MITIE model.')
    parser.add_argument('--stanza', type=str, help='Path to trained Staza model.')
    parser.add_argument('--split_file', type=str, default='doc/dev-test-split.txt',
                        help='Path to txt file with Train/Test split (optional).')

    parser.print_usage()
    args = parser.parse_args()

    if not args.mitie and not args.stanza:
        print('At least one model must be specified to run this script: --mitie, --stanza')
        return
    if not os.path.exists('data'):
        print("Error: data folder not found. Make sure you are running this script from the ner-uk project root")
        return

    _, test_files = read_train_test_split(args.split_file)

    print('Reading test data from files')
    x_test, y_test = read_data_to_iob(test_files)

    # Run every requested model first, then print all reports together so the
    # reports are not interleaved with progress output.
    reports = []
    if args.mitie:
        reports.append(('Mitie', process_with_mitie(args.mitie, x_test)))
    if args.stanza:
        reports.append(('Stanza', process_with_stanza(args.stanza, x_test)))

    # printing reports
    for report_name, y_pred in reports:
        print_report(y_test, y_pred, report_name)


if __name__ == '__main__':
    main()