#!/usr/bin/env python
"""
Script intended to train mitie NER model using lang-uk data set in current repository.

Run `python3 scripts/train_mitie_ner.py` from root to run with default configuration.
Check down below in the file for all cmd line arguments.
"""
# NOTE: import order is kept as in the original file on purpose: `from mitie import *`
# has no visible export list here, so later imports must keep winning over
# anything it may bring in.
import shutil
import requests
from ner_utils import BsfInfo, parse_bsf, read_train_test_split
from mitie import *
import argparse
import os
import multiprocessing
from tqdm import tqdm

# don't include url params at the end - logic is trying to parse for extension of the file.
feature_extractor_url = 'https://lang.org.ua/static/downloads/ner-aux/total_word_feature_extractor.tokenized.400k.dat.zip'


def _align_annotations_to_tokens(annotations, tokens):
    """Map each annotation onto a half-open token index range.

    Tokens are scanned left to right and never revisited: the scan for each
    annotation resumes right after the last token inspected for the previous one.
    An annotation starts at the first token its text starts with and, when it is
    longer than that token, ends at the next token its text ends with.

    :param annotations: iterable of objects exposing `id`, `tag` and `token`
        (the annotated text) attributes, as returned by `parse_bsf`.
    :param tokens: list of whitespace-separated tokens of the document.
    :return: list of `BsfInfo` whose start/end fields hold token indexes
        (end exclusive). An annotation that could not be located keeps the
        range (0, 0).
    """
    tok_ann = []
    tok_idx = 0
    ann: BsfInfo
    for ann in annotations:
        tok_start = 0
        in_token = False
        tok_end = 0
        for i in range(tok_idx, len(tokens)):
            # remember where the next annotation has to resume scanning
            tok_idx = i + 1
            if not in_token and ann.token.startswith(tokens[i]):
                tok_start = i
                tok_end = i + 1
                # equal lengths mean a single-token entity; otherwise keep
                # scanning for the token that closes the entity
                in_token = (len(ann.token) != len(tokens[i]))
                if len(ann.token) == len(tokens[i]):
                    break
            elif in_token and ann.token.endswith(tokens[i]):
                tok_end = i + 1
                in_token = False
                break
        tok_ann.append(BsfInfo(ann.id, ann.tag, tok_start, tok_end, ann.token))
    return tok_ann


def prepare_mitie_training_data(dev_files, base_path='./data/'):
    """Convert char offsets of ner-uk markup to token based MITIE markup and build MITIE samples.

    For every name in `dev_files` the files `<name>.ann` (annotations) and
    `<name>.txt` (text, split on whitespace into tokens) are read from `base_path`.

    :param dev_files: iterable of document names without extension.
    :param base_path: folder holding the `.ann`/`.txt` pairs.
    :return: list with one `ner_training_instance` per document.
    """
    samples = []
    for f_name in dev_files:
        # read ann; encoding is explicit so the result does not depend on the locale
        # (assumes the corpus is stored as UTF-8 - TODO confirm)
        with open(os.path.join(base_path, f_name + '.ann'), 'r', encoding='utf-8') as f:
            annotations = parse_bsf(f.read())
        # read tokens
        with open(os.path.join(base_path, f_name + '.txt'), 'r', encoding='utf-8') as f:
            tokens = f.read().split()

        # convert char offset to token offset
        tok_ann = _align_annotations_to_tokens(annotations, tokens)

        # Create MITIE sample
        sample = ner_training_instance(tokens)
        for t_ann in tok_ann:
            # `range` instead of `xrange`: the latter is not a Python 3 builtin
            sample.add_entity(range(t_ann.start_idx, t_ann.end_idx), t_ann.tag)
        samples.append(sample)

    print(f'Converted to MITIE format. Sample documents {len(samples)}')
    return samples


def download_file(download_url, file_name, timeout=60):
    """Stream `download_url` into `file_name`, showing a tqdm progress bar.

    :param download_url: URL to fetch.
    :param file_name: path of the file to (over)write.
    :param timeout: seconds passed to `requests.get` as its `timeout`.
    :return: `file_name`.
    :raises requests.HTTPError: when the server answers with an error status,
        so that an error page is never stored as if it were the payload.
    """
    # context manager releases the connection even if writing fails midway
    with requests.get(download_url, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        # header may be absent; 0 is then handed to tqdm as the total
        total = int(resp.headers.get('content-length', 0))
        with open(file_name, 'wb') as file, tqdm(
                desc=file_name,
                total=total,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
        ) as bar:
            for data in resp.iter_content(chunk_size=1024):
                size = file.write(data)
                bar.update(size)
    return file_name


def run_training(cpu_threads, config_path, feature_extractor_path):
    """Train a MITIE NER model on the DEV documents and save it to disk.

    :param cpu_threads: value assigned to the trainer's `num_threads`.
    :param config_path: path handed to `read_train_test_split` to obtain the
        DEV/TEST document lists.
    :param feature_extractor_path: path to a pretrained feature extractor. When
        empty or None, the file is looked up in `workspace/mitie` and downloaded
        from `feature_extractor_url` if it is not there yet.
    """
    dev_files, test_files = read_train_test_split(config_path)
    print(f'Loaded corpus file split configuration (documents): DEV={len(dev_files)}, TEST={len(test_files)}')

    samples = prepare_mitie_training_data(dev_files)

    # make sure the workspace folder exists
    workspace_folder = os.path.join('workspace', 'mitie')
    os.makedirs(workspace_folder, exist_ok=True)

    # Training
    if not feature_extractor_path or not feature_extractor_path.strip():
        # try to download pretrained file
        feature_extractor_path = os.path.join(workspace_folder, 'total_word_feature_extractor.tokenized.400k.dat')
        if not os.path.exists(feature_extractor_path):
            print(f'Feature extractor file not provided or not found. '
                  f'\nTrying to download from {feature_extractor_url}')
            # archive extension is taken from the tail of the URL
            ext = feature_extractor_url.split('.')[-1]
            archive_path = feature_extractor_path + '.' + ext
            try:
                download_file(feature_extractor_url, archive_path)
                shutil.unpack_archive(archive_path, workspace_folder)
            finally:
                # do not leave a partial or already unpacked archive behind
                if os.path.exists(archive_path):
                    os.remove(archive_path)

    trainer = ner_trainer(feature_extractor_path)

    for s in samples:
        trainer.add(s)

    trainer.num_threads = cpu_threads

    print("Launching training process... go get a cup of tea... it's gonna be slow")
    # takes long here
    ner = trainer.train()

    model_path = os.path.join(workspace_folder, "mitie_ner_model.dat")
    ner.save_to_disk(model_path)
    print(f'Training finished. Model saved to "{model_path}"')


def main():
    """Parse command line arguments and launch the training."""
    parser = argparse.ArgumentParser(description='Run MITIE training process using annotated NER data from `data` '
                                                 'folder and using pretrained feature extractor')
    parser.add_argument('--fte_path', type=str,
                        help='Path to pretrained FeaTure Extractor. For instructions on how to train one - read '
                             'https://github.com/mit-nlp/MITIE/blob/master/examples/python/train_ner.py. '
                             'If not provided, this script will try to download some of the prior versions of it.')
    parser.add_argument('--threads', type=int, default=multiprocessing.cpu_count(),
                        help='Number of threads to use for training.')
    parser.add_argument('--split_file', type=str, default='doc/dev-test-split.txt',
                        help='Path to txt file with Train/Test split.')
    parser.print_usage()
    args = parser.parse_args()

    if not os.path.exists('data'):
        # SystemExit with a message exits with a non-zero status, so callers
        # (shell scripts, CI) can tell that nothing was trained
        raise SystemExit("Error: data folder not found. "
                         "Make sure you are running this script from the ner-uk project root")

    run_training(args.threads, args.split_file, args.fte_path)


if __name__ == '__main__':
    main()