#!/usr/bin/env python

# Data conversion utility for the ner-uk data set. Originally created for training a Stanza model.
# Stanza later merged some of this code into its own code base.
# Terminology:
# IOB https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging) Some sources refer to this as BIO
# BEIOS - extended IOB with E == end (last token of a multi-token entity) and S == single (single-token entity)

from typing import List, Optional
import argparse
import logging
import os
import glob
import pathlib
from typing import Tuple

from tqdm import tqdm
from random import choices, shuffle
from os.path import splitext

from ner_utils import convert_bsf, BsfInfo, OverlapStrategy

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)


def convert_bsf_in_folder(
    src_dir_path: pathlib.Path,
    dst_dir_path: pathlib.Path,
    converter: str = "beios",
    doc_delim: str = "\n",
    train_test_split_file: Optional[pathlib.Path] = None,
    overlap_strategy: OverlapStrategy = OverlapStrategy.REMOVE_INNER,
) -> None:
    """
    Convert all BSF marked documents found under ``src_dir_path`` and write them as
    ``train.iob``, ``dev.iob`` and ``test.iob`` into
    ``dst_dir_path / "Ukrainian-languk-2.0-outer"``.

    Every ``*.txt`` file is paired with the ``*.ann`` file at the same position of the
    sorted file lists; pairs whose names (without extension) differ are reported and skipped,
    as are documents for which ``convert_bsf`` raises ``ValueError``.
    Nothing is converted (an error is logged) when no files are found or when the number
    of ``*.txt`` and ``*.ann`` files differs.

    Without a split file each document is assigned randomly to train/dev/test with
    weights 8/1/1. With a split file the predefined split is read with a dev allocation
    of 0, so documents go to test when listed there and to train otherwise.

    The output files always use the ``.iob`` extension, whatever ``converter`` is.
    All files are read and written as UTF-8.

    :param src_dir_path: path to directory with BSF marked files
    :param dst_dir_path: where to save output data
    :param converter: `beios` or `iob` output formats
    :param doc_delim: delimiter to be used between documents; anything other than a
        single newline is put on a line of its own
    :param train_test_split_file: path to file containing train/test lists of file names
    :param overlap_strategy: how to handle overlapping entities
    :return:
    """
    # the corpus name needs to comply with stanza naming for corpus and language
    corpus_name = "Ukrainian-languk-2.0-outer"

    # NOTE: glob is called without recursive=True, so "**" behaves like "*" here:
    # only files located exactly one directory below src_dir_path are matched.
    ann_path = src_dir_path / "**/*.ann"
    ann_files = sorted(glob.glob(str(ann_path)))

    tok_path = src_dir_path / "**/*.txt"
    tok_files = sorted(glob.glob(str(tok_path)))

    corpus_folder = dst_dir_path / corpus_name
    corpus_folder.mkdir(parents=True, exist_ok=True)

    if not ann_files or not tok_files:
        log.error(
            f"Token and annotation files are not found at specified path {ann_path}"
        )
        return
    if len(ann_files) != len(tok_files):
        log.error(
            f"Mismatch between Annotation and Token files. Ann files: {len(ann_files)}, token files: {len(tok_files)}"
        )
        return

    train_set = []
    dev_set = []
    test_set = []

    data_sets = [train_set, dev_set, test_set]
    split_weights = (8, 1, 1)

    # sets give constant time membership tests in the per-file loop below
    dev_names = set()
    test_names = set()
    if train_test_split_file is not None:
        _, dev_list, test_list = read_languk_train_test_split(
            train_test_split_file, 0
        )
        dev_names = set(dev_list)
        test_names = set(test_list)

    log.info(f'Found {len(tok_files)} files in data folder "{src_dir_path}"')
    for tok_fname, ann_fname in tqdm(
        zip(tok_files, ann_files), total=len(tok_files), unit="file"
    ):
        if splitext(tok_fname)[0] != splitext(ann_fname)[0]:
            tqdm.write(
                f"Token and Annotation file names do not match ann={ann_fname}, tok={tok_fname}"
            )
            continue

        # explicit encoding: the corpus must not be decoded with the platform locale
        with open(tok_fname, encoding="utf-8") as tok_file, open(
            ann_fname, encoding="utf-8"
        ) as ann_file:
            token_data = tok_file.read()
            ann_data = ann_file.read()

        try:
            out_data = convert_bsf(
                token_data, ann_data, converter, overlap_strategy=overlap_strategy
            )
        except ValueError as e:
            tqdm.write(f"Error processing {tok_fname}: {e}")
            continue

        if train_test_split_file is None:
            target_dataset = choices(data_sets, split_weights)[0]
        else:
            # anything not explicitly listed as dev or test is training data
            fkey = splitext(os.path.basename(tok_fname))[0]
            if fkey in dev_names:
                target_dataset = dev_set
            elif fkey in test_names:
                target_dataset = test_set
            else:
                target_dataset = train_set

        target_dataset.append(out_data)
    log.info(
        f"Data is split as following: train={len(train_set)}, dev={len(dev_set)}, test={len(test_set)}"
    )

    # writing data to {train/dev/test}.iob files
    if doc_delim != "\n":
        doc_delim = "\n" + doc_delim + "\n"
    for name, data_set in zip(("train", "dev", "test"), data_sets):
        fname = corpus_folder / (name + ".iob")
        with fname.open("w", encoding="utf-8") as f:
            f.write(doc_delim.join(data_set))
        log.info("Writing to " + str(fname))

    log.info("All done")


def read_languk_train_test_split(
    file_path: pathlib.Path, dev_split: float = 0.1
) -> Tuple[List[str], List[str], List[str]]:
    """
    Read predefined split of train and test files in data set.
    Originally located under doc/dev-test-split.txt

    The file is processed line by line (surrounding whitespace is stripped):
    a ``DEV`` line starts the list of train file names, a ``TEST`` line starts the list
    of test file names, blank lines are skipped and any other line is taken as a file
    name. Names that appear before the first header are counted as test files.

    :param file_path: path to dev-test-split.txt file (should include file name with extension)
    :param dev_split: 0 to 1 float value defining how much to allocate to dev split
    :return: tuple of (train, dev, test) each containing list of files to be used for respective data sets
    :raises ValueError: if dev_split is not within [0, 1]
    """
    if not 0 <= dev_split <= 1:
        raise ValueError(f"dev_split must be between 0 and 1, got {dev_split}")

    log.info(
        f'Trying to read train/dev/test split from file "{str(file_path)}". Dev allocation = {dev_split}'
    )
    train_files: List[str] = []
    test_files: List[str] = []
    container = test_files
    with file_path.open("r", encoding="utf-8") as f:
        for ln in f:
            ln = ln.strip()
            if ln == "DEV":
                # the "DEV" section of the file holds the training documents
                container = train_files
            elif ln == "TEST":
                container = test_files
            elif ln:
                container.append(ln)

    # split in file only contains train and test split.
    # For Stanza training we need train, dev, test
    # We will take part of train as dev set
    # This way anyone using test set outside of this code base can be sure that there was no data set pollution
    shuffle(train_files)
    dev_count = int(len(train_files) * dev_split)
    dev_files = train_files[:dev_count]
    train_files = train_files[dev_count:]

    # can only fail when the same name is listed more than once in the train section
    assert len(set(train_files).intersection(set(dev_files))) == 0

    log.info(
        f"Files in each set: train={len(train_files)}, dev={len(dev_files)}, test={len(test_files)}"
    )
    return train_files, dev_files, test_files


if __name__ == "__main__":
    # Command line entry point: parse the options and run the conversion once.
    logging.basicConfig()

    parser = argparse.ArgumentParser(
        description="Convert lang-uk NER data set from Brat standoff format to BEIOS or IOB format"
        " (compatible with Stanza NER model training requirements)."
    )
    parser.add_argument(
        "-c",
        type=str,
        default="iob",
        help="`beios` or `iob` formats to be used for output",
    )

    parser.add_argument(
        "--src_dataset",
        type=pathlib.Path,
        default="data",
        help='Dir with lang-uk dataset "data" folder (https://github.com/lang-uk/ner-uk/data)',
    )
    parser.add_argument(
        "--dst",
        type=pathlib.Path,
        default="workspace/data",
        help="Where to store the converted dataset",
    )

    parser.add_argument(
        "--doc_delim",
        type=str,
        default="\n",
        help="Delimiter to be used to separate documents in the output data",
    )

    # argparse validates `choices` AFTER applying `type`, i.e. against enum members.
    # Listing the members (instead of their raw values) makes the check independent of
    # whether the enum members compare equal to their values; the explicit metavar keeps
    # the raw values visible in the usage/help output.
    overlap_strategies = list(OverlapStrategy)
    parser.add_argument(
        "--overlap_strategy",
        type=OverlapStrategy,
        default=OverlapStrategy.REMOVE_INNER,
        choices=overlap_strategies,
        metavar="{" + ",".join(str(e.value) for e in overlap_strategies) + "}",
        help="How to handle overlapping entities",
    )

    # a predefined split file and a random split cannot be requested together
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--split_file",
        type=pathlib.Path,
        default="doc/dev-test-split.txt",
        help="Name of a file containing Train/Test split (files in train and test set)",
    )
    group.add_argument(
        "--split_randomly",
        action="store_true",
        help="Randomly create train/dev/test sets from src_dataset.",
    )

    # usage is printed on every run, before the arguments are parsed
    parser.print_usage()
    args = parser.parse_args()

    # passing no split file makes convert_bsf_in_folder split the documents randomly
    split_file = None if args.split_randomly else args.split_file
    convert_bsf_in_folder(
        args.src_dataset,
        args.dst,
        args.c,
        args.doc_delim,
        train_test_split_file=split_file,
        overlap_strategy=args.overlap_strategy,
    )