import argparse
import glob
from pathlib import Path
from collections import defaultdict, Counter
from ner_utils import read_proofreaded_bsf_data, read_train_test_split

def main() -> None:
    """Print per-split entity counts for a directory of annotated texts.

    Every ``.txt`` file found under ``input_dir`` is paired with the ``.ann``
    file of the same name. The file is assigned to the "dev" or "test" split
    by looking its stem up in the collections returned by
    ``read_train_test_split``, and each item yielded by
    ``read_proofreaded_bsf_data`` is counted both in ``total_entities`` and
    under its ``tag``. Files that lack an annotation file or are absent from
    both splits are reported on stdout and skipped.
    """
    parser = argparse.ArgumentParser(
        description="A script to count annotated entities per dev/test split"
    )
    parser.add_argument("input_dir", type=Path, help="A dir with annotations and texts")
    parser.add_argument(
        "split_file", type=Path, help="A file with train/dev/test split"
    )
    args = parser.parse_args()

    dev_split, test_split = read_train_test_split(args.split_file)

    # Split name -> Counter of "total_entities" plus one key per entity tag.
    stats = defaultdict(Counter)

    # recursive=True is required for "**" to span zero or more directory
    # levels; without it "**" behaves like "*" and only files exactly one
    # level below input_dir are found. Sorting keeps the order of the report
    # (and of the warnings) independent of filesystem enumeration order.
    txt_paths = sorted(
        glob.iglob(str(args.input_dir / "**" / "*.txt"), recursive=True)
    )

    for txt_file in map(Path, txt_paths):
        file_id: str = txt_file.stem
        orig_ann_file = txt_file.with_suffix(".ann")

        if not orig_ann_file.exists():
            print(f"No annotation file found for {file_id}")
            continue

        if file_id in dev_split:
            split_name = "dev"
        elif file_id in test_split:
            split_name = "test"
        else:
            print(f"File {file_id} is not in the split file")
            continue

        # Indexing the defaultdict registers the split even when the file
        # turns out to contain no entities, so it still shows in the report.
        split_stats = stats[split_name]

        for bsf in read_proofreaded_bsf_data(orig_ann_file):
            split_stats["total_entities"] += 1
            split_stats[bsf.tag] += 1

    for split_name, split_stats in stats.items():
        print(f"Split: {split_name}")
        for tag, count in split_stats.items():
            print(f"{tag}: {count}")
        print()


if __name__ == "__main__":
    main()