#!/usr/bin/env python
# Calculate basic statistics of the data set

import sys
from collections import Counter
from itertools import chain
from pathlib import Path

from convert_data import read_languk_train_test_split
from ner_utils import parse_bsf


def read_data(root_path: Path, f_names: list):
    """
    Read data from a list of files in `f_names`.

    :param root_path: path of the working dir
    :param f_names: list of file names to read from
    :return: two parallel lists: per-document token sentences,
             and per-document annotation data in BsfInfo structures
    """
    base_path = root_path / Path('data/')
    tok_storage = []
    ann_storage = []
    for f_name in f_names:
        # read annotations
        with open(base_path / (f_name + '.ann'), 'r') as f:
            annotations = parse_bsf(f.read())

        # read tokens (one sentence per line, skipping empty lines)
        with open(base_path / (f_name + '.txt'), 'r') as f:
            tok_txt = f.read()
        tok_sent = [sent for sent in tok_txt.split('\n') if len(sent) > 0]

        tok_storage.append(tok_sent)
        ann_storage.append(annotations)

    return tok_storage, ann_storage


# Check where we are located and resolve the path properly,
# so the script can safely run from within or outside of the scripts folder.
root_path = Path('./') if Path('./data').exists() else Path('../')
if not Path('./data').exists() and not Path('../data').exists():
    sys.exit('Please run this script either within the ner-uk root folder or the ner-uk/scripts folder.')

# dev_names is not used in these statistics
train_names, dev_names, test_names = read_languk_train_test_split(root_path / Path('doc/dev-test-split.txt'), 0)

# Document stats
train_cnt = len(train_names)
test_cnt = len(test_names)
total_cnt = train_cnt + test_cnt

train_data, train_ann = read_data(root_path, train_names)
test_data, test_ann = read_data(root_path, test_names)

# Sentence stats
train_sent_cnt = [len(sents) for sents in train_data]
test_sent_cnt = [len(sents) for sents in test_data]
train_sent_n = sum(train_sent_cnt)
test_sent_n = sum(test_sent_cnt)
total_sent = train_sent_n + test_sent_n

# Token stats
train_tok_cnt = len(list(chain.from_iterable(sent.split() for sent in chain.from_iterable(train_data))))
test_tok_cnt = len(list(chain.from_iterable(sent.split() for sent in chain.from_iterable(test_data))))

# Tag stats
ann_train_cnt = Counter(ann.tag for ann in chain.from_iterable(train_ann))
ann_test_cnt = Counter(ann.tag for ann in chain.from_iterable(test_ann))
ann_lbl = list(ann_train_cnt)

# Print it all
print(f'Documents: train={train_cnt}, test={test_cnt}, split={train_cnt/total_cnt :.2f}/{test_cnt/total_cnt :.2f}')
print(f'Sentence stats: train={train_sent_n}, test={test_sent_n}, split={train_sent_n/total_sent :.2f}/{test_sent_n/total_sent :.2f}')

print('Train set tags')
print(ann_train_cnt)
print(*[f'{lbl}: {(ann_train_cnt[lbl]/sum(ann_train_cnt.values())) :.2f} ' for lbl in ann_lbl])

print('\nTest set tags')
print(ann_test_cnt)
print(*[f'{lbl}: {(ann_test_cnt[lbl]/sum(ann_test_cnt.values())) :.2f} ' for lbl in ann_lbl])

total_tag_cnt = {lbl: ann_test_cnt[lbl] + ann_train_cnt[lbl] for lbl in ann_lbl}
print('\nSplit train/test:')
print(*[f'{lbl}={ann_train_cnt[lbl]/total_tag_cnt[lbl] :.2f}/{ann_test_cnt[lbl]/total_tag_cnt[lbl] :.2f}' for lbl in ann_lbl])

print('\nAbsolute total:')
print(f'- {train_cnt + test_cnt} texts')
print(f'- {train_tok_cnt + test_tok_cnt :_} tokens')
print(f'- {sum(total_tag_cnt.values()) :_} NER entities')
print(f'  - PERS {ann_train_cnt["PERS"] + ann_test_cnt["PERS"] :_}')
print(f'  - LOC {ann_train_cnt["LOC"] + ann_test_cnt["LOC"] :_}')
print(f'  - ORG {ann_train_cnt["ORG"] + ann_test_cnt["ORG"] :_}')
print(f'  - MISC {ann_train_cnt["MISC"] + ann_test_cnt["MISC"] :_}')