""" Okay, the whole folder is a mess already, and we going to add a little bit more. """ from typing import List import argparse from pathlib import Path import glob import shutil from collections import Counter def deduplicate(fname: Path) -> str: """ Some texts exported from the proof-reading tool have duplicates its content duplicated again and again. This function removes the duplicates. """ text = fname.open("r", encoding="utf-8").read() lines = text.split("\n") if lines and lines[0].strip() == "": print(f"Empty first line {fname}") return "\n".join(lines) try: duplicate_loc = lines[1:].index(lines[0]) except ValueError: print(f"No duplicates found in {fname}") return "\n".join(lines) return "\n".join(lines[:duplicate_loc + 1]) if __name__ == "__main__": parser: argparse.ArgumentParser = argparse.ArgumentParser( description="A script to compile a corpus after the proof-reading into the same " "format as we have now in the repo" ) parser.add_argument("input_dir", type=Path, help="A dir with annotations and texts") parser.add_argument("output_dir", type=Path, help="A dir to store exported files") tags_count: Counter = Counter() files_converted: int = 0 args: argparse.Namespace = parser.parse_args() args.output_dir.mkdir(exist_ok=True, parents=True) for txt_file in map(Path, glob.iglob(str(args.input_dir / "*.txt"))): fname: str = txt_file.name file_id: str = txt_file.stem with (args.output_dir / f"{file_id}.txt").open("w") as fp_out: text = deduplicate(txt_file) fp_out.write(text) ann_files = glob.glob(str(args.input_dir / file_id / "*.ann")) if len(ann_files) != 1: print(f"Oh crap, cannot find right annotation file for the {file_id}: {len(ann_files)} files found") shutil.copy(ann_files[0], args.output_dir / f"{file_id}.ann") print(f"Total number of files converted: {files_converted}") print(f"Total number of tags: {sum(tags_count.values())}") print("\n".join(f"{k}: {v}" for k, v in tags_count.most_common()))