# saves the dataset to a binary file for training. the following were helpful:
# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
# prepare.py in karpathy/nanoGPT

import numpy as np
import sentencepiece as spm
from datasets import load_dataset, Value, Features
import random
from tqdm import tqdm
import ftfy

num_proc = 24

dataset = load_dataset(
    "json",
    # the original files have too many different columns:
    #data_files={"train": ["ubertext.fiction.filter_rus_gcld+short.orig.jsonl", "ubertext.news.filter_rus_gcld+short.orig.jsonl", "ubertext.wikipedia.filter_rus_gcld+short.orig.jsonl"]},
    # instead, keep only the shared columns:
    # cat *jsonl | jq -rc '{text,title,date_of_publish,tags}' > ubertext.jsonl
    data_files={"train": ["ubertext.jsonl"]},
    features=Features({
        'text': Value(dtype='string', id=None),
        'title': Value(dtype='string', id=None),
        'date_of_publish': Value(dtype='string', id=None),
        'tags': [Value(dtype='string', id=None)],
    }),
)

# the json loader only produces a 'train' split, so create a test split ourselves
split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
split_dataset['val'] = split_dataset.pop('test')  # rename the test split to val

sp = spm.SentencePieceProcessor(model_file='wiki.model')


class Tok:
    endoftext = 50256
    endofprompt = 1


def process(example):
    text, title, date_of_publish, tags = (
        example['text'], example.get('title'), example.get('date_of_publish'), example.get('tags')
    )

    # build a metadata block from whatever fields are present
    meta = []
    if title:
        title = ftfy.fix_text(title)
        meta.append(f'тема: {title}\n')  # 'тема' means 'topic'
    if tags:
        tags = ', '.join(tags)
        tags = ftfy.fix_text(tags)
        meta.append(f'мітки: {tags}\n')  # 'мітки' means 'tags'
    if date_of_publish:
        year = date_of_publish.split('-')[0]
        meta.append(f'рік: {year}\n')  # 'рік' means 'year'

    # surround the document with the metadata, independently shuffled each time
    random.shuffle(meta)
    pre = ''.join(meta)
    random.shuffle(meta)
    post = ''.join(meta)

    text = ftfy.fix_text(text)
    text = pre + '\n\n' + text + '\n\n' + post

    ids = sp.encode(text)
    ids = ids + [Tok.endoftext]
    out = {'ids': ids, 'len': len(ids), 'text_len': len(text)}
    return out


# tokenize the dataset
tokenized = split_dataset.map(
    process,
    desc="tokenizing",
    remove_columns=['text', 'title', 'date_of_publish', 'tags'],
    num_proc=num_proc,
)
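# Optional sanity check (not part of the original pipeline): decode one tokenized
# example back to text to eyeball the shuffled metadata prefix/suffix.
# Tok.endoftext (50256) is likely outside the SentencePiece vocabulary, so strip it before decoding.
sample = tokenized['val'][0]
print('tokens:', sample['len'], 'chars:', sample['text_len'])
print(sp.decode([i for i in sample['ids'] if i != Tok.endoftext])[:500])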
t_compression_ratio = 0.4


def compresses_well(example):
    # from Cramming: drop all entries from the dataset where the number of tokens in the entry
    # is larger than t times the number of raw characters.
    # This removes, for example, sequences consisting of hard-to-compress HTML or markdown code.
    return example['len'] < t_compression_ratio * example['text_len']


filtered = tokenized.filter(
    compresses_well,
    desc="filtering",
    num_proc=num_proc,
)

# concatenate all the ids in each dataset into one large file we can use for training
for split, dset in filtered.items():
    arr_len = np.sum(dset['len'])
    # preallocate space in a temporary file to store the concatenated ids
    filename = f'ubertext_t{t_compression_ratio}_{split}_wiki.bin'
    arr = np.memmap(filename, dtype=np.uint16, mode='w+', shape=(arr_len,))
    total_batches = 1024

    idx = 0
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        # batch together samples for faster writes
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        # write into the mmap
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    arr.flush()

## before filtering
# -rw-rw-r-- 1 proger proger 7.7G Jan 6 03:28 train.bin
# -rw-rw-r-- 1 proger proger 3.9M Jan 6 03:28 val.bin

## t=0.25
# -rw-rw-r-- 1 proger proger 4.6G Jan 6 12:19 train.bin
# -rw-rw-r-- 1 proger proger 2.4M Jan 6 12:19 val.bin

## t=0.4
# -rw-rw-r-- 1 proger proger 7.6G Jan 6 12:48 ubertext_t0.4_train.bin
# -rw-rw-r-- 1 proger proger 3.8M Jan 6 12:48 ubertext_t0.4_val.bin

import matplotlib.pyplot as plt

# histograms of per-document token counts at several upper bounds
for power in (9, 10, 14, 19):
    counts, bins = np.histogram(filtered["train"]["len"], bins=range(10, 2 << power))
    plt.figure(figsize=(10, 5))
    plt.stairs(counts, bins)
    plt.savefig(f'hist{power}.png')
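# Sketch (not in the original script) of how the resulting .bin could be consumed for
# training, nanoGPT-style: memory-map the token stream read-only and sample random
# contiguous blocks. block_size and batch_size below are illustrative values, not from the source.
import torch

def get_batch(filename=f'ubertext_t{t_compression_ratio}_train_wiki.bin',
              block_size=1024, batch_size=8):
    data = np.memmap(filename, dtype=np.uint16, mode='r')
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # inputs are blocks of block_size tokens, targets are the same blocks shifted by one
    x = torch.stack([torch.from_numpy(data[i:i + block_size].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i + 1:i + 1 + block_size].astype(np.int64)) for i in ix])
    return x, y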