# saves the dataset to a binary file for training. the following were helpful:
# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
# prepare.py in karpathy/nanoGPT

import numpy as np
import sentencepiece as spm
from datasets import load_dataset, Value, Features
import random
from tqdm import tqdm
import ftfy

num_proc = 24

dataset = load_dataset(
    "json",
    # the original files have too many different columns:
    #data_files={"train": ["ubertext.fiction.filter_rus_gcld+short.orig.jsonl", "ubertext.news.filter_rus_gcld+short.orig.jsonl", "ubertext.wikipedia.filter_rus_gcld+short.orig.jsonl"]},
    # instead, keep only the shared columns:
    # cat *jsonl | jq -rc '{text,title,date_of_publish,tags}' > ubertext.jsonl
    data_files={"train": ["ubertext.jsonl"]},
    features=Features({
        'text': Value(dtype='string', id=None),
        'title': Value(dtype='string', id=None),
        'date_of_publish': Value(dtype='string', id=None),
        'tags': [Value(dtype='string', id=None)],
    }),
)

# the json loader only produces a 'train' split, so create a test split ourselves
split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
split_dataset['val'] = split_dataset.pop('test')  # rename the test split to val

sp = spm.SentencePieceProcessor(model_file='wiki.model')


class Tok:
    endoftext = 50256
    endofprompt = 1


def process(example):
    text, title, date_of_publish, tags = (
        example['text'], example.get('title'), example.get('date_of_publish'), example.get('tags')
    )

    # build a metadata block from whatever fields are present
    meta = []
    if title:
        title = ftfy.fix_text(title)
        meta.append(f'тема: {title}\n')  # 'тема' means 'topic'
    if tags:
        tags = ', '.join(tags)
        tags = ftfy.fix_text(tags)
        meta.append(f'мітки: {tags}\n')  # 'мітки' means 'tags'
    if date_of_publish:
        year = date_of_publish.split('-')[0]
        meta.append(f'рік: {year}\n')  # 'рік' means 'year'

    # surround the document with the metadata, independently shuffled each time
    random.shuffle(meta)
    pre = ''.join(meta)
    random.shuffle(meta)
    post = ''.join(meta)

    text = ftfy.fix_text(text)
    text = pre + '\n\n' + text + '\n\n' + post

    ids = sp.encode(text)
    ids = ids + [Tok.endoftext]
    out = {'ids': ids, 'len': len(ids), 'text_len': len(text)}
    return out


# tokenize the dataset
tokenized = split_dataset.map(
    process,
    desc="tokenizing",
    remove_columns=['text', 'title', 'date_of_publish', 'tags'],
    num_proc=num_proc,
)
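# Optional sanity check (not part of the original pipeline): decode one tokenized
# example back to text to eyeball the shuffled metadata prefix/suffix.
# Tok.endoftext (50256) is likely outside the SentencePiece vocabulary, so strip it before decoding.
sample = tokenized['val'][0]
print('tokens:', sample['len'], 'chars:', sample['text_len'])
print(sp.decode([i for i in sample['ids'] if i != Tok.endoftext])[:500])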
t_compression_ratio = 0.4


def compresses_well(example):
    # from Cramming: drop all entries from the dataset where the number of tokens in the entry
    # is larger than t times the number of raw characters.
    # This removes, for example, sequences consisting of hard-to-compress HTML or markdown code.
    return example['len'] < t_compression_ratio * example['text_len']


filtered = tokenized.filter(
    compresses_well,
    desc="filtering",
    num_proc=num_proc,
)

# concatenate all the ids in each dataset into one large file we can use for training
for split, dset in filtered.items():
    arr_len = np.sum(dset['len'])
    # preallocate space in a temporary file to store the concatenated ids
    filename = f'ubertext_t{t_compression_ratio}_{split}_wiki.bin'
    arr = np.memmap(filename, dtype=np.uint16, mode='w+', shape=(arr_len,))
    total_batches = 1024

    idx = 0
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        # batch together samples for faster writes
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        # write into the mmap
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    arr.flush()

## before filtering
# -rw-rw-r-- 1 proger proger 7.7G Jan 6 03:28 train.bin
# -rw-rw-r-- 1 proger proger 3.9M Jan 6 03:28 val.bin

## t=0.25
# -rw-rw-r-- 1 proger proger 4.6G Jan 6 12:19 train.bin
# -rw-rw-r-- 1 proger proger 2.4M Jan 6 12:19 val.bin

## t=0.4
# -rw-rw-r-- 1 proger proger 7.6G Jan 6 12:48 ubertext_t0.4_train.bin
# -rw-rw-r-- 1 proger proger 3.8M Jan 6 12:48 ubertext_t0.4_val.bin

import matplotlib.pyplot as plt

# histograms of per-document token counts at several upper bounds
for power in (9, 10, 14, 19):
    counts, bins = np.histogram(filtered["train"]["len"], bins=range(10, 2 << power))
    plt.figure(figsize=(10, 5))
    plt.stairs(counts, bins)
    plt.savefig(f'hist{power}.png')
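# Sketch (not in the original script) of how the resulting .bin could be consumed for
# training, nanoGPT-style: memory-map the token stream read-only and sample random
# contiguous blocks. block_size and batch_size below are illustrative values, not from the source.
import torch

def get_batch(filename=f'ubertext_t{t_compression_ratio}_train_wiki.bin',
              block_size=1024, batch_size=8):
    data = np.memmap(filename, dtype=np.uint16, mode='r')
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # inputs are blocks of block_size tokens, targets are the same blocks shifted by one
    x = torch.stack([torch.from_numpy(data[i:i + block_size].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i + 1:i + 1 + block_size].astype(np.int64)) for i in ix])
    return x, y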