import os
import sys

from huggingface_hub import hf_hub_download


# Download the GPT-2 tokens of Fineweb100B from huggingface. This
# saves about an hour of startup time compared to regenerating them.
def get(fname):
    """Download *fname* from the fineweb100B-gpt2 dataset repo if it is missing.

    The file is placed in a ``fineweb100B`` directory located next to this
    script. If a file with that name already exists there, nothing is
    downloaded.

    Args:
        fname: Name of the file inside the dataset repo, e.g.
            ``"fineweb_val_000000.bin"``.
    """
    local_dir = os.path.join(os.path.dirname(__file__), 'fineweb100B')
    # Skip files that are already present so the script can be re-run cheaply.
    if not os.path.exists(os.path.join(local_dir, fname)):
        hf_hub_download(
            repo_id="kjj0/fineweb100B-gpt2",
            filename=fname,
            repo_type="dataset",
            local_dir=local_dir,
        )


# The validation shard is always fetched, regardless of the chunk count.
get("fineweb_val_%06d.bin" % 0)

num_chunks = 1030  # full fineweb100B. Each chunk is 100M tokens
if len(sys.argv) >= 2:  # we can pass an argument to download less
    num_chunks = int(sys.argv[1])

# Training shards are numbered starting at 1 (index 0 is the validation shard).
for i in range(1, num_chunks + 1):
    get("fineweb_train_%06d.bin" % i)