import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: 2D tensor to orthogonalize (asserted).
        steps: number of Newton-Schulz iterations.
        eps: added to the norm before dividing, to avoid division by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # iterate on the wide orientation so that X @ X.T is the smaller Gram matrix
    transposed = G.size(0) > G.size(1)
    if transposed:
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if transposed:
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Distributed behaviour: parameters are grouped by element count; within a group each rank
    orthogonalizes one parameter out of every `world_size` consecutive ones and the results are
    exchanged with an async all_gather. `step` therefore asserts that every group's parameter
    count is divisible by the world size. WORLD_SIZE and RANK are read from the environment.

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        sizes = {p.numel() for p in params}
        # one param group per distinct numel, each with one flat receive buffer per rank
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """Apply one Muon update to every parameter; requires `.grad` on all of them."""
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # apply the updates gathered for the previous chunk of `world_size` params
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(0, len(params), self.world_size):
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                # note: the nesterov branch overwrites p.grad in place
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # finish the previous chunk before its receive buffers are reused
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; cos/sin tables are cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): dim 1 is the sequence, dim 3 is split in half
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention via FlexAttention with QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # mixing weights between this layer's values and the token value embedding `vi`
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of `x` and the initial embedding `x0`, then attention and MLP."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-value embedding tables, returned as 12 tensors (the six, then the six reversed)."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections; `forward` returns the cross-entropy loss."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Arguments:
            inputs: 1D tensor of token ids; its length must be a multiple of 128.
            targets: token ids used as cross-entropy targets (flattened before use).
            sliding_window_num_blocks: scalar tensor, attention window measured in 128-token blocks.

        Returns the scalar cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        assert inputs.size(0) % BLOCK_SIZE == 0, "sequence length must be a multiple of BLOCK_SIZE"
        # derived from the input instead of a hard-coded 512 so any block-aligned length works
        total_num_blocks = inputs.size(0) // BLOCK_SIZE
        # running document index per token; token id 50256 marks a document boundary
        docs = (inputs == 50256).cumsum(0)
        # first / last document index inside each block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # per-row count of set blocks, and column indices ordered with set blocks first
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_num_blocks, dtype=torch.int32, device=inputs.device)
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # partial blocks (need mask_mod) are the non-empty blocks that are not fully unmasked
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Read only the shard header, validate it, and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256-int32 header into a pinned tensor."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Serves per-rank (inputs, targets) slices of `seq_len` tokens from a set of .bin shards."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Rewind to the start of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        # each rank starts at its own offset within the shard
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return (inputs int32, targets int64) on the GPU; targets are inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
# Only the master process (rank 0) owns a log file; print0 is a no-op on every other rank.
logfile = None
if master_process:
    run_id = uuid.uuid4()
    logdir = Path("logs") / f"{run_id}"
    # parents=True so the first run works even when the top-level logs/ directory is missing
    logdir.mkdir(parents=True, exist_ok=True)
    logfile = Path("logs") / f"{run_id}.txt"
    print(logfile.stem)
    # create the log file
    with logfile.open("w") as f:
        # begin the log by printing this file (the Python code)
        print(code, file=f)
        print("=" * 100, file=f)
def print0(s, logonly=False):
    """On the master process, append `s` to the log file and echo it to stdout unless `logonly`."""
    if master_process:
        with logfile.open("a") as f:
            if not logonly:
                print(s)
            print(s, file=f)
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running python {sys.version}")
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# calculate the number of steps to take in the val loop.
assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0
val_steps = args.val_tokens // (args.sequence_length * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
inputs_train, targets_train = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768))
model = model.cuda().bfloat16()
# the model is cast to bfloat16, but CastedLinear weights are put back in float32
# (CastedLinear.forward casts them to the activation dtype on the fly)
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# embeddings, lm_head and scalar parameters use Adam; the 2D matrices inside the blocks use Muon
embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()]
optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.blocks.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]

# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the LR multiplier for scheduler step `it` (0 <= it <= args.num_iterations)."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        # max(..., 1) only matters when cooldown_iters == 0: this branch is then reached solely at
        # it == num_iterations (the final scheduler step), where the division used to raise
        decay_ratio = (args.num_iterations - it) / max(args.cooldown_iters, 1)
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size lives in a device tensor and is updated in place with copy_ during training
sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda")
sw_num_blocks_prev = 1
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social
    frac_done = step / args.num_iterations # training progress
    sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128)
    if sw_num_blocks != sw_num_blocks_prev:
        sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True)
        sw_num_blocks_prev = sw_num_blocks

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                inputs_val, targets_val = val_loader.next_batch()
                val_loss += model(inputs_val, targets_val, sliding_window_num_blocks)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for i in range(1, train_accumulation_steps + 1):
        with contextlib.ExitStack() as stack:
            if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step
                stack.enter_context(model.no_sync())
            if step >= 5:
                stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True))
            model(inputs_train, targets_train, sliding_window_num_blocks).backward()
            inputs_train, targets_train = train_loader.next_batch()
    if train_accumulation_steps != 1:
        for p in model.parameters():
            # a parameter that received no gradient has grad None and is left alone
            if p.grad is not None:
                p.grad /= train_accumulation_steps
    # momentum warmup for Muon
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 09:12:46 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 28C P0 111W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:28781ms step_avg:nanms step:2/1480 train_time:28885ms step_avg:nanms step:3/1480 train_time:29009ms step_avg:nanms step:4/1480 train_time:29151ms step_avg:nanms step:5/1480 train_time:29293ms step_avg:nanms step:6/1480 train_time:29436ms step_avg:nanms step:7/1480 train_time:29578ms step_avg:nanms step:8/1480 train_time:29719ms step_avg:nanms step:9/1480 train_time:29862ms step_avg:nanms step:10/1480 train_time:30006ms step_avg:nanms step:11/1480 train_time:146ms step_avg:nanms step:12/1480 train_time:287ms step_avg:nanms step:13/1480 train_time:430ms step_avg:143.26ms step:14/1480 train_time:572ms step_avg:142.93ms step:15/1480 train_time:714ms step_avg:142.79ms step:16/1480 train_time:857ms step_avg:142.77ms step:17/1480 train_time:1000ms step_avg:142.90ms step:18/1480 train_time:1143ms step_avg:142.87ms step:19/1480 train_time:1287ms step_avg:143.02ms step:20/1480 train_time:1430ms step_avg:142.97ms step:21/1480 train_time:1572ms step_avg:142.87ms step:22/1480 train_time:1713ms step_avg:142.75ms step:23/1480 train_time:1856ms step_avg:142.77ms step:24/1480 train_time:1998ms step_avg:142.71ms step:25/1480 train_time:2140ms step_avg:142.65ms step:26/1480 train_time:2282ms step_avg:142.65ms step:27/1480 train_time:2426ms step_avg:142.70ms step:28/1480 train_time:2569ms step_avg:142.73ms 
step:29/1480 train_time:2711ms step_avg:142.68ms step:30/1480 train_time:2854ms step_avg:142.72ms step:31/1480 train_time:2997ms step_avg:142.73ms step:32/1480 train_time:3141ms step_avg:142.79ms step:33/1480 train_time:3285ms step_avg:142.84ms step:34/1480 train_time:3428ms step_avg:142.82ms step:35/1480 train_time:3570ms step_avg:142.79ms step:36/1480 train_time:3712ms step_avg:142.75ms step:37/1480 train_time:3854ms step_avg:142.73ms step:38/1480 train_time:3995ms step_avg:142.69ms step:39/1480 train_time:4140ms step_avg:142.76ms step:40/1480 train_time:4283ms step_avg:142.77ms step:41/1480 train_time:4426ms step_avg:142.77ms step:42/1480 train_time:4568ms step_avg:142.75ms step:43/1480 train_time:4710ms step_avg:142.71ms step:44/1480 train_time:4851ms step_avg:142.68ms step:45/1480 train_time:4993ms step_avg:142.66ms step:46/1480 train_time:5134ms step_avg:142.62ms step:47/1480 train_time:5276ms step_avg:142.60ms step:48/1480 train_time:5420ms step_avg:142.63ms step:49/1480 train_time:5564ms step_avg:142.67ms step:50/1480 train_time:5706ms step_avg:142.65ms step:51/1480 train_time:5849ms step_avg:142.66ms step:52/1480 train_time:5991ms step_avg:142.65ms step:53/1480 train_time:6134ms step_avg:142.65ms step:54/1480 train_time:6276ms step_avg:142.64ms step:55/1480 train_time:6422ms step_avg:142.71ms step:56/1480 train_time:6568ms step_avg:142.78ms step:57/1480 train_time:6709ms step_avg:142.75ms step:58/1480 train_time:6851ms step_avg:142.74ms step:59/1480 train_time:6994ms step_avg:142.73ms step:60/1480 train_time:7137ms step_avg:142.74ms step:61/1480 train_time:7282ms step_avg:142.78ms step:62/1480 train_time:7426ms step_avg:142.80ms step:63/1480 train_time:7567ms step_avg:142.77ms step:64/1480 train_time:7711ms step_avg:142.80ms step:65/1480 train_time:7853ms step_avg:142.78ms step:66/1480 train_time:7994ms step_avg:142.76ms step:67/1480 train_time:8136ms step_avg:142.73ms step:68/1480 train_time:8279ms step_avg:142.74ms step:69/1480 train_time:8422ms 
step_avg:142.74ms step:70/1480 train_time:8566ms step_avg:142.77ms step:71/1480 train_time:8708ms step_avg:142.76ms step:72/1480 train_time:8851ms step_avg:142.75ms step:73/1480 train_time:8992ms step_avg:142.73ms step:74/1480 train_time:9134ms step_avg:142.72ms step:75/1480 train_time:9276ms step_avg:142.71ms step:76/1480 train_time:9420ms step_avg:142.72ms step:77/1480 train_time:9562ms step_avg:142.72ms step:78/1480 train_time:9705ms step_avg:142.72ms step:79/1480 train_time:9847ms step_avg:142.71ms step:80/1480 train_time:9990ms step_avg:142.71ms step:81/1480 train_time:10521ms step_avg:148.18ms step:82/1480 train_time:10624ms step_avg:147.56ms step:83/1480 train_time:10768ms step_avg:147.50ms step:84/1480 train_time:10909ms step_avg:147.42ms step:85/1480 train_time:11050ms step_avg:147.33ms step:86/1480 train_time:11192ms step_avg:147.27ms step:87/1480 train_time:11334ms step_avg:147.20ms step:88/1480 train_time:11476ms step_avg:147.13ms step:89/1480 train_time:11619ms step_avg:147.08ms step:90/1480 train_time:11763ms step_avg:147.03ms step:91/1480 train_time:11905ms step_avg:146.98ms step:92/1480 train_time:12048ms step_avg:146.93ms step:93/1480 train_time:12191ms step_avg:146.88ms step:94/1480 train_time:12334ms step_avg:146.83ms step:95/1480 train_time:12474ms step_avg:146.76ms step:96/1480 train_time:12991ms step_avg:151.06ms step:97/1480 train_time:13492ms step_avg:155.08ms step:98/1480 train_time:13594ms step_avg:154.48ms step:99/1480 train_time:13736ms step_avg:154.34ms step:100/1480 train_time:13879ms step_avg:154.21ms step:101/1480 train_time:14024ms step_avg:154.11ms step:102/1480 train_time:14164ms step_avg:153.96ms step:103/1480 train_time:14305ms step_avg:153.82ms step:104/1480 train_time:14451ms step_avg:153.73ms step:105/1480 train_time:14595ms step_avg:153.63ms step:106/1480 train_time:14737ms step_avg:153.51ms step:107/1480 train_time:14879ms step_avg:153.40ms step:108/1480 train_time:15022ms step_avg:153.29ms step:109/1480 train_time:15166ms 
step_avg:153.19ms step:110/1480 train_time:15308ms step_avg:153.08ms step:111/1480 train_time:15451ms step_avg:152.98ms step:112/1480 train_time:15596ms step_avg:152.91ms step:113/1480 train_time:15744ms step_avg:152.85ms step:114/1480 train_time:15890ms step_avg:152.79ms step:115/1480 train_time:16034ms step_avg:152.71ms step:116/1480 train_time:16179ms step_avg:152.63ms step:117/1480 train_time:16325ms step_avg:152.57ms step:118/1480 train_time:16470ms step_avg:152.50ms step:119/1480 train_time:16615ms step_avg:152.43ms step:120/1480 train_time:16763ms step_avg:152.39ms step:121/1480 train_time:16909ms step_avg:152.33ms step:122/1480 train_time:17056ms step_avg:152.29ms step:123/1480 train_time:17202ms step_avg:152.23ms step:124/1480 train_time:17349ms step_avg:152.18ms step:125/1480 train_time:17493ms step_avg:152.12ms step:125/1480 val_loss:4.4099 train_time:17558ms step_avg:152.68ms step:126/1480 train_time:17650ms step_avg:152.16ms step:127/1480 train_time:17796ms step_avg:152.10ms step:128/1480 train_time:17942ms step_avg:152.05ms step:129/1480 train_time:18088ms step_avg:152.00ms step:130/1480 train_time:18233ms step_avg:151.94ms step:131/1480 train_time:18377ms step_avg:151.88ms step:132/1480 train_time:18522ms step_avg:151.82ms step:133/1480 train_time:18669ms step_avg:151.78ms step:134/1480 train_time:18814ms step_avg:151.73ms step:135/1480 train_time:18961ms step_avg:151.69ms step:136/1480 train_time:19107ms step_avg:151.64ms step:137/1480 train_time:19252ms step_avg:151.59ms step:138/1480 train_time:19397ms step_avg:151.54ms step:139/1480 train_time:19543ms step_avg:151.50ms step:140/1480 train_time:19690ms step_avg:151.46ms step:141/1480 train_time:19835ms step_avg:151.41ms step:142/1480 train_time:19981ms step_avg:151.37ms step:143/1480 train_time:20128ms step_avg:151.34ms step:144/1480 train_time:20273ms step_avg:151.29ms step:145/1480 train_time:20418ms step_avg:151.24ms step:146/1480 train_time:20565ms step_avg:151.21ms step:147/1480 
train_time:20712ms step_avg:151.18ms step:148/1480 train_time:20857ms step_avg:151.14ms step:149/1480 train_time:21004ms step_avg:151.11ms step:150/1480 train_time:21150ms step_avg:151.07ms step:151/1480 train_time:21295ms step_avg:151.03ms step:152/1480 train_time:21441ms step_avg:150.99ms step:153/1480 train_time:21587ms step_avg:150.96ms step:154/1480 train_time:21732ms step_avg:150.92ms step:155/1480 train_time:21877ms step_avg:150.88ms step:156/1480 train_time:22024ms step_avg:150.85ms step:157/1480 train_time:22171ms step_avg:150.82ms step:158/1480 train_time:22315ms step_avg:150.78ms step:159/1480 train_time:22461ms step_avg:150.75ms step:160/1480 train_time:22608ms step_avg:150.72ms step:161/1480 train_time:22753ms step_avg:150.68ms step:162/1480 train_time:22899ms step_avg:150.65ms step:163/1480 train_time:23046ms step_avg:150.63ms step:164/1480 train_time:23192ms step_avg:150.60ms step:165/1480 train_time:23336ms step_avg:150.55ms step:166/1480 train_time:23482ms step_avg:150.53ms step:167/1480 train_time:23629ms step_avg:150.51ms step:168/1480 train_time:23775ms step_avg:150.47ms step:169/1480 train_time:23921ms step_avg:150.44ms step:170/1480 train_time:24068ms step_avg:150.42ms step:171/1480 train_time:24213ms step_avg:150.39ms step:172/1480 train_time:24359ms step_avg:150.36ms step:173/1480 train_time:24506ms step_avg:150.34ms step:174/1480 train_time:24652ms step_avg:150.32ms step:175/1480 train_time:24797ms step_avg:150.28ms step:176/1480 train_time:24943ms step_avg:150.26ms step:177/1480 train_time:25090ms step_avg:150.24ms step:178/1480 train_time:25235ms step_avg:150.21ms step:179/1480 train_time:25381ms step_avg:150.19ms step:180/1480 train_time:25912ms step_avg:152.42ms step:181/1480 train_time:26019ms step_avg:152.16ms step:182/1480 train_time:26166ms step_avg:152.13ms step:183/1480 train_time:26312ms step_avg:152.09ms step:184/1480 train_time:26456ms step_avg:152.05ms step:185/1480 train_time:26601ms step_avg:152.01ms step:186/1480 
train_time:26747ms step_avg:151.97ms step:187/1480 train_time:26893ms step_avg:151.94ms step:188/1480 train_time:27040ms step_avg:151.91ms step:189/1480 train_time:27203ms step_avg:151.97ms step:190/1480 train_time:27333ms step_avg:151.85ms step:191/1480 train_time:27478ms step_avg:151.81ms step:192/1480 train_time:27625ms step_avg:151.79ms step:193/1480 train_time:27771ms step_avg:151.75ms step:194/1480 train_time:27915ms step_avg:151.71ms step:195/1480 train_time:28061ms step_avg:151.68ms step:196/1480 train_time:28208ms step_avg:151.66ms step:197/1480 train_time:28353ms step_avg:151.62ms step:198/1480 train_time:28499ms step_avg:151.59ms step:199/1480 train_time:28647ms step_avg:151.57ms step:200/1480 train_time:28793ms step_avg:151.54ms step:201/1480 train_time:28940ms step_avg:151.52ms step:202/1480 train_time:29086ms step_avg:151.49ms step:203/1480 train_time:29232ms step_avg:151.46ms step:204/1480 train_time:29377ms step_avg:151.43ms step:205/1480 train_time:29523ms step_avg:151.40ms step:206/1480 train_time:29670ms step_avg:151.38ms step:207/1480 train_time:29815ms step_avg:151.34ms step:208/1480 train_time:29960ms step_avg:151.32ms step:209/1480 train_time:30107ms step_avg:151.29ms step:210/1480 train_time:30252ms step_avg:151.26ms step:211/1480 train_time:30398ms step_avg:151.23ms step:212/1480 train_time:30545ms step_avg:151.21ms step:213/1480 train_time:30691ms step_avg:151.19ms step:214/1480 train_time:30836ms step_avg:151.16ms step:215/1480 train_time:30983ms step_avg:151.13ms step:216/1480 train_time:31130ms step_avg:151.12ms step:217/1480 train_time:31275ms step_avg:151.09ms step:218/1480 train_time:31421ms step_avg:151.06ms step:219/1480 train_time:31568ms step_avg:151.04ms step:220/1480 train_time:31713ms step_avg:151.01ms step:221/1480 train_time:32250ms step_avg:152.84ms step:222/1480 train_time:32353ms step_avg:152.61ms step:223/1480 train_time:32500ms step_avg:152.58ms step:224/1480 train_time:32648ms step_avg:152.56ms step:225/1480 
train_time:32796ms step_avg:152.54ms step:226/1480 train_time:32944ms step_avg:152.52ms step:227/1480 train_time:33092ms step_avg:152.50ms step:228/1480 train_time:33240ms step_avg:152.48ms step:229/1480 train_time:33390ms step_avg:152.47ms step:230/1480 train_time:33538ms step_avg:152.44ms step:231/1480 train_time:33686ms step_avg:152.43ms step:232/1480 train_time:33834ms step_avg:152.40ms step:233/1480 train_time:33982ms step_avg:152.39ms step:234/1480 train_time:34131ms step_avg:152.37ms step:235/1480 train_time:34279ms step_avg:152.35ms step:236/1480 train_time:34428ms step_avg:152.34ms step:237/1480 train_time:34577ms step_avg:152.32ms step:238/1480 train_time:34726ms step_avg:152.31ms step:239/1480 train_time:34873ms step_avg:152.29ms step:240/1480 train_time:35021ms step_avg:152.26ms step:241/1480 train_time:35169ms step_avg:152.25ms step:242/1480 train_time:35317ms step_avg:152.23ms step:243/1480 train_time:35467ms step_avg:152.22ms step:244/1480 train_time:35614ms step_avg:152.20ms step:245/1480 train_time:35763ms step_avg:152.18ms step:246/1480 train_time:35913ms step_avg:152.17ms step:247/1480 train_time:36060ms step_avg:152.15ms step:248/1480 train_time:36209ms step_avg:152.14ms step:249/1480 train_time:36357ms step_avg:152.12ms step:250/1480 train_time:36507ms step_avg:152.11ms step:250/1480 val_loss:3.9866 train_time:36573ms step_avg:152.39ms step:251/1480 train_time:36668ms step_avg:152.15ms step:252/1480 train_time:36812ms step_avg:152.12ms step:253/1480 train_time:36960ms step_avg:152.10ms step:254/1480 train_time:37108ms step_avg:152.08ms step:255/1480 train_time:37257ms step_avg:152.07ms step:256/1480 train_time:37405ms step_avg:152.05ms step:257/1480 train_time:37553ms step_avg:152.04ms step:258/1480 train_time:37701ms step_avg:152.02ms step:259/1480 train_time:37850ms step_avg:152.01ms step:260/1480 train_time:37999ms step_avg:152.00ms step:261/1480 train_time:38147ms step_avg:151.98ms step:262/1480 train_time:38297ms step_avg:151.97ms 
step:263/1480 train_time:38444ms step_avg:151.95ms step:264/1480 train_time:38593ms step_avg:151.94ms step:265/1480 train_time:38741ms step_avg:151.92ms step:266/1480 train_time:38889ms step_avg:151.91ms step:267/1480 train_time:39038ms step_avg:151.90ms step:268/1480 train_time:39187ms step_avg:151.89ms step:269/1480 train_time:39336ms step_avg:151.88ms step:270/1480 train_time:39485ms step_avg:151.87ms step:271/1480 train_time:39635ms step_avg:151.86ms step:272/1480 train_time:39782ms step_avg:151.84ms step:273/1480 train_time:39931ms step_avg:151.83ms step:274/1480 train_time:40080ms step_avg:151.82ms step:275/1480 train_time:40229ms step_avg:151.81ms step:276/1480 train_time:40378ms step_avg:151.80ms step:277/1480 train_time:40526ms step_avg:151.78ms step:278/1480 train_time:40674ms step_avg:151.77ms step:279/1480 train_time:40822ms step_avg:151.76ms step:280/1480 train_time:40971ms step_avg:151.75ms step:281/1480 train_time:41120ms step_avg:151.74ms step:282/1480 train_time:41269ms step_avg:151.72ms step:283/1480 train_time:41417ms step_avg:151.71ms step:284/1480 train_time:41565ms step_avg:151.70ms step:285/1480 train_time:41715ms step_avg:151.69ms step:286/1480 train_time:41862ms step_avg:151.67ms step:287/1480 train_time:42011ms step_avg:151.66ms step:288/1480 train_time:42160ms step_avg:151.65ms step:289/1480 train_time:42308ms step_avg:151.64ms step:290/1480 train_time:42457ms step_avg:151.63ms step:291/1480 train_time:42605ms step_avg:151.62ms step:292/1480 train_time:42754ms step_avg:151.61ms step:293/1480 train_time:42902ms step_avg:151.60ms step:294/1480 train_time:43051ms step_avg:151.59ms step:295/1480 train_time:43199ms step_avg:151.57ms step:296/1480 train_time:43347ms step_avg:151.56ms step:297/1480 train_time:43496ms step_avg:151.55ms step:298/1480 train_time:43644ms step_avg:151.54ms step:299/1480 train_time:43793ms step_avg:151.53ms step:300/1480 train_time:43941ms step_avg:151.52ms step:301/1480 train_time:44090ms step_avg:151.51ms 
step:302/1480 train_time:44239ms step_avg:151.50ms step:303/1480 train_time:44387ms step_avg:151.49ms step:304/1480 train_time:44536ms step_avg:151.48ms step:305/1480 train_time:44684ms step_avg:151.47ms step:306/1480 train_time:44835ms step_avg:151.47ms step:307/1480 train_time:44982ms step_avg:151.46ms step:308/1480 train_time:45132ms step_avg:151.45ms step:309/1480 train_time:45279ms step_avg:151.44ms step:310/1480 train_time:45429ms step_avg:151.43ms step:311/1480 train_time:45578ms step_avg:151.42ms step:312/1480 train_time:45726ms step_avg:151.41ms step:313/1480 train_time:45876ms step_avg:151.41ms step:314/1480 train_time:46024ms step_avg:151.40ms step:315/1480 train_time:46172ms step_avg:151.38ms step:316/1480 train_time:46320ms step_avg:151.37ms step:317/1480 train_time:46469ms step_avg:151.36ms step:318/1480 train_time:46618ms step_avg:151.36ms step:319/1480 train_time:46766ms step_avg:151.35ms step:320/1480 train_time:46915ms step_avg:151.34ms step:321/1480 train_time:47063ms step_avg:151.33ms step:322/1480 train_time:47213ms step_avg:151.32ms step:323/1480 train_time:47361ms step_avg:151.31ms step:324/1480 train_time:47509ms step_avg:151.30ms step:325/1480 train_time:47657ms step_avg:151.29ms step:326/1480 train_time:47805ms step_avg:151.28ms step:327/1480 train_time:47955ms step_avg:151.28ms step:328/1480 train_time:48103ms step_avg:151.27ms step:329/1480 train_time:48252ms step_avg:151.26ms step:330/1480 train_time:48401ms step_avg:151.25ms step:331/1480 train_time:48551ms step_avg:151.25ms step:332/1480 train_time:48702ms step_avg:151.25ms step:333/1480 train_time:48854ms step_avg:151.25ms step:334/1480 train_time:49004ms step_avg:151.25ms step:335/1480 train_time:49156ms step_avg:151.25ms step:336/1480 train_time:49306ms step_avg:151.25ms step:337/1480 train_time:49457ms step_avg:151.25ms step:338/1480 train_time:49608ms step_avg:151.24ms step:339/1480 train_time:49759ms step_avg:151.24ms step:340/1480 train_time:49910ms step_avg:151.24ms 
step:341/1480 train_time:50061ms step_avg:151.24ms step:342/1480 train_time:50213ms step_avg:151.24ms step:343/1480 train_time:50363ms step_avg:151.24ms step:344/1480 train_time:50514ms step_avg:151.24ms step:345/1480 train_time:50664ms step_avg:151.23ms step:346/1480 train_time:50815ms step_avg:151.24ms step:347/1480 train_time:50966ms step_avg:151.23ms step:348/1480 train_time:51119ms step_avg:151.24ms step:349/1480 train_time:51270ms step_avg:151.24ms step:350/1480 train_time:51420ms step_avg:151.24ms step:351/1480 train_time:51571ms step_avg:151.23ms step:352/1480 train_time:51722ms step_avg:151.23ms step:353/1480 train_time:51873ms step_avg:151.23ms step:354/1480 train_time:52023ms step_avg:151.23ms step:355/1480 train_time:52174ms step_avg:151.23ms step:356/1480 train_time:52325ms step_avg:151.23ms step:357/1480 train_time:52475ms step_avg:151.23ms step:358/1480 train_time:52627ms step_avg:151.23ms step:359/1480 train_time:52777ms step_avg:151.22ms step:360/1480 train_time:52929ms step_avg:151.23ms step:361/1480 train_time:53080ms step_avg:151.23ms step:362/1480 train_time:53231ms step_avg:151.22ms step:363/1480 train_time:53382ms step_avg:151.22ms step:364/1480 train_time:53533ms step_avg:151.22ms step:365/1480 train_time:53684ms step_avg:151.22ms step:366/1480 train_time:53835ms step_avg:151.22ms step:367/1480 train_time:53986ms step_avg:151.22ms step:368/1480 train_time:54138ms step_avg:151.22ms step:369/1480 train_time:54287ms step_avg:151.22ms step:370/1480 train_time:54439ms step_avg:151.22ms step:371/1480 train_time:54589ms step_avg:151.22ms step:372/1480 train_time:54740ms step_avg:151.22ms step:373/1480 train_time:54891ms step_avg:151.21ms step:374/1480 train_time:55041ms step_avg:151.21ms step:375/1480 train_time:55192ms step_avg:151.21ms step:375/1480 val_loss:3.8097 train_time:55260ms step_avg:151.40ms step:376/1480 train_time:55354ms step_avg:151.24ms step:377/1480 train_time:55502ms step_avg:151.23ms step:378/1480 train_time:55652ms 
step_avg:151.23ms step:379/1480 train_time:55816ms step_avg:151.26ms step:380/1480 train_time:55952ms step_avg:151.22ms step:381/1480 train_time:56102ms step_avg:151.22ms step:382/1480 train_time:56252ms step_avg:151.22ms step:383/1480 train_time:56404ms step_avg:151.22ms step:384/1480 train_time:56554ms step_avg:151.21ms step:385/1480 train_time:56706ms step_avg:151.22ms step:386/1480 train_time:56855ms step_avg:151.21ms step:387/1480 train_time:57006ms step_avg:151.21ms step:388/1480 train_time:57157ms step_avg:151.21ms step:389/1480 train_time:57308ms step_avg:151.21ms step:390/1480 train_time:57459ms step_avg:151.21ms step:391/1480 train_time:57610ms step_avg:151.21ms step:392/1480 train_time:57763ms step_avg:151.21ms step:393/1480 train_time:57913ms step_avg:151.21ms step:394/1480 train_time:58064ms step_avg:151.21ms step:395/1480 train_time:58214ms step_avg:151.21ms step:396/1480 train_time:58365ms step_avg:151.21ms step:397/1480 train_time:58516ms step_avg:151.20ms step:398/1480 train_time:58668ms step_avg:151.21ms step:399/1480 train_time:58819ms step_avg:151.21ms step:400/1480 train_time:58970ms step_avg:151.21ms step:401/1480 train_time:59122ms step_avg:151.21ms step:402/1480 train_time:59273ms step_avg:151.21ms step:403/1480 train_time:59424ms step_avg:151.21ms step:404/1480 train_time:59574ms step_avg:151.20ms step:405/1480 train_time:59725ms step_avg:151.20ms step:406/1480 train_time:59876ms step_avg:151.20ms step:407/1480 train_time:60027ms step_avg:151.20ms step:408/1480 train_time:60177ms step_avg:151.20ms step:409/1480 train_time:60328ms step_avg:151.20ms step:410/1480 train_time:60479ms step_avg:151.20ms step:411/1480 train_time:60629ms step_avg:151.20ms step:412/1480 train_time:60780ms step_avg:151.19ms step:413/1480 train_time:60930ms step_avg:151.19ms step:414/1480 train_time:61081ms step_avg:151.19ms step:415/1480 train_time:61231ms step_avg:151.19ms step:416/1480 train_time:61382ms step_avg:151.19ms step:417/1480 train_time:61533ms 
step_avg:151.19ms step:418/1480 train_time:61684ms step_avg:151.19ms step:419/1480 train_time:61835ms step_avg:151.19ms step:420/1480 train_time:61987ms step_avg:151.19ms step:421/1480 train_time:62137ms step_avg:151.19ms step:422/1480 train_time:62289ms step_avg:151.19ms step:423/1480 train_time:62440ms step_avg:151.19ms step:424/1480 train_time:62591ms step_avg:151.19ms step:425/1480 train_time:62743ms step_avg:151.19ms step:426/1480 train_time:62894ms step_avg:151.19ms step:427/1480 train_time:63045ms step_avg:151.19ms step:428/1480 train_time:63195ms step_avg:151.19ms step:429/1480 train_time:63347ms step_avg:151.19ms step:430/1480 train_time:63497ms step_avg:151.18ms step:431/1480 train_time:63649ms step_avg:151.18ms step:432/1480 train_time:63799ms step_avg:151.18ms step:433/1480 train_time:63950ms step_avg:151.18ms step:434/1480 train_time:64101ms step_avg:151.18ms step:435/1480 train_time:64252ms step_avg:151.18ms step:436/1480 train_time:64403ms step_avg:151.18ms step:437/1480 train_time:64553ms step_avg:151.18ms step:438/1480 train_time:64705ms step_avg:151.18ms step:439/1480 train_time:64855ms step_avg:151.18ms step:440/1480 train_time:65007ms step_avg:151.18ms step:441/1480 train_time:65159ms step_avg:151.18ms step:442/1480 train_time:65312ms step_avg:151.19ms step:443/1480 train_time:65466ms step_avg:151.19ms step:444/1480 train_time:65620ms step_avg:151.20ms step:445/1480 train_time:65772ms step_avg:151.20ms step:446/1480 train_time:65926ms step_avg:151.21ms step:447/1480 train_time:66078ms step_avg:151.21ms step:448/1480 train_time:66230ms step_avg:151.21ms step:449/1480 train_time:66382ms step_avg:151.21ms step:450/1480 train_time:66536ms step_avg:151.22ms step:451/1480 train_time:66689ms step_avg:151.22ms step:452/1480 train_time:66843ms step_avg:151.23ms step:453/1480 train_time:66996ms step_avg:151.23ms step:454/1480 train_time:67148ms step_avg:151.24ms step:455/1480 train_time:67302ms step_avg:151.24ms step:456/1480 train_time:67454ms 
step_avg:151.24ms step:457/1480 train_time:67607ms step_avg:151.24ms step:458/1480 train_time:67758ms step_avg:151.25ms step:459/1480 train_time:67912ms step_avg:151.25ms step:460/1480 train_time:68066ms step_avg:151.26ms step:461/1480 train_time:68219ms step_avg:151.26ms step:462/1480 train_time:68373ms step_avg:151.27ms step:463/1480 train_time:68526ms step_avg:151.27ms step:464/1480 train_time:68678ms step_avg:151.27ms step:465/1480 train_time:68830ms step_avg:151.28ms step:466/1480 train_time:68982ms step_avg:151.28ms step:467/1480 train_time:69137ms step_avg:151.29ms step:468/1480 train_time:69291ms step_avg:151.29ms step:469/1480 train_time:69444ms step_avg:151.29ms step:470/1480 train_time:69596ms step_avg:151.29ms step:471/1480 train_time:69749ms step_avg:151.30ms step:472/1480 train_time:69903ms step_avg:151.30ms step:473/1480 train_time:70054ms step_avg:151.31ms step:474/1480 train_time:70208ms step_avg:151.31ms step:475/1480 train_time:70360ms step_avg:151.31ms step:476/1480 train_time:70513ms step_avg:151.32ms step:477/1480 train_time:70666ms step_avg:151.32ms step:478/1480 train_time:70819ms step_avg:151.32ms step:479/1480 train_time:70973ms step_avg:151.33ms step:480/1480 train_time:71126ms step_avg:151.33ms step:481/1480 train_time:71278ms step_avg:151.33ms step:482/1480 train_time:71430ms step_avg:151.34ms step:483/1480 train_time:71583ms step_avg:151.34ms step:484/1480 train_time:71738ms step_avg:151.35ms step:485/1480 train_time:71891ms step_avg:151.35ms step:486/1480 train_time:72045ms step_avg:151.35ms step:487/1480 train_time:72197ms step_avg:151.36ms step:488/1480 train_time:72349ms step_avg:151.36ms step:489/1480 train_time:72502ms step_avg:151.36ms step:490/1480 train_time:72654ms step_avg:151.36ms step:491/1480 train_time:72807ms step_avg:151.37ms step:492/1480 train_time:72959ms step_avg:151.37ms step:493/1480 train_time:73113ms step_avg:151.37ms step:494/1480 train_time:73267ms step_avg:151.38ms step:495/1480 train_time:73421ms 
step_avg:151.38ms step:496/1480 train_time:73574ms step_avg:151.39ms step:497/1480 train_time:73727ms step_avg:151.39ms step:498/1480 train_time:73880ms step_avg:151.39ms step:499/1480 train_time:74032ms step_avg:151.39ms step:500/1480 train_time:74184ms step_avg:151.40ms step:500/1480 val_loss:3.6870 train_time:74254ms step_avg:151.54ms step:501/1480 train_time:74349ms step_avg:151.42ms step:502/1480 train_time:74495ms step_avg:151.41ms step:503/1480 train_time:74648ms step_avg:151.42ms step:504/1480 train_time:74800ms step_avg:151.42ms step:505/1480 train_time:74952ms step_avg:151.42ms step:506/1480 train_time:75104ms step_avg:151.42ms step:507/1480 train_time:75257ms step_avg:151.42ms step:508/1480 train_time:75412ms step_avg:151.43ms step:509/1480 train_time:75568ms step_avg:151.44ms step:510/1480 train_time:75721ms step_avg:151.44ms step:511/1480 train_time:75873ms step_avg:151.44ms step:512/1480 train_time:76026ms step_avg:151.45ms step:513/1480 train_time:76178ms step_avg:151.45ms step:514/1480 train_time:76332ms step_avg:151.45ms step:515/1480 train_time:76486ms step_avg:151.46ms step:516/1480 train_time:76640ms step_avg:151.46ms step:517/1480 train_time:76794ms step_avg:151.47ms step:518/1480 train_time:76946ms step_avg:151.47ms step:519/1480 train_time:77099ms step_avg:151.47ms step:520/1480 train_time:77252ms step_avg:151.47ms step:521/1480 train_time:77406ms step_avg:151.48ms step:522/1480 train_time:77559ms step_avg:151.48ms step:523/1480 train_time:77713ms step_avg:151.49ms step:524/1480 train_time:77868ms step_avg:151.49ms step:525/1480 train_time:78020ms step_avg:151.50ms step:526/1480 train_time:78173ms step_avg:151.50ms step:527/1480 train_time:78324ms step_avg:151.50ms step:528/1480 train_time:78475ms step_avg:151.50ms step:529/1480 train_time:78630ms step_avg:151.50ms step:530/1480 train_time:78785ms step_avg:151.51ms step:531/1480 train_time:78939ms step_avg:151.51ms step:532/1480 train_time:79090ms step_avg:151.51ms step:533/1480 
train_time:79243ms step_avg:151.52ms step:534/1480 train_time:79397ms step_avg:151.52ms step:535/1480 train_time:79549ms step_avg:151.52ms step:536/1480 train_time:79702ms step_avg:151.52ms step:537/1480 train_time:79856ms step_avg:151.53ms step:538/1480 train_time:80010ms step_avg:151.53ms step:539/1480 train_time:80163ms step_avg:151.54ms step:540/1480 train_time:80318ms step_avg:151.54ms step:541/1480 train_time:80470ms step_avg:151.54ms step:542/1480 train_time:80622ms step_avg:151.55ms step:543/1480 train_time:80775ms step_avg:151.55ms step:544/1480 train_time:80928ms step_avg:151.55ms step:545/1480 train_time:81081ms step_avg:151.55ms step:546/1480 train_time:81236ms step_avg:151.56ms step:547/1480 train_time:81388ms step_avg:151.56ms step:548/1480 train_time:81541ms step_avg:151.56ms step:549/1480 train_time:81693ms step_avg:151.56ms step:550/1480 train_time:81846ms step_avg:151.57ms step:551/1480 train_time:82000ms step_avg:151.57ms step:552/1480 train_time:82155ms step_avg:151.58ms step:553/1480 train_time:82311ms step_avg:151.59ms step:554/1480 train_time:82466ms step_avg:151.59ms step:555/1480 train_time:82621ms step_avg:151.60ms step:556/1480 train_time:82775ms step_avg:151.60ms step:557/1480 train_time:82929ms step_avg:151.61ms step:558/1480 train_time:83084ms step_avg:151.61ms step:559/1480 train_time:83239ms step_avg:151.62ms step:560/1480 train_time:83394ms step_avg:151.63ms step:561/1480 train_time:83548ms step_avg:151.63ms step:562/1480 train_time:83703ms step_avg:151.64ms step:563/1480 train_time:83858ms step_avg:151.64ms step:564/1480 train_time:84014ms step_avg:151.65ms step:565/1480 train_time:84169ms step_avg:151.66ms step:566/1480 train_time:84323ms step_avg:151.66ms step:567/1480 train_time:84477ms step_avg:151.66ms step:568/1480 train_time:84633ms step_avg:151.67ms step:569/1480 train_time:84799ms step_avg:151.70ms step:570/1480 train_time:84943ms step_avg:151.68ms step:571/1480 train_time:85098ms step_avg:151.69ms step:572/1480 
train_time:85252ms step_avg:151.69ms step:573/1480 train_time:85408ms step_avg:151.70ms step:574/1480 train_time:85562ms step_avg:151.71ms step:575/1480 train_time:85717ms step_avg:151.71ms step:576/1480 train_time:85872ms step_avg:151.72ms step:577/1480 train_time:86027ms step_avg:151.72ms step:578/1480 train_time:86181ms step_avg:151.73ms step:579/1480 train_time:86335ms step_avg:151.73ms step:580/1480 train_time:86490ms step_avg:151.74ms step:581/1480 train_time:86644ms step_avg:151.74ms step:582/1480 train_time:86800ms step_avg:151.75ms step:583/1480 train_time:86953ms step_avg:151.75ms step:584/1480 train_time:87109ms step_avg:151.76ms step:585/1480 train_time:87264ms step_avg:151.76ms step:586/1480 train_time:87419ms step_avg:151.77ms step:587/1480 train_time:87573ms step_avg:151.77ms step:588/1480 train_time:87728ms step_avg:151.78ms step:589/1480 train_time:87882ms step_avg:151.78ms step:590/1480 train_time:88038ms step_avg:151.79ms step:591/1480 train_time:88191ms step_avg:151.79ms step:592/1480 train_time:88347ms step_avg:151.80ms step:593/1480 train_time:88502ms step_avg:151.80ms step:594/1480 train_time:88658ms step_avg:151.81ms step:595/1480 train_time:88814ms step_avg:151.82ms step:596/1480 train_time:88971ms step_avg:151.83ms step:597/1480 train_time:89125ms step_avg:151.83ms step:598/1480 train_time:89279ms step_avg:151.84ms step:599/1480 train_time:89435ms step_avg:151.84ms step:600/1480 train_time:89589ms step_avg:151.85ms step:601/1480 train_time:89743ms step_avg:151.85ms step:602/1480 train_time:89898ms step_avg:151.85ms step:603/1480 train_time:90054ms step_avg:151.86ms step:604/1480 train_time:90208ms step_avg:151.86ms step:605/1480 train_time:90364ms step_avg:151.87ms step:606/1480 train_time:90519ms step_avg:151.88ms step:607/1480 train_time:90674ms step_avg:151.88ms step:608/1480 train_time:90829ms step_avg:151.89ms step:609/1480 train_time:90983ms step_avg:151.89ms step:610/1480 train_time:91138ms step_avg:151.90ms step:611/1480 
train_time:91292ms step_avg:151.90ms step:612/1480 train_time:91447ms step_avg:151.90ms step:613/1480 train_time:91602ms step_avg:151.91ms step:614/1480 train_time:91757ms step_avg:151.92ms step:615/1480 train_time:91912ms step_avg:151.92ms step:616/1480 train_time:92066ms step_avg:151.92ms step:617/1480 train_time:92222ms step_avg:151.93ms step:618/1480 train_time:92375ms step_avg:151.93ms step:619/1480 train_time:92531ms step_avg:151.94ms step:620/1480 train_time:92686ms step_avg:151.94ms step:621/1480 train_time:92842ms step_avg:151.95ms step:622/1480 train_time:92997ms step_avg:151.96ms step:623/1480 train_time:93152ms step_avg:151.96ms step:624/1480 train_time:93307ms step_avg:151.97ms step:625/1480 train_time:93461ms step_avg:151.97ms step:625/1480 val_loss:3.6051 train_time:93532ms step_avg:152.08ms step:626/1480 train_time:93624ms step_avg:151.99ms step:627/1480 train_time:93778ms step_avg:151.99ms step:628/1480 train_time:93933ms step_avg:151.99ms step:629/1480 train_time:94087ms step_avg:152.00ms step:630/1480 train_time:94241ms step_avg:152.00ms step:631/1480 train_time:94396ms step_avg:152.01ms step:632/1480 train_time:94550ms step_avg:152.01ms step:633/1480 train_time:94706ms step_avg:152.02ms step:634/1480 train_time:94860ms step_avg:152.02ms step:635/1480 train_time:95014ms step_avg:152.02ms step:636/1480 train_time:95168ms step_avg:152.03ms step:637/1480 train_time:95323ms step_avg:152.03ms step:638/1480 train_time:95478ms step_avg:152.04ms step:639/1480 train_time:95632ms step_avg:152.04ms step:640/1480 train_time:95787ms step_avg:152.04ms step:641/1480 train_time:95941ms step_avg:152.05ms step:642/1480 train_time:96096ms step_avg:152.05ms step:643/1480 train_time:96251ms step_avg:152.05ms step:644/1480 train_time:96406ms step_avg:152.06ms step:645/1480 train_time:96563ms step_avg:152.07ms step:646/1480 train_time:96719ms step_avg:152.07ms step:647/1480 train_time:96874ms step_avg:152.08ms step:648/1480 train_time:97029ms step_avg:152.08ms 
step:649/1480 train_time:97184ms step_avg:152.09ms step:650/1480 train_time:97339ms step_avg:152.09ms step:651/1480 train_time:97495ms step_avg:152.10ms step:652/1480 train_time:97650ms step_avg:152.10ms step:653/1480 train_time:97804ms step_avg:152.11ms step:654/1480 train_time:97960ms step_avg:152.11ms step:655/1480 train_time:98115ms step_avg:152.12ms step:656/1480 train_time:98269ms step_avg:152.12ms step:657/1480 train_time:98424ms step_avg:152.12ms step:658/1480 train_time:98579ms step_avg:152.13ms step:659/1480 train_time:98735ms step_avg:152.13ms step:660/1480 train_time:98892ms step_avg:152.14ms step:661/1480 train_time:99048ms step_avg:152.15ms step:662/1480 train_time:99204ms step_avg:152.15ms step:663/1480 train_time:99360ms step_avg:152.16ms step:664/1480 train_time:99517ms step_avg:152.17ms step:665/1480 train_time:99673ms step_avg:152.17ms step:666/1480 train_time:99829ms step_avg:152.18ms step:667/1480 train_time:99985ms step_avg:152.18ms step:668/1480 train_time:100143ms step_avg:152.19ms step:669/1480 train_time:100300ms step_avg:152.20ms step:670/1480 train_time:100457ms step_avg:152.21ms step:671/1480 train_time:100612ms step_avg:152.21ms step:672/1480 train_time:100768ms step_avg:152.22ms step:673/1480 train_time:100924ms step_avg:152.22ms step:674/1480 train_time:101081ms step_avg:152.23ms step:675/1480 train_time:101239ms step_avg:152.24ms step:676/1480 train_time:101397ms step_avg:152.25ms step:677/1480 train_time:101554ms step_avg:152.26ms step:678/1480 train_time:101710ms step_avg:152.26ms step:679/1480 train_time:101865ms step_avg:152.27ms step:680/1480 train_time:102024ms step_avg:152.27ms step:681/1480 train_time:102180ms step_avg:152.28ms step:682/1480 train_time:102336ms step_avg:152.29ms step:683/1480 train_time:102492ms step_avg:152.29ms step:684/1480 train_time:102649ms step_avg:152.30ms step:685/1480 train_time:102806ms step_avg:152.31ms step:686/1480 train_time:102962ms step_avg:152.31ms step:687/1480 train_time:103118ms 
step_avg:152.32ms step:688/1480 train_time:103275ms step_avg:152.32ms step:689/1480 train_time:103432ms step_avg:152.33ms step:690/1480 train_time:103589ms step_avg:152.34ms step:691/1480 train_time:103745ms step_avg:152.34ms step:692/1480 train_time:103902ms step_avg:152.35ms step:693/1480 train_time:104060ms step_avg:152.36ms step:694/1480 train_time:104217ms step_avg:152.36ms step:695/1480 train_time:104372ms step_avg:152.37ms step:696/1480 train_time:104528ms step_avg:152.37ms step:697/1480 train_time:104685ms step_avg:152.38ms step:698/1480 train_time:104840ms step_avg:152.38ms step:699/1480 train_time:104997ms step_avg:152.39ms step:700/1480 train_time:105153ms step_avg:152.40ms step:701/1480 train_time:105309ms step_avg:152.40ms step:702/1480 train_time:105465ms step_avg:152.41ms step:703/1480 train_time:105622ms step_avg:152.41ms step:704/1480 train_time:105779ms step_avg:152.42ms step:705/1480 train_time:105934ms step_avg:152.42ms step:706/1480 train_time:106090ms step_avg:152.43ms step:707/1480 train_time:106247ms step_avg:152.43ms step:708/1480 train_time:106403ms step_avg:152.44ms step:709/1480 train_time:106559ms step_avg:152.45ms step:710/1480 train_time:106715ms step_avg:152.45ms step:711/1480 train_time:106870ms step_avg:152.45ms step:712/1480 train_time:107027ms step_avg:152.46ms step:713/1480 train_time:107185ms step_avg:152.47ms step:714/1480 train_time:107341ms step_avg:152.47ms step:715/1480 train_time:107496ms step_avg:152.48ms step:716/1480 train_time:107652ms step_avg:152.48ms step:717/1480 train_time:107808ms step_avg:152.49ms step:718/1480 train_time:107964ms step_avg:152.49ms step:719/1480 train_time:108121ms step_avg:152.50ms step:720/1480 train_time:108278ms step_avg:152.50ms step:721/1480 train_time:108435ms step_avg:152.51ms step:722/1480 train_time:108591ms step_avg:152.52ms step:723/1480 train_time:108747ms step_avg:152.52ms step:724/1480 train_time:108904ms step_avg:152.53ms step:725/1480 train_time:109061ms step_avg:152.53ms 
step:726/1480 train_time:109218ms step_avg:152.54ms step:727/1480 train_time:109375ms step_avg:152.54ms step:728/1480 train_time:109531ms step_avg:152.55ms step:729/1480 train_time:109688ms step_avg:152.56ms step:730/1480 train_time:109844ms step_avg:152.56ms step:731/1480 train_time:110001ms step_avg:152.57ms step:732/1480 train_time:110157ms step_avg:152.57ms step:733/1480 train_time:110314ms step_avg:152.58ms step:734/1480 train_time:110469ms step_avg:152.58ms step:735/1480 train_time:110626ms step_avg:152.59ms step:736/1480 train_time:110782ms step_avg:152.59ms step:737/1480 train_time:110938ms step_avg:152.60ms step:738/1480 train_time:111093ms step_avg:152.60ms step:739/1480 train_time:111248ms step_avg:152.60ms step:740/1480 train_time:111407ms step_avg:152.61ms step:741/1480 train_time:111564ms step_avg:152.62ms step:742/1480 train_time:111720ms step_avg:152.62ms step:743/1480 train_time:111876ms step_avg:152.63ms step:744/1480 train_time:112032ms step_avg:152.63ms step:745/1480 train_time:112188ms step_avg:152.64ms step:746/1480 train_time:112344ms step_avg:152.64ms step:747/1480 train_time:112501ms step_avg:152.65ms step:748/1480 train_time:112660ms step_avg:152.66ms step:749/1480 train_time:112819ms step_avg:152.66ms step:750/1480 train_time:112975ms step_avg:152.67ms step:750/1480 val_loss:3.5482 train_time:113048ms step_avg:152.77ms step:751/1480 train_time:113138ms step_avg:152.68ms step:752/1480 train_time:113297ms step_avg:152.69ms step:753/1480 train_time:113453ms step_avg:152.70ms step:754/1480 train_time:113609ms step_avg:152.70ms step:755/1480 train_time:113764ms step_avg:152.70ms step:756/1480 train_time:113921ms step_avg:152.71ms step:757/1480 train_time:114079ms step_avg:152.72ms step:758/1480 train_time:114236ms step_avg:152.72ms step:759/1480 train_time:114403ms step_avg:152.74ms step:760/1480 train_time:114551ms step_avg:152.73ms step:761/1480 train_time:114707ms step_avg:152.74ms step:762/1480 train_time:114864ms step_avg:152.74ms 
step:763/1480 train_time:115021ms step_avg:152.75ms step:764/1480 train_time:115178ms step_avg:152.76ms step:765/1480 train_time:115335ms step_avg:152.76ms step:766/1480 train_time:115492ms step_avg:152.77ms step:767/1480 train_time:115647ms step_avg:152.77ms step:768/1480 train_time:115805ms step_avg:152.78ms step:769/1480 train_time:115963ms step_avg:152.78ms step:770/1480 train_time:116120ms step_avg:152.79ms step:771/1480 train_time:116279ms step_avg:152.80ms step:772/1480 train_time:116437ms step_avg:152.80ms step:773/1480 train_time:116594ms step_avg:152.81ms step:774/1480 train_time:116751ms step_avg:152.82ms step:775/1480 train_time:116907ms step_avg:152.82ms step:776/1480 train_time:117066ms step_avg:152.83ms step:777/1480 train_time:117226ms step_avg:152.84ms step:778/1480 train_time:117384ms step_avg:152.84ms step:779/1480 train_time:117540ms step_avg:152.85ms step:780/1480 train_time:117700ms step_avg:152.86ms step:781/1480 train_time:117857ms step_avg:152.86ms step:782/1480 train_time:118015ms step_avg:152.87ms step:783/1480 train_time:118174ms step_avg:152.88ms step:784/1480 train_time:118332ms step_avg:152.88ms step:785/1480 train_time:118490ms step_avg:152.89ms step:786/1480 train_time:118646ms step_avg:152.89ms step:787/1480 train_time:118803ms step_avg:152.90ms step:788/1480 train_time:118962ms step_avg:152.91ms step:789/1480 train_time:119119ms step_avg:152.91ms step:790/1480 train_time:119277ms step_avg:152.92ms step:791/1480 train_time:119438ms step_avg:152.93ms step:792/1480 train_time:119598ms step_avg:152.94ms step:793/1480 train_time:119756ms step_avg:152.94ms step:794/1480 train_time:119914ms step_avg:152.95ms step:795/1480 train_time:120074ms step_avg:152.96ms step:796/1480 train_time:120235ms step_avg:152.97ms step:797/1480 train_time:120394ms step_avg:152.98ms step:798/1480 train_time:120552ms step_avg:152.98ms step:799/1480 train_time:120712ms step_avg:152.99ms step:800/1480 train_time:120870ms step_avg:153.00ms step:801/1480 
train_time:121028ms step_avg:153.01ms step:802/1480 train_time:121187ms step_avg:153.01ms step:803/1480 train_time:121344ms step_avg:153.02ms step:804/1480 train_time:121501ms step_avg:153.02ms step:805/1480 train_time:121661ms step_avg:153.03ms step:806/1480 train_time:121818ms step_avg:153.04ms step:807/1480 train_time:121974ms step_avg:153.04ms step:808/1480 train_time:122132ms step_avg:153.05ms step:809/1480 train_time:122290ms step_avg:153.05ms step:810/1480 train_time:122446ms step_avg:153.06ms step:811/1480 train_time:122605ms step_avg:153.06ms step:812/1480 train_time:122762ms step_avg:153.07ms step:813/1480 train_time:122918ms step_avg:153.07ms step:814/1480 train_time:123076ms step_avg:153.08ms step:815/1480 train_time:123233ms step_avg:153.08ms step:816/1480 train_time:123392ms step_avg:153.09ms step:817/1480 train_time:123550ms step_avg:153.10ms step:818/1480 train_time:123707ms step_avg:153.10ms step:819/1480 train_time:123865ms step_avg:153.11ms step:820/1480 train_time:124024ms step_avg:153.12ms step:821/1480 train_time:124181ms step_avg:153.12ms step:822/1480 train_time:124339ms step_avg:153.13ms step:823/1480 train_time:124498ms step_avg:153.13ms step:824/1480 train_time:124655ms step_avg:153.14ms step:825/1480 train_time:124814ms step_avg:153.15ms step:826/1480 train_time:124976ms step_avg:153.16ms step:827/1480 train_time:125136ms step_avg:153.17ms step:828/1480 train_time:125295ms step_avg:153.17ms step:829/1480 train_time:125454ms step_avg:153.18ms step:830/1480 train_time:125613ms step_avg:153.19ms step:831/1480 train_time:125771ms step_avg:153.19ms step:832/1480 train_time:125931ms step_avg:153.20ms step:833/1480 train_time:126087ms step_avg:153.20ms step:834/1480 train_time:126245ms step_avg:153.21ms step:835/1480 train_time:126403ms step_avg:153.22ms step:836/1480 train_time:126563ms step_avg:153.22ms step:837/1480 train_time:126721ms step_avg:153.23ms step:838/1480 train_time:126878ms step_avg:153.23ms step:839/1480 train_time:127037ms 
step_avg:153.24ms step:840/1480 train_time:127195ms step_avg:153.25ms step:841/1480 train_time:127351ms step_avg:153.25ms step:842/1480 train_time:127508ms step_avg:153.25ms step:843/1480 train_time:127665ms step_avg:153.26ms step:844/1480 train_time:127821ms step_avg:153.26ms step:845/1480 train_time:127980ms step_avg:153.27ms step:846/1480 train_time:128138ms step_avg:153.28ms step:847/1480 train_time:128297ms step_avg:153.28ms step:848/1480 train_time:128456ms step_avg:153.29ms step:849/1480 train_time:128613ms step_avg:153.29ms step:850/1480 train_time:128770ms step_avg:153.30ms step:851/1480 train_time:128930ms step_avg:153.31ms step:852/1480 train_time:129089ms step_avg:153.31ms step:853/1480 train_time:129246ms step_avg:153.32ms step:854/1480 train_time:129405ms step_avg:153.32ms step:855/1480 train_time:129562ms step_avg:153.33ms step:856/1480 train_time:129720ms step_avg:153.33ms step:857/1480 train_time:129879ms step_avg:153.34ms step:858/1480 train_time:130039ms step_avg:153.35ms step:859/1480 train_time:130199ms step_avg:153.36ms step:860/1480 train_time:130357ms step_avg:153.36ms step:861/1480 train_time:130516ms step_avg:153.37ms step:862/1480 train_time:130678ms step_avg:153.38ms step:863/1480 train_time:130835ms step_avg:153.38ms step:864/1480 train_time:130995ms step_avg:153.39ms step:865/1480 train_time:131152ms step_avg:153.39ms step:866/1480 train_time:131310ms step_avg:153.40ms step:867/1480 train_time:131469ms step_avg:153.41ms step:868/1480 train_time:131626ms step_avg:153.41ms step:869/1480 train_time:131784ms step_avg:153.42ms step:870/1480 train_time:131942ms step_avg:153.42ms step:871/1480 train_time:132100ms step_avg:153.43ms step:872/1480 train_time:132258ms step_avg:153.43ms step:873/1480 train_time:132415ms step_avg:153.44ms step:874/1480 train_time:132575ms step_avg:153.44ms step:875/1480 train_time:132736ms step_avg:153.45ms step:875/1480 val_loss:3.5050 train_time:132809ms step_avg:153.54ms step:876/1480 train_time:132900ms 
step_avg:153.46ms step:877/1480 train_time:133054ms step_avg:153.46ms step:878/1480 train_time:133211ms step_avg:153.47ms step:879/1480 train_time:133370ms step_avg:153.48ms step:880/1480 train_time:133528ms step_avg:153.48ms step:881/1480 train_time:133685ms step_avg:153.48ms step:882/1480 train_time:133846ms step_avg:153.49ms step:883/1480 train_time:134006ms step_avg:153.50ms step:884/1480 train_time:134168ms step_avg:153.51ms step:885/1480 train_time:134329ms step_avg:153.52ms step:886/1480 train_time:134488ms step_avg:153.53ms step:887/1480 train_time:134648ms step_avg:153.53ms step:888/1480 train_time:134811ms step_avg:153.54ms step:889/1480 train_time:134971ms step_avg:153.55ms step:890/1480 train_time:135129ms step_avg:153.56ms step:891/1480 train_time:135290ms step_avg:153.56ms step:892/1480 train_time:135449ms step_avg:153.57ms step:893/1480 train_time:135607ms step_avg:153.58ms step:894/1480 train_time:135767ms step_avg:153.58ms step:895/1480 train_time:135929ms step_avg:153.59ms step:896/1480 train_time:136088ms step_avg:153.60ms step:897/1480 train_time:136250ms step_avg:153.61ms step:898/1480 train_time:136408ms step_avg:153.61ms step:899/1480 train_time:136567ms step_avg:153.62ms step:900/1480 train_time:136726ms step_avg:153.62ms step:901/1480 train_time:136887ms step_avg:153.63ms step:902/1480 train_time:137045ms step_avg:153.64ms step:903/1480 train_time:137207ms step_avg:153.65ms step:904/1480 train_time:137366ms step_avg:153.65ms step:905/1480 train_time:137525ms step_avg:153.66ms step:906/1480 train_time:137685ms step_avg:153.67ms step:907/1480 train_time:137846ms step_avg:153.67ms step:908/1480 train_time:138005ms step_avg:153.68ms step:909/1480 train_time:138166ms step_avg:153.69ms step:910/1480 train_time:138331ms step_avg:153.70ms step:911/1480 train_time:138489ms step_avg:153.71ms step:912/1480 train_time:138650ms step_avg:153.71ms step:913/1480 train_time:138811ms step_avg:153.72ms step:914/1480 train_time:138971ms step_avg:153.73ms 
step:915/1480 train_time:139133ms step_avg:153.74ms step:916/1480 train_time:139292ms step_avg:153.74ms step:917/1480 train_time:139451ms step_avg:153.75ms step:918/1480 train_time:139612ms step_avg:153.76ms step:919/1480 train_time:139774ms step_avg:153.77ms step:920/1480 train_time:139933ms step_avg:153.77ms step:921/1480 train_time:140091ms step_avg:153.78ms step:922/1480 train_time:140254ms step_avg:153.79ms step:923/1480 train_time:140411ms step_avg:153.79ms step:924/1480 train_time:140570ms step_avg:153.80ms step:925/1480 train_time:140730ms step_avg:153.80ms step:926/1480 train_time:140890ms step_avg:153.81ms step:927/1480 train_time:141049ms step_avg:153.82ms step:928/1480 train_time:141208ms step_avg:153.82ms step:929/1480 train_time:141368ms step_avg:153.83ms step:930/1480 train_time:141528ms step_avg:153.83ms step:931/1480 train_time:141687ms step_avg:153.84ms step:932/1480 train_time:141848ms step_avg:153.85ms step:933/1480 train_time:142007ms step_avg:153.85ms step:934/1480 train_time:142166ms step_avg:153.86ms step:935/1480 train_time:142328ms step_avg:153.87ms step:936/1480 train_time:142486ms step_avg:153.87ms step:937/1480 train_time:142649ms step_avg:153.88ms step:938/1480 train_time:142806ms step_avg:153.89ms step:939/1480 train_time:142969ms step_avg:153.90ms step:940/1480 train_time:143132ms step_avg:153.90ms step:941/1480 train_time:143289ms step_avg:153.91ms step:942/1480 train_time:143449ms step_avg:153.92ms step:943/1480 train_time:143609ms step_avg:153.92ms step:944/1480 train_time:143772ms step_avg:153.93ms step:945/1480 train_time:143930ms step_avg:153.94ms step:946/1480 train_time:144093ms step_avg:153.95ms step:947/1480 train_time:144253ms step_avg:153.95ms step:948/1480 train_time:144412ms step_avg:153.96ms step:949/1480 train_time:144581ms step_avg:153.97ms step:950/1480 train_time:144731ms step_avg:153.97ms step:951/1480 train_time:144892ms step_avg:153.98ms step:952/1480 train_time:145051ms step_avg:153.98ms step:953/1480 
train_time:145213ms step_avg:153.99ms step:954/1480 train_time:145374ms step_avg:154.00ms step:955/1480 train_time:145531ms step_avg:154.00ms step:956/1480 train_time:145691ms step_avg:154.01ms step:957/1480 train_time:145852ms step_avg:154.01ms step:958/1480 train_time:146013ms step_avg:154.02ms step:959/1480 train_time:146172ms step_avg:154.03ms step:960/1480 train_time:146332ms step_avg:154.03ms step:961/1480 train_time:146491ms step_avg:154.04ms step:962/1480 train_time:146649ms step_avg:154.04ms step:963/1480 train_time:146809ms step_avg:154.05ms step:964/1480 train_time:146970ms step_avg:154.06ms step:965/1480 train_time:147129ms step_avg:154.06ms step:966/1480 train_time:147289ms step_avg:154.07ms step:967/1480 train_time:147447ms step_avg:154.07ms step:968/1480 train_time:147607ms step_avg:154.08ms step:969/1480 train_time:147767ms step_avg:154.08ms step:970/1480 train_time:147926ms step_avg:154.09ms step:971/1480 train_time:148085ms step_avg:154.09ms step:972/1480 train_time:148244ms step_avg:154.10ms step:973/1480 train_time:148403ms step_avg:154.10ms step:974/1480 train_time:148565ms step_avg:154.11ms step:975/1480 train_time:148726ms step_avg:154.12ms step:976/1480 train_time:148886ms step_avg:154.13ms step:977/1480 train_time:149046ms step_avg:154.13ms step:978/1480 train_time:149206ms step_avg:154.14ms step:979/1480 train_time:149367ms step_avg:154.15ms step:980/1480 train_time:149528ms step_avg:154.15ms step:981/1480 train_time:149691ms step_avg:154.16ms step:982/1480 train_time:149849ms step_avg:154.17ms step:983/1480 train_time:150007ms step_avg:154.17ms step:984/1480 train_time:150166ms step_avg:154.17ms step:985/1480 train_time:150329ms step_avg:154.18ms step:986/1480 train_time:150488ms step_avg:154.19ms step:987/1480 train_time:150647ms step_avg:154.19ms step:988/1480 train_time:150806ms step_avg:154.20ms step:989/1480 train_time:150966ms step_avg:154.20ms step:990/1480 train_time:151129ms step_avg:154.21ms step:991/1480 train_time:151290ms 
step_avg:154.22ms step:992/1480 train_time:151456ms step_avg:154.23ms step:993/1480 train_time:151623ms step_avg:154.25ms step:994/1480 train_time:151784ms step_avg:154.25ms step:995/1480 train_time:151942ms step_avg:154.26ms step:996/1480 train_time:152100ms step_avg:154.26ms step:997/1480 train_time:152260ms step_avg:154.27ms step:998/1480 train_time:152419ms step_avg:154.27ms step:999/1480 train_time:152579ms step_avg:154.28ms step:1000/1480 train_time:152740ms step_avg:154.28ms step:1000/1480 val_loss:3.4408 train_time:152813ms step_avg:154.36ms step:1001/1480 train_time:152907ms step_avg:154.30ms step:1002/1480 train_time:153065ms step_avg:154.30ms step:1003/1480 train_time:153228ms step_avg:154.31ms step:1004/1480 train_time:153389ms step_avg:154.32ms step:1005/1480 train_time:153548ms step_avg:154.32ms step:1006/1480 train_time:153708ms step_avg:154.33ms step:1007/1480 train_time:153867ms step_avg:154.33ms step:1008/1480 train_time:154028ms step_avg:154.34ms step:1009/1480 train_time:154193ms step_avg:154.35ms step:1010/1480 train_time:154353ms step_avg:154.35ms step:1011/1480 train_time:154511ms step_avg:154.36ms step:1012/1480 train_time:154668ms step_avg:154.36ms step:1013/1480 train_time:154829ms step_avg:154.37ms step:1014/1480 train_time:154989ms step_avg:154.37ms step:1015/1480 train_time:155150ms step_avg:154.38ms step:1016/1480 train_time:155310ms step_avg:154.38ms step:1017/1480 train_time:155470ms step_avg:154.39ms step:1018/1480 train_time:155630ms step_avg:154.39ms step:1019/1480 train_time:155791ms step_avg:154.40ms step:1020/1480 train_time:155951ms step_avg:154.41ms step:1021/1480 train_time:156110ms step_avg:154.41ms step:1022/1480 train_time:156269ms step_avg:154.42ms step:1023/1480 train_time:156430ms step_avg:154.42ms step:1024/1480 train_time:156590ms step_avg:154.43ms step:1025/1480 train_time:156751ms step_avg:154.43ms step:1026/1480 train_time:156910ms step_avg:154.44ms step:1027/1480 train_time:157069ms step_avg:154.44ms 
step:1028/1480 train_time:157229ms step_avg:154.45ms step:1029/1480 train_time:157393ms step_avg:154.46ms step:1030/1480 train_time:157552ms step_avg:154.46ms step:1031/1480 train_time:157710ms step_avg:154.47ms step:1032/1480 train_time:157873ms step_avg:154.47ms step:1033/1480 train_time:158034ms step_avg:154.48ms step:1034/1480 train_time:158194ms step_avg:154.49ms step:1035/1480 train_time:158354ms step_avg:154.49ms step:1036/1480 train_time:158512ms step_avg:154.50ms step:1037/1480 train_time:158672ms step_avg:154.50ms step:1038/1480 train_time:158831ms step_avg:154.50ms step:1039/1480 train_time:158992ms step_avg:154.51ms step:1040/1480 train_time:159151ms step_avg:154.52ms step:1041/1480 train_time:159312ms step_avg:154.52ms step:1042/1480 train_time:159470ms step_avg:154.53ms step:1043/1480 train_time:159628ms step_avg:154.53ms step:1044/1480 train_time:159788ms step_avg:154.53ms step:1045/1480 train_time:159948ms step_avg:154.54ms step:1046/1480 train_time:160107ms step_avg:154.54ms step:1047/1480 train_time:160266ms step_avg:154.55ms step:1048/1480 train_time:160427ms step_avg:154.55ms step:1049/1480 train_time:160588ms step_avg:154.56ms step:1050/1480 train_time:160748ms step_avg:154.57ms step:1051/1480 train_time:160910ms step_avg:154.57ms step:1052/1480 train_time:161069ms step_avg:154.58ms step:1053/1480 train_time:161229ms step_avg:154.58ms step:1054/1480 train_time:161389ms step_avg:154.59ms step:1055/1480 train_time:161548ms step_avg:154.59ms step:1056/1480 train_time:161707ms step_avg:154.60ms step:1057/1480 train_time:161867ms step_avg:154.60ms step:1058/1480 train_time:162029ms step_avg:154.61ms step:1059/1480 train_time:162191ms step_avg:154.62ms step:1060/1480 train_time:162352ms step_avg:154.62ms step:1061/1480 train_time:162510ms step_avg:154.62ms step:1062/1480 train_time:162668ms step_avg:154.63ms step:1063/1480 train_time:162828ms step_avg:154.63ms step:1064/1480 train_time:162986ms step_avg:154.64ms step:1065/1480 train_time:163146ms 
step_avg:154.64ms step:1066/1480 train_time:163308ms step_avg:154.65ms step:1067/1480 train_time:163469ms step_avg:154.65ms step:1068/1480 train_time:163629ms step_avg:154.66ms step:1069/1480 train_time:163793ms step_avg:154.67ms step:1070/1480 train_time:163953ms step_avg:154.67ms step:1071/1480 train_time:164119ms step_avg:154.68ms step:1072/1480 train_time:164280ms step_avg:154.69ms step:1073/1480 train_time:164439ms step_avg:154.69ms step:1074/1480 train_time:164600ms step_avg:154.70ms step:1075/1480 train_time:164761ms step_avg:154.70ms step:1076/1480 train_time:164922ms step_avg:154.71ms step:1077/1480 train_time:165082ms step_avg:154.72ms step:1078/1480 train_time:165247ms step_avg:154.73ms step:1079/1480 train_time:165410ms step_avg:154.73ms step:1080/1480 train_time:165570ms step_avg:154.74ms step:1081/1480 train_time:165730ms step_avg:154.74ms step:1082/1480 train_time:165890ms step_avg:154.75ms step:1083/1480 train_time:166048ms step_avg:154.75ms step:1084/1480 train_time:166209ms step_avg:154.76ms step:1085/1480 train_time:166368ms step_avg:154.76ms step:1086/1480 train_time:166529ms step_avg:154.77ms step:1087/1480 train_time:166688ms step_avg:154.77ms step:1088/1480 train_time:166848ms step_avg:154.78ms step:1089/1480 train_time:167011ms step_avg:154.78ms step:1090/1480 train_time:167175ms step_avg:154.79ms step:1091/1480 train_time:167334ms step_avg:154.80ms step:1092/1480 train_time:167496ms step_avg:154.80ms step:1093/1480 train_time:167659ms step_avg:154.81ms step:1094/1480 train_time:167820ms step_avg:154.82ms step:1095/1480 train_time:167981ms step_avg:154.82ms step:1096/1480 train_time:168143ms step_avg:154.83ms step:1097/1480 train_time:168306ms step_avg:154.84ms step:1098/1480 train_time:168467ms step_avg:154.84ms step:1099/1480 train_time:168628ms step_avg:154.85ms step:1100/1480 train_time:168792ms step_avg:154.85ms step:1101/1480 train_time:168953ms step_avg:154.86ms step:1102/1480 train_time:169115ms step_avg:154.87ms step:1103/1480 
train_time:169282ms step_avg:154.88ms step:1104/1480 train_time:169444ms step_avg:154.89ms step:1105/1480 train_time:169608ms step_avg:154.89ms step:1106/1480 train_time:169768ms step_avg:154.90ms step:1107/1480 train_time:169929ms step_avg:154.90ms step:1108/1480 train_time:170088ms step_avg:154.91ms step:1109/1480 train_time:170248ms step_avg:154.91ms step:1110/1480 train_time:170408ms step_avg:154.92ms step:1111/1480 train_time:170569ms step_avg:154.92ms step:1112/1480 train_time:170730ms step_avg:154.93ms step:1113/1480 train_time:170899ms step_avg:154.94ms step:1114/1480 train_time:171062ms step_avg:154.95ms step:1115/1480 train_time:171225ms step_avg:154.96ms step:1116/1480 train_time:171385ms step_avg:154.96ms step:1117/1480 train_time:171550ms step_avg:154.97ms step:1118/1480 train_time:171715ms step_avg:154.98ms step:1119/1480 train_time:171875ms step_avg:154.98ms step:1120/1480 train_time:172034ms step_avg:154.99ms step:1121/1480 train_time:172197ms step_avg:154.99ms step:1122/1480 train_time:172358ms step_avg:155.00ms step:1123/1480 train_time:172518ms step_avg:155.00ms step:1124/1480 train_time:172680ms step_avg:155.01ms step:1125/1480 train_time:172843ms step_avg:155.02ms step:1125/1480 val_loss:3.3862 train_time:172918ms step_avg:155.08ms step:1126/1480 train_time:173008ms step_avg:155.03ms step:1127/1480 train_time:173169ms step_avg:155.03ms step:1128/1480 train_time:173330ms step_avg:155.04ms step:1129/1480 train_time:173495ms step_avg:155.04ms step:1130/1480 train_time:173658ms step_avg:155.05ms step:1131/1480 train_time:173825ms step_avg:155.06ms step:1132/1480 train_time:173985ms step_avg:155.07ms step:1133/1480 train_time:174148ms step_avg:155.07ms step:1134/1480 train_time:174312ms step_avg:155.08ms step:1135/1480 train_time:174474ms step_avg:155.09ms step:1136/1480 train_time:174636ms step_avg:155.09ms step:1137/1480 train_time:174796ms step_avg:155.10ms step:1138/1480 train_time:174959ms step_avg:155.11ms step:1139/1480 train_time:175128ms 
step_avg:155.12ms step:1140/1480 train_time:175281ms step_avg:155.12ms step:1141/1480 train_time:175445ms step_avg:155.12ms step:1142/1480 train_time:175605ms step_avg:155.13ms step:1143/1480 train_time:175768ms step_avg:155.14ms step:1144/1480 train_time:175930ms step_avg:155.14ms step:1145/1480 train_time:176089ms step_avg:155.14ms step:1146/1480 train_time:176254ms step_avg:155.15ms step:1147/1480 train_time:176415ms step_avg:155.16ms step:1148/1480 train_time:176576ms step_avg:155.16ms step:1149/1480 train_time:176738ms step_avg:155.17ms step:1150/1480 train_time:176898ms step_avg:155.17ms step:1151/1480 train_time:177061ms step_avg:155.18ms step:1152/1480 train_time:177225ms step_avg:155.19ms step:1153/1480 train_time:177389ms step_avg:155.20ms step:1154/1480 train_time:177551ms step_avg:155.20ms step:1155/1480 train_time:177714ms step_avg:155.21ms step:1156/1480 train_time:177880ms step_avg:155.22ms step:1157/1480 train_time:178042ms step_avg:155.22ms step:1158/1480 train_time:178202ms step_avg:155.23ms step:1159/1480 train_time:178363ms step_avg:155.23ms step:1160/1480 train_time:178522ms step_avg:155.24ms step:1161/1480 train_time:178684ms step_avg:155.24ms step:1162/1480 train_time:178848ms step_avg:155.25ms step:1163/1480 train_time:179013ms step_avg:155.26ms step:1164/1480 train_time:179176ms step_avg:155.27ms step:1165/1480 train_time:179335ms step_avg:155.27ms step:1166/1480 train_time:179497ms step_avg:155.27ms step:1167/1480 train_time:179657ms step_avg:155.28ms step:1168/1480 train_time:179817ms step_avg:155.28ms step:1169/1480 train_time:179979ms step_avg:155.29ms step:1170/1480 train_time:180141ms step_avg:155.29ms step:1171/1480 train_time:180302ms step_avg:155.30ms step:1172/1480 train_time:180461ms step_avg:155.30ms step:1173/1480 train_time:180623ms step_avg:155.31ms step:1174/1480 train_time:180797ms step_avg:155.32ms step:1175/1480 train_time:180959ms step_avg:155.33ms step:1176/1480 train_time:181122ms step_avg:155.34ms step:1177/1480 
train_time:181289ms step_avg:155.35ms step:1178/1480 train_time:181449ms step_avg:155.35ms step:1179/1480 train_time:181609ms step_avg:155.35ms step:1180/1480 train_time:181777ms step_avg:155.37ms step:1181/1480 train_time:181939ms step_avg:155.37ms step:1182/1480 train_time:182099ms step_avg:155.37ms step:1183/1480 train_time:182261ms step_avg:155.38ms step:1184/1480 train_time:182421ms step_avg:155.38ms step:1185/1480 train_time:182585ms step_avg:155.39ms step:1186/1480 train_time:182749ms step_avg:155.40ms step:1187/1480 train_time:182920ms step_avg:155.41ms step:1188/1480 train_time:183079ms step_avg:155.41ms step:1189/1480 train_time:183240ms step_avg:155.42ms step:1190/1480 train_time:183401ms step_avg:155.42ms step:1191/1480 train_time:183566ms step_avg:155.43ms step:1192/1480 train_time:183726ms step_avg:155.44ms step:1193/1480 train_time:183884ms step_avg:155.44ms step:1194/1480 train_time:184046ms step_avg:155.44ms step:1195/1480 train_time:184209ms step_avg:155.45ms step:1196/1480 train_time:184378ms step_avg:155.46ms step:1197/1480 train_time:184540ms step_avg:155.47ms step:1198/1480 train_time:184710ms step_avg:155.48ms step:1199/1480 train_time:184874ms step_avg:155.49ms step:1200/1480 train_time:185035ms step_avg:155.49ms step:1201/1480 train_time:185196ms step_avg:155.50ms step:1202/1480 train_time:185366ms step_avg:155.51ms step:1203/1480 train_time:185533ms step_avg:155.52ms step:1204/1480 train_time:185696ms step_avg:155.52ms step:1205/1480 train_time:185858ms step_avg:155.53ms step:1206/1480 train_time:186019ms step_avg:155.53ms step:1207/1480 train_time:186179ms step_avg:155.54ms step:1208/1480 train_time:186339ms step_avg:155.54ms step:1209/1480 train_time:186501ms step_avg:155.55ms step:1210/1480 train_time:186667ms step_avg:155.56ms step:1211/1480 train_time:186831ms step_avg:155.56ms step:1212/1480 train_time:186994ms step_avg:155.57ms step:1213/1480 train_time:187158ms step_avg:155.58ms step:1214/1480 train_time:187323ms step_avg:155.58ms 
step:1215/1480 train_time:187486ms step_avg:155.59ms step:1216/1480 train_time:187647ms step_avg:155.59ms step:1217/1480 train_time:187811ms step_avg:155.60ms step:1218/1480 train_time:187974ms step_avg:155.61ms step:1219/1480 train_time:188141ms step_avg:155.62ms step:1220/1480 train_time:188303ms step_avg:155.62ms step:1221/1480 train_time:188463ms step_avg:155.63ms step:1222/1480 train_time:188623ms step_avg:155.63ms step:1223/1480 train_time:188786ms step_avg:155.64ms step:1224/1480 train_time:188954ms step_avg:155.65ms step:1225/1480 train_time:189119ms step_avg:155.65ms step:1226/1480 train_time:189283ms step_avg:155.66ms step:1227/1480 train_time:189447ms step_avg:155.67ms step:1228/1480 train_time:189611ms step_avg:155.67ms step:1229/1480 train_time:189775ms step_avg:155.68ms step:1230/1480 train_time:189942ms step_avg:155.69ms step:1231/1480 train_time:190108ms step_avg:155.70ms step:1232/1480 train_time:190273ms step_avg:155.71ms step:1233/1480 train_time:190434ms step_avg:155.71ms step:1234/1480 train_time:190596ms step_avg:155.72ms step:1235/1480 train_time:190761ms step_avg:155.72ms step:1236/1480 train_time:190920ms step_avg:155.73ms step:1237/1480 train_time:191081ms step_avg:155.73ms step:1238/1480 train_time:191256ms step_avg:155.75ms step:1239/1480 train_time:191418ms step_avg:155.75ms step:1240/1480 train_time:191582ms step_avg:155.76ms step:1241/1480 train_time:191748ms step_avg:155.77ms step:1242/1480 train_time:191910ms step_avg:155.77ms step:1243/1480 train_time:192075ms step_avg:155.78ms step:1244/1480 train_time:192235ms step_avg:155.78ms step:1245/1480 train_time:192398ms step_avg:155.79ms step:1246/1480 train_time:192561ms step_avg:155.79ms step:1247/1480 train_time:192723ms step_avg:155.80ms step:1248/1480 train_time:192884ms step_avg:155.80ms step:1249/1480 train_time:193045ms step_avg:155.81ms step:1250/1480 train_time:193208ms step_avg:155.81ms step:1250/1480 val_loss:3.3366 train_time:193283ms step_avg:155.87ms step:1251/1480 
train_time:193378ms step_avg:155.82ms step:1252/1480 train_time:193540ms step_avg:155.83ms step:1253/1480 train_time:193700ms step_avg:155.83ms step:1254/1480 train_time:193860ms step_avg:155.84ms step:1255/1480 train_time:194031ms step_avg:155.85ms step:1256/1480 train_time:194198ms step_avg:155.86ms step:1257/1480 train_time:194359ms step_avg:155.86ms step:1258/1480 train_time:194524ms step_avg:155.87ms step:1259/1480 train_time:194687ms step_avg:155.87ms step:1260/1480 train_time:194847ms step_avg:155.88ms step:1261/1480 train_time:195012ms step_avg:155.88ms step:1262/1480 train_time:195177ms step_avg:155.89ms step:1263/1480 train_time:195341ms step_avg:155.90ms step:1264/1480 train_time:195500ms step_avg:155.90ms step:1265/1480 train_time:195659ms step_avg:155.90ms step:1266/1480 train_time:195822ms step_avg:155.91ms step:1267/1480 train_time:195983ms step_avg:155.91ms step:1268/1480 train_time:196143ms step_avg:155.92ms step:1269/1480 train_time:196309ms step_avg:155.92ms step:1270/1480 train_time:196471ms step_avg:155.93ms step:1271/1480 train_time:196634ms step_avg:155.94ms step:1272/1480 train_time:196795ms step_avg:155.94ms step:1273/1480 train_time:196958ms step_avg:155.94ms step:1274/1480 train_time:197123ms step_avg:155.95ms step:1275/1480 train_time:197283ms step_avg:155.96ms step:1276/1480 train_time:197443ms step_avg:155.96ms step:1277/1480 train_time:197604ms step_avg:155.96ms step:1278/1480 train_time:197764ms step_avg:155.97ms step:1279/1480 train_time:197925ms step_avg:155.97ms step:1280/1480 train_time:198093ms step_avg:155.98ms step:1281/1480 train_time:198255ms step_avg:155.98ms step:1282/1480 train_time:198415ms step_avg:155.99ms step:1283/1480 train_time:198579ms step_avg:155.99ms step:1284/1480 train_time:198742ms step_avg:156.00ms step:1285/1480 train_time:198903ms step_avg:156.00ms step:1286/1480 train_time:199064ms step_avg:156.01ms step:1287/1480 train_time:199225ms step_avg:156.01ms step:1288/1480 train_time:199388ms step_avg:156.02ms 
step:1289/1480 train_time:199557ms step_avg:156.03ms step:1290/1480 train_time:199726ms step_avg:156.04ms step:1291/1480 train_time:199892ms step_avg:156.04ms step:1292/1480 train_time:200055ms step_avg:156.05ms step:1293/1480 train_time:200221ms step_avg:156.06ms step:1294/1480 train_time:200384ms step_avg:156.06ms step:1295/1480 train_time:200546ms step_avg:156.07ms step:1296/1480 train_time:200708ms step_avg:156.07ms step:1297/1480 train_time:200871ms step_avg:156.08ms step:1298/1480 train_time:201034ms step_avg:156.08ms step:1299/1480 train_time:201198ms step_avg:156.09ms step:1300/1480 train_time:201358ms step_avg:156.09ms step:1301/1480 train_time:201520ms step_avg:156.10ms step:1302/1480 train_time:201686ms step_avg:156.10ms step:1303/1480 train_time:201857ms step_avg:156.11ms step:1304/1480 train_time:202021ms step_avg:156.12ms step:1305/1480 train_time:202182ms step_avg:156.12ms step:1306/1480 train_time:202345ms step_avg:156.13ms step:1307/1480 train_time:202507ms step_avg:156.13ms step:1308/1480 train_time:202668ms step_avg:156.14ms step:1309/1480 train_time:202837ms step_avg:156.15ms step:1310/1480 train_time:202999ms step_avg:156.15ms step:1311/1480 train_time:203160ms step_avg:156.16ms step:1312/1480 train_time:203324ms step_avg:156.16ms step:1313/1480 train_time:203486ms step_avg:156.17ms step:1314/1480 train_time:203651ms step_avg:156.17ms step:1315/1480 train_time:203816ms step_avg:156.18ms step:1316/1480 train_time:203976ms step_avg:156.18ms step:1317/1480 train_time:204138ms step_avg:156.19ms step:1318/1480 train_time:204304ms step_avg:156.20ms step:1319/1480 train_time:204471ms step_avg:156.20ms step:1320/1480 train_time:204638ms step_avg:156.21ms step:1321/1480 train_time:204801ms step_avg:156.22ms step:1322/1480 train_time:204969ms step_avg:156.23ms step:1323/1480 train_time:205135ms step_avg:156.23ms step:1324/1480 train_time:205299ms step_avg:156.24ms step:1325/1480 train_time:205469ms step_avg:156.25ms step:1326/1480 train_time:205635ms 
step_avg:156.26ms step:1327/1480 train_time:205798ms step_avg:156.26ms step:1328/1480 train_time:205960ms step_avg:156.27ms step:1329/1480 train_time:206144ms step_avg:156.29ms step:1330/1480 train_time:206308ms step_avg:156.29ms step:1331/1480 train_time:206472ms step_avg:156.30ms step:1332/1480 train_time:206635ms step_avg:156.30ms step:1333/1480 train_time:206800ms step_avg:156.31ms step:1334/1480 train_time:206963ms step_avg:156.32ms step:1335/1480 train_time:207124ms step_avg:156.32ms step:1336/1480 train_time:207294ms step_avg:156.33ms step:1337/1480 train_time:207460ms step_avg:156.34ms step:1338/1480 train_time:207623ms step_avg:156.34ms step:1339/1480 train_time:207788ms step_avg:156.35ms step:1340/1480 train_time:207952ms step_avg:156.35ms step:1341/1480 train_time:208114ms step_avg:156.36ms step:1342/1480 train_time:208279ms step_avg:156.37ms step:1343/1480 train_time:208441ms step_avg:156.37ms step:1344/1480 train_time:208602ms step_avg:156.37ms step:1345/1480 train_time:208772ms step_avg:156.38ms step:1346/1480 train_time:208932ms step_avg:156.39ms step:1347/1480 train_time:209097ms step_avg:156.39ms step:1348/1480 train_time:209259ms step_avg:156.40ms step:1349/1480 train_time:209422ms step_avg:156.40ms step:1350/1480 train_time:209589ms step_avg:156.41ms step:1351/1480 train_time:209752ms step_avg:156.41ms step:1352/1480 train_time:209916ms step_avg:156.42ms step:1353/1480 train_time:210082ms step_avg:156.43ms step:1354/1480 train_time:210245ms step_avg:156.43ms step:1355/1480 train_time:210406ms step_avg:156.44ms step:1356/1480 train_time:210570ms step_avg:156.44ms step:1357/1480 train_time:210735ms step_avg:156.45ms step:1358/1480 train_time:210899ms step_avg:156.45ms step:1359/1480 train_time:211063ms step_avg:156.46ms step:1360/1480 train_time:211229ms step_avg:156.47ms step:1361/1480 train_time:211398ms step_avg:156.48ms step:1362/1480 train_time:211562ms step_avg:156.48ms step:1363/1480 train_time:211730ms step_avg:156.49ms step:1364/1480 
train_time:211894ms step_avg:156.49ms step:1365/1480 train_time:212054ms step_avg:156.50ms step:1366/1480 train_time:212219ms step_avg:156.50ms step:1367/1480 train_time:212382ms step_avg:156.51ms step:1368/1480 train_time:212549ms step_avg:156.52ms step:1369/1480 train_time:212718ms step_avg:156.53ms step:1370/1480 train_time:212883ms step_avg:156.53ms step:1371/1480 train_time:213046ms step_avg:156.54ms step:1372/1480 train_time:213214ms step_avg:156.54ms step:1373/1480 train_time:213375ms step_avg:156.55ms step:1374/1480 train_time:213541ms step_avg:156.56ms step:1375/1480 train_time:213703ms step_avg:156.56ms step:1375/1480 val_loss:3.2983 train_time:213778ms step_avg:156.61ms step:1376/1480 train_time:213870ms step_avg:156.57ms step:1377/1480 train_time:214036ms step_avg:156.57ms step:1378/1480 train_time:214197ms step_avg:156.58ms step:1379/1480 train_time:214362ms step_avg:156.58ms step:1380/1480 train_time:214525ms step_avg:156.59ms step:1381/1480 train_time:214693ms step_avg:156.60ms step:1382/1480 train_time:214857ms step_avg:156.60ms step:1383/1480 train_time:215019ms step_avg:156.61ms step:1384/1480 train_time:215184ms step_avg:156.61ms step:1385/1480 train_time:215343ms step_avg:156.61ms step:1386/1480 train_time:215506ms step_avg:156.62ms step:1387/1480 train_time:215671ms step_avg:156.62ms step:1388/1480 train_time:215834ms step_avg:156.63ms step:1389/1480 train_time:216000ms step_avg:156.64ms step:1390/1480 train_time:216160ms step_avg:156.64ms step:1391/1480 train_time:216323ms step_avg:156.64ms step:1392/1480 train_time:216487ms step_avg:156.65ms step:1393/1480 train_time:216650ms step_avg:156.65ms step:1394/1480 train_time:216815ms step_avg:156.66ms step:1395/1480 train_time:216977ms step_avg:156.66ms step:1396/1480 train_time:217141ms step_avg:156.67ms step:1397/1480 train_time:217302ms step_avg:156.67ms step:1398/1480 train_time:217461ms step_avg:156.67ms step:1399/1480 train_time:217625ms step_avg:156.68ms step:1400/1480 train_time:217792ms 
step_avg:156.68ms step:1401/1480 train_time:217953ms step_avg:156.69ms step:1402/1480 train_time:218115ms step_avg:156.69ms step:1403/1480 train_time:218281ms step_avg:156.70ms step:1404/1480 train_time:218444ms step_avg:156.70ms step:1405/1480 train_time:218608ms step_avg:156.71ms step:1406/1480 train_time:218772ms step_avg:156.71ms step:1407/1480 train_time:218934ms step_avg:156.72ms step:1408/1480 train_time:219095ms step_avg:156.72ms step:1409/1480 train_time:219268ms step_avg:156.73ms step:1410/1480 train_time:219431ms step_avg:156.74ms step:1411/1480 train_time:219593ms step_avg:156.74ms step:1412/1480 train_time:219756ms step_avg:156.74ms step:1413/1480 train_time:219919ms step_avg:156.75ms step:1414/1480 train_time:220082ms step_avg:156.75ms step:1415/1480 train_time:220248ms step_avg:156.76ms step:1416/1480 train_time:220420ms step_avg:156.77ms step:1417/1480 train_time:220585ms step_avg:156.78ms step:1418/1480 train_time:220748ms step_avg:156.78ms step:1419/1480 train_time:220913ms step_avg:156.79ms step:1420/1480 train_time:221077ms step_avg:156.79ms step:1421/1480 train_time:221241ms step_avg:156.80ms step:1422/1480 train_time:221404ms step_avg:156.80ms step:1423/1480 train_time:221565ms step_avg:156.80ms step:1424/1480 train_time:221731ms step_avg:156.81ms step:1425/1480 train_time:221899ms step_avg:156.82ms step:1426/1480 train_time:222062ms step_avg:156.82ms step:1427/1480 train_time:222227ms step_avg:156.83ms step:1428/1480 train_time:222390ms step_avg:156.83ms step:1429/1480 train_time:222554ms step_avg:156.84ms step:1430/1480 train_time:222719ms step_avg:156.84ms step:1431/1480 train_time:222883ms step_avg:156.85ms step:1432/1480 train_time:223051ms step_avg:156.86ms step:1433/1480 train_time:223219ms step_avg:156.87ms step:1434/1480 train_time:223389ms step_avg:156.87ms step:1435/1480 train_time:223556ms step_avg:156.88ms step:1436/1480 train_time:223721ms step_avg:156.89ms step:1437/1480 train_time:223883ms step_avg:156.89ms step:1438/1480 
train_time:224044ms step_avg:156.89ms step:1439/1480 train_time:224211ms step_avg:156.90ms step:1440/1480 train_time:224374ms step_avg:156.90ms step:1441/1480 train_time:224538ms step_avg:156.91ms step:1442/1480 train_time:224704ms step_avg:156.92ms step:1443/1480 train_time:224877ms step_avg:156.93ms step:1444/1480 train_time:225040ms step_avg:156.93ms step:1445/1480 train_time:225201ms step_avg:156.93ms step:1446/1480 train_time:225367ms step_avg:156.94ms step:1447/1480 train_time:225536ms step_avg:156.95ms step:1448/1480 train_time:225697ms step_avg:156.95ms step:1449/1480 train_time:225859ms step_avg:156.96ms step:1450/1480 train_time:226023ms step_avg:156.96ms step:1451/1480 train_time:226186ms step_avg:156.96ms step:1452/1480 train_time:226352ms step_avg:156.97ms step:1453/1480 train_time:226516ms step_avg:156.98ms step:1454/1480 train_time:226678ms step_avg:156.98ms step:1455/1480 train_time:226847ms step_avg:156.99ms step:1456/1480 train_time:227011ms step_avg:156.99ms step:1457/1480 train_time:227173ms step_avg:157.00ms step:1458/1480 train_time:227337ms step_avg:157.00ms step:1459/1480 train_time:227502ms step_avg:157.01ms step:1460/1480 train_time:227664ms step_avg:157.01ms step:1461/1480 train_time:227829ms step_avg:157.02ms step:1462/1480 train_time:227994ms step_avg:157.02ms step:1463/1480 train_time:228160ms step_avg:157.03ms step:1464/1480 train_time:228327ms step_avg:157.03ms step:1465/1480 train_time:228491ms step_avg:157.04ms step:1466/1480 train_time:228654ms step_avg:157.04ms step:1467/1480 train_time:228819ms step_avg:157.05ms step:1468/1480 train_time:228982ms step_avg:157.05ms step:1469/1480 train_time:229144ms step_avg:157.06ms step:1470/1480 train_time:229312ms step_avg:157.06ms step:1471/1480 train_time:229482ms step_avg:157.07ms step:1472/1480 train_time:229652ms step_avg:157.08ms step:1473/1480 train_time:229816ms step_avg:157.09ms step:1474/1480 train_time:229982ms step_avg:157.09ms step:1475/1480 train_time:230152ms step_avg:157.10ms 
step:1476/1480 train_time:230316ms step_avg:157.10ms step:1477/1480 train_time:230482ms step_avg:157.11ms step:1478/1480 train_time:230654ms step_avg:157.12ms step:1479/1480 train_time:230819ms step_avg:157.13ms step:1480/1480 train_time:230981ms step_avg:157.13ms step:1480/1480 val_loss:3.2792 train_time:231056ms step_avg:157.18ms peak memory consumption: 34239 MiB