import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor (asserted). It is not modified.
        steps: The number of Newton-Schulz iterations to run.
        eps: Added to the norm before dividing, to avoid division by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # Out-of-place on purpose: when G is already bfloat16, G.bfloat16() returns G itself and an
    # in-place division would overwrite the caller's tensor (e.g. Muon's momentum buffer).
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.

    The WORLD_SIZE and RANK environment variables must be set; they are read in __init__.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One param group per distinct parameter size, so that every parameter in a group can
        # share the same set of flat all_gather receive buffers (one buffer per rank).
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are processed in chunks of world_size: each rank
        orthogonalizes the update of the parameter at its own offset in the chunk, the results are
        exchanged with an asynchronous all_gather, and the previous chunk's gathered updates are
        applied while the current chunk is still being communicated.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # Applies the updates gathered for the previously launched chunk (if any).
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # The receive buffers are reused, so the previous chunk must be consumed before
                # the next all_gather is launched into them.
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding. The cos/sin tables are cached and only recomputed when the
    sequence length (x.shape[1]) changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): dim 1 is the sequence, dim 3 is rotated.
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with QK-norm, rotary embeddings and a learned value/value-embedding mix."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of x with the initial embedding x0, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-embedding tables whose outputs are returned in order and then mirrored (12 tensors)."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        # reversed(ve) is a lazy iterator over the original entries; extending the list with it
        # appends them in reverse order, giving [e0..e5, e5..e0].
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections, value embeddings and document-aware sliding-window attention."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the cross-entropy loss of the model on one flat token sequence.

        Arguments:
            inputs: 1D tensor of token ids (asserted); its length must be a multiple of 128.
                Token id 50256 starts a new document for the purposes of the attention mask.
            targets: Target token ids, flattened and compared against the logits.
            sliding_window_num_blocks: Attention window size, in blocks of 128 tokens.

        Returns the scalar loss tensor.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        assert inputs.size(0) % BLOCK_SIZE == 0, "sequence length must be a multiple of BLOCK_SIZE"
        # Number of attention blocks, derived from the input instead of being hard-coded to 512
        # (which is only correct for a 64*1024-token sequence).
        total_blocks = inputs.size(0) // BLOCK_SIZE
        docs = (inputs == 50256).cumsum(0)
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # Per query block: the count of selected kv blocks, and kv block indices sorted so
            # that the selected ones come first (stable, so ascending within each group).
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_blocks, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # nonzero ^ full = blocks that are only partially attended; these are passed as the
            # kv_* arguments, the fully attended ones as full_kv_*.
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a data shard and return the number of tokens it claims to hold."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read num_tokens uint16 tokens that follow the 256-int32 header into a pinned CPU tensor."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Cycles through token shards and hands each rank its own seq_len-token slice of every
    (seq_len * num_processes)-token global batch.
    """

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) for this rank as CUDA tensors; targets are inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. 
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 10:36:50 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 125W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29246ms step_avg:nanms step:2/1480 train_time:29354ms step_avg:nanms step:3/1480 train_time:29476ms step_avg:nanms step:4/1480 train_time:29615ms step_avg:nanms step:5/1480 train_time:29757ms step_avg:nanms step:6/1480 train_time:29901ms step_avg:nanms step:7/1480 train_time:30040ms step_avg:nanms step:8/1480 train_time:30183ms step_avg:nanms step:9/1480 train_time:30325ms step_avg:nanms step:10/1480 train_time:30469ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:282ms step_avg:nanms step:13/1480 train_time:424ms step_avg:141.48ms step:14/1480 train_time:567ms step_avg:141.84ms step:15/1480 train_time:711ms step_avg:142.29ms step:16/1480 train_time:853ms step_avg:142.23ms step:17/1480 train_time:996ms step_avg:142.33ms step:18/1480 train_time:1138ms step_avg:142.19ms step:19/1480 train_time:1281ms step_avg:142.33ms step:20/1480 train_time:1423ms step_avg:142.27ms step:21/1480 train_time:1566ms step_avg:142.36ms step:22/1480 train_time:1710ms step_avg:142.48ms step:23/1480 train_time:1853ms step_avg:142.53ms step:24/1480 train_time:1995ms step_avg:142.48ms step:25/1480 train_time:2137ms step_avg:142.47ms step:26/1480 train_time:2279ms step_avg:142.44ms step:27/1480 train_time:2421ms step_avg:142.39ms step:28/1480 train_time:2562ms step_avg:142.33ms 
step:29/1480 train_time:2705ms step_avg:142.36ms step:30/1480 train_time:2848ms step_avg:142.41ms step:31/1480 train_time:2993ms step_avg:142.51ms step:32/1480 train_time:3135ms step_avg:142.51ms step:33/1480 train_time:3279ms step_avg:142.55ms step:34/1480 train_time:3421ms step_avg:142.55ms step:35/1480 train_time:3563ms step_avg:142.54ms step:36/1480 train_time:3707ms step_avg:142.59ms step:37/1480 train_time:3852ms step_avg:142.65ms step:38/1480 train_time:3994ms step_avg:142.66ms step:39/1480 train_time:4137ms step_avg:142.67ms step:40/1480 train_time:4282ms step_avg:142.72ms step:41/1480 train_time:4425ms step_avg:142.75ms step:42/1480 train_time:4567ms step_avg:142.72ms step:43/1480 train_time:4712ms step_avg:142.79ms step:44/1480 train_time:4855ms step_avg:142.79ms step:45/1480 train_time:4997ms step_avg:142.78ms step:46/1480 train_time:5139ms step_avg:142.76ms step:47/1480 train_time:5281ms step_avg:142.74ms step:48/1480 train_time:5423ms step_avg:142.71ms step:49/1480 train_time:5566ms step_avg:142.73ms step:50/1480 train_time:5709ms step_avg:142.72ms step:51/1480 train_time:5853ms step_avg:142.75ms step:52/1480 train_time:5996ms step_avg:142.76ms step:53/1480 train_time:6139ms step_avg:142.76ms step:54/1480 train_time:6282ms step_avg:142.78ms step:55/1480 train_time:6425ms step_avg:142.79ms step:56/1480 train_time:6569ms step_avg:142.81ms step:57/1480 train_time:6712ms step_avg:142.81ms step:58/1480 train_time:6855ms step_avg:142.80ms step:59/1480 train_time:6996ms step_avg:142.77ms step:60/1480 train_time:7137ms step_avg:142.75ms step:61/1480 train_time:7281ms step_avg:142.76ms step:62/1480 train_time:7423ms step_avg:142.75ms step:63/1480 train_time:7566ms step_avg:142.76ms step:64/1480 train_time:7712ms step_avg:142.81ms step:65/1480 train_time:7854ms step_avg:142.80ms step:66/1480 train_time:7997ms step_avg:142.80ms step:67/1480 train_time:8138ms step_avg:142.76ms step:68/1480 train_time:8280ms step_avg:142.76ms step:69/1480 train_time:8422ms 
step_avg:142.74ms step:70/1480 train_time:8566ms step_avg:142.76ms step:71/1480 train_time:8710ms step_avg:142.78ms step:72/1480 train_time:8854ms step_avg:142.80ms step:73/1480 train_time:8996ms step_avg:142.80ms step:74/1480 train_time:9138ms step_avg:142.77ms step:75/1480 train_time:9279ms step_avg:142.76ms step:76/1480 train_time:9421ms step_avg:142.74ms step:77/1480 train_time:9565ms step_avg:142.76ms step:78/1480 train_time:9709ms step_avg:142.77ms step:79/1480 train_time:9853ms step_avg:142.79ms step:80/1480 train_time:10465ms step_avg:149.51ms step:81/1480 train_time:10570ms step_avg:148.87ms step:82/1480 train_time:10712ms step_avg:148.78ms step:83/1480 train_time:10854ms step_avg:148.69ms step:84/1480 train_time:10997ms step_avg:148.61ms step:85/1480 train_time:11138ms step_avg:148.51ms step:86/1480 train_time:11281ms step_avg:148.44ms step:87/1480 train_time:11424ms step_avg:148.37ms step:88/1480 train_time:11570ms step_avg:148.33ms step:89/1480 train_time:11714ms step_avg:148.28ms step:90/1480 train_time:11856ms step_avg:148.20ms step:91/1480 train_time:11998ms step_avg:148.12ms step:92/1480 train_time:12139ms step_avg:148.03ms step:93/1480 train_time:12281ms step_avg:147.97ms step:94/1480 train_time:12422ms step_avg:147.89ms step:95/1480 train_time:12565ms step_avg:147.83ms step:96/1480 train_time:12708ms step_avg:147.77ms step:97/1480 train_time:12852ms step_avg:147.72ms step:98/1480 train_time:13375ms step_avg:151.99ms step:99/1480 train_time:13480ms step_avg:151.46ms step:100/1480 train_time:13621ms step_avg:151.34ms step:101/1480 train_time:13765ms step_avg:151.27ms step:102/1480 train_time:13904ms step_avg:151.14ms step:103/1480 train_time:14046ms step_avg:151.03ms step:104/1480 train_time:14188ms step_avg:150.94ms step:105/1480 train_time:14333ms step_avg:150.88ms step:106/1480 train_time:14478ms step_avg:150.81ms step:107/1480 train_time:14619ms step_avg:150.71ms step:108/1480 train_time:14760ms step_avg:150.61ms step:109/1480 train_time:14903ms 
step_avg:150.53ms step:110/1480 train_time:15045ms step_avg:150.45ms step:111/1480 train_time:15191ms step_avg:150.40ms step:112/1480 train_time:15337ms step_avg:150.36ms step:113/1480 train_time:15484ms step_avg:150.33ms step:114/1480 train_time:15629ms step_avg:150.28ms step:115/1480 train_time:15776ms step_avg:150.25ms step:116/1480 train_time:15921ms step_avg:150.19ms step:117/1480 train_time:16066ms step_avg:150.15ms step:118/1480 train_time:16213ms step_avg:150.12ms step:119/1480 train_time:16358ms step_avg:150.07ms step:120/1480 train_time:16503ms step_avg:150.03ms step:121/1480 train_time:16649ms step_avg:149.99ms step:122/1480 train_time:16796ms step_avg:149.97ms step:123/1480 train_time:16941ms step_avg:149.92ms step:124/1480 train_time:17088ms step_avg:149.89ms step:125/1480 train_time:17233ms step_avg:149.86ms step:125/1480 val_loss:4.4235 train_time:17298ms step_avg:150.42ms step:126/1480 train_time:17396ms step_avg:149.97ms step:127/1480 train_time:17537ms step_avg:149.89ms step:128/1480 train_time:17683ms step_avg:149.86ms step:129/1480 train_time:17828ms step_avg:149.81ms step:130/1480 train_time:17973ms step_avg:149.77ms step:131/1480 train_time:18119ms step_avg:149.74ms step:132/1480 train_time:18265ms step_avg:149.71ms step:133/1480 train_time:18410ms step_avg:149.67ms step:134/1480 train_time:18555ms step_avg:149.64ms step:135/1480 train_time:18702ms step_avg:149.62ms step:136/1480 train_time:18847ms step_avg:149.58ms step:137/1480 train_time:18993ms step_avg:149.55ms step:138/1480 train_time:19140ms step_avg:149.53ms step:139/1480 train_time:19286ms step_avg:149.50ms step:140/1480 train_time:19431ms step_avg:149.47ms step:141/1480 train_time:19578ms step_avg:149.45ms step:142/1480 train_time:19726ms step_avg:149.44ms step:143/1480 train_time:19870ms step_avg:149.40ms step:144/1480 train_time:20016ms step_avg:149.37ms step:145/1480 train_time:20164ms step_avg:149.36ms step:146/1480 train_time:20308ms step_avg:149.32ms step:147/1480 
train_time:20453ms step_avg:149.29ms step:148/1480 train_time:20599ms step_avg:149.27ms step:149/1480 train_time:20745ms step_avg:149.25ms step:150/1480 train_time:20891ms step_avg:149.22ms step:151/1480 train_time:21037ms step_avg:149.20ms step:152/1480 train_time:21183ms step_avg:149.18ms step:153/1480 train_time:21329ms step_avg:149.15ms step:154/1480 train_time:21473ms step_avg:149.12ms step:155/1480 train_time:21619ms step_avg:149.10ms step:156/1480 train_time:21766ms step_avg:149.08ms step:157/1480 train_time:21911ms step_avg:149.05ms step:158/1480 train_time:22056ms step_avg:149.03ms step:159/1480 train_time:22203ms step_avg:149.01ms step:160/1480 train_time:22347ms step_avg:148.98ms step:161/1480 train_time:22492ms step_avg:148.96ms step:162/1480 train_time:22639ms step_avg:148.94ms step:163/1480 train_time:22785ms step_avg:148.92ms step:164/1480 train_time:22930ms step_avg:148.90ms step:165/1480 train_time:23075ms step_avg:148.87ms step:166/1480 train_time:23223ms step_avg:148.86ms step:167/1480 train_time:23367ms step_avg:148.84ms step:168/1480 train_time:23513ms step_avg:148.81ms step:169/1480 train_time:23660ms step_avg:148.80ms step:170/1480 train_time:23806ms step_avg:148.79ms step:171/1480 train_time:23950ms step_avg:148.76ms step:172/1480 train_time:24096ms step_avg:148.74ms step:173/1480 train_time:24243ms step_avg:148.73ms step:174/1480 train_time:24387ms step_avg:148.70ms step:175/1480 train_time:24533ms step_avg:148.69ms step:176/1480 train_time:24679ms step_avg:148.67ms step:177/1480 train_time:24826ms step_avg:148.66ms step:178/1480 train_time:24971ms step_avg:148.63ms step:179/1480 train_time:25117ms step_avg:148.62ms step:180/1480 train_time:25264ms step_avg:148.61ms step:181/1480 train_time:25409ms step_avg:148.59ms step:182/1480 train_time:25554ms step_avg:148.57ms step:183/1480 train_time:25701ms step_avg:148.56ms step:184/1480 train_time:25846ms step_avg:148.54ms step:185/1480 train_time:25992ms step_avg:148.52ms step:186/1480 
train_time:26138ms step_avg:148.51ms step:187/1480 train_time:26284ms step_avg:148.50ms step:188/1480 train_time:26430ms step_avg:148.48ms step:189/1480 train_time:26598ms step_avg:148.59ms step:190/1480 train_time:26722ms step_avg:148.46ms step:191/1480 train_time:26867ms step_avg:148.44ms step:192/1480 train_time:27013ms step_avg:148.42ms step:193/1480 train_time:27158ms step_avg:148.41ms step:194/1480 train_time:27305ms step_avg:148.39ms step:195/1480 train_time:27449ms step_avg:148.37ms step:196/1480 train_time:27595ms step_avg:148.36ms step:197/1480 train_time:27742ms step_avg:148.35ms step:198/1480 train_time:27887ms step_avg:148.34ms step:199/1480 train_time:28033ms step_avg:148.32ms step:200/1480 train_time:28180ms step_avg:148.31ms step:201/1480 train_time:28328ms step_avg:148.31ms step:202/1480 train_time:28470ms step_avg:148.28ms step:203/1480 train_time:28616ms step_avg:148.27ms step:204/1480 train_time:28762ms step_avg:148.26ms step:205/1480 train_time:28908ms step_avg:148.25ms step:206/1480 train_time:29053ms step_avg:148.23ms step:207/1480 train_time:29199ms step_avg:148.22ms step:208/1480 train_time:29346ms step_avg:148.21ms step:209/1480 train_time:29491ms step_avg:148.19ms step:210/1480 train_time:29637ms step_avg:148.18ms step:211/1480 train_time:29783ms step_avg:148.17ms step:212/1480 train_time:29930ms step_avg:148.17ms step:213/1480 train_time:30076ms step_avg:148.16ms step:214/1480 train_time:30222ms step_avg:148.15ms step:215/1480 train_time:30367ms step_avg:148.13ms step:216/1480 train_time:30513ms step_avg:148.12ms step:217/1480 train_time:30659ms step_avg:148.11ms step:218/1480 train_time:31194ms step_avg:149.97ms step:219/1480 train_time:31298ms step_avg:149.75ms step:220/1480 train_time:31444ms step_avg:149.74ms step:221/1480 train_time:32044ms step_avg:151.87ms step:222/1480 train_time:32609ms step_avg:153.82ms step:223/1480 train_time:32718ms step_avg:153.60ms step:224/1480 train_time:32866ms step_avg:153.58ms step:225/1480 
train_time:33014ms step_avg:153.55ms step:226/1480 train_time:33164ms step_avg:153.53ms step:227/1480 train_time:33311ms step_avg:153.51ms step:228/1480 train_time:33459ms step_avg:153.48ms step:229/1480 train_time:33608ms step_avg:153.46ms step:230/1480 train_time:33757ms step_avg:153.44ms step:231/1480 train_time:33907ms step_avg:153.42ms step:232/1480 train_time:34055ms step_avg:153.40ms step:233/1480 train_time:34204ms step_avg:153.38ms step:234/1480 train_time:34351ms step_avg:153.35ms step:235/1480 train_time:34499ms step_avg:153.33ms step:236/1480 train_time:34648ms step_avg:153.31ms step:237/1480 train_time:34797ms step_avg:153.29ms step:238/1480 train_time:34947ms step_avg:153.28ms step:239/1480 train_time:35095ms step_avg:153.25ms step:240/1480 train_time:35245ms step_avg:153.24ms step:241/1480 train_time:35392ms step_avg:153.21ms step:242/1480 train_time:35542ms step_avg:153.20ms step:243/1480 train_time:35689ms step_avg:153.17ms step:244/1480 train_time:35839ms step_avg:153.16ms step:245/1480 train_time:35987ms step_avg:153.14ms step:246/1480 train_time:36136ms step_avg:153.12ms step:247/1480 train_time:36285ms step_avg:153.10ms step:248/1480 train_time:36434ms step_avg:153.08ms step:249/1480 train_time:36583ms step_avg:153.07ms step:250/1480 train_time:36731ms step_avg:153.05ms step:250/1480 val_loss:3.9892 train_time:36799ms step_avg:153.33ms step:251/1480 train_time:36897ms step_avg:153.10ms step:252/1480 train_time:37038ms step_avg:153.05ms step:253/1480 train_time:37185ms step_avg:153.03ms step:254/1480 train_time:37334ms step_avg:153.01ms step:255/1480 train_time:37481ms step_avg:152.98ms step:256/1480 train_time:37629ms step_avg:152.96ms step:257/1480 train_time:37777ms step_avg:152.94ms step:258/1480 train_time:37926ms step_avg:152.93ms step:259/1480 train_time:38075ms step_avg:152.91ms step:260/1480 train_time:38223ms step_avg:152.89ms step:261/1480 train_time:38372ms step_avg:152.88ms step:262/1480 train_time:38519ms step_avg:152.85ms 
step:263/1480 train_time:38668ms step_avg:152.84ms step:264/1480 train_time:38816ms step_avg:152.82ms step:265/1480 train_time:38965ms step_avg:152.81ms step:266/1480 train_time:39114ms step_avg:152.79ms step:267/1480 train_time:39263ms step_avg:152.77ms step:268/1480 train_time:39413ms step_avg:152.76ms step:269/1480 train_time:39560ms step_avg:152.74ms step:270/1480 train_time:39709ms step_avg:152.73ms step:271/1480 train_time:39858ms step_avg:152.71ms step:272/1480 train_time:40006ms step_avg:152.70ms step:273/1480 train_time:40155ms step_avg:152.68ms step:274/1480 train_time:40303ms step_avg:152.66ms step:275/1480 train_time:40452ms step_avg:152.65ms step:276/1480 train_time:40599ms step_avg:152.63ms step:277/1480 train_time:40748ms step_avg:152.61ms step:278/1480 train_time:40897ms step_avg:152.60ms step:279/1480 train_time:41045ms step_avg:152.58ms step:280/1480 train_time:41195ms step_avg:152.57ms step:281/1480 train_time:41343ms step_avg:152.56ms step:282/1480 train_time:41492ms step_avg:152.55ms step:283/1480 train_time:41641ms step_avg:152.53ms step:284/1480 train_time:41790ms step_avg:152.52ms step:285/1480 train_time:41938ms step_avg:152.50ms step:286/1480 train_time:42087ms step_avg:152.49ms step:287/1480 train_time:42236ms step_avg:152.48ms step:288/1480 train_time:42385ms step_avg:152.46ms step:289/1480 train_time:42534ms step_avg:152.45ms step:290/1480 train_time:42682ms step_avg:152.43ms step:291/1480 train_time:42830ms step_avg:152.42ms step:292/1480 train_time:42978ms step_avg:152.40ms step:293/1480 train_time:43126ms step_avg:152.39ms step:294/1480 train_time:43276ms step_avg:152.38ms step:295/1480 train_time:43423ms step_avg:152.36ms step:296/1480 train_time:43573ms step_avg:152.35ms step:297/1480 train_time:43721ms step_avg:152.34ms step:298/1480 train_time:43870ms step_avg:152.33ms step:299/1480 train_time:44018ms step_avg:152.31ms step:300/1480 train_time:44168ms step_avg:152.30ms step:301/1480 train_time:44317ms step_avg:152.29ms 
step:302/1480 train_time:44464ms step_avg:152.28ms step:303/1480 train_time:44613ms step_avg:152.26ms step:304/1480 train_time:44761ms step_avg:152.25ms step:305/1480 train_time:44910ms step_avg:152.24ms step:306/1480 train_time:45057ms step_avg:152.22ms step:307/1480 train_time:45205ms step_avg:152.21ms step:308/1480 train_time:45355ms step_avg:152.20ms step:309/1480 train_time:45502ms step_avg:152.18ms step:310/1480 train_time:45652ms step_avg:152.17ms step:311/1480 train_time:45799ms step_avg:152.16ms step:312/1480 train_time:45947ms step_avg:152.14ms step:313/1480 train_time:46096ms step_avg:152.13ms step:314/1480 train_time:46246ms step_avg:152.13ms step:315/1480 train_time:46395ms step_avg:152.12ms step:316/1480 train_time:46544ms step_avg:152.10ms step:317/1480 train_time:46694ms step_avg:152.10ms step:318/1480 train_time:46841ms step_avg:152.08ms step:319/1480 train_time:46991ms step_avg:152.07ms step:320/1480 train_time:47138ms step_avg:152.06ms step:321/1480 train_time:47288ms step_avg:152.05ms step:322/1480 train_time:47437ms step_avg:152.04ms step:323/1480 train_time:47585ms step_avg:152.03ms step:324/1480 train_time:47734ms step_avg:152.02ms step:325/1480 train_time:47881ms step_avg:152.00ms step:326/1480 train_time:48032ms step_avg:152.00ms step:327/1480 train_time:48179ms step_avg:151.98ms step:328/1480 train_time:48328ms step_avg:151.98ms step:329/1480 train_time:48477ms step_avg:151.96ms step:330/1480 train_time:48626ms step_avg:151.96ms step:331/1480 train_time:48777ms step_avg:151.95ms step:332/1480 train_time:48928ms step_avg:151.95ms step:333/1480 train_time:49078ms step_avg:151.94ms step:334/1480 train_time:49228ms step_avg:151.94ms step:335/1480 train_time:49378ms step_avg:151.93ms step:336/1480 train_time:49529ms step_avg:151.93ms step:337/1480 train_time:49681ms step_avg:151.93ms step:338/1480 train_time:49832ms step_avg:151.93ms step:339/1480 train_time:49982ms step_avg:151.92ms step:340/1480 train_time:50134ms step_avg:151.92ms 
step:341/1480 train_time:50283ms step_avg:151.91ms step:342/1480 train_time:50434ms step_avg:151.91ms step:343/1480 train_time:50584ms step_avg:151.90ms step:344/1480 train_time:50735ms step_avg:151.90ms step:345/1480 train_time:50886ms step_avg:151.90ms step:346/1480 train_time:51037ms step_avg:151.90ms step:347/1480 train_time:51189ms step_avg:151.89ms step:348/1480 train_time:51339ms step_avg:151.89ms step:349/1480 train_time:51491ms step_avg:151.89ms step:350/1480 train_time:51641ms step_avg:151.88ms step:351/1480 train_time:51793ms step_avg:151.88ms step:352/1480 train_time:51943ms step_avg:151.88ms step:353/1480 train_time:52095ms step_avg:151.88ms step:354/1480 train_time:52245ms step_avg:151.87ms step:355/1480 train_time:52396ms step_avg:151.87ms step:356/1480 train_time:52547ms step_avg:151.87ms step:357/1480 train_time:52698ms step_avg:151.87ms step:358/1480 train_time:52850ms step_avg:151.87ms step:359/1480 train_time:53000ms step_avg:151.86ms step:360/1480 train_time:53151ms step_avg:151.86ms step:361/1480 train_time:53302ms step_avg:151.86ms step:362/1480 train_time:53454ms step_avg:151.86ms step:363/1480 train_time:53604ms step_avg:151.85ms step:364/1480 train_time:53756ms step_avg:151.85ms step:365/1480 train_time:53906ms step_avg:151.85ms step:366/1480 train_time:54057ms step_avg:151.85ms step:367/1480 train_time:54208ms step_avg:151.84ms step:368/1480 train_time:54358ms step_avg:151.84ms step:369/1480 train_time:54509ms step_avg:151.84ms step:370/1480 train_time:54659ms step_avg:151.83ms step:371/1480 train_time:54809ms step_avg:151.83ms step:372/1480 train_time:54960ms step_avg:151.82ms step:373/1480 train_time:55111ms step_avg:151.82ms step:374/1480 train_time:55262ms step_avg:151.82ms step:375/1480 train_time:55412ms step_avg:151.81ms step:375/1480 val_loss:3.8023 train_time:55481ms step_avg:152.00ms step:376/1480 train_time:55577ms step_avg:151.85ms step:377/1480 train_time:55724ms step_avg:151.84ms step:378/1480 train_time:55875ms 
step_avg:151.83ms step:379/1480 train_time:56043ms step_avg:151.88ms step:380/1480 train_time:56176ms step_avg:151.83ms step:381/1480 train_time:56326ms step_avg:151.82ms step:382/1480 train_time:56477ms step_avg:151.82ms step:383/1480 train_time:56629ms step_avg:151.82ms step:384/1480 train_time:56781ms step_avg:151.82ms step:385/1480 train_time:56933ms step_avg:151.82ms step:386/1480 train_time:57085ms step_avg:151.82ms step:387/1480 train_time:57235ms step_avg:151.82ms step:388/1480 train_time:57386ms step_avg:151.82ms step:389/1480 train_time:57536ms step_avg:151.81ms step:390/1480 train_time:57688ms step_avg:151.81ms step:391/1480 train_time:57838ms step_avg:151.81ms step:392/1480 train_time:57990ms step_avg:151.81ms step:393/1480 train_time:58141ms step_avg:151.80ms step:394/1480 train_time:58292ms step_avg:151.80ms step:395/1480 train_time:58443ms step_avg:151.80ms step:396/1480 train_time:58594ms step_avg:151.80ms step:397/1480 train_time:58745ms step_avg:151.80ms step:398/1480 train_time:58895ms step_avg:151.79ms step:399/1480 train_time:59047ms step_avg:151.79ms step:400/1480 train_time:59199ms step_avg:151.79ms step:401/1480 train_time:59350ms step_avg:151.79ms step:402/1480 train_time:59502ms step_avg:151.79ms step:403/1480 train_time:59653ms step_avg:151.79ms step:404/1480 train_time:59804ms step_avg:151.79ms step:405/1480 train_time:59954ms step_avg:151.78ms step:406/1480 train_time:60107ms step_avg:151.78ms step:407/1480 train_time:60257ms step_avg:151.78ms step:408/1480 train_time:60409ms step_avg:151.78ms step:409/1480 train_time:60559ms step_avg:151.78ms step:410/1480 train_time:60710ms step_avg:151.78ms step:411/1480 train_time:60861ms step_avg:151.77ms step:412/1480 train_time:61012ms step_avg:151.77ms step:413/1480 train_time:61163ms step_avg:151.77ms step:414/1480 train_time:61314ms step_avg:151.77ms step:415/1480 train_time:61466ms step_avg:151.77ms step:416/1480 train_time:61616ms step_avg:151.76ms step:417/1480 train_time:61768ms 
step_avg:151.76ms step:418/1480 train_time:61918ms step_avg:151.76ms step:419/1480 train_time:62070ms step_avg:151.76ms step:420/1480 train_time:62221ms step_avg:151.76ms step:421/1480 train_time:62372ms step_avg:151.76ms step:422/1480 train_time:62522ms step_avg:151.75ms step:423/1480 train_time:62673ms step_avg:151.75ms step:424/1480 train_time:62823ms step_avg:151.75ms step:425/1480 train_time:62973ms step_avg:151.74ms step:426/1480 train_time:63126ms step_avg:151.74ms step:427/1480 train_time:63276ms step_avg:151.74ms step:428/1480 train_time:63427ms step_avg:151.74ms step:429/1480 train_time:63579ms step_avg:151.74ms step:430/1480 train_time:63730ms step_avg:151.74ms step:431/1480 train_time:63882ms step_avg:151.74ms step:432/1480 train_time:64032ms step_avg:151.74ms step:433/1480 train_time:64184ms step_avg:151.74ms step:434/1480 train_time:64335ms step_avg:151.73ms step:435/1480 train_time:64486ms step_avg:151.73ms step:436/1480 train_time:64636ms step_avg:151.73ms step:437/1480 train_time:64788ms step_avg:151.73ms step:438/1480 train_time:64938ms step_avg:151.72ms step:439/1480 train_time:65090ms step_avg:151.73ms step:440/1480 train_time:65241ms step_avg:151.72ms step:441/1480 train_time:65394ms step_avg:151.73ms step:442/1480 train_time:65548ms step_avg:151.73ms step:443/1480 train_time:65700ms step_avg:151.73ms step:444/1480 train_time:65853ms step_avg:151.74ms step:445/1480 train_time:66007ms step_avg:151.74ms step:446/1480 train_time:66158ms step_avg:151.74ms step:447/1480 train_time:66312ms step_avg:151.74ms step:448/1480 train_time:66465ms step_avg:151.75ms step:449/1480 train_time:66618ms step_avg:151.75ms step:450/1480 train_time:66771ms step_avg:151.75ms step:451/1480 train_time:66924ms step_avg:151.76ms step:452/1480 train_time:67077ms step_avg:151.76ms step:453/1480 train_time:67230ms step_avg:151.76ms step:454/1480 train_time:67385ms step_avg:151.77ms step:455/1480 train_time:67538ms step_avg:151.77ms step:456/1480 train_time:67691ms 
step_avg:151.77ms step:457/1480 train_time:67844ms step_avg:151.78ms step:458/1480 train_time:67996ms step_avg:151.78ms step:459/1480 train_time:68149ms step_avg:151.78ms step:460/1480 train_time:68301ms step_avg:151.78ms step:461/1480 train_time:68454ms step_avg:151.78ms step:462/1480 train_time:68608ms step_avg:151.79ms step:463/1480 train_time:68760ms step_avg:151.79ms step:464/1480 train_time:68913ms step_avg:151.79ms step:465/1480 train_time:69066ms step_avg:151.79ms step:466/1480 train_time:69218ms step_avg:151.79ms step:467/1480 train_time:69371ms step_avg:151.80ms step:468/1480 train_time:69523ms step_avg:151.80ms step:469/1480 train_time:69676ms step_avg:151.80ms step:470/1480 train_time:69829ms step_avg:151.80ms step:471/1480 train_time:69982ms step_avg:151.80ms step:472/1480 train_time:70135ms step_avg:151.81ms step:473/1480 train_time:70289ms step_avg:151.81ms step:474/1480 train_time:70442ms step_avg:151.82ms step:475/1480 train_time:70595ms step_avg:151.82ms step:476/1480 train_time:70749ms step_avg:151.82ms step:477/1480 train_time:70902ms step_avg:151.82ms step:478/1480 train_time:71055ms step_avg:151.83ms step:479/1480 train_time:71208ms step_avg:151.83ms step:480/1480 train_time:71361ms step_avg:151.83ms step:481/1480 train_time:71513ms step_avg:151.83ms step:482/1480 train_time:71666ms step_avg:151.84ms step:483/1480 train_time:71819ms step_avg:151.84ms step:484/1480 train_time:71972ms step_avg:151.84ms step:485/1480 train_time:72124ms step_avg:151.84ms step:486/1480 train_time:72277ms step_avg:151.84ms step:487/1480 train_time:72429ms step_avg:151.84ms step:488/1480 train_time:72583ms step_avg:151.85ms step:489/1480 train_time:72736ms step_avg:151.85ms step:490/1480 train_time:72889ms step_avg:151.85ms step:491/1480 train_time:73043ms step_avg:151.86ms step:492/1480 train_time:73196ms step_avg:151.86ms step:493/1480 train_time:73350ms step_avg:151.86ms step:494/1480 train_time:73502ms step_avg:151.86ms step:495/1480 train_time:73656ms 
step_avg:151.87ms step:496/1480 train_time:73810ms step_avg:151.87ms step:497/1480 train_time:73963ms step_avg:151.87ms step:498/1480 train_time:74116ms step_avg:151.88ms step:499/1480 train_time:74269ms step_avg:151.88ms step:500/1480 train_time:74422ms step_avg:151.88ms step:500/1480 val_loss:3.6853 train_time:74491ms step_avg:152.02ms step:501/1480 train_time:74583ms step_avg:151.90ms step:502/1480 train_time:74733ms step_avg:151.90ms step:503/1480 train_time:74887ms step_avg:151.90ms step:504/1480 train_time:75040ms step_avg:151.90ms step:505/1480 train_time:75193ms step_avg:151.90ms step:506/1480 train_time:75345ms step_avg:151.91ms step:507/1480 train_time:75497ms step_avg:151.91ms step:508/1480 train_time:75652ms step_avg:151.91ms step:509/1480 train_time:75805ms step_avg:151.91ms step:510/1480 train_time:75958ms step_avg:151.92ms step:511/1480 train_time:76112ms step_avg:151.92ms step:512/1480 train_time:76265ms step_avg:151.92ms step:513/1480 train_time:76418ms step_avg:151.93ms step:514/1480 train_time:76571ms step_avg:151.93ms step:515/1480 train_time:76725ms step_avg:151.93ms step:516/1480 train_time:76879ms step_avg:151.94ms step:517/1480 train_time:77033ms step_avg:151.94ms step:518/1480 train_time:77187ms step_avg:151.94ms step:519/1480 train_time:77340ms step_avg:151.94ms step:520/1480 train_time:77494ms step_avg:151.95ms step:521/1480 train_time:77646ms step_avg:151.95ms step:522/1480 train_time:77799ms step_avg:151.95ms step:523/1480 train_time:77953ms step_avg:151.95ms step:524/1480 train_time:78105ms step_avg:151.95ms step:525/1480 train_time:78257ms step_avg:151.96ms step:526/1480 train_time:78411ms step_avg:151.96ms step:527/1480 train_time:78563ms step_avg:151.96ms step:528/1480 train_time:78716ms step_avg:151.96ms step:529/1480 train_time:78871ms step_avg:151.97ms step:530/1480 train_time:79023ms step_avg:151.97ms step:531/1480 train_time:79177ms step_avg:151.97ms step:532/1480 train_time:79329ms step_avg:151.97ms step:533/1480 
train_time:79482ms step_avg:151.97ms step:534/1480 train_time:79634ms step_avg:151.97ms step:535/1480 train_time:79789ms step_avg:151.98ms step:536/1480 train_time:79943ms step_avg:151.98ms step:537/1480 train_time:80096ms step_avg:151.99ms step:538/1480 train_time:80250ms step_avg:151.99ms step:539/1480 train_time:80402ms step_avg:151.99ms step:540/1480 train_time:80555ms step_avg:151.99ms step:541/1480 train_time:80709ms step_avg:152.00ms step:542/1480 train_time:80861ms step_avg:151.99ms step:543/1480 train_time:81014ms step_avg:152.00ms step:544/1480 train_time:81168ms step_avg:152.00ms step:545/1480 train_time:81321ms step_avg:152.00ms step:546/1480 train_time:81474ms step_avg:152.00ms step:547/1480 train_time:81627ms step_avg:152.01ms step:548/1480 train_time:81779ms step_avg:152.01ms step:549/1480 train_time:81932ms step_avg:152.01ms step:550/1480 train_time:82087ms step_avg:152.01ms step:551/1480 train_time:82242ms step_avg:152.02ms step:552/1480 train_time:82397ms step_avg:152.02ms step:553/1480 train_time:82553ms step_avg:152.03ms step:554/1480 train_time:82708ms step_avg:152.04ms step:555/1480 train_time:82862ms step_avg:152.04ms step:556/1480 train_time:83016ms step_avg:152.04ms step:557/1480 train_time:83172ms step_avg:152.05ms step:558/1480 train_time:83327ms step_avg:152.06ms step:559/1480 train_time:83482ms step_avg:152.06ms step:560/1480 train_time:83636ms step_avg:152.07ms step:561/1480 train_time:83791ms step_avg:152.07ms step:562/1480 train_time:83945ms step_avg:152.07ms step:563/1480 train_time:84099ms step_avg:152.08ms step:564/1480 train_time:84255ms step_avg:152.09ms step:565/1480 train_time:84411ms step_avg:152.09ms step:566/1480 train_time:84567ms step_avg:152.10ms step:567/1480 train_time:84723ms step_avg:152.11ms step:568/1480 train_time:84877ms step_avg:152.11ms step:569/1480 train_time:85047ms step_avg:152.14ms step:570/1480 train_time:85188ms step_avg:152.12ms step:571/1480 train_time:85343ms step_avg:152.13ms step:572/1480 
train_time:85498ms step_avg:152.13ms step:573/1480 train_time:85653ms step_avg:152.14ms step:574/1480 train_time:85809ms step_avg:152.14ms step:575/1480 train_time:85964ms step_avg:152.15ms step:576/1480 train_time:86118ms step_avg:152.15ms step:577/1480 train_time:86273ms step_avg:152.16ms step:578/1480 train_time:86427ms step_avg:152.16ms step:579/1480 train_time:86581ms step_avg:152.16ms step:580/1480 train_time:86734ms step_avg:152.17ms step:581/1480 train_time:86890ms step_avg:152.17ms step:582/1480 train_time:87046ms step_avg:152.18ms step:583/1480 train_time:87200ms step_avg:152.18ms step:584/1480 train_time:87355ms step_avg:152.19ms step:585/1480 train_time:87511ms step_avg:152.19ms step:586/1480 train_time:87666ms step_avg:152.20ms step:587/1480 train_time:87821ms step_avg:152.20ms step:588/1480 train_time:87975ms step_avg:152.21ms step:589/1480 train_time:88129ms step_avg:152.21ms step:590/1480 train_time:88286ms step_avg:152.22ms step:591/1480 train_time:88441ms step_avg:152.22ms step:592/1480 train_time:88596ms step_avg:152.23ms step:593/1480 train_time:88751ms step_avg:152.23ms step:594/1480 train_time:88907ms step_avg:152.24ms step:595/1480 train_time:89065ms step_avg:152.25ms step:596/1480 train_time:89221ms step_avg:152.25ms step:597/1480 train_time:89375ms step_avg:152.26ms step:598/1480 train_time:89529ms step_avg:152.26ms step:599/1480 train_time:89685ms step_avg:152.27ms step:600/1480 train_time:89839ms step_avg:152.27ms step:601/1480 train_time:89994ms step_avg:152.27ms step:602/1480 train_time:90148ms step_avg:152.28ms step:603/1480 train_time:90303ms step_avg:152.28ms step:604/1480 train_time:90459ms step_avg:152.29ms step:605/1480 train_time:90615ms step_avg:152.29ms step:606/1480 train_time:90771ms step_avg:152.30ms step:607/1480 train_time:90925ms step_avg:152.30ms step:608/1480 train_time:91081ms step_avg:152.31ms step:609/1480 train_time:91236ms step_avg:152.31ms step:610/1480 train_time:91392ms step_avg:152.32ms step:611/1480 
train_time:91547ms step_avg:152.32ms step:612/1480 train_time:91701ms step_avg:152.33ms step:613/1480 train_time:91856ms step_avg:152.33ms step:614/1480 train_time:92011ms step_avg:152.34ms step:615/1480 train_time:92166ms step_avg:152.34ms step:616/1480 train_time:92320ms step_avg:152.34ms step:617/1480 train_time:92475ms step_avg:152.35ms step:618/1480 train_time:92629ms step_avg:152.35ms step:619/1480 train_time:92784ms step_avg:152.36ms step:620/1480 train_time:92939ms step_avg:152.36ms step:621/1480 train_time:93094ms step_avg:152.36ms step:622/1480 train_time:93250ms step_avg:152.37ms step:623/1480 train_time:93405ms step_avg:152.37ms step:624/1480 train_time:93559ms step_avg:152.38ms step:625/1480 train_time:93714ms step_avg:152.38ms step:625/1480 val_loss:3.6035 train_time:93786ms step_avg:152.50ms step:626/1480 train_time:93877ms step_avg:152.40ms step:627/1480 train_time:94032ms step_avg:152.40ms step:628/1480 train_time:94188ms step_avg:152.41ms step:629/1480 train_time:94342ms step_avg:152.41ms step:630/1480 train_time:94497ms step_avg:152.42ms step:631/1480 train_time:94651ms step_avg:152.42ms step:632/1480 train_time:94806ms step_avg:152.42ms step:633/1480 train_time:94962ms step_avg:152.43ms step:634/1480 train_time:95117ms step_avg:152.43ms step:635/1480 train_time:95273ms step_avg:152.44ms step:636/1480 train_time:95428ms step_avg:152.44ms step:637/1480 train_time:95583ms step_avg:152.45ms step:638/1480 train_time:95738ms step_avg:152.45ms step:639/1480 train_time:95892ms step_avg:152.45ms step:640/1480 train_time:96047ms step_avg:152.46ms step:641/1480 train_time:96201ms step_avg:152.46ms step:642/1480 train_time:96356ms step_avg:152.46ms step:643/1480 train_time:96512ms step_avg:152.47ms step:644/1480 train_time:96668ms step_avg:152.47ms step:645/1480 train_time:96823ms step_avg:152.48ms step:646/1480 train_time:96980ms step_avg:152.48ms step:647/1480 train_time:97135ms step_avg:152.49ms step:648/1480 train_time:97290ms step_avg:152.49ms 
step:649/1480 train_time:97445ms step_avg:152.50ms step:650/1480 train_time:97599ms step_avg:152.50ms step:651/1480 train_time:97756ms step_avg:152.51ms step:652/1480 train_time:97911ms step_avg:152.51ms step:653/1480 train_time:98066ms step_avg:152.51ms step:654/1480 train_time:98221ms step_avg:152.52ms step:655/1480 train_time:98375ms step_avg:152.52ms step:656/1480 train_time:98530ms step_avg:152.52ms step:657/1480 train_time:98685ms step_avg:152.53ms step:658/1480 train_time:98841ms step_avg:152.53ms step:659/1480 train_time:98995ms step_avg:152.54ms step:660/1480 train_time:99151ms step_avg:152.54ms step:661/1480 train_time:99310ms step_avg:152.55ms step:662/1480 train_time:99467ms step_avg:152.56ms step:663/1480 train_time:99621ms step_avg:152.56ms step:664/1480 train_time:99778ms step_avg:152.57ms step:665/1480 train_time:99935ms step_avg:152.57ms step:666/1480 train_time:100091ms step_avg:152.58ms step:667/1480 train_time:100249ms step_avg:152.59ms step:668/1480 train_time:100407ms step_avg:152.59ms step:669/1480 train_time:100565ms step_avg:152.60ms step:670/1480 train_time:100720ms step_avg:152.61ms step:671/1480 train_time:100877ms step_avg:152.61ms step:672/1480 train_time:101032ms step_avg:152.62ms step:673/1480 train_time:101188ms step_avg:152.62ms step:674/1480 train_time:101345ms step_avg:152.63ms step:675/1480 train_time:101501ms step_avg:152.63ms step:676/1480 train_time:101659ms step_avg:152.64ms step:677/1480 train_time:101816ms step_avg:152.65ms step:678/1480 train_time:101972ms step_avg:152.65ms step:679/1480 train_time:102128ms step_avg:152.66ms step:680/1480 train_time:102286ms step_avg:152.67ms step:681/1480 train_time:102441ms step_avg:152.67ms step:682/1480 train_time:102599ms step_avg:152.68ms step:683/1480 train_time:102755ms step_avg:152.68ms step:684/1480 train_time:102913ms step_avg:152.69ms step:685/1480 train_time:103069ms step_avg:152.70ms step:686/1480 train_time:103225ms step_avg:152.70ms step:687/1480 train_time:103382ms 
step_avg:152.71ms step:688/1480 train_time:103540ms step_avg:152.71ms step:689/1480 train_time:103697ms step_avg:152.72ms step:690/1480 train_time:103855ms step_avg:152.73ms step:691/1480 train_time:104012ms step_avg:152.73ms step:692/1480 train_time:104169ms step_avg:152.74ms step:693/1480 train_time:104325ms step_avg:152.75ms step:694/1480 train_time:104482ms step_avg:152.75ms step:695/1480 train_time:104639ms step_avg:152.76ms step:696/1480 train_time:104794ms step_avg:152.76ms step:697/1480 train_time:104950ms step_avg:152.77ms step:698/1480 train_time:105107ms step_avg:152.77ms step:699/1480 train_time:105263ms step_avg:152.78ms step:700/1480 train_time:105420ms step_avg:152.78ms step:701/1480 train_time:105575ms step_avg:152.79ms step:702/1480 train_time:105730ms step_avg:152.79ms step:703/1480 train_time:105887ms step_avg:152.80ms step:704/1480 train_time:106043ms step_avg:152.80ms step:705/1480 train_time:106199ms step_avg:152.80ms step:706/1480 train_time:106358ms step_avg:152.81ms step:707/1480 train_time:106514ms step_avg:152.82ms step:708/1480 train_time:106671ms step_avg:152.82ms step:709/1480 train_time:106826ms step_avg:152.83ms step:710/1480 train_time:106983ms step_avg:152.83ms step:711/1480 train_time:107139ms step_avg:152.84ms step:712/1480 train_time:107295ms step_avg:152.84ms step:713/1480 train_time:107452ms step_avg:152.85ms step:714/1480 train_time:107609ms step_avg:152.85ms step:715/1480 train_time:107765ms step_avg:152.86ms step:716/1480 train_time:107920ms step_avg:152.86ms step:717/1480 train_time:108076ms step_avg:152.87ms step:718/1480 train_time:108231ms step_avg:152.87ms step:719/1480 train_time:108387ms step_avg:152.87ms step:720/1480 train_time:108545ms step_avg:152.88ms step:721/1480 train_time:108702ms step_avg:152.89ms step:722/1480 train_time:108858ms step_avg:152.89ms step:723/1480 train_time:109014ms step_avg:152.90ms step:724/1480 train_time:109170ms step_avg:152.90ms step:725/1480 train_time:109326ms step_avg:152.90ms 
step:726/1480 train_time:109484ms step_avg:152.91ms step:727/1480 train_time:109641ms step_avg:152.92ms step:728/1480 train_time:109798ms step_avg:152.92ms step:729/1480 train_time:109954ms step_avg:152.93ms step:730/1480 train_time:110112ms step_avg:152.93ms step:731/1480 train_time:110269ms step_avg:152.94ms step:732/1480 train_time:110424ms step_avg:152.94ms step:733/1480 train_time:110582ms step_avg:152.95ms step:734/1480 train_time:110740ms step_avg:152.96ms step:735/1480 train_time:110896ms step_avg:152.96ms step:736/1480 train_time:111051ms step_avg:152.96ms step:737/1480 train_time:111207ms step_avg:152.97ms step:738/1480 train_time:111363ms step_avg:152.97ms step:739/1480 train_time:111519ms step_avg:152.98ms step:740/1480 train_time:111678ms step_avg:152.98ms step:741/1480 train_time:111836ms step_avg:152.99ms step:742/1480 train_time:111992ms step_avg:152.99ms step:743/1480 train_time:112148ms step_avg:153.00ms step:744/1480 train_time:112305ms step_avg:153.00ms step:745/1480 train_time:112463ms step_avg:153.01ms step:746/1480 train_time:112620ms step_avg:153.02ms step:747/1480 train_time:112776ms step_avg:153.02ms step:748/1480 train_time:112936ms step_avg:153.03ms step:749/1480 train_time:113092ms step_avg:153.03ms step:750/1480 train_time:113247ms step_avg:153.04ms step:750/1480 val_loss:3.5485 train_time:113319ms step_avg:153.13ms step:751/1480 train_time:113409ms step_avg:153.05ms step:752/1480 train_time:113567ms step_avg:153.05ms step:753/1480 train_time:113723ms step_avg:153.06ms step:754/1480 train_time:113879ms step_avg:153.06ms step:755/1480 train_time:114035ms step_avg:153.07ms step:756/1480 train_time:114192ms step_avg:153.07ms step:757/1480 train_time:114349ms step_avg:153.08ms step:758/1480 train_time:114507ms step_avg:153.08ms step:759/1480 train_time:114680ms step_avg:153.11ms step:760/1480 train_time:114822ms step_avg:153.10ms step:761/1480 train_time:114978ms step_avg:153.10ms step:762/1480 train_time:115135ms step_avg:153.11ms 
step:763/1480 train_time:115292ms step_avg:153.11ms step:764/1480 train_time:115449ms step_avg:153.12ms step:765/1480 train_time:115606ms step_avg:153.12ms step:766/1480 train_time:115764ms step_avg:153.13ms step:767/1480 train_time:115920ms step_avg:153.13ms step:768/1480 train_time:116077ms step_avg:153.14ms step:769/1480 train_time:116234ms step_avg:153.14ms step:770/1480 train_time:116392ms step_avg:153.15ms step:771/1480 train_time:116550ms step_avg:153.15ms step:772/1480 train_time:116710ms step_avg:153.16ms step:773/1480 train_time:116868ms step_avg:153.17ms step:774/1480 train_time:117026ms step_avg:153.18ms step:775/1480 train_time:117185ms step_avg:153.18ms step:776/1480 train_time:117343ms step_avg:153.19ms step:777/1480 train_time:117504ms step_avg:153.20ms step:778/1480 train_time:117663ms step_avg:153.21ms step:779/1480 train_time:117820ms step_avg:153.21ms step:780/1480 train_time:117979ms step_avg:153.22ms step:781/1480 train_time:118136ms step_avg:153.22ms step:782/1480 train_time:118293ms step_avg:153.23ms step:783/1480 train_time:118451ms step_avg:153.24ms step:784/1480 train_time:118609ms step_avg:153.24ms step:785/1480 train_time:118767ms step_avg:153.25ms step:786/1480 train_time:118926ms step_avg:153.25ms step:787/1480 train_time:119084ms step_avg:153.26ms step:788/1480 train_time:119244ms step_avg:153.27ms step:789/1480 train_time:119401ms step_avg:153.27ms step:790/1480 train_time:119558ms step_avg:153.28ms step:791/1480 train_time:119718ms step_avg:153.29ms step:792/1480 train_time:119876ms step_avg:153.29ms step:793/1480 train_time:120032ms step_avg:153.30ms step:794/1480 train_time:120191ms step_avg:153.31ms step:795/1480 train_time:120353ms step_avg:153.32ms step:796/1480 train_time:120512ms step_avg:153.32ms step:797/1480 train_time:120671ms step_avg:153.33ms step:798/1480 train_time:120829ms step_avg:153.34ms step:799/1480 train_time:120990ms step_avg:153.35ms step:800/1480 train_time:121149ms step_avg:153.35ms step:801/1480 
train_time:121306ms step_avg:153.36ms step:802/1480 train_time:121465ms step_avg:153.37ms step:803/1480 train_time:121625ms step_avg:153.37ms step:804/1480 train_time:121782ms step_avg:153.38ms step:805/1480 train_time:121941ms step_avg:153.39ms step:806/1480 train_time:122097ms step_avg:153.39ms step:807/1480 train_time:122255ms step_avg:153.39ms step:808/1480 train_time:122412ms step_avg:153.40ms step:809/1480 train_time:122569ms step_avg:153.40ms step:810/1480 train_time:122727ms step_avg:153.41ms step:811/1480 train_time:122883ms step_avg:153.41ms step:812/1480 train_time:123041ms step_avg:153.42ms step:813/1480 train_time:123197ms step_avg:153.42ms step:814/1480 train_time:123356ms step_avg:153.43ms step:815/1480 train_time:123511ms step_avg:153.43ms step:816/1480 train_time:123672ms step_avg:153.44ms step:817/1480 train_time:123828ms step_avg:153.44ms step:818/1480 train_time:123985ms step_avg:153.45ms step:819/1480 train_time:124145ms step_avg:153.45ms step:820/1480 train_time:124303ms step_avg:153.46ms step:821/1480 train_time:124461ms step_avg:153.47ms step:822/1480 train_time:124618ms step_avg:153.47ms step:823/1480 train_time:124776ms step_avg:153.48ms step:824/1480 train_time:124933ms step_avg:153.48ms step:825/1480 train_time:125093ms step_avg:153.49ms step:826/1480 train_time:125251ms step_avg:153.49ms step:827/1480 train_time:125410ms step_avg:153.50ms step:828/1480 train_time:125570ms step_avg:153.51ms step:829/1480 train_time:125730ms step_avg:153.52ms step:830/1480 train_time:125891ms step_avg:153.53ms step:831/1480 train_time:126050ms step_avg:153.53ms step:832/1480 train_time:126207ms step_avg:153.54ms step:833/1480 train_time:126365ms step_avg:153.54ms step:834/1480 train_time:126523ms step_avg:153.55ms step:835/1480 train_time:126679ms step_avg:153.55ms step:836/1480 train_time:126838ms step_avg:153.56ms step:837/1480 train_time:126995ms step_avg:153.56ms step:838/1480 train_time:127154ms step_avg:153.57ms step:839/1480 train_time:127311ms 
step_avg:153.57ms step:840/1480 train_time:127469ms step_avg:153.58ms step:841/1480 train_time:127628ms step_avg:153.58ms step:842/1480 train_time:127786ms step_avg:153.59ms step:843/1480 train_time:127943ms step_avg:153.59ms step:844/1480 train_time:128098ms step_avg:153.60ms step:845/1480 train_time:128257ms step_avg:153.60ms step:846/1480 train_time:128415ms step_avg:153.61ms step:847/1480 train_time:128574ms step_avg:153.61ms step:848/1480 train_time:128732ms step_avg:153.62ms step:849/1480 train_time:128889ms step_avg:153.62ms step:850/1480 train_time:129048ms step_avg:153.63ms step:851/1480 train_time:129207ms step_avg:153.63ms step:852/1480 train_time:129366ms step_avg:153.64ms step:853/1480 train_time:129524ms step_avg:153.65ms step:854/1480 train_time:129682ms step_avg:153.65ms step:855/1480 train_time:129839ms step_avg:153.66ms step:856/1480 train_time:129996ms step_avg:153.66ms step:857/1480 train_time:130154ms step_avg:153.67ms step:858/1480 train_time:130314ms step_avg:153.67ms step:859/1480 train_time:130473ms step_avg:153.68ms step:860/1480 train_time:130631ms step_avg:153.68ms step:861/1480 train_time:130790ms step_avg:153.69ms step:862/1480 train_time:130952ms step_avg:153.70ms step:863/1480 train_time:131112ms step_avg:153.71ms step:864/1480 train_time:131271ms step_avg:153.71ms step:865/1480 train_time:131429ms step_avg:153.72ms step:866/1480 train_time:131588ms step_avg:153.72ms step:867/1480 train_time:131747ms step_avg:153.73ms step:868/1480 train_time:131905ms step_avg:153.74ms step:869/1480 train_time:132063ms step_avg:153.74ms step:870/1480 train_time:132220ms step_avg:153.74ms step:871/1480 train_time:132376ms step_avg:153.75ms step:872/1480 train_time:132535ms step_avg:153.75ms step:873/1480 train_time:132693ms step_avg:153.76ms step:874/1480 train_time:132852ms step_avg:153.76ms step:875/1480 train_time:133011ms step_avg:153.77ms step:875/1480 val_loss:3.5027 train_time:133083ms step_avg:153.85ms step:876/1480 train_time:133175ms 
step_avg:153.78ms step:877/1480 train_time:133331ms step_avg:153.78ms step:878/1480 train_time:133488ms step_avg:153.79ms step:879/1480 train_time:133647ms step_avg:153.79ms step:880/1480 train_time:133805ms step_avg:153.80ms step:881/1480 train_time:133963ms step_avg:153.80ms step:882/1480 train_time:134121ms step_avg:153.81ms step:883/1480 train_time:134281ms step_avg:153.82ms step:884/1480 train_time:134445ms step_avg:153.83ms step:885/1480 train_time:134606ms step_avg:153.84ms step:886/1480 train_time:134766ms step_avg:153.84ms step:887/1480 train_time:134926ms step_avg:153.85ms step:888/1480 train_time:135089ms step_avg:153.86ms step:889/1480 train_time:135252ms step_avg:153.87ms step:890/1480 train_time:135409ms step_avg:153.87ms step:891/1480 train_time:135568ms step_avg:153.88ms step:892/1480 train_time:135728ms step_avg:153.89ms step:893/1480 train_time:135885ms step_avg:153.89ms step:894/1480 train_time:136045ms step_avg:153.90ms step:895/1480 train_time:136206ms step_avg:153.91ms step:896/1480 train_time:136364ms step_avg:153.91ms step:897/1480 train_time:136526ms step_avg:153.92ms step:898/1480 train_time:136686ms step_avg:153.93ms step:899/1480 train_time:136847ms step_avg:153.93ms step:900/1480 train_time:137006ms step_avg:153.94ms step:901/1480 train_time:137165ms step_avg:153.94ms step:902/1480 train_time:137323ms step_avg:153.95ms step:903/1480 train_time:137483ms step_avg:153.96ms step:904/1480 train_time:137643ms step_avg:153.96ms step:905/1480 train_time:137803ms step_avg:153.97ms step:906/1480 train_time:137962ms step_avg:153.98ms step:907/1480 train_time:138125ms step_avg:153.99ms step:908/1480 train_time:138283ms step_avg:153.99ms step:909/1480 train_time:138442ms step_avg:154.00ms step:910/1480 train_time:138606ms step_avg:154.01ms step:911/1480 train_time:138766ms step_avg:154.01ms step:912/1480 train_time:138927ms step_avg:154.02ms step:913/1480 train_time:139088ms step_avg:154.03ms step:914/1480 train_time:139248ms step_avg:154.04ms 
step:915/1480 train_time:139410ms step_avg:154.04ms step:916/1480 train_time:139569ms step_avg:154.05ms step:917/1480 train_time:139728ms step_avg:154.05ms step:918/1480 train_time:139889ms step_avg:154.06ms step:919/1480 train_time:140051ms step_avg:154.07ms step:920/1480 train_time:140210ms step_avg:154.08ms step:921/1480 train_time:140369ms step_avg:154.08ms step:922/1480 train_time:140530ms step_avg:154.09ms step:923/1480 train_time:140688ms step_avg:154.09ms step:924/1480 train_time:140848ms step_avg:154.10ms step:925/1480 train_time:141009ms step_avg:154.11ms step:926/1480 train_time:141168ms step_avg:154.11ms step:927/1480 train_time:141327ms step_avg:154.12ms step:928/1480 train_time:141485ms step_avg:154.12ms step:929/1480 train_time:141646ms step_avg:154.13ms step:930/1480 train_time:141806ms step_avg:154.14ms step:931/1480 train_time:141966ms step_avg:154.14ms step:932/1480 train_time:142126ms step_avg:154.15ms step:933/1480 train_time:142286ms step_avg:154.16ms step:934/1480 train_time:142446ms step_avg:154.16ms step:935/1480 train_time:142606ms step_avg:154.17ms step:936/1480 train_time:142765ms step_avg:154.17ms step:937/1480 train_time:142927ms step_avg:154.18ms step:938/1480 train_time:143085ms step_avg:154.19ms step:939/1480 train_time:143248ms step_avg:154.20ms step:940/1480 train_time:143410ms step_avg:154.20ms step:941/1480 train_time:143568ms step_avg:154.21ms step:942/1480 train_time:143726ms step_avg:154.21ms step:943/1480 train_time:143886ms step_avg:154.22ms step:944/1480 train_time:144049ms step_avg:154.23ms step:945/1480 train_time:144208ms step_avg:154.23ms step:946/1480 train_time:144371ms step_avg:154.24ms step:947/1480 train_time:144531ms step_avg:154.25ms step:948/1480 train_time:144690ms step_avg:154.25ms step:949/1480 train_time:144864ms step_avg:154.27ms step:950/1480 train_time:145010ms step_avg:154.27ms step:951/1480 train_time:145172ms step_avg:154.27ms step:952/1480 train_time:145330ms step_avg:154.28ms step:953/1480 
train_time:145491ms step_avg:154.29ms step:954/1480 train_time:145653ms step_avg:154.29ms step:955/1480 train_time:145811ms step_avg:154.30ms step:956/1480 train_time:145969ms step_avg:154.30ms step:957/1480 train_time:146131ms step_avg:154.31ms step:958/1480 train_time:146293ms step_avg:154.32ms step:959/1480 train_time:146452ms step_avg:154.32ms step:960/1480 train_time:146613ms step_avg:154.33ms step:961/1480 train_time:146772ms step_avg:154.33ms step:962/1480 train_time:146931ms step_avg:154.34ms step:963/1480 train_time:147090ms step_avg:154.34ms step:964/1480 train_time:147251ms step_avg:154.35ms step:965/1480 train_time:147410ms step_avg:154.36ms step:966/1480 train_time:147570ms step_avg:154.36ms step:967/1480 train_time:147727ms step_avg:154.36ms step:968/1480 train_time:147886ms step_avg:154.37ms step:969/1480 train_time:148047ms step_avg:154.38ms step:970/1480 train_time:148207ms step_avg:154.38ms step:971/1480 train_time:148366ms step_avg:154.39ms step:972/1480 train_time:148526ms step_avg:154.39ms step:973/1480 train_time:148684ms step_avg:154.40ms step:974/1480 train_time:148846ms step_avg:154.40ms step:975/1480 train_time:149007ms step_avg:154.41ms step:976/1480 train_time:149168ms step_avg:154.42ms step:977/1480 train_time:149327ms step_avg:154.42ms step:978/1480 train_time:149486ms step_avg:154.43ms step:979/1480 train_time:149648ms step_avg:154.44ms step:980/1480 train_time:149808ms step_avg:154.44ms step:981/1480 train_time:149967ms step_avg:154.45ms step:982/1480 train_time:150127ms step_avg:154.45ms step:983/1480 train_time:150287ms step_avg:154.46ms step:984/1480 train_time:150446ms step_avg:154.46ms step:985/1480 train_time:150610ms step_avg:154.47ms step:986/1480 train_time:150769ms step_avg:154.48ms step:987/1480 train_time:150928ms step_avg:154.48ms step:988/1480 train_time:151085ms step_avg:154.48ms step:989/1480 train_time:151245ms step_avg:154.49ms step:990/1480 train_time:151407ms step_avg:154.50ms step:991/1480 train_time:151569ms 
step_avg:154.50ms step:992/1480 train_time:151734ms step_avg:154.52ms step:993/1480 train_time:151902ms step_avg:154.53ms step:994/1480 train_time:152062ms step_avg:154.53ms step:995/1480 train_time:152221ms step_avg:154.54ms step:996/1480 train_time:152379ms step_avg:154.54ms step:997/1480 train_time:152539ms step_avg:154.55ms step:998/1480 train_time:152698ms step_avg:154.55ms step:999/1480 train_time:152859ms step_avg:154.56ms step:1000/1480 train_time:153022ms step_avg:154.57ms step:1000/1480 val_loss:3.4381 train_time:153097ms step_avg:154.64ms step:1001/1480 train_time:153195ms step_avg:154.59ms step:1002/1480 train_time:153349ms step_avg:154.59ms step:1003/1480 train_time:153512ms step_avg:154.59ms step:1004/1480 train_time:153675ms step_avg:154.60ms step:1005/1480 train_time:153834ms step_avg:154.61ms step:1006/1480 train_time:153995ms step_avg:154.61ms step:1007/1480 train_time:154154ms step_avg:154.62ms step:1008/1480 train_time:154314ms step_avg:154.62ms step:1009/1480 train_time:154481ms step_avg:154.64ms step:1010/1480 train_time:154639ms step_avg:154.64ms step:1011/1480 train_time:154799ms step_avg:154.64ms step:1012/1480 train_time:154956ms step_avg:154.65ms step:1013/1480 train_time:155118ms step_avg:154.65ms step:1014/1480 train_time:155278ms step_avg:154.66ms step:1015/1480 train_time:155440ms step_avg:154.67ms step:1016/1480 train_time:155599ms step_avg:154.67ms step:1017/1480 train_time:155760ms step_avg:154.68ms step:1018/1480 train_time:155919ms step_avg:154.68ms step:1019/1480 train_time:156082ms step_avg:154.69ms step:1020/1480 train_time:156242ms step_avg:154.70ms step:1021/1480 train_time:156402ms step_avg:154.70ms step:1022/1480 train_time:156561ms step_avg:154.70ms step:1023/1480 train_time:156723ms step_avg:154.71ms step:1024/1480 train_time:156883ms step_avg:154.72ms step:1025/1480 train_time:157046ms step_avg:154.72ms step:1026/1480 train_time:157207ms step_avg:154.73ms step:1027/1480 train_time:157368ms step_avg:154.74ms 
step:1028/1480 train_time:157530ms step_avg:154.74ms step:1029/1480 train_time:157694ms step_avg:154.75ms step:1030/1480 train_time:157854ms step_avg:154.76ms step:1031/1480 train_time:158013ms step_avg:154.76ms step:1032/1480 train_time:158176ms step_avg:154.77ms step:1033/1480 train_time:158337ms step_avg:154.78ms step:1034/1480 train_time:158498ms step_avg:154.78ms step:1035/1480 train_time:158658ms step_avg:154.79ms step:1036/1480 train_time:158818ms step_avg:154.79ms step:1037/1480 train_time:158980ms step_avg:154.80ms step:1038/1480 train_time:159142ms step_avg:154.81ms step:1039/1480 train_time:159306ms step_avg:154.82ms step:1040/1480 train_time:159466ms step_avg:154.82ms step:1041/1480 train_time:159628ms step_avg:154.83ms step:1042/1480 train_time:159787ms step_avg:154.83ms step:1043/1480 train_time:159946ms step_avg:154.84ms step:1044/1480 train_time:160106ms step_avg:154.84ms step:1045/1480 train_time:160266ms step_avg:154.85ms step:1046/1480 train_time:160425ms step_avg:154.85ms step:1047/1480 train_time:160586ms step_avg:154.86ms step:1048/1480 train_time:160748ms step_avg:154.86ms step:1049/1480 train_time:160909ms step_avg:154.87ms step:1050/1480 train_time:161071ms step_avg:154.88ms step:1051/1480 train_time:161232ms step_avg:154.88ms step:1052/1480 train_time:161393ms step_avg:154.89ms step:1053/1480 train_time:161552ms step_avg:154.89ms step:1054/1480 train_time:161715ms step_avg:154.90ms step:1055/1480 train_time:161874ms step_avg:154.90ms step:1056/1480 train_time:162036ms step_avg:154.91ms step:1057/1480 train_time:162196ms step_avg:154.92ms step:1058/1480 train_time:162358ms step_avg:154.92ms step:1059/1480 train_time:162520ms step_avg:154.93ms step:1060/1480 train_time:162683ms step_avg:154.94ms step:1061/1480 train_time:162842ms step_avg:154.94ms step:1062/1480 train_time:163000ms step_avg:154.94ms step:1063/1480 train_time:163158ms step_avg:154.95ms step:1064/1480 train_time:163316ms step_avg:154.95ms step:1065/1480 train_time:163480ms 
step_avg:154.96ms step:1066/1480 train_time:163641ms step_avg:154.96ms step:1067/1480 train_time:163807ms step_avg:154.97ms step:1068/1480 train_time:163968ms step_avg:154.98ms step:1069/1480 train_time:164133ms step_avg:154.99ms step:1070/1480 train_time:164292ms step_avg:154.99ms step:1071/1480 train_time:164455ms step_avg:155.00ms step:1072/1480 train_time:164614ms step_avg:155.00ms step:1073/1480 train_time:164772ms step_avg:155.01ms step:1074/1480 train_time:164932ms step_avg:155.01ms step:1075/1480 train_time:165094ms step_avg:155.02ms step:1076/1480 train_time:165254ms step_avg:155.02ms step:1077/1480 train_time:165414ms step_avg:155.03ms step:1078/1480 train_time:165579ms step_avg:155.04ms step:1079/1480 train_time:165742ms step_avg:155.04ms step:1080/1480 train_time:165903ms step_avg:155.05ms step:1081/1480 train_time:166065ms step_avg:155.06ms step:1082/1480 train_time:166225ms step_avg:155.06ms step:1083/1480 train_time:166386ms step_avg:155.07ms step:1084/1480 train_time:166548ms step_avg:155.07ms step:1085/1480 train_time:166709ms step_avg:155.08ms step:1086/1480 train_time:166870ms step_avg:155.08ms step:1087/1480 train_time:167031ms step_avg:155.09ms step:1088/1480 train_time:167192ms step_avg:155.09ms step:1089/1480 train_time:167355ms step_avg:155.10ms step:1090/1480 train_time:167518ms step_avg:155.11ms step:1091/1480 train_time:167678ms step_avg:155.11ms step:1092/1480 train_time:167837ms step_avg:155.12ms step:1093/1480 train_time:167998ms step_avg:155.12ms step:1094/1480 train_time:168158ms step_avg:155.13ms step:1095/1480 train_time:168317ms step_avg:155.13ms step:1096/1480 train_time:168480ms step_avg:155.14ms step:1097/1480 train_time:168642ms step_avg:155.14ms step:1098/1480 train_time:168805ms step_avg:155.15ms step:1099/1480 train_time:168967ms step_avg:155.16ms step:1100/1480 train_time:169130ms step_avg:155.17ms step:1101/1480 train_time:169295ms step_avg:155.17ms step:1102/1480 train_time:169456ms step_avg:155.18ms step:1103/1480 
train_time:169621ms step_avg:155.19ms step:1104/1480 train_time:169783ms step_avg:155.20ms step:1105/1480 train_time:169947ms step_avg:155.20ms step:1106/1480 train_time:170110ms step_avg:155.21ms step:1107/1480 train_time:170272ms step_avg:155.22ms step:1108/1480 train_time:170431ms step_avg:155.22ms step:1109/1480 train_time:170592ms step_avg:155.22ms step:1110/1480 train_time:170752ms step_avg:155.23ms step:1111/1480 train_time:170916ms step_avg:155.24ms step:1112/1480 train_time:171077ms step_avg:155.24ms step:1113/1480 train_time:171244ms step_avg:155.25ms step:1114/1480 train_time:171407ms step_avg:155.26ms step:1115/1480 train_time:171569ms step_avg:155.27ms step:1116/1480 train_time:171730ms step_avg:155.27ms step:1117/1480 train_time:171894ms step_avg:155.28ms step:1118/1480 train_time:172058ms step_avg:155.29ms step:1119/1480 train_time:172218ms step_avg:155.29ms step:1120/1480 train_time:172379ms step_avg:155.30ms step:1121/1480 train_time:172541ms step_avg:155.30ms step:1122/1480 train_time:172701ms step_avg:155.31ms step:1123/1480 train_time:172861ms step_avg:155.31ms step:1124/1480 train_time:173025ms step_avg:155.32ms step:1125/1480 train_time:173188ms step_avg:155.33ms step:1125/1480 val_loss:3.3824 train_time:173264ms step_avg:155.39ms step:1126/1480 train_time:173359ms step_avg:155.34ms step:1127/1480 train_time:173515ms step_avg:155.34ms step:1128/1480 train_time:173674ms step_avg:155.34ms step:1129/1480 train_time:173838ms step_avg:155.35ms step:1130/1480 train_time:173999ms step_avg:155.36ms step:1131/1480 train_time:174166ms step_avg:155.37ms step:1132/1480 train_time:174325ms step_avg:155.37ms step:1133/1480 train_time:174487ms step_avg:155.38ms step:1134/1480 train_time:174650ms step_avg:155.38ms step:1135/1480 train_time:174811ms step_avg:155.39ms step:1136/1480 train_time:174974ms step_avg:155.39ms step:1137/1480 train_time:175136ms step_avg:155.40ms step:1138/1480 train_time:175301ms step_avg:155.41ms step:1139/1480 train_time:175475ms 
step_avg:155.43ms step:1140/1480 train_time:175624ms step_avg:155.42ms step:1141/1480 train_time:175787ms step_avg:155.43ms step:1142/1480 train_time:175947ms step_avg:155.43ms step:1143/1480 train_time:176111ms step_avg:155.44ms step:1144/1480 train_time:176273ms step_avg:155.44ms step:1145/1480 train_time:176432ms step_avg:155.45ms step:1146/1480 train_time:176595ms step_avg:155.45ms step:1147/1480 train_time:176758ms step_avg:155.46ms step:1148/1480 train_time:176919ms step_avg:155.46ms step:1149/1480 train_time:177081ms step_avg:155.47ms step:1150/1480 train_time:177242ms step_avg:155.48ms step:1151/1480 train_time:177404ms step_avg:155.48ms step:1152/1480 train_time:177568ms step_avg:155.49ms step:1153/1480 train_time:177732ms step_avg:155.50ms step:1154/1480 train_time:177894ms step_avg:155.50ms step:1155/1480 train_time:178056ms step_avg:155.51ms step:1156/1480 train_time:178224ms step_avg:155.52ms step:1157/1480 train_time:178387ms step_avg:155.52ms step:1158/1480 train_time:178548ms step_avg:155.53ms step:1159/1480 train_time:178707ms step_avg:155.53ms step:1160/1480 train_time:178867ms step_avg:155.54ms step:1161/1480 train_time:179029ms step_avg:155.54ms step:1162/1480 train_time:179190ms step_avg:155.55ms step:1163/1480 train_time:179355ms step_avg:155.55ms step:1164/1480 train_time:179517ms step_avg:155.56ms step:1165/1480 train_time:179677ms step_avg:155.56ms step:1166/1480 train_time:179841ms step_avg:155.57ms step:1167/1480 train_time:180001ms step_avg:155.58ms step:1168/1480 train_time:180163ms step_avg:155.58ms step:1169/1480 train_time:180324ms step_avg:155.59ms step:1170/1480 train_time:180484ms step_avg:155.59ms step:1171/1480 train_time:180647ms step_avg:155.60ms step:1172/1480 train_time:180807ms step_avg:155.60ms step:1173/1480 train_time:180969ms step_avg:155.61ms step:1174/1480 train_time:181137ms step_avg:155.62ms step:1175/1480 train_time:181299ms step_avg:155.62ms step:1176/1480 train_time:181464ms step_avg:155.63ms step:1177/1480 
train_time:181629ms step_avg:155.64ms step:1178/1480 train_time:181791ms step_avg:155.64ms step:1179/1480 train_time:181951ms step_avg:155.65ms step:1180/1480 train_time:182121ms step_avg:155.66ms step:1181/1480 train_time:182284ms step_avg:155.67ms step:1182/1480 train_time:182444ms step_avg:155.67ms step:1183/1480 train_time:182606ms step_avg:155.67ms step:1184/1480 train_time:182768ms step_avg:155.68ms step:1185/1480 train_time:182931ms step_avg:155.69ms step:1186/1480 train_time:183094ms step_avg:155.69ms step:1187/1480 train_time:183267ms step_avg:155.71ms step:1188/1480 train_time:183426ms step_avg:155.71ms step:1189/1480 train_time:183588ms step_avg:155.71ms step:1190/1480 train_time:183750ms step_avg:155.72ms step:1191/1480 train_time:183911ms step_avg:155.73ms step:1192/1480 train_time:184071ms step_avg:155.73ms step:1193/1480 train_time:184231ms step_avg:155.73ms step:1194/1480 train_time:184393ms step_avg:155.74ms step:1195/1480 train_time:184557ms step_avg:155.74ms step:1196/1480 train_time:184727ms step_avg:155.76ms step:1197/1480 train_time:184889ms step_avg:155.76ms step:1198/1480 train_time:185058ms step_avg:155.77ms step:1199/1480 train_time:185222ms step_avg:155.78ms step:1200/1480 train_time:185383ms step_avg:155.78ms step:1201/1480 train_time:185544ms step_avg:155.79ms step:1202/1480 train_time:185712ms step_avg:155.80ms step:1203/1480 train_time:185879ms step_avg:155.81ms step:1204/1480 train_time:186044ms step_avg:155.82ms step:1205/1480 train_time:186205ms step_avg:155.82ms step:1206/1480 train_time:186365ms step_avg:155.82ms step:1207/1480 train_time:186526ms step_avg:155.83ms step:1208/1480 train_time:186687ms step_avg:155.83ms step:1209/1480 train_time:186851ms step_avg:155.84ms step:1210/1480 train_time:187017ms step_avg:155.85ms step:1211/1480 train_time:187181ms step_avg:155.85ms step:1212/1480 train_time:187344ms step_avg:155.86ms step:1213/1480 train_time:187507ms step_avg:155.87ms step:1214/1480 train_time:187672ms step_avg:155.87ms 
step:1215/1480 train_time:187838ms step_avg:155.88ms step:1216/1480 train_time:187999ms step_avg:155.89ms step:1217/1480 train_time:188163ms step_avg:155.89ms step:1218/1480 train_time:188325ms step_avg:155.90ms step:1219/1480 train_time:188491ms step_avg:155.91ms step:1220/1480 train_time:188654ms step_avg:155.91ms step:1221/1480 train_time:188815ms step_avg:155.92ms step:1222/1480 train_time:188976ms step_avg:155.92ms step:1223/1480 train_time:189140ms step_avg:155.93ms step:1224/1480 train_time:189305ms step_avg:155.94ms step:1225/1480 train_time:189468ms step_avg:155.94ms step:1226/1480 train_time:189633ms step_avg:155.95ms step:1227/1480 train_time:189798ms step_avg:155.96ms step:1228/1480 train_time:189962ms step_avg:155.96ms step:1229/1480 train_time:190124ms step_avg:155.97ms step:1230/1480 train_time:190293ms step_avg:155.98ms step:1231/1480 train_time:190459ms step_avg:155.99ms step:1232/1480 train_time:190625ms step_avg:155.99ms step:1233/1480 train_time:190785ms step_avg:156.00ms step:1234/1480 train_time:190947ms step_avg:156.00ms step:1235/1480 train_time:191113ms step_avg:156.01ms step:1236/1480 train_time:191276ms step_avg:156.02ms step:1237/1480 train_time:191438ms step_avg:156.02ms step:1238/1480 train_time:191609ms step_avg:156.03ms step:1239/1480 train_time:191770ms step_avg:156.04ms step:1240/1480 train_time:191934ms step_avg:156.04ms step:1241/1480 train_time:192099ms step_avg:156.05ms step:1242/1480 train_time:192262ms step_avg:156.06ms step:1243/1480 train_time:192426ms step_avg:156.06ms step:1244/1480 train_time:192586ms step_avg:156.07ms step:1245/1480 train_time:192748ms step_avg:156.07ms step:1246/1480 train_time:192911ms step_avg:156.08ms step:1247/1480 train_time:193073ms step_avg:156.08ms step:1248/1480 train_time:193235ms step_avg:156.09ms step:1249/1480 train_time:193398ms step_avg:156.09ms step:1250/1480 train_time:193561ms step_avg:156.10ms step:1250/1480 val_loss:3.3325 train_time:193636ms step_avg:156.16ms step:1251/1480 
train_time:193730ms step_avg:156.11ms step:1252/1480 train_time:193893ms step_avg:156.11ms step:1253/1480 train_time:194054ms step_avg:156.12ms step:1254/1480 train_time:194216ms step_avg:156.12ms step:1255/1480 train_time:194386ms step_avg:156.13ms step:1256/1480 train_time:194551ms step_avg:156.14ms step:1257/1480 train_time:194714ms step_avg:156.15ms step:1258/1480 train_time:194879ms step_avg:156.15ms step:1259/1480 train_time:195042ms step_avg:156.16ms step:1260/1480 train_time:195203ms step_avg:156.16ms step:1261/1480 train_time:195366ms step_avg:156.17ms step:1262/1480 train_time:195531ms step_avg:156.18ms step:1263/1480 train_time:195696ms step_avg:156.18ms step:1264/1480 train_time:195855ms step_avg:156.18ms step:1265/1480 train_time:196016ms step_avg:156.19ms step:1266/1480 train_time:196178ms step_avg:156.19ms step:1267/1480 train_time:196339ms step_avg:156.20ms step:1268/1480 train_time:196504ms step_avg:156.20ms step:1269/1480 train_time:196671ms step_avg:156.21ms step:1270/1480 train_time:196833ms step_avg:156.22ms step:1271/1480 train_time:196996ms step_avg:156.22ms step:1272/1480 train_time:197157ms step_avg:156.23ms step:1273/1480 train_time:197319ms step_avg:156.23ms step:1274/1480 train_time:197484ms step_avg:156.24ms step:1275/1480 train_time:197646ms step_avg:156.24ms step:1276/1480 train_time:197807ms step_avg:156.25ms step:1277/1480 train_time:197969ms step_avg:156.25ms step:1278/1480 train_time:198130ms step_avg:156.25ms step:1279/1480 train_time:198292ms step_avg:156.26ms step:1280/1480 train_time:198459ms step_avg:156.27ms step:1281/1480 train_time:198619ms step_avg:156.27ms step:1282/1480 train_time:198778ms step_avg:156.27ms step:1283/1480 train_time:198941ms step_avg:156.28ms step:1284/1480 train_time:199105ms step_avg:156.28ms step:1285/1480 train_time:199270ms step_avg:156.29ms step:1286/1480 train_time:199432ms step_avg:156.30ms step:1287/1480 train_time:199595ms step_avg:156.30ms step:1288/1480 train_time:199757ms step_avg:156.30ms 
step:1289/1480 train_time:199927ms step_avg:156.31ms step:1290/1480 train_time:200095ms step_avg:156.32ms step:1291/1480 train_time:200258ms step_avg:156.33ms step:1292/1480 train_time:200423ms step_avg:156.34ms step:1293/1480 train_time:200591ms step_avg:156.35ms step:1294/1480 train_time:200753ms step_avg:156.35ms step:1295/1480 train_time:200915ms step_avg:156.35ms step:1296/1480 train_time:201078ms step_avg:156.36ms step:1297/1480 train_time:201241ms step_avg:156.36ms step:1298/1480 train_time:201404ms step_avg:156.37ms step:1299/1480 train_time:201568ms step_avg:156.38ms step:1300/1480 train_time:201729ms step_avg:156.38ms step:1301/1480 train_time:201891ms step_avg:156.38ms step:1302/1480 train_time:202057ms step_avg:156.39ms step:1303/1480 train_time:202225ms step_avg:156.40ms step:1304/1480 train_time:202391ms step_avg:156.41ms step:1305/1480 train_time:202552ms step_avg:156.41ms step:1306/1480 train_time:202717ms step_avg:156.42ms step:1307/1480 train_time:202878ms step_avg:156.42ms step:1308/1480 train_time:203038ms step_avg:156.42ms step:1309/1480 train_time:203206ms step_avg:156.43ms step:1310/1480 train_time:203369ms step_avg:156.44ms step:1311/1480 train_time:203531ms step_avg:156.44ms step:1312/1480 train_time:203697ms step_avg:156.45ms step:1313/1480 train_time:203859ms step_avg:156.45ms step:1314/1480 train_time:204022ms step_avg:156.46ms step:1315/1480 train_time:204185ms step_avg:156.46ms step:1316/1480 train_time:204345ms step_avg:156.47ms step:1317/1480 train_time:204507ms step_avg:156.47ms step:1318/1480 train_time:204675ms step_avg:156.48ms step:1319/1480 train_time:204840ms step_avg:156.49ms step:1320/1480 train_time:205010ms step_avg:156.50ms step:1321/1480 train_time:205174ms step_avg:156.50ms step:1322/1480 train_time:205344ms step_avg:156.51ms step:1323/1480 train_time:205509ms step_avg:156.52ms step:1324/1480 train_time:205673ms step_avg:156.52ms step:1325/1480 train_time:205841ms step_avg:156.53ms step:1326/1480 train_time:206007ms 
step_avg:156.54ms step:1327/1480 train_time:206171ms step_avg:156.55ms step:1328/1480 train_time:206334ms step_avg:156.55ms step:1329/1480 train_time:206522ms step_avg:156.57ms step:1330/1480 train_time:206682ms step_avg:156.58ms step:1331/1480 train_time:206846ms step_avg:156.58ms step:1332/1480 train_time:207009ms step_avg:156.59ms step:1333/1480 train_time:207175ms step_avg:156.59ms step:1334/1480 train_time:207337ms step_avg:156.60ms step:1335/1480 train_time:207497ms step_avg:156.60ms step:1336/1480 train_time:207666ms step_avg:156.61ms step:1337/1480 train_time:207833ms step_avg:156.62ms step:1338/1480 train_time:207997ms step_avg:156.62ms step:1339/1480 train_time:208160ms step_avg:156.63ms step:1340/1480 train_time:208323ms step_avg:156.63ms step:1341/1480 train_time:208484ms step_avg:156.64ms step:1342/1480 train_time:208650ms step_avg:156.64ms step:1343/1480 train_time:208814ms step_avg:156.65ms step:1344/1480 train_time:208975ms step_avg:156.65ms step:1345/1480 train_time:209143ms step_avg:156.66ms step:1346/1480 train_time:209304ms step_avg:156.67ms step:1347/1480 train_time:209467ms step_avg:156.67ms step:1348/1480 train_time:209632ms step_avg:156.68ms step:1349/1480 train_time:209795ms step_avg:156.68ms step:1350/1480 train_time:209960ms step_avg:156.69ms step:1351/1480 train_time:210121ms step_avg:156.69ms step:1352/1480 train_time:210285ms step_avg:156.69ms step:1353/1480 train_time:210450ms step_avg:156.70ms step:1354/1480 train_time:210615ms step_avg:156.71ms step:1355/1480 train_time:210777ms step_avg:156.71ms step:1356/1480 train_time:210940ms step_avg:156.72ms step:1357/1480 train_time:211104ms step_avg:156.72ms step:1358/1480 train_time:211269ms step_avg:156.73ms step:1359/1480 train_time:211433ms step_avg:156.73ms step:1360/1480 train_time:211598ms step_avg:156.74ms step:1361/1480 train_time:211765ms step_avg:156.75ms step:1362/1480 train_time:211931ms step_avg:156.75ms step:1363/1480 train_time:212099ms step_avg:156.76ms step:1364/1480 
train_time:212261ms step_avg:156.77ms step:1365/1480 train_time:212420ms step_avg:156.77ms step:1366/1480 train_time:212585ms step_avg:156.77ms step:1367/1480 train_time:212748ms step_avg:156.78ms step:1368/1480 train_time:212913ms step_avg:156.78ms step:1369/1480 train_time:213081ms step_avg:156.79ms step:1370/1480 train_time:213248ms step_avg:156.80ms step:1371/1480 train_time:213413ms step_avg:156.81ms step:1372/1480 train_time:213579ms step_avg:156.81ms step:1373/1480 train_time:213740ms step_avg:156.82ms step:1374/1480 train_time:213909ms step_avg:156.82ms step:1375/1480 train_time:214073ms step_avg:156.83ms step:1375/1480 val_loss:3.2944 train_time:214147ms step_avg:156.88ms step:1376/1480 train_time:214241ms step_avg:156.84ms step:1377/1480 train_time:214400ms step_avg:156.84ms step:1378/1480 train_time:214561ms step_avg:156.84ms step:1379/1480 train_time:214727ms step_avg:156.85ms step:1380/1480 train_time:214891ms step_avg:156.85ms step:1381/1480 train_time:215058ms step_avg:156.86ms step:1382/1480 train_time:215221ms step_avg:156.87ms step:1383/1480 train_time:215385ms step_avg:156.87ms step:1384/1480 train_time:215552ms step_avg:156.88ms step:1385/1480 train_time:215713ms step_avg:156.88ms step:1386/1480 train_time:215878ms step_avg:156.89ms step:1387/1480 train_time:216044ms step_avg:156.89ms step:1388/1480 train_time:216204ms step_avg:156.90ms step:1389/1480 train_time:216371ms step_avg:156.90ms step:1390/1480 train_time:216533ms step_avg:156.91ms step:1391/1480 train_time:216694ms step_avg:156.91ms step:1392/1480 train_time:216858ms step_avg:156.92ms step:1393/1480 train_time:217020ms step_avg:156.92ms step:1394/1480 train_time:217186ms step_avg:156.93ms step:1395/1480 train_time:217349ms step_avg:156.93ms step:1396/1480 train_time:217513ms step_avg:156.94ms step:1397/1480 train_time:217674ms step_avg:156.94ms step:1398/1480 train_time:217835ms step_avg:156.94ms step:1399/1480 train_time:217995ms step_avg:156.94ms step:1400/1480 train_time:218163ms 
step_avg:156.95ms step:1401/1480 train_time:218324ms step_avg:156.95ms step:1402/1480 train_time:218488ms step_avg:156.96ms step:1403/1480 train_time:218655ms step_avg:156.97ms step:1404/1480 train_time:218818ms step_avg:156.97ms step:1405/1480 train_time:218985ms step_avg:156.98ms step:1406/1480 train_time:219151ms step_avg:156.99ms step:1407/1480 train_time:219313ms step_avg:156.99ms step:1408/1480 train_time:219474ms step_avg:156.99ms step:1409/1480 train_time:219647ms step_avg:157.00ms step:1410/1480 train_time:219810ms step_avg:157.01ms step:1411/1480 train_time:219970ms step_avg:157.01ms step:1412/1480 train_time:220133ms step_avg:157.01ms step:1413/1480 train_time:220296ms step_avg:157.02ms step:1414/1480 train_time:220460ms step_avg:157.02ms step:1415/1480 train_time:220624ms step_avg:157.03ms step:1416/1480 train_time:220797ms step_avg:157.04ms step:1417/1480 train_time:220960ms step_avg:157.04ms step:1418/1480 train_time:221126ms step_avg:157.05ms step:1419/1480 train_time:221291ms step_avg:157.06ms step:1420/1480 train_time:221457ms step_avg:157.06ms step:1421/1480 train_time:221620ms step_avg:157.07ms step:1422/1480 train_time:221785ms step_avg:157.07ms step:1423/1480 train_time:221947ms step_avg:157.08ms step:1424/1480 train_time:222115ms step_avg:157.08ms step:1425/1480 train_time:222285ms step_avg:157.09ms step:1426/1480 train_time:222449ms step_avg:157.10ms step:1427/1480 train_time:222614ms step_avg:157.10ms step:1428/1480 train_time:222777ms step_avg:157.11ms step:1429/1480 train_time:222938ms step_avg:157.11ms step:1430/1480 train_time:223101ms step_avg:157.11ms step:1431/1480 train_time:223267ms step_avg:157.12ms step:1432/1480 train_time:223436ms step_avg:157.13ms step:1433/1480 train_time:223603ms step_avg:157.14ms step:1434/1480 train_time:223773ms step_avg:157.14ms step:1435/1480 train_time:223939ms step_avg:157.15ms step:1436/1480 train_time:224102ms step_avg:157.15ms step:1437/1480 train_time:224263ms step_avg:157.16ms step:1438/1480 
train_time:224425ms step_avg:157.16ms step:1439/1480 train_time:224592ms step_avg:157.17ms step:1440/1480 train_time:224754ms step_avg:157.17ms step:1441/1480 train_time:224919ms step_avg:157.18ms step:1442/1480 train_time:225085ms step_avg:157.18ms step:1443/1480 train_time:225259ms step_avg:157.19ms step:1444/1480 train_time:225421ms step_avg:157.20ms step:1445/1480 train_time:225584ms step_avg:157.20ms step:1446/1480 train_time:225750ms step_avg:157.21ms step:1447/1480 train_time:225918ms step_avg:157.21ms step:1448/1480 train_time:226081ms step_avg:157.22ms step:1449/1480 train_time:226245ms step_avg:157.22ms step:1450/1480 train_time:226410ms step_avg:157.23ms step:1451/1480 train_time:226573ms step_avg:157.23ms step:1452/1480 train_time:226739ms step_avg:157.24ms step:1453/1480 train_time:226900ms step_avg:157.24ms step:1454/1480 train_time:227062ms step_avg:157.25ms step:1455/1480 train_time:227233ms step_avg:157.25ms step:1456/1480 train_time:227395ms step_avg:157.26ms step:1457/1480 train_time:227559ms step_avg:157.26ms step:1458/1480 train_time:227723ms step_avg:157.27ms step:1459/1480 train_time:227889ms step_avg:157.27ms step:1460/1480 train_time:228054ms step_avg:157.28ms step:1461/1480 train_time:228218ms step_avg:157.28ms step:1462/1480 train_time:228382ms step_avg:157.29ms step:1463/1480 train_time:228549ms step_avg:157.29ms step:1464/1480 train_time:228715ms step_avg:157.30ms step:1465/1480 train_time:228878ms step_avg:157.30ms step:1466/1480 train_time:229039ms step_avg:157.31ms step:1467/1480 train_time:229204ms step_avg:157.31ms step:1468/1480 train_time:229367ms step_avg:157.32ms step:1469/1480 train_time:229530ms step_avg:157.32ms step:1470/1480 train_time:229698ms step_avg:157.33ms step:1471/1480 train_time:229870ms step_avg:157.34ms step:1472/1480 train_time:230040ms step_avg:157.35ms step:1473/1480 train_time:230203ms step_avg:157.35ms step:1474/1480 train_time:230371ms step_avg:157.36ms step:1475/1480 train_time:230542ms step_avg:157.37ms 
step:1476/1480 train_time:230704ms step_avg:157.37ms step:1477/1480 train_time:230871ms step_avg:157.38ms step:1478/1480 train_time:231040ms step_avg:157.38ms step:1479/1480 train_time:231206ms step_avg:157.39ms step:1480/1480 train_time:231368ms step_avg:157.39ms step:1480/1480 val_loss:3.2756 train_time:231445ms step_avg:157.45ms peak memory consumption: 34239 MiB