import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Args:
        G: 2D tensor to orthogonalize (asserted to be 2D).
        steps: number of Newton-Schulz iterations to run.
        eps: added to the norm before dividing, to avoid division by zero.

    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # work on the wide orientation so that X @ X.T is the smaller of the two Gram matrices
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose so the result matches the shape of G
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.

    Notes:
        The constructor reads the `WORLD_SIZE` and `RANK` environment variables, and `step()`
        uses `torch.distributed.all_gather`, so a process group must already be initialized.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct element count, so that every parameter in a group
        # fits the group's flat all_gather buffers (one buffer per rank)
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are processed in chunks of `world_size`: this rank
        orthogonalizes the update of `params[base_i + rank]`, the flattened updates of the chunk
        are exchanged with an asynchronous all_gather, and the gathered updates of the previous
        chunk are applied before the next gather is launched.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # apply the updates gathered for the previous chunk (no-op before the first gather)
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # scale the step by sqrt(rows / cols) for tall matrices
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                # note: the nesterov branch modifies p.grad in place
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # the buffers are about to be overwritten, so consume the previous chunk first
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            # apply the last chunk of this group
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """Apply RMS normalization over the last dimension of x (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype in forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding.

    The cos/sin tables are cached and recomputed only when the sequence length
    (`x.shape[1]`) changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        """Rotate the two halves of the last dimension of a 4D x; dim 1 is the sequence axis."""
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # learnable mixing weights between this layer's values and the token value embedding
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        """
        Args:
            x: input of shape (1, T, dim); the batch size is asserted to be 1.
            vi: value embedding, reshaped with `view_as` to match this layer's values.
            block_mask: FlexAttention BlockMask passed through to `flex_attention`.

        Returns:
            Tensor with the same shape as x.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of x with the initial embedding x0, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # initialized so that the block starts out using x only (weight 1 on x, 0 on x0)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-value embedding tables, returned mirrored so that twelve entries are produced."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        """Return [e0, ..., e5, e5, ..., e0], where ei is the i-th embedding applied to inputs."""
        ve = [emb(inputs) for emb in self.embed]
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections; `forward` returns the training loss."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the cross-entropy loss for one flat token sequence.

        Args:
            inputs: 1D tensor of token ids; its length must be a multiple of BLOCK_SIZE (128).
            targets: target token ids, flattened with `view(-1)` for the loss.
            sliding_window_num_blocks: scalar tensor, attention window size in units of blocks.

        Returns:
            Scalar loss tensor.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # number of attention blocks in this sequence (512 for the default 64*1024 tokens);
        # derived from the input instead of being hard-coded so other sequence lengths work
        total_num_blocks = inputs.size(0) // BLOCK_SIZE
        # document id of every position: running count of token 50256
        # (presumably the GPT-2 end-of-text id -- confirm against the tokenizer)
        docs = (inputs == 50256).cumsum(0)
        # first / last document id within every block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # element-level mask: causal and restricted to the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # convert a dense boolean block mask into (count, ordered indices) per query block;
            # the stable descending argsort puts the selected kv blocks first, in index order
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_num_blocks, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # block pairs whose document-id ranges overlap
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            # block pairs that both lie entirely inside one and the same document
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm  = causal_full_bm & window_full_bm & document_full_bm
            # blocks that are non-empty but not full are passed as the partial kv blocks
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Read only the header of a .bin shard, validate it, and return the claimed token count."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Load `num_tokens` uint16 tokens from a shard into a pinned-memory tensor, skipping the header."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the 256 x int32 header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over token shards, handing each process its own slice of every global batch.

    Each call to `next_batch` returns `seq_len` input tokens and the corresponding
    next-token targets for this process, then advances by `seq_len * num_processes`.
    """

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) CUDA tensors for this process and advance the read position."""
        batch_size = self.seq_len * self.num_processes
        # one extra token so that targets can be the inputs shifted by one
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. 
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 07:37:28 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 125W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 115W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 28C P0 111W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 117W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29387ms step_avg:nanms step:2/1480 train_time:29583ms step_avg:nanms step:3/1480 train_time:29708ms step_avg:nanms step:4/1480 train_time:29849ms step_avg:nanms step:5/1480 train_time:29990ms step_avg:nanms step:6/1480 train_time:30133ms step_avg:nanms step:7/1480 train_time:30273ms step_avg:nanms step:8/1480 train_time:30414ms step_avg:nanms step:9/1480 train_time:30558ms step_avg:nanms step:10/1480 train_time:30703ms step_avg:nanms step:11/1480 train_time:145ms step_avg:nanms step:12/1480 train_time:285ms step_avg:nanms step:13/1480 train_time:426ms step_avg:141.94ms step:14/1480 train_time:568ms step_avg:142.01ms step:15/1480 train_time:710ms step_avg:141.95ms step:16/1480 train_time:852ms step_avg:142.01ms step:17/1480 train_time:997ms step_avg:142.49ms step:18/1480 train_time:1141ms step_avg:142.66ms step:19/1480 train_time:1285ms step_avg:142.79ms step:20/1480 train_time:1427ms step_avg:142.66ms step:21/1480 train_time:1569ms step_avg:142.60ms step:22/1480 train_time:1712ms step_avg:142.68ms step:23/1480 train_time:1858ms step_avg:142.91ms step:24/1480 train_time:2000ms step_avg:142.83ms step:25/1480 train_time:2142ms step_avg:142.78ms step:26/1480 train_time:2285ms step_avg:142.79ms step:27/1480 train_time:2427ms step_avg:142.76ms step:28/1480 train_time:2569ms step_avg:142.70ms 
step:29/1480 train_time:2711ms step_avg:142.67ms step:30/1480 train_time:3231ms step_avg:161.53ms step:31/1480 train_time:3330ms step_avg:158.55ms step:32/1480 train_time:3473ms step_avg:157.85ms step:33/1480 train_time:3614ms step_avg:157.12ms step:34/1480 train_time:3756ms step_avg:156.52ms step:35/1480 train_time:3900ms step_avg:155.99ms step:36/1480 train_time:4042ms step_avg:155.45ms step:37/1480 train_time:4186ms step_avg:155.04ms step:38/1480 train_time:4333ms step_avg:154.73ms step:39/1480 train_time:4477ms step_avg:154.39ms step:40/1480 train_time:4622ms step_avg:154.07ms step:41/1480 train_time:4764ms step_avg:153.68ms step:42/1480 train_time:4907ms step_avg:153.35ms step:43/1480 train_time:5049ms step_avg:153.00ms step:44/1480 train_time:5193ms step_avg:152.73ms step:45/1480 train_time:5336ms step_avg:152.46ms step:46/1480 train_time:5481ms step_avg:152.24ms step:47/1480 train_time:5622ms step_avg:151.95ms step:48/1480 train_time:5766ms step_avg:151.73ms step:49/1480 train_time:5908ms step_avg:151.49ms step:50/1480 train_time:6050ms step_avg:151.24ms step:51/1480 train_time:6193ms step_avg:151.04ms step:52/1480 train_time:6336ms step_avg:150.85ms step:53/1480 train_time:6478ms step_avg:150.66ms step:54/1480 train_time:6622ms step_avg:150.50ms step:55/1480 train_time:6765ms step_avg:150.33ms step:56/1480 train_time:6906ms step_avg:150.12ms step:57/1480 train_time:7047ms step_avg:149.94ms step:58/1480 train_time:7190ms step_avg:149.79ms step:59/1480 train_time:7334ms step_avg:149.67ms step:60/1480 train_time:7477ms step_avg:149.54ms step:61/1480 train_time:7620ms step_avg:149.41ms step:62/1480 train_time:7762ms step_avg:149.28ms step:63/1480 train_time:7905ms step_avg:149.15ms step:64/1480 train_time:8046ms step_avg:149.00ms step:65/1480 train_time:8188ms step_avg:148.87ms step:66/1480 train_time:8329ms step_avg:148.74ms step:67/1480 train_time:8472ms step_avg:148.62ms step:68/1480 train_time:8613ms step_avg:148.51ms step:69/1480 train_time:8756ms 
step_avg:148.41ms step:70/1480 train_time:8899ms step_avg:148.32ms step:71/1480 train_time:9041ms step_avg:148.22ms step:72/1480 train_time:9184ms step_avg:148.13ms step:73/1480 train_time:9325ms step_avg:148.02ms step:74/1480 train_time:9468ms step_avg:147.93ms step:75/1480 train_time:9611ms step_avg:147.86ms step:76/1480 train_time:9753ms step_avg:147.78ms step:77/1480 train_time:9897ms step_avg:147.71ms step:78/1480 train_time:10040ms step_avg:147.64ms step:79/1480 train_time:10183ms step_avg:147.59ms step:80/1480 train_time:10326ms step_avg:147.51ms step:81/1480 train_time:10468ms step_avg:147.44ms step:82/1480 train_time:10609ms step_avg:147.35ms step:83/1480 train_time:10751ms step_avg:147.28ms step:84/1480 train_time:10896ms step_avg:147.24ms step:85/1480 train_time:11039ms step_avg:147.19ms step:86/1480 train_time:11183ms step_avg:147.15ms step:87/1480 train_time:11325ms step_avg:147.08ms step:88/1480 train_time:11468ms step_avg:147.03ms step:89/1480 train_time:11611ms step_avg:146.98ms step:90/1480 train_time:11757ms step_avg:146.96ms step:91/1480 train_time:11900ms step_avg:146.92ms step:92/1480 train_time:12042ms step_avg:146.86ms step:93/1480 train_time:12184ms step_avg:146.80ms step:94/1480 train_time:12327ms step_avg:146.75ms step:95/1480 train_time:12468ms step_avg:146.69ms step:96/1480 train_time:12610ms step_avg:146.63ms step:97/1480 train_time:12752ms step_avg:146.57ms step:98/1480 train_time:12895ms step_avg:146.54ms step:99/1480 train_time:13039ms step_avg:146.51ms step:100/1480 train_time:13182ms step_avg:146.47ms step:101/1480 train_time:13328ms step_avg:146.46ms step:102/1480 train_time:13467ms step_avg:146.38ms step:103/1480 train_time:13608ms step_avg:146.32ms step:104/1480 train_time:13751ms step_avg:146.29ms step:105/1480 train_time:13895ms step_avg:146.27ms step:106/1480 train_time:14038ms step_avg:146.23ms step:107/1480 train_time:14179ms step_avg:146.18ms step:108/1480 train_time:14323ms step_avg:146.15ms step:109/1480 
train_time:14465ms step_avg:146.11ms step:110/1480 train_time:14606ms step_avg:146.06ms step:111/1480 train_time:14750ms step_avg:146.04ms step:112/1480 train_time:14895ms step_avg:146.03ms step:113/1480 train_time:15041ms step_avg:146.03ms step:114/1480 train_time:15186ms step_avg:146.02ms step:115/1480 train_time:15331ms step_avg:146.01ms step:116/1480 train_time:15477ms step_avg:146.01ms step:117/1480 train_time:15624ms step_avg:146.02ms step:118/1480 train_time:15768ms step_avg:146.00ms step:119/1480 train_time:15913ms step_avg:145.99ms step:120/1480 train_time:16060ms step_avg:146.00ms step:121/1480 train_time:16205ms step_avg:145.99ms step:122/1480 train_time:16350ms step_avg:145.99ms step:123/1480 train_time:16497ms step_avg:145.99ms step:124/1480 train_time:16643ms step_avg:145.99ms step:125/1480 train_time:16790ms step_avg:146.00ms step:125/1480 val_loss:4.4262 train_time:16855ms step_avg:146.56ms step:126/1480 train_time:16949ms step_avg:146.11ms step:127/1480 train_time:17091ms step_avg:146.08ms step:128/1480 train_time:17237ms step_avg:146.08ms step:129/1480 train_time:17382ms step_avg:146.06ms step:130/1480 train_time:17527ms step_avg:146.05ms step:131/1480 train_time:17672ms step_avg:146.05ms step:132/1480 train_time:17818ms step_avg:146.05ms step:133/1480 train_time:17962ms step_avg:146.03ms step:134/1480 train_time:18109ms step_avg:146.04ms step:135/1480 train_time:18257ms step_avg:146.06ms step:136/1480 train_time:18402ms step_avg:146.04ms step:137/1480 train_time:18547ms step_avg:146.04ms step:138/1480 train_time:18694ms step_avg:146.04ms step:139/1480 train_time:18839ms step_avg:146.04ms step:140/1480 train_time:18984ms step_avg:146.03ms step:141/1480 train_time:19131ms step_avg:146.04ms step:142/1480 train_time:19277ms step_avg:146.04ms step:143/1480 train_time:19422ms step_avg:146.03ms step:144/1480 train_time:19567ms step_avg:146.02ms step:145/1480 train_time:19713ms step_avg:146.02ms step:146/1480 train_time:19859ms step_avg:146.02ms 
step:147/1480 train_time:20004ms step_avg:146.02ms step:148/1480 train_time:20150ms step_avg:146.02ms step:149/1480 train_time:20296ms step_avg:146.02ms step:150/1480 train_time:20442ms step_avg:146.01ms step:151/1480 train_time:20589ms step_avg:146.02ms step:152/1480 train_time:20736ms step_avg:146.03ms step:153/1480 train_time:20880ms step_avg:146.01ms step:154/1480 train_time:21025ms step_avg:146.01ms step:155/1480 train_time:21171ms step_avg:146.01ms step:156/1480 train_time:21317ms step_avg:146.01ms step:157/1480 train_time:21461ms step_avg:146.00ms step:158/1480 train_time:21606ms step_avg:145.99ms step:159/1480 train_time:21753ms step_avg:145.99ms step:160/1480 train_time:21899ms step_avg:145.99ms step:161/1480 train_time:22045ms step_avg:145.99ms step:162/1480 train_time:22191ms step_avg:146.00ms step:163/1480 train_time:22338ms step_avg:146.00ms step:164/1480 train_time:22483ms step_avg:146.00ms step:165/1480 train_time:22630ms step_avg:146.00ms step:166/1480 train_time:22776ms step_avg:146.00ms step:167/1480 train_time:22921ms step_avg:145.99ms step:168/1480 train_time:23066ms step_avg:145.99ms step:169/1480 train_time:23213ms step_avg:145.99ms step:170/1480 train_time:23359ms step_avg:145.99ms step:171/1480 train_time:23504ms step_avg:145.99ms step:172/1480 train_time:23650ms step_avg:145.99ms step:173/1480 train_time:23797ms step_avg:145.99ms step:174/1480 train_time:23942ms step_avg:145.99ms step:175/1480 train_time:24089ms step_avg:145.99ms step:176/1480 train_time:24236ms step_avg:146.00ms step:177/1480 train_time:24380ms step_avg:145.99ms step:178/1480 train_time:24526ms step_avg:145.99ms step:179/1480 train_time:24672ms step_avg:145.99ms step:180/1480 train_time:24818ms step_avg:145.99ms step:181/1480 train_time:24963ms step_avg:145.98ms step:182/1480 train_time:25108ms step_avg:145.98ms step:183/1480 train_time:25256ms step_avg:145.99ms step:184/1480 train_time:25401ms step_avg:145.98ms step:185/1480 train_time:25547ms step_avg:145.99ms 
step:186/1480 train_time:25694ms step_avg:145.99ms step:187/1480 train_time:25840ms step_avg:145.99ms step:188/1480 train_time:25985ms step_avg:145.98ms step:189/1480 train_time:26147ms step_avg:146.08ms step:190/1480 train_time:26277ms step_avg:145.99ms step:191/1480 train_time:26423ms step_avg:145.98ms step:192/1480 train_time:26568ms step_avg:145.98ms step:193/1480 train_time:26714ms step_avg:145.98ms step:194/1480 train_time:26859ms step_avg:145.97ms step:195/1480 train_time:27004ms step_avg:145.97ms step:196/1480 train_time:27150ms step_avg:145.97ms step:197/1480 train_time:27296ms step_avg:145.97ms step:198/1480 train_time:27443ms step_avg:145.97ms step:199/1480 train_time:27590ms step_avg:145.98ms step:200/1480 train_time:27737ms step_avg:145.99ms step:201/1480 train_time:27885ms step_avg:145.99ms step:202/1480 train_time:28028ms step_avg:145.98ms step:203/1480 train_time:28175ms step_avg:145.98ms step:204/1480 train_time:28321ms step_avg:145.98ms step:205/1480 train_time:28465ms step_avg:145.97ms step:206/1480 train_time:28611ms step_avg:145.98ms step:207/1480 train_time:28758ms step_avg:145.98ms step:208/1480 train_time:28903ms step_avg:145.97ms step:209/1480 train_time:29048ms step_avg:145.97ms step:210/1480 train_time:29194ms step_avg:145.97ms step:211/1480 train_time:29339ms step_avg:145.97ms step:212/1480 train_time:29484ms step_avg:145.96ms step:213/1480 train_time:29631ms step_avg:145.97ms step:214/1480 train_time:29778ms step_avg:145.97ms step:215/1480 train_time:29923ms step_avg:145.97ms step:216/1480 train_time:30069ms step_avg:145.97ms step:217/1480 train_time:30215ms step_avg:145.97ms step:218/1480 train_time:30360ms step_avg:145.96ms step:219/1480 train_time:30506ms step_avg:145.96ms step:220/1480 train_time:30652ms step_avg:145.96ms step:221/1480 train_time:31208ms step_avg:147.91ms step:222/1480 train_time:31319ms step_avg:147.73ms step:223/1480 train_time:31467ms step_avg:147.73ms step:224/1480 train_time:31616ms step_avg:147.74ms 
step:225/1480 train_time:31763ms step_avg:147.74ms step:226/1480 train_time:31911ms step_avg:147.74ms step:227/1480 train_time:32059ms step_avg:147.74ms step:228/1480 train_time:32206ms step_avg:147.74ms step:229/1480 train_time:32355ms step_avg:147.74ms step:230/1480 train_time:32503ms step_avg:147.74ms step:231/1480 train_time:32651ms step_avg:147.74ms step:232/1480 train_time:32799ms step_avg:147.75ms step:233/1480 train_time:32948ms step_avg:147.75ms step:234/1480 train_time:33096ms step_avg:147.75ms step:235/1480 train_time:33245ms step_avg:147.76ms step:236/1480 train_time:33394ms step_avg:147.76ms step:237/1480 train_time:33541ms step_avg:147.76ms step:238/1480 train_time:33690ms step_avg:147.76ms step:239/1480 train_time:33839ms step_avg:147.77ms step:240/1480 train_time:33986ms step_avg:147.77ms step:241/1480 train_time:34135ms step_avg:147.77ms step:242/1480 train_time:34282ms step_avg:147.77ms step:243/1480 train_time:34430ms step_avg:147.77ms step:244/1480 train_time:34579ms step_avg:147.77ms step:245/1480 train_time:34727ms step_avg:147.77ms step:246/1480 train_time:34875ms step_avg:147.78ms step:247/1480 train_time:35023ms step_avg:147.78ms step:248/1480 train_time:35171ms step_avg:147.78ms step:249/1480 train_time:35319ms step_avg:147.78ms step:250/1480 train_time:35467ms step_avg:147.78ms step:250/1480 val_loss:3.9961 train_time:35534ms step_avg:148.06ms step:251/1480 train_time:35625ms step_avg:147.82ms step:252/1480 train_time:35772ms step_avg:147.82ms step:253/1480 train_time:35920ms step_avg:147.82ms step:254/1480 train_time:36069ms step_avg:147.82ms step:255/1480 train_time:36216ms step_avg:147.82ms step:256/1480 train_time:36365ms step_avg:147.82ms step:257/1480 train_time:36513ms step_avg:147.83ms step:258/1480 train_time:36660ms step_avg:147.82ms step:259/1480 train_time:36809ms step_avg:147.83ms step:260/1480 train_time:36957ms step_avg:147.83ms step:261/1480 train_time:37105ms step_avg:147.83ms step:262/1480 train_time:37254ms 
step_avg:147.84ms step:263/1480 train_time:37402ms step_avg:147.83ms step:264/1480 train_time:37551ms step_avg:147.84ms step:265/1480 train_time:37698ms step_avg:147.84ms step:266/1480 train_time:37846ms step_avg:147.84ms step:267/1480 train_time:37995ms step_avg:147.84ms step:268/1480 train_time:38143ms step_avg:147.84ms step:269/1480 train_time:38292ms step_avg:147.85ms step:270/1480 train_time:38440ms step_avg:147.85ms step:271/1480 train_time:38590ms step_avg:147.85ms step:272/1480 train_time:38737ms step_avg:147.85ms step:273/1480 train_time:38886ms step_avg:147.85ms step:274/1480 train_time:39034ms step_avg:147.86ms step:275/1480 train_time:39183ms step_avg:147.86ms step:276/1480 train_time:39331ms step_avg:147.86ms step:277/1480 train_time:39481ms step_avg:147.87ms step:278/1480 train_time:39630ms step_avg:147.87ms step:279/1480 train_time:39778ms step_avg:147.87ms step:280/1480 train_time:39926ms step_avg:147.88ms step:281/1480 train_time:40076ms step_avg:147.88ms step:282/1480 train_time:40223ms step_avg:147.88ms step:283/1480 train_time:40372ms step_avg:147.88ms step:284/1480 train_time:40520ms step_avg:147.88ms step:285/1480 train_time:40669ms step_avg:147.89ms step:286/1480 train_time:40817ms step_avg:147.89ms step:287/1480 train_time:40965ms step_avg:147.89ms step:288/1480 train_time:41115ms step_avg:147.90ms step:289/1480 train_time:41263ms step_avg:147.90ms step:290/1480 train_time:41412ms step_avg:147.90ms step:291/1480 train_time:41559ms step_avg:147.90ms step:292/1480 train_time:41708ms step_avg:147.90ms step:293/1480 train_time:41856ms step_avg:147.90ms step:294/1480 train_time:42004ms step_avg:147.90ms step:295/1480 train_time:42153ms step_avg:147.90ms step:296/1480 train_time:42300ms step_avg:147.90ms step:297/1480 train_time:42450ms step_avg:147.91ms step:298/1480 train_time:42597ms step_avg:147.91ms step:299/1480 train_time:42746ms step_avg:147.91ms step:300/1480 train_time:42895ms step_avg:147.91ms step:301/1480 train_time:43043ms 
step_avg:147.91ms step:302/1480 train_time:43192ms step_avg:147.92ms step:303/1480 train_time:43340ms step_avg:147.92ms step:304/1480 train_time:43489ms step_avg:147.92ms step:305/1480 train_time:43637ms step_avg:147.92ms step:306/1480 train_time:43786ms step_avg:147.92ms step:307/1480 train_time:43934ms step_avg:147.93ms step:308/1480 train_time:44084ms step_avg:147.93ms step:309/1480 train_time:44232ms step_avg:147.93ms step:310/1480 train_time:44381ms step_avg:147.94ms step:311/1480 train_time:44530ms step_avg:147.94ms step:312/1480 train_time:44678ms step_avg:147.94ms step:313/1480 train_time:44828ms step_avg:147.95ms step:314/1480 train_time:44976ms step_avg:147.95ms step:315/1480 train_time:45124ms step_avg:147.95ms step:316/1480 train_time:45273ms step_avg:147.95ms step:317/1480 train_time:45421ms step_avg:147.95ms step:318/1480 train_time:45571ms step_avg:147.96ms step:319/1480 train_time:45718ms step_avg:147.96ms step:320/1480 train_time:45868ms step_avg:147.96ms step:321/1480 train_time:46017ms step_avg:147.96ms step:322/1480 train_time:46165ms step_avg:147.96ms step:323/1480 train_time:46314ms step_avg:147.97ms step:324/1480 train_time:46462ms step_avg:147.97ms step:325/1480 train_time:46612ms step_avg:147.97ms step:326/1480 train_time:46759ms step_avg:147.97ms step:327/1480 train_time:46908ms step_avg:147.97ms step:328/1480 train_time:47056ms step_avg:147.97ms step:329/1480 train_time:47203ms step_avg:147.97ms step:330/1480 train_time:47354ms step_avg:147.98ms step:331/1480 train_time:47503ms step_avg:147.99ms step:332/1480 train_time:47655ms step_avg:148.00ms step:333/1480 train_time:47804ms step_avg:148.00ms step:334/1480 train_time:47955ms step_avg:148.01ms step:335/1480 train_time:48104ms step_avg:148.01ms step:336/1480 train_time:48255ms step_avg:148.02ms step:337/1480 train_time:48406ms step_avg:148.03ms step:338/1480 train_time:48557ms step_avg:148.04ms step:339/1480 train_time:48707ms step_avg:148.05ms step:340/1480 train_time:48857ms 
step_avg:148.05ms step:341/1480 train_time:49009ms step_avg:148.06ms step:342/1480 train_time:49159ms step_avg:148.07ms step:343/1480 train_time:49310ms step_avg:148.08ms step:344/1480 train_time:49460ms step_avg:148.08ms step:345/1480 train_time:49611ms step_avg:148.09ms step:346/1480 train_time:49761ms step_avg:148.10ms step:347/1480 train_time:49913ms step_avg:148.11ms step:348/1480 train_time:50063ms step_avg:148.12ms step:349/1480 train_time:50215ms step_avg:148.13ms step:350/1480 train_time:50366ms step_avg:148.14ms step:351/1480 train_time:50517ms step_avg:148.14ms step:352/1480 train_time:50669ms step_avg:148.15ms step:353/1480 train_time:50819ms step_avg:148.16ms step:354/1480 train_time:50970ms step_avg:148.17ms step:355/1480 train_time:51120ms step_avg:148.18ms step:356/1480 train_time:51272ms step_avg:148.18ms step:357/1480 train_time:51423ms step_avg:148.19ms step:358/1480 train_time:51574ms step_avg:148.20ms step:359/1480 train_time:51724ms step_avg:148.21ms step:360/1480 train_time:51876ms step_avg:148.22ms step:361/1480 train_time:52026ms step_avg:148.22ms step:362/1480 train_time:52177ms step_avg:148.23ms step:363/1480 train_time:52327ms step_avg:148.24ms step:364/1480 train_time:52478ms step_avg:148.24ms step:365/1480 train_time:52628ms step_avg:148.25ms step:366/1480 train_time:52779ms step_avg:148.26ms step:367/1480 train_time:52930ms step_avg:148.26ms step:368/1480 train_time:53080ms step_avg:148.27ms step:369/1480 train_time:53232ms step_avg:148.28ms step:370/1480 train_time:53383ms step_avg:148.29ms step:371/1480 train_time:53534ms step_avg:148.29ms step:372/1480 train_time:53685ms step_avg:148.30ms step:373/1480 train_time:53836ms step_avg:148.31ms step:374/1480 train_time:53988ms step_avg:148.32ms step:375/1480 train_time:54138ms step_avg:148.32ms step:375/1480 val_loss:3.8123 train_time:54207ms step_avg:148.51ms step:376/1480 train_time:54297ms step_avg:148.35ms step:377/1480 train_time:54448ms step_avg:148.36ms step:378/1480 
train_time:54599ms step_avg:148.37ms step:379/1480 train_time:54762ms step_avg:148.41ms step:380/1480 train_time:54899ms step_avg:148.38ms step:381/1480 train_time:55049ms step_avg:148.38ms step:382/1480 train_time:55199ms step_avg:148.38ms step:383/1480 train_time:55350ms step_avg:148.39ms step:384/1480 train_time:55501ms step_avg:148.40ms step:385/1480 train_time:55652ms step_avg:148.41ms step:386/1480 train_time:55803ms step_avg:148.41ms step:387/1480 train_time:55955ms step_avg:148.42ms step:388/1480 train_time:56105ms step_avg:148.43ms step:389/1480 train_time:56256ms step_avg:148.43ms step:390/1480 train_time:56409ms step_avg:148.44ms step:391/1480 train_time:56559ms step_avg:148.45ms step:392/1480 train_time:56710ms step_avg:148.46ms step:393/1480 train_time:56859ms step_avg:148.46ms step:394/1480 train_time:57011ms step_avg:148.47ms step:395/1480 train_time:57161ms step_avg:148.47ms step:396/1480 train_time:57313ms step_avg:148.48ms step:397/1480 train_time:57463ms step_avg:148.48ms step:398/1480 train_time:57615ms step_avg:148.49ms step:399/1480 train_time:57765ms step_avg:148.50ms step:400/1480 train_time:57917ms step_avg:148.50ms step:401/1480 train_time:58068ms step_avg:148.51ms step:402/1480 train_time:58218ms step_avg:148.52ms step:403/1480 train_time:58370ms step_avg:148.52ms step:404/1480 train_time:58521ms step_avg:148.53ms step:405/1480 train_time:58672ms step_avg:148.54ms step:406/1480 train_time:58822ms step_avg:148.54ms step:407/1480 train_time:58974ms step_avg:148.55ms step:408/1480 train_time:59124ms step_avg:148.55ms step:409/1480 train_time:59275ms step_avg:148.56ms step:410/1480 train_time:59425ms step_avg:148.56ms step:411/1480 train_time:59576ms step_avg:148.57ms step:412/1480 train_time:59726ms step_avg:148.57ms step:413/1480 train_time:59877ms step_avg:148.58ms step:414/1480 train_time:60029ms step_avg:148.59ms step:415/1480 train_time:60178ms step_avg:148.59ms step:416/1480 train_time:60330ms step_avg:148.59ms step:417/1480 
train_time:60479ms step_avg:148.60ms step:418/1480 train_time:60631ms step_avg:148.61ms step:419/1480 train_time:60781ms step_avg:148.61ms step:420/1480 train_time:60932ms step_avg:148.62ms step:421/1480 train_time:61083ms step_avg:148.62ms step:422/1480 train_time:61235ms step_avg:148.63ms step:423/1480 train_time:61386ms step_avg:148.64ms step:424/1480 train_time:61537ms step_avg:148.64ms step:425/1480 train_time:61689ms step_avg:148.65ms step:426/1480 train_time:61839ms step_avg:148.65ms step:427/1480 train_time:61991ms step_avg:148.66ms step:428/1480 train_time:62141ms step_avg:148.66ms step:429/1480 train_time:62293ms step_avg:148.67ms step:430/1480 train_time:62443ms step_avg:148.67ms step:431/1480 train_time:62594ms step_avg:148.68ms step:432/1480 train_time:62744ms step_avg:148.68ms step:433/1480 train_time:62895ms step_avg:148.69ms step:434/1480 train_time:63045ms step_avg:148.69ms step:435/1480 train_time:63195ms step_avg:148.69ms step:436/1480 train_time:63346ms step_avg:148.70ms step:437/1480 train_time:63497ms step_avg:148.70ms step:438/1480 train_time:63647ms step_avg:148.71ms step:439/1480 train_time:63798ms step_avg:148.71ms step:440/1480 train_time:63950ms step_avg:148.72ms step:441/1480 train_time:64104ms step_avg:148.73ms step:442/1480 train_time:64257ms step_avg:148.74ms step:443/1480 train_time:64410ms step_avg:148.75ms step:444/1480 train_time:64562ms step_avg:148.76ms step:445/1480 train_time:64715ms step_avg:148.77ms step:446/1480 train_time:64868ms step_avg:148.78ms step:447/1480 train_time:65021ms step_avg:148.79ms step:448/1480 train_time:65174ms step_avg:148.80ms step:449/1480 train_time:65326ms step_avg:148.81ms step:450/1480 train_time:65479ms step_avg:148.82ms step:451/1480 train_time:65632ms step_avg:148.83ms step:452/1480 train_time:65785ms step_avg:148.84ms step:453/1480 train_time:65938ms step_avg:148.84ms step:454/1480 train_time:66091ms step_avg:148.85ms step:455/1480 train_time:66243ms step_avg:148.86ms step:456/1480 
train_time:66396ms step_avg:148.87ms step:457/1480 train_time:66549ms step_avg:148.88ms step:458/1480 train_time:66701ms step_avg:148.89ms step:459/1480 train_time:66854ms step_avg:148.89ms step:460/1480 train_time:67007ms step_avg:148.91ms step:461/1480 train_time:67160ms step_avg:148.91ms step:462/1480 train_time:67313ms step_avg:148.92ms step:463/1480 train_time:67466ms step_avg:148.93ms step:464/1480 train_time:67619ms step_avg:148.94ms step:465/1480 train_time:67772ms step_avg:148.95ms step:466/1480 train_time:67923ms step_avg:148.95ms step:467/1480 train_time:68076ms step_avg:148.96ms step:468/1480 train_time:68229ms step_avg:148.97ms step:469/1480 train_time:68381ms step_avg:148.98ms step:470/1480 train_time:68534ms step_avg:148.99ms step:471/1480 train_time:68687ms step_avg:149.00ms step:472/1480 train_time:68839ms step_avg:149.00ms step:473/1480 train_time:68992ms step_avg:149.01ms step:474/1480 train_time:69144ms step_avg:149.02ms step:475/1480 train_time:69297ms step_avg:149.02ms step:476/1480 train_time:69450ms step_avg:149.03ms step:477/1480 train_time:69603ms step_avg:149.04ms step:478/1480 train_time:69757ms step_avg:149.05ms step:479/1480 train_time:69910ms step_avg:149.06ms step:480/1480 train_time:70063ms step_avg:149.07ms step:481/1480 train_time:70216ms step_avg:149.08ms step:482/1480 train_time:70368ms step_avg:149.09ms step:483/1480 train_time:70520ms step_avg:149.09ms step:484/1480 train_time:70674ms step_avg:149.10ms step:485/1480 train_time:70827ms step_avg:149.11ms step:486/1480 train_time:70980ms step_avg:149.12ms step:487/1480 train_time:71133ms step_avg:149.13ms step:488/1480 train_time:71286ms step_avg:149.13ms step:489/1480 train_time:71440ms step_avg:149.14ms step:490/1480 train_time:71593ms step_avg:149.15ms step:491/1480 train_time:71745ms step_avg:149.16ms step:492/1480 train_time:71897ms step_avg:149.16ms step:493/1480 train_time:72050ms step_avg:149.17ms step:494/1480 train_time:72202ms step_avg:149.18ms step:495/1480 
train_time:72354ms step_avg:149.18ms step:496/1480 train_time:72509ms step_avg:149.20ms step:497/1480 train_time:72662ms step_avg:149.20ms step:498/1480 train_time:72815ms step_avg:149.21ms step:499/1480 train_time:72967ms step_avg:149.22ms step:500/1480 train_time:73120ms step_avg:149.22ms step:500/1480 val_loss:3.6902 train_time:73189ms step_avg:149.37ms step:501/1480 train_time:73280ms step_avg:149.25ms step:502/1480 train_time:73432ms step_avg:149.25ms step:503/1480 train_time:73584ms step_avg:149.26ms step:504/1480 train_time:73736ms step_avg:149.26ms step:505/1480 train_time:73887ms step_avg:149.27ms step:506/1480 train_time:74039ms step_avg:149.27ms step:507/1480 train_time:74191ms step_avg:149.28ms step:508/1480 train_time:74344ms step_avg:149.29ms step:509/1480 train_time:74498ms step_avg:149.30ms step:510/1480 train_time:74651ms step_avg:149.30ms step:511/1480 train_time:74804ms step_avg:149.31ms step:512/1480 train_time:74956ms step_avg:149.32ms step:513/1480 train_time:75108ms step_avg:149.32ms step:514/1480 train_time:75260ms step_avg:149.33ms step:515/1480 train_time:75413ms step_avg:149.33ms step:516/1480 train_time:75568ms step_avg:149.34ms step:517/1480 train_time:75722ms step_avg:149.35ms step:518/1480 train_time:75874ms step_avg:149.36ms step:519/1480 train_time:76026ms step_avg:149.36ms step:520/1480 train_time:76180ms step_avg:149.37ms step:521/1480 train_time:76332ms step_avg:149.38ms step:522/1480 train_time:76485ms step_avg:149.39ms step:523/1480 train_time:76639ms step_avg:149.39ms step:524/1480 train_time:76791ms step_avg:149.40ms step:525/1480 train_time:76943ms step_avg:149.40ms step:526/1480 train_time:77096ms step_avg:149.41ms step:527/1480 train_time:77249ms step_avg:149.42ms step:528/1480 train_time:77402ms step_avg:149.43ms step:529/1480 train_time:77555ms step_avg:149.43ms step:530/1480 train_time:77708ms step_avg:149.44ms step:531/1480 train_time:77860ms step_avg:149.44ms step:532/1480 train_time:78013ms step_avg:149.45ms 
step:533/1480 train_time:78165ms step_avg:149.46ms step:534/1480 train_time:78318ms step_avg:149.46ms step:535/1480 train_time:78471ms step_avg:149.47ms step:536/1480 train_time:78624ms step_avg:149.47ms step:537/1480 train_time:78778ms step_avg:149.48ms step:538/1480 train_time:78930ms step_avg:149.49ms step:539/1480 train_time:79083ms step_avg:149.50ms step:540/1480 train_time:79236ms step_avg:149.50ms step:541/1480 train_time:79388ms step_avg:149.51ms step:542/1480 train_time:79540ms step_avg:149.51ms step:543/1480 train_time:79693ms step_avg:149.52ms step:544/1480 train_time:79846ms step_avg:149.52ms step:545/1480 train_time:80000ms step_avg:149.53ms step:546/1480 train_time:80153ms step_avg:149.54ms step:547/1480 train_time:80306ms step_avg:149.55ms step:548/1480 train_time:80458ms step_avg:149.55ms step:549/1480 train_time:80611ms step_avg:149.56ms step:550/1480 train_time:80764ms step_avg:149.56ms step:551/1480 train_time:80919ms step_avg:149.57ms step:552/1480 train_time:81074ms step_avg:149.58ms step:553/1480 train_time:81229ms step_avg:149.59ms step:554/1480 train_time:81384ms step_avg:149.60ms step:555/1480 train_time:81538ms step_avg:149.61ms step:556/1480 train_time:81692ms step_avg:149.62ms step:557/1480 train_time:81846ms step_avg:149.63ms step:558/1480 train_time:82002ms step_avg:149.64ms step:559/1480 train_time:82157ms step_avg:149.65ms step:560/1480 train_time:82311ms step_avg:149.66ms step:561/1480 train_time:82465ms step_avg:149.66ms step:562/1480 train_time:82620ms step_avg:149.67ms step:563/1480 train_time:82774ms step_avg:149.68ms step:564/1480 train_time:82929ms step_avg:149.69ms step:565/1480 train_time:83084ms step_avg:149.70ms step:566/1480 train_time:83239ms step_avg:149.71ms step:567/1480 train_time:83393ms step_avg:149.72ms step:568/1480 train_time:83547ms step_avg:149.73ms step:569/1480 train_time:83716ms step_avg:149.76ms step:570/1480 train_time:83857ms step_avg:149.75ms step:571/1480 train_time:84013ms step_avg:149.76ms 
step:572/1480 train_time:84167ms step_avg:149.76ms step:573/1480 train_time:84321ms step_avg:149.77ms step:574/1480 train_time:84478ms step_avg:149.78ms step:575/1480 train_time:84631ms step_avg:149.79ms step:576/1480 train_time:84786ms step_avg:149.80ms step:577/1480 train_time:84940ms step_avg:149.81ms step:578/1480 train_time:85095ms step_avg:149.81ms step:579/1480 train_time:85249ms step_avg:149.82ms step:580/1480 train_time:85404ms step_avg:149.83ms step:581/1480 train_time:85558ms step_avg:149.84ms step:582/1480 train_time:85712ms step_avg:149.85ms step:583/1480 train_time:85865ms step_avg:149.85ms step:584/1480 train_time:86022ms step_avg:149.86ms step:585/1480 train_time:86176ms step_avg:149.87ms step:586/1480 train_time:86331ms step_avg:149.88ms step:587/1480 train_time:86485ms step_avg:149.89ms step:588/1480 train_time:86639ms step_avg:149.89ms step:589/1480 train_time:86794ms step_avg:149.90ms step:590/1480 train_time:86948ms step_avg:149.91ms step:591/1480 train_time:87102ms step_avg:149.92ms step:592/1480 train_time:87258ms step_avg:149.93ms step:593/1480 train_time:87415ms step_avg:149.94ms step:594/1480 train_time:87572ms step_avg:149.95ms step:595/1480 train_time:87727ms step_avg:149.96ms step:596/1480 train_time:87882ms step_avg:149.97ms step:597/1480 train_time:88036ms step_avg:149.98ms step:598/1480 train_time:88191ms step_avg:149.98ms step:599/1480 train_time:88346ms step_avg:149.99ms step:600/1480 train_time:88500ms step_avg:150.00ms step:601/1480 train_time:88656ms step_avg:150.01ms step:602/1480 train_time:88810ms step_avg:150.02ms step:603/1480 train_time:88964ms step_avg:150.02ms step:604/1480 train_time:89120ms step_avg:150.03ms step:605/1480 train_time:89275ms step_avg:150.04ms step:606/1480 train_time:89430ms step_avg:150.05ms step:607/1480 train_time:89586ms step_avg:150.06ms step:608/1480 train_time:89741ms step_avg:150.07ms step:609/1480 train_time:89896ms step_avg:150.08ms step:610/1480 train_time:90050ms step_avg:150.08ms 
step:611/1480 train_time:90206ms step_avg:150.09ms step:612/1480 train_time:90360ms step_avg:150.10ms step:613/1480 train_time:90517ms step_avg:150.11ms step:614/1480 train_time:90673ms step_avg:150.12ms step:615/1480 train_time:90827ms step_avg:150.13ms step:616/1480 train_time:90982ms step_avg:150.13ms step:617/1480 train_time:91136ms step_avg:150.14ms step:618/1480 train_time:91292ms step_avg:150.15ms step:619/1480 train_time:91446ms step_avg:150.16ms step:620/1480 train_time:91600ms step_avg:150.16ms step:621/1480 train_time:91755ms step_avg:150.17ms step:622/1480 train_time:91909ms step_avg:150.18ms step:623/1480 train_time:92065ms step_avg:150.19ms step:624/1480 train_time:92220ms step_avg:150.19ms step:625/1480 train_time:92374ms step_avg:150.20ms step:625/1480 val_loss:3.6094 train_time:92445ms step_avg:150.32ms step:626/1480 train_time:92535ms step_avg:150.22ms step:627/1480 train_time:92689ms step_avg:150.22ms step:628/1480 train_time:92843ms step_avg:150.23ms step:629/1480 train_time:92997ms step_avg:150.24ms step:630/1480 train_time:93152ms step_avg:150.25ms step:631/1480 train_time:93306ms step_avg:150.25ms step:632/1480 train_time:93460ms step_avg:150.26ms step:633/1480 train_time:93618ms step_avg:150.27ms step:634/1480 train_time:93773ms step_avg:150.28ms step:635/1480 train_time:93927ms step_avg:150.28ms step:636/1480 train_time:94082ms step_avg:150.29ms step:637/1480 train_time:94237ms step_avg:150.30ms step:638/1480 train_time:94392ms step_avg:150.31ms step:639/1480 train_time:94545ms step_avg:150.31ms step:640/1480 train_time:94701ms step_avg:150.32ms step:641/1480 train_time:94856ms step_avg:150.33ms step:642/1480 train_time:95009ms step_avg:150.33ms step:643/1480 train_time:95163ms step_avg:150.34ms step:644/1480 train_time:95318ms step_avg:150.34ms step:645/1480 train_time:95473ms step_avg:150.35ms step:646/1480 train_time:95627ms step_avg:150.36ms step:647/1480 train_time:95782ms step_avg:150.36ms step:648/1480 train_time:95939ms 
step_avg:150.37ms step:649/1480 train_time:96094ms step_avg:150.38ms step:650/1480 train_time:96250ms step_avg:150.39ms step:651/1480 train_time:96405ms step_avg:150.40ms step:652/1480 train_time:96559ms step_avg:150.40ms step:653/1480 train_time:96713ms step_avg:150.41ms step:654/1480 train_time:96868ms step_avg:150.42ms step:655/1480 train_time:97023ms step_avg:150.42ms step:656/1480 train_time:97178ms step_avg:150.43ms step:657/1480 train_time:97332ms step_avg:150.44ms step:658/1480 train_time:97487ms step_avg:150.44ms step:659/1480 train_time:97642ms step_avg:150.45ms step:660/1480 train_time:97799ms step_avg:150.46ms step:661/1480 train_time:97956ms step_avg:150.47ms step:662/1480 train_time:98113ms step_avg:150.48ms step:663/1480 train_time:98268ms step_avg:150.49ms step:664/1480 train_time:98425ms step_avg:150.50ms step:665/1480 train_time:98581ms step_avg:150.51ms step:666/1480 train_time:98737ms step_avg:150.51ms step:667/1480 train_time:98893ms step_avg:150.52ms step:668/1480 train_time:99048ms step_avg:150.53ms step:669/1480 train_time:99205ms step_avg:150.54ms step:670/1480 train_time:99360ms step_avg:150.55ms step:671/1480 train_time:99517ms step_avg:150.56ms step:672/1480 train_time:99674ms step_avg:150.57ms step:673/1480 train_time:99830ms step_avg:150.57ms step:674/1480 train_time:99986ms step_avg:150.58ms step:675/1480 train_time:100143ms step_avg:150.59ms step:676/1480 train_time:100301ms step_avg:150.60ms step:677/1480 train_time:100457ms step_avg:150.61ms step:678/1480 train_time:100613ms step_avg:150.62ms step:679/1480 train_time:100768ms step_avg:150.63ms step:680/1480 train_time:100926ms step_avg:150.64ms step:681/1480 train_time:101082ms step_avg:150.64ms step:682/1480 train_time:101239ms step_avg:150.65ms step:683/1480 train_time:101396ms step_avg:150.66ms step:684/1480 train_time:101553ms step_avg:150.67ms step:685/1480 train_time:101708ms step_avg:150.68ms step:686/1480 train_time:101864ms step_avg:150.69ms step:687/1480 
train_time:102021ms step_avg:150.70ms step:688/1480 train_time:102179ms step_avg:150.71ms step:689/1480 train_time:102337ms step_avg:150.72ms step:690/1480 train_time:102494ms step_avg:150.73ms step:691/1480 train_time:102651ms step_avg:150.74ms step:692/1480 train_time:102807ms step_avg:150.74ms step:693/1480 train_time:102963ms step_avg:150.75ms step:694/1480 train_time:103120ms step_avg:150.76ms step:695/1480 train_time:103276ms step_avg:150.77ms step:696/1480 train_time:103431ms step_avg:150.77ms step:697/1480 train_time:103587ms step_avg:150.78ms step:698/1480 train_time:103743ms step_avg:150.79ms step:699/1480 train_time:103900ms step_avg:150.80ms step:700/1480 train_time:104057ms step_avg:150.81ms step:701/1480 train_time:104212ms step_avg:150.81ms step:702/1480 train_time:104367ms step_avg:150.82ms step:703/1480 train_time:104523ms step_avg:150.83ms step:704/1480 train_time:104679ms step_avg:150.83ms step:705/1480 train_time:104837ms step_avg:150.84ms step:706/1480 train_time:104996ms step_avg:150.86ms step:707/1480 train_time:105152ms step_avg:150.86ms step:708/1480 train_time:105308ms step_avg:150.87ms step:709/1480 train_time:105464ms step_avg:150.88ms step:710/1480 train_time:105620ms step_avg:150.89ms step:711/1480 train_time:105777ms step_avg:150.89ms step:712/1480 train_time:105934ms step_avg:150.90ms step:713/1480 train_time:106092ms step_avg:150.91ms step:714/1480 train_time:106249ms step_avg:150.92ms step:715/1480 train_time:106404ms step_avg:150.93ms step:716/1480 train_time:106559ms step_avg:150.93ms step:717/1480 train_time:106716ms step_avg:150.94ms step:718/1480 train_time:106872ms step_avg:150.95ms step:719/1480 train_time:107027ms step_avg:150.95ms step:720/1480 train_time:107184ms step_avg:150.96ms step:721/1480 train_time:107341ms step_avg:150.97ms step:722/1480 train_time:107498ms step_avg:150.98ms step:723/1480 train_time:107653ms step_avg:150.99ms step:724/1480 train_time:107809ms step_avg:150.99ms step:725/1480 train_time:107965ms 
step_avg:151.00ms step:726/1480 train_time:108122ms step_avg:151.01ms step:727/1480 train_time:108279ms step_avg:151.02ms step:728/1480 train_time:108434ms step_avg:151.02ms step:729/1480 train_time:108592ms step_avg:151.03ms step:730/1480 train_time:108749ms step_avg:151.04ms step:731/1480 train_time:108904ms step_avg:151.05ms step:732/1480 train_time:109059ms step_avg:151.05ms step:733/1480 train_time:109218ms step_avg:151.06ms step:734/1480 train_time:109374ms step_avg:151.07ms step:735/1480 train_time:109530ms step_avg:151.08ms step:736/1480 train_time:109686ms step_avg:151.08ms step:737/1480 train_time:109841ms step_avg:151.09ms step:738/1480 train_time:109997ms step_avg:151.09ms step:739/1480 train_time:110152ms step_avg:151.10ms step:740/1480 train_time:110309ms step_avg:151.11ms step:741/1480 train_time:110467ms step_avg:151.12ms step:742/1480 train_time:110623ms step_avg:151.12ms step:743/1480 train_time:110779ms step_avg:151.13ms step:744/1480 train_time:110935ms step_avg:151.14ms step:745/1480 train_time:111091ms step_avg:151.14ms step:746/1480 train_time:111247ms step_avg:151.15ms step:747/1480 train_time:111403ms step_avg:151.16ms step:748/1480 train_time:111561ms step_avg:151.17ms step:749/1480 train_time:111718ms step_avg:151.17ms step:750/1480 train_time:111874ms step_avg:151.18ms step:750/1480 val_loss:3.5525 train_time:111946ms step_avg:151.28ms step:751/1480 train_time:112036ms step_avg:151.20ms step:752/1480 train_time:112192ms step_avg:151.20ms step:753/1480 train_time:112348ms step_avg:151.21ms step:754/1480 train_time:112505ms step_avg:151.22ms step:755/1480 train_time:112661ms step_avg:151.22ms step:756/1480 train_time:112817ms step_avg:151.23ms step:757/1480 train_time:112975ms step_avg:151.24ms step:758/1480 train_time:113131ms step_avg:151.24ms step:759/1480 train_time:113297ms step_avg:151.26ms step:760/1480 train_time:113445ms step_avg:151.26ms step:761/1480 train_time:113602ms step_avg:151.27ms step:762/1480 train_time:113757ms 
step_avg:151.27ms step:763/1480 train_time:113912ms step_avg:151.28ms step:764/1480 train_time:114069ms step_avg:151.29ms step:765/1480 train_time:114226ms step_avg:151.29ms step:766/1480 train_time:114385ms step_avg:151.30ms step:767/1480 train_time:114541ms step_avg:151.31ms step:768/1480 train_time:114698ms step_avg:151.32ms step:769/1480 train_time:114854ms step_avg:151.32ms step:770/1480 train_time:115011ms step_avg:151.33ms step:771/1480 train_time:115168ms step_avg:151.34ms step:772/1480 train_time:115325ms step_avg:151.34ms step:773/1480 train_time:115482ms step_avg:151.35ms step:774/1480 train_time:115640ms step_avg:151.36ms step:775/1480 train_time:115798ms step_avg:151.37ms step:776/1480 train_time:115957ms step_avg:151.38ms step:777/1480 train_time:116118ms step_avg:151.39ms step:778/1480 train_time:116278ms step_avg:151.40ms step:779/1480 train_time:116435ms step_avg:151.41ms step:780/1480 train_time:116592ms step_avg:151.42ms step:781/1480 train_time:116748ms step_avg:151.42ms step:782/1480 train_time:116907ms step_avg:151.43ms step:783/1480 train_time:117064ms step_avg:151.44ms step:784/1480 train_time:117222ms step_avg:151.45ms step:785/1480 train_time:117381ms step_avg:151.46ms step:786/1480 train_time:117539ms step_avg:151.47ms step:787/1480 train_time:117697ms step_avg:151.48ms step:788/1480 train_time:117854ms step_avg:151.48ms step:789/1480 train_time:118010ms step_avg:151.49ms step:790/1480 train_time:118169ms step_avg:151.50ms step:791/1480 train_time:118329ms step_avg:151.51ms step:792/1480 train_time:118488ms step_avg:151.52ms step:793/1480 train_time:118644ms step_avg:151.52ms step:794/1480 train_time:118803ms step_avg:151.53ms step:795/1480 train_time:118964ms step_avg:151.55ms step:796/1480 train_time:119125ms step_avg:151.56ms step:797/1480 train_time:119284ms step_avg:151.57ms step:798/1480 train_time:119443ms step_avg:151.58ms step:799/1480 train_time:119605ms step_avg:151.59ms step:800/1480 train_time:119763ms step_avg:151.60ms 
step:801/1480 train_time:119920ms step_avg:151.61ms step:802/1480 train_time:120080ms step_avg:151.62ms step:803/1480 train_time:120239ms step_avg:151.62ms step:804/1480 train_time:120396ms step_avg:151.63ms step:805/1480 train_time:120555ms step_avg:151.64ms step:806/1480 train_time:120711ms step_avg:151.65ms step:807/1480 train_time:120867ms step_avg:151.65ms step:808/1480 train_time:121025ms step_avg:151.66ms step:809/1480 train_time:121182ms step_avg:151.67ms step:810/1480 train_time:121341ms step_avg:151.68ms step:811/1480 train_time:121500ms step_avg:151.69ms step:812/1480 train_time:121659ms step_avg:151.69ms step:813/1480 train_time:121815ms step_avg:151.70ms step:814/1480 train_time:121972ms step_avg:151.71ms step:815/1480 train_time:122128ms step_avg:151.71ms step:816/1480 train_time:122287ms step_avg:151.72ms step:817/1480 train_time:122443ms step_avg:151.73ms step:818/1480 train_time:122602ms step_avg:151.73ms step:819/1480 train_time:122760ms step_avg:151.74ms step:820/1480 train_time:122918ms step_avg:151.75ms step:821/1480 train_time:123075ms step_avg:151.76ms step:822/1480 train_time:123232ms step_avg:151.76ms step:823/1480 train_time:123389ms step_avg:151.77ms step:824/1480 train_time:123546ms step_avg:151.78ms step:825/1480 train_time:123706ms step_avg:151.79ms step:826/1480 train_time:123866ms step_avg:151.80ms step:827/1480 train_time:124025ms step_avg:151.80ms step:828/1480 train_time:124182ms step_avg:151.81ms step:829/1480 train_time:124341ms step_avg:151.82ms step:830/1480 train_time:124502ms step_avg:151.83ms step:831/1480 train_time:124661ms step_avg:151.84ms step:832/1480 train_time:124820ms step_avg:151.85ms step:833/1480 train_time:124978ms step_avg:151.86ms step:834/1480 train_time:125139ms step_avg:151.87ms step:835/1480 train_time:125295ms step_avg:151.87ms step:836/1480 train_time:125453ms step_avg:151.88ms step:837/1480 train_time:125610ms step_avg:151.89ms step:838/1480 train_time:125769ms step_avg:151.90ms step:839/1480 
train_time:125926ms step_avg:151.90ms step:840/1480 train_time:126085ms step_avg:151.91ms step:841/1480 train_time:126244ms step_avg:151.92ms step:842/1480 train_time:126404ms step_avg:151.93ms step:843/1480 train_time:126562ms step_avg:151.94ms step:844/1480 train_time:126719ms step_avg:151.94ms step:845/1480 train_time:126876ms step_avg:151.95ms step:846/1480 train_time:127035ms step_avg:151.96ms step:847/1480 train_time:127193ms step_avg:151.96ms step:848/1480 train_time:127349ms step_avg:151.97ms step:849/1480 train_time:127507ms step_avg:151.98ms step:850/1480 train_time:127667ms step_avg:151.98ms step:851/1480 train_time:127825ms step_avg:151.99ms step:852/1480 train_time:127982ms step_avg:152.00ms step:853/1480 train_time:128142ms step_avg:152.01ms step:854/1480 train_time:128300ms step_avg:152.01ms step:855/1480 train_time:128458ms step_avg:152.02ms step:856/1480 train_time:128614ms step_avg:152.03ms step:857/1480 train_time:128772ms step_avg:152.03ms step:858/1480 train_time:128931ms step_avg:152.04ms step:859/1480 train_time:129089ms step_avg:152.05ms step:860/1480 train_time:129246ms step_avg:152.05ms step:861/1480 train_time:129405ms step_avg:152.06ms step:862/1480 train_time:129568ms step_avg:152.08ms step:863/1480 train_time:129726ms step_avg:152.08ms step:864/1480 train_time:129886ms step_avg:152.09ms step:865/1480 train_time:130043ms step_avg:152.10ms step:866/1480 train_time:130203ms step_avg:152.11ms step:867/1480 train_time:130363ms step_avg:152.12ms step:868/1480 train_time:130519ms step_avg:152.12ms step:869/1480 train_time:130678ms step_avg:152.13ms step:870/1480 train_time:130835ms step_avg:152.13ms step:871/1480 train_time:130991ms step_avg:152.14ms step:872/1480 train_time:131148ms step_avg:152.14ms step:873/1480 train_time:131308ms step_avg:152.15ms step:874/1480 train_time:131467ms step_avg:152.16ms step:875/1480 train_time:131626ms step_avg:152.17ms step:875/1480 val_loss:3.5068 train_time:131699ms step_avg:152.25ms step:876/1480 
train_time:131790ms step_avg:152.18ms step:877/1480 train_time:131946ms step_avg:152.19ms step:878/1480 train_time:132103ms step_avg:152.19ms step:879/1480 train_time:132261ms step_avg:152.20ms step:880/1480 train_time:132419ms step_avg:152.21ms step:881/1480 train_time:132577ms step_avg:152.21ms step:882/1480 train_time:132737ms step_avg:152.22ms step:883/1480 train_time:132898ms step_avg:152.23ms step:884/1480 train_time:133059ms step_avg:152.24ms step:885/1480 train_time:133220ms step_avg:152.25ms step:886/1480 train_time:133381ms step_avg:152.26ms step:887/1480 train_time:133541ms step_avg:152.27ms step:888/1480 train_time:133705ms step_avg:152.28ms step:889/1480 train_time:133866ms step_avg:152.29ms step:890/1480 train_time:134023ms step_avg:152.30ms step:891/1480 train_time:134183ms step_avg:152.31ms step:892/1480 train_time:134343ms step_avg:152.32ms step:893/1480 train_time:134500ms step_avg:152.32ms step:894/1480 train_time:134661ms step_avg:152.33ms step:895/1480 train_time:134823ms step_avg:152.34ms step:896/1480 train_time:134982ms step_avg:152.35ms step:897/1480 train_time:135143ms step_avg:152.36ms step:898/1480 train_time:135302ms step_avg:152.37ms step:899/1480 train_time:135462ms step_avg:152.38ms step:900/1480 train_time:135620ms step_avg:152.38ms step:901/1480 train_time:135782ms step_avg:152.39ms step:902/1480 train_time:135941ms step_avg:152.40ms step:903/1480 train_time:136101ms step_avg:152.41ms step:904/1480 train_time:136261ms step_avg:152.42ms step:905/1480 train_time:136419ms step_avg:152.42ms step:906/1480 train_time:136581ms step_avg:152.43ms step:907/1480 train_time:136743ms step_avg:152.44ms step:908/1480 train_time:136901ms step_avg:152.45ms step:909/1480 train_time:137062ms step_avg:152.46ms step:910/1480 train_time:137226ms step_avg:152.47ms step:911/1480 train_time:137386ms step_avg:152.48ms step:912/1480 train_time:137546ms step_avg:152.49ms step:913/1480 train_time:137707ms step_avg:152.50ms step:914/1480 train_time:137867ms 
step_avg:152.51ms step:915/1480 train_time:138027ms step_avg:152.52ms step:916/1480 train_time:138187ms step_avg:152.52ms step:917/1480 train_time:138345ms step_avg:152.53ms step:918/1480 train_time:138507ms step_avg:152.54ms step:919/1480 train_time:138668ms step_avg:152.55ms step:920/1480 train_time:138826ms step_avg:152.56ms step:921/1480 train_time:138986ms step_avg:152.56ms step:922/1480 train_time:139147ms step_avg:152.57ms step:923/1480 train_time:139305ms step_avg:152.58ms step:924/1480 train_time:139464ms step_avg:152.59ms step:925/1480 train_time:139624ms step_avg:152.59ms step:926/1480 train_time:139784ms step_avg:152.60ms step:927/1480 train_time:139943ms step_avg:152.61ms step:928/1480 train_time:140103ms step_avg:152.62ms step:929/1480 train_time:140262ms step_avg:152.62ms step:930/1480 train_time:140421ms step_avg:152.63ms step:931/1480 train_time:140581ms step_avg:152.64ms step:932/1480 train_time:140740ms step_avg:152.65ms step:933/1480 train_time:140900ms step_avg:152.65ms step:934/1480 train_time:141060ms step_avg:152.66ms step:935/1480 train_time:141219ms step_avg:152.67ms step:936/1480 train_time:141380ms step_avg:152.68ms step:937/1480 train_time:141541ms step_avg:152.69ms step:938/1480 train_time:141700ms step_avg:152.69ms step:939/1480 train_time:141862ms step_avg:152.70ms step:940/1480 train_time:142023ms step_avg:152.71ms step:941/1480 train_time:142182ms step_avg:152.72ms step:942/1480 train_time:142340ms step_avg:152.73ms step:943/1480 train_time:142502ms step_avg:152.73ms step:944/1480 train_time:142665ms step_avg:152.75ms step:945/1480 train_time:142824ms step_avg:152.75ms step:946/1480 train_time:142987ms step_avg:152.76ms step:947/1480 train_time:143147ms step_avg:152.77ms step:948/1480 train_time:143307ms step_avg:152.78ms step:949/1480 train_time:143476ms step_avg:152.80ms step:950/1480 train_time:143625ms step_avg:152.79ms step:951/1480 train_time:143787ms step_avg:152.80ms step:952/1480 train_time:143945ms step_avg:152.81ms 
step:953/1480 train_time:144106ms step_avg:152.82ms step:954/1480 train_time:144269ms step_avg:152.83ms step:955/1480 train_time:144425ms step_avg:152.83ms step:956/1480 train_time:144584ms step_avg:152.84ms step:957/1480 train_time:144745ms step_avg:152.85ms step:958/1480 train_time:144908ms step_avg:152.86ms step:959/1480 train_time:145066ms step_avg:152.86ms step:960/1480 train_time:145225ms step_avg:152.87ms step:961/1480 train_time:145385ms step_avg:152.88ms step:962/1480 train_time:145544ms step_avg:152.88ms step:963/1480 train_time:145704ms step_avg:152.89ms step:964/1480 train_time:145866ms step_avg:152.90ms step:965/1480 train_time:146024ms step_avg:152.90ms step:966/1480 train_time:146184ms step_avg:152.91ms step:967/1480 train_time:146343ms step_avg:152.92ms step:968/1480 train_time:146502ms step_avg:152.92ms step:969/1480 train_time:146662ms step_avg:152.93ms step:970/1480 train_time:146820ms step_avg:152.94ms step:971/1480 train_time:146981ms step_avg:152.95ms step:972/1480 train_time:147141ms step_avg:152.95ms step:973/1480 train_time:147299ms step_avg:152.96ms step:974/1480 train_time:147460ms step_avg:152.97ms step:975/1480 train_time:147621ms step_avg:152.98ms step:976/1480 train_time:147782ms step_avg:152.98ms step:977/1480 train_time:147941ms step_avg:152.99ms step:978/1480 train_time:148100ms step_avg:153.00ms step:979/1480 train_time:148261ms step_avg:153.00ms step:980/1480 train_time:148421ms step_avg:153.01ms step:981/1480 train_time:148583ms step_avg:153.02ms step:982/1480 train_time:148742ms step_avg:153.03ms step:983/1480 train_time:148901ms step_avg:153.03ms step:984/1480 train_time:149060ms step_avg:153.04ms step:985/1480 train_time:149222ms step_avg:153.05ms step:986/1480 train_time:149383ms step_avg:153.06ms step:987/1480 train_time:149542ms step_avg:153.06ms step:988/1480 train_time:149701ms step_avg:153.07ms step:989/1480 train_time:149861ms step_avg:153.08ms step:990/1480 train_time:150023ms step_avg:153.08ms step:991/1480 
train_time:150183ms step_avg:153.09ms step:992/1480 train_time:150348ms step_avg:153.10ms step:993/1480 train_time:150515ms step_avg:153.12ms step:994/1480 train_time:150675ms step_avg:153.13ms step:995/1480 train_time:150834ms step_avg:153.13ms step:996/1480 train_time:150992ms step_avg:153.14ms step:997/1480 train_time:151151ms step_avg:153.14ms step:998/1480 train_time:151309ms step_avg:153.15ms step:999/1480 train_time:151470ms step_avg:153.15ms step:1000/1480 train_time:151629ms step_avg:153.16ms step:1000/1480 val_loss:3.4431 train_time:151702ms step_avg:153.23ms step:1001/1480 train_time:151792ms step_avg:153.17ms step:1002/1480 train_time:151953ms step_avg:153.18ms step:1003/1480 train_time:152116ms step_avg:153.19ms step:1004/1480 train_time:152278ms step_avg:153.20ms step:1005/1480 train_time:152438ms step_avg:153.20ms step:1006/1480 train_time:152598ms step_avg:153.21ms step:1007/1480 train_time:152757ms step_avg:153.22ms step:1008/1480 train_time:152917ms step_avg:153.22ms step:1009/1480 train_time:153083ms step_avg:153.24ms step:1010/1480 train_time:153244ms step_avg:153.24ms step:1011/1480 train_time:153403ms step_avg:153.25ms step:1012/1480 train_time:153561ms step_avg:153.25ms step:1013/1480 train_time:153723ms step_avg:153.26ms step:1014/1480 train_time:153882ms step_avg:153.27ms step:1015/1480 train_time:154046ms step_avg:153.28ms step:1016/1480 train_time:154206ms step_avg:153.29ms step:1017/1480 train_time:154368ms step_avg:153.29ms step:1018/1480 train_time:154528ms step_avg:153.30ms step:1019/1480 train_time:154690ms step_avg:153.31ms step:1020/1480 train_time:154851ms step_avg:153.32ms step:1021/1480 train_time:155012ms step_avg:153.33ms step:1022/1480 train_time:155171ms step_avg:153.33ms step:1023/1480 train_time:155333ms step_avg:153.34ms step:1024/1480 train_time:155491ms step_avg:153.34ms step:1025/1480 train_time:155653ms step_avg:153.35ms step:1026/1480 train_time:155812ms step_avg:153.36ms step:1027/1480 train_time:155971ms 
step_avg:153.36ms step:1028/1480 train_time:156132ms step_avg:153.37ms step:1029/1480 train_time:156296ms step_avg:153.38ms step:1030/1480 train_time:156456ms step_avg:153.39ms step:1031/1480 train_time:156614ms step_avg:153.39ms step:1032/1480 train_time:156776ms step_avg:153.40ms step:1033/1480 train_time:156937ms step_avg:153.41ms step:1034/1480 train_time:157097ms step_avg:153.42ms step:1035/1480 train_time:157257ms step_avg:153.42ms step:1036/1480 train_time:157415ms step_avg:153.43ms step:1037/1480 train_time:157576ms step_avg:153.43ms step:1038/1480 train_time:157735ms step_avg:153.44ms step:1039/1480 train_time:157896ms step_avg:153.45ms step:1040/1480 train_time:158056ms step_avg:153.45ms step:1041/1480 train_time:158217ms step_avg:153.46ms step:1042/1480 train_time:158375ms step_avg:153.46ms step:1043/1480 train_time:158535ms step_avg:153.47ms step:1044/1480 train_time:158693ms step_avg:153.47ms step:1045/1480 train_time:158853ms step_avg:153.48ms step:1046/1480 train_time:159013ms step_avg:153.49ms step:1047/1480 train_time:159173ms step_avg:153.49ms step:1048/1480 train_time:159333ms step_avg:153.50ms step:1049/1480 train_time:159494ms step_avg:153.51ms step:1050/1480 train_time:159655ms step_avg:153.51ms step:1051/1480 train_time:159816ms step_avg:153.52ms step:1052/1480 train_time:159976ms step_avg:153.53ms step:1053/1480 train_time:160136ms step_avg:153.53ms step:1054/1480 train_time:160296ms step_avg:153.54ms step:1055/1480 train_time:160455ms step_avg:153.55ms step:1056/1480 train_time:160614ms step_avg:153.55ms step:1057/1480 train_time:160774ms step_avg:153.56ms step:1058/1480 train_time:160935ms step_avg:153.56ms step:1059/1480 train_time:161098ms step_avg:153.57ms step:1060/1480 train_time:161260ms step_avg:153.58ms step:1061/1480 train_time:161417ms step_avg:153.58ms step:1062/1480 train_time:161576ms step_avg:153.59ms step:1063/1480 train_time:161735ms step_avg:153.59ms step:1064/1480 train_time:161892ms step_avg:153.60ms step:1065/1480 
train_time:162054ms step_avg:153.61ms step:1066/1480 train_time:162215ms step_avg:153.61ms step:1067/1480 train_time:162374ms step_avg:153.62ms step:1068/1480 train_time:162533ms step_avg:153.62ms step:1069/1480 train_time:162696ms step_avg:153.63ms step:1070/1480 train_time:162855ms step_avg:153.64ms step:1071/1480 train_time:163016ms step_avg:153.64ms step:1072/1480 train_time:163176ms step_avg:153.65ms step:1073/1480 train_time:163333ms step_avg:153.65ms step:1074/1480 train_time:163492ms step_avg:153.66ms step:1075/1480 train_time:163654ms step_avg:153.67ms step:1076/1480 train_time:163814ms step_avg:153.67ms step:1077/1480 train_time:163974ms step_avg:153.68ms step:1078/1480 train_time:164138ms step_avg:153.69ms step:1079/1480 train_time:164303ms step_avg:153.70ms step:1080/1480 train_time:164463ms step_avg:153.70ms step:1081/1480 train_time:164623ms step_avg:153.71ms step:1082/1480 train_time:164784ms step_avg:153.72ms step:1083/1480 train_time:164945ms step_avg:153.72ms step:1084/1480 train_time:165106ms step_avg:153.73ms step:1085/1480 train_time:165267ms step_avg:153.74ms step:1086/1480 train_time:165428ms step_avg:153.74ms step:1087/1480 train_time:165589ms step_avg:153.75ms step:1088/1480 train_time:165751ms step_avg:153.76ms step:1089/1480 train_time:165914ms step_avg:153.77ms step:1090/1480 train_time:166078ms step_avg:153.78ms step:1091/1480 train_time:166239ms step_avg:153.78ms step:1092/1480 train_time:166398ms step_avg:153.79ms step:1093/1480 train_time:166559ms step_avg:153.79ms step:1094/1480 train_time:166718ms step_avg:153.80ms step:1095/1480 train_time:166877ms step_avg:153.80ms step:1096/1480 train_time:167040ms step_avg:153.81ms step:1097/1480 train_time:167203ms step_avg:153.82ms step:1098/1480 train_time:167367ms step_avg:153.83ms step:1099/1480 train_time:167528ms step_avg:153.84ms step:1100/1480 train_time:167691ms step_avg:153.84ms step:1101/1480 train_time:167855ms step_avg:153.85ms step:1102/1480 train_time:168017ms step_avg:153.86ms 
step:1103/1480 train_time:168183ms step_avg:153.87ms step:1104/1480 train_time:168346ms step_avg:153.88ms step:1105/1480 train_time:168508ms step_avg:153.89ms step:1106/1480 train_time:168670ms step_avg:153.90ms step:1107/1480 train_time:168830ms step_avg:153.90ms step:1108/1480 train_time:168990ms step_avg:153.91ms step:1109/1480 train_time:169152ms step_avg:153.91ms step:1110/1480 train_time:169313ms step_avg:153.92ms step:1111/1480 train_time:169474ms step_avg:153.93ms step:1112/1480 train_time:169635ms step_avg:153.93ms step:1113/1480 train_time:169803ms step_avg:153.95ms step:1114/1480 train_time:169967ms step_avg:153.96ms step:1115/1480 train_time:170129ms step_avg:153.96ms step:1116/1480 train_time:170289ms step_avg:153.97ms step:1117/1480 train_time:170453ms step_avg:153.98ms step:1118/1480 train_time:170617ms step_avg:153.99ms step:1119/1480 train_time:170777ms step_avg:153.99ms step:1120/1480 train_time:170937ms step_avg:154.00ms step:1121/1480 train_time:171098ms step_avg:154.00ms step:1122/1480 train_time:171257ms step_avg:154.01ms step:1123/1480 train_time:171416ms step_avg:154.01ms step:1124/1480 train_time:171579ms step_avg:154.02ms step:1125/1480 train_time:171739ms step_avg:154.03ms step:1125/1480 val_loss:3.3873 train_time:171815ms step_avg:154.09ms step:1126/1480 train_time:171911ms step_avg:154.04ms step:1127/1480 train_time:172066ms step_avg:154.04ms step:1128/1480 train_time:172226ms step_avg:154.05ms step:1129/1480 train_time:172390ms step_avg:154.06ms step:1130/1480 train_time:172551ms step_avg:154.06ms step:1131/1480 train_time:172719ms step_avg:154.08ms step:1132/1480 train_time:172878ms step_avg:154.08ms step:1133/1480 train_time:173041ms step_avg:154.09ms step:1134/1480 train_time:173205ms step_avg:154.10ms step:1135/1480 train_time:173367ms step_avg:154.10ms step:1136/1480 train_time:173528ms step_avg:154.11ms step:1137/1480 train_time:173690ms step_avg:154.12ms step:1138/1480 train_time:173852ms step_avg:154.12ms step:1139/1480 
train_time:174022ms step_avg:154.14ms step:1140/1480 train_time:174173ms step_avg:154.14ms step:1141/1480 train_time:174337ms step_avg:154.14ms step:1142/1480 train_time:174498ms step_avg:154.15ms step:1143/1480 train_time:174664ms step_avg:154.16ms step:1144/1480 train_time:174826ms step_avg:154.17ms step:1145/1480 train_time:174986ms step_avg:154.17ms step:1146/1480 train_time:175150ms step_avg:154.18ms step:1147/1480 train_time:175312ms step_avg:154.19ms step:1148/1480 train_time:175472ms step_avg:154.19ms step:1149/1480 train_time:175634ms step_avg:154.20ms step:1150/1480 train_time:175793ms step_avg:154.20ms step:1151/1480 train_time:175956ms step_avg:154.21ms step:1152/1480 train_time:176120ms step_avg:154.22ms step:1153/1480 train_time:176286ms step_avg:154.23ms step:1154/1480 train_time:176447ms step_avg:154.24ms step:1155/1480 train_time:176609ms step_avg:154.24ms step:1156/1480 train_time:176774ms step_avg:154.25ms step:1157/1480 train_time:176936ms step_avg:154.26ms step:1158/1480 train_time:177096ms step_avg:154.26ms step:1159/1480 train_time:177257ms step_avg:154.27ms step:1160/1480 train_time:177416ms step_avg:154.28ms step:1161/1480 train_time:177580ms step_avg:154.28ms step:1162/1480 train_time:177744ms step_avg:154.29ms step:1163/1480 train_time:177908ms step_avg:154.30ms step:1164/1480 train_time:178070ms step_avg:154.31ms step:1165/1480 train_time:178229ms step_avg:154.31ms step:1166/1480 train_time:178392ms step_avg:154.32ms step:1167/1480 train_time:178551ms step_avg:154.32ms step:1168/1480 train_time:178712ms step_avg:154.33ms step:1169/1480 train_time:178873ms step_avg:154.33ms step:1170/1480 train_time:179035ms step_avg:154.34ms step:1171/1480 train_time:179195ms step_avg:154.35ms step:1172/1480 train_time:179356ms step_avg:154.35ms step:1173/1480 train_time:179518ms step_avg:154.36ms step:1174/1480 train_time:179691ms step_avg:154.37ms step:1175/1480 train_time:179852ms step_avg:154.38ms step:1176/1480 train_time:180014ms step_avg:154.39ms 
step:1177/1480 train_time:180181ms step_avg:154.40ms step:1178/1480 train_time:180343ms step_avg:154.40ms step:1179/1480 train_time:180503ms step_avg:154.41ms step:1180/1480 train_time:180671ms step_avg:154.42ms step:1181/1480 train_time:180833ms step_avg:154.43ms step:1182/1480 train_time:180994ms step_avg:154.43ms step:1183/1480 train_time:181155ms step_avg:154.44ms step:1184/1480 train_time:181317ms step_avg:154.44ms step:1185/1480 train_time:181481ms step_avg:154.45ms step:1186/1480 train_time:181645ms step_avg:154.46ms step:1187/1480 train_time:181816ms step_avg:154.47ms step:1188/1480 train_time:181976ms step_avg:154.48ms step:1189/1480 train_time:182137ms step_avg:154.48ms step:1190/1480 train_time:182299ms step_avg:154.49ms step:1191/1480 train_time:182461ms step_avg:154.50ms step:1192/1480 train_time:182624ms step_avg:154.50ms step:1193/1480 train_time:182784ms step_avg:154.51ms step:1194/1480 train_time:182945ms step_avg:154.51ms step:1195/1480 train_time:183107ms step_avg:154.52ms step:1196/1480 train_time:183279ms step_avg:154.54ms step:1197/1480 train_time:183441ms step_avg:154.54ms step:1198/1480 train_time:183609ms step_avg:154.55ms step:1199/1480 train_time:183772ms step_avg:154.56ms step:1200/1480 train_time:183933ms step_avg:154.57ms step:1201/1480 train_time:184093ms step_avg:154.57ms step:1202/1480 train_time:184263ms step_avg:154.58ms step:1203/1480 train_time:184429ms step_avg:154.59ms step:1204/1480 train_time:184593ms step_avg:154.60ms step:1205/1480 train_time:184754ms step_avg:154.61ms step:1206/1480 train_time:184914ms step_avg:154.61ms step:1207/1480 train_time:185075ms step_avg:154.62ms step:1208/1480 train_time:185235ms step_avg:154.62ms step:1209/1480 train_time:185399ms step_avg:154.63ms step:1210/1480 train_time:185566ms step_avg:154.64ms step:1211/1480 train_time:185728ms step_avg:154.64ms step:1212/1480 train_time:185890ms step_avg:154.65ms step:1213/1480 train_time:186054ms step_avg:154.66ms step:1214/1480 train_time:186219ms 
step_avg:154.67ms step:1215/1480 train_time:186384ms step_avg:154.68ms step:1216/1480 train_time:186546ms step_avg:154.68ms step:1217/1480 train_time:186709ms step_avg:154.69ms step:1218/1480 train_time:186871ms step_avg:154.69ms step:1219/1480 train_time:187039ms step_avg:154.71ms step:1220/1480 train_time:187201ms step_avg:154.71ms step:1221/1480 train_time:187363ms step_avg:154.72ms step:1222/1480 train_time:187523ms step_avg:154.72ms step:1223/1480 train_time:187687ms step_avg:154.73ms step:1224/1480 train_time:187852ms step_avg:154.74ms step:1225/1480 train_time:188015ms step_avg:154.75ms step:1226/1480 train_time:188181ms step_avg:154.75ms step:1227/1480 train_time:188346ms step_avg:154.76ms step:1228/1480 train_time:188509ms step_avg:154.77ms step:1229/1480 train_time:188672ms step_avg:154.78ms step:1230/1480 train_time:188841ms step_avg:154.79ms step:1231/1480 train_time:189007ms step_avg:154.80ms step:1232/1480 train_time:189172ms step_avg:154.81ms step:1233/1480 train_time:189332ms step_avg:154.81ms step:1234/1480 train_time:189493ms step_avg:154.81ms step:1235/1480 train_time:189657ms step_avg:154.82ms step:1236/1480 train_time:189818ms step_avg:154.83ms step:1237/1480 train_time:189979ms step_avg:154.83ms step:1238/1480 train_time:190153ms step_avg:154.85ms step:1239/1480 train_time:190314ms step_avg:154.85ms step:1240/1480 train_time:190478ms step_avg:154.86ms step:1241/1480 train_time:190643ms step_avg:154.87ms step:1242/1480 train_time:190806ms step_avg:154.88ms step:1243/1480 train_time:190970ms step_avg:154.88ms step:1244/1480 train_time:191130ms step_avg:154.89ms step:1245/1480 train_time:191292ms step_avg:154.89ms step:1246/1480 train_time:191454ms step_avg:154.90ms step:1247/1480 train_time:191616ms step_avg:154.90ms step:1248/1480 train_time:191777ms step_avg:154.91ms step:1249/1480 train_time:191938ms step_avg:154.91ms step:1250/1480 train_time:192100ms step_avg:154.92ms step:1250/1480 val_loss:3.3378 train_time:192175ms step_avg:154.98ms 
step:1251/1480 train_time:192267ms step_avg:154.93ms step:1252/1480 train_time:192429ms step_avg:154.93ms step:1253/1480 train_time:192590ms step_avg:154.94ms step:1254/1480 train_time:192751ms step_avg:154.94ms step:1255/1480 train_time:192924ms step_avg:154.96ms step:1256/1480 train_time:193088ms step_avg:154.97ms step:1257/1480 train_time:193251ms step_avg:154.97ms step:1258/1480 train_time:193416ms step_avg:154.98ms step:1259/1480 train_time:193581ms step_avg:154.99ms step:1260/1480 train_time:193741ms step_avg:154.99ms step:1261/1480 train_time:193904ms step_avg:155.00ms step:1262/1480 train_time:194068ms step_avg:155.01ms step:1263/1480 train_time:194237ms step_avg:155.02ms step:1264/1480 train_time:194397ms step_avg:155.02ms step:1265/1480 train_time:194557ms step_avg:155.03ms step:1266/1480 train_time:194721ms step_avg:155.03ms step:1267/1480 train_time:194882ms step_avg:155.04ms step:1268/1480 train_time:195044ms step_avg:155.04ms step:1269/1480 train_time:195209ms step_avg:155.05ms step:1270/1480 train_time:195371ms step_avg:155.06ms step:1271/1480 train_time:195533ms step_avg:155.06ms step:1272/1480 train_time:195694ms step_avg:155.07ms step:1273/1480 train_time:195857ms step_avg:155.07ms step:1274/1480 train_time:196022ms step_avg:155.08ms step:1275/1480 train_time:196182ms step_avg:155.08ms step:1276/1480 train_time:196341ms step_avg:155.09ms step:1277/1480 train_time:196504ms step_avg:155.09ms step:1278/1480 train_time:196666ms step_avg:155.10ms step:1279/1480 train_time:196827ms step_avg:155.10ms step:1280/1480 train_time:196995ms step_avg:155.11ms step:1281/1480 train_time:197157ms step_avg:155.12ms step:1282/1480 train_time:197318ms step_avg:155.12ms step:1283/1480 train_time:197481ms step_avg:155.13ms step:1284/1480 train_time:197645ms step_avg:155.14ms step:1285/1480 train_time:197806ms step_avg:155.14ms step:1286/1480 train_time:197968ms step_avg:155.15ms step:1287/1480 train_time:198132ms step_avg:155.15ms step:1288/1480 train_time:198295ms 
step_avg:155.16ms step:1289/1480 train_time:198463ms step_avg:155.17ms step:1290/1480 train_time:198630ms step_avg:155.18ms step:1291/1480 train_time:198794ms step_avg:155.19ms step:1292/1480 train_time:198960ms step_avg:155.20ms step:1293/1480 train_time:199126ms step_avg:155.20ms step:1294/1480 train_time:199288ms step_avg:155.21ms step:1295/1480 train_time:199450ms step_avg:155.21ms step:1296/1480 train_time:199614ms step_avg:155.22ms step:1297/1480 train_time:199778ms step_avg:155.23ms step:1298/1480 train_time:199941ms step_avg:155.23ms step:1299/1480 train_time:200103ms step_avg:155.24ms step:1300/1480 train_time:200264ms step_avg:155.24ms step:1301/1480 train_time:200424ms step_avg:155.25ms step:1302/1480 train_time:200590ms step_avg:155.26ms step:1303/1480 train_time:200756ms step_avg:155.26ms step:1304/1480 train_time:200922ms step_avg:155.27ms step:1305/1480 train_time:201083ms step_avg:155.28ms step:1306/1480 train_time:201247ms step_avg:155.28ms step:1307/1480 train_time:201408ms step_avg:155.29ms step:1308/1480 train_time:201570ms step_avg:155.29ms step:1309/1480 train_time:201736ms step_avg:155.30ms step:1310/1480 train_time:201899ms step_avg:155.31ms step:1311/1480 train_time:202061ms step_avg:155.31ms step:1312/1480 train_time:202227ms step_avg:155.32ms step:1313/1480 train_time:202388ms step_avg:155.32ms step:1314/1480 train_time:202552ms step_avg:155.33ms step:1315/1480 train_time:202717ms step_avg:155.34ms step:1316/1480 train_time:202877ms step_avg:155.34ms step:1317/1480 train_time:203039ms step_avg:155.35ms step:1318/1480 train_time:203206ms step_avg:155.36ms step:1319/1480 train_time:203373ms step_avg:155.36ms step:1320/1480 train_time:203540ms step_avg:155.37ms step:1321/1480 train_time:203704ms step_avg:155.38ms step:1322/1480 train_time:203874ms step_avg:155.39ms step:1323/1480 train_time:204039ms step_avg:155.40ms step:1324/1480 train_time:204202ms step_avg:155.41ms step:1325/1480 train_time:204372ms step_avg:155.42ms step:1326/1480 
train_time:204539ms step_avg:155.42ms step:1327/1480 train_time:204702ms step_avg:155.43ms step:1328/1480 train_time:204865ms step_avg:155.44ms step:1329/1480 train_time:205048ms step_avg:155.46ms step:1330/1480 train_time:205213ms step_avg:155.46ms step:1331/1480 train_time:205377ms step_avg:155.47ms step:1332/1480 train_time:205541ms step_avg:155.48ms step:1333/1480 train_time:205705ms step_avg:155.48ms step:1334/1480 train_time:205869ms step_avg:155.49ms step:1335/1480 train_time:206030ms step_avg:155.49ms step:1336/1480 train_time:206199ms step_avg:155.50ms step:1337/1480 train_time:206366ms step_avg:155.51ms step:1338/1480 train_time:206530ms step_avg:155.52ms step:1339/1480 train_time:206694ms step_avg:155.53ms step:1340/1480 train_time:206859ms step_avg:155.53ms step:1341/1480 train_time:207020ms step_avg:155.54ms step:1342/1480 train_time:207186ms step_avg:155.54ms step:1343/1480 train_time:207347ms step_avg:155.55ms step:1344/1480 train_time:207508ms step_avg:155.55ms step:1345/1480 train_time:207674ms step_avg:155.56ms step:1346/1480 train_time:207836ms step_avg:155.57ms step:1347/1480 train_time:207999ms step_avg:155.57ms step:1348/1480 train_time:208162ms step_avg:155.58ms step:1349/1480 train_time:208326ms step_avg:155.58ms step:1350/1480 train_time:208491ms step_avg:155.59ms step:1351/1480 train_time:208654ms step_avg:155.60ms step:1352/1480 train_time:208817ms step_avg:155.60ms step:1353/1480 train_time:208983ms step_avg:155.61ms step:1354/1480 train_time:209146ms step_avg:155.61ms step:1355/1480 train_time:209308ms step_avg:155.62ms step:1356/1480 train_time:209472ms step_avg:155.63ms step:1357/1480 train_time:209636ms step_avg:155.63ms step:1358/1480 train_time:209800ms step_avg:155.64ms step:1359/1480 train_time:209966ms step_avg:155.65ms step:1360/1480 train_time:210131ms step_avg:155.65ms step:1361/1480 train_time:210301ms step_avg:155.66ms step:1362/1480 train_time:210465ms step_avg:155.67ms step:1363/1480 train_time:210633ms step_avg:155.68ms 
step:1364/1480 train_time:210796ms step_avg:155.68ms step:1365/1480 train_time:210955ms step_avg:155.69ms step:1366/1480 train_time:211119ms step_avg:155.69ms step:1367/1480 train_time:211281ms step_avg:155.70ms step:1368/1480 train_time:211445ms step_avg:155.70ms step:1369/1480 train_time:211615ms step_avg:155.71ms step:1370/1480 train_time:211782ms step_avg:155.72ms step:1371/1480 train_time:211946ms step_avg:155.73ms step:1372/1480 train_time:212114ms step_avg:155.74ms step:1373/1480 train_time:212274ms step_avg:155.74ms step:1374/1480 train_time:212442ms step_avg:155.75ms step:1375/1480 train_time:212605ms step_avg:155.75ms step:1375/1480 val_loss:3.2994 train_time:212680ms step_avg:155.81ms step:1376/1480 train_time:212774ms step_avg:155.76ms step:1377/1480 train_time:212933ms step_avg:155.77ms step:1378/1480 train_time:213094ms step_avg:155.77ms step:1379/1480 train_time:213259ms step_avg:155.78ms step:1380/1480 train_time:213423ms step_avg:155.78ms step:1381/1480 train_time:213591ms step_avg:155.79ms step:1382/1480 train_time:213754ms step_avg:155.80ms step:1383/1480 train_time:213916ms step_avg:155.80ms step:1384/1480 train_time:214084ms step_avg:155.81ms step:1385/1480 train_time:214244ms step_avg:155.81ms step:1386/1480 train_time:214409ms step_avg:155.82ms step:1387/1480 train_time:214574ms step_avg:155.83ms step:1388/1480 train_time:214735ms step_avg:155.83ms step:1389/1480 train_time:214901ms step_avg:155.84ms step:1390/1480 train_time:215063ms step_avg:155.84ms step:1391/1480 train_time:215226ms step_avg:155.85ms step:1392/1480 train_time:215389ms step_avg:155.85ms step:1393/1480 train_time:215550ms step_avg:155.86ms step:1394/1480 train_time:215713ms step_avg:155.86ms step:1395/1480 train_time:215875ms step_avg:155.87ms step:1396/1480 train_time:216037ms step_avg:155.87ms step:1397/1480 train_time:216198ms step_avg:155.87ms step:1398/1480 train_time:216359ms step_avg:155.88ms step:1399/1480 train_time:216520ms step_avg:155.88ms step:1400/1480 
train_time:216687ms step_avg:155.89ms step:1401/1480 train_time:216849ms step_avg:155.89ms step:1402/1480 train_time:217011ms step_avg:155.90ms step:1403/1480 train_time:217175ms step_avg:155.90ms step:1404/1480 train_time:217338ms step_avg:155.91ms step:1405/1480 train_time:217505ms step_avg:155.92ms step:1406/1480 train_time:217669ms step_avg:155.92ms step:1407/1480 train_time:217830ms step_avg:155.93ms step:1408/1480 train_time:217991ms step_avg:155.93ms step:1409/1480 train_time:218164ms step_avg:155.94ms step:1410/1480 train_time:218327ms step_avg:155.95ms step:1411/1480 train_time:218487ms step_avg:155.95ms step:1412/1480 train_time:218649ms step_avg:155.95ms step:1413/1480 train_time:218811ms step_avg:155.96ms step:1414/1480 train_time:218976ms step_avg:155.97ms step:1415/1480 train_time:219142ms step_avg:155.97ms step:1416/1480 train_time:219315ms step_avg:155.99ms step:1417/1480 train_time:219480ms step_avg:155.99ms step:1418/1480 train_time:219642ms step_avg:156.00ms step:1419/1480 train_time:219808ms step_avg:156.00ms step:1420/1480 train_time:219972ms step_avg:156.01ms step:1421/1480 train_time:220138ms step_avg:156.02ms step:1422/1480 train_time:220304ms step_avg:156.02ms step:1423/1480 train_time:220467ms step_avg:156.03ms step:1424/1480 train_time:220632ms step_avg:156.03ms step:1425/1480 train_time:220803ms step_avg:156.04ms step:1426/1480 train_time:220968ms step_avg:156.05ms step:1427/1480 train_time:221133ms step_avg:156.06ms step:1428/1480 train_time:221294ms step_avg:156.06ms step:1429/1480 train_time:221453ms step_avg:156.06ms step:1430/1480 train_time:221617ms step_avg:156.07ms step:1431/1480 train_time:221785ms step_avg:156.08ms step:1432/1480 train_time:221952ms step_avg:156.08ms step:1433/1480 train_time:222121ms step_avg:156.09ms step:1434/1480 train_time:222290ms step_avg:156.10ms step:1435/1480 train_time:222455ms step_avg:156.11ms step:1436/1480 train_time:222620ms step_avg:156.11ms step:1437/1480 train_time:222783ms step_avg:156.12ms 
step:1438/1480 train_time:222945ms step_avg:156.12ms step:1439/1480 train_time:223112ms step_avg:156.13ms step:1440/1480 train_time:223274ms step_avg:156.14ms step:1441/1480 train_time:223439ms step_avg:156.14ms step:1442/1480 train_time:223605ms step_avg:156.15ms step:1443/1480 train_time:223778ms step_avg:156.16ms step:1444/1480 train_time:223942ms step_avg:156.17ms step:1445/1480 train_time:224106ms step_avg:156.17ms step:1446/1480 train_time:224272ms step_avg:156.18ms step:1447/1480 train_time:224440ms step_avg:156.19ms step:1448/1480 train_time:224603ms step_avg:156.19ms step:1449/1480 train_time:224766ms step_avg:156.20ms step:1450/1480 train_time:224930ms step_avg:156.20ms step:1451/1480 train_time:225093ms step_avg:156.21ms step:1452/1480 train_time:225259ms step_avg:156.21ms step:1453/1480 train_time:225422ms step_avg:156.22ms step:1454/1480 train_time:225584ms step_avg:156.22ms step:1455/1480 train_time:225753ms step_avg:156.23ms step:1456/1480 train_time:225917ms step_avg:156.24ms step:1457/1480 train_time:226080ms step_avg:156.24ms step:1458/1480 train_time:226243ms step_avg:156.25ms step:1459/1480 train_time:226410ms step_avg:156.25ms step:1460/1480 train_time:226572ms step_avg:156.26ms step:1461/1480 train_time:226736ms step_avg:156.26ms step:1462/1480 train_time:226902ms step_avg:156.27ms step:1463/1480 train_time:227067ms step_avg:156.27ms step:1464/1480 train_time:227232ms step_avg:156.28ms step:1465/1480 train_time:227396ms step_avg:156.29ms step:1466/1480 train_time:227558ms step_avg:156.29ms step:1467/1480 train_time:227725ms step_avg:156.30ms step:1468/1480 train_time:227889ms step_avg:156.30ms step:1469/1480 train_time:228052ms step_avg:156.31ms step:1470/1480 train_time:228221ms step_avg:156.32ms step:1471/1480 train_time:228391ms step_avg:156.32ms step:1472/1480 train_time:228561ms step_avg:156.33ms step:1473/1480 train_time:228724ms step_avg:156.34ms step:1474/1480 train_time:228891ms step_avg:156.35ms step:1475/1480 train_time:229060ms 
step_avg:156.36ms step:1476/1480 train_time:229224ms step_avg:156.36ms step:1477/1480 train_time:229392ms step_avg:156.37ms step:1478/1480 train_time:229563ms step_avg:156.38ms step:1479/1480 train_time:229728ms step_avg:156.38ms step:1480/1480 train_time:229890ms step_avg:156.39ms step:1480/1480 val_loss:3.2807 train_time:229966ms step_avg:156.44ms peak memory consumption: 34239 MiB