import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Args:
        G: 2D tensor to orthogonalize (asserted). It is never modified.
        steps: number of Newton-Schulz iterations to run.
        eps: added to the norm before dividing, to avoid division by zero.

    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # Out-of-place on purpose: `G.bfloat16()` hands back G itself when G is already
    # bfloat16, so an in-place `X /= ...` would silently overwrite the caller's tensor
    # (e.g. Muon's momentum buffer when nesterov=False).
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    transposed = G.size(0) > G.size(1)
    if transposed:
        # iterate on the wide orientation so that X @ X.T is the smaller Gram matrix
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if transposed:
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    The optimizer reads WORLD_SIZE and RANK from the environment. Parameters are grouped by
    element count, and each group owns one flat bfloat16 buffer per rank that receives the
    all-gathered updates.

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        sizes = {p.numel() for p in params}
        # one group per distinct numel so a single set of flat buffers fits every member
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """Apply one Muon update to every parameter.

        Within each group the parameters are walked in chunks of `world_size`. This rank
        orthogonalizes the update of the chunk member at offset `self.rank`, the results are
        exchanged with an asynchronous all_gather, and the previous chunk is applied while
        the next one is being computed.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # Applies the chunk whose all_gather was launched last. `handle` and
                # `params_world` are looked up at call time, so this sees the latest values.
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(0, len(params), self.world_size):
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # finish the previous chunk before its buffers are reused by the next gather
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learned weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # the sequence axis is dim 1 and the rotated axis is dim 3, so x must be 4D
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, QK-norm, rotary embeddings and a
    learned mix between the layer's own values and an externally supplied value embedding."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: expand to 4*dim, squared ReLU, project back to dim."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of `x` with the initial embedding `x0`, then
    pre-norm attention and pre-norm MLP, each added residually."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-embedding tables whose outputs are returned followed by the same
    outputs in reverse order (twelve entries in total)."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        # mirrored copy: entry i and entry (11 - i) are the same tensor
        return ve + ve[::-1]

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections between the first and second half
    of the blocks, per-layer value embeddings and document-aware sliding-window attention."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """Return the cross-entropy loss of the model on one token sequence.

        Args:
            inputs: 1D tensor of token ids (asserted); its length must be a multiple of
                BLOCK_SIZE for the `view(-1, BLOCK_SIZE)` calls below to succeed.
            targets: token ids compared against the logits after flattening.
            sliding_window_num_blocks: attention window size, counted in blocks.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # Number of attention blocks follows from the sequence length instead of being
        # pinned to 512 (which is only right for 64K tokens at block size 128).
        total_num_blocks = inputs.size(0) // BLOCK_SIZE
        # token id 50256 is treated as the document separator
        # (presumably the GPT-2 end-of-text token -- confirm against the tokenizer)
        docs = (inputs == 50256).cumsum(0)
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # per-row count of set blocks, plus column indices with the set ones first
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_num_blocks, dtype=torch.int32, device=inputs.device)
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # blocks that are non-empty but not full still need the element-wise mask_mod
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256-int32 header into pinned memory."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Serves each rank its own `seq_len`-token slice of sorted shards, one shard in memory at a time."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Rewind to the start of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return `(inputs, targets)` on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    logdir = Path("logs") / f"{run_id}"
    # parents=True: a fresh checkout has no logs/ directory yet, and a plain mkdir
    # would raise FileNotFoundError before anything is logged
    logdir.mkdir(parents=True, exist_ok=True)
    logfile = Path("logs") / f"{run_id}.txt"
    print(logfile.stem)
    # create the log file
    with logfile.open("w") as f:
        # begin the log by printing this file (the Python code)
        print(code, file=f)
        print("=" * 100, file=f)
def print0(s, logonly=False):
    """Append `s` to the log file on the master process; also echo it to stdout unless `logonly`."""
    if master_process:
        with logfile.open("a") as f:
            if not logonly:
                print(s)
            print(s, file=f)
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running python {sys.version}")
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# calculate the number of steps to take in the val loop.
assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0
val_steps = args.val_tokens // (args.sequence_length * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
inputs_train, targets_train = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768))
model = model.cuda().bfloat16()
# the CastedLinear weights go back to fp32; they are cast to the activation dtype on each call
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()]
optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.blocks.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the LR multiplier for iteration `it`: linear warmup, constant, then linear cooldown."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    # (cooldown_iters == 0 means "no cooldown"; without this guard the scheduler call
    # after the final step would divide by zero)
    elif args.cooldown_iters <= 0 or it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size lives in a device tensor that is updated in place as training progresses
sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda")
sw_num_blocks_prev = 1
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social
    frac_done = step / args.num_iterations # training progress
    sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128)
    if sw_num_blocks != sw_num_blocks_prev:
        sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True)
        sw_num_blocks_prev = sw_num_blocks

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                inputs_val, targets_val = val_loader.next_batch()
                val_loss += model(inputs_val, targets_val, sliding_window_num_blocks)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # same location as before (logs/<run_id>/state_stepNNNNNN.pt), built from logdir
        torch.save(log, logdir / f"state_step{step:06d}.pt")
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for i in range(1, train_accumulation_steps + 1):
        with contextlib.ExitStack() as stack:
            if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step
                stack.enter_context(model.no_sync())
            if step >= 5:
                stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True))
            model(inputs_train, targets_train, sliding_window_num_blocks).backward()
            inputs_train, targets_train = train_loader.next_batch()
    if train_accumulation_steps != 1:
        for p in model.parameters():
            # a parameter that took no part in the forward pass has no gradient to average
            if p.grad is not None:
                p.grad /= train_accumulation_steps
    # momentum warmup for Muon
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 09:23:57 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 31C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 37C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 39C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29056ms step_avg:nanms step:2/1480 train_time:29164ms step_avg:nanms step:3/1480 train_time:29283ms step_avg:nanms step:4/1480 train_time:29423ms step_avg:nanms step:5/1480 train_time:29565ms step_avg:nanms step:6/1480 train_time:29706ms step_avg:nanms step:7/1480 train_time:29849ms step_avg:nanms step:8/1480 train_time:29991ms step_avg:nanms step:9/1480 train_time:30132ms step_avg:nanms step:10/1480 train_time:30275ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:280ms step_avg:nanms step:13/1480 train_time:423ms step_avg:140.89ms step:14/1480 train_time:564ms step_avg:141.08ms step:15/1480 train_time:707ms step_avg:141.35ms step:16/1480 train_time:849ms step_avg:141.50ms step:17/1480 train_time:991ms step_avg:141.61ms step:18/1480 train_time:1133ms step_avg:141.65ms step:19/1480 train_time:1275ms step_avg:141.67ms step:20/1480 train_time:1418ms step_avg:141.83ms step:21/1480 train_time:1560ms step_avg:141.86ms step:22/1480 train_time:1704ms step_avg:141.97ms step:23/1480 train_time:1848ms step_avg:142.15ms step:24/1480 train_time:1992ms step_avg:142.29ms step:25/1480 train_time:2134ms step_avg:142.27ms step:26/1480 train_time:2276ms step_avg:142.22ms step:27/1480 train_time:2417ms step_avg:142.18ms step:28/1480 train_time:2560ms step_avg:142.21ms 
step:29/1480 train_time:2701ms step_avg:142.18ms step:30/1480 train_time:2844ms step_avg:142.21ms step:31/1480 train_time:2988ms step_avg:142.28ms step:32/1480 train_time:3132ms step_avg:142.38ms step:33/1480 train_time:3274ms step_avg:142.33ms step:34/1480 train_time:3417ms step_avg:142.37ms step:35/1480 train_time:3560ms step_avg:142.40ms step:36/1480 train_time:3704ms step_avg:142.44ms step:37/1480 train_time:3848ms step_avg:142.53ms step:38/1480 train_time:3992ms step_avg:142.57ms step:39/1480 train_time:4134ms step_avg:142.54ms step:40/1480 train_time:4276ms step_avg:142.53ms step:41/1480 train_time:4417ms step_avg:142.48ms step:42/1480 train_time:4560ms step_avg:142.50ms step:43/1480 train_time:4703ms step_avg:142.51ms step:44/1480 train_time:4846ms step_avg:142.53ms step:45/1480 train_time:4989ms step_avg:142.53ms step:46/1480 train_time:5131ms step_avg:142.52ms step:47/1480 train_time:5273ms step_avg:142.51ms step:48/1480 train_time:5415ms step_avg:142.49ms step:49/1480 train_time:5556ms step_avg:142.47ms step:50/1480 train_time:5700ms step_avg:142.51ms step:51/1480 train_time:5842ms step_avg:142.49ms step:52/1480 train_time:5985ms step_avg:142.51ms step:53/1480 train_time:6129ms step_avg:142.52ms step:54/1480 train_time:6271ms step_avg:142.51ms step:55/1480 train_time:6413ms step_avg:142.52ms step:56/1480 train_time:6556ms step_avg:142.52ms step:57/1480 train_time:6697ms step_avg:142.50ms step:58/1480 train_time:6840ms step_avg:142.51ms step:59/1480 train_time:6982ms step_avg:142.49ms step:60/1480 train_time:7127ms step_avg:142.55ms step:61/1480 train_time:7271ms step_avg:142.56ms step:62/1480 train_time:7413ms step_avg:142.56ms step:63/1480 train_time:7555ms step_avg:142.55ms step:64/1480 train_time:7698ms step_avg:142.55ms step:65/1480 train_time:7839ms step_avg:142.53ms step:66/1480 train_time:7982ms step_avg:142.53ms step:67/1480 train_time:8123ms step_avg:142.52ms step:68/1480 train_time:8267ms step_avg:142.54ms step:69/1480 train_time:8412ms 
step_avg:142.58ms step:70/1480 train_time:8554ms step_avg:142.57ms step:71/1480 train_time:8697ms step_avg:142.57ms step:72/1480 train_time:8839ms step_avg:142.56ms step:73/1480 train_time:8981ms step_avg:142.56ms step:74/1480 train_time:9124ms step_avg:142.56ms step:75/1480 train_time:9267ms step_avg:142.56ms step:76/1480 train_time:9410ms step_avg:142.57ms step:77/1480 train_time:9553ms step_avg:142.58ms step:78/1480 train_time:9696ms step_avg:142.59ms step:79/1480 train_time:9837ms step_avg:142.57ms step:80/1480 train_time:10363ms step_avg:148.04ms step:81/1480 train_time:10459ms step_avg:147.32ms step:82/1480 train_time:10601ms step_avg:147.24ms step:83/1480 train_time:10743ms step_avg:147.17ms step:84/1480 train_time:10885ms step_avg:147.09ms step:85/1480 train_time:11027ms step_avg:147.03ms step:86/1480 train_time:11169ms step_avg:146.96ms step:87/1480 train_time:11312ms step_avg:146.90ms step:88/1480 train_time:11455ms step_avg:146.86ms step:89/1480 train_time:11598ms step_avg:146.81ms step:90/1480 train_time:11741ms step_avg:146.76ms step:91/1480 train_time:11884ms step_avg:146.71ms step:92/1480 train_time:12027ms step_avg:146.67ms step:93/1480 train_time:12170ms step_avg:146.63ms step:94/1480 train_time:12312ms step_avg:146.58ms step:95/1480 train_time:12456ms step_avg:146.54ms step:96/1480 train_time:12994ms step_avg:151.10ms step:97/1480 train_time:13512ms step_avg:155.31ms step:98/1480 train_time:13613ms step_avg:154.69ms step:99/1480 train_time:13755ms step_avg:154.56ms step:100/1480 train_time:13897ms step_avg:154.41ms step:101/1480 train_time:14042ms step_avg:154.31ms step:102/1480 train_time:14180ms step_avg:154.14ms step:103/1480 train_time:14323ms step_avg:154.01ms step:104/1480 train_time:14466ms step_avg:153.90ms step:105/1480 train_time:14611ms step_avg:153.80ms step:106/1480 train_time:14756ms step_avg:153.71ms step:107/1480 train_time:14899ms step_avg:153.59ms step:108/1480 train_time:15041ms step_avg:153.48ms step:109/1480 train_time:15184ms 
step_avg:153.37ms step:110/1480 train_time:15327ms step_avg:153.27ms step:111/1480 train_time:15471ms step_avg:153.18ms step:112/1480 train_time:15616ms step_avg:153.10ms step:113/1480 train_time:15761ms step_avg:153.01ms step:114/1480 train_time:15907ms step_avg:152.95ms step:115/1480 train_time:16054ms step_avg:152.89ms step:116/1480 train_time:16200ms step_avg:152.83ms step:117/1480 train_time:16345ms step_avg:152.76ms step:118/1480 train_time:16491ms step_avg:152.70ms step:119/1480 train_time:16637ms step_avg:152.63ms step:120/1480 train_time:16782ms step_avg:152.57ms step:121/1480 train_time:16930ms step_avg:152.52ms step:122/1480 train_time:17075ms step_avg:152.45ms step:123/1480 train_time:17220ms step_avg:152.39ms step:124/1480 train_time:17366ms step_avg:152.33ms step:125/1480 train_time:17512ms step_avg:152.28ms step:125/1480 val_loss:4.4164 train_time:17577ms step_avg:152.84ms step:126/1480 train_time:17671ms step_avg:152.34ms step:127/1480 train_time:17813ms step_avg:152.25ms step:128/1480 train_time:17959ms step_avg:152.20ms step:129/1480 train_time:18106ms step_avg:152.15ms step:130/1480 train_time:18251ms step_avg:152.09ms step:131/1480 train_time:18395ms step_avg:152.03ms step:132/1480 train_time:18540ms step_avg:151.97ms step:133/1480 train_time:18686ms step_avg:151.92ms step:134/1480 train_time:18831ms step_avg:151.87ms step:135/1480 train_time:18977ms step_avg:151.82ms step:136/1480 train_time:19123ms step_avg:151.77ms step:137/1480 train_time:19269ms step_avg:151.72ms step:138/1480 train_time:19413ms step_avg:151.67ms step:139/1480 train_time:19560ms step_avg:151.63ms step:140/1480 train_time:19706ms step_avg:151.59ms step:141/1480 train_time:19851ms step_avg:151.53ms step:142/1480 train_time:19997ms step_avg:151.49ms step:143/1480 train_time:20143ms step_avg:151.45ms step:144/1480 train_time:20290ms step_avg:151.41ms step:145/1480 train_time:20434ms step_avg:151.36ms step:146/1480 train_time:20580ms step_avg:151.33ms step:147/1480 
train_time:20726ms step_avg:151.29ms step:148/1480 train_time:20871ms step_avg:151.24ms step:149/1480 train_time:21017ms step_avg:151.20ms step:150/1480 train_time:21164ms step_avg:151.17ms step:151/1480 train_time:21309ms step_avg:151.13ms step:152/1480 train_time:21455ms step_avg:151.09ms step:153/1480 train_time:21601ms step_avg:151.05ms step:154/1480 train_time:21747ms step_avg:151.02ms step:155/1480 train_time:21891ms step_avg:150.97ms step:156/1480 train_time:22037ms step_avg:150.94ms step:157/1480 train_time:22184ms step_avg:150.91ms step:158/1480 train_time:22330ms step_avg:150.88ms step:159/1480 train_time:22476ms step_avg:150.84ms step:160/1480 train_time:22622ms step_avg:150.81ms step:161/1480 train_time:22768ms step_avg:150.78ms step:162/1480 train_time:22913ms step_avg:150.75ms step:163/1480 train_time:23059ms step_avg:150.71ms step:164/1480 train_time:23206ms step_avg:150.69ms step:165/1480 train_time:23351ms step_avg:150.65ms step:166/1480 train_time:23496ms step_avg:150.61ms step:167/1480 train_time:23643ms step_avg:150.59ms step:168/1480 train_time:23790ms step_avg:150.57ms step:169/1480 train_time:23934ms step_avg:150.53ms step:170/1480 train_time:24080ms step_avg:150.50ms step:171/1480 train_time:24226ms step_avg:150.47ms step:172/1480 train_time:24372ms step_avg:150.45ms step:173/1480 train_time:24519ms step_avg:150.42ms step:174/1480 train_time:24666ms step_avg:150.40ms step:175/1480 train_time:24811ms step_avg:150.37ms step:176/1480 train_time:24958ms step_avg:150.35ms step:177/1480 train_time:25106ms step_avg:150.33ms step:178/1480 train_time:25251ms step_avg:150.30ms step:179/1480 train_time:25771ms step_avg:152.49ms step:180/1480 train_time:25878ms step_avg:152.22ms step:181/1480 train_time:26025ms step_avg:152.19ms step:182/1480 train_time:26170ms step_avg:152.15ms step:183/1480 train_time:26315ms step_avg:152.11ms step:184/1480 train_time:26461ms step_avg:152.07ms step:185/1480 train_time:26607ms step_avg:152.04ms step:186/1480 
train_time:26752ms step_avg:152.00ms step:187/1480 train_time:26899ms step_avg:151.97ms step:188/1480 train_time:27046ms step_avg:151.94ms step:189/1480 train_time:27208ms step_avg:152.00ms step:190/1480 train_time:27336ms step_avg:151.87ms step:191/1480 train_time:27482ms step_avg:151.84ms step:192/1480 train_time:27628ms step_avg:151.80ms step:193/1480 train_time:27775ms step_avg:151.77ms step:194/1480 train_time:27921ms step_avg:151.74ms step:195/1480 train_time:28067ms step_avg:151.72ms step:196/1480 train_time:28212ms step_avg:151.68ms step:197/1480 train_time:28359ms step_avg:151.65ms step:198/1480 train_time:28506ms step_avg:151.63ms step:199/1480 train_time:28652ms step_avg:151.60ms step:200/1480 train_time:28797ms step_avg:151.57ms step:201/1480 train_time:28944ms step_avg:151.54ms step:202/1480 train_time:29090ms step_avg:151.51ms step:203/1480 train_time:29235ms step_avg:151.48ms step:204/1480 train_time:29382ms step_avg:151.45ms step:205/1480 train_time:29528ms step_avg:151.43ms step:206/1480 train_time:29673ms step_avg:151.39ms step:207/1480 train_time:29820ms step_avg:151.37ms step:208/1480 train_time:29966ms step_avg:151.35ms step:209/1480 train_time:30112ms step_avg:151.31ms step:210/1480 train_time:30258ms step_avg:151.29ms step:211/1480 train_time:30406ms step_avg:151.27ms step:212/1480 train_time:30551ms step_avg:151.24ms step:213/1480 train_time:30697ms step_avg:151.22ms step:214/1480 train_time:30843ms step_avg:151.19ms step:215/1480 train_time:30989ms step_avg:151.17ms step:216/1480 train_time:31134ms step_avg:151.14ms step:217/1480 train_time:31281ms step_avg:151.12ms step:218/1480 train_time:31428ms step_avg:151.09ms step:219/1480 train_time:31573ms step_avg:151.07ms step:220/1480 train_time:31721ms step_avg:151.05ms step:221/1480 train_time:32258ms step_avg:152.88ms step:222/1480 train_time:32363ms step_avg:152.66ms step:223/1480 train_time:32512ms step_avg:152.64ms step:224/1480 train_time:32661ms step_avg:152.62ms step:225/1480 
train_time:32810ms step_avg:152.60ms step:226/1480 train_time:32957ms step_avg:152.58ms step:227/1480 train_time:33105ms step_avg:152.56ms step:228/1480 train_time:33252ms step_avg:152.53ms step:229/1480 train_time:33402ms step_avg:152.52ms step:230/1480 train_time:33550ms step_avg:152.50ms step:231/1480 train_time:33699ms step_avg:152.48ms step:232/1480 train_time:33848ms step_avg:152.47ms step:233/1480 train_time:33996ms step_avg:152.45ms step:234/1480 train_time:34145ms step_avg:152.43ms step:235/1480 train_time:34292ms step_avg:152.41ms step:236/1480 train_time:34441ms step_avg:152.39ms step:237/1480 train_time:34590ms step_avg:152.38ms step:238/1480 train_time:34737ms step_avg:152.35ms step:239/1480 train_time:34886ms step_avg:152.34ms step:240/1480 train_time:35033ms step_avg:152.32ms step:241/1480 train_time:35182ms step_avg:152.30ms step:242/1480 train_time:35330ms step_avg:152.28ms step:243/1480 train_time:35479ms step_avg:152.27ms step:244/1480 train_time:35627ms step_avg:152.25ms step:245/1480 train_time:35777ms step_avg:152.24ms step:246/1480 train_time:35926ms step_avg:152.23ms step:247/1480 train_time:36074ms step_avg:152.21ms step:248/1480 train_time:36223ms step_avg:152.20ms step:249/1480 train_time:36371ms step_avg:152.18ms step:250/1480 train_time:36519ms step_avg:152.16ms step:250/1480 val_loss:3.9985 train_time:36586ms step_avg:152.44ms step:251/1480 train_time:36677ms step_avg:152.19ms step:252/1480 train_time:36826ms step_avg:152.17ms step:253/1480 train_time:36974ms step_avg:152.16ms step:254/1480 train_time:37122ms step_avg:152.14ms step:255/1480 train_time:37270ms step_avg:152.12ms step:256/1480 train_time:37418ms step_avg:152.10ms step:257/1480 train_time:37566ms step_avg:152.09ms step:258/1480 train_time:37714ms step_avg:152.07ms step:259/1480 train_time:37864ms step_avg:152.07ms step:260/1480 train_time:38012ms step_avg:152.05ms step:261/1480 train_time:38161ms step_avg:152.04ms step:262/1480 train_time:38309ms step_avg:152.02ms 
step:263/1480 train_time:38458ms step_avg:152.01ms step:264/1480 train_time:38607ms step_avg:151.99ms step:265/1480 train_time:38755ms step_avg:151.98ms step:266/1480 train_time:38905ms step_avg:151.97ms step:267/1480 train_time:39052ms step_avg:151.95ms step:268/1480 train_time:39200ms step_avg:151.94ms step:269/1480 train_time:39349ms step_avg:151.93ms step:270/1480 train_time:39498ms step_avg:151.91ms step:271/1480 train_time:39647ms step_avg:151.91ms step:272/1480 train_time:39795ms step_avg:151.89ms step:273/1480 train_time:39945ms step_avg:151.88ms step:274/1480 train_time:40091ms step_avg:151.86ms step:275/1480 train_time:40240ms step_avg:151.85ms step:276/1480 train_time:40388ms step_avg:151.84ms step:277/1480 train_time:40537ms step_avg:151.82ms step:278/1480 train_time:40686ms step_avg:151.81ms step:279/1480 train_time:40833ms step_avg:151.80ms step:280/1480 train_time:40982ms step_avg:151.78ms step:281/1480 train_time:41130ms step_avg:151.77ms step:282/1480 train_time:41280ms step_avg:151.76ms step:283/1480 train_time:41429ms step_avg:151.75ms step:284/1480 train_time:41576ms step_avg:151.74ms step:285/1480 train_time:41727ms step_avg:151.73ms step:286/1480 train_time:41874ms step_avg:151.72ms step:287/1480 train_time:42023ms step_avg:151.71ms step:288/1480 train_time:42170ms step_avg:151.69ms step:289/1480 train_time:42319ms step_avg:151.68ms step:290/1480 train_time:42467ms step_avg:151.67ms step:291/1480 train_time:42615ms step_avg:151.66ms step:292/1480 train_time:42764ms step_avg:151.65ms step:293/1480 train_time:42912ms step_avg:151.63ms step:294/1480 train_time:43061ms step_avg:151.62ms step:295/1480 train_time:43209ms step_avg:151.61ms step:296/1480 train_time:43359ms step_avg:151.60ms step:297/1480 train_time:43507ms step_avg:151.59ms step:298/1480 train_time:43655ms step_avg:151.58ms step:299/1480 train_time:43803ms step_avg:151.57ms step:300/1480 train_time:43951ms step_avg:151.55ms step:301/1480 train_time:44101ms step_avg:151.55ms 
step:302/1480 train_time:44249ms step_avg:151.54ms step:303/1480 train_time:44399ms step_avg:151.53ms step:304/1480 train_time:44548ms step_avg:151.52ms step:305/1480 train_time:44696ms step_avg:151.51ms step:306/1480 train_time:44846ms step_avg:151.51ms step:307/1480 train_time:44993ms step_avg:151.49ms step:308/1480 train_time:45141ms step_avg:151.48ms step:309/1480 train_time:45289ms step_avg:151.47ms step:310/1480 train_time:45437ms step_avg:151.46ms step:311/1480 train_time:45586ms step_avg:151.45ms step:312/1480 train_time:45734ms step_avg:151.44ms step:313/1480 train_time:45882ms step_avg:151.43ms step:314/1480 train_time:46031ms step_avg:151.42ms step:315/1480 train_time:46180ms step_avg:151.41ms step:316/1480 train_time:46329ms step_avg:151.40ms step:317/1480 train_time:46476ms step_avg:151.39ms step:318/1480 train_time:46626ms step_avg:151.38ms step:319/1480 train_time:46773ms step_avg:151.37ms step:320/1480 train_time:46922ms step_avg:151.36ms step:321/1480 train_time:47070ms step_avg:151.35ms step:322/1480 train_time:47218ms step_avg:151.34ms step:323/1480 train_time:47367ms step_avg:151.33ms step:324/1480 train_time:47516ms step_avg:151.32ms step:325/1480 train_time:47664ms step_avg:151.32ms step:326/1480 train_time:47812ms step_avg:151.30ms step:327/1480 train_time:47961ms step_avg:151.30ms step:328/1480 train_time:48109ms step_avg:151.29ms step:329/1480 train_time:48257ms step_avg:151.28ms step:330/1480 train_time:48408ms step_avg:151.27ms step:331/1480 train_time:48558ms step_avg:151.27ms step:332/1480 train_time:48709ms step_avg:151.27ms step:333/1480 train_time:48861ms step_avg:151.27ms step:334/1480 train_time:49011ms step_avg:151.27ms step:335/1480 train_time:49162ms step_avg:151.27ms step:336/1480 train_time:49312ms step_avg:151.26ms step:337/1480 train_time:49463ms step_avg:151.26ms step:338/1480 train_time:49613ms step_avg:151.26ms step:339/1480 train_time:49764ms step_avg:151.26ms step:340/1480 train_time:49916ms step_avg:151.26ms 
step:341/1480 train_time:50067ms step_avg:151.26ms step:342/1480 train_time:50217ms step_avg:151.26ms step:343/1480 train_time:50369ms step_avg:151.26ms step:344/1480 train_time:50520ms step_avg:151.26ms step:345/1480 train_time:50671ms step_avg:151.26ms step:346/1480 train_time:50821ms step_avg:151.25ms step:347/1480 train_time:50972ms step_avg:151.25ms step:348/1480 train_time:51123ms step_avg:151.25ms step:349/1480 train_time:51272ms step_avg:151.25ms step:350/1480 train_time:51424ms step_avg:151.25ms step:351/1480 train_time:51574ms step_avg:151.24ms step:352/1480 train_time:51726ms step_avg:151.25ms step:353/1480 train_time:51877ms step_avg:151.25ms step:354/1480 train_time:52029ms step_avg:151.25ms step:355/1480 train_time:52179ms step_avg:151.24ms step:356/1480 train_time:52330ms step_avg:151.24ms step:357/1480 train_time:52481ms step_avg:151.24ms step:358/1480 train_time:52632ms step_avg:151.24ms step:359/1480 train_time:52783ms step_avg:151.24ms step:360/1480 train_time:52933ms step_avg:151.24ms step:361/1480 train_time:53085ms step_avg:151.24ms step:362/1480 train_time:53235ms step_avg:151.24ms step:363/1480 train_time:53387ms step_avg:151.24ms step:364/1480 train_time:53537ms step_avg:151.23ms step:365/1480 train_time:53688ms step_avg:151.23ms step:366/1480 train_time:53837ms step_avg:151.23ms step:367/1480 train_time:53989ms step_avg:151.23ms step:368/1480 train_time:54138ms step_avg:151.22ms step:369/1480 train_time:54289ms step_avg:151.22ms step:370/1480 train_time:54439ms step_avg:151.22ms step:371/1480 train_time:54590ms step_avg:151.22ms step:372/1480 train_time:54742ms step_avg:151.22ms step:373/1480 train_time:54893ms step_avg:151.22ms step:374/1480 train_time:55043ms step_avg:151.22ms step:375/1480 train_time:55193ms step_avg:151.21ms step:375/1480 val_loss:3.8160 train_time:55261ms step_avg:151.40ms step:376/1480 train_time:55355ms step_avg:151.24ms step:377/1480 train_time:55502ms step_avg:151.23ms step:378/1480 train_time:55651ms 
step_avg:151.23ms step:379/1480 train_time:55815ms step_avg:151.26ms step:380/1480 train_time:55951ms step_avg:151.22ms step:381/1480 train_time:56102ms step_avg:151.22ms step:382/1480 train_time:56252ms step_avg:151.21ms step:383/1480 train_time:56404ms step_avg:151.22ms step:384/1480 train_time:56554ms step_avg:151.21ms step:385/1480 train_time:56706ms step_avg:151.22ms step:386/1480 train_time:56856ms step_avg:151.21ms step:387/1480 train_time:57007ms step_avg:151.21ms step:388/1480 train_time:57157ms step_avg:151.21ms step:389/1480 train_time:57307ms step_avg:151.21ms step:390/1480 train_time:57457ms step_avg:151.20ms step:391/1480 train_time:57608ms step_avg:151.20ms step:392/1480 train_time:57758ms step_avg:151.20ms step:393/1480 train_time:57909ms step_avg:151.20ms step:394/1480 train_time:58060ms step_avg:151.20ms step:395/1480 train_time:58211ms step_avg:151.20ms step:396/1480 train_time:58362ms step_avg:151.20ms step:397/1480 train_time:58512ms step_avg:151.19ms step:398/1480 train_time:58664ms step_avg:151.20ms step:399/1480 train_time:58813ms step_avg:151.19ms step:400/1480 train_time:58965ms step_avg:151.19ms step:401/1480 train_time:59115ms step_avg:151.19ms step:402/1480 train_time:59266ms step_avg:151.19ms step:403/1480 train_time:59417ms step_avg:151.19ms step:404/1480 train_time:59569ms step_avg:151.19ms step:405/1480 train_time:59720ms step_avg:151.19ms step:406/1480 train_time:59871ms step_avg:151.19ms step:407/1480 train_time:60022ms step_avg:151.19ms step:408/1480 train_time:60172ms step_avg:151.19ms step:409/1480 train_time:60324ms step_avg:151.19ms step:410/1480 train_time:60474ms step_avg:151.19ms step:411/1480 train_time:60626ms step_avg:151.19ms step:412/1480 train_time:60776ms step_avg:151.18ms step:413/1480 train_time:60927ms step_avg:151.18ms step:414/1480 train_time:61078ms step_avg:151.18ms step:415/1480 train_time:61229ms step_avg:151.18ms step:416/1480 train_time:61379ms step_avg:151.18ms step:417/1480 train_time:61530ms 
step_avg:151.18ms step:418/1480 train_time:61680ms step_avg:151.18ms step:419/1480 train_time:61831ms step_avg:151.18ms step:420/1480 train_time:61983ms step_avg:151.18ms step:421/1480 train_time:62133ms step_avg:151.18ms step:422/1480 train_time:62284ms step_avg:151.18ms step:423/1480 train_time:62435ms step_avg:151.17ms step:424/1480 train_time:62587ms step_avg:151.18ms step:425/1480 train_time:62737ms step_avg:151.17ms step:426/1480 train_time:62888ms step_avg:151.17ms step:427/1480 train_time:63039ms step_avg:151.17ms step:428/1480 train_time:63190ms step_avg:151.17ms step:429/1480 train_time:63342ms step_avg:151.17ms step:430/1480 train_time:63491ms step_avg:151.17ms step:431/1480 train_time:63643ms step_avg:151.17ms step:432/1480 train_time:63793ms step_avg:151.17ms step:433/1480 train_time:63944ms step_avg:151.17ms step:434/1480 train_time:64095ms step_avg:151.17ms step:435/1480 train_time:64247ms step_avg:151.17ms step:436/1480 train_time:64396ms step_avg:151.16ms step:437/1480 train_time:64548ms step_avg:151.17ms step:438/1480 train_time:64697ms step_avg:151.16ms step:439/1480 train_time:64848ms step_avg:151.16ms step:440/1480 train_time:64999ms step_avg:151.16ms step:441/1480 train_time:65151ms step_avg:151.16ms step:442/1480 train_time:65304ms step_avg:151.17ms step:443/1480 train_time:65455ms step_avg:151.17ms step:444/1480 train_time:65608ms step_avg:151.17ms step:445/1480 train_time:65761ms step_avg:151.17ms step:446/1480 train_time:65913ms step_avg:151.18ms step:447/1480 train_time:66067ms step_avg:151.18ms step:448/1480 train_time:66220ms step_avg:151.19ms step:449/1480 train_time:66373ms step_avg:151.19ms step:450/1480 train_time:66527ms step_avg:151.20ms step:451/1480 train_time:66679ms step_avg:151.20ms step:452/1480 train_time:66832ms step_avg:151.20ms step:453/1480 train_time:66985ms step_avg:151.21ms step:454/1480 train_time:67138ms step_avg:151.21ms step:455/1480 train_time:67290ms step_avg:151.21ms step:456/1480 train_time:67444ms 
step_avg:151.22ms step:457/1480 train_time:67595ms step_avg:151.22ms step:458/1480 train_time:67748ms step_avg:151.22ms step:459/1480 train_time:67901ms step_avg:151.23ms step:460/1480 train_time:68054ms step_avg:151.23ms step:461/1480 train_time:68207ms step_avg:151.23ms step:462/1480 train_time:68360ms step_avg:151.24ms step:463/1480 train_time:68513ms step_avg:151.24ms step:464/1480 train_time:68667ms step_avg:151.25ms step:465/1480 train_time:68819ms step_avg:151.25ms step:466/1480 train_time:68971ms step_avg:151.25ms step:467/1480 train_time:69124ms step_avg:151.26ms step:468/1480 train_time:69277ms step_avg:151.26ms step:469/1480 train_time:69429ms step_avg:151.26ms step:470/1480 train_time:69582ms step_avg:151.26ms step:471/1480 train_time:69734ms step_avg:151.27ms step:472/1480 train_time:69886ms step_avg:151.27ms step:473/1480 train_time:70040ms step_avg:151.27ms step:474/1480 train_time:70193ms step_avg:151.28ms step:475/1480 train_time:70347ms step_avg:151.28ms step:476/1480 train_time:70500ms step_avg:151.29ms step:477/1480 train_time:70653ms step_avg:151.29ms step:478/1480 train_time:70806ms step_avg:151.29ms step:479/1480 train_time:70958ms step_avg:151.30ms step:480/1480 train_time:71109ms step_avg:151.30ms step:481/1480 train_time:71262ms step_avg:151.30ms step:482/1480 train_time:71415ms step_avg:151.30ms step:483/1480 train_time:71568ms step_avg:151.31ms step:484/1480 train_time:71724ms step_avg:151.32ms step:485/1480 train_time:71876ms step_avg:151.32ms step:486/1480 train_time:72029ms step_avg:151.32ms step:487/1480 train_time:72181ms step_avg:151.32ms step:488/1480 train_time:72333ms step_avg:151.32ms step:489/1480 train_time:72486ms step_avg:151.33ms step:490/1480 train_time:72640ms step_avg:151.33ms step:491/1480 train_time:72796ms step_avg:151.34ms step:492/1480 train_time:72949ms step_avg:151.35ms step:493/1480 train_time:73103ms step_avg:151.35ms step:494/1480 train_time:73254ms step_avg:151.35ms step:495/1480 train_time:73407ms 
step_avg:151.35ms step:496/1480 train_time:73560ms step_avg:151.36ms step:497/1480 train_time:73714ms step_avg:151.36ms step:498/1480 train_time:73866ms step_avg:151.37ms step:499/1480 train_time:74020ms step_avg:151.37ms step:500/1480 train_time:74173ms step_avg:151.37ms step:500/1480 val_loss:3.6874 train_time:74244ms step_avg:151.52ms step:501/1480 train_time:74334ms step_avg:151.39ms step:502/1480 train_time:74488ms step_avg:151.40ms step:503/1480 train_time:74640ms step_avg:151.40ms step:504/1480 train_time:74792ms step_avg:151.40ms step:505/1480 train_time:74943ms step_avg:151.40ms step:506/1480 train_time:75096ms step_avg:151.40ms step:507/1480 train_time:75249ms step_avg:151.41ms step:508/1480 train_time:75402ms step_avg:151.41ms step:509/1480 train_time:75557ms step_avg:151.42ms step:510/1480 train_time:75709ms step_avg:151.42ms step:511/1480 train_time:75862ms step_avg:151.42ms step:512/1480 train_time:76015ms step_avg:151.43ms step:513/1480 train_time:76167ms step_avg:151.43ms step:514/1480 train_time:76320ms step_avg:151.43ms step:515/1480 train_time:76474ms step_avg:151.43ms step:516/1480 train_time:76628ms step_avg:151.44ms step:517/1480 train_time:76781ms step_avg:151.44ms step:518/1480 train_time:76933ms step_avg:151.44ms step:519/1480 train_time:77087ms step_avg:151.45ms step:520/1480 train_time:77239ms step_avg:151.45ms step:521/1480 train_time:77392ms step_avg:151.45ms step:522/1480 train_time:77545ms step_avg:151.46ms step:523/1480 train_time:77699ms step_avg:151.46ms step:524/1480 train_time:77852ms step_avg:151.46ms step:525/1480 train_time:78004ms step_avg:151.46ms step:526/1480 train_time:78157ms step_avg:151.47ms step:527/1480 train_time:78309ms step_avg:151.47ms step:528/1480 train_time:78462ms step_avg:151.47ms step:529/1480 train_time:78616ms step_avg:151.48ms step:530/1480 train_time:78768ms step_avg:151.48ms step:531/1480 train_time:78920ms step_avg:151.48ms step:532/1480 train_time:79074ms step_avg:151.48ms step:533/1480 
train_time:79226ms step_avg:151.48ms step:534/1480 train_time:79379ms step_avg:151.49ms step:535/1480 train_time:79534ms step_avg:151.49ms step:536/1480 train_time:79686ms step_avg:151.49ms step:537/1480 train_time:79839ms step_avg:151.50ms step:538/1480 train_time:79992ms step_avg:151.50ms step:539/1480 train_time:80146ms step_avg:151.50ms step:540/1480 train_time:80300ms step_avg:151.51ms step:541/1480 train_time:80453ms step_avg:151.51ms step:542/1480 train_time:80606ms step_avg:151.51ms step:543/1480 train_time:80758ms step_avg:151.52ms step:544/1480 train_time:80911ms step_avg:151.52ms step:545/1480 train_time:81062ms step_avg:151.52ms step:546/1480 train_time:81216ms step_avg:151.52ms step:547/1480 train_time:81368ms step_avg:151.52ms step:548/1480 train_time:81521ms step_avg:151.53ms step:549/1480 train_time:81675ms step_avg:151.53ms step:550/1480 train_time:81828ms step_avg:151.53ms step:551/1480 train_time:81982ms step_avg:151.54ms step:552/1480 train_time:82138ms step_avg:151.55ms step:553/1480 train_time:82292ms step_avg:151.55ms step:554/1480 train_time:82446ms step_avg:151.55ms step:555/1480 train_time:82600ms step_avg:151.56ms step:556/1480 train_time:82755ms step_avg:151.57ms step:557/1480 train_time:82909ms step_avg:151.57ms step:558/1480 train_time:83065ms step_avg:151.58ms step:559/1480 train_time:83219ms step_avg:151.58ms step:560/1480 train_time:83374ms step_avg:151.59ms step:561/1480 train_time:83529ms step_avg:151.59ms step:562/1480 train_time:83683ms step_avg:151.60ms step:563/1480 train_time:83837ms step_avg:151.60ms step:564/1480 train_time:83993ms step_avg:151.61ms step:565/1480 train_time:84148ms step_avg:151.62ms step:566/1480 train_time:84303ms step_avg:151.62ms step:567/1480 train_time:84458ms step_avg:151.63ms step:568/1480 train_time:84611ms step_avg:151.63ms step:569/1480 train_time:84775ms step_avg:151.66ms step:570/1480 train_time:84920ms step_avg:151.64ms step:571/1480 train_time:85074ms step_avg:151.65ms step:572/1480 
train_time:85230ms step_avg:151.65ms step:573/1480 train_time:85386ms step_avg:151.66ms step:574/1480 train_time:85542ms step_avg:151.67ms step:575/1480 train_time:85695ms step_avg:151.67ms step:576/1480 train_time:85850ms step_avg:151.68ms step:577/1480 train_time:86005ms step_avg:151.69ms step:578/1480 train_time:86160ms step_avg:151.69ms step:579/1480 train_time:86316ms step_avg:151.70ms step:580/1480 train_time:86470ms step_avg:151.70ms step:581/1480 train_time:86625ms step_avg:151.71ms step:582/1480 train_time:86779ms step_avg:151.71ms step:583/1480 train_time:86934ms step_avg:151.72ms step:584/1480 train_time:87088ms step_avg:151.72ms step:585/1480 train_time:87243ms step_avg:151.73ms step:586/1480 train_time:87397ms step_avg:151.73ms step:587/1480 train_time:87552ms step_avg:151.74ms step:588/1480 train_time:87706ms step_avg:151.74ms step:589/1480 train_time:87860ms step_avg:151.74ms step:590/1480 train_time:88016ms step_avg:151.75ms step:591/1480 train_time:88169ms step_avg:151.75ms step:592/1480 train_time:88323ms step_avg:151.76ms step:593/1480 train_time:88478ms step_avg:151.76ms step:594/1480 train_time:88633ms step_avg:151.77ms step:595/1480 train_time:88789ms step_avg:151.78ms step:596/1480 train_time:88946ms step_avg:151.78ms step:597/1480 train_time:89100ms step_avg:151.79ms step:598/1480 train_time:89255ms step_avg:151.79ms step:599/1480 train_time:89409ms step_avg:151.80ms step:600/1480 train_time:89564ms step_avg:151.80ms step:601/1480 train_time:89718ms step_avg:151.81ms step:602/1480 train_time:89874ms step_avg:151.81ms step:603/1480 train_time:90029ms step_avg:151.82ms step:604/1480 train_time:90184ms step_avg:151.82ms step:605/1480 train_time:90339ms step_avg:151.83ms step:606/1480 train_time:90493ms step_avg:151.83ms step:607/1480 train_time:90649ms step_avg:151.84ms step:608/1480 train_time:90804ms step_avg:151.85ms step:609/1480 train_time:90959ms step_avg:151.85ms step:610/1480 train_time:91113ms step_avg:151.85ms step:611/1480 
train_time:91267ms step_avg:151.86ms step:612/1480 train_time:91421ms step_avg:151.86ms step:613/1480 train_time:91577ms step_avg:151.87ms step:614/1480 train_time:91733ms step_avg:151.88ms step:615/1480 train_time:91888ms step_avg:151.88ms step:616/1480 train_time:92041ms step_avg:151.88ms step:617/1480 train_time:92195ms step_avg:151.89ms step:618/1480 train_time:92350ms step_avg:151.89ms step:619/1480 train_time:92506ms step_avg:151.90ms step:620/1480 train_time:92660ms step_avg:151.90ms step:621/1480 train_time:92816ms step_avg:151.91ms step:622/1480 train_time:92970ms step_avg:151.91ms step:623/1480 train_time:93126ms step_avg:151.92ms step:624/1480 train_time:93281ms step_avg:151.92ms step:625/1480 train_time:93437ms step_avg:151.93ms step:625/1480 val_loss:3.6073 train_time:93507ms step_avg:152.04ms step:626/1480 train_time:93597ms step_avg:151.94ms step:627/1480 train_time:93751ms step_avg:151.95ms step:628/1480 train_time:93906ms step_avg:151.95ms step:629/1480 train_time:94060ms step_avg:151.95ms step:630/1480 train_time:94215ms step_avg:151.96ms step:631/1480 train_time:94368ms step_avg:151.96ms step:632/1480 train_time:94521ms step_avg:151.96ms step:633/1480 train_time:94677ms step_avg:151.97ms step:634/1480 train_time:94832ms step_avg:151.98ms step:635/1480 train_time:94987ms step_avg:151.98ms step:636/1480 train_time:95141ms step_avg:151.98ms step:637/1480 train_time:95296ms step_avg:151.99ms step:638/1480 train_time:95451ms step_avg:151.99ms step:639/1480 train_time:95606ms step_avg:152.00ms step:640/1480 train_time:95760ms step_avg:152.00ms step:641/1480 train_time:95916ms step_avg:152.01ms step:642/1480 train_time:96069ms step_avg:152.01ms step:643/1480 train_time:96224ms step_avg:152.01ms step:644/1480 train_time:96378ms step_avg:152.02ms step:645/1480 train_time:96535ms step_avg:152.02ms step:646/1480 train_time:96689ms step_avg:152.03ms step:647/1480 train_time:96845ms step_avg:152.03ms step:648/1480 train_time:96999ms step_avg:152.04ms 
step:649/1480 train_time:97154ms step_avg:152.04ms step:650/1480 train_time:97309ms step_avg:152.05ms step:651/1480 train_time:97464ms step_avg:152.05ms step:652/1480 train_time:97618ms step_avg:152.05ms step:653/1480 train_time:97774ms step_avg:152.06ms step:654/1480 train_time:97928ms step_avg:152.06ms step:655/1480 train_time:98083ms step_avg:152.07ms step:656/1480 train_time:98238ms step_avg:152.07ms step:657/1480 train_time:98391ms step_avg:152.07ms step:658/1480 train_time:98546ms step_avg:152.08ms step:659/1480 train_time:98701ms step_avg:152.08ms step:660/1480 train_time:98857ms step_avg:152.09ms step:661/1480 train_time:99014ms step_avg:152.10ms step:662/1480 train_time:99170ms step_avg:152.10ms step:663/1480 train_time:99326ms step_avg:152.11ms step:664/1480 train_time:99481ms step_avg:152.11ms step:665/1480 train_time:99638ms step_avg:152.12ms step:666/1480 train_time:99794ms step_avg:152.13ms step:667/1480 train_time:99952ms step_avg:152.13ms step:668/1480 train_time:100108ms step_avg:152.14ms step:669/1480 train_time:100265ms step_avg:152.15ms step:670/1480 train_time:100421ms step_avg:152.15ms step:671/1480 train_time:100577ms step_avg:152.16ms step:672/1480 train_time:100734ms step_avg:152.17ms step:673/1480 train_time:100891ms step_avg:152.17ms step:674/1480 train_time:101047ms step_avg:152.18ms step:675/1480 train_time:101205ms step_avg:152.19ms step:676/1480 train_time:101362ms step_avg:152.19ms step:677/1480 train_time:101518ms step_avg:152.20ms step:678/1480 train_time:101675ms step_avg:152.21ms step:679/1480 train_time:101832ms step_avg:152.21ms step:680/1480 train_time:101989ms step_avg:152.22ms step:681/1480 train_time:102144ms step_avg:152.23ms step:682/1480 train_time:102300ms step_avg:152.23ms step:683/1480 train_time:102457ms step_avg:152.24ms step:684/1480 train_time:102613ms step_avg:152.24ms step:685/1480 train_time:102769ms step_avg:152.25ms step:686/1480 train_time:102926ms step_avg:152.26ms step:687/1480 train_time:103082ms 
step_avg:152.26ms step:688/1480 train_time:103239ms step_avg:152.27ms step:689/1480 train_time:103396ms step_avg:152.28ms step:690/1480 train_time:103554ms step_avg:152.28ms step:691/1480 train_time:103711ms step_avg:152.29ms step:692/1480 train_time:103867ms step_avg:152.30ms step:693/1480 train_time:104024ms step_avg:152.30ms step:694/1480 train_time:104181ms step_avg:152.31ms step:695/1480 train_time:104336ms step_avg:152.32ms step:696/1480 train_time:104492ms step_avg:152.32ms step:697/1480 train_time:104649ms step_avg:152.33ms step:698/1480 train_time:104805ms step_avg:152.33ms step:699/1480 train_time:104962ms step_avg:152.34ms step:700/1480 train_time:105118ms step_avg:152.35ms step:701/1480 train_time:105274ms step_avg:152.35ms step:702/1480 train_time:105432ms step_avg:152.36ms step:703/1480 train_time:105589ms step_avg:152.36ms step:704/1480 train_time:105745ms step_avg:152.37ms step:705/1480 train_time:105901ms step_avg:152.37ms step:706/1480 train_time:106060ms step_avg:152.38ms step:707/1480 train_time:106217ms step_avg:152.39ms step:708/1480 train_time:106372ms step_avg:152.39ms step:709/1480 train_time:106527ms step_avg:152.40ms step:710/1480 train_time:106683ms step_avg:152.40ms step:711/1480 train_time:106839ms step_avg:152.41ms step:712/1480 train_time:106997ms step_avg:152.42ms step:713/1480 train_time:107156ms step_avg:152.43ms step:714/1480 train_time:107312ms step_avg:152.43ms step:715/1480 train_time:107467ms step_avg:152.43ms step:716/1480 train_time:107621ms step_avg:152.44ms step:717/1480 train_time:107778ms step_avg:152.44ms step:718/1480 train_time:107935ms step_avg:152.45ms step:719/1480 train_time:108090ms step_avg:152.45ms step:720/1480 train_time:108248ms step_avg:152.46ms step:721/1480 train_time:108406ms step_avg:152.47ms step:722/1480 train_time:108561ms step_avg:152.47ms step:723/1480 train_time:108717ms step_avg:152.48ms step:724/1480 train_time:108874ms step_avg:152.48ms step:725/1480 train_time:109031ms step_avg:152.49ms 
step:726/1480 train_time:109186ms step_avg:152.49ms step:727/1480 train_time:109343ms step_avg:152.50ms step:728/1480 train_time:109499ms step_avg:152.51ms step:729/1480 train_time:109656ms step_avg:152.51ms step:730/1480 train_time:109814ms step_avg:152.52ms step:731/1480 train_time:109971ms step_avg:152.53ms step:732/1480 train_time:110127ms step_avg:152.53ms step:733/1480 train_time:110283ms step_avg:152.54ms step:734/1480 train_time:110440ms step_avg:152.54ms step:735/1480 train_time:110596ms step_avg:152.55ms step:736/1480 train_time:110753ms step_avg:152.55ms step:737/1480 train_time:110909ms step_avg:152.56ms step:738/1480 train_time:111064ms step_avg:152.56ms step:739/1480 train_time:111219ms step_avg:152.56ms step:740/1480 train_time:111377ms step_avg:152.57ms step:741/1480 train_time:111535ms step_avg:152.58ms step:742/1480 train_time:111690ms step_avg:152.58ms step:743/1480 train_time:111845ms step_avg:152.59ms step:744/1480 train_time:112002ms step_avg:152.59ms step:745/1480 train_time:112159ms step_avg:152.60ms step:746/1480 train_time:112315ms step_avg:152.60ms step:747/1480 train_time:112472ms step_avg:152.61ms step:748/1480 train_time:112633ms step_avg:152.62ms step:749/1480 train_time:112789ms step_avg:152.62ms step:750/1480 train_time:112945ms step_avg:152.63ms step:750/1480 val_loss:3.5520 train_time:113017ms step_avg:152.73ms step:751/1480 train_time:113107ms step_avg:152.64ms step:752/1480 train_time:113263ms step_avg:152.65ms step:753/1480 train_time:113420ms step_avg:152.65ms step:754/1480 train_time:113576ms step_avg:152.66ms step:755/1480 train_time:113731ms step_avg:152.66ms step:756/1480 train_time:113887ms step_avg:152.66ms step:757/1480 train_time:114045ms step_avg:152.67ms step:758/1480 train_time:114202ms step_avg:152.68ms step:759/1480 train_time:114367ms step_avg:152.69ms step:760/1480 train_time:114516ms step_avg:152.69ms step:761/1480 train_time:114672ms step_avg:152.69ms step:762/1480 train_time:114828ms step_avg:152.70ms 
step:763/1480 train_time:114984ms step_avg:152.70ms step:764/1480 train_time:115141ms step_avg:152.71ms step:765/1480 train_time:115298ms step_avg:152.71ms step:766/1480 train_time:115456ms step_avg:152.72ms step:767/1480 train_time:115613ms step_avg:152.73ms step:768/1480 train_time:115769ms step_avg:152.73ms step:769/1480 train_time:115926ms step_avg:152.74ms step:770/1480 train_time:116085ms step_avg:152.74ms step:771/1480 train_time:116244ms step_avg:152.75ms step:772/1480 train_time:116401ms step_avg:152.76ms step:773/1480 train_time:116559ms step_avg:152.76ms step:774/1480 train_time:116717ms step_avg:152.77ms step:775/1480 train_time:116877ms step_avg:152.78ms step:776/1480 train_time:117037ms step_avg:152.79ms step:777/1480 train_time:117199ms step_avg:152.80ms step:778/1480 train_time:117357ms step_avg:152.81ms step:779/1480 train_time:117513ms step_avg:152.81ms step:780/1480 train_time:117671ms step_avg:152.82ms step:781/1480 train_time:117828ms step_avg:152.83ms step:782/1480 train_time:117985ms step_avg:152.83ms step:783/1480 train_time:118143ms step_avg:152.84ms step:784/1480 train_time:118302ms step_avg:152.84ms step:785/1480 train_time:118459ms step_avg:152.85ms step:786/1480 train_time:118617ms step_avg:152.86ms step:787/1480 train_time:118776ms step_avg:152.86ms step:788/1480 train_time:118937ms step_avg:152.87ms step:789/1480 train_time:119094ms step_avg:152.88ms step:790/1480 train_time:119251ms step_avg:152.89ms step:791/1480 train_time:119409ms step_avg:152.89ms step:792/1480 train_time:119567ms step_avg:152.90ms step:793/1480 train_time:119724ms step_avg:152.90ms step:794/1480 train_time:119881ms step_avg:152.91ms step:795/1480 train_time:120042ms step_avg:152.92ms step:796/1480 train_time:120200ms step_avg:152.93ms step:797/1480 train_time:120359ms step_avg:152.93ms step:798/1480 train_time:120518ms step_avg:152.94ms step:799/1480 train_time:120679ms step_avg:152.95ms step:800/1480 train_time:120838ms step_avg:152.96ms step:801/1480 
train_time:120995ms step_avg:152.96ms step:802/1480 train_time:121155ms step_avg:152.97ms step:803/1480 train_time:121313ms step_avg:152.98ms step:804/1480 train_time:121471ms step_avg:152.99ms step:805/1480 train_time:121630ms step_avg:152.99ms step:806/1480 train_time:121786ms step_avg:153.00ms step:807/1480 train_time:121944ms step_avg:153.00ms step:808/1480 train_time:122103ms step_avg:153.01ms step:809/1480 train_time:122259ms step_avg:153.02ms step:810/1480 train_time:122416ms step_avg:153.02ms step:811/1480 train_time:122575ms step_avg:153.03ms step:812/1480 train_time:122732ms step_avg:153.03ms step:813/1480 train_time:122889ms step_avg:153.04ms step:814/1480 train_time:123046ms step_avg:153.04ms step:815/1480 train_time:123204ms step_avg:153.05ms step:816/1480 train_time:123363ms step_avg:153.06ms step:817/1480 train_time:123520ms step_avg:153.06ms step:818/1480 train_time:123677ms step_avg:153.07ms step:819/1480 train_time:123834ms step_avg:153.07ms step:820/1480 train_time:123991ms step_avg:153.08ms step:821/1480 train_time:124148ms step_avg:153.08ms step:822/1480 train_time:124306ms step_avg:153.09ms step:823/1480 train_time:124464ms step_avg:153.09ms step:824/1480 train_time:124621ms step_avg:153.10ms step:825/1480 train_time:124780ms step_avg:153.10ms step:826/1480 train_time:124940ms step_avg:153.11ms step:827/1480 train_time:125099ms step_avg:153.12ms step:828/1480 train_time:125256ms step_avg:153.12ms step:829/1480 train_time:125416ms step_avg:153.13ms step:830/1480 train_time:125577ms step_avg:153.14ms step:831/1480 train_time:125736ms step_avg:153.15ms step:832/1480 train_time:125893ms step_avg:153.15ms step:833/1480 train_time:126050ms step_avg:153.16ms step:834/1480 train_time:126211ms step_avg:153.17ms step:835/1480 train_time:126368ms step_avg:153.17ms step:836/1480 train_time:126527ms step_avg:153.18ms step:837/1480 train_time:126683ms step_avg:153.18ms step:838/1480 train_time:126841ms step_avg:153.19ms step:839/1480 train_time:126999ms 
step_avg:153.20ms step:840/1480 train_time:127155ms step_avg:153.20ms step:841/1480 train_time:127313ms step_avg:153.20ms step:842/1480 train_time:127473ms step_avg:153.21ms step:843/1480 train_time:127630ms step_avg:153.22ms step:844/1480 train_time:127787ms step_avg:153.22ms step:845/1480 train_time:127945ms step_avg:153.23ms step:846/1480 train_time:128104ms step_avg:153.23ms step:847/1480 train_time:128263ms step_avg:153.24ms step:848/1480 train_time:128421ms step_avg:153.25ms step:849/1480 train_time:128578ms step_avg:153.25ms step:850/1480 train_time:128737ms step_avg:153.26ms step:851/1480 train_time:128897ms step_avg:153.27ms step:852/1480 train_time:129055ms step_avg:153.27ms step:853/1480 train_time:129212ms step_avg:153.28ms step:854/1480 train_time:129369ms step_avg:153.28ms step:855/1480 train_time:129526ms step_avg:153.29ms step:856/1480 train_time:129683ms step_avg:153.29ms step:857/1480 train_time:129842ms step_avg:153.30ms step:858/1480 train_time:130002ms step_avg:153.30ms step:859/1480 train_time:130160ms step_avg:153.31ms step:860/1480 train_time:130318ms step_avg:153.32ms step:861/1480 train_time:130478ms step_avg:153.32ms step:862/1480 train_time:130641ms step_avg:153.33ms step:863/1480 train_time:130801ms step_avg:153.34ms step:864/1480 train_time:130959ms step_avg:153.35ms step:865/1480 train_time:131115ms step_avg:153.35ms step:866/1480 train_time:131275ms step_avg:153.36ms step:867/1480 train_time:131435ms step_avg:153.37ms step:868/1480 train_time:131596ms step_avg:153.38ms step:869/1480 train_time:131753ms step_avg:153.38ms step:870/1480 train_time:131911ms step_avg:153.38ms step:871/1480 train_time:132067ms step_avg:153.39ms step:872/1480 train_time:132226ms step_avg:153.39ms step:873/1480 train_time:132382ms step_avg:153.40ms step:874/1480 train_time:132543ms step_avg:153.41ms step:875/1480 train_time:132702ms step_avg:153.41ms step:875/1480 val_loss:3.5078 train_time:132774ms step_avg:153.50ms step:876/1480 train_time:132866ms 
step_avg:153.42ms step:877/1480 train_time:133023ms step_avg:153.43ms step:878/1480 train_time:133181ms step_avg:153.43ms step:879/1480 train_time:133338ms step_avg:153.44ms step:880/1480 train_time:133496ms step_avg:153.44ms step:881/1480 train_time:133654ms step_avg:153.45ms step:882/1480 train_time:133812ms step_avg:153.45ms step:883/1480 train_time:133971ms step_avg:153.46ms step:884/1480 train_time:134131ms step_avg:153.47ms step:885/1480 train_time:134292ms step_avg:153.48ms step:886/1480 train_time:134451ms step_avg:153.48ms step:887/1480 train_time:134610ms step_avg:153.49ms step:888/1480 train_time:134774ms step_avg:153.50ms step:889/1480 train_time:134935ms step_avg:153.51ms step:890/1480 train_time:135093ms step_avg:153.51ms step:891/1480 train_time:135251ms step_avg:153.52ms step:892/1480 train_time:135410ms step_avg:153.53ms step:893/1480 train_time:135569ms step_avg:153.53ms step:894/1480 train_time:135729ms step_avg:153.54ms step:895/1480 train_time:135890ms step_avg:153.55ms step:896/1480 train_time:136048ms step_avg:153.55ms step:897/1480 train_time:136209ms step_avg:153.56ms step:898/1480 train_time:136367ms step_avg:153.57ms step:899/1480 train_time:136525ms step_avg:153.57ms step:900/1480 train_time:136683ms step_avg:153.58ms step:901/1480 train_time:136844ms step_avg:153.58ms step:902/1480 train_time:137002ms step_avg:153.59ms step:903/1480 train_time:137162ms step_avg:153.60ms step:904/1480 train_time:137323ms step_avg:153.60ms step:905/1480 train_time:137481ms step_avg:153.61ms step:906/1480 train_time:137641ms step_avg:153.62ms step:907/1480 train_time:137803ms step_avg:153.63ms step:908/1480 train_time:137962ms step_avg:153.63ms step:909/1480 train_time:138123ms step_avg:153.64ms step:910/1480 train_time:138286ms step_avg:153.65ms step:911/1480 train_time:138445ms step_avg:153.66ms step:912/1480 train_time:138605ms step_avg:153.66ms step:913/1480 train_time:138767ms step_avg:153.67ms step:914/1480 train_time:138926ms step_avg:153.68ms 
step:915/1480 train_time:139089ms step_avg:153.69ms step:916/1480 train_time:139248ms step_avg:153.70ms step:917/1480 train_time:139408ms step_avg:153.70ms step:918/1480 train_time:139567ms step_avg:153.71ms step:919/1480 train_time:139728ms step_avg:153.72ms step:920/1480 train_time:139887ms step_avg:153.72ms step:921/1480 train_time:140046ms step_avg:153.73ms step:922/1480 train_time:140209ms step_avg:153.74ms step:923/1480 train_time:140367ms step_avg:153.74ms step:924/1480 train_time:140527ms step_avg:153.75ms step:925/1480 train_time:140686ms step_avg:153.76ms step:926/1480 train_time:140844ms step_avg:153.76ms step:927/1480 train_time:141002ms step_avg:153.76ms step:928/1480 train_time:141160ms step_avg:153.77ms step:929/1480 train_time:141320ms step_avg:153.78ms step:930/1480 train_time:141480ms step_avg:153.78ms step:931/1480 train_time:141639ms step_avg:153.79ms step:932/1480 train_time:141799ms step_avg:153.79ms step:933/1480 train_time:141959ms step_avg:153.80ms step:934/1480 train_time:142118ms step_avg:153.81ms step:935/1480 train_time:142280ms step_avg:153.82ms step:936/1480 train_time:142439ms step_avg:153.82ms step:937/1480 train_time:142601ms step_avg:153.83ms step:938/1480 train_time:142759ms step_avg:153.84ms step:939/1480 train_time:142921ms step_avg:153.84ms step:940/1480 train_time:143083ms step_avg:153.85ms step:941/1480 train_time:143241ms step_avg:153.86ms step:942/1480 train_time:143400ms step_avg:153.86ms step:943/1480 train_time:143560ms step_avg:153.87ms step:944/1480 train_time:143723ms step_avg:153.88ms step:945/1480 train_time:143882ms step_avg:153.88ms step:946/1480 train_time:144046ms step_avg:153.89ms step:947/1480 train_time:144206ms step_avg:153.90ms step:948/1480 train_time:144364ms step_avg:153.91ms step:949/1480 train_time:144533ms step_avg:153.92ms step:950/1480 train_time:144683ms step_avg:153.92ms step:951/1480 train_time:144844ms step_avg:153.93ms step:952/1480 train_time:145003ms step_avg:153.93ms step:953/1480 
train_time:145163ms step_avg:153.94ms step:954/1480 train_time:145326ms step_avg:153.95ms step:955/1480 train_time:145485ms step_avg:153.95ms step:956/1480 train_time:145643ms step_avg:153.96ms step:957/1480 train_time:145805ms step_avg:153.97ms step:958/1480 train_time:145966ms step_avg:153.97ms step:959/1480 train_time:146125ms step_avg:153.98ms step:960/1480 train_time:146285ms step_avg:153.98ms step:961/1480 train_time:146443ms step_avg:153.99ms step:962/1480 train_time:146603ms step_avg:153.99ms step:963/1480 train_time:146763ms step_avg:154.00ms step:964/1480 train_time:146924ms step_avg:154.01ms step:965/1480 train_time:147083ms step_avg:154.01ms step:966/1480 train_time:147242ms step_avg:154.02ms step:967/1480 train_time:147400ms step_avg:154.02ms step:968/1480 train_time:147559ms step_avg:154.03ms step:969/1480 train_time:147719ms step_avg:154.03ms step:970/1480 train_time:147877ms step_avg:154.04ms step:971/1480 train_time:148037ms step_avg:154.04ms step:972/1480 train_time:148196ms step_avg:154.05ms step:973/1480 train_time:148354ms step_avg:154.05ms step:974/1480 train_time:148514ms step_avg:154.06ms step:975/1480 train_time:148675ms step_avg:154.07ms step:976/1480 train_time:148834ms step_avg:154.07ms step:977/1480 train_time:148993ms step_avg:154.08ms step:978/1480 train_time:149152ms step_avg:154.08ms step:979/1480 train_time:149311ms step_avg:154.09ms step:980/1480 train_time:149471ms step_avg:154.09ms step:981/1480 train_time:149632ms step_avg:154.10ms step:982/1480 train_time:149790ms step_avg:154.10ms step:983/1480 train_time:149949ms step_avg:154.11ms step:984/1480 train_time:150108ms step_avg:154.11ms step:985/1480 train_time:150268ms step_avg:154.12ms step:986/1480 train_time:150427ms step_avg:154.13ms step:987/1480 train_time:150585ms step_avg:154.13ms step:988/1480 train_time:150743ms step_avg:154.13ms step:989/1480 train_time:150903ms step_avg:154.14ms step:990/1480 train_time:151064ms step_avg:154.15ms step:991/1480 train_time:151224ms 
step_avg:154.15ms step:992/1480 train_time:151389ms step_avg:154.16ms step:993/1480 train_time:151558ms step_avg:154.18ms step:994/1480 train_time:151718ms step_avg:154.18ms step:995/1480 train_time:151878ms step_avg:154.19ms step:996/1480 train_time:152036ms step_avg:154.19ms step:997/1480 train_time:152194ms step_avg:154.20ms step:998/1480 train_time:152356ms step_avg:154.21ms step:999/1480 train_time:152515ms step_avg:154.21ms step:1000/1480 train_time:152676ms step_avg:154.22ms step:1000/1480 val_loss:3.4424 train_time:152749ms step_avg:154.29ms step:1001/1480 train_time:152840ms step_avg:154.23ms step:1002/1480 train_time:153000ms step_avg:154.23ms step:1003/1480 train_time:153166ms step_avg:154.25ms step:1004/1480 train_time:153329ms step_avg:154.25ms step:1005/1480 train_time:153489ms step_avg:154.26ms step:1006/1480 train_time:153650ms step_avg:154.27ms step:1007/1480 train_time:153810ms step_avg:154.27ms step:1008/1480 train_time:153970ms step_avg:154.28ms step:1009/1480 train_time:154135ms step_avg:154.29ms step:1010/1480 train_time:154294ms step_avg:154.29ms step:1011/1480 train_time:154455ms step_avg:154.30ms step:1012/1480 train_time:154614ms step_avg:154.31ms step:1013/1480 train_time:154777ms step_avg:154.31ms step:1014/1480 train_time:154935ms step_avg:154.32ms step:1015/1480 train_time:155096ms step_avg:154.32ms step:1016/1480 train_time:155255ms step_avg:154.33ms step:1017/1480 train_time:155416ms step_avg:154.34ms step:1018/1480 train_time:155577ms step_avg:154.34ms step:1019/1480 train_time:155739ms step_avg:154.35ms step:1020/1480 train_time:155899ms step_avg:154.36ms step:1021/1480 train_time:156058ms step_avg:154.36ms step:1022/1480 train_time:156217ms step_avg:154.36ms step:1023/1480 train_time:156380ms step_avg:154.37ms step:1024/1480 train_time:156541ms step_avg:154.38ms step:1025/1480 train_time:156703ms step_avg:154.39ms step:1026/1480 train_time:156865ms step_avg:154.39ms step:1027/1480 train_time:157025ms step_avg:154.40ms 
step:1028/1480 train_time:157188ms step_avg:154.41ms step:1029/1480 train_time:157352ms step_avg:154.42ms step:1030/1480 train_time:157512ms step_avg:154.42ms step:1031/1480 train_time:157671ms step_avg:154.43ms step:1032/1480 train_time:157834ms step_avg:154.44ms step:1033/1480 train_time:157993ms step_avg:154.44ms step:1034/1480 train_time:158153ms step_avg:154.45ms step:1035/1480 train_time:158313ms step_avg:154.45ms step:1036/1480 train_time:158473ms step_avg:154.46ms step:1037/1480 train_time:158634ms step_avg:154.46ms step:1038/1480 train_time:158795ms step_avg:154.47ms step:1039/1480 train_time:158957ms step_avg:154.48ms step:1040/1480 train_time:159116ms step_avg:154.48ms step:1041/1480 train_time:159277ms step_avg:154.49ms step:1042/1480 train_time:159434ms step_avg:154.49ms step:1043/1480 train_time:159593ms step_avg:154.49ms step:1044/1480 train_time:159752ms step_avg:154.50ms step:1045/1480 train_time:159913ms step_avg:154.51ms step:1046/1480 train_time:160072ms step_avg:154.51ms step:1047/1480 train_time:160233ms step_avg:154.52ms step:1048/1480 train_time:160393ms step_avg:154.52ms step:1049/1480 train_time:160552ms step_avg:154.53ms step:1050/1480 train_time:160714ms step_avg:154.53ms step:1051/1480 train_time:160878ms step_avg:154.54ms step:1052/1480 train_time:161037ms step_avg:154.55ms step:1053/1480 train_time:161196ms step_avg:154.55ms step:1054/1480 train_time:161357ms step_avg:154.56ms step:1055/1480 train_time:161517ms step_avg:154.56ms step:1056/1480 train_time:161676ms step_avg:154.57ms step:1057/1480 train_time:161836ms step_avg:154.57ms step:1058/1480 train_time:161999ms step_avg:154.58ms step:1059/1480 train_time:162163ms step_avg:154.59ms step:1060/1480 train_time:162326ms step_avg:154.60ms step:1061/1480 train_time:162485ms step_avg:154.60ms step:1062/1480 train_time:162645ms step_avg:154.61ms step:1063/1480 train_time:162805ms step_avg:154.61ms step:1064/1480 train_time:162964ms step_avg:154.61ms step:1065/1480 train_time:163125ms 
step_avg:154.62ms step:1066/1480 train_time:163288ms step_avg:154.63ms step:1067/1480 train_time:163450ms step_avg:154.64ms step:1068/1480 train_time:163610ms step_avg:154.64ms step:1069/1480 train_time:163772ms step_avg:154.65ms step:1070/1480 train_time:163931ms step_avg:154.65ms step:1071/1480 train_time:164095ms step_avg:154.66ms step:1072/1480 train_time:164255ms step_avg:154.67ms step:1073/1480 train_time:164414ms step_avg:154.67ms step:1074/1480 train_time:164575ms step_avg:154.68ms step:1075/1480 train_time:164735ms step_avg:154.68ms step:1076/1480 train_time:164894ms step_avg:154.68ms step:1077/1480 train_time:165053ms step_avg:154.69ms step:1078/1480 train_time:165217ms step_avg:154.70ms step:1079/1480 train_time:165380ms step_avg:154.71ms step:1080/1480 train_time:165541ms step_avg:154.71ms step:1081/1480 train_time:165701ms step_avg:154.72ms step:1082/1480 train_time:165861ms step_avg:154.72ms step:1083/1480 train_time:166022ms step_avg:154.73ms step:1084/1480 train_time:166183ms step_avg:154.73ms step:1085/1480 train_time:166344ms step_avg:154.74ms step:1086/1480 train_time:166506ms step_avg:154.75ms step:1087/1480 train_time:166668ms step_avg:154.75ms step:1088/1480 train_time:166827ms step_avg:154.76ms step:1089/1480 train_time:166990ms step_avg:154.76ms step:1090/1480 train_time:167153ms step_avg:154.77ms step:1091/1480 train_time:167313ms step_avg:154.78ms step:1092/1480 train_time:167475ms step_avg:154.78ms step:1093/1480 train_time:167637ms step_avg:154.79ms step:1094/1480 train_time:167796ms step_avg:154.79ms step:1095/1480 train_time:167955ms step_avg:154.80ms step:1096/1480 train_time:168118ms step_avg:154.80ms step:1097/1480 train_time:168280ms step_avg:154.81ms step:1098/1480 train_time:168444ms step_avg:154.82ms step:1099/1480 train_time:168605ms step_avg:154.83ms step:1100/1480 train_time:168767ms step_avg:154.83ms step:1101/1480 train_time:168931ms step_avg:154.84ms step:1102/1480 train_time:169093ms step_avg:154.85ms step:1103/1480 
train_time:169261ms step_avg:154.86ms step:1104/1480 train_time:169423ms step_avg:154.87ms step:1105/1480 train_time:169586ms step_avg:154.87ms step:1106/1480 train_time:169748ms step_avg:154.88ms step:1107/1480 train_time:169909ms step_avg:154.88ms step:1108/1480 train_time:170068ms step_avg:154.89ms step:1109/1480 train_time:170228ms step_avg:154.89ms step:1110/1480 train_time:170390ms step_avg:154.90ms step:1111/1480 train_time:170552ms step_avg:154.91ms step:1112/1480 train_time:170715ms step_avg:154.91ms step:1113/1480 train_time:170883ms step_avg:154.93ms step:1114/1480 train_time:171047ms step_avg:154.93ms step:1115/1480 train_time:171210ms step_avg:154.94ms step:1116/1480 train_time:171370ms step_avg:154.95ms step:1117/1480 train_time:171533ms step_avg:154.95ms step:1118/1480 train_time:171697ms step_avg:154.96ms step:1119/1480 train_time:171858ms step_avg:154.97ms step:1120/1480 train_time:172018ms step_avg:154.97ms step:1121/1480 train_time:172180ms step_avg:154.98ms step:1122/1480 train_time:172340ms step_avg:154.98ms step:1123/1480 train_time:172499ms step_avg:154.99ms step:1124/1480 train_time:172663ms step_avg:154.99ms step:1125/1480 train_time:172825ms step_avg:155.00ms step:1125/1480 val_loss:3.3865 train_time:172900ms step_avg:155.07ms step:1126/1480 train_time:172991ms step_avg:155.01ms step:1127/1480 train_time:173152ms step_avg:155.02ms step:1128/1480 train_time:173313ms step_avg:155.02ms step:1129/1480 train_time:173477ms step_avg:155.03ms step:1130/1480 train_time:173637ms step_avg:155.03ms step:1131/1480 train_time:173805ms step_avg:155.04ms step:1132/1480 train_time:173964ms step_avg:155.05ms step:1133/1480 train_time:174128ms step_avg:155.06ms step:1134/1480 train_time:174292ms step_avg:155.06ms step:1135/1480 train_time:174453ms step_avg:155.07ms step:1136/1480 train_time:174615ms step_avg:155.08ms step:1137/1480 train_time:174774ms step_avg:155.08ms step:1138/1480 train_time:174937ms step_avg:155.09ms step:1139/1480 train_time:175106ms 
step_avg:155.10ms step:1140/1480 train_time:175259ms step_avg:155.10ms step:1141/1480 train_time:175424ms step_avg:155.10ms step:1142/1480 train_time:175583ms step_avg:155.11ms step:1143/1480 train_time:175748ms step_avg:155.12ms step:1144/1480 train_time:175910ms step_avg:155.12ms step:1145/1480 train_time:176069ms step_avg:155.13ms step:1146/1480 train_time:176234ms step_avg:155.14ms step:1147/1480 train_time:176394ms step_avg:155.14ms step:1148/1480 train_time:176555ms step_avg:155.14ms step:1149/1480 train_time:176717ms step_avg:155.15ms step:1150/1480 train_time:176877ms step_avg:155.16ms step:1151/1480 train_time:177042ms step_avg:155.16ms step:1152/1480 train_time:177206ms step_avg:155.17ms step:1153/1480 train_time:177373ms step_avg:155.18ms step:1154/1480 train_time:177533ms step_avg:155.19ms step:1155/1480 train_time:177694ms step_avg:155.19ms step:1156/1480 train_time:177860ms step_avg:155.20ms step:1157/1480 train_time:178023ms step_avg:155.21ms step:1158/1480 train_time:178184ms step_avg:155.21ms step:1159/1480 train_time:178346ms step_avg:155.22ms step:1160/1480 train_time:178507ms step_avg:155.22ms step:1161/1480 train_time:178669ms step_avg:155.23ms step:1162/1480 train_time:178833ms step_avg:155.24ms step:1163/1480 train_time:178994ms step_avg:155.24ms step:1164/1480 train_time:179155ms step_avg:155.25ms step:1165/1480 train_time:179314ms step_avg:155.25ms step:1166/1480 train_time:179476ms step_avg:155.26ms step:1167/1480 train_time:179636ms step_avg:155.26ms step:1168/1480 train_time:179797ms step_avg:155.27ms step:1169/1480 train_time:179959ms step_avg:155.27ms step:1170/1480 train_time:180122ms step_avg:155.28ms step:1171/1480 train_time:180283ms step_avg:155.28ms step:1172/1480 train_time:180442ms step_avg:155.29ms step:1173/1480 train_time:180604ms step_avg:155.29ms step:1174/1480 train_time:180775ms step_avg:155.30ms step:1175/1480 train_time:180937ms step_avg:155.31ms step:1176/1480 train_time:181099ms step_avg:155.32ms step:1177/1480 
train_time:181266ms step_avg:155.33ms step:1178/1480 train_time:181427ms step_avg:155.33ms step:1179/1480 train_time:181587ms step_avg:155.34ms step:1180/1480 train_time:181756ms step_avg:155.35ms step:1181/1480 train_time:181917ms step_avg:155.35ms step:1182/1480 train_time:182077ms step_avg:155.36ms step:1183/1480 train_time:182239ms step_avg:155.36ms step:1184/1480 train_time:182399ms step_avg:155.37ms step:1185/1480 train_time:182563ms step_avg:155.37ms step:1186/1480 train_time:182727ms step_avg:155.38ms step:1187/1480 train_time:182899ms step_avg:155.39ms step:1188/1480 train_time:183058ms step_avg:155.40ms step:1189/1480 train_time:183219ms step_avg:155.40ms step:1190/1480 train_time:183381ms step_avg:155.41ms step:1191/1480 train_time:183545ms step_avg:155.42ms step:1192/1480 train_time:183706ms step_avg:155.42ms step:1193/1480 train_time:183865ms step_avg:155.42ms step:1194/1480 train_time:184028ms step_avg:155.43ms step:1195/1480 train_time:184191ms step_avg:155.44ms step:1196/1480 train_time:184363ms step_avg:155.45ms step:1197/1480 train_time:184526ms step_avg:155.46ms step:1198/1480 train_time:184694ms step_avg:155.47ms step:1199/1480 train_time:184856ms step_avg:155.47ms step:1200/1480 train_time:185018ms step_avg:155.48ms step:1201/1480 train_time:185178ms step_avg:155.48ms step:1202/1480 train_time:185347ms step_avg:155.49ms step:1203/1480 train_time:185514ms step_avg:155.50ms step:1204/1480 train_time:185677ms step_avg:155.51ms step:1205/1480 train_time:185838ms step_avg:155.51ms step:1206/1480 train_time:185999ms step_avg:155.52ms step:1207/1480 train_time:186158ms step_avg:155.52ms step:1208/1480 train_time:186318ms step_avg:155.52ms step:1209/1480 train_time:186481ms step_avg:155.53ms step:1210/1480 train_time:186648ms step_avg:155.54ms step:1211/1480 train_time:186811ms step_avg:155.55ms step:1212/1480 train_time:186973ms step_avg:155.55ms step:1213/1480 train_time:187137ms step_avg:155.56ms step:1214/1480 train_time:187302ms step_avg:155.57ms 
step:1215/1480 train_time:187465ms step_avg:155.57ms step:1216/1480 train_time:187627ms step_avg:155.58ms step:1217/1480 train_time:187790ms step_avg:155.58ms step:1218/1480 train_time:187954ms step_avg:155.59ms step:1219/1480 train_time:188122ms step_avg:155.60ms step:1220/1480 train_time:188285ms step_avg:155.61ms step:1221/1480 train_time:188446ms step_avg:155.61ms step:1222/1480 train_time:188607ms step_avg:155.62ms step:1223/1480 train_time:188771ms step_avg:155.62ms step:1224/1480 train_time:188936ms step_avg:155.63ms step:1225/1480 train_time:189099ms step_avg:155.64ms step:1226/1480 train_time:189264ms step_avg:155.64ms step:1227/1480 train_time:189430ms step_avg:155.65ms step:1228/1480 train_time:189593ms step_avg:155.66ms step:1229/1480 train_time:189756ms step_avg:155.67ms step:1230/1480 train_time:189922ms step_avg:155.67ms step:1231/1480 train_time:190089ms step_avg:155.68ms step:1232/1480 train_time:190254ms step_avg:155.69ms step:1233/1480 train_time:190414ms step_avg:155.69ms step:1234/1480 train_time:190575ms step_avg:155.70ms step:1235/1480 train_time:190742ms step_avg:155.71ms step:1236/1480 train_time:190903ms step_avg:155.71ms step:1237/1480 train_time:191064ms step_avg:155.72ms step:1238/1480 train_time:191236ms step_avg:155.73ms step:1239/1480 train_time:191398ms step_avg:155.74ms step:1240/1480 train_time:191565ms step_avg:155.74ms step:1241/1480 train_time:191731ms step_avg:155.75ms step:1242/1480 train_time:191892ms step_avg:155.76ms step:1243/1480 train_time:192057ms step_avg:155.76ms step:1244/1480 train_time:192217ms step_avg:155.77ms step:1245/1480 train_time:192379ms step_avg:155.77ms step:1246/1480 train_time:192541ms step_avg:155.78ms step:1247/1480 train_time:192702ms step_avg:155.78ms step:1248/1480 train_time:192864ms step_avg:155.79ms step:1249/1480 train_time:193026ms step_avg:155.79ms step:1250/1480 train_time:193189ms step_avg:155.80ms step:1250/1480 val_loss:3.3366 train_time:193264ms step_avg:155.86ms step:1251/1480 
train_time:193359ms step_avg:155.81ms step:1252/1480 train_time:193522ms step_avg:155.81ms step:1253/1480 train_time:193684ms step_avg:155.82ms step:1254/1480 train_time:193845ms step_avg:155.82ms step:1255/1480 train_time:194016ms step_avg:155.84ms step:1256/1480 train_time:194182ms step_avg:155.84ms step:1257/1480 train_time:194345ms step_avg:155.85ms step:1258/1480 train_time:194511ms step_avg:155.86ms step:1259/1480 train_time:194673ms step_avg:155.86ms step:1260/1480 train_time:194834ms step_avg:155.87ms step:1261/1480 train_time:194996ms step_avg:155.87ms step:1262/1480 train_time:195161ms step_avg:155.88ms step:1263/1480 train_time:195327ms step_avg:155.89ms step:1264/1480 train_time:195487ms step_avg:155.89ms step:1265/1480 train_time:195646ms step_avg:155.89ms step:1266/1480 train_time:195810ms step_avg:155.90ms step:1267/1480 train_time:195971ms step_avg:155.90ms step:1268/1480 train_time:196134ms step_avg:155.91ms step:1269/1480 train_time:196301ms step_avg:155.92ms step:1270/1480 train_time:196464ms step_avg:155.92ms step:1271/1480 train_time:196626ms step_avg:155.93ms step:1272/1480 train_time:196787ms step_avg:155.93ms step:1273/1480 train_time:196950ms step_avg:155.94ms step:1274/1480 train_time:197114ms step_avg:155.94ms step:1275/1480 train_time:197275ms step_avg:155.95ms step:1276/1480 train_time:197435ms step_avg:155.95ms step:1277/1480 train_time:197597ms step_avg:155.96ms step:1278/1480 train_time:197759ms step_avg:155.96ms step:1279/1480 train_time:197922ms step_avg:155.97ms step:1280/1480 train_time:198089ms step_avg:155.98ms step:1281/1480 train_time:198251ms step_avg:155.98ms step:1282/1480 train_time:198410ms step_avg:155.98ms step:1283/1480 train_time:198573ms step_avg:155.99ms step:1284/1480 train_time:198736ms step_avg:155.99ms step:1285/1480 train_time:198898ms step_avg:156.00ms step:1286/1480 train_time:199060ms step_avg:156.00ms step:1287/1480 train_time:199222ms step_avg:156.01ms step:1288/1480 train_time:199384ms step_avg:156.01ms 
step:1289/1480 train_time:199553ms step_avg:156.02ms step:1290/1480 train_time:199720ms step_avg:156.03ms step:1291/1480 train_time:199885ms step_avg:156.04ms step:1292/1480 train_time:200047ms step_avg:156.04ms step:1293/1480 train_time:200211ms step_avg:156.05ms step:1294/1480 train_time:200376ms step_avg:156.06ms step:1295/1480 train_time:200540ms step_avg:156.06ms step:1296/1480 train_time:200702ms step_avg:156.07ms step:1297/1480 train_time:200866ms step_avg:156.07ms step:1298/1480 train_time:201028ms step_avg:156.08ms step:1299/1480 train_time:201190ms step_avg:156.08ms step:1300/1480 train_time:201350ms step_avg:156.09ms step:1301/1480 train_time:201511ms step_avg:156.09ms step:1302/1480 train_time:201677ms step_avg:156.10ms step:1303/1480 train_time:201846ms step_avg:156.11ms step:1304/1480 train_time:202011ms step_avg:156.11ms step:1305/1480 train_time:202171ms step_avg:156.12ms step:1306/1480 train_time:202337ms step_avg:156.12ms step:1307/1480 train_time:202498ms step_avg:156.13ms step:1308/1480 train_time:202661ms step_avg:156.13ms step:1309/1480 train_time:202827ms step_avg:156.14ms step:1310/1480 train_time:202991ms step_avg:156.15ms step:1311/1480 train_time:203151ms step_avg:156.15ms step:1312/1480 train_time:203316ms step_avg:156.16ms step:1313/1480 train_time:203480ms step_avg:156.16ms step:1314/1480 train_time:203645ms step_avg:156.17ms step:1315/1480 train_time:203809ms step_avg:156.18ms step:1316/1480 train_time:203969ms step_avg:156.18ms step:1317/1480 train_time:204129ms step_avg:156.18ms step:1318/1480 train_time:204296ms step_avg:156.19ms step:1319/1480 train_time:204464ms step_avg:156.20ms step:1320/1480 train_time:204631ms step_avg:156.21ms step:1321/1480 train_time:204794ms step_avg:156.21ms step:1322/1480 train_time:204965ms step_avg:156.22ms step:1323/1480 train_time:205129ms step_avg:156.23ms step:1324/1480 train_time:205292ms step_avg:156.23ms step:1325/1480 train_time:205463ms step_avg:156.25ms step:1326/1480 train_time:205628ms 
step_avg:156.25ms step:1327/1480 train_time:205790ms step_avg:156.26ms step:1328/1480 train_time:205952ms step_avg:156.26ms step:1329/1480 train_time:206135ms step_avg:156.28ms step:1330/1480 train_time:206300ms step_avg:156.29ms step:1331/1480 train_time:206464ms step_avg:156.29ms step:1332/1480 train_time:206627ms step_avg:156.30ms step:1333/1480 train_time:206792ms step_avg:156.31ms step:1334/1480 train_time:206955ms step_avg:156.31ms step:1335/1480 train_time:207115ms step_avg:156.31ms step:1336/1480 train_time:207285ms step_avg:156.32ms step:1337/1480 train_time:207450ms step_avg:156.33ms step:1338/1480 train_time:207614ms step_avg:156.34ms step:1339/1480 train_time:207780ms step_avg:156.34ms step:1340/1480 train_time:207943ms step_avg:156.35ms step:1341/1480 train_time:208104ms step_avg:156.35ms step:1342/1480 train_time:208272ms step_avg:156.36ms step:1343/1480 train_time:208433ms step_avg:156.36ms step:1344/1480 train_time:208594ms step_avg:156.37ms step:1345/1480 train_time:208764ms step_avg:156.38ms step:1346/1480 train_time:208925ms step_avg:156.38ms step:1347/1480 train_time:209088ms step_avg:156.39ms step:1348/1480 train_time:209252ms step_avg:156.39ms step:1349/1480 train_time:209413ms step_avg:156.40ms step:1350/1480 train_time:209579ms step_avg:156.40ms step:1351/1480 train_time:209742ms step_avg:156.41ms step:1352/1480 train_time:209905ms step_avg:156.41ms step:1353/1480 train_time:210070ms step_avg:156.42ms step:1354/1480 train_time:210233ms step_avg:156.42ms step:1355/1480 train_time:210393ms step_avg:156.43ms step:1356/1480 train_time:210555ms step_avg:156.43ms step:1357/1480 train_time:210720ms step_avg:156.44ms step:1358/1480 train_time:210885ms step_avg:156.44ms step:1359/1480 train_time:211049ms step_avg:156.45ms step:1360/1480 train_time:211214ms step_avg:156.45ms step:1361/1480 train_time:211382ms step_avg:156.46ms step:1362/1480 train_time:211548ms step_avg:156.47ms step:1363/1480 train_time:211715ms step_avg:156.48ms step:1364/1480 
train_time:211878ms step_avg:156.48ms step:1365/1480 train_time:212037ms step_avg:156.48ms step:1366/1480 train_time:212201ms step_avg:156.49ms step:1367/1480 train_time:212364ms step_avg:156.50ms step:1368/1480 train_time:212528ms step_avg:156.50ms step:1369/1480 train_time:212698ms step_avg:156.51ms step:1370/1480 train_time:212865ms step_avg:156.52ms step:1371/1480 train_time:213028ms step_avg:156.52ms step:1372/1480 train_time:213195ms step_avg:156.53ms step:1373/1480 train_time:213358ms step_avg:156.54ms step:1374/1480 train_time:213526ms step_avg:156.54ms step:1375/1480 train_time:213689ms step_avg:156.55ms step:1375/1480 val_loss:3.2976 train_time:213763ms step_avg:156.60ms step:1376/1480 train_time:213854ms step_avg:156.55ms step:1377/1480 train_time:214015ms step_avg:156.56ms step:1378/1480 train_time:214176ms step_avg:156.56ms step:1379/1480 train_time:214342ms step_avg:156.57ms step:1380/1480 train_time:214505ms step_avg:156.57ms step:1381/1480 train_time:214674ms step_avg:156.58ms step:1382/1480 train_time:214837ms step_avg:156.59ms step:1383/1480 train_time:215000ms step_avg:156.59ms step:1384/1480 train_time:215167ms step_avg:156.60ms step:1385/1480 train_time:215329ms step_avg:156.60ms step:1386/1480 train_time:215492ms step_avg:156.61ms step:1387/1480 train_time:215657ms step_avg:156.61ms step:1388/1480 train_time:215818ms step_avg:156.62ms step:1389/1480 train_time:215985ms step_avg:156.62ms step:1390/1480 train_time:216147ms step_avg:156.63ms step:1391/1480 train_time:216309ms step_avg:156.63ms step:1392/1480 train_time:216473ms step_avg:156.64ms step:1393/1480 train_time:216635ms step_avg:156.64ms step:1394/1480 train_time:216797ms step_avg:156.65ms step:1395/1480 train_time:216958ms step_avg:156.65ms step:1396/1480 train_time:217120ms step_avg:156.65ms step:1397/1480 train_time:217280ms step_avg:156.65ms step:1398/1480 train_time:217443ms step_avg:156.66ms step:1399/1480 train_time:217605ms step_avg:156.66ms step:1400/1480 train_time:217773ms 
step_avg:156.67ms step:1401/1480 train_time:217933ms step_avg:156.67ms step:1402/1480 train_time:218094ms step_avg:156.68ms step:1403/1480 train_time:218259ms step_avg:156.68ms step:1404/1480 train_time:218420ms step_avg:156.69ms step:1405/1480 train_time:218587ms step_avg:156.69ms step:1406/1480 train_time:218754ms step_avg:156.70ms step:1407/1480 train_time:218915ms step_avg:156.70ms step:1408/1480 train_time:219076ms step_avg:156.71ms step:1409/1480 train_time:219246ms step_avg:156.72ms step:1410/1480 train_time:219409ms step_avg:156.72ms step:1411/1480 train_time:219571ms step_avg:156.72ms step:1412/1480 train_time:219733ms step_avg:156.73ms step:1413/1480 train_time:219896ms step_avg:156.73ms step:1414/1480 train_time:220059ms step_avg:156.74ms step:1415/1480 train_time:220225ms step_avg:156.74ms step:1416/1480 train_time:220400ms step_avg:156.76ms step:1417/1480 train_time:220562ms step_avg:156.76ms step:1418/1480 train_time:220726ms step_avg:156.77ms step:1419/1480 train_time:220891ms step_avg:156.77ms step:1420/1480 train_time:221056ms step_avg:156.78ms step:1421/1480 train_time:221223ms step_avg:156.78ms step:1422/1480 train_time:221389ms step_avg:156.79ms step:1423/1480 train_time:221552ms step_avg:156.80ms step:1424/1480 train_time:221717ms step_avg:156.80ms step:1425/1480 train_time:221888ms step_avg:156.81ms step:1426/1480 train_time:222053ms step_avg:156.82ms step:1427/1480 train_time:222217ms step_avg:156.82ms step:1428/1480 train_time:222379ms step_avg:156.83ms step:1429/1480 train_time:222540ms step_avg:156.83ms step:1430/1480 train_time:222704ms step_avg:156.83ms step:1431/1480 train_time:222869ms step_avg:156.84ms step:1432/1480 train_time:223036ms step_avg:156.85ms step:1433/1480 train_time:223205ms step_avg:156.86ms step:1434/1480 train_time:223375ms step_avg:156.86ms step:1435/1480 train_time:223540ms step_avg:156.87ms step:1436/1480 train_time:223706ms step_avg:156.88ms step:1437/1480 train_time:223868ms step_avg:156.88ms step:1438/1480 
train_time:224030ms step_avg:156.88ms step:1439/1480 train_time:224195ms step_avg:156.89ms step:1440/1480 train_time:224357ms step_avg:156.89ms step:1441/1480 train_time:224522ms step_avg:156.90ms step:1442/1480 train_time:224689ms step_avg:156.91ms step:1443/1480 train_time:224862ms step_avg:156.92ms step:1444/1480 train_time:225026ms step_avg:156.92ms step:1445/1480 train_time:225188ms step_avg:156.93ms step:1446/1480 train_time:225355ms step_avg:156.93ms step:1447/1480 train_time:225523ms step_avg:156.94ms step:1448/1480 train_time:225687ms step_avg:156.95ms step:1449/1480 train_time:225851ms step_avg:156.95ms step:1450/1480 train_time:226014ms step_avg:156.95ms step:1451/1480 train_time:226177ms step_avg:156.96ms step:1452/1480 train_time:226344ms step_avg:156.97ms step:1453/1480 train_time:226508ms step_avg:156.97ms step:1454/1480 train_time:226671ms step_avg:156.97ms step:1455/1480 train_time:226838ms step_avg:156.98ms step:1456/1480 train_time:227002ms step_avg:156.99ms step:1457/1480 train_time:227164ms step_avg:156.99ms step:1458/1480 train_time:227328ms step_avg:156.99ms step:1459/1480 train_time:227493ms step_avg:157.00ms step:1460/1480 train_time:227656ms step_avg:157.00ms step:1461/1480 train_time:227819ms step_avg:157.01ms step:1462/1480 train_time:227984ms step_avg:157.01ms step:1463/1480 train_time:228151ms step_avg:157.02ms step:1464/1480 train_time:228315ms step_avg:157.03ms step:1465/1480 train_time:228478ms step_avg:157.03ms step:1466/1480 train_time:228642ms step_avg:157.03ms step:1467/1480 train_time:228808ms step_avg:157.04ms step:1468/1480 train_time:228972ms step_avg:157.05ms step:1469/1480 train_time:229134ms step_avg:157.05ms step:1470/1480 train_time:229303ms step_avg:157.06ms step:1471/1480 train_time:229474ms step_avg:157.07ms step:1472/1480 train_time:229644ms step_avg:157.07ms step:1473/1480 train_time:229807ms step_avg:157.08ms step:1474/1480 train_time:229975ms step_avg:157.09ms step:1475/1480 train_time:230144ms step_avg:157.10ms 
step:1476/1480 train_time:230308ms step_avg:157.10ms step:1477/1480 train_time:230476ms step_avg:157.11ms step:1478/1480 train_time:230647ms step_avg:157.12ms step:1479/1480 train_time:230812ms step_avg:157.12ms step:1480/1480 train_time:230974ms step_avg:157.13ms step:1480/1480 val_loss:3.2785 train_time:231049ms step_avg:157.18ms peak memory consumption: 34239 MiB