import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: 2D tensor to orthogonalize. It is never modified.
        steps: number of Newton-Schulz iterations.
        eps: added to the norm to avoid division by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # Out-of-place on purpose: `G.bfloat16()` returns G itself when G is already
    # bfloat16, so an in-place `/=` would silently rescale the caller's tensor.
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    # iterate on the wide orientation so that X @ X.T is the smaller Gram matrix
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.

    The constructor reads the WORLD_SIZE and RANK environment variables.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One param group per distinct element count, so each group can share a
        # fixed-size list of all_gather receive buffers (one buffer per rank).
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """Compute this rank's share of the updates, all_gather them, and apply them.

        Within a group, parameters are processed in chunks of `world_size`; each
        rank orthogonalizes one parameter of the chunk. The gather of chunk k is
        asynchronous and is only waited on (and applied) right before the gather
        of chunk k+1 is launched, so communication overlaps with compute.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # apply the previously gathered chunk (no-op before the first gather)
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # scale the step by sqrt(rows / cols) for tall matrices
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            # apply the last chunk of this group
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding applied to a 4D tensor whose dim 1 is the sequence axis.

    cos/sin tables are cached and rebuilt only when the sequence length changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # the last dim is split in two halves which are rotated against each other
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention using FlexAttention, QK-norm, rotary embeddings and a
    learned mix between the layer's own values and an external value embedding."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # lambdas[0] weights the layer's own v, lambdas[1] weights the value embedding vi
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of the residual stream with the initial
    embedding x0, followed by pre-norm attention and pre-norm MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # initialised to (1, 0): start as a plain residual stream with no x0 contribution
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-embedding tables whose outputs are handed to the attention layers.

    NOTE(review): the table count is fixed at 6, so `forward` always returns 12
    entries; this matches num_layers == 12 - confirm before changing the depth.
    """

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        """Return the six embeddings followed by the same six in reverse order."""
        ve = [emb(inputs) for emb in self.embed]
        # explicit slice copy instead of extending the list with an iterator over itself
        return ve + ve[::-1]

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections between the first and second
    half of the blocks, value embeddings, and document-aware sliding-window
    block-sparse attention."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """Return the cross-entropy loss of `targets` given `inputs`.

        Arguments:
            inputs: 1-D tensor of token ids; its length must be a multiple of 128.
            targets: token ids, flattened before the loss is computed.
            sliding_window_num_blocks: scalar tensor, attention window measured
                in 128-token blocks.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # token id 50256 starts a new document: docs[i] is the document index of token i
        docs = (inputs == 50256).cumsum(0)
        # first / last document index present in each BLOCK_SIZE-token block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # per-row count of active blocks, and block indices sorted with active ones first
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            # One index per block. Derived from the input instead of the former
            # hard-coded 512 (= 64*1024 / 128) and device="cuda", so the masks
            # always match docs_low / docs_high.
            kv_idx = block_idx = torch.arange(docs_low.size(0), dtype=torch.int32, device=inputs.device)
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # partially masked blocks need mask_mod; fully visible blocks skip it
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the shard header and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256*4-byte header into pinned memory."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Iterates over token shards, giving every rank a disjoint `seq_len` window per batch."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU; targets are inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. 
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 10:48:04 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29116ms step_avg:nanms step:2/1480 train_time:29234ms step_avg:nanms step:3/1480 train_time:29352ms step_avg:nanms step:4/1480 train_time:29491ms step_avg:nanms step:5/1480 train_time:29628ms step_avg:nanms step:6/1480 train_time:29770ms step_avg:nanms step:7/1480 train_time:29909ms step_avg:nanms step:8/1480 train_time:30050ms step_avg:nanms step:9/1480 train_time:30196ms step_avg:nanms step:10/1480 train_time:30342ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:277ms step_avg:nanms step:13/1480 train_time:421ms step_avg:140.19ms step:14/1480 train_time:563ms step_avg:140.65ms step:15/1480 train_time:706ms step_avg:141.10ms step:16/1480 train_time:852ms step_avg:141.94ms step:17/1480 train_time:994ms step_avg:141.97ms step:18/1480 train_time:1137ms step_avg:142.08ms step:19/1480 train_time:1277ms step_avg:141.91ms step:20/1480 train_time:1419ms step_avg:141.89ms step:21/1480 train_time:1561ms step_avg:141.89ms step:22/1480 train_time:1703ms step_avg:141.89ms step:23/1480 train_time:1846ms step_avg:141.99ms step:24/1480 train_time:1989ms step_avg:142.07ms step:25/1480 train_time:2133ms step_avg:142.18ms step:26/1480 train_time:2277ms step_avg:142.30ms step:27/1480 train_time:2418ms step_avg:142.22ms step:28/1480 train_time:2560ms step_avg:142.22ms 
step:29/1480 train_time:2702ms step_avg:142.23ms step:30/1480 train_time:2846ms step_avg:142.28ms step:31/1480 train_time:2989ms step_avg:142.32ms step:32/1480 train_time:3133ms step_avg:142.39ms step:33/1480 train_time:3275ms step_avg:142.37ms step:34/1480 train_time:3419ms step_avg:142.44ms step:35/1480 train_time:3560ms step_avg:142.39ms step:36/1480 train_time:3704ms step_avg:142.44ms step:37/1480 train_time:3847ms step_avg:142.49ms step:38/1480 train_time:3990ms step_avg:142.50ms step:39/1480 train_time:4133ms step_avg:142.50ms step:40/1480 train_time:4275ms step_avg:142.50ms step:41/1480 train_time:4417ms step_avg:142.48ms step:42/1480 train_time:4558ms step_avg:142.44ms step:43/1480 train_time:4701ms step_avg:142.44ms step:44/1480 train_time:4844ms step_avg:142.46ms step:45/1480 train_time:4988ms step_avg:142.51ms step:46/1480 train_time:5133ms step_avg:142.57ms step:47/1480 train_time:5275ms step_avg:142.57ms step:48/1480 train_time:5417ms step_avg:142.57ms step:49/1480 train_time:5558ms step_avg:142.52ms step:50/1480 train_time:5700ms step_avg:142.49ms step:51/1480 train_time:5843ms step_avg:142.50ms step:52/1480 train_time:5986ms step_avg:142.53ms step:53/1480 train_time:6129ms step_avg:142.55ms step:54/1480 train_time:6273ms step_avg:142.57ms step:55/1480 train_time:6415ms step_avg:142.57ms step:56/1480 train_time:6558ms step_avg:142.57ms step:57/1480 train_time:6700ms step_avg:142.55ms step:58/1480 train_time:6843ms step_avg:142.57ms step:59/1480 train_time:6985ms step_avg:142.55ms step:60/1480 train_time:7128ms step_avg:142.57ms step:61/1480 train_time:7273ms step_avg:142.60ms step:62/1480 train_time:7417ms step_avg:142.63ms step:63/1480 train_time:7559ms step_avg:142.62ms step:64/1480 train_time:7700ms step_avg:142.59ms step:65/1480 train_time:7843ms step_avg:142.61ms step:66/1480 train_time:7988ms step_avg:142.65ms step:67/1480 train_time:8133ms step_avg:142.68ms step:68/1480 train_time:8276ms step_avg:142.69ms step:69/1480 train_time:8419ms 
step_avg:142.70ms step:70/1480 train_time:8561ms step_avg:142.69ms step:71/1480 train_time:8704ms step_avg:142.69ms step:72/1480 train_time:8846ms step_avg:142.68ms step:73/1480 train_time:8990ms step_avg:142.69ms step:74/1480 train_time:9133ms step_avg:142.70ms step:75/1480 train_time:9277ms step_avg:142.72ms step:76/1480 train_time:9419ms step_avg:142.71ms step:77/1480 train_time:9561ms step_avg:142.70ms step:78/1480 train_time:9704ms step_avg:142.71ms step:79/1480 train_time:9847ms step_avg:142.72ms step:80/1480 train_time:9990ms step_avg:142.71ms step:81/1480 train_time:10535ms step_avg:148.38ms step:82/1480 train_time:10633ms step_avg:147.69ms step:83/1480 train_time:10776ms step_avg:147.62ms step:84/1480 train_time:10918ms step_avg:147.55ms step:85/1480 train_time:11060ms step_avg:147.46ms step:86/1480 train_time:11201ms step_avg:147.38ms step:87/1480 train_time:11343ms step_avg:147.31ms step:88/1480 train_time:11488ms step_avg:147.28ms step:89/1480 train_time:11632ms step_avg:147.24ms step:90/1480 train_time:11774ms step_avg:147.18ms step:91/1480 train_time:11918ms step_avg:147.13ms step:92/1480 train_time:12058ms step_avg:147.05ms step:93/1480 train_time:12201ms step_avg:147.00ms step:94/1480 train_time:12343ms step_avg:146.93ms step:95/1480 train_time:12485ms step_avg:146.88ms step:96/1480 train_time:12629ms step_avg:146.85ms step:97/1480 train_time:13138ms step_avg:151.01ms step:98/1480 train_time:13639ms step_avg:154.99ms step:99/1480 train_time:13737ms step_avg:154.34ms step:100/1480 train_time:13878ms step_avg:154.20ms step:101/1480 train_time:14027ms step_avg:154.14ms step:102/1480 train_time:14162ms step_avg:153.93ms step:103/1480 train_time:14303ms step_avg:153.80ms step:104/1480 train_time:14446ms step_avg:153.68ms step:105/1480 train_time:14590ms step_avg:153.58ms step:106/1480 train_time:14735ms step_avg:153.49ms step:107/1480 train_time:14878ms step_avg:153.38ms step:108/1480 train_time:15021ms step_avg:153.27ms step:109/1480 train_time:15162ms 
step_avg:153.15ms step:110/1480 train_time:15305ms step_avg:153.05ms step:111/1480 train_time:15450ms step_avg:152.97ms step:112/1480 train_time:15596ms step_avg:152.90ms step:113/1480 train_time:15741ms step_avg:152.83ms step:114/1480 train_time:15888ms step_avg:152.77ms step:115/1480 train_time:16033ms step_avg:152.70ms step:116/1480 train_time:16178ms step_avg:152.63ms step:117/1480 train_time:16324ms step_avg:152.56ms step:118/1480 train_time:16471ms step_avg:152.51ms step:119/1480 train_time:16617ms step_avg:152.45ms step:120/1480 train_time:16762ms step_avg:152.38ms step:121/1480 train_time:16908ms step_avg:152.32ms step:122/1480 train_time:17054ms step_avg:152.27ms step:123/1480 train_time:17198ms step_avg:152.20ms step:124/1480 train_time:17344ms step_avg:152.14ms step:125/1480 train_time:17491ms step_avg:152.09ms step:125/1480 val_loss:4.4041 train_time:17556ms step_avg:152.66ms step:126/1480 train_time:17653ms step_avg:152.18ms step:127/1480 train_time:17792ms step_avg:152.07ms step:128/1480 train_time:17939ms step_avg:152.02ms step:129/1480 train_time:18084ms step_avg:151.97ms step:130/1480 train_time:18230ms step_avg:151.91ms step:131/1480 train_time:18376ms step_avg:151.87ms step:132/1480 train_time:18521ms step_avg:151.81ms step:133/1480 train_time:18668ms step_avg:151.77ms step:134/1480 train_time:18816ms step_avg:151.75ms step:135/1480 train_time:18962ms step_avg:151.70ms step:136/1480 train_time:19107ms step_avg:151.64ms step:137/1480 train_time:19254ms step_avg:151.61ms step:138/1480 train_time:19400ms step_avg:151.56ms step:139/1480 train_time:19545ms step_avg:151.51ms step:140/1480 train_time:19691ms step_avg:151.47ms step:141/1480 train_time:19837ms step_avg:151.43ms step:142/1480 train_time:19983ms step_avg:151.38ms step:143/1480 train_time:20128ms step_avg:151.34ms step:144/1480 train_time:20275ms step_avg:151.31ms step:145/1480 train_time:20421ms step_avg:151.27ms step:146/1480 train_time:20566ms step_avg:151.22ms step:147/1480 
train_time:20711ms step_avg:151.17ms step:148/1480 train_time:20857ms step_avg:151.14ms step:149/1480 train_time:21002ms step_avg:151.10ms step:150/1480 train_time:21148ms step_avg:151.06ms step:151/1480 train_time:21296ms step_avg:151.04ms step:152/1480 train_time:21441ms step_avg:150.99ms step:153/1480 train_time:21586ms step_avg:150.95ms step:154/1480 train_time:21732ms step_avg:150.92ms step:155/1480 train_time:21878ms step_avg:150.89ms step:156/1480 train_time:22023ms step_avg:150.84ms step:157/1480 train_time:22170ms step_avg:150.81ms step:158/1480 train_time:22316ms step_avg:150.79ms step:159/1480 train_time:22462ms step_avg:150.75ms step:160/1480 train_time:22607ms step_avg:150.71ms step:161/1480 train_time:22754ms step_avg:150.69ms step:162/1480 train_time:22900ms step_avg:150.66ms step:163/1480 train_time:23044ms step_avg:150.62ms step:164/1480 train_time:23191ms step_avg:150.59ms step:165/1480 train_time:23337ms step_avg:150.56ms step:166/1480 train_time:23481ms step_avg:150.52ms step:167/1480 train_time:23626ms step_avg:150.49ms step:168/1480 train_time:23773ms step_avg:150.46ms step:169/1480 train_time:23920ms step_avg:150.44ms step:170/1480 train_time:24065ms step_avg:150.41ms step:171/1480 train_time:24210ms step_avg:150.37ms step:172/1480 train_time:24357ms step_avg:150.35ms step:173/1480 train_time:24501ms step_avg:150.31ms step:174/1480 train_time:24646ms step_avg:150.28ms step:175/1480 train_time:24792ms step_avg:150.26ms step:176/1480 train_time:24938ms step_avg:150.23ms step:177/1480 train_time:25084ms step_avg:150.20ms step:178/1480 train_time:25229ms step_avg:150.17ms step:179/1480 train_time:25376ms step_avg:150.15ms step:180/1480 train_time:25520ms step_avg:150.12ms step:181/1480 train_time:25666ms step_avg:150.09ms step:182/1480 train_time:25812ms step_avg:150.07ms step:183/1480 train_time:25958ms step_avg:150.05ms step:184/1480 train_time:26104ms step_avg:150.02ms step:185/1480 train_time:26250ms step_avg:150.00ms step:186/1480 
train_time:26397ms step_avg:149.98ms step:187/1480 train_time:26541ms step_avg:149.95ms step:188/1480 train_time:26687ms step_avg:149.93ms step:189/1480 train_time:26853ms step_avg:150.01ms step:190/1480 train_time:26978ms step_avg:149.88ms step:191/1480 train_time:27122ms step_avg:149.85ms step:192/1480 train_time:27268ms step_avg:149.82ms step:193/1480 train_time:27415ms step_avg:149.81ms step:194/1480 train_time:27561ms step_avg:149.79ms step:195/1480 train_time:27706ms step_avg:149.76ms step:196/1480 train_time:27851ms step_avg:149.74ms step:197/1480 train_time:27998ms step_avg:149.72ms step:198/1480 train_time:28142ms step_avg:149.69ms step:199/1480 train_time:28289ms step_avg:149.68ms step:200/1480 train_time:28434ms step_avg:149.66ms step:201/1480 train_time:28584ms step_avg:149.66ms step:202/1480 train_time:28727ms step_avg:149.62ms step:203/1480 train_time:28873ms step_avg:149.60ms step:204/1480 train_time:29020ms step_avg:149.59ms step:205/1480 train_time:29165ms step_avg:149.56ms step:206/1480 train_time:29310ms step_avg:149.54ms step:207/1480 train_time:29456ms step_avg:149.52ms step:208/1480 train_time:29602ms step_avg:149.50ms step:209/1480 train_time:29747ms step_avg:149.48ms step:210/1480 train_time:29893ms step_avg:149.46ms step:211/1480 train_time:30038ms step_avg:149.44ms step:212/1480 train_time:30184ms step_avg:149.43ms step:213/1480 train_time:30329ms step_avg:149.40ms step:214/1480 train_time:30475ms step_avg:149.39ms step:215/1480 train_time:30620ms step_avg:149.37ms step:216/1480 train_time:30765ms step_avg:149.35ms step:217/1480 train_time:30912ms step_avg:149.34ms step:218/1480 train_time:31058ms step_avg:149.32ms step:219/1480 train_time:31203ms step_avg:149.30ms step:220/1480 train_time:31348ms step_avg:149.28ms step:221/1480 train_time:31887ms step_avg:151.12ms step:222/1480 train_time:31991ms step_avg:150.90ms step:223/1480 train_time:32527ms step_avg:152.71ms step:224/1480 train_time:32636ms step_avg:152.50ms step:225/1480 
train_time:32784ms step_avg:152.49ms step:226/1480 train_time:32932ms step_avg:152.46ms step:227/1480 train_time:33081ms step_avg:152.45ms step:228/1480 train_time:33228ms step_avg:152.42ms step:229/1480 train_time:33377ms step_avg:152.41ms step:230/1480 train_time:33525ms step_avg:152.38ms step:231/1480 train_time:33674ms step_avg:152.37ms step:232/1480 train_time:33822ms step_avg:152.35ms step:233/1480 train_time:33970ms step_avg:152.33ms step:234/1480 train_time:34119ms step_avg:152.32ms step:235/1480 train_time:34267ms step_avg:152.30ms step:236/1480 train_time:34417ms step_avg:152.29ms step:237/1480 train_time:34563ms step_avg:152.26ms step:238/1480 train_time:34712ms step_avg:152.24ms step:239/1480 train_time:34860ms step_avg:152.23ms step:240/1480 train_time:35008ms step_avg:152.21ms step:241/1480 train_time:35157ms step_avg:152.19ms step:242/1480 train_time:35305ms step_avg:152.18ms step:243/1480 train_time:35455ms step_avg:152.17ms step:244/1480 train_time:35603ms step_avg:152.15ms step:245/1480 train_time:35751ms step_avg:152.13ms step:246/1480 train_time:35900ms step_avg:152.12ms step:247/1480 train_time:36048ms step_avg:152.10ms step:248/1480 train_time:36198ms step_avg:152.09ms step:249/1480 train_time:36346ms step_avg:152.07ms step:250/1480 train_time:36495ms step_avg:152.06ms step:250/1480 val_loss:4.0008 train_time:36561ms step_avg:152.34ms step:251/1480 train_time:36670ms step_avg:152.16ms step:252/1480 train_time:36801ms step_avg:152.07ms step:253/1480 train_time:36950ms step_avg:152.06ms step:254/1480 train_time:37098ms step_avg:152.04ms step:255/1480 train_time:37247ms step_avg:152.03ms step:256/1480 train_time:37393ms step_avg:152.01ms step:257/1480 train_time:37542ms step_avg:151.99ms step:258/1480 train_time:37691ms step_avg:151.98ms step:259/1480 train_time:37840ms step_avg:151.97ms step:260/1480 train_time:37990ms step_avg:151.96ms step:261/1480 train_time:38137ms step_avg:151.94ms step:262/1480 train_time:38285ms step_avg:151.93ms 
step:263/1480 train_time:38433ms step_avg:151.91ms step:264/1480 train_time:38582ms step_avg:151.90ms step:265/1480 train_time:38730ms step_avg:151.88ms step:266/1480 train_time:38877ms step_avg:151.86ms step:267/1480 train_time:39026ms step_avg:151.85ms step:268/1480 train_time:39174ms step_avg:151.84ms step:269/1480 train_time:39323ms step_avg:151.83ms step:270/1480 train_time:39470ms step_avg:151.81ms step:271/1480 train_time:39619ms step_avg:151.80ms step:272/1480 train_time:39767ms step_avg:151.78ms step:273/1480 train_time:39916ms step_avg:151.77ms step:274/1480 train_time:40065ms step_avg:151.76ms step:275/1480 train_time:40213ms step_avg:151.75ms step:276/1480 train_time:40363ms step_avg:151.74ms step:277/1480 train_time:40510ms step_avg:151.72ms step:278/1480 train_time:40659ms step_avg:151.71ms step:279/1480 train_time:40808ms step_avg:151.70ms step:280/1480 train_time:40955ms step_avg:151.69ms step:281/1480 train_time:41105ms step_avg:151.68ms step:282/1480 train_time:41253ms step_avg:151.66ms step:283/1480 train_time:41402ms step_avg:151.66ms step:284/1480 train_time:41550ms step_avg:151.64ms step:285/1480 train_time:41698ms step_avg:151.63ms step:286/1480 train_time:41847ms step_avg:151.62ms step:287/1480 train_time:41996ms step_avg:151.61ms step:288/1480 train_time:42146ms step_avg:151.60ms step:289/1480 train_time:42294ms step_avg:151.59ms step:290/1480 train_time:42444ms step_avg:151.59ms step:291/1480 train_time:42591ms step_avg:151.57ms step:292/1480 train_time:42739ms step_avg:151.56ms step:293/1480 train_time:42888ms step_avg:151.55ms step:294/1480 train_time:43036ms step_avg:151.53ms step:295/1480 train_time:43185ms step_avg:151.53ms step:296/1480 train_time:43332ms step_avg:151.51ms step:297/1480 train_time:43482ms step_avg:151.50ms step:298/1480 train_time:43630ms step_avg:151.49ms step:299/1480 train_time:43779ms step_avg:151.48ms step:300/1480 train_time:43928ms step_avg:151.47ms step:301/1480 train_time:44075ms step_avg:151.46ms 
step:302/1480 train_time:44225ms step_avg:151.46ms step:303/1480 train_time:44373ms step_avg:151.44ms step:304/1480 train_time:44523ms step_avg:151.44ms step:305/1480 train_time:44670ms step_avg:151.42ms step:306/1480 train_time:44819ms step_avg:151.42ms step:307/1480 train_time:44968ms step_avg:151.41ms step:308/1480 train_time:45116ms step_avg:151.40ms step:309/1480 train_time:45265ms step_avg:151.39ms step:310/1480 train_time:45413ms step_avg:151.38ms step:311/1480 train_time:45562ms step_avg:151.37ms step:312/1480 train_time:45711ms step_avg:151.36ms step:313/1480 train_time:45860ms step_avg:151.35ms step:314/1480 train_time:46008ms step_avg:151.34ms step:315/1480 train_time:46156ms step_avg:151.33ms step:316/1480 train_time:46305ms step_avg:151.32ms step:317/1480 train_time:46453ms step_avg:151.31ms step:318/1480 train_time:46602ms step_avg:151.31ms step:319/1480 train_time:46751ms step_avg:151.30ms step:320/1480 train_time:46900ms step_avg:151.29ms step:321/1480 train_time:47049ms step_avg:151.28ms step:322/1480 train_time:47198ms step_avg:151.28ms step:323/1480 train_time:47346ms step_avg:151.27ms step:324/1480 train_time:47495ms step_avg:151.26ms step:325/1480 train_time:47645ms step_avg:151.25ms step:326/1480 train_time:47792ms step_avg:151.24ms step:327/1480 train_time:47942ms step_avg:151.24ms step:328/1480 train_time:48090ms step_avg:151.23ms step:329/1480 train_time:48239ms step_avg:151.22ms step:330/1480 train_time:48388ms step_avg:151.21ms step:331/1480 train_time:48538ms step_avg:151.21ms step:332/1480 train_time:48690ms step_avg:151.21ms step:333/1480 train_time:48840ms step_avg:151.21ms step:334/1480 train_time:48991ms step_avg:151.21ms step:335/1480 train_time:49142ms step_avg:151.21ms step:336/1480 train_time:49292ms step_avg:151.20ms step:337/1480 train_time:49445ms step_avg:151.21ms step:338/1480 train_time:49596ms step_avg:151.21ms step:339/1480 train_time:49748ms step_avg:151.21ms step:340/1480 train_time:49898ms step_avg:151.21ms 
step:341/1480 train_time:50050ms step_avg:151.21ms step:342/1480 train_time:50200ms step_avg:151.21ms step:343/1480 train_time:50352ms step_avg:151.21ms step:344/1480 train_time:50503ms step_avg:151.21ms step:345/1480 train_time:50652ms step_avg:151.20ms step:346/1480 train_time:50804ms step_avg:151.20ms step:347/1480 train_time:50954ms step_avg:151.20ms step:348/1480 train_time:51105ms step_avg:151.20ms step:349/1480 train_time:51255ms step_avg:151.20ms step:350/1480 train_time:51407ms step_avg:151.20ms step:351/1480 train_time:51557ms step_avg:151.19ms step:352/1480 train_time:51709ms step_avg:151.19ms step:353/1480 train_time:51859ms step_avg:151.19ms step:354/1480 train_time:52009ms step_avg:151.19ms step:355/1480 train_time:52159ms step_avg:151.19ms step:356/1480 train_time:52310ms step_avg:151.18ms step:357/1480 train_time:52460ms step_avg:151.18ms step:358/1480 train_time:52611ms step_avg:151.18ms step:359/1480 train_time:52762ms step_avg:151.18ms step:360/1480 train_time:52913ms step_avg:151.18ms step:361/1480 train_time:53065ms step_avg:151.18ms step:362/1480 train_time:53215ms step_avg:151.18ms step:363/1480 train_time:53366ms step_avg:151.18ms step:364/1480 train_time:53517ms step_avg:151.18ms step:365/1480 train_time:53668ms step_avg:151.18ms step:366/1480 train_time:53820ms step_avg:151.18ms step:367/1480 train_time:53970ms step_avg:151.18ms step:368/1480 train_time:54122ms step_avg:151.18ms step:369/1480 train_time:54272ms step_avg:151.17ms step:370/1480 train_time:54423ms step_avg:151.17ms step:371/1480 train_time:54572ms step_avg:151.17ms step:372/1480 train_time:54723ms step_avg:151.17ms step:373/1480 train_time:54875ms step_avg:151.17ms step:374/1480 train_time:55026ms step_avg:151.17ms step:375/1480 train_time:55176ms step_avg:151.17ms step:375/1480 val_loss:3.8036 train_time:55244ms step_avg:151.35ms step:376/1480 train_time:55350ms step_avg:151.23ms step:377/1480 train_time:55486ms step_avg:151.19ms step:378/1480 train_time:55638ms 
step_avg:151.19ms step:379/1480 train_time:55808ms step_avg:151.24ms step:380/1480 train_time:55938ms step_avg:151.18ms step:381/1480 train_time:56087ms step_avg:151.18ms step:382/1480 train_time:56239ms step_avg:151.18ms step:383/1480 train_time:56390ms step_avg:151.18ms step:384/1480 train_time:56542ms step_avg:151.18ms step:385/1480 train_time:56692ms step_avg:151.18ms step:386/1480 train_time:56843ms step_avg:151.18ms step:387/1480 train_time:56993ms step_avg:151.18ms step:388/1480 train_time:57144ms step_avg:151.17ms step:389/1480 train_time:57294ms step_avg:151.17ms step:390/1480 train_time:57445ms step_avg:151.17ms step:391/1480 train_time:57596ms step_avg:151.17ms step:392/1480 train_time:57746ms step_avg:151.17ms step:393/1480 train_time:57896ms step_avg:151.17ms step:394/1480 train_time:58046ms step_avg:151.16ms step:395/1480 train_time:58198ms step_avg:151.16ms step:396/1480 train_time:58348ms step_avg:151.16ms step:397/1480 train_time:58499ms step_avg:151.16ms step:398/1480 train_time:58652ms step_avg:151.16ms step:399/1480 train_time:58803ms step_avg:151.17ms step:400/1480 train_time:58954ms step_avg:151.16ms step:401/1480 train_time:59106ms step_avg:151.17ms step:402/1480 train_time:59257ms step_avg:151.17ms step:403/1480 train_time:59408ms step_avg:151.17ms step:404/1480 train_time:59560ms step_avg:151.17ms step:405/1480 train_time:59711ms step_avg:151.17ms step:406/1480 train_time:59862ms step_avg:151.17ms step:407/1480 train_time:60013ms step_avg:151.17ms step:408/1480 train_time:60164ms step_avg:151.16ms step:409/1480 train_time:60315ms step_avg:151.17ms step:410/1480 train_time:60466ms step_avg:151.17ms step:411/1480 train_time:60617ms step_avg:151.16ms step:412/1480 train_time:60767ms step_avg:151.16ms step:413/1480 train_time:60919ms step_avg:151.16ms step:414/1480 train_time:61068ms step_avg:151.16ms step:415/1480 train_time:61220ms step_avg:151.16ms step:416/1480 train_time:61370ms step_avg:151.16ms step:417/1480 train_time:61522ms 
step_avg:151.16ms step:418/1480 train_time:61672ms step_avg:151.16ms step:419/1480 train_time:61823ms step_avg:151.16ms step:420/1480 train_time:61974ms step_avg:151.16ms step:421/1480 train_time:62124ms step_avg:151.15ms step:422/1480 train_time:62275ms step_avg:151.15ms step:423/1480 train_time:62425ms step_avg:151.15ms step:424/1480 train_time:62577ms step_avg:151.15ms step:425/1480 train_time:62728ms step_avg:151.15ms step:426/1480 train_time:62878ms step_avg:151.15ms step:427/1480 train_time:63029ms step_avg:151.15ms step:428/1480 train_time:63180ms step_avg:151.15ms step:429/1480 train_time:63331ms step_avg:151.15ms step:430/1480 train_time:63482ms step_avg:151.15ms step:431/1480 train_time:63633ms step_avg:151.15ms step:432/1480 train_time:63783ms step_avg:151.15ms step:433/1480 train_time:63934ms step_avg:151.14ms step:434/1480 train_time:64085ms step_avg:151.14ms step:435/1480 train_time:64237ms step_avg:151.15ms step:436/1480 train_time:64387ms step_avg:151.14ms step:437/1480 train_time:64539ms step_avg:151.15ms step:438/1480 train_time:64689ms step_avg:151.14ms step:439/1480 train_time:64841ms step_avg:151.14ms step:440/1480 train_time:64991ms step_avg:151.14ms step:441/1480 train_time:65143ms step_avg:151.14ms step:442/1480 train_time:65297ms step_avg:151.15ms step:443/1480 train_time:65450ms step_avg:151.15ms step:444/1480 train_time:65603ms step_avg:151.16ms step:445/1480 train_time:65756ms step_avg:151.16ms step:446/1480 train_time:65908ms step_avg:151.17ms step:447/1480 train_time:66062ms step_avg:151.17ms step:448/1480 train_time:66215ms step_avg:151.17ms step:449/1480 train_time:66367ms step_avg:151.18ms step:450/1480 train_time:66522ms step_avg:151.19ms step:451/1480 train_time:66674ms step_avg:151.19ms step:452/1480 train_time:66827ms step_avg:151.19ms step:453/1480 train_time:66979ms step_avg:151.19ms step:454/1480 train_time:67133ms step_avg:151.20ms step:455/1480 train_time:67285ms step_avg:151.20ms step:456/1480 train_time:67439ms 
step_avg:151.21ms step:457/1480 train_time:67591ms step_avg:151.21ms step:458/1480 train_time:67744ms step_avg:151.22ms step:459/1480 train_time:67897ms step_avg:151.22ms step:460/1480 train_time:68049ms step_avg:151.22ms step:461/1480 train_time:68202ms step_avg:151.22ms step:462/1480 train_time:68356ms step_avg:151.23ms step:463/1480 train_time:68510ms step_avg:151.24ms step:464/1480 train_time:68664ms step_avg:151.24ms step:465/1480 train_time:68817ms step_avg:151.25ms step:466/1480 train_time:68968ms step_avg:151.25ms step:467/1480 train_time:69123ms step_avg:151.25ms step:468/1480 train_time:69275ms step_avg:151.26ms step:469/1480 train_time:69427ms step_avg:151.26ms step:470/1480 train_time:69579ms step_avg:151.26ms step:471/1480 train_time:69734ms step_avg:151.27ms step:472/1480 train_time:69887ms step_avg:151.27ms step:473/1480 train_time:70040ms step_avg:151.27ms step:474/1480 train_time:70193ms step_avg:151.28ms step:475/1480 train_time:70345ms step_avg:151.28ms step:476/1480 train_time:70499ms step_avg:151.29ms step:477/1480 train_time:70651ms step_avg:151.29ms step:478/1480 train_time:70803ms step_avg:151.29ms step:479/1480 train_time:70956ms step_avg:151.29ms step:480/1480 train_time:71108ms step_avg:151.29ms step:481/1480 train_time:71262ms step_avg:151.30ms step:482/1480 train_time:71415ms step_avg:151.30ms step:483/1480 train_time:71567ms step_avg:151.31ms step:484/1480 train_time:71721ms step_avg:151.31ms step:485/1480 train_time:71874ms step_avg:151.31ms step:486/1480 train_time:72026ms step_avg:151.32ms step:487/1480 train_time:72179ms step_avg:151.32ms step:488/1480 train_time:72334ms step_avg:151.33ms step:489/1480 train_time:72487ms step_avg:151.33ms step:490/1480 train_time:72640ms step_avg:151.33ms step:491/1480 train_time:72792ms step_avg:151.34ms step:492/1480 train_time:72944ms step_avg:151.34ms step:493/1480 train_time:73097ms step_avg:151.34ms step:494/1480 train_time:73249ms step_avg:151.34ms step:495/1480 train_time:73403ms 
step_avg:151.35ms step:496/1480 train_time:73556ms step_avg:151.35ms step:497/1480 train_time:73709ms step_avg:151.35ms step:498/1480 train_time:73862ms step_avg:151.36ms step:499/1480 train_time:74014ms step_avg:151.36ms step:500/1480 train_time:74168ms step_avg:151.36ms step:500/1480 val_loss:3.6865 train_time:74237ms step_avg:151.50ms step:501/1480 train_time:74334ms step_avg:151.39ms step:502/1480 train_time:74481ms step_avg:151.38ms step:503/1480 train_time:74633ms step_avg:151.39ms step:504/1480 train_time:74786ms step_avg:151.39ms step:505/1480 train_time:74938ms step_avg:151.39ms step:506/1480 train_time:75090ms step_avg:151.39ms step:507/1480 train_time:75244ms step_avg:151.40ms step:508/1480 train_time:75398ms step_avg:151.40ms step:509/1480 train_time:75552ms step_avg:151.41ms step:510/1480 train_time:75705ms step_avg:151.41ms step:511/1480 train_time:75857ms step_avg:151.41ms step:512/1480 train_time:76010ms step_avg:151.41ms step:513/1480 train_time:76163ms step_avg:151.42ms step:514/1480 train_time:76316ms step_avg:151.42ms step:515/1480 train_time:76470ms step_avg:151.43ms step:516/1480 train_time:76626ms step_avg:151.43ms step:517/1480 train_time:76779ms step_avg:151.44ms step:518/1480 train_time:76932ms step_avg:151.44ms step:519/1480 train_time:77085ms step_avg:151.44ms step:520/1480 train_time:77239ms step_avg:151.45ms step:521/1480 train_time:77391ms step_avg:151.45ms step:522/1480 train_time:77546ms step_avg:151.46ms step:523/1480 train_time:77700ms step_avg:151.46ms step:524/1480 train_time:77853ms step_avg:151.47ms step:525/1480 train_time:78007ms step_avg:151.47ms step:526/1480 train_time:78159ms step_avg:151.47ms step:527/1480 train_time:78312ms step_avg:151.47ms step:528/1480 train_time:78465ms step_avg:151.48ms step:529/1480 train_time:78617ms step_avg:151.48ms step:530/1480 train_time:78770ms step_avg:151.48ms step:531/1480 train_time:78924ms step_avg:151.48ms step:532/1480 train_time:79077ms step_avg:151.49ms step:533/1480 
train_time:79231ms step_avg:151.49ms step:534/1480 train_time:79383ms step_avg:151.49ms step:535/1480 train_time:79536ms step_avg:151.50ms step:536/1480 train_time:79689ms step_avg:151.50ms step:537/1480 train_time:79843ms step_avg:151.51ms step:538/1480 train_time:79997ms step_avg:151.51ms step:539/1480 train_time:80151ms step_avg:151.51ms step:540/1480 train_time:80304ms step_avg:151.52ms step:541/1480 train_time:80456ms step_avg:151.52ms step:542/1480 train_time:80609ms step_avg:151.52ms step:543/1480 train_time:80763ms step_avg:151.53ms step:544/1480 train_time:80915ms step_avg:151.53ms step:545/1480 train_time:81069ms step_avg:151.53ms step:546/1480 train_time:81223ms step_avg:151.53ms step:547/1480 train_time:81376ms step_avg:151.54ms step:548/1480 train_time:81529ms step_avg:151.54ms step:549/1480 train_time:81682ms step_avg:151.54ms step:550/1480 train_time:81835ms step_avg:151.55ms step:551/1480 train_time:81988ms step_avg:151.55ms step:552/1480 train_time:82144ms step_avg:151.56ms step:553/1480 train_time:82301ms step_avg:151.57ms step:554/1480 train_time:82456ms step_avg:151.57ms step:555/1480 train_time:82611ms step_avg:151.58ms step:556/1480 train_time:82765ms step_avg:151.58ms step:557/1480 train_time:82920ms step_avg:151.59ms step:558/1480 train_time:83074ms step_avg:151.60ms step:559/1480 train_time:83229ms step_avg:151.60ms step:560/1480 train_time:83384ms step_avg:151.61ms step:561/1480 train_time:83539ms step_avg:151.61ms step:562/1480 train_time:83693ms step_avg:151.62ms step:563/1480 train_time:83849ms step_avg:151.63ms step:564/1480 train_time:84004ms step_avg:151.63ms step:565/1480 train_time:84159ms step_avg:151.64ms step:566/1480 train_time:84313ms step_avg:151.64ms step:567/1480 train_time:84468ms step_avg:151.65ms step:568/1480 train_time:84623ms step_avg:151.65ms step:569/1480 train_time:84793ms step_avg:151.69ms step:570/1480 train_time:84933ms step_avg:151.67ms step:571/1480 train_time:85087ms step_avg:151.67ms step:572/1480 
train_time:85242ms step_avg:151.68ms step:573/1480 train_time:85397ms step_avg:151.68ms step:574/1480 train_time:85552ms step_avg:151.69ms step:575/1480 train_time:85708ms step_avg:151.70ms step:576/1480 train_time:85862ms step_avg:151.70ms step:577/1480 train_time:86017ms step_avg:151.71ms step:578/1480 train_time:86172ms step_avg:151.71ms step:579/1480 train_time:86327ms step_avg:151.72ms step:580/1480 train_time:86481ms step_avg:151.72ms step:581/1480 train_time:86636ms step_avg:151.73ms step:582/1480 train_time:86790ms step_avg:151.73ms step:583/1480 train_time:86945ms step_avg:151.74ms step:584/1480 train_time:87101ms step_avg:151.74ms step:585/1480 train_time:87255ms step_avg:151.75ms step:586/1480 train_time:87411ms step_avg:151.75ms step:587/1480 train_time:87565ms step_avg:151.76ms step:588/1480 train_time:87720ms step_avg:151.76ms step:589/1480 train_time:87875ms step_avg:151.77ms step:590/1480 train_time:88031ms step_avg:151.78ms step:591/1480 train_time:88184ms step_avg:151.78ms step:592/1480 train_time:88340ms step_avg:151.79ms step:593/1480 train_time:88494ms step_avg:151.79ms step:594/1480 train_time:88650ms step_avg:151.80ms step:595/1480 train_time:88805ms step_avg:151.80ms step:596/1480 train_time:88961ms step_avg:151.81ms step:597/1480 train_time:89117ms step_avg:151.82ms step:598/1480 train_time:89272ms step_avg:151.82ms step:599/1480 train_time:89428ms step_avg:151.83ms step:600/1480 train_time:89581ms step_avg:151.83ms step:601/1480 train_time:89736ms step_avg:151.84ms step:602/1480 train_time:89890ms step_avg:151.84ms step:603/1480 train_time:90046ms step_avg:151.85ms step:604/1480 train_time:90200ms step_avg:151.85ms step:605/1480 train_time:90355ms step_avg:151.86ms step:606/1480 train_time:90512ms step_avg:151.86ms step:607/1480 train_time:90667ms step_avg:151.87ms step:608/1480 train_time:90822ms step_avg:151.88ms step:609/1480 train_time:90978ms step_avg:151.88ms step:610/1480 train_time:91132ms step_avg:151.89ms step:611/1480 
train_time:91286ms step_avg:151.89ms step:612/1480 train_time:91443ms step_avg:151.90ms step:613/1480 train_time:91598ms step_avg:151.90ms step:614/1480 train_time:91753ms step_avg:151.91ms step:615/1480 train_time:91907ms step_avg:151.91ms step:616/1480 train_time:92061ms step_avg:151.92ms step:617/1480 train_time:92215ms step_avg:151.92ms step:618/1480 train_time:92370ms step_avg:151.92ms step:619/1480 train_time:92526ms step_avg:151.93ms step:620/1480 train_time:92681ms step_avg:151.94ms step:621/1480 train_time:92835ms step_avg:151.94ms step:622/1480 train_time:92990ms step_avg:151.94ms step:623/1480 train_time:93145ms step_avg:151.95ms step:624/1480 train_time:93300ms step_avg:151.96ms step:625/1480 train_time:93455ms step_avg:151.96ms step:625/1480 val_loss:3.6047 train_time:93526ms step_avg:152.08ms step:626/1480 train_time:93619ms step_avg:151.98ms step:627/1480 train_time:93773ms step_avg:151.98ms step:628/1480 train_time:93928ms step_avg:151.99ms step:629/1480 train_time:94081ms step_avg:151.99ms step:630/1480 train_time:94236ms step_avg:151.99ms step:631/1480 train_time:94390ms step_avg:152.00ms step:632/1480 train_time:94544ms step_avg:152.00ms step:633/1480 train_time:94699ms step_avg:152.01ms step:634/1480 train_time:94854ms step_avg:152.01ms step:635/1480 train_time:95008ms step_avg:152.01ms step:636/1480 train_time:95163ms step_avg:152.02ms step:637/1480 train_time:95318ms step_avg:152.02ms step:638/1480 train_time:95472ms step_avg:152.03ms step:639/1480 train_time:95626ms step_avg:152.03ms step:640/1480 train_time:95780ms step_avg:152.03ms step:641/1480 train_time:95936ms step_avg:152.04ms step:642/1480 train_time:96090ms step_avg:152.04ms step:643/1480 train_time:96245ms step_avg:152.05ms step:644/1480 train_time:96399ms step_avg:152.05ms step:645/1480 train_time:96556ms step_avg:152.06ms step:646/1480 train_time:96711ms step_avg:152.06ms step:647/1480 train_time:96866ms step_avg:152.07ms step:648/1480 train_time:97022ms step_avg:152.07ms 
step:649/1480 train_time:97176ms step_avg:152.08ms step:650/1480 train_time:97332ms step_avg:152.08ms step:651/1480 train_time:97488ms step_avg:152.09ms step:652/1480 train_time:97643ms step_avg:152.09ms step:653/1480 train_time:97797ms step_avg:152.10ms step:654/1480 train_time:97952ms step_avg:152.10ms step:655/1480 train_time:98107ms step_avg:152.10ms step:656/1480 train_time:98262ms step_avg:152.11ms step:657/1480 train_time:98418ms step_avg:152.11ms step:658/1480 train_time:98573ms step_avg:152.12ms step:659/1480 train_time:98729ms step_avg:152.13ms step:660/1480 train_time:98885ms step_avg:152.13ms step:661/1480 train_time:99042ms step_avg:152.14ms step:662/1480 train_time:99198ms step_avg:152.14ms step:663/1480 train_time:99353ms step_avg:152.15ms step:664/1480 train_time:99511ms step_avg:152.16ms step:665/1480 train_time:99669ms step_avg:152.17ms step:666/1480 train_time:99825ms step_avg:152.17ms step:667/1480 train_time:99982ms step_avg:152.18ms step:668/1480 train_time:100139ms step_avg:152.19ms step:669/1480 train_time:100297ms step_avg:152.20ms step:670/1480 train_time:100452ms step_avg:152.20ms step:671/1480 train_time:100608ms step_avg:152.21ms step:672/1480 train_time:100763ms step_avg:152.21ms step:673/1480 train_time:100920ms step_avg:152.22ms step:674/1480 train_time:101076ms step_avg:152.22ms step:675/1480 train_time:101233ms step_avg:152.23ms step:676/1480 train_time:101391ms step_avg:152.24ms step:677/1480 train_time:101547ms step_avg:152.24ms step:678/1480 train_time:101702ms step_avg:152.25ms step:679/1480 train_time:101859ms step_avg:152.26ms step:680/1480 train_time:102016ms step_avg:152.26ms step:681/1480 train_time:102171ms step_avg:152.27ms step:682/1480 train_time:102328ms step_avg:152.27ms step:683/1480 train_time:102485ms step_avg:152.28ms step:684/1480 train_time:102643ms step_avg:152.29ms step:685/1480 train_time:102800ms step_avg:152.30ms step:686/1480 train_time:102956ms step_avg:152.30ms step:687/1480 train_time:103111ms 
step_avg:152.31ms step:688/1480 train_time:103269ms step_avg:152.31ms step:689/1480 train_time:103427ms step_avg:152.32ms step:690/1480 train_time:103584ms step_avg:152.33ms step:691/1480 train_time:103740ms step_avg:152.34ms step:692/1480 train_time:103897ms step_avg:152.34ms step:693/1480 train_time:104054ms step_avg:152.35ms step:694/1480 train_time:104211ms step_avg:152.36ms step:695/1480 train_time:104368ms step_avg:152.36ms step:696/1480 train_time:104523ms step_avg:152.37ms step:697/1480 train_time:104678ms step_avg:152.37ms step:698/1480 train_time:104836ms step_avg:152.38ms step:699/1480 train_time:104994ms step_avg:152.39ms step:700/1480 train_time:105152ms step_avg:152.39ms step:701/1480 train_time:105308ms step_avg:152.40ms step:702/1480 train_time:105463ms step_avg:152.40ms step:703/1480 train_time:105620ms step_avg:152.41ms step:704/1480 train_time:105775ms step_avg:152.41ms step:705/1480 train_time:105933ms step_avg:152.42ms step:706/1480 train_time:106090ms step_avg:152.43ms step:707/1480 train_time:106245ms step_avg:152.43ms step:708/1480 train_time:106402ms step_avg:152.44ms step:709/1480 train_time:106559ms step_avg:152.45ms step:710/1480 train_time:106716ms step_avg:152.45ms step:711/1480 train_time:106872ms step_avg:152.46ms step:712/1480 train_time:107029ms step_avg:152.46ms step:713/1480 train_time:107185ms step_avg:152.47ms step:714/1480 train_time:107342ms step_avg:152.47ms step:715/1480 train_time:107498ms step_avg:152.48ms step:716/1480 train_time:107654ms step_avg:152.48ms step:717/1480 train_time:107812ms step_avg:152.49ms step:718/1480 train_time:107967ms step_avg:152.50ms step:719/1480 train_time:108124ms step_avg:152.50ms step:720/1480 train_time:108280ms step_avg:152.51ms step:721/1480 train_time:108437ms step_avg:152.51ms step:722/1480 train_time:108593ms step_avg:152.52ms step:723/1480 train_time:108748ms step_avg:152.52ms step:724/1480 train_time:108903ms step_avg:152.53ms step:725/1480 train_time:109061ms step_avg:152.53ms 
step:726/1480 train_time:109218ms step_avg:152.54ms step:727/1480 train_time:109375ms step_avg:152.55ms step:728/1480 train_time:109533ms step_avg:152.55ms step:729/1480 train_time:109689ms step_avg:152.56ms step:730/1480 train_time:109847ms step_avg:152.56ms step:731/1480 train_time:110003ms step_avg:152.57ms step:732/1480 train_time:110159ms step_avg:152.57ms step:733/1480 train_time:110317ms step_avg:152.58ms step:734/1480 train_time:110474ms step_avg:152.59ms step:735/1480 train_time:110630ms step_avg:152.59ms step:736/1480 train_time:110787ms step_avg:152.60ms step:737/1480 train_time:110942ms step_avg:152.60ms step:738/1480 train_time:111097ms step_avg:152.61ms step:739/1480 train_time:111255ms step_avg:152.61ms step:740/1480 train_time:111415ms step_avg:152.62ms step:741/1480 train_time:111573ms step_avg:152.63ms step:742/1480 train_time:111729ms step_avg:152.64ms step:743/1480 train_time:111885ms step_avg:152.64ms step:744/1480 train_time:112042ms step_avg:152.65ms step:745/1480 train_time:112200ms step_avg:152.65ms step:746/1480 train_time:112356ms step_avg:152.66ms step:747/1480 train_time:112513ms step_avg:152.66ms step:748/1480 train_time:112672ms step_avg:152.67ms step:749/1480 train_time:112829ms step_avg:152.68ms step:750/1480 train_time:112986ms step_avg:152.68ms step:750/1480 val_loss:3.5484 train_time:113058ms step_avg:152.78ms step:751/1480 train_time:113150ms step_avg:152.70ms step:752/1480 train_time:113305ms step_avg:152.70ms step:753/1480 train_time:113460ms step_avg:152.71ms step:754/1480 train_time:113616ms step_avg:152.71ms step:755/1480 train_time:113771ms step_avg:152.71ms step:756/1480 train_time:113927ms step_avg:152.72ms step:757/1480 train_time:114086ms step_avg:152.73ms step:758/1480 train_time:114243ms step_avg:152.73ms step:759/1480 train_time:114417ms step_avg:152.76ms step:760/1480 train_time:114559ms step_avg:152.75ms step:761/1480 train_time:114715ms step_avg:152.75ms step:762/1480 train_time:114872ms step_avg:152.75ms 
step:763/1480 train_time:115028ms step_avg:152.76ms step:764/1480 train_time:115186ms step_avg:152.77ms step:765/1480 train_time:115342ms step_avg:152.77ms step:766/1480 train_time:115500ms step_avg:152.78ms step:767/1480 train_time:115658ms step_avg:152.78ms step:768/1480 train_time:115814ms step_avg:152.79ms step:769/1480 train_time:115970ms step_avg:152.79ms step:770/1480 train_time:116128ms step_avg:152.80ms step:771/1480 train_time:116285ms step_avg:152.81ms step:772/1480 train_time:116443ms step_avg:152.81ms step:773/1480 train_time:116601ms step_avg:152.82ms step:774/1480 train_time:116758ms step_avg:152.82ms step:775/1480 train_time:116917ms step_avg:152.83ms step:776/1480 train_time:117075ms step_avg:152.84ms step:777/1480 train_time:117236ms step_avg:152.85ms step:778/1480 train_time:117392ms step_avg:152.85ms step:779/1480 train_time:117549ms step_avg:152.86ms step:780/1480 train_time:117708ms step_avg:152.87ms step:781/1480 train_time:117865ms step_avg:152.87ms step:782/1480 train_time:118024ms step_avg:152.88ms step:783/1480 train_time:118181ms step_avg:152.89ms step:784/1480 train_time:118339ms step_avg:152.89ms step:785/1480 train_time:118497ms step_avg:152.90ms step:786/1480 train_time:118655ms step_avg:152.91ms step:787/1480 train_time:118812ms step_avg:152.91ms step:788/1480 train_time:118970ms step_avg:152.92ms step:789/1480 train_time:119127ms step_avg:152.92ms step:790/1480 train_time:119285ms step_avg:152.93ms step:791/1480 train_time:119446ms step_avg:152.94ms step:792/1480 train_time:119605ms step_avg:152.95ms step:793/1480 train_time:119761ms step_avg:152.95ms step:794/1480 train_time:119922ms step_avg:152.96ms step:795/1480 train_time:120082ms step_avg:152.97ms step:796/1480 train_time:120243ms step_avg:152.98ms step:797/1480 train_time:120403ms step_avg:152.99ms step:798/1480 train_time:120561ms step_avg:153.00ms step:799/1480 train_time:120722ms step_avg:153.01ms step:800/1480 train_time:120881ms step_avg:153.01ms step:801/1480 
train_time:121038ms step_avg:153.02ms step:802/1480 train_time:121198ms step_avg:153.03ms step:803/1480 train_time:121357ms step_avg:153.03ms step:804/1480 train_time:121514ms step_avg:153.04ms step:805/1480 train_time:121672ms step_avg:153.05ms step:806/1480 train_time:121829ms step_avg:153.05ms step:807/1480 train_time:121985ms step_avg:153.06ms step:808/1480 train_time:122142ms step_avg:153.06ms step:809/1480 train_time:122300ms step_avg:153.07ms step:810/1480 train_time:122458ms step_avg:153.07ms step:811/1480 train_time:122616ms step_avg:153.08ms step:812/1480 train_time:122773ms step_avg:153.08ms step:813/1480 train_time:122930ms step_avg:153.09ms step:814/1480 train_time:123087ms step_avg:153.09ms step:815/1480 train_time:123244ms step_avg:153.10ms step:816/1480 train_time:123403ms step_avg:153.11ms step:817/1480 train_time:123560ms step_avg:153.11ms step:818/1480 train_time:123718ms step_avg:153.12ms step:819/1480 train_time:123875ms step_avg:153.12ms step:820/1480 train_time:124033ms step_avg:153.13ms step:821/1480 train_time:124189ms step_avg:153.13ms step:822/1480 train_time:124347ms step_avg:153.14ms step:823/1480 train_time:124505ms step_avg:153.14ms step:824/1480 train_time:124662ms step_avg:153.15ms step:825/1480 train_time:124822ms step_avg:153.16ms step:826/1480 train_time:124982ms step_avg:153.16ms step:827/1480 train_time:125142ms step_avg:153.17ms step:828/1480 train_time:125301ms step_avg:153.18ms step:829/1480 train_time:125461ms step_avg:153.19ms step:830/1480 train_time:125622ms step_avg:153.20ms step:831/1480 train_time:125781ms step_avg:153.20ms step:832/1480 train_time:125939ms step_avg:153.21ms step:833/1480 train_time:126097ms step_avg:153.22ms step:834/1480 train_time:126259ms step_avg:153.23ms step:835/1480 train_time:126416ms step_avg:153.23ms step:836/1480 train_time:126573ms step_avg:153.24ms step:837/1480 train_time:126730ms step_avg:153.24ms step:838/1480 train_time:126887ms step_avg:153.25ms step:839/1480 train_time:127046ms 
step_avg:153.25ms step:840/1480 train_time:127203ms step_avg:153.26ms step:841/1480 train_time:127359ms step_avg:153.26ms step:842/1480 train_time:127519ms step_avg:153.27ms step:843/1480 train_time:127676ms step_avg:153.27ms step:844/1480 train_time:127832ms step_avg:153.28ms step:845/1480 train_time:127990ms step_avg:153.28ms step:846/1480 train_time:128147ms step_avg:153.29ms step:847/1480 train_time:128307ms step_avg:153.29ms step:848/1480 train_time:128463ms step_avg:153.30ms step:849/1480 train_time:128623ms step_avg:153.31ms step:850/1480 train_time:128781ms step_avg:153.31ms step:851/1480 train_time:128941ms step_avg:153.32ms step:852/1480 train_time:129100ms step_avg:153.33ms step:853/1480 train_time:129259ms step_avg:153.33ms step:854/1480 train_time:129418ms step_avg:153.34ms step:855/1480 train_time:129575ms step_avg:153.34ms step:856/1480 train_time:129733ms step_avg:153.35ms step:857/1480 train_time:129890ms step_avg:153.35ms step:858/1480 train_time:130049ms step_avg:153.36ms step:859/1480 train_time:130207ms step_avg:153.36ms step:860/1480 train_time:130364ms step_avg:153.37ms step:861/1480 train_time:130524ms step_avg:153.38ms step:862/1480 train_time:130686ms step_avg:153.39ms step:863/1480 train_time:130846ms step_avg:153.39ms step:864/1480 train_time:131007ms step_avg:153.40ms step:865/1480 train_time:131162ms step_avg:153.41ms step:866/1480 train_time:131322ms step_avg:153.41ms step:867/1480 train_time:131482ms step_avg:153.42ms step:868/1480 train_time:131640ms step_avg:153.43ms step:869/1480 train_time:131799ms step_avg:153.43ms step:870/1480 train_time:131957ms step_avg:153.44ms step:871/1480 train_time:132115ms step_avg:153.44ms step:872/1480 train_time:132273ms step_avg:153.45ms step:873/1480 train_time:132430ms step_avg:153.45ms step:874/1480 train_time:132591ms step_avg:153.46ms step:875/1480 train_time:132749ms step_avg:153.47ms step:875/1480 val_loss:3.5045 train_time:132822ms step_avg:153.55ms step:876/1480 train_time:132918ms 
step_avg:153.48ms step:877/1480 train_time:133067ms step_avg:153.48ms step:878/1480 train_time:133225ms step_avg:153.48ms step:879/1480 train_time:133383ms step_avg:153.49ms step:880/1480 train_time:133540ms step_avg:153.49ms step:881/1480 train_time:133698ms step_avg:153.50ms step:882/1480 train_time:133858ms step_avg:153.51ms step:883/1480 train_time:134016ms step_avg:153.51ms step:884/1480 train_time:134177ms step_avg:153.52ms step:885/1480 train_time:134335ms step_avg:153.53ms step:886/1480 train_time:134499ms step_avg:153.54ms step:887/1480 train_time:134660ms step_avg:153.55ms step:888/1480 train_time:134822ms step_avg:153.56ms step:889/1480 train_time:134982ms step_avg:153.56ms step:890/1480 train_time:135139ms step_avg:153.57ms step:891/1480 train_time:135299ms step_avg:153.57ms step:892/1480 train_time:135458ms step_avg:153.58ms step:893/1480 train_time:135617ms step_avg:153.59ms step:894/1480 train_time:135777ms step_avg:153.59ms step:895/1480 train_time:135938ms step_avg:153.60ms step:896/1480 train_time:136098ms step_avg:153.61ms step:897/1480 train_time:136258ms step_avg:153.62ms step:898/1480 train_time:136417ms step_avg:153.62ms step:899/1480 train_time:136577ms step_avg:153.63ms step:900/1480 train_time:136734ms step_avg:153.63ms step:901/1480 train_time:136895ms step_avg:153.64ms step:902/1480 train_time:137054ms step_avg:153.65ms step:903/1480 train_time:137215ms step_avg:153.66ms step:904/1480 train_time:137378ms step_avg:153.67ms step:905/1480 train_time:137536ms step_avg:153.67ms step:906/1480 train_time:137696ms step_avg:153.68ms step:907/1480 train_time:137858ms step_avg:153.69ms step:908/1480 train_time:138016ms step_avg:153.69ms step:909/1480 train_time:138177ms step_avg:153.70ms step:910/1480 train_time:138340ms step_avg:153.71ms step:911/1480 train_time:138499ms step_avg:153.72ms step:912/1480 train_time:138660ms step_avg:153.72ms step:913/1480 train_time:138820ms step_avg:153.73ms step:914/1480 train_time:138980ms step_avg:153.74ms 
step:915/1480 train_time:139141ms step_avg:153.75ms step:916/1480 train_time:139299ms step_avg:153.75ms step:917/1480 train_time:139457ms step_avg:153.76ms step:918/1480 train_time:139619ms step_avg:153.77ms step:919/1480 train_time:139780ms step_avg:153.77ms step:920/1480 train_time:139939ms step_avg:153.78ms step:921/1480 train_time:140099ms step_avg:153.79ms step:922/1480 train_time:140260ms step_avg:153.79ms step:923/1480 train_time:140418ms step_avg:153.80ms step:924/1480 train_time:140577ms step_avg:153.80ms step:925/1480 train_time:140736ms step_avg:153.81ms step:926/1480 train_time:140896ms step_avg:153.82ms step:927/1480 train_time:141054ms step_avg:153.82ms step:928/1480 train_time:141215ms step_avg:153.83ms step:929/1480 train_time:141375ms step_avg:153.84ms step:930/1480 train_time:141534ms step_avg:153.84ms step:931/1480 train_time:141694ms step_avg:153.85ms step:932/1480 train_time:141853ms step_avg:153.85ms step:933/1480 train_time:142012ms step_avg:153.86ms step:934/1480 train_time:142172ms step_avg:153.87ms step:935/1480 train_time:142332ms step_avg:153.87ms step:936/1480 train_time:142493ms step_avg:153.88ms step:937/1480 train_time:142653ms step_avg:153.89ms step:938/1480 train_time:142811ms step_avg:153.89ms step:939/1480 train_time:142974ms step_avg:153.90ms step:940/1480 train_time:143136ms step_avg:153.91ms step:941/1480 train_time:143295ms step_avg:153.91ms step:942/1480 train_time:143453ms step_avg:153.92ms step:943/1480 train_time:143614ms step_avg:153.93ms step:944/1480 train_time:143777ms step_avg:153.94ms step:945/1480 train_time:143935ms step_avg:153.94ms step:946/1480 train_time:144098ms step_avg:153.95ms step:947/1480 train_time:144259ms step_avg:153.96ms step:948/1480 train_time:144418ms step_avg:153.96ms step:949/1480 train_time:144589ms step_avg:153.98ms step:950/1480 train_time:144737ms step_avg:153.98ms step:951/1480 train_time:144900ms step_avg:153.98ms step:952/1480 train_time:145058ms step_avg:153.99ms step:953/1480 
train_time:145218ms step_avg:154.00ms step:954/1480 train_time:145380ms step_avg:154.00ms step:955/1480 train_time:145537ms step_avg:154.01ms step:956/1480 train_time:145697ms step_avg:154.01ms step:957/1480 train_time:145857ms step_avg:154.02ms step:958/1480 train_time:146019ms step_avg:154.03ms step:959/1480 train_time:146179ms step_avg:154.03ms step:960/1480 train_time:146338ms step_avg:154.04ms step:961/1480 train_time:146498ms step_avg:154.05ms step:962/1480 train_time:146657ms step_avg:154.05ms step:963/1480 train_time:146818ms step_avg:154.06ms step:964/1480 train_time:146979ms step_avg:154.07ms step:965/1480 train_time:147138ms step_avg:154.07ms step:966/1480 train_time:147298ms step_avg:154.08ms step:967/1480 train_time:147456ms step_avg:154.08ms step:968/1480 train_time:147616ms step_avg:154.09ms step:969/1480 train_time:147776ms step_avg:154.09ms step:970/1480 train_time:147933ms step_avg:154.10ms step:971/1480 train_time:148094ms step_avg:154.10ms step:972/1480 train_time:148253ms step_avg:154.11ms step:973/1480 train_time:148411ms step_avg:154.11ms step:974/1480 train_time:148572ms step_avg:154.12ms step:975/1480 train_time:148733ms step_avg:154.13ms step:976/1480 train_time:148894ms step_avg:154.13ms step:977/1480 train_time:149055ms step_avg:154.14ms step:978/1480 train_time:149215ms step_avg:154.15ms step:979/1480 train_time:149375ms step_avg:154.15ms step:980/1480 train_time:149534ms step_avg:154.16ms step:981/1480 train_time:149696ms step_avg:154.17ms step:982/1480 train_time:149855ms step_avg:154.17ms step:983/1480 train_time:150015ms step_avg:154.18ms step:984/1480 train_time:150175ms step_avg:154.18ms step:985/1480 train_time:150336ms step_avg:154.19ms step:986/1480 train_time:150497ms step_avg:154.20ms step:987/1480 train_time:150655ms step_avg:154.20ms step:988/1480 train_time:150814ms step_avg:154.21ms step:989/1480 train_time:150973ms step_avg:154.21ms step:990/1480 train_time:151134ms step_avg:154.22ms step:991/1480 train_time:151295ms 
step_avg:154.23ms step:992/1480 train_time:151461ms step_avg:154.24ms step:993/1480 train_time:151629ms step_avg:154.25ms step:994/1480 train_time:151789ms step_avg:154.26ms step:995/1480 train_time:151949ms step_avg:154.26ms step:996/1480 train_time:152107ms step_avg:154.27ms step:997/1480 train_time:152266ms step_avg:154.27ms step:998/1480 train_time:152425ms step_avg:154.28ms step:999/1480 train_time:152587ms step_avg:154.28ms step:1000/1480 train_time:152748ms step_avg:154.29ms step:1000/1480 val_loss:3.4397 train_time:152821ms step_avg:154.36ms step:1001/1480 train_time:152914ms step_avg:154.30ms step:1002/1480 train_time:153073ms step_avg:154.31ms step:1003/1480 train_time:153236ms step_avg:154.32ms step:1004/1480 train_time:153397ms step_avg:154.32ms step:1005/1480 train_time:153559ms step_avg:154.33ms step:1006/1480 train_time:153720ms step_avg:154.34ms step:1007/1480 train_time:153879ms step_avg:154.34ms step:1008/1480 train_time:154040ms step_avg:154.35ms step:1009/1480 train_time:154204ms step_avg:154.36ms step:1010/1480 train_time:154362ms step_avg:154.36ms step:1011/1480 train_time:154521ms step_avg:154.37ms step:1012/1480 train_time:154679ms step_avg:154.37ms step:1013/1480 train_time:154841ms step_avg:154.38ms step:1014/1480 train_time:155000ms step_avg:154.38ms step:1015/1480 train_time:155161ms step_avg:154.39ms step:1016/1480 train_time:155321ms step_avg:154.39ms step:1017/1480 train_time:155482ms step_avg:154.40ms step:1018/1480 train_time:155642ms step_avg:154.41ms step:1019/1480 train_time:155802ms step_avg:154.41ms step:1020/1480 train_time:155962ms step_avg:154.42ms step:1021/1480 train_time:156122ms step_avg:154.42ms step:1022/1480 train_time:156281ms step_avg:154.43ms step:1023/1480 train_time:156444ms step_avg:154.44ms step:1024/1480 train_time:156602ms step_avg:154.44ms step:1025/1480 train_time:156767ms step_avg:154.45ms step:1026/1480 train_time:156930ms step_avg:154.46ms step:1027/1480 train_time:157090ms step_avg:154.46ms 
step:1028/1480 train_time:157253ms step_avg:154.47ms step:1029/1480 train_time:157416ms step_avg:154.48ms step:1030/1480 train_time:157578ms step_avg:154.49ms step:1031/1480 train_time:157736ms step_avg:154.49ms step:1032/1480 train_time:157899ms step_avg:154.50ms step:1033/1480 train_time:158059ms step_avg:154.51ms step:1034/1480 train_time:158220ms step_avg:154.51ms step:1035/1480 train_time:158380ms step_avg:154.52ms step:1036/1480 train_time:158540ms step_avg:154.52ms step:1037/1480 train_time:158700ms step_avg:154.53ms step:1038/1480 train_time:158860ms step_avg:154.53ms step:1039/1480 train_time:159022ms step_avg:154.54ms step:1040/1480 train_time:159184ms step_avg:154.55ms step:1041/1480 train_time:159346ms step_avg:154.55ms step:1042/1480 train_time:159506ms step_avg:154.56ms step:1043/1480 train_time:159663ms step_avg:154.56ms step:1044/1480 train_time:159821ms step_avg:154.57ms step:1045/1480 train_time:159982ms step_avg:154.57ms step:1046/1480 train_time:160143ms step_avg:154.58ms step:1047/1480 train_time:160302ms step_avg:154.58ms step:1048/1480 train_time:160464ms step_avg:154.59ms step:1049/1480 train_time:160623ms step_avg:154.59ms step:1050/1480 train_time:160783ms step_avg:154.60ms step:1051/1480 train_time:160945ms step_avg:154.61ms step:1052/1480 train_time:161105ms step_avg:154.61ms step:1053/1480 train_time:161264ms step_avg:154.62ms step:1054/1480 train_time:161426ms step_avg:154.62ms step:1055/1480 train_time:161588ms step_avg:154.63ms step:1056/1480 train_time:161748ms step_avg:154.63ms step:1057/1480 train_time:161908ms step_avg:154.64ms step:1058/1480 train_time:162070ms step_avg:154.65ms step:1059/1480 train_time:162234ms step_avg:154.66ms step:1060/1480 train_time:162396ms step_avg:154.66ms step:1061/1480 train_time:162556ms step_avg:154.67ms step:1062/1480 train_time:162715ms step_avg:154.67ms step:1063/1480 train_time:162876ms step_avg:154.68ms step:1064/1480 train_time:163036ms step_avg:154.68ms step:1065/1480 train_time:163195ms 
step_avg:154.69ms step:1066/1480 train_time:163360ms step_avg:154.70ms step:1067/1480 train_time:163522ms step_avg:154.70ms step:1068/1480 train_time:163682ms step_avg:154.71ms step:1069/1480 train_time:163846ms step_avg:154.72ms step:1070/1480 train_time:164006ms step_avg:154.72ms step:1071/1480 train_time:164171ms step_avg:154.73ms step:1072/1480 train_time:164331ms step_avg:154.74ms step:1073/1480 train_time:164489ms step_avg:154.74ms step:1074/1480 train_time:164648ms step_avg:154.74ms step:1075/1480 train_time:164810ms step_avg:154.75ms step:1076/1480 train_time:164968ms step_avg:154.75ms step:1077/1480 train_time:165130ms step_avg:154.76ms step:1078/1480 train_time:165296ms step_avg:154.77ms step:1079/1480 train_time:165459ms step_avg:154.78ms step:1080/1480 train_time:165620ms step_avg:154.78ms step:1081/1480 train_time:165779ms step_avg:154.79ms step:1082/1480 train_time:165939ms step_avg:154.79ms step:1083/1480 train_time:166098ms step_avg:154.80ms step:1084/1480 train_time:166258ms step_avg:154.80ms step:1085/1480 train_time:166420ms step_avg:154.81ms step:1086/1480 train_time:166580ms step_avg:154.81ms step:1087/1480 train_time:166740ms step_avg:154.82ms step:1088/1480 train_time:166899ms step_avg:154.82ms step:1089/1480 train_time:167063ms step_avg:154.83ms step:1090/1480 train_time:167226ms step_avg:154.84ms step:1091/1480 train_time:167387ms step_avg:154.84ms step:1092/1480 train_time:167550ms step_avg:154.85ms step:1093/1480 train_time:167712ms step_avg:154.86ms step:1094/1480 train_time:167872ms step_avg:154.86ms step:1095/1480 train_time:168033ms step_avg:154.87ms step:1096/1480 train_time:168194ms step_avg:154.87ms step:1097/1480 train_time:168358ms step_avg:154.88ms step:1098/1480 train_time:168520ms step_avg:154.89ms step:1099/1480 train_time:168681ms step_avg:154.90ms step:1100/1480 train_time:168844ms step_avg:154.90ms step:1101/1480 train_time:169007ms step_avg:154.91ms step:1102/1480 train_time:169169ms step_avg:154.92ms step:1103/1480 
train_time:169336ms step_avg:154.93ms step:1104/1480 train_time:169497ms step_avg:154.93ms step:1105/1480 train_time:169659ms step_avg:154.94ms step:1106/1480 train_time:169819ms step_avg:154.94ms step:1107/1480 train_time:169981ms step_avg:154.95ms step:1108/1480 train_time:170140ms step_avg:154.95ms step:1109/1480 train_time:170299ms step_avg:154.96ms step:1110/1480 train_time:170460ms step_avg:154.96ms step:1111/1480 train_time:170621ms step_avg:154.97ms step:1112/1480 train_time:170782ms step_avg:154.97ms step:1113/1480 train_time:170952ms step_avg:154.99ms step:1114/1480 train_time:171115ms step_avg:155.00ms step:1115/1480 train_time:171278ms step_avg:155.00ms step:1116/1480 train_time:171438ms step_avg:155.01ms step:1117/1480 train_time:171601ms step_avg:155.01ms step:1118/1480 train_time:171766ms step_avg:155.02ms step:1119/1480 train_time:171926ms step_avg:155.03ms step:1120/1480 train_time:172088ms step_avg:155.03ms step:1121/1480 train_time:172250ms step_avg:155.04ms step:1122/1480 train_time:172411ms step_avg:155.05ms step:1123/1480 train_time:172572ms step_avg:155.05ms step:1124/1480 train_time:172735ms step_avg:155.06ms step:1125/1480 train_time:172897ms step_avg:155.06ms step:1125/1480 val_loss:3.3853 train_time:172971ms step_avg:155.13ms step:1126/1480 train_time:173066ms step_avg:155.08ms step:1127/1480 train_time:173223ms step_avg:155.08ms step:1128/1480 train_time:173383ms step_avg:155.08ms step:1129/1480 train_time:173545ms step_avg:155.09ms step:1130/1480 train_time:173707ms step_avg:155.10ms step:1131/1480 train_time:173875ms step_avg:155.11ms step:1132/1480 train_time:174035ms step_avg:155.11ms step:1133/1480 train_time:174198ms step_avg:155.12ms step:1134/1480 train_time:174363ms step_avg:155.13ms step:1135/1480 train_time:174524ms step_avg:155.13ms step:1136/1480 train_time:174686ms step_avg:155.14ms step:1137/1480 train_time:174847ms step_avg:155.14ms step:1138/1480 train_time:175009ms step_avg:155.15ms step:1139/1480 train_time:175184ms 
step_avg:155.17ms step:1140/1480 train_time:175332ms step_avg:155.16ms step:1141/1480 train_time:175496ms step_avg:155.17ms step:1142/1480 train_time:175656ms step_avg:155.17ms step:1143/1480 train_time:175820ms step_avg:155.18ms step:1144/1480 train_time:175980ms step_avg:155.19ms step:1145/1480 train_time:176139ms step_avg:155.19ms step:1146/1480 train_time:176305ms step_avg:155.20ms step:1147/1480 train_time:176467ms step_avg:155.20ms step:1148/1480 train_time:176628ms step_avg:155.21ms step:1149/1480 train_time:176790ms step_avg:155.22ms step:1150/1480 train_time:176950ms step_avg:155.22ms step:1151/1480 train_time:177115ms step_avg:155.23ms step:1152/1480 train_time:177280ms step_avg:155.24ms step:1153/1480 train_time:177446ms step_avg:155.25ms step:1154/1480 train_time:177607ms step_avg:155.25ms step:1155/1480 train_time:177769ms step_avg:155.26ms step:1156/1480 train_time:177936ms step_avg:155.27ms step:1157/1480 train_time:178099ms step_avg:155.27ms step:1158/1480 train_time:178259ms step_avg:155.28ms step:1159/1480 train_time:178421ms step_avg:155.28ms step:1160/1480 train_time:178582ms step_avg:155.29ms step:1161/1480 train_time:178745ms step_avg:155.30ms step:1162/1480 train_time:178909ms step_avg:155.30ms step:1163/1480 train_time:179071ms step_avg:155.31ms step:1164/1480 train_time:179232ms step_avg:155.31ms step:1165/1480 train_time:179391ms step_avg:155.32ms step:1166/1480 train_time:179552ms step_avg:155.32ms step:1167/1480 train_time:179713ms step_avg:155.33ms step:1168/1480 train_time:179878ms step_avg:155.33ms step:1169/1480 train_time:180041ms step_avg:155.34ms step:1170/1480 train_time:180203ms step_avg:155.35ms step:1171/1480 train_time:180364ms step_avg:155.35ms step:1172/1480 train_time:180524ms step_avg:155.36ms step:1173/1480 train_time:180688ms step_avg:155.36ms step:1174/1480 train_time:180856ms step_avg:155.37ms step:1175/1480 train_time:181018ms step_avg:155.38ms step:1176/1480 train_time:181182ms step_avg:155.39ms step:1177/1480 
train_time:181347ms step_avg:155.40ms step:1178/1480 train_time:181509ms step_avg:155.40ms step:1179/1480 train_time:181669ms step_avg:155.41ms step:1180/1480 train_time:181840ms step_avg:155.42ms step:1181/1480 train_time:182004ms step_avg:155.43ms step:1182/1480 train_time:182165ms step_avg:155.43ms step:1183/1480 train_time:182326ms step_avg:155.44ms step:1184/1480 train_time:182488ms step_avg:155.44ms step:1185/1480 train_time:182652ms step_avg:155.45ms step:1186/1480 train_time:182814ms step_avg:155.45ms step:1187/1480 train_time:182988ms step_avg:155.47ms step:1188/1480 train_time:183147ms step_avg:155.47ms step:1189/1480 train_time:183310ms step_avg:155.48ms step:1190/1480 train_time:183471ms step_avg:155.48ms step:1191/1480 train_time:183632ms step_avg:155.49ms step:1192/1480 train_time:183793ms step_avg:155.49ms step:1193/1480 train_time:183953ms step_avg:155.50ms step:1194/1480 train_time:184115ms step_avg:155.50ms step:1195/1480 train_time:184278ms step_avg:155.51ms step:1196/1480 train_time:184448ms step_avg:155.52ms step:1197/1480 train_time:184611ms step_avg:155.53ms step:1198/1480 train_time:184781ms step_avg:155.54ms step:1199/1480 train_time:184944ms step_avg:155.55ms step:1200/1480 train_time:185106ms step_avg:155.55ms step:1201/1480 train_time:185266ms step_avg:155.56ms step:1202/1480 train_time:185434ms step_avg:155.57ms step:1203/1480 train_time:185601ms step_avg:155.57ms step:1204/1480 train_time:185765ms step_avg:155.58ms step:1205/1480 train_time:185927ms step_avg:155.59ms step:1206/1480 train_time:186088ms step_avg:155.59ms step:1207/1480 train_time:186248ms step_avg:155.60ms step:1208/1480 train_time:186409ms step_avg:155.60ms step:1209/1480 train_time:186573ms step_avg:155.61ms step:1210/1480 train_time:186739ms step_avg:155.62ms step:1211/1480 train_time:186903ms step_avg:155.62ms step:1212/1480 train_time:187065ms step_avg:155.63ms step:1213/1480 train_time:187230ms step_avg:155.64ms step:1214/1480 train_time:187395ms step_avg:155.64ms 
step:1215/1480 train_time:187558ms step_avg:155.65ms step:1216/1480 train_time:187719ms step_avg:155.65ms step:1217/1480 train_time:187883ms step_avg:155.66ms step:1218/1480 train_time:188045ms step_avg:155.67ms step:1219/1480 train_time:188212ms step_avg:155.68ms step:1220/1480 train_time:188375ms step_avg:155.68ms step:1221/1480 train_time:188536ms step_avg:155.69ms step:1222/1480 train_time:188695ms step_avg:155.69ms step:1223/1480 train_time:188858ms step_avg:155.69ms step:1224/1480 train_time:189024ms step_avg:155.70ms step:1225/1480 train_time:189187ms step_avg:155.71ms step:1226/1480 train_time:189351ms step_avg:155.72ms step:1227/1480 train_time:189516ms step_avg:155.72ms step:1228/1480 train_time:189679ms step_avg:155.73ms step:1229/1480 train_time:189842ms step_avg:155.74ms step:1230/1480 train_time:190010ms step_avg:155.75ms step:1231/1480 train_time:190176ms step_avg:155.75ms step:1232/1480 train_time:190341ms step_avg:155.76ms step:1233/1480 train_time:190503ms step_avg:155.77ms step:1234/1480 train_time:190664ms step_avg:155.77ms step:1235/1480 train_time:190830ms step_avg:155.78ms step:1236/1480 train_time:190991ms step_avg:155.78ms step:1237/1480 train_time:191151ms step_avg:155.79ms step:1238/1480 train_time:191325ms step_avg:155.80ms step:1239/1480 train_time:191488ms step_avg:155.81ms step:1240/1480 train_time:191652ms step_avg:155.81ms step:1241/1480 train_time:191817ms step_avg:155.82ms step:1242/1480 train_time:191980ms step_avg:155.83ms step:1243/1480 train_time:192143ms step_avg:155.83ms step:1244/1480 train_time:192304ms step_avg:155.84ms step:1245/1480 train_time:192466ms step_avg:155.84ms step:1246/1480 train_time:192629ms step_avg:155.85ms step:1247/1480 train_time:192790ms step_avg:155.85ms step:1248/1480 train_time:192951ms step_avg:155.86ms step:1249/1480 train_time:193111ms step_avg:155.86ms step:1250/1480 train_time:193272ms step_avg:155.86ms step:1250/1480 val_loss:3.3352 train_time:193347ms step_avg:155.93ms step:1251/1480 
train_time:193441ms step_avg:155.87ms step:1252/1480 train_time:193604ms step_avg:155.88ms step:1253/1480 train_time:193765ms step_avg:155.88ms step:1254/1480 train_time:193926ms step_avg:155.89ms step:1255/1480 train_time:194097ms step_avg:155.90ms step:1256/1480 train_time:194261ms step_avg:155.91ms step:1257/1480 train_time:194423ms step_avg:155.91ms step:1258/1480 train_time:194586ms step_avg:155.92ms step:1259/1480 train_time:194749ms step_avg:155.92ms step:1260/1480 train_time:194910ms step_avg:155.93ms step:1261/1480 train_time:195072ms step_avg:155.93ms step:1262/1480 train_time:195238ms step_avg:155.94ms step:1263/1480 train_time:195403ms step_avg:155.95ms step:1264/1480 train_time:195563ms step_avg:155.95ms step:1265/1480 train_time:195723ms step_avg:155.95ms step:1266/1480 train_time:195886ms step_avg:155.96ms step:1267/1480 train_time:196047ms step_avg:155.96ms step:1268/1480 train_time:196211ms step_avg:155.97ms step:1269/1480 train_time:196376ms step_avg:155.98ms step:1270/1480 train_time:196540ms step_avg:155.98ms step:1271/1480 train_time:196701ms step_avg:155.99ms step:1272/1480 train_time:196863ms step_avg:155.99ms step:1273/1480 train_time:197026ms step_avg:156.00ms step:1274/1480 train_time:197188ms step_avg:156.00ms step:1275/1480 train_time:197349ms step_avg:156.01ms step:1276/1480 train_time:197509ms step_avg:156.01ms step:1277/1480 train_time:197672ms step_avg:156.02ms step:1278/1480 train_time:197833ms step_avg:156.02ms step:1279/1480 train_time:197994ms step_avg:156.02ms step:1280/1480 train_time:198161ms step_avg:156.03ms step:1281/1480 train_time:198323ms step_avg:156.04ms step:1282/1480 train_time:198483ms step_avg:156.04ms step:1283/1480 train_time:198646ms step_avg:156.05ms step:1284/1480 train_time:198809ms step_avg:156.05ms step:1285/1480 train_time:198969ms step_avg:156.05ms step:1286/1480 train_time:199131ms step_avg:156.06ms step:1287/1480 train_time:199293ms step_avg:156.06ms step:1288/1480 train_time:199456ms step_avg:156.07ms 
step:1289/1480 train_time:199625ms step_avg:156.08ms step:1290/1480 train_time:199794ms step_avg:156.09ms step:1291/1480 train_time:199959ms step_avg:156.10ms step:1292/1480 train_time:200122ms step_avg:156.10ms step:1293/1480 train_time:200287ms step_avg:156.11ms step:1294/1480 train_time:200451ms step_avg:156.11ms step:1295/1480 train_time:200614ms step_avg:156.12ms step:1296/1480 train_time:200776ms step_avg:156.12ms step:1297/1480 train_time:200941ms step_avg:156.13ms step:1298/1480 train_time:201103ms step_avg:156.14ms step:1299/1480 train_time:201266ms step_avg:156.14ms step:1300/1480 train_time:201426ms step_avg:156.14ms step:1301/1480 train_time:201588ms step_avg:156.15ms step:1302/1480 train_time:201754ms step_avg:156.16ms step:1303/1480 train_time:201922ms step_avg:156.17ms step:1304/1480 train_time:202086ms step_avg:156.17ms step:1305/1480 train_time:202248ms step_avg:156.18ms step:1306/1480 train_time:202412ms step_avg:156.18ms step:1307/1480 train_time:202574ms step_avg:156.19ms step:1308/1480 train_time:202737ms step_avg:156.19ms step:1309/1480 train_time:202900ms step_avg:156.20ms step:1310/1480 train_time:203064ms step_avg:156.20ms step:1311/1480 train_time:203224ms step_avg:156.21ms step:1312/1480 train_time:203389ms step_avg:156.21ms step:1313/1480 train_time:203552ms step_avg:156.22ms step:1314/1480 train_time:203716ms step_avg:156.22ms step:1315/1480 train_time:203881ms step_avg:156.23ms step:1316/1480 train_time:204040ms step_avg:156.23ms step:1317/1480 train_time:204201ms step_avg:156.24ms step:1318/1480 train_time:204369ms step_avg:156.25ms step:1319/1480 train_time:204536ms step_avg:156.25ms step:1320/1480 train_time:204700ms step_avg:156.26ms step:1321/1480 train_time:204866ms step_avg:156.27ms step:1322/1480 train_time:205038ms step_avg:156.28ms step:1323/1480 train_time:205202ms step_avg:156.28ms step:1324/1480 train_time:205367ms step_avg:156.29ms step:1325/1480 train_time:205537ms step_avg:156.30ms step:1326/1480 train_time:205702ms 
step_avg:156.31ms step:1327/1480 train_time:205863ms step_avg:156.31ms step:1328/1480 train_time:206026ms step_avg:156.32ms step:1329/1480 train_time:206210ms step_avg:156.34ms step:1330/1480 train_time:206372ms step_avg:156.34ms step:1331/1480 train_time:206537ms step_avg:156.35ms step:1332/1480 train_time:206700ms step_avg:156.35ms step:1333/1480 train_time:206866ms step_avg:156.36ms step:1334/1480 train_time:207028ms step_avg:156.37ms step:1335/1480 train_time:207189ms step_avg:156.37ms step:1336/1480 train_time:207359ms step_avg:156.38ms step:1337/1480 train_time:207524ms step_avg:156.39ms step:1338/1480 train_time:207688ms step_avg:156.39ms step:1339/1480 train_time:207852ms step_avg:156.40ms step:1340/1480 train_time:208017ms step_avg:156.40ms step:1341/1480 train_time:208179ms step_avg:156.41ms step:1342/1480 train_time:208345ms step_avg:156.42ms step:1343/1480 train_time:208506ms step_avg:156.42ms step:1344/1480 train_time:208668ms step_avg:156.42ms step:1345/1480 train_time:208837ms step_avg:156.43ms step:1346/1480 train_time:208998ms step_avg:156.44ms step:1347/1480 train_time:209162ms step_avg:156.44ms step:1348/1480 train_time:209325ms step_avg:156.45ms step:1349/1480 train_time:209487ms step_avg:156.45ms step:1350/1480 train_time:209653ms step_avg:156.46ms step:1351/1480 train_time:209815ms step_avg:156.46ms step:1352/1480 train_time:209978ms step_avg:156.47ms step:1353/1480 train_time:210144ms step_avg:156.47ms step:1354/1480 train_time:210307ms step_avg:156.48ms step:1355/1480 train_time:210470ms step_avg:156.48ms step:1356/1480 train_time:210634ms step_avg:156.49ms step:1357/1480 train_time:210798ms step_avg:156.49ms step:1358/1480 train_time:210963ms step_avg:156.50ms step:1359/1480 train_time:211127ms step_avg:156.51ms step:1360/1480 train_time:211294ms step_avg:156.51ms step:1361/1480 train_time:211461ms step_avg:156.52ms step:1362/1480 train_time:211626ms step_avg:156.53ms step:1363/1480 train_time:211795ms step_avg:156.54ms step:1364/1480 
train_time:211957ms step_avg:156.54ms step:1365/1480 train_time:212118ms step_avg:156.54ms step:1366/1480 train_time:212282ms step_avg:156.55ms step:1367/1480 train_time:212445ms step_avg:156.56ms step:1368/1480 train_time:212608ms step_avg:156.56ms step:1369/1480 train_time:212778ms step_avg:156.57ms step:1370/1480 train_time:212944ms step_avg:156.58ms step:1371/1480 train_time:213105ms step_avg:156.58ms step:1372/1480 train_time:213273ms step_avg:156.59ms step:1373/1480 train_time:213436ms step_avg:156.59ms step:1374/1480 train_time:213601ms step_avg:156.60ms step:1375/1480 train_time:213764ms step_avg:156.60ms step:1375/1480 val_loss:3.2963 train_time:213838ms step_avg:156.66ms step:1376/1480 train_time:213934ms step_avg:156.61ms step:1377/1480 train_time:214094ms step_avg:156.62ms step:1378/1480 train_time:214257ms step_avg:156.62ms step:1379/1480 train_time:214423ms step_avg:156.63ms step:1380/1480 train_time:214586ms step_avg:156.63ms step:1381/1480 train_time:214754ms step_avg:156.64ms step:1382/1480 train_time:214919ms step_avg:156.65ms step:1383/1480 train_time:215082ms step_avg:156.65ms step:1384/1480 train_time:215249ms step_avg:156.66ms step:1385/1480 train_time:215409ms step_avg:156.66ms step:1386/1480 train_time:215572ms step_avg:156.67ms step:1387/1480 train_time:215736ms step_avg:156.67ms step:1388/1480 train_time:215899ms step_avg:156.68ms step:1389/1480 train_time:216064ms step_avg:156.68ms step:1390/1480 train_time:216225ms step_avg:156.68ms step:1391/1480 train_time:216387ms step_avg:156.69ms step:1392/1480 train_time:216550ms step_avg:156.69ms step:1393/1480 train_time:216711ms step_avg:156.70ms step:1394/1480 train_time:216877ms step_avg:156.70ms step:1395/1480 train_time:217039ms step_avg:156.71ms step:1396/1480 train_time:217204ms step_avg:156.71ms step:1397/1480 train_time:217364ms step_avg:156.71ms step:1398/1480 train_time:217524ms step_avg:156.72ms step:1399/1480 train_time:217686ms step_avg:156.72ms step:1400/1480 train_time:217853ms 
step_avg:156.73ms step:1401/1480 train_time:218013ms step_avg:156.73ms step:1402/1480 train_time:218175ms step_avg:156.74ms step:1403/1480 train_time:218341ms step_avg:156.74ms step:1404/1480 train_time:218503ms step_avg:156.75ms step:1405/1480 train_time:218667ms step_avg:156.75ms step:1406/1480 train_time:218832ms step_avg:156.76ms step:1407/1480 train_time:218995ms step_avg:156.76ms step:1408/1480 train_time:219158ms step_avg:156.77ms step:1409/1480 train_time:219330ms step_avg:156.78ms step:1410/1480 train_time:219492ms step_avg:156.78ms step:1411/1480 train_time:219653ms step_avg:156.78ms step:1412/1480 train_time:219815ms step_avg:156.79ms step:1413/1480 train_time:219979ms step_avg:156.79ms step:1414/1480 train_time:220142ms step_avg:156.80ms step:1415/1480 train_time:220307ms step_avg:156.80ms step:1416/1480 train_time:220482ms step_avg:156.82ms step:1417/1480 train_time:220647ms step_avg:156.82ms step:1418/1480 train_time:220810ms step_avg:156.83ms step:1419/1480 train_time:220976ms step_avg:156.83ms step:1420/1480 train_time:221142ms step_avg:156.84ms step:1421/1480 train_time:221307ms step_avg:156.84ms step:1422/1480 train_time:221474ms step_avg:156.85ms step:1423/1480 train_time:221637ms step_avg:156.86ms step:1424/1480 train_time:221804ms step_avg:156.86ms step:1425/1480 train_time:221972ms step_avg:156.87ms step:1426/1480 train_time:222137ms step_avg:156.88ms step:1427/1480 train_time:222304ms step_avg:156.88ms step:1428/1480 train_time:222465ms step_avg:156.89ms step:1429/1480 train_time:222624ms step_avg:156.89ms step:1430/1480 train_time:222788ms step_avg:156.89ms step:1431/1480 train_time:222955ms step_avg:156.90ms step:1432/1480 train_time:223122ms step_avg:156.91ms step:1433/1480 train_time:223292ms step_avg:156.92ms step:1434/1480 train_time:223461ms step_avg:156.92ms step:1435/1480 train_time:223626ms step_avg:156.93ms step:1436/1480 train_time:223791ms step_avg:156.94ms step:1437/1480 train_time:223954ms step_avg:156.94ms step:1438/1480 
train_time:224117ms step_avg:156.94ms step:1439/1480 train_time:224284ms step_avg:156.95ms step:1440/1480 train_time:224447ms step_avg:156.96ms step:1441/1480 train_time:224610ms step_avg:156.96ms step:1442/1480 train_time:224777ms step_avg:156.97ms step:1443/1480 train_time:224950ms step_avg:156.98ms step:1444/1480 train_time:225115ms step_avg:156.98ms step:1445/1480 train_time:225278ms step_avg:156.99ms step:1446/1480 train_time:225444ms step_avg:156.99ms step:1447/1480 train_time:225613ms step_avg:157.00ms step:1448/1480 train_time:225777ms step_avg:157.01ms step:1449/1480 train_time:225939ms step_avg:157.01ms step:1450/1480 train_time:226104ms step_avg:157.02ms step:1451/1480 train_time:226267ms step_avg:157.02ms step:1452/1480 train_time:226432ms step_avg:157.03ms step:1453/1480 train_time:226596ms step_avg:157.03ms step:1454/1480 train_time:226758ms step_avg:157.03ms step:1455/1480 train_time:226925ms step_avg:157.04ms step:1456/1480 train_time:227089ms step_avg:157.05ms step:1457/1480 train_time:227252ms step_avg:157.05ms step:1458/1480 train_time:227416ms step_avg:157.06ms step:1459/1480 train_time:227583ms step_avg:157.06ms step:1460/1480 train_time:227745ms step_avg:157.07ms step:1461/1480 train_time:227909ms step_avg:157.07ms step:1462/1480 train_time:228074ms step_avg:157.08ms step:1463/1480 train_time:228242ms step_avg:157.08ms step:1464/1480 train_time:228408ms step_avg:157.09ms step:1465/1480 train_time:228572ms step_avg:157.09ms step:1466/1480 train_time:228735ms step_avg:157.10ms step:1467/1480 train_time:228900ms step_avg:157.10ms step:1468/1480 train_time:229064ms step_avg:157.11ms step:1469/1480 train_time:229226ms step_avg:157.11ms step:1470/1480 train_time:229396ms step_avg:157.12ms step:1471/1480 train_time:229566ms step_avg:157.13ms step:1472/1480 train_time:229736ms step_avg:157.14ms step:1473/1480 train_time:229899ms step_avg:157.14ms step:1474/1480 train_time:230066ms step_avg:157.15ms step:1475/1480 train_time:230235ms step_avg:157.16ms 
step:1476/1480 train_time:230399ms step_avg:157.16ms step:1477/1480 train_time:230569ms step_avg:157.17ms step:1478/1480 train_time:230739ms step_avg:157.18ms step:1479/1480 train_time:230904ms step_avg:157.18ms step:1480/1480 train_time:231068ms step_avg:157.19ms step:1480/1480 val_loss:3.2776 train_time:231143ms step_avg:157.24ms peak memory consumption: 34238 MiB