import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor (asserted). It is not modified.
        steps: Number of quintic iterations to run.
        eps: Added to the norm before dividing, so an all-zero G does not divide by zero.

    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # Normalize out-of-place: `.bfloat16()` returns G itself when G is already bfloat16,
    # so an in-place `/=` here would silently rescale the caller's tensor.
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    # iterate on the wide orientation so that X @ X.T is the smaller Gram matrix
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.

    The constructor reads WORLD_SIZE and RANK from the environment and allocates the
    all-gather buffers on 'cuda'.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One param group per distinct element count, so that a single set of flat
        # bfloat16 buffers (one per rank) can receive the gathered updates of the group.
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are walked in chunks of `world_size`. For each chunk
        this rank orthogonalizes the momentum update of the parameter at offset `rank`, then
        starts an asynchronous all_gather of the flattened updates. The gathered updates of the
        previous chunk are applied just before the next all_gather is launched, and the last
        chunk is applied after the loop. Requires every group's parameter count to be a multiple
        of `world_size` and every processed parameter to have a gradient (both asserted).
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # Apply the chunk whose all_gather is in flight; reads the current values of
                # `handle` / `params_world` from the enclosing scope. No-op before the first chunk.
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # step size is scaled up by sqrt(rows / cols) for tall matrices
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                # note: the nesterov branch overwrites p.grad in place
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding.

    The cos/sin tables are cached and only rebuilt when the sequence length (x.shape[1])
    differs from the one last seen. `forward` splits dim 3 of `x` into two halves, so `x`
    must be 4-D; the caller in this file passes (batch, seq, heads, head_dim).
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Self-attention computed with FlexAttention under a caller-supplied block mask.

    Queries and keys are RMS-normalized and rotated; values are a learned mix
    (`lambdas`) of the projected values and the value embedding `vi`.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    Transformer block: attention and MLP, each applied to the normalized stream and added back.

    Before the sub-layers, the stream is replaced by a learned mix of itself and the
    initial embedding `x0` (`lambdas`, initialized to [1, 0]).
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """
    Six token-embedding tables whose outputs are mixed into the attention values.

    `forward` returns 12 tensors: the six embeddings followed by the same six in
    reverse order, so index i and index 11 - i share a table.
    """

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """
    GPT model with U-net style skip connections between the first and second half of the blocks.

    `forward` takes a 1-D token sequence, builds a document-aware sliding-window causal
    block mask for FlexAttention, and returns the cross-entropy loss against `targets`.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Arguments:
            inputs: 1-D tensor of token ids (asserted); its length must be a multiple of 128.
            targets: token ids used as cross-entropy targets for `inputs`.
            sliding_window_num_blocks: attention window size, in units of 128-token blocks.

        Returns:
            The scalar cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # running count of token id 50256 (presumably the GPT-2 end-of-text delimiter - confirm);
        # positions sharing a count are treated as belonging to the same document
        docs = (inputs == 50256).cumsum(0)
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal and within the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # convert a dense boolean block mask into (count per row, column indices with the
            # selected columns first), each with two leading singleton dims added
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            # One index per BLOCK_SIZE-token block. Derived from the input length instead of the
            # previously hard-coded 512 (= 64*1024 / 128) so other sequence lengths also work.
            kv_idx = block_idx = torch.arange(docs_low.size(0), dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # blocks that are touched but not fully unmasked still need the token-level mask_mod
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256 * 4 byte header into a pinned tensor."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves one `seq_len`-token (inputs, targets) pair per call from a set of .bin shards.

    Within a shard, rank r starts at offset r * seq_len and advances by
    num_processes * seq_len per batch; the next shard (wrapping around) is loaded
    when another full step would not fit in the current one.
    """

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on 'cuda'; targets are the inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    logdir = Path("logs") / f"{run_id}"
    # parents=True: the top-level "logs" directory may not exist yet on a first run
    logdir.mkdir(parents=True, exist_ok=True)
    logfile = Path("logs") / f"{run_id}.txt"
    print(logfile.stem)
    # create the log file
    with logfile.open("w") as f:
        # begin the log by printing this file (the Python code)
        print(code, file=f)
        print("=" * 100, file=f)
def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, also print it."""
    if master_process:
        with logfile.open("a") as f:
            if not logonly:
                print(s)
            print(s, file=f)
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running python {sys.version}")
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# calculate the number of steps to take in the val loop.
assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0
val_steps = args.val_tokens // (args.sequence_length * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
inputs_train, targets_train = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768))
model = model.cuda().bfloat16()
# the CastedLinear weights are kept in float32; they are cast to the activation dtype in forward
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()]
optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.blocks.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]

# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the learning-rate multiplier for iteration `it`."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# device-side scalar holding the current window size; updated in place via copy_ below
sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda")
sw_num_blocks_prev = 1
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social
    frac_done = step / args.num_iterations # training progress
    sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128)
    if sw_num_blocks != sw_num_blocks_prev:
        sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True)
        sw_num_blocks_prev = sw_num_blocks

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                inputs_val, targets_val = val_loader.next_batch()
                val_loss += model(inputs_val, targets_val, sliding_window_num_blocks)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for i in range(1, train_accumulation_steps + 1):
        with contextlib.ExitStack() as stack:
            if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step
                stack.enter_context(model.no_sync())
            if step >= 5:
                stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True))
            model(inputs_train, targets_train, sliding_window_num_blocks).backward()
            inputs_train, targets_train = train_loader.next_batch()
    if train_accumulation_steps != 1:
        for p in model.parameters():
            p.grad /= train_accumulation_steps
    # momentum warmup for Muon
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 10:31:14 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 31C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 37C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29100ms step_avg:nanms step:2/1480 train_time:29206ms step_avg:nanms step:3/1480 train_time:29326ms step_avg:nanms step:4/1480 train_time:29466ms step_avg:nanms step:5/1480 train_time:29610ms step_avg:nanms step:6/1480 train_time:29750ms step_avg:nanms step:7/1480 train_time:29892ms step_avg:nanms step:8/1480 train_time:30036ms step_avg:nanms step:9/1480 train_time:30179ms step_avg:nanms step:10/1480 train_time:30321ms step_avg:nanms step:11/1480 train_time:145ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:424ms step_avg:141.48ms step:14/1480 train_time:568ms step_avg:142.07ms step:15/1480 train_time:712ms step_avg:142.50ms step:16/1480 train_time:854ms step_avg:142.36ms step:17/1480 train_time:997ms step_avg:142.42ms step:18/1480 train_time:1138ms step_avg:142.21ms step:19/1480 train_time:1281ms step_avg:142.33ms step:20/1480 train_time:1423ms step_avg:142.33ms step:21/1480 train_time:1567ms step_avg:142.44ms step:22/1480 train_time:1711ms step_avg:142.62ms step:23/1480 train_time:1854ms step_avg:142.60ms step:24/1480 train_time:1997ms step_avg:142.66ms step:25/1480 train_time:2139ms step_avg:142.60ms step:26/1480 train_time:2281ms step_avg:142.57ms step:27/1480 train_time:2425ms step_avg:142.65ms step:28/1480 train_time:2567ms step_avg:142.63ms 
step:29/1480 train_time:2713ms step_avg:142.79ms step:30/1480 train_time:2855ms step_avg:142.74ms step:31/1480 train_time:2997ms step_avg:142.72ms step:32/1480 train_time:3139ms step_avg:142.67ms step:33/1480 train_time:3281ms step_avg:142.64ms step:34/1480 train_time:3424ms step_avg:142.66ms step:35/1480 train_time:3567ms step_avg:142.69ms step:36/1480 train_time:3711ms step_avg:142.72ms step:37/1480 train_time:3852ms step_avg:142.68ms step:38/1480 train_time:3996ms step_avg:142.73ms step:39/1480 train_time:4138ms step_avg:142.71ms step:40/1480 train_time:4281ms step_avg:142.70ms step:41/1480 train_time:4423ms step_avg:142.67ms step:42/1480 train_time:4568ms step_avg:142.75ms step:43/1480 train_time:4713ms step_avg:142.81ms step:44/1480 train_time:4855ms step_avg:142.78ms step:45/1480 train_time:4998ms step_avg:142.79ms step:46/1480 train_time:5139ms step_avg:142.75ms step:47/1480 train_time:5282ms step_avg:142.76ms step:48/1480 train_time:5425ms step_avg:142.77ms step:49/1480 train_time:5569ms step_avg:142.79ms step:50/1480 train_time:5713ms step_avg:142.82ms step:51/1480 train_time:5855ms step_avg:142.81ms step:52/1480 train_time:5997ms step_avg:142.80ms step:53/1480 train_time:6139ms step_avg:142.77ms step:54/1480 train_time:6282ms step_avg:142.77ms step:55/1480 train_time:6424ms step_avg:142.76ms step:56/1480 train_time:6568ms step_avg:142.79ms step:57/1480 train_time:6713ms step_avg:142.82ms step:58/1480 train_time:6855ms step_avg:142.80ms step:59/1480 train_time:6998ms step_avg:142.81ms step:60/1480 train_time:7140ms step_avg:142.79ms step:61/1480 train_time:7281ms step_avg:142.77ms step:62/1480 train_time:7423ms step_avg:142.76ms step:63/1480 train_time:7568ms step_avg:142.80ms step:64/1480 train_time:7713ms step_avg:142.83ms step:65/1480 train_time:7855ms step_avg:142.81ms step:66/1480 train_time:7998ms step_avg:142.82ms step:67/1480 train_time:8140ms step_avg:142.81ms step:68/1480 train_time:8282ms step_avg:142.79ms step:69/1480 train_time:8425ms 
step_avg:142.79ms step:70/1480 train_time:8567ms step_avg:142.79ms step:71/1480 train_time:8713ms step_avg:142.84ms step:72/1480 train_time:8855ms step_avg:142.82ms step:73/1480 train_time:8999ms step_avg:142.84ms step:74/1480 train_time:9141ms step_avg:142.83ms step:75/1480 train_time:9284ms step_avg:142.84ms step:76/1480 train_time:9427ms step_avg:142.83ms step:77/1480 train_time:9571ms step_avg:142.85ms step:78/1480 train_time:9714ms step_avg:142.85ms step:79/1480 train_time:9856ms step_avg:142.84ms step:80/1480 train_time:10391ms step_avg:148.44ms step:81/1480 train_time:10488ms step_avg:147.71ms step:82/1480 train_time:10631ms step_avg:147.65ms step:83/1480 train_time:10774ms step_avg:147.58ms step:84/1480 train_time:10916ms step_avg:147.51ms step:85/1480 train_time:11057ms step_avg:147.42ms step:86/1480 train_time:11199ms step_avg:147.35ms step:87/1480 train_time:11341ms step_avg:147.29ms step:88/1480 train_time:11882ms step_avg:152.34ms step:89/1480 train_time:11984ms step_avg:151.70ms step:90/1480 train_time:12127ms step_avg:151.58ms step:91/1480 train_time:12269ms step_avg:151.47ms step:92/1480 train_time:12412ms step_avg:151.36ms step:93/1480 train_time:12553ms step_avg:151.25ms step:94/1480 train_time:12695ms step_avg:151.14ms step:95/1480 train_time:12837ms step_avg:151.03ms step:96/1480 train_time:12980ms step_avg:150.93ms step:97/1480 train_time:13122ms step_avg:150.82ms step:98/1480 train_time:13265ms step_avg:150.74ms step:99/1480 train_time:13409ms step_avg:150.67ms step:100/1480 train_time:13552ms step_avg:150.58ms step:101/1480 train_time:13698ms step_avg:150.52ms step:102/1480 train_time:13836ms step_avg:150.40ms step:103/1480 train_time:13979ms step_avg:150.31ms step:104/1480 train_time:14121ms step_avg:150.22ms step:105/1480 train_time:14263ms step_avg:150.14ms step:106/1480 train_time:14408ms step_avg:150.09ms step:107/1480 train_time:14553ms step_avg:150.03ms step:108/1480 train_time:14695ms step_avg:149.95ms step:109/1480 train_time:14837ms 
step_avg:149.87ms step:110/1480 train_time:14979ms step_avg:149.79ms step:111/1480 train_time:15123ms step_avg:149.73ms step:112/1480 train_time:15269ms step_avg:149.70ms step:113/1480 train_time:15416ms step_avg:149.67ms step:114/1480 train_time:15560ms step_avg:149.61ms step:115/1480 train_time:15706ms step_avg:149.58ms step:116/1480 train_time:15852ms step_avg:149.55ms step:117/1480 train_time:15998ms step_avg:149.51ms step:118/1480 train_time:16143ms step_avg:149.47ms step:119/1480 train_time:16290ms step_avg:149.45ms step:120/1480 train_time:16437ms step_avg:149.42ms step:121/1480 train_time:16583ms step_avg:149.39ms step:122/1480 train_time:16729ms step_avg:149.37ms step:123/1480 train_time:16875ms step_avg:149.34ms step:124/1480 train_time:17020ms step_avg:149.30ms step:125/1480 train_time:17165ms step_avg:149.26ms step:125/1480 val_loss:4.4133 train_time:17231ms step_avg:149.83ms step:126/1480 train_time:17329ms step_avg:149.39ms step:127/1480 train_time:17469ms step_avg:149.30ms step:128/1480 train_time:17614ms step_avg:149.27ms step:129/1480 train_time:17759ms step_avg:149.24ms step:130/1480 train_time:17905ms step_avg:149.21ms step:131/1480 train_time:18050ms step_avg:149.17ms step:132/1480 train_time:18195ms step_avg:149.14ms step:133/1480 train_time:18342ms step_avg:149.12ms step:134/1480 train_time:18489ms step_avg:149.11ms step:135/1480 train_time:18635ms step_avg:149.08ms step:136/1480 train_time:18782ms step_avg:149.06ms step:137/1480 train_time:18928ms step_avg:149.04ms step:138/1480 train_time:19072ms step_avg:149.00ms step:139/1480 train_time:19218ms step_avg:148.98ms step:140/1480 train_time:19365ms step_avg:148.96ms step:141/1480 train_time:19510ms step_avg:148.93ms step:142/1480 train_time:19657ms step_avg:148.92ms step:143/1480 train_time:19804ms step_avg:148.90ms step:144/1480 train_time:19950ms step_avg:148.88ms step:145/1480 train_time:20095ms step_avg:148.85ms step:146/1480 train_time:20242ms step_avg:148.83ms step:147/1480 
train_time:20388ms step_avg:148.82ms step:148/1480 train_time:20536ms step_avg:148.81ms step:149/1480 train_time:20682ms step_avg:148.79ms step:150/1480 train_time:20828ms step_avg:148.77ms step:151/1480 train_time:20973ms step_avg:148.74ms step:152/1480 train_time:21118ms step_avg:148.72ms step:153/1480 train_time:21263ms step_avg:148.69ms step:154/1480 train_time:21409ms step_avg:148.67ms step:155/1480 train_time:21555ms step_avg:148.66ms step:156/1480 train_time:21702ms step_avg:148.65ms step:157/1480 train_time:21849ms step_avg:148.63ms step:158/1480 train_time:21993ms step_avg:148.60ms step:159/1480 train_time:22139ms step_avg:148.59ms step:160/1480 train_time:22286ms step_avg:148.57ms step:161/1480 train_time:22431ms step_avg:148.55ms step:162/1480 train_time:22576ms step_avg:148.52ms step:163/1480 train_time:22723ms step_avg:148.52ms step:164/1480 train_time:22869ms step_avg:148.50ms step:165/1480 train_time:23014ms step_avg:148.48ms step:166/1480 train_time:23161ms step_avg:148.47ms step:167/1480 train_time:23307ms step_avg:148.45ms step:168/1480 train_time:23453ms step_avg:148.43ms step:169/1480 train_time:23598ms step_avg:148.42ms step:170/1480 train_time:23745ms step_avg:148.41ms step:171/1480 train_time:23889ms step_avg:148.38ms step:172/1480 train_time:24036ms step_avg:148.37ms step:173/1480 train_time:24183ms step_avg:148.36ms step:174/1480 train_time:24328ms step_avg:148.34ms step:175/1480 train_time:24473ms step_avg:148.32ms step:176/1480 train_time:24619ms step_avg:148.31ms step:177/1480 train_time:24765ms step_avg:148.30ms step:178/1480 train_time:24910ms step_avg:148.27ms step:179/1480 train_time:25057ms step_avg:148.27ms step:180/1480 train_time:25203ms step_avg:148.25ms step:181/1480 train_time:25348ms step_avg:148.24ms step:182/1480 train_time:25493ms step_avg:148.21ms step:183/1480 train_time:25639ms step_avg:148.20ms step:184/1480 train_time:25785ms step_avg:148.19ms step:185/1480 train_time:25931ms step_avg:148.18ms step:186/1480 
train_time:26077ms step_avg:148.16ms step:187/1480 train_time:26224ms step_avg:148.16ms step:188/1480 train_time:26369ms step_avg:148.14ms step:189/1480 train_time:26535ms step_avg:148.24ms step:190/1480 train_time:26660ms step_avg:148.11ms step:191/1480 train_time:26806ms step_avg:148.10ms step:192/1480 train_time:26953ms step_avg:148.10ms step:193/1480 train_time:27098ms step_avg:148.08ms step:194/1480 train_time:27246ms step_avg:148.07ms step:195/1480 train_time:27390ms step_avg:148.05ms step:196/1480 train_time:27536ms step_avg:148.04ms step:197/1480 train_time:27683ms step_avg:148.04ms step:198/1480 train_time:27828ms step_avg:148.02ms step:199/1480 train_time:27974ms step_avg:148.01ms step:200/1480 train_time:28121ms step_avg:148.01ms step:201/1480 train_time:28269ms step_avg:148.01ms step:202/1480 train_time:28413ms step_avg:147.99ms step:203/1480 train_time:28560ms step_avg:147.98ms step:204/1480 train_time:28706ms step_avg:147.97ms step:205/1480 train_time:28851ms step_avg:147.95ms step:206/1480 train_time:28996ms step_avg:147.94ms step:207/1480 train_time:29144ms step_avg:147.94ms step:208/1480 train_time:29289ms step_avg:147.93ms step:209/1480 train_time:29435ms step_avg:147.91ms step:210/1480 train_time:29582ms step_avg:147.91ms step:211/1480 train_time:29728ms step_avg:147.90ms step:212/1480 train_time:29872ms step_avg:147.88ms step:213/1480 train_time:30018ms step_avg:147.87ms step:214/1480 train_time:30165ms step_avg:147.87ms step:215/1480 train_time:30309ms step_avg:147.85ms step:216/1480 train_time:30456ms step_avg:147.85ms step:217/1480 train_time:30602ms step_avg:147.84ms step:218/1480 train_time:30749ms step_avg:147.83ms step:219/1480 train_time:30895ms step_avg:147.82ms step:220/1480 train_time:31042ms step_avg:147.82ms step:221/1480 train_time:31578ms step_avg:149.66ms step:222/1480 train_time:32084ms step_avg:151.34ms step:223/1480 train_time:32193ms step_avg:151.14ms step:224/1480 train_time:32342ms step_avg:151.13ms step:225/1480 
train_time:32490ms step_avg:151.12ms step:226/1480 train_time:32639ms step_avg:151.10ms step:227/1480 train_time:32787ms step_avg:151.09ms step:228/1480 train_time:32934ms step_avg:151.08ms step:229/1480 train_time:33086ms step_avg:151.08ms step:230/1480 train_time:33234ms step_avg:151.06ms step:231/1480 train_time:33383ms step_avg:151.05ms step:232/1480 train_time:33531ms step_avg:151.04ms step:233/1480 train_time:33679ms step_avg:151.03ms step:234/1480 train_time:33828ms step_avg:151.02ms step:235/1480 train_time:33975ms step_avg:151.00ms step:236/1480 train_time:34126ms step_avg:151.00ms step:237/1480 train_time:34274ms step_avg:150.99ms step:238/1480 train_time:34423ms step_avg:150.98ms step:239/1480 train_time:34571ms step_avg:150.96ms step:240/1480 train_time:34720ms step_avg:150.95ms step:241/1480 train_time:34869ms step_avg:150.95ms step:242/1480 train_time:35017ms step_avg:150.93ms step:243/1480 train_time:35167ms step_avg:150.93ms step:244/1480 train_time:35314ms step_avg:150.92ms step:245/1480 train_time:35464ms step_avg:150.91ms step:246/1480 train_time:35612ms step_avg:150.90ms step:247/1480 train_time:35762ms step_avg:150.90ms step:248/1480 train_time:35911ms step_avg:150.89ms step:249/1480 train_time:36061ms step_avg:150.88ms step:250/1480 train_time:36209ms step_avg:150.87ms step:250/1480 val_loss:4.0012 train_time:36276ms step_avg:151.15ms step:251/1480 train_time:36372ms step_avg:150.92ms step:252/1480 train_time:36517ms step_avg:150.90ms step:253/1480 train_time:36666ms step_avg:150.89ms step:254/1480 train_time:36816ms step_avg:150.89ms step:255/1480 train_time:36964ms step_avg:150.87ms step:256/1480 train_time:37113ms step_avg:150.87ms step:257/1480 train_time:37261ms step_avg:150.85ms step:258/1480 train_time:37410ms step_avg:150.85ms step:259/1480 train_time:37559ms step_avg:150.84ms step:260/1480 train_time:37708ms step_avg:150.83ms step:261/1480 train_time:37858ms step_avg:150.83ms step:262/1480 train_time:38005ms step_avg:150.81ms 
step:263/1480 train_time:38154ms step_avg:150.81ms step:264/1480 train_time:38301ms step_avg:150.79ms step:265/1480 train_time:38450ms step_avg:150.79ms step:266/1480 train_time:38598ms step_avg:150.77ms step:267/1480 train_time:38747ms step_avg:150.77ms step:268/1480 train_time:38896ms step_avg:150.76ms step:269/1480 train_time:39044ms step_avg:150.75ms step:270/1480 train_time:39194ms step_avg:150.75ms step:271/1480 train_time:39341ms step_avg:150.73ms step:272/1480 train_time:39491ms step_avg:150.73ms step:273/1480 train_time:39638ms step_avg:150.72ms step:274/1480 train_time:39787ms step_avg:150.71ms step:275/1480 train_time:39935ms step_avg:150.70ms step:276/1480 train_time:40084ms step_avg:150.69ms step:277/1480 train_time:40233ms step_avg:150.68ms step:278/1480 train_time:40382ms step_avg:150.68ms step:279/1480 train_time:40530ms step_avg:150.67ms step:280/1480 train_time:40678ms step_avg:150.66ms step:281/1480 train_time:40827ms step_avg:150.65ms step:282/1480 train_time:40977ms step_avg:150.65ms step:283/1480 train_time:41124ms step_avg:150.64ms step:284/1480 train_time:41273ms step_avg:150.63ms step:285/1480 train_time:41420ms step_avg:150.62ms step:286/1480 train_time:41569ms step_avg:150.61ms step:287/1480 train_time:41718ms step_avg:150.61ms step:288/1480 train_time:41867ms step_avg:150.60ms step:289/1480 train_time:42016ms step_avg:150.59ms step:290/1480 train_time:42163ms step_avg:150.58ms step:291/1480 train_time:42312ms step_avg:150.58ms step:292/1480 train_time:42460ms step_avg:150.57ms step:293/1480 train_time:42609ms step_avg:150.56ms step:294/1480 train_time:42758ms step_avg:150.56ms step:295/1480 train_time:42908ms step_avg:150.56ms step:296/1480 train_time:43057ms step_avg:150.55ms step:297/1480 train_time:43206ms step_avg:150.54ms step:298/1480 train_time:43355ms step_avg:150.54ms step:299/1480 train_time:43503ms step_avg:150.53ms step:300/1480 train_time:43653ms step_avg:150.53ms step:301/1480 train_time:43805ms step_avg:150.53ms 
step:302/1480 train_time:43949ms step_avg:150.51ms step:303/1480 train_time:44098ms step_avg:150.50ms step:304/1480 train_time:44245ms step_avg:150.49ms step:305/1480 train_time:44396ms step_avg:150.50ms step:306/1480 train_time:44543ms step_avg:150.48ms step:307/1480 train_time:44692ms step_avg:150.48ms step:308/1480 train_time:44840ms step_avg:150.47ms step:309/1480 train_time:44990ms step_avg:150.47ms step:310/1480 train_time:45138ms step_avg:150.46ms step:311/1480 train_time:45288ms step_avg:150.46ms step:312/1480 train_time:45437ms step_avg:150.45ms step:313/1480 train_time:45585ms step_avg:150.45ms step:314/1480 train_time:45735ms step_avg:150.44ms step:315/1480 train_time:45883ms step_avg:150.43ms step:316/1480 train_time:46032ms step_avg:150.43ms step:317/1480 train_time:46181ms step_avg:150.43ms step:318/1480 train_time:46330ms step_avg:150.42ms step:319/1480 train_time:46478ms step_avg:150.41ms step:320/1480 train_time:46626ms step_avg:150.41ms step:321/1480 train_time:46776ms step_avg:150.40ms step:322/1480 train_time:46924ms step_avg:150.40ms step:323/1480 train_time:47072ms step_avg:150.39ms step:324/1480 train_time:47220ms step_avg:150.38ms step:325/1480 train_time:47369ms step_avg:150.38ms step:326/1480 train_time:47517ms step_avg:150.37ms step:327/1480 train_time:47668ms step_avg:150.37ms step:328/1480 train_time:47817ms step_avg:150.37ms step:329/1480 train_time:47965ms step_avg:150.36ms step:330/1480 train_time:48116ms step_avg:150.36ms step:331/1480 train_time:48267ms step_avg:150.36ms step:332/1480 train_time:48417ms step_avg:150.36ms step:333/1480 train_time:48569ms step_avg:150.37ms step:334/1480 train_time:48720ms step_avg:150.37ms step:335/1480 train_time:48872ms step_avg:150.37ms step:336/1480 train_time:49022ms step_avg:150.37ms step:337/1480 train_time:49174ms step_avg:150.38ms step:338/1480 train_time:49324ms step_avg:150.38ms step:339/1480 train_time:49475ms step_avg:150.38ms step:340/1480 train_time:49626ms step_avg:150.38ms 
step:341/1480 train_time:49777ms step_avg:150.38ms step:342/1480 train_time:49926ms step_avg:150.38ms step:343/1480 train_time:50078ms step_avg:150.38ms step:344/1480 train_time:50228ms step_avg:150.38ms step:345/1480 train_time:50379ms step_avg:150.39ms step:346/1480 train_time:50529ms step_avg:150.38ms step:347/1480 train_time:50680ms step_avg:150.39ms step:348/1480 train_time:50830ms step_avg:150.39ms step:349/1480 train_time:50980ms step_avg:150.38ms step:350/1480 train_time:51133ms step_avg:150.39ms step:351/1480 train_time:51285ms step_avg:150.40ms step:352/1480 train_time:51437ms step_avg:150.40ms step:353/1480 train_time:51587ms step_avg:150.40ms step:354/1480 train_time:51737ms step_avg:150.40ms step:355/1480 train_time:51888ms step_avg:150.40ms step:356/1480 train_time:52039ms step_avg:150.40ms step:357/1480 train_time:52191ms step_avg:150.41ms step:358/1480 train_time:52341ms step_avg:150.41ms step:359/1480 train_time:52493ms step_avg:150.41ms step:360/1480 train_time:52644ms step_avg:150.41ms step:361/1480 train_time:52796ms step_avg:150.42ms step:362/1480 train_time:52946ms step_avg:150.42ms step:363/1480 train_time:53098ms step_avg:150.42ms step:364/1480 train_time:53248ms step_avg:150.42ms step:365/1480 train_time:53399ms step_avg:150.42ms step:366/1480 train_time:53550ms step_avg:150.42ms step:367/1480 train_time:53700ms step_avg:150.42ms step:368/1480 train_time:53852ms step_avg:150.42ms step:369/1480 train_time:54002ms step_avg:150.42ms step:370/1480 train_time:54154ms step_avg:150.43ms step:371/1480 train_time:54303ms step_avg:150.42ms step:372/1480 train_time:54456ms step_avg:150.43ms step:373/1480 train_time:54606ms step_avg:150.43ms step:374/1480 train_time:54758ms step_avg:150.43ms step:375/1480 train_time:54908ms step_avg:150.43ms step:375/1480 val_loss:3.8074 train_time:54976ms step_avg:150.62ms step:376/1480 train_time:55075ms step_avg:150.48ms step:377/1480 train_time:55218ms step_avg:150.46ms step:378/1480 train_time:55370ms 
step_avg:150.46ms step:379/1480 train_time:55540ms step_avg:150.52ms step:380/1480 train_time:55670ms step_avg:150.46ms step:381/1480 train_time:55819ms step_avg:150.46ms step:382/1480 train_time:55971ms step_avg:150.46ms step:383/1480 train_time:56121ms step_avg:150.46ms step:384/1480 train_time:56273ms step_avg:150.46ms step:385/1480 train_time:56425ms step_avg:150.47ms step:386/1480 train_time:56576ms step_avg:150.47ms step:387/1480 train_time:56725ms step_avg:150.47ms step:388/1480 train_time:56876ms step_avg:150.47ms step:389/1480 train_time:57027ms step_avg:150.47ms step:390/1480 train_time:57178ms step_avg:150.47ms step:391/1480 train_time:57329ms step_avg:150.47ms step:392/1480 train_time:57480ms step_avg:150.47ms step:393/1480 train_time:57631ms step_avg:150.47ms step:394/1480 train_time:57781ms step_avg:150.47ms step:395/1480 train_time:57933ms step_avg:150.48ms step:396/1480 train_time:58083ms step_avg:150.48ms step:397/1480 train_time:58234ms step_avg:150.48ms step:398/1480 train_time:58385ms step_avg:150.48ms step:399/1480 train_time:58536ms step_avg:150.48ms step:400/1480 train_time:58689ms step_avg:150.49ms step:401/1480 train_time:58840ms step_avg:150.49ms step:402/1480 train_time:58991ms step_avg:150.49ms step:403/1480 train_time:59141ms step_avg:150.49ms step:404/1480 train_time:59293ms step_avg:150.49ms step:405/1480 train_time:59443ms step_avg:150.49ms step:406/1480 train_time:59595ms step_avg:150.49ms step:407/1480 train_time:59746ms step_avg:150.49ms step:408/1480 train_time:59896ms step_avg:150.49ms step:409/1480 train_time:60047ms step_avg:150.49ms step:410/1480 train_time:60197ms step_avg:150.49ms step:411/1480 train_time:60349ms step_avg:150.50ms step:412/1480 train_time:60500ms step_avg:150.50ms step:413/1480 train_time:60652ms step_avg:150.50ms step:414/1480 train_time:60802ms step_avg:150.50ms step:415/1480 train_time:60953ms step_avg:150.50ms step:416/1480 train_time:61104ms step_avg:150.50ms step:417/1480 train_time:61256ms 
step_avg:150.51ms step:418/1480 train_time:61407ms step_avg:150.51ms step:419/1480 train_time:61559ms step_avg:150.51ms step:420/1480 train_time:61711ms step_avg:150.51ms step:421/1480 train_time:61861ms step_avg:150.51ms step:422/1480 train_time:62013ms step_avg:150.52ms step:423/1480 train_time:62164ms step_avg:150.52ms step:424/1480 train_time:62315ms step_avg:150.52ms step:425/1480 train_time:62465ms step_avg:150.52ms step:426/1480 train_time:62616ms step_avg:150.52ms step:427/1480 train_time:62767ms step_avg:150.52ms step:428/1480 train_time:62918ms step_avg:150.52ms step:429/1480 train_time:63069ms step_avg:150.52ms step:430/1480 train_time:63220ms step_avg:150.52ms step:431/1480 train_time:63372ms step_avg:150.53ms step:432/1480 train_time:63522ms step_avg:150.53ms step:433/1480 train_time:63674ms step_avg:150.53ms step:434/1480 train_time:63824ms step_avg:150.53ms step:435/1480 train_time:63975ms step_avg:150.53ms step:436/1480 train_time:64126ms step_avg:150.53ms step:437/1480 train_time:64277ms step_avg:150.53ms step:438/1480 train_time:64428ms step_avg:150.53ms step:439/1480 train_time:64578ms step_avg:150.53ms step:440/1480 train_time:64730ms step_avg:150.53ms step:441/1480 train_time:64882ms step_avg:150.54ms step:442/1480 train_time:65036ms step_avg:150.55ms step:443/1480 train_time:65190ms step_avg:150.55ms step:444/1480 train_time:65342ms step_avg:150.56ms step:445/1480 train_time:65495ms step_avg:150.56ms step:446/1480 train_time:65648ms step_avg:150.57ms step:447/1480 train_time:65800ms step_avg:150.57ms step:448/1480 train_time:65952ms step_avg:150.58ms step:449/1480 train_time:66107ms step_avg:150.59ms step:450/1480 train_time:66261ms step_avg:150.59ms step:451/1480 train_time:66414ms step_avg:150.60ms step:452/1480 train_time:66568ms step_avg:150.61ms step:453/1480 train_time:66720ms step_avg:150.61ms step:454/1480 train_time:66873ms step_avg:150.62ms step:455/1480 train_time:67024ms step_avg:150.62ms step:456/1480 train_time:67177ms 
step_avg:150.62ms step:457/1480 train_time:67331ms step_avg:150.63ms step:458/1480 train_time:67484ms step_avg:150.63ms step:459/1480 train_time:67637ms step_avg:150.64ms step:460/1480 train_time:67791ms step_avg:150.65ms step:461/1480 train_time:67942ms step_avg:150.65ms step:462/1480 train_time:68095ms step_avg:150.65ms step:463/1480 train_time:68248ms step_avg:150.66ms step:464/1480 train_time:68402ms step_avg:150.66ms step:465/1480 train_time:68555ms step_avg:150.67ms step:466/1480 train_time:68709ms step_avg:150.68ms step:467/1480 train_time:68862ms step_avg:150.68ms step:468/1480 train_time:69015ms step_avg:150.69ms step:469/1480 train_time:69168ms step_avg:150.69ms step:470/1480 train_time:69320ms step_avg:150.70ms step:471/1480 train_time:69474ms step_avg:150.70ms step:472/1480 train_time:69626ms step_avg:150.71ms step:473/1480 train_time:69779ms step_avg:150.71ms step:474/1480 train_time:69934ms step_avg:150.72ms step:475/1480 train_time:70086ms step_avg:150.72ms step:476/1480 train_time:70239ms step_avg:150.73ms step:477/1480 train_time:70392ms step_avg:150.73ms step:478/1480 train_time:70545ms step_avg:150.74ms step:479/1480 train_time:70697ms step_avg:150.74ms step:480/1480 train_time:70850ms step_avg:150.74ms step:481/1480 train_time:71002ms step_avg:150.75ms step:482/1480 train_time:71155ms step_avg:150.75ms step:483/1480 train_time:71308ms step_avg:150.76ms step:484/1480 train_time:71462ms step_avg:150.76ms step:485/1480 train_time:71616ms step_avg:150.77ms step:486/1480 train_time:71768ms step_avg:150.77ms step:487/1480 train_time:71921ms step_avg:150.78ms step:488/1480 train_time:72075ms step_avg:150.78ms step:489/1480 train_time:72226ms step_avg:150.79ms step:490/1480 train_time:72379ms step_avg:150.79ms step:491/1480 train_time:72533ms step_avg:150.80ms step:492/1480 train_time:72686ms step_avg:150.80ms step:493/1480 train_time:72840ms step_avg:150.81ms step:494/1480 train_time:72992ms step_avg:150.81ms step:495/1480 train_time:73145ms 
step_avg:150.81ms step:496/1480 train_time:73298ms step_avg:150.82ms step:497/1480 train_time:73451ms step_avg:150.82ms step:498/1480 train_time:73605ms step_avg:150.83ms step:499/1480 train_time:73757ms step_avg:150.83ms step:500/1480 train_time:73910ms step_avg:150.84ms step:500/1480 val_loss:3.6887 train_time:73979ms step_avg:150.98ms step:501/1480 train_time:74071ms step_avg:150.86ms step:502/1480 train_time:74222ms step_avg:150.86ms step:503/1480 train_time:74375ms step_avg:150.86ms step:504/1480 train_time:74527ms step_avg:150.86ms step:505/1480 train_time:74679ms step_avg:150.87ms step:506/1480 train_time:74831ms step_avg:150.87ms step:507/1480 train_time:74984ms step_avg:150.87ms step:508/1480 train_time:75139ms step_avg:150.88ms step:509/1480 train_time:75292ms step_avg:150.89ms step:510/1480 train_time:75445ms step_avg:150.89ms step:511/1480 train_time:75598ms step_avg:150.90ms step:512/1480 train_time:75751ms step_avg:150.90ms step:513/1480 train_time:75904ms step_avg:150.90ms step:514/1480 train_time:76056ms step_avg:150.91ms step:515/1480 train_time:76210ms step_avg:150.91ms step:516/1480 train_time:76364ms step_avg:150.92ms step:517/1480 train_time:76519ms step_avg:150.93ms step:518/1480 train_time:76672ms step_avg:150.93ms step:519/1480 train_time:76825ms step_avg:150.93ms step:520/1480 train_time:76978ms step_avg:150.94ms step:521/1480 train_time:77131ms step_avg:150.94ms step:522/1480 train_time:77284ms step_avg:150.95ms step:523/1480 train_time:77439ms step_avg:150.95ms step:524/1480 train_time:77591ms step_avg:150.96ms step:525/1480 train_time:77744ms step_avg:150.96ms step:526/1480 train_time:77898ms step_avg:150.96ms step:527/1480 train_time:78050ms step_avg:150.97ms step:528/1480 train_time:78202ms step_avg:150.97ms step:529/1480 train_time:78356ms step_avg:150.97ms step:530/1480 train_time:78509ms step_avg:150.98ms step:531/1480 train_time:78662ms step_avg:150.98ms step:532/1480 train_time:78816ms step_avg:150.99ms step:533/1480 
train_time:78968ms step_avg:150.99ms step:534/1480 train_time:79122ms step_avg:151.00ms step:535/1480 train_time:79274ms step_avg:151.00ms step:536/1480 train_time:79427ms step_avg:151.00ms step:537/1480 train_time:79580ms step_avg:151.01ms step:538/1480 train_time:79735ms step_avg:151.01ms step:539/1480 train_time:79889ms step_avg:151.02ms step:540/1480 train_time:80042ms step_avg:151.02ms step:541/1480 train_time:80196ms step_avg:151.03ms step:542/1480 train_time:80347ms step_avg:151.03ms step:543/1480 train_time:80500ms step_avg:151.03ms step:544/1480 train_time:80653ms step_avg:151.04ms step:545/1480 train_time:80805ms step_avg:151.04ms step:546/1480 train_time:80958ms step_avg:151.04ms step:547/1480 train_time:81112ms step_avg:151.05ms step:548/1480 train_time:81266ms step_avg:151.05ms step:549/1480 train_time:81420ms step_avg:151.06ms step:550/1480 train_time:81574ms step_avg:151.06ms step:551/1480 train_time:81728ms step_avg:151.07ms step:552/1480 train_time:81883ms step_avg:151.08ms step:553/1480 train_time:82039ms step_avg:151.08ms step:554/1480 train_time:82194ms step_avg:151.09ms step:555/1480 train_time:82348ms step_avg:151.10ms step:556/1480 train_time:82502ms step_avg:151.10ms step:557/1480 train_time:82656ms step_avg:151.11ms step:558/1480 train_time:82811ms step_avg:151.11ms step:559/1480 train_time:82967ms step_avg:151.12ms step:560/1480 train_time:83122ms step_avg:151.13ms step:561/1480 train_time:83276ms step_avg:151.14ms step:562/1480 train_time:83429ms step_avg:151.14ms step:563/1480 train_time:83583ms step_avg:151.15ms step:564/1480 train_time:83739ms step_avg:151.15ms step:565/1480 train_time:83893ms step_avg:151.16ms step:566/1480 train_time:84049ms step_avg:151.17ms step:567/1480 train_time:84204ms step_avg:151.17ms step:568/1480 train_time:84358ms step_avg:151.18ms step:569/1480 train_time:84527ms step_avg:151.21ms step:570/1480 train_time:84668ms step_avg:151.19ms step:571/1480 train_time:84823ms step_avg:151.20ms step:572/1480 
train_time:84979ms step_avg:151.21ms step:573/1480 train_time:85134ms step_avg:151.21ms step:574/1480 train_time:85290ms step_avg:151.22ms step:575/1480 train_time:85444ms step_avg:151.23ms step:576/1480 train_time:85599ms step_avg:151.23ms step:577/1480 train_time:85752ms step_avg:151.24ms step:578/1480 train_time:85907ms step_avg:151.24ms step:579/1480 train_time:86061ms step_avg:151.25ms step:580/1480 train_time:86217ms step_avg:151.26ms step:581/1480 train_time:86371ms step_avg:151.26ms step:582/1480 train_time:86526ms step_avg:151.27ms step:583/1480 train_time:86680ms step_avg:151.27ms step:584/1480 train_time:86836ms step_avg:151.28ms step:585/1480 train_time:86990ms step_avg:151.29ms step:586/1480 train_time:87144ms step_avg:151.29ms step:587/1480 train_time:87300ms step_avg:151.30ms step:588/1480 train_time:87453ms step_avg:151.30ms step:589/1480 train_time:87608ms step_avg:151.31ms step:590/1480 train_time:87762ms step_avg:151.31ms step:591/1480 train_time:87917ms step_avg:151.32ms step:592/1480 train_time:88073ms step_avg:151.33ms step:593/1480 train_time:88228ms step_avg:151.33ms step:594/1480 train_time:88383ms step_avg:151.34ms step:595/1480 train_time:88539ms step_avg:151.35ms step:596/1480 train_time:88695ms step_avg:151.36ms step:597/1480 train_time:88850ms step_avg:151.36ms step:598/1480 train_time:89004ms step_avg:151.37ms step:599/1480 train_time:89158ms step_avg:151.37ms step:600/1480 train_time:89314ms step_avg:151.38ms step:601/1480 train_time:89470ms step_avg:151.39ms step:602/1480 train_time:89625ms step_avg:151.39ms step:603/1480 train_time:89780ms step_avg:151.40ms step:604/1480 train_time:89934ms step_avg:151.40ms step:605/1480 train_time:90090ms step_avg:151.41ms step:606/1480 train_time:90246ms step_avg:151.42ms step:607/1480 train_time:90401ms step_avg:151.43ms step:608/1480 train_time:90556ms step_avg:151.43ms step:609/1480 train_time:90711ms step_avg:151.44ms step:610/1480 train_time:90866ms step_avg:151.44ms step:611/1480 
train_time:91020ms step_avg:151.45ms step:612/1480 train_time:91175ms step_avg:151.45ms step:613/1480 train_time:91329ms step_avg:151.46ms step:614/1480 train_time:91484ms step_avg:151.46ms step:615/1480 train_time:91640ms step_avg:151.47ms step:616/1480 train_time:91795ms step_avg:151.48ms step:617/1480 train_time:91949ms step_avg:151.48ms step:618/1480 train_time:92103ms step_avg:151.48ms step:619/1480 train_time:92257ms step_avg:151.49ms step:620/1480 train_time:92412ms step_avg:151.50ms step:621/1480 train_time:92567ms step_avg:151.50ms step:622/1480 train_time:92721ms step_avg:151.51ms step:623/1480 train_time:92877ms step_avg:151.51ms step:624/1480 train_time:93032ms step_avg:151.52ms step:625/1480 train_time:93188ms step_avg:151.52ms step:625/1480 val_loss:3.6083 train_time:93258ms step_avg:151.64ms step:626/1480 train_time:93354ms step_avg:151.55ms step:627/1480 train_time:93504ms step_avg:151.55ms step:628/1480 train_time:93659ms step_avg:151.55ms step:629/1480 train_time:93814ms step_avg:151.56ms step:630/1480 train_time:93968ms step_avg:151.56ms step:631/1480 train_time:94121ms step_avg:151.56ms step:632/1480 train_time:94277ms step_avg:151.57ms step:633/1480 train_time:94433ms step_avg:151.58ms step:634/1480 train_time:94588ms step_avg:151.58ms step:635/1480 train_time:94743ms step_avg:151.59ms step:636/1480 train_time:94896ms step_avg:151.59ms step:637/1480 train_time:95051ms step_avg:151.60ms step:638/1480 train_time:95207ms step_avg:151.60ms step:639/1480 train_time:95363ms step_avg:151.61ms step:640/1480 train_time:95517ms step_avg:151.61ms step:641/1480 train_time:95673ms step_avg:151.62ms step:642/1480 train_time:95827ms step_avg:151.63ms step:643/1480 train_time:95982ms step_avg:151.63ms step:644/1480 train_time:96136ms step_avg:151.63ms step:645/1480 train_time:96292ms step_avg:151.64ms step:646/1480 train_time:96446ms step_avg:151.64ms step:647/1480 train_time:96600ms step_avg:151.65ms step:648/1480 train_time:96755ms step_avg:151.65ms 
step:649/1480 train_time:96910ms step_avg:151.66ms step:650/1480 train_time:97065ms step_avg:151.66ms step:651/1480 train_time:97219ms step_avg:151.67ms step:652/1480 train_time:97374ms step_avg:151.67ms step:653/1480 train_time:97528ms step_avg:151.68ms step:654/1480 train_time:97682ms step_avg:151.68ms step:655/1480 train_time:97837ms step_avg:151.69ms step:656/1480 train_time:97992ms step_avg:151.69ms step:657/1480 train_time:98146ms step_avg:151.69ms step:658/1480 train_time:98301ms step_avg:151.70ms step:659/1480 train_time:98455ms step_avg:151.70ms step:660/1480 train_time:98612ms step_avg:151.71ms step:661/1480 train_time:98769ms step_avg:151.72ms step:662/1480 train_time:98927ms step_avg:151.73ms step:663/1480 train_time:99082ms step_avg:151.73ms step:664/1480 train_time:99239ms step_avg:151.74ms step:665/1480 train_time:99396ms step_avg:151.75ms step:666/1480 train_time:99551ms step_avg:151.75ms step:667/1480 train_time:99710ms step_avg:151.77ms step:668/1480 train_time:99866ms step_avg:151.77ms step:669/1480 train_time:100022ms step_avg:151.78ms step:670/1480 train_time:100178ms step_avg:151.79ms step:671/1480 train_time:100334ms step_avg:151.79ms step:672/1480 train_time:100491ms step_avg:151.80ms step:673/1480 train_time:100648ms step_avg:151.81ms step:674/1480 train_time:100804ms step_avg:151.81ms step:675/1480 train_time:100962ms step_avg:151.82ms step:676/1480 train_time:101118ms step_avg:151.83ms step:677/1480 train_time:101274ms step_avg:151.84ms step:678/1480 train_time:101429ms step_avg:151.84ms step:679/1480 train_time:101586ms step_avg:151.85ms step:680/1480 train_time:101744ms step_avg:151.86ms step:681/1480 train_time:101899ms step_avg:151.86ms step:682/1480 train_time:102056ms step_avg:151.87ms step:683/1480 train_time:102213ms step_avg:151.88ms step:684/1480 train_time:102369ms step_avg:151.88ms step:685/1480 train_time:102525ms step_avg:151.89ms step:686/1480 train_time:102681ms step_avg:151.90ms step:687/1480 train_time:102838ms 
step_avg:151.90ms step:688/1480 train_time:102996ms step_avg:151.91ms step:689/1480 train_time:103154ms step_avg:151.92ms step:690/1480 train_time:103311ms step_avg:151.93ms step:691/1480 train_time:103467ms step_avg:151.93ms step:692/1480 train_time:103623ms step_avg:151.94ms step:693/1480 train_time:103779ms step_avg:151.95ms step:694/1480 train_time:103936ms step_avg:151.95ms step:695/1480 train_time:104093ms step_avg:151.96ms step:696/1480 train_time:104248ms step_avg:151.96ms step:697/1480 train_time:104405ms step_avg:151.97ms step:698/1480 train_time:104561ms step_avg:151.98ms step:699/1480 train_time:104717ms step_avg:151.98ms step:700/1480 train_time:104875ms step_avg:151.99ms step:701/1480 train_time:105031ms step_avg:152.00ms step:702/1480 train_time:105189ms step_avg:152.01ms step:703/1480 train_time:105345ms step_avg:152.01ms step:704/1480 train_time:105501ms step_avg:152.02ms step:705/1480 train_time:105658ms step_avg:152.03ms step:706/1480 train_time:105815ms step_avg:152.03ms step:707/1480 train_time:105971ms step_avg:152.04ms step:708/1480 train_time:106128ms step_avg:152.05ms step:709/1480 train_time:106284ms step_avg:152.05ms step:710/1480 train_time:106440ms step_avg:152.06ms step:711/1480 train_time:106597ms step_avg:152.06ms step:712/1480 train_time:106754ms step_avg:152.07ms step:713/1480 train_time:106911ms step_avg:152.08ms step:714/1480 train_time:107068ms step_avg:152.09ms step:715/1480 train_time:107223ms step_avg:152.09ms step:716/1480 train_time:107378ms step_avg:152.09ms step:717/1480 train_time:107535ms step_avg:152.10ms step:718/1480 train_time:107690ms step_avg:152.10ms step:719/1480 train_time:107846ms step_avg:152.11ms step:720/1480 train_time:108004ms step_avg:152.12ms step:721/1480 train_time:108162ms step_avg:152.13ms step:722/1480 train_time:108317ms step_avg:152.13ms step:723/1480 train_time:108474ms step_avg:152.14ms step:724/1480 train_time:108630ms step_avg:152.14ms step:725/1480 train_time:108787ms step_avg:152.15ms 
step:726/1480 train_time:108943ms step_avg:152.16ms step:727/1480 train_time:109102ms step_avg:152.16ms step:728/1480 train_time:109257ms step_avg:152.17ms step:729/1480 train_time:109417ms step_avg:152.18ms step:730/1480 train_time:109576ms step_avg:152.19ms step:731/1480 train_time:109733ms step_avg:152.20ms step:732/1480 train_time:109888ms step_avg:152.20ms step:733/1480 train_time:110045ms step_avg:152.21ms step:734/1480 train_time:110202ms step_avg:152.21ms step:735/1480 train_time:110359ms step_avg:152.22ms step:736/1480 train_time:110515ms step_avg:152.22ms step:737/1480 train_time:110671ms step_avg:152.23ms step:738/1480 train_time:110826ms step_avg:152.23ms step:739/1480 train_time:110984ms step_avg:152.24ms step:740/1480 train_time:111142ms step_avg:152.25ms step:741/1480 train_time:111300ms step_avg:152.26ms step:742/1480 train_time:111456ms step_avg:152.26ms step:743/1480 train_time:111612ms step_avg:152.27ms step:744/1480 train_time:111768ms step_avg:152.27ms step:745/1480 train_time:111926ms step_avg:152.28ms step:746/1480 train_time:112082ms step_avg:152.29ms step:747/1480 train_time:112240ms step_avg:152.29ms step:748/1480 train_time:112397ms step_avg:152.30ms step:749/1480 train_time:112554ms step_avg:152.31ms step:750/1480 train_time:112711ms step_avg:152.31ms step:750/1480 val_loss:3.5516 train_time:112782ms step_avg:152.41ms step:751/1480 train_time:112879ms step_avg:152.33ms step:752/1480 train_time:113029ms step_avg:152.33ms step:753/1480 train_time:113185ms step_avg:152.33ms step:754/1480 train_time:113340ms step_avg:152.34ms step:755/1480 train_time:113496ms step_avg:152.34ms step:756/1480 train_time:113653ms step_avg:152.35ms step:757/1480 train_time:113810ms step_avg:152.36ms step:758/1480 train_time:113968ms step_avg:152.36ms step:759/1480 train_time:114138ms step_avg:152.39ms step:760/1480 train_time:114282ms step_avg:152.38ms step:761/1480 train_time:114438ms step_avg:152.38ms step:762/1480 train_time:114595ms step_avg:152.39ms 
step:763/1480 train_time:114752ms step_avg:152.39ms step:764/1480 train_time:114909ms step_avg:152.40ms step:765/1480 train_time:115067ms step_avg:152.41ms step:766/1480 train_time:115225ms step_avg:152.41ms step:767/1480 train_time:115382ms step_avg:152.42ms step:768/1480 train_time:115539ms step_avg:152.43ms step:769/1480 train_time:115697ms step_avg:152.43ms step:770/1480 train_time:115854ms step_avg:152.44ms step:771/1480 train_time:116011ms step_avg:152.45ms step:772/1480 train_time:116169ms step_avg:152.45ms step:773/1480 train_time:116328ms step_avg:152.46ms step:774/1480 train_time:116486ms step_avg:152.47ms step:775/1480 train_time:116646ms step_avg:152.48ms step:776/1480 train_time:116805ms step_avg:152.49ms step:777/1480 train_time:116965ms step_avg:152.50ms step:778/1480 train_time:117124ms step_avg:152.50ms step:779/1480 train_time:117281ms step_avg:152.51ms step:780/1480 train_time:117439ms step_avg:152.52ms step:781/1480 train_time:117598ms step_avg:152.53ms step:782/1480 train_time:117756ms step_avg:152.53ms step:783/1480 train_time:117912ms step_avg:152.54ms step:784/1480 train_time:118072ms step_avg:152.55ms step:785/1480 train_time:118231ms step_avg:152.56ms step:786/1480 train_time:118388ms step_avg:152.56ms step:787/1480 train_time:118548ms step_avg:152.57ms step:788/1480 train_time:118706ms step_avg:152.58ms step:789/1480 train_time:118863ms step_avg:152.58ms step:790/1480 train_time:119020ms step_avg:152.59ms step:791/1480 train_time:119180ms step_avg:152.60ms step:792/1480 train_time:119338ms step_avg:152.61ms step:793/1480 train_time:119496ms step_avg:152.61ms step:794/1480 train_time:119654ms step_avg:152.62ms step:795/1480 train_time:119813ms step_avg:152.63ms step:796/1480 train_time:119973ms step_avg:152.64ms step:797/1480 train_time:120133ms step_avg:152.65ms step:798/1480 train_time:120291ms step_avg:152.65ms step:799/1480 train_time:120452ms step_avg:152.66ms step:800/1480 train_time:120612ms step_avg:152.67ms step:801/1480 
train_time:120771ms step_avg:152.68ms step:802/1480 train_time:120928ms step_avg:152.69ms step:803/1480 train_time:121087ms step_avg:152.69ms step:804/1480 train_time:121245ms step_avg:152.70ms step:805/1480 train_time:121403ms step_avg:152.71ms step:806/1480 train_time:121560ms step_avg:152.71ms step:807/1480 train_time:121717ms step_avg:152.72ms step:808/1480 train_time:121875ms step_avg:152.73ms step:809/1480 train_time:122032ms step_avg:152.73ms step:810/1480 train_time:122189ms step_avg:152.74ms step:811/1480 train_time:122347ms step_avg:152.74ms step:812/1480 train_time:122505ms step_avg:152.75ms step:813/1480 train_time:122662ms step_avg:152.75ms step:814/1480 train_time:122820ms step_avg:152.76ms step:815/1480 train_time:122978ms step_avg:152.77ms step:816/1480 train_time:123135ms step_avg:152.77ms step:817/1480 train_time:123295ms step_avg:152.78ms step:818/1480 train_time:123452ms step_avg:152.79ms step:819/1480 train_time:123610ms step_avg:152.79ms step:820/1480 train_time:123768ms step_avg:152.80ms step:821/1480 train_time:123926ms step_avg:152.81ms step:822/1480 train_time:124084ms step_avg:152.81ms step:823/1480 train_time:124242ms step_avg:152.82ms step:824/1480 train_time:124399ms step_avg:152.82ms step:825/1480 train_time:124558ms step_avg:152.83ms step:826/1480 train_time:124718ms step_avg:152.84ms step:827/1480 train_time:124877ms step_avg:152.85ms step:828/1480 train_time:125033ms step_avg:152.85ms step:829/1480 train_time:125193ms step_avg:152.86ms step:830/1480 train_time:125353ms step_avg:152.87ms step:831/1480 train_time:125511ms step_avg:152.88ms step:832/1480 train_time:125669ms step_avg:152.88ms step:833/1480 train_time:125828ms step_avg:152.89ms step:834/1480 train_time:125987ms step_avg:152.90ms step:835/1480 train_time:126144ms step_avg:152.90ms step:836/1480 train_time:126304ms step_avg:152.91ms step:837/1480 train_time:126462ms step_avg:152.92ms step:838/1480 train_time:126618ms step_avg:152.92ms step:839/1480 train_time:126776ms 
step_avg:152.93ms step:840/1480 train_time:126932ms step_avg:152.93ms step:841/1480 train_time:127090ms step_avg:152.94ms step:842/1480 train_time:127250ms step_avg:152.94ms step:843/1480 train_time:127408ms step_avg:152.95ms step:844/1480 train_time:127566ms step_avg:152.96ms step:845/1480 train_time:127725ms step_avg:152.96ms step:846/1480 train_time:127885ms step_avg:152.97ms step:847/1480 train_time:128044ms step_avg:152.98ms step:848/1480 train_time:128201ms step_avg:152.98ms step:849/1480 train_time:128360ms step_avg:152.99ms step:850/1480 train_time:128518ms step_avg:153.00ms step:851/1480 train_time:128677ms step_avg:153.01ms step:852/1480 train_time:128835ms step_avg:153.01ms step:853/1480 train_time:128992ms step_avg:153.02ms step:854/1480 train_time:129153ms step_avg:153.02ms step:855/1480 train_time:129309ms step_avg:153.03ms step:856/1480 train_time:129468ms step_avg:153.03ms step:857/1480 train_time:129626ms step_avg:153.04ms step:858/1480 train_time:129788ms step_avg:153.05ms step:859/1480 train_time:129948ms step_avg:153.06ms step:860/1480 train_time:130105ms step_avg:153.07ms step:861/1480 train_time:130265ms step_avg:153.07ms step:862/1480 train_time:130424ms step_avg:153.08ms step:863/1480 train_time:130582ms step_avg:153.09ms step:864/1480 train_time:130740ms step_avg:153.09ms step:865/1480 train_time:130897ms step_avg:153.10ms step:866/1480 train_time:131056ms step_avg:153.10ms step:867/1480 train_time:131214ms step_avg:153.11ms step:868/1480 train_time:131372ms step_avg:153.11ms step:869/1480 train_time:131530ms step_avg:153.12ms step:870/1480 train_time:131690ms step_avg:153.13ms step:871/1480 train_time:131847ms step_avg:153.13ms step:872/1480 train_time:132004ms step_avg:153.14ms step:873/1480 train_time:132160ms step_avg:153.14ms step:874/1480 train_time:132319ms step_avg:153.15ms step:875/1480 train_time:132478ms step_avg:153.15ms step:875/1480 val_loss:3.5064 train_time:132550ms step_avg:153.24ms step:876/1480 train_time:132647ms 
step_avg:153.17ms step:877/1480 train_time:132797ms step_avg:153.17ms step:878/1480 train_time:132955ms step_avg:153.17ms step:879/1480 train_time:133112ms step_avg:153.18ms step:880/1480 train_time:133272ms step_avg:153.19ms step:881/1480 train_time:133429ms step_avg:153.19ms step:882/1480 train_time:133588ms step_avg:153.20ms step:883/1480 train_time:133748ms step_avg:153.21ms step:884/1480 train_time:133909ms step_avg:153.21ms step:885/1480 train_time:134069ms step_avg:153.22ms step:886/1480 train_time:134229ms step_avg:153.23ms step:887/1480 train_time:134389ms step_avg:153.24ms step:888/1480 train_time:134553ms step_avg:153.25ms step:889/1480 train_time:134713ms step_avg:153.26ms step:890/1480 train_time:134872ms step_avg:153.26ms step:891/1480 train_time:135031ms step_avg:153.27ms step:892/1480 train_time:135191ms step_avg:153.28ms step:893/1480 train_time:135349ms step_avg:153.28ms step:894/1480 train_time:135508ms step_avg:153.29ms step:895/1480 train_time:135670ms step_avg:153.30ms step:896/1480 train_time:135829ms step_avg:153.31ms step:897/1480 train_time:135989ms step_avg:153.31ms step:898/1480 train_time:136149ms step_avg:153.32ms step:899/1480 train_time:136308ms step_avg:153.33ms step:900/1480 train_time:136467ms step_avg:153.33ms step:901/1480 train_time:136627ms step_avg:153.34ms step:902/1480 train_time:136785ms step_avg:153.35ms step:903/1480 train_time:136948ms step_avg:153.36ms step:904/1480 train_time:137107ms step_avg:153.36ms step:905/1480 train_time:137265ms step_avg:153.37ms step:906/1480 train_time:137424ms step_avg:153.37ms step:907/1480 train_time:137586ms step_avg:153.38ms step:908/1480 train_time:137745ms step_avg:153.39ms step:909/1480 train_time:137904ms step_avg:153.40ms step:910/1480 train_time:138067ms step_avg:153.41ms step:911/1480 train_time:138227ms step_avg:153.42ms step:912/1480 train_time:138387ms step_avg:153.42ms step:913/1480 train_time:138549ms step_avg:153.43ms step:914/1480 train_time:138709ms step_avg:153.44ms 
step:915/1480 train_time:138870ms step_avg:153.45ms step:916/1480 train_time:139029ms step_avg:153.45ms step:917/1480 train_time:139187ms step_avg:153.46ms step:918/1480 train_time:139350ms step_avg:153.47ms step:919/1480 train_time:139511ms step_avg:153.48ms step:920/1480 train_time:139671ms step_avg:153.48ms step:921/1480 train_time:139830ms step_avg:153.49ms step:922/1480 train_time:139991ms step_avg:153.50ms step:923/1480 train_time:140150ms step_avg:153.50ms step:924/1480 train_time:140307ms step_avg:153.51ms step:925/1480 train_time:140466ms step_avg:153.52ms step:926/1480 train_time:140626ms step_avg:153.52ms step:927/1480 train_time:140784ms step_avg:153.53ms step:928/1480 train_time:140944ms step_avg:153.53ms step:929/1480 train_time:141104ms step_avg:153.54ms step:930/1480 train_time:141265ms step_avg:153.55ms step:931/1480 train_time:141425ms step_avg:153.56ms step:932/1480 train_time:141584ms step_avg:153.56ms step:933/1480 train_time:141744ms step_avg:153.57ms step:934/1480 train_time:141903ms step_avg:153.57ms step:935/1480 train_time:142065ms step_avg:153.58ms step:936/1480 train_time:142226ms step_avg:153.59ms step:937/1480 train_time:142387ms step_avg:153.60ms step:938/1480 train_time:142544ms step_avg:153.60ms step:939/1480 train_time:142707ms step_avg:153.61ms step:940/1480 train_time:142869ms step_avg:153.62ms step:941/1480 train_time:143027ms step_avg:153.63ms step:942/1480 train_time:143185ms step_avg:153.63ms step:943/1480 train_time:143346ms step_avg:153.64ms step:944/1480 train_time:143508ms step_avg:153.65ms step:945/1480 train_time:143667ms step_avg:153.65ms step:946/1480 train_time:143830ms step_avg:153.66ms step:947/1480 train_time:143991ms step_avg:153.67ms step:948/1480 train_time:144150ms step_avg:153.68ms step:949/1480 train_time:144316ms step_avg:153.69ms step:950/1480 train_time:144467ms step_avg:153.69ms step:951/1480 train_time:144629ms step_avg:153.70ms step:952/1480 train_time:144788ms step_avg:153.70ms step:953/1480 
train_time:144948ms step_avg:153.71ms step:954/1480 train_time:145109ms step_avg:153.72ms step:955/1480 train_time:145268ms step_avg:153.72ms step:956/1480 train_time:145426ms step_avg:153.73ms step:957/1480 train_time:145587ms step_avg:153.73ms step:958/1480 train_time:145752ms step_avg:153.75ms step:959/1480 train_time:145909ms step_avg:153.75ms step:960/1480 train_time:146071ms step_avg:153.76ms step:961/1480 train_time:146230ms step_avg:153.76ms step:962/1480 train_time:146389ms step_avg:153.77ms step:963/1480 train_time:146550ms step_avg:153.78ms step:964/1480 train_time:146710ms step_avg:153.78ms step:965/1480 train_time:146868ms step_avg:153.79ms step:966/1480 train_time:147027ms step_avg:153.79ms step:967/1480 train_time:147185ms step_avg:153.80ms step:968/1480 train_time:147345ms step_avg:153.80ms step:969/1480 train_time:147505ms step_avg:153.81ms step:970/1480 train_time:147664ms step_avg:153.82ms step:971/1480 train_time:147823ms step_avg:153.82ms step:972/1480 train_time:147981ms step_avg:153.83ms step:973/1480 train_time:148138ms step_avg:153.83ms step:974/1480 train_time:148297ms step_avg:153.83ms step:975/1480 train_time:148456ms step_avg:153.84ms step:976/1480 train_time:148615ms step_avg:153.85ms step:977/1480 train_time:148773ms step_avg:153.85ms step:978/1480 train_time:148932ms step_avg:153.86ms step:979/1480 train_time:149091ms step_avg:153.86ms step:980/1480 train_time:149251ms step_avg:153.87ms step:981/1480 train_time:149411ms step_avg:153.87ms step:982/1480 train_time:149570ms step_avg:153.88ms step:983/1480 train_time:149731ms step_avg:153.89ms step:984/1480 train_time:149889ms step_avg:153.89ms step:985/1480 train_time:150051ms step_avg:153.90ms step:986/1480 train_time:150209ms step_avg:153.90ms step:987/1480 train_time:150368ms step_avg:153.91ms step:988/1480 train_time:150529ms step_avg:153.92ms step:989/1480 train_time:150688ms step_avg:153.92ms step:990/1480 train_time:150849ms step_avg:153.93ms step:991/1480 train_time:151009ms 
step_avg:153.93ms step:992/1480 train_time:151174ms step_avg:153.95ms step:993/1480 train_time:151342ms step_avg:153.96ms step:994/1480 train_time:151502ms step_avg:153.97ms step:995/1480 train_time:151661ms step_avg:153.97ms step:996/1480 train_time:151818ms step_avg:153.97ms step:997/1480 train_time:151979ms step_avg:153.98ms step:998/1480 train_time:152140ms step_avg:153.99ms step:999/1480 train_time:152300ms step_avg:153.99ms step:1000/1480 train_time:152462ms step_avg:154.00ms step:1000/1480 val_loss:3.4434 train_time:152535ms step_avg:154.08ms step:1001/1480 train_time:152627ms step_avg:154.01ms step:1002/1480 train_time:152785ms step_avg:154.02ms step:1003/1480 train_time:152949ms step_avg:154.03ms step:1004/1480 train_time:153112ms step_avg:154.04ms step:1005/1480 train_time:153271ms step_avg:154.04ms step:1006/1480 train_time:153432ms step_avg:154.05ms step:1007/1480 train_time:153593ms step_avg:154.05ms step:1008/1480 train_time:153753ms step_avg:154.06ms step:1009/1480 train_time:153920ms step_avg:154.07ms step:1010/1480 train_time:154079ms step_avg:154.08ms step:1011/1480 train_time:154239ms step_avg:154.09ms step:1012/1480 train_time:154397ms step_avg:154.09ms step:1013/1480 train_time:154557ms step_avg:154.10ms step:1014/1480 train_time:154717ms step_avg:154.10ms step:1015/1480 train_time:154879ms step_avg:154.11ms step:1016/1480 train_time:155038ms step_avg:154.11ms step:1017/1480 train_time:155201ms step_avg:154.12ms step:1018/1480 train_time:155361ms step_avg:154.13ms step:1019/1480 train_time:155523ms step_avg:154.14ms step:1020/1480 train_time:155684ms step_avg:154.14ms step:1021/1480 train_time:155845ms step_avg:154.15ms step:1022/1480 train_time:156005ms step_avg:154.16ms step:1023/1480 train_time:156166ms step_avg:154.16ms step:1024/1480 train_time:156328ms step_avg:154.17ms step:1025/1480 train_time:156490ms step_avg:154.18ms step:1026/1480 train_time:156649ms step_avg:154.18ms step:1027/1480 train_time:156808ms step_avg:154.19ms 
step:1028/1480 train_time:156970ms step_avg:154.19ms step:1029/1480 train_time:157135ms step_avg:154.21ms step:1030/1480 train_time:157295ms step_avg:154.21ms step:1031/1480 train_time:157455ms step_avg:154.22ms step:1032/1480 train_time:157618ms step_avg:154.23ms step:1033/1480 train_time:157778ms step_avg:154.23ms step:1034/1480 train_time:157937ms step_avg:154.24ms step:1035/1480 train_time:158097ms step_avg:154.24ms step:1036/1480 train_time:158257ms step_avg:154.25ms step:1037/1480 train_time:158419ms step_avg:154.25ms step:1038/1480 train_time:158581ms step_avg:154.26ms step:1039/1480 train_time:158745ms step_avg:154.27ms step:1040/1480 train_time:158906ms step_avg:154.28ms step:1041/1480 train_time:159066ms step_avg:154.28ms step:1042/1480 train_time:159225ms step_avg:154.29ms step:1043/1480 train_time:159384ms step_avg:154.29ms step:1044/1480 train_time:159543ms step_avg:154.30ms step:1045/1480 train_time:159706ms step_avg:154.31ms step:1046/1480 train_time:159865ms step_avg:154.31ms step:1047/1480 train_time:160027ms step_avg:154.32ms step:1048/1480 train_time:160188ms step_avg:154.32ms step:1049/1480 train_time:160349ms step_avg:154.33ms step:1050/1480 train_time:160511ms step_avg:154.34ms step:1051/1480 train_time:160671ms step_avg:154.34ms step:1052/1480 train_time:160833ms step_avg:154.35ms step:1053/1480 train_time:160993ms step_avg:154.36ms step:1054/1480 train_time:161154ms step_avg:154.36ms step:1055/1480 train_time:161314ms step_avg:154.37ms step:1056/1480 train_time:161474ms step_avg:154.37ms step:1057/1480 train_time:161634ms step_avg:154.38ms step:1058/1480 train_time:161797ms step_avg:154.39ms step:1059/1480 train_time:161960ms step_avg:154.39ms step:1060/1480 train_time:162122ms step_avg:154.40ms step:1061/1480 train_time:162281ms step_avg:154.41ms step:1062/1480 train_time:162440ms step_avg:154.41ms step:1063/1480 train_time:162601ms step_avg:154.42ms step:1064/1480 train_time:162759ms step_avg:154.42ms step:1065/1480 train_time:162922ms 
step_avg:154.43ms step:1066/1480 train_time:163083ms step_avg:154.43ms step:1067/1480 train_time:163246ms step_avg:154.44ms step:1068/1480 train_time:163407ms step_avg:154.45ms step:1069/1480 train_time:163569ms step_avg:154.46ms step:1070/1480 train_time:163729ms step_avg:154.46ms step:1071/1480 train_time:163891ms step_avg:154.47ms step:1072/1480 train_time:164051ms step_avg:154.47ms step:1073/1480 train_time:164208ms step_avg:154.48ms step:1074/1480 train_time:164367ms step_avg:154.48ms step:1075/1480 train_time:164530ms step_avg:154.49ms step:1076/1480 train_time:164690ms step_avg:154.49ms step:1077/1480 train_time:164850ms step_avg:154.50ms step:1078/1480 train_time:165015ms step_avg:154.51ms step:1079/1480 train_time:165180ms step_avg:154.52ms step:1080/1480 train_time:165340ms step_avg:154.52ms step:1081/1480 train_time:165501ms step_avg:154.53ms step:1082/1480 train_time:165662ms step_avg:154.54ms step:1083/1480 train_time:165824ms step_avg:154.54ms step:1084/1480 train_time:165987ms step_avg:154.55ms step:1085/1480 train_time:166147ms step_avg:154.56ms step:1086/1480 train_time:166308ms step_avg:154.56ms step:1087/1480 train_time:166467ms step_avg:154.57ms step:1088/1480 train_time:166628ms step_avg:154.57ms step:1089/1480 train_time:166791ms step_avg:154.58ms step:1090/1480 train_time:166953ms step_avg:154.59ms step:1091/1480 train_time:167115ms step_avg:154.59ms step:1092/1480 train_time:167275ms step_avg:154.60ms step:1093/1480 train_time:167435ms step_avg:154.60ms step:1094/1480 train_time:167595ms step_avg:154.61ms step:1095/1480 train_time:167755ms step_avg:154.61ms step:1096/1480 train_time:167917ms step_avg:154.62ms step:1097/1480 train_time:168078ms step_avg:154.63ms step:1098/1480 train_time:168244ms step_avg:154.64ms step:1099/1480 train_time:168406ms step_avg:154.64ms step:1100/1480 train_time:168568ms step_avg:154.65ms step:1101/1480 train_time:168733ms step_avg:154.66ms step:1102/1480 train_time:168894ms step_avg:154.67ms step:1103/1480 
train_time:169061ms step_avg:154.68ms step:1104/1480 train_time:169224ms step_avg:154.68ms step:1105/1480 train_time:169388ms step_avg:154.69ms step:1106/1480 train_time:169550ms step_avg:154.70ms step:1107/1480 train_time:169711ms step_avg:154.70ms step:1108/1480 train_time:169870ms step_avg:154.71ms step:1109/1480 train_time:170032ms step_avg:154.71ms step:1110/1480 train_time:170192ms step_avg:154.72ms step:1111/1480 train_time:170355ms step_avg:154.73ms step:1112/1480 train_time:170517ms step_avg:154.73ms step:1113/1480 train_time:170686ms step_avg:154.75ms step:1114/1480 train_time:170850ms step_avg:154.76ms step:1115/1480 train_time:171013ms step_avg:154.76ms step:1116/1480 train_time:171172ms step_avg:154.77ms step:1117/1480 train_time:171337ms step_avg:154.78ms step:1118/1480 train_time:171502ms step_avg:154.79ms step:1119/1480 train_time:171662ms step_avg:154.79ms step:1120/1480 train_time:171825ms step_avg:154.80ms step:1121/1480 train_time:171988ms step_avg:154.80ms step:1122/1480 train_time:172148ms step_avg:154.81ms step:1123/1480 train_time:172309ms step_avg:154.81ms step:1124/1480 train_time:172471ms step_avg:154.82ms step:1125/1480 train_time:172634ms step_avg:154.83ms step:1125/1480 val_loss:3.3871 train_time:172708ms step_avg:154.90ms step:1126/1480 train_time:172806ms step_avg:154.84ms step:1127/1480 train_time:172961ms step_avg:154.84ms step:1128/1480 train_time:173121ms step_avg:154.85ms step:1129/1480 train_time:173285ms step_avg:154.86ms step:1130/1480 train_time:173447ms step_avg:154.86ms step:1131/1480 train_time:173612ms step_avg:154.87ms step:1132/1480 train_time:173772ms step_avg:154.88ms step:1133/1480 train_time:173934ms step_avg:154.88ms step:1134/1480 train_time:174096ms step_avg:154.89ms step:1135/1480 train_time:174259ms step_avg:154.90ms step:1136/1480 train_time:174424ms step_avg:154.91ms step:1137/1480 train_time:174585ms step_avg:154.91ms step:1138/1480 train_time:174748ms step_avg:154.92ms step:1139/1480 train_time:174921ms 
step_avg:154.93ms step:1140/1480 train_time:175071ms step_avg:154.93ms step:1141/1480 train_time:175234ms step_avg:154.94ms step:1142/1480 train_time:175394ms step_avg:154.94ms step:1143/1480 train_time:175560ms step_avg:154.95ms step:1144/1480 train_time:175722ms step_avg:154.96ms step:1145/1480 train_time:175881ms step_avg:154.96ms step:1146/1480 train_time:176043ms step_avg:154.97ms step:1147/1480 train_time:176205ms step_avg:154.97ms step:1148/1480 train_time:176366ms step_avg:154.98ms step:1149/1480 train_time:176529ms step_avg:154.99ms step:1150/1480 train_time:176690ms step_avg:154.99ms step:1151/1480 train_time:176853ms step_avg:155.00ms step:1152/1480 train_time:177016ms step_avg:155.01ms step:1153/1480 train_time:177180ms step_avg:155.01ms step:1154/1480 train_time:177342ms step_avg:155.02ms step:1155/1480 train_time:177504ms step_avg:155.03ms step:1156/1480 train_time:177672ms step_avg:155.04ms step:1157/1480 train_time:177834ms step_avg:155.04ms step:1158/1480 train_time:177995ms step_avg:155.05ms step:1159/1480 train_time:178155ms step_avg:155.05ms step:1160/1480 train_time:178316ms step_avg:155.06ms step:1161/1480 train_time:178477ms step_avg:155.06ms step:1162/1480 train_time:178641ms step_avg:155.07ms step:1163/1480 train_time:178806ms step_avg:155.08ms step:1164/1480 train_time:178969ms step_avg:155.09ms step:1165/1480 train_time:179128ms step_avg:155.09ms step:1166/1480 train_time:179291ms step_avg:155.10ms step:1167/1480 train_time:179451ms step_avg:155.10ms step:1168/1480 train_time:179613ms step_avg:155.11ms step:1169/1480 train_time:179774ms step_avg:155.11ms step:1170/1480 train_time:179934ms step_avg:155.12ms step:1171/1480 train_time:180096ms step_avg:155.12ms step:1172/1480 train_time:180256ms step_avg:155.13ms step:1173/1480 train_time:180420ms step_avg:155.13ms step:1174/1480 train_time:180591ms step_avg:155.15ms step:1175/1480 train_time:180754ms step_avg:155.15ms step:1176/1480 train_time:180919ms step_avg:155.16ms step:1177/1480 
train_time:181085ms step_avg:155.17ms step:1178/1480 train_time:181246ms step_avg:155.18ms step:1179/1480 train_time:181406ms step_avg:155.18ms step:1180/1480 train_time:181574ms step_avg:155.19ms step:1181/1480 train_time:181736ms step_avg:155.20ms step:1182/1480 train_time:181898ms step_avg:155.20ms step:1183/1480 train_time:182060ms step_avg:155.21ms step:1184/1480 train_time:182224ms step_avg:155.22ms step:1185/1480 train_time:182389ms step_avg:155.23ms step:1186/1480 train_time:182551ms step_avg:155.23ms step:1187/1480 train_time:182723ms step_avg:155.24ms step:1188/1480 train_time:182884ms step_avg:155.25ms step:1189/1480 train_time:183046ms step_avg:155.26ms step:1190/1480 train_time:183208ms step_avg:155.26ms step:1191/1480 train_time:183373ms step_avg:155.27ms step:1192/1480 train_time:183532ms step_avg:155.27ms step:1193/1480 train_time:183693ms step_avg:155.28ms step:1194/1480 train_time:183855ms step_avg:155.28ms step:1195/1480 train_time:184016ms step_avg:155.29ms step:1196/1480 train_time:184189ms step_avg:155.30ms step:1197/1480 train_time:184350ms step_avg:155.31ms step:1198/1480 train_time:184519ms step_avg:155.32ms step:1199/1480 train_time:184682ms step_avg:155.33ms step:1200/1480 train_time:184845ms step_avg:155.33ms step:1201/1480 train_time:185005ms step_avg:155.34ms step:1202/1480 train_time:185175ms step_avg:155.35ms step:1203/1480 train_time:185340ms step_avg:155.36ms step:1204/1480 train_time:185505ms step_avg:155.36ms step:1205/1480 train_time:185668ms step_avg:155.37ms step:1206/1480 train_time:185828ms step_avg:155.37ms step:1207/1480 train_time:185991ms step_avg:155.38ms step:1208/1480 train_time:186151ms step_avg:155.38ms step:1209/1480 train_time:186314ms step_avg:155.39ms step:1210/1480 train_time:186480ms step_avg:155.40ms step:1211/1480 train_time:186644ms step_avg:155.41ms step:1212/1480 train_time:186807ms step_avg:155.41ms step:1213/1480 train_time:186972ms step_avg:155.42ms step:1214/1480 train_time:187137ms step_avg:155.43ms 
step:1215/1480 train_time:187301ms step_avg:155.44ms step:1216/1480 train_time:187463ms step_avg:155.44ms step:1217/1480 train_time:187626ms step_avg:155.45ms step:1218/1480 train_time:187789ms step_avg:155.45ms step:1219/1480 train_time:187955ms step_avg:155.46ms step:1220/1480 train_time:188118ms step_avg:155.47ms step:1221/1480 train_time:188279ms step_avg:155.47ms step:1222/1480 train_time:188440ms step_avg:155.48ms step:1223/1480 train_time:188603ms step_avg:155.48ms step:1224/1480 train_time:188772ms step_avg:155.50ms step:1225/1480 train_time:188935ms step_avg:155.50ms step:1226/1480 train_time:189099ms step_avg:155.51ms step:1227/1480 train_time:189265ms step_avg:155.52ms step:1228/1480 train_time:189428ms step_avg:155.52ms step:1229/1480 train_time:189591ms step_avg:155.53ms step:1230/1480 train_time:189761ms step_avg:155.54ms step:1231/1480 train_time:189927ms step_avg:155.55ms step:1232/1480 train_time:190093ms step_avg:155.56ms step:1233/1480 train_time:190253ms step_avg:155.56ms step:1234/1480 train_time:190414ms step_avg:155.57ms step:1235/1480 train_time:190581ms step_avg:155.58ms step:1236/1480 train_time:190742ms step_avg:155.58ms step:1237/1480 train_time:190904ms step_avg:155.59ms step:1238/1480 train_time:191076ms step_avg:155.60ms step:1239/1480 train_time:191238ms step_avg:155.60ms step:1240/1480 train_time:191406ms step_avg:155.61ms step:1241/1480 train_time:191571ms step_avg:155.62ms step:1242/1480 train_time:191732ms step_avg:155.63ms step:1243/1480 train_time:191896ms step_avg:155.63ms step:1244/1480 train_time:192057ms step_avg:155.64ms step:1245/1480 train_time:192220ms step_avg:155.64ms step:1246/1480 train_time:192383ms step_avg:155.65ms step:1247/1480 train_time:192545ms step_avg:155.65ms step:1248/1480 train_time:192706ms step_avg:155.66ms step:1249/1480 train_time:192869ms step_avg:155.66ms step:1250/1480 train_time:193030ms step_avg:155.67ms step:1250/1480 val_loss:3.3380 train_time:193106ms step_avg:155.73ms step:1251/1480 
train_time:193203ms step_avg:155.68ms step:1252/1480 train_time:193362ms step_avg:155.69ms step:1253/1480 train_time:193523ms step_avg:155.69ms step:1254/1480 train_time:193684ms step_avg:155.69ms step:1255/1480 train_time:193856ms step_avg:155.71ms step:1256/1480 train_time:194021ms step_avg:155.71ms step:1257/1480 train_time:194182ms step_avg:155.72ms step:1258/1480 train_time:194346ms step_avg:155.73ms step:1259/1480 train_time:194509ms step_avg:155.73ms step:1260/1480 train_time:194672ms step_avg:155.74ms step:1261/1480 train_time:194835ms step_avg:155.74ms step:1262/1480 train_time:195000ms step_avg:155.75ms step:1263/1480 train_time:195164ms step_avg:155.76ms step:1264/1480 train_time:195323ms step_avg:155.76ms step:1265/1480 train_time:195482ms step_avg:155.76ms step:1266/1480 train_time:195647ms step_avg:155.77ms step:1267/1480 train_time:195808ms step_avg:155.77ms step:1268/1480 train_time:195974ms step_avg:155.78ms step:1269/1480 train_time:196140ms step_avg:155.79ms step:1270/1480 train_time:196301ms step_avg:155.79ms step:1271/1480 train_time:196463ms step_avg:155.80ms step:1272/1480 train_time:196623ms step_avg:155.80ms step:1273/1480 train_time:196785ms step_avg:155.81ms step:1274/1480 train_time:196949ms step_avg:155.81ms step:1275/1480 train_time:197110ms step_avg:155.82ms step:1276/1480 train_time:197272ms step_avg:155.82ms step:1277/1480 train_time:197436ms step_avg:155.83ms step:1278/1480 train_time:197597ms step_avg:155.83ms step:1279/1480 train_time:197760ms step_avg:155.84ms step:1280/1480 train_time:197926ms step_avg:155.85ms step:1281/1480 train_time:198088ms step_avg:155.85ms step:1282/1480 train_time:198247ms step_avg:155.85ms step:1283/1480 train_time:198412ms step_avg:155.86ms step:1284/1480 train_time:198575ms step_avg:155.87ms step:1285/1480 train_time:198738ms step_avg:155.87ms step:1286/1480 train_time:198899ms step_avg:155.88ms step:1287/1480 train_time:199061ms step_avg:155.88ms step:1288/1480 train_time:199223ms step_avg:155.89ms 
step:1289/1480 train_time:199392ms step_avg:155.90ms step:1290/1480 train_time:199560ms step_avg:155.91ms step:1291/1480 train_time:199724ms step_avg:155.91ms step:1292/1480 train_time:199888ms step_avg:155.92ms step:1293/1480 train_time:200058ms step_avg:155.93ms step:1294/1480 train_time:200220ms step_avg:155.93ms step:1295/1480 train_time:200382ms step_avg:155.94ms step:1296/1480 train_time:200545ms step_avg:155.94ms step:1297/1480 train_time:200709ms step_avg:155.95ms step:1298/1480 train_time:200873ms step_avg:155.96ms step:1299/1480 train_time:201038ms step_avg:155.96ms step:1300/1480 train_time:201199ms step_avg:155.97ms step:1301/1480 train_time:201359ms step_avg:155.97ms step:1302/1480 train_time:201524ms step_avg:155.98ms step:1303/1480 train_time:201692ms step_avg:155.99ms step:1304/1480 train_time:201858ms step_avg:156.00ms step:1305/1480 train_time:202019ms step_avg:156.00ms step:1306/1480 train_time:202183ms step_avg:156.01ms step:1307/1480 train_time:202343ms step_avg:156.01ms step:1308/1480 train_time:202507ms step_avg:156.01ms step:1309/1480 train_time:202674ms step_avg:156.02ms step:1310/1480 train_time:202837ms step_avg:156.03ms step:1311/1480 train_time:202999ms step_avg:156.03ms step:1312/1480 train_time:203163ms step_avg:156.04ms step:1313/1480 train_time:203326ms step_avg:156.04ms step:1314/1480 train_time:203491ms step_avg:156.05ms step:1315/1480 train_time:203656ms step_avg:156.06ms step:1316/1480 train_time:203816ms step_avg:156.06ms step:1317/1480 train_time:203977ms step_avg:156.07ms step:1318/1480 train_time:204145ms step_avg:156.07ms step:1319/1480 train_time:204312ms step_avg:156.08ms step:1320/1480 train_time:204480ms step_avg:156.09ms step:1321/1480 train_time:204644ms step_avg:156.10ms step:1322/1480 train_time:204817ms step_avg:156.11ms step:1323/1480 train_time:204979ms step_avg:156.12ms step:1324/1480 train_time:205144ms step_avg:156.12ms step:1325/1480 train_time:205315ms step_avg:156.13ms step:1326/1480 train_time:205480ms 
step_avg:156.14ms step:1327/1480 train_time:205642ms step_avg:156.14ms step:1328/1480 train_time:205804ms step_avg:156.15ms step:1329/1480 train_time:205989ms step_avg:156.17ms step:1330/1480 train_time:206152ms step_avg:156.18ms step:1331/1480 train_time:206316ms step_avg:156.18ms step:1332/1480 train_time:206478ms step_avg:156.19ms step:1333/1480 train_time:206643ms step_avg:156.19ms step:1334/1480 train_time:206806ms step_avg:156.20ms step:1335/1480 train_time:206967ms step_avg:156.20ms step:1336/1480 train_time:207138ms step_avg:156.21ms step:1337/1480 train_time:207304ms step_avg:156.22ms step:1338/1480 train_time:207466ms step_avg:156.22ms step:1339/1480 train_time:207631ms step_avg:156.23ms step:1340/1480 train_time:207795ms step_avg:156.24ms step:1341/1480 train_time:207957ms step_avg:156.24ms step:1342/1480 train_time:208122ms step_avg:156.25ms step:1343/1480 train_time:208283ms step_avg:156.25ms step:1344/1480 train_time:208445ms step_avg:156.26ms step:1345/1480 train_time:208615ms step_avg:156.27ms step:1346/1480 train_time:208776ms step_avg:156.27ms step:1347/1480 train_time:208939ms step_avg:156.27ms step:1348/1480 train_time:209101ms step_avg:156.28ms step:1349/1480 train_time:209265ms step_avg:156.28ms step:1350/1480 train_time:209431ms step_avg:156.29ms step:1351/1480 train_time:209594ms step_avg:156.30ms step:1352/1480 train_time:209757ms step_avg:156.30ms step:1353/1480 train_time:209923ms step_avg:156.31ms step:1354/1480 train_time:210088ms step_avg:156.32ms step:1355/1480 train_time:210249ms step_avg:156.32ms step:1356/1480 train_time:210414ms step_avg:156.33ms step:1357/1480 train_time:210578ms step_avg:156.33ms step:1358/1480 train_time:210742ms step_avg:156.34ms step:1359/1480 train_time:210907ms step_avg:156.34ms step:1360/1480 train_time:211073ms step_avg:156.35ms step:1361/1480 train_time:211240ms step_avg:156.36ms step:1362/1480 train_time:211404ms step_avg:156.36ms step:1363/1480 train_time:211573ms step_avg:156.37ms step:1364/1480 
train_time:211736ms step_avg:156.38ms step:1365/1480 train_time:211898ms step_avg:156.38ms step:1366/1480 train_time:212060ms step_avg:156.39ms step:1367/1480 train_time:212223ms step_avg:156.39ms step:1368/1480 train_time:212388ms step_avg:156.40ms step:1369/1480 train_time:212560ms step_avg:156.41ms step:1370/1480 train_time:212727ms step_avg:156.42ms step:1371/1480 train_time:212891ms step_avg:156.42ms step:1372/1480 train_time:213059ms step_avg:156.43ms step:1373/1480 train_time:213220ms step_avg:156.43ms step:1374/1480 train_time:213386ms step_avg:156.44ms step:1375/1480 train_time:213548ms step_avg:156.45ms step:1375/1480 val_loss:3.2988 train_time:213623ms step_avg:156.50ms step:1376/1480 train_time:213715ms step_avg:156.45ms step:1377/1480 train_time:213881ms step_avg:156.46ms step:1378/1480 train_time:214043ms step_avg:156.46ms step:1379/1480 train_time:214207ms step_avg:156.47ms step:1380/1480 train_time:214371ms step_avg:156.48ms step:1381/1480 train_time:214539ms step_avg:156.48ms step:1382/1480 train_time:214704ms step_avg:156.49ms step:1383/1480 train_time:214867ms step_avg:156.49ms step:1384/1480 train_time:215033ms step_avg:156.50ms step:1385/1480 train_time:215194ms step_avg:156.50ms step:1386/1480 train_time:215356ms step_avg:156.51ms step:1387/1480 train_time:215520ms step_avg:156.51ms step:1388/1480 train_time:215682ms step_avg:156.52ms step:1389/1480 train_time:215847ms step_avg:156.52ms step:1390/1480 train_time:216010ms step_avg:156.53ms step:1391/1480 train_time:216173ms step_avg:156.53ms step:1392/1480 train_time:216338ms step_avg:156.54ms step:1393/1480 train_time:216501ms step_avg:156.54ms step:1394/1480 train_time:216663ms step_avg:156.55ms step:1395/1480 train_time:216825ms step_avg:156.55ms step:1396/1480 train_time:216987ms step_avg:156.56ms step:1397/1480 train_time:217147ms step_avg:156.56ms step:1398/1480 train_time:217308ms step_avg:156.56ms step:1399/1480 train_time:217469ms step_avg:156.57ms step:1400/1480 train_time:217638ms 
step_avg:156.57ms step:1401/1480 train_time:217798ms step_avg:156.58ms step:1402/1480 train_time:217961ms step_avg:156.58ms step:1403/1480 train_time:218127ms step_avg:156.59ms step:1404/1480 train_time:218289ms step_avg:156.59ms step:1405/1480 train_time:218454ms step_avg:156.60ms step:1406/1480 train_time:218619ms step_avg:156.60ms step:1407/1480 train_time:218781ms step_avg:156.61ms step:1408/1480 train_time:218943ms step_avg:156.61ms step:1409/1480 train_time:219117ms step_avg:156.62ms step:1410/1480 train_time:219280ms step_avg:156.63ms step:1411/1480 train_time:219440ms step_avg:156.63ms step:1412/1480 train_time:219601ms step_avg:156.63ms step:1413/1480 train_time:219764ms step_avg:156.64ms step:1414/1480 train_time:219927ms step_avg:156.64ms step:1415/1480 train_time:220091ms step_avg:156.65ms step:1416/1480 train_time:220265ms step_avg:156.66ms step:1417/1480 train_time:220429ms step_avg:156.67ms step:1418/1480 train_time:220593ms step_avg:156.67ms step:1419/1480 train_time:220758ms step_avg:156.68ms step:1420/1480 train_time:220923ms step_avg:156.68ms step:1421/1480 train_time:221088ms step_avg:156.69ms step:1422/1480 train_time:221253ms step_avg:156.70ms step:1423/1480 train_time:221416ms step_avg:156.70ms step:1424/1480 train_time:221584ms step_avg:156.71ms step:1425/1480 train_time:221753ms step_avg:156.72ms step:1426/1480 train_time:221918ms step_avg:156.72ms step:1427/1480 train_time:222084ms step_avg:156.73ms step:1428/1480 train_time:222246ms step_avg:156.73ms step:1429/1480 train_time:222406ms step_avg:156.73ms step:1430/1480 train_time:222570ms step_avg:156.74ms step:1431/1480 train_time:222737ms step_avg:156.75ms step:1432/1480 train_time:222905ms step_avg:156.75ms step:1433/1480 train_time:223075ms step_avg:156.76ms step:1434/1480 train_time:223244ms step_avg:156.77ms step:1435/1480 train_time:223409ms step_avg:156.78ms step:1436/1480 train_time:223573ms step_avg:156.78ms step:1437/1480 train_time:223736ms step_avg:156.79ms step:1438/1480 
train_time:223899ms step_avg:156.79ms step:1439/1480 train_time:224067ms step_avg:156.80ms step:1440/1480 train_time:224229ms step_avg:156.80ms step:1441/1480 train_time:224393ms step_avg:156.81ms step:1442/1480 train_time:224560ms step_avg:156.82ms step:1443/1480 train_time:224734ms step_avg:156.83ms step:1444/1480 train_time:224899ms step_avg:156.83ms step:1445/1480 train_time:225062ms step_avg:156.84ms step:1446/1480 train_time:225227ms step_avg:156.84ms step:1447/1480 train_time:225396ms step_avg:156.85ms step:1448/1480 train_time:225559ms step_avg:156.86ms step:1449/1480 train_time:225722ms step_avg:156.86ms step:1450/1480 train_time:225886ms step_avg:156.87ms step:1451/1480 train_time:226049ms step_avg:156.87ms step:1452/1480 train_time:226213ms step_avg:156.87ms step:1453/1480 train_time:226379ms step_avg:156.88ms step:1454/1480 train_time:226541ms step_avg:156.88ms step:1455/1480 train_time:226709ms step_avg:156.89ms step:1456/1480 train_time:226873ms step_avg:156.90ms step:1457/1480 train_time:227035ms step_avg:156.90ms step:1458/1480 train_time:227199ms step_avg:156.91ms step:1459/1480 train_time:227365ms step_avg:156.91ms step:1460/1480 train_time:227526ms step_avg:156.91ms step:1461/1480 train_time:227691ms step_avg:156.92ms step:1462/1480 train_time:227858ms step_avg:156.93ms step:1463/1480 train_time:228023ms step_avg:156.93ms step:1464/1480 train_time:228188ms step_avg:156.94ms step:1465/1480 train_time:228353ms step_avg:156.94ms step:1466/1480 train_time:228518ms step_avg:156.95ms step:1467/1480 train_time:228682ms step_avg:156.95ms step:1468/1480 train_time:228845ms step_avg:156.96ms step:1469/1480 train_time:229009ms step_avg:156.96ms step:1470/1480 train_time:229177ms step_avg:156.97ms step:1471/1480 train_time:229348ms step_avg:156.98ms step:1472/1480 train_time:229519ms step_avg:156.99ms step:1473/1480 train_time:229684ms step_avg:156.99ms step:1474/1480 train_time:229849ms step_avg:157.00ms step:1475/1480 train_time:230020ms step_avg:157.01ms 
step:1476/1480 train_time:230183ms step_avg:157.01ms step:1477/1480 train_time:230351ms step_avg:157.02ms step:1478/1480 train_time:230520ms step_avg:157.03ms step:1479/1480 train_time:230686ms step_avg:157.04ms step:1480/1480 train_time:230849ms step_avg:157.04ms step:1480/1480 val_loss:3.2796 train_time:230925ms step_avg:157.09ms peak memory consumption: 34239 MiB