import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time import contextlib from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB # ----------------------------------------------------------------------------- # Muon optimizer @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. """ assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) ns_steps: The number of Newton-Schulz iteration steps to use. """ def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5): self.world_size = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ['RANK']) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps) params = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { 'params': [p for p in params if p.numel() == size], 'update_buffer': [ torch.empty(size, device='cuda', dtype=torch.bfloat16) for _ in range(self.world_size) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr = group['lr'] momentum = group['momentum'] nesterov = group['nesterov'] ns_steps = group['ns_steps'] update_buffers = group['update_buffer'] # generate weight updates in distributed fashion params = group['params'] assert len(params) % self.world_size == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.world_size]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if 'momentum_buffer' not in state: state['momentum_buffer'] = torch.zeros_like(g) buf = state['momentum_buffer'] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.world_size] update_prev() # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, num_heads): super().__init__() assert dim % num_heads == 0 self.num_heads = num_heads self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x, vi, block_mask): B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q = self.c_q(x).view(B, T, self.num_heads, -1) k = self.c_k(x).view(B, T, self.num_heads, -1) v = self.c_v(x).view(B, T, self.num_heads, -1) v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977 q, k = norm(q), norm(k) # QK norm @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x): x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.model_dim, config.num_heads) self.mlp = MLP(config.model_dim) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x, vi, x0, block_mask): x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x class ValueEmbedding(nn.Module): def __init__(self, config: "GPTConfig"): super().__init__() self.__setattr__ self.embed = nn.ModuleList([ nn.Embedding(config.vocab_size, config.model_dim) for _ in range(6) ]) def forward(self, inputs) -> "list[torch.Tensor]": ve = [emb(inputs) for emb in self.embed] ve += reversed(ve) return ve # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 num_layers : int = 12 num_heads : int = 6 # head dim 128 suggested by @Grad62304977 model_dim : int = 768 class GPT(nn.Module): def __init__(self, config: GPTConfig): super().__init__() self.num_layers = config.num_layers # U-net design by @brendanh0gan self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection weights for decoder layers self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers)) self.embed = nn.Embedding(config.vocab_size, config.model_dim) self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)]) # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning # U-net structure on token value embeddings by @leloykun self.value_embeds = ValueEmbedding(config) self.lm_head = CastedLinear(config.model_dim, config.vocab_size) self.lm_head.weight.data.zero_() # @Grad62304977 def forward( self, inputs: torch.Tensor, targets: torch.Tensor, sliding_window_num_blocks: torch.Tensor, ): BLOCK_SIZE = 128 assert inputs.ndim == 1 docs = (inputs == 50256).cumsum(0) docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous() docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous() def document_causal(b, h, q_idx, kv_idx): causal_mask = q_idx >= kv_idx document_mask = docs[q_idx] == docs[kv_idx] return causal_mask & document_mask def dense_to_ordered(dense_mask: torch.Tensor): num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32) indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32) return num_blocks[None, None].contiguous(), indices[None, None].contiguous() def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor): kv_idx = block_idx = torch.arange(512, dtype=torch.int32, device="cuda") q_idx = block_idx[:, None] causal_bm = q_idx >= kv_idx causal_full_bm = q_idx > kv_idx window_bm = q_idx - kv_idx < sliding_window_num_blocks window_full_bm = window_bm # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx]) document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None]) document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None]) nonzero_bm = causal_bm & window_bm & document_bm full_bm = causal_full_bm & window_full_bm & document_full_bm kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm) full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm) return BlockMask.from_kv_blocks( kv_num_blocks, kv_indices, full_kv_num_blocks, full_kv_indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_causal, ) block_mask = create_doc_swc_block_mask(sliding_window_num_blocks) # forward the GPT model itself x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim) x = norm(x) # @Grad62304977 x0 = x ve = self.value_embeds(inputs) ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:] # Store outputs for U-Net skip connections skip_connections = [] # Encoder pass - process only the first half of the blocks for i in range(self.num_encoder_layers): x = self.blocks[i](x, ve_enc[i], x0, block_mask) skip_connections.append(x) # Decoder pass - process the remaining blocks with weighted skip connections for i in range(self.num_decoder_layers): x = x + self.skip_weights[i] * skip_connections.pop() # U-net structure on token value embeddings by @leloykun x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask) x = norm(x) logits = self.lm_head(x) logits = 30 * torch.tanh(logits / 30) # @Grad62304977 logits = logits.float() loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) return loss # ----------------------------------------------------------------------------- # Our own simple Distributed Data Loader def _peek_data_shard(file: Path): # only reads the header, returns header data # header is 256 int32 header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) assert header[0] == 20240520, "magic number mismatch in the data .bin file" assert header[1] == 1, "unsupported version" return int(header[2]) # number of tokens (claimed) def _load_data_shard(path: Path, num_tokens): with path.open("rb", buffering=0) as f: tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) f.seek(256 * 4) nbytes = f.readinto(tokens.numpy()) assert nbytes == 2 * num_tokens, "number of tokens read does not match header?" return tokens class DistributedDataLoader: def __init__(self, filename_pattern, seq_len, process_rank, num_processes): self.process_rank = process_rank self.num_processes = num_processes self.seq_len = seq_len # glob files that match the pattern self.files = sorted(Path.cwd().glob(filename_pattern)) assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" # load and validate all data shards, count number of tokens in total self.files_num_tokens = [_peek_data_shard(file) for file in self.files] assert min(self.files_num_tokens) >= num_processes * seq_len + 1 self.total_num_tokens = sum(self.files_num_tokens) self.reset() def reset(self): self.current_shard = -1 self.advance() def advance(self): # advance to next data shard self.current_shard = (self.current_shard + 1) % len(self.files) self.current_position = self.process_rank * self.seq_len self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard]) def next_batch(self): batch_size = self.seq_len * self.num_processes buf = self.tokens[self.current_position:self.current_position+self.seq_len+1] # host side async is sufficient; # no performance improvement was observed when introducing a separate stream. inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets # advance current position and load next shard if necessary self.current_position += batch_size if self.current_position + batch_size + 1 >= len(self.tokens): self.advance() return inputs, targets # ----------------------------------------------------------------------------- # int main @dataclass class Hyperparameters: # data hyperparams input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on # optimization hyperparams batch_size : int = 8 # batch size, in sequences, across all devices sequence_length : int = 64*1024 # sequence length, in tokens num_iterations : int = 1480 # number of iterations to run warmup_iters : int = 0 cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule weight_decay : float = 0 # evaluation and logging hyperparams val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end args = Hyperparameters() # set up DDP (distributed data parallel). torchrun sets this env variable ddp_rank = int(os.environ['RANK']) ddp_local_rank = int(os.environ['LOCAL_RANK']) ddp_world_size = int(os.environ['WORLD_SIZE']) assert torch.cuda.is_available() device = torch.device(f"cuda:{ddp_local_rank}") torch.cuda.set_device(device) print(f"using device: {device}") dist.init_process_group(backend='nccl', device_id=device) dist.barrier() master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc. # begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 09:07:10 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 37C P0 125W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 115W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 28C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 117W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 35C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29165ms step_avg:nanms step:2/1480 train_time:29349ms step_avg:nanms step:3/1480 train_time:29471ms step_avg:nanms step:4/1480 train_time:29612ms step_avg:nanms step:5/1480 train_time:29753ms step_avg:nanms step:6/1480 train_time:29898ms step_avg:nanms step:7/1480 train_time:30036ms step_avg:nanms step:8/1480 train_time:30179ms step_avg:nanms step:9/1480 train_time:30321ms step_avg:nanms step:10/1480 train_time:30467ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:424ms step_avg:141.42ms step:14/1480 train_time:566ms step_avg:141.50ms step:15/1480 train_time:709ms step_avg:141.79ms step:16/1480 train_time:853ms step_avg:142.10ms step:17/1480 train_time:997ms step_avg:142.41ms step:18/1480 train_time:1139ms step_avg:142.43ms step:19/1480 train_time:1281ms step_avg:142.32ms step:20/1480 train_time:1424ms step_avg:142.38ms step:21/1480 train_time:1564ms step_avg:142.23ms step:22/1480 train_time:1706ms step_avg:142.20ms step:23/1480 train_time:1850ms step_avg:142.29ms step:24/1480 train_time:1995ms step_avg:142.49ms step:25/1480 train_time:2140ms step_avg:142.67ms step:26/1480 train_time:2282ms step_avg:142.64ms step:27/1480 train_time:2423ms step_avg:142.55ms step:28/1480 train_time:2565ms step_avg:142.51ms step:29/1480 train_time:2707ms step_avg:142.47ms step:30/1480 train_time:3230ms step_avg:161.52ms step:31/1480 train_time:3333ms step_avg:158.69ms step:32/1480 train_time:3476ms step_avg:158.01ms step:33/1480 train_time:3619ms step_avg:157.34ms step:34/1480 train_time:3760ms step_avg:156.68ms step:35/1480 train_time:3902ms step_avg:156.07ms step:36/1480 train_time:4043ms step_avg:155.51ms step:37/1480 train_time:4188ms step_avg:155.11ms step:38/1480 train_time:4332ms step_avg:154.70ms step:39/1480 train_time:4475ms step_avg:154.33ms step:40/1480 train_time:4618ms step_avg:153.95ms step:41/1480 train_time:4760ms step_avg:153.56ms step:42/1480 train_time:4903ms step_avg:153.22ms step:43/1480 train_time:5045ms step_avg:152.89ms step:44/1480 train_time:5187ms step_avg:152.56ms step:45/1480 train_time:5331ms step_avg:152.32ms step:46/1480 train_time:5476ms step_avg:152.12ms step:47/1480 train_time:5620ms step_avg:151.90ms step:48/1480 train_time:5761ms step_avg:151.61ms step:49/1480 train_time:5903ms step_avg:151.35ms step:50/1480 train_time:6045ms step_avg:151.12ms step:51/1480 train_time:6188ms step_avg:150.93ms step:52/1480 train_time:6331ms step_avg:150.74ms step:53/1480 train_time:6475ms step_avg:150.59ms step:54/1480 train_time:6618ms step_avg:150.41ms step:55/1480 train_time:6760ms step_avg:150.22ms step:56/1480 train_time:6904ms step_avg:150.09ms step:57/1480 train_time:7046ms step_avg:149.92ms step:58/1480 train_time:7188ms step_avg:149.75ms step:59/1480 train_time:7331ms step_avg:149.62ms step:60/1480 train_time:7475ms step_avg:149.49ms step:61/1480 train_time:7619ms step_avg:149.38ms step:62/1480 train_time:7760ms step_avg:149.23ms step:63/1480 train_time:7904ms step_avg:149.13ms step:64/1480 train_time:8046ms step_avg:149.00ms step:65/1480 train_time:8189ms step_avg:148.88ms step:66/1480 train_time:8330ms step_avg:148.76ms step:67/1480 train_time:8474ms step_avg:148.67ms step:68/1480 train_time:8618ms step_avg:148.58ms step:69/1480 train_time:8760ms step_avg:148.48ms step:70/1480 train_time:8902ms step_avg:148.37ms step:71/1480 train_time:9044ms step_avg:148.27ms step:72/1480 train_time:9186ms step_avg:148.16ms step:73/1480 train_time:9330ms step_avg:148.09ms step:74/1480 train_time:9474ms step_avg:148.04ms step:75/1480 train_time:9620ms step_avg:147.99ms step:76/1480 train_time:9761ms step_avg:147.89ms step:77/1480 train_time:9904ms step_avg:147.82ms step:78/1480 train_time:10047ms step_avg:147.75ms step:79/1480 train_time:10188ms step_avg:147.66ms step:80/1480 train_time:10760ms step_avg:153.72ms step:81/1480 train_time:10860ms step_avg:152.96ms step:82/1480 train_time:11002ms step_avg:152.81ms step:83/1480 train_time:11145ms step_avg:152.67ms step:84/1480 train_time:11286ms step_avg:152.51ms step:85/1480 train_time:11427ms step_avg:152.36ms step:86/1480 train_time:11570ms step_avg:152.23ms step:87/1480 train_time:11714ms step_avg:152.13ms step:88/1480 train_time:11857ms step_avg:152.02ms step:89/1480 train_time:12000ms step_avg:151.90ms step:90/1480 train_time:12143ms step_avg:151.79ms step:91/1480 train_time:12285ms step_avg:151.67ms step:92/1480 train_time:12426ms step_avg:151.54ms step:93/1480 train_time:12567ms step_avg:151.41ms step:94/1480 train_time:12712ms step_avg:151.33ms step:95/1480 train_time:12855ms step_avg:151.23ms step:96/1480 train_time:12998ms step_avg:151.14ms step:97/1480 train_time:13141ms step_avg:151.04ms step:98/1480 train_time:13659ms step_avg:155.22ms step:99/1480 train_time:13760ms step_avg:154.61ms step:100/1480 train_time:13904ms step_avg:154.48ms step:101/1480 train_time:14050ms step_avg:154.40ms step:102/1480 train_time:14187ms step_avg:154.21ms step:103/1480 train_time:14329ms step_avg:154.08ms step:104/1480 train_time:14472ms step_avg:153.96ms step:105/1480 train_time:14617ms step_avg:153.87ms step:106/1480 train_time:14759ms step_avg:153.74ms step:107/1480 train_time:14902ms step_avg:153.63ms step:108/1480 train_time:15044ms step_avg:153.51ms step:109/1480 train_time:15187ms step_avg:153.41ms step:110/1480 train_time:15329ms step_avg:153.29ms step:111/1480 train_time:15473ms step_avg:153.20ms step:112/1480 train_time:15620ms step_avg:153.13ms step:113/1480 train_time:15764ms step_avg:153.05ms step:114/1480 train_time:15910ms step_avg:152.98ms step:115/1480 train_time:16056ms step_avg:152.92ms step:116/1480 train_time:16202ms step_avg:152.85ms step:117/1480 train_time:16347ms step_avg:152.77ms step:118/1480 train_time:16492ms step_avg:152.71ms step:119/1480 train_time:16639ms step_avg:152.65ms step:120/1480 train_time:16784ms step_avg:152.58ms step:121/1480 train_time:16930ms step_avg:152.53ms step:122/1480 train_time:17077ms step_avg:152.48ms step:123/1480 train_time:17222ms step_avg:152.41ms step:124/1480 train_time:17367ms step_avg:152.35ms step:125/1480 train_time:17514ms step_avg:152.29ms step:125/1480 val_loss:4.4428 train_time:17580ms step_avg:152.87ms step:126/1480 train_time:17676ms step_avg:152.38ms step:127/1480 train_time:17815ms step_avg:152.27ms step:128/1480 train_time:17962ms step_avg:152.22ms step:129/1480 train_time:18107ms step_avg:152.16ms step:130/1480 train_time:18252ms step_avg:152.10ms step:131/1480 train_time:18396ms step_avg:152.04ms step:132/1480 train_time:18541ms step_avg:151.97ms step:133/1480 train_time:18687ms step_avg:151.92ms step:134/1480 train_time:18833ms step_avg:151.88ms step:135/1480 train_time:18978ms step_avg:151.82ms step:136/1480 train_time:19124ms step_avg:151.78ms step:137/1480 train_time:19270ms step_avg:151.73ms step:138/1480 train_time:19414ms step_avg:151.67ms step:139/1480 train_time:19559ms step_avg:151.62ms step:140/1480 train_time:19705ms step_avg:151.58ms step:141/1480 train_time:19851ms step_avg:151.54ms step:142/1480 train_time:19995ms step_avg:151.48ms step:143/1480 train_time:20142ms step_avg:151.44ms step:144/1480 train_time:20289ms step_avg:151.41ms step:145/1480 train_time:20433ms step_avg:151.36ms step:146/1480 train_time:20578ms step_avg:151.31ms step:147/1480 train_time:20724ms step_avg:151.27ms step:148/1480 train_time:20870ms step_avg:151.23ms step:149/1480 train_time:21014ms step_avg:151.18ms step:150/1480 train_time:21159ms step_avg:151.13ms step:151/1480 train_time:21304ms step_avg:151.10ms step:152/1480 train_time:21451ms step_avg:151.06ms step:153/1480 train_time:21596ms step_avg:151.02ms step:154/1480 train_time:21741ms step_avg:150.98ms step:155/1480 train_time:21886ms step_avg:150.94ms step:156/1480 train_time:22032ms step_avg:150.90ms step:157/1480 train_time:22176ms step_avg:150.86ms step:158/1480 train_time:22322ms step_avg:150.82ms step:159/1480 train_time:22469ms step_avg:150.80ms step:160/1480 train_time:22613ms step_avg:150.76ms step:161/1480 train_time:22758ms step_avg:150.72ms step:162/1480 train_time:22904ms step_avg:150.68ms step:163/1480 train_time:23050ms step_avg:150.65ms step:164/1480 train_time:23195ms step_avg:150.62ms step:165/1480 train_time:23340ms step_avg:150.58ms step:166/1480 train_time:23488ms step_avg:150.57ms step:167/1480 train_time:23634ms step_avg:150.53ms step:168/1480 train_time:23778ms step_avg:150.49ms step:169/1480 train_time:23925ms step_avg:150.47ms step:170/1480 train_time:24071ms step_avg:150.44ms step:171/1480 train_time:24215ms step_avg:150.40ms step:172/1480 train_time:24361ms step_avg:150.38ms step:173/1480 train_time:24507ms step_avg:150.35ms step:174/1480 train_time:24653ms step_avg:150.32ms step:175/1480 train_time:24797ms step_avg:150.28ms step:176/1480 train_time:24943ms step_avg:150.26ms step:177/1480 train_time:25089ms step_avg:150.24ms step:178/1480 train_time:25234ms step_avg:150.20ms step:179/1480 train_time:25379ms step_avg:150.17ms step:180/1480 train_time:25526ms step_avg:150.15ms step:181/1480 train_time:25671ms step_avg:150.13ms step:182/1480 train_time:25816ms step_avg:150.09ms step:183/1480 train_time:25961ms step_avg:150.07ms step:184/1480 train_time:26107ms step_avg:150.04ms step:185/1480 train_time:26254ms step_avg:150.02ms step:186/1480 train_time:26399ms step_avg:149.99ms step:187/1480 train_time:26545ms step_avg:149.97ms step:188/1480 train_time:26692ms step_avg:149.95ms step:189/1480 train_time:26855ms step_avg:150.03ms step:190/1480 train_time:26981ms step_avg:149.89ms step:191/1480 train_time:27127ms step_avg:149.88ms step:192/1480 train_time:27273ms step_avg:149.85ms step:193/1480 train_time:27417ms step_avg:149.82ms step:194/1480 train_time:27564ms step_avg:149.80ms step:195/1480 train_time:27710ms step_avg:149.78ms step:196/1480 train_time:27855ms step_avg:149.76ms step:197/1480 train_time:28000ms step_avg:149.73ms step:198/1480 train_time:28148ms step_avg:149.72ms step:199/1480 train_time:28293ms step_avg:149.70ms step:200/1480 train_time:28440ms step_avg:149.69ms step:201/1480 train_time:28589ms step_avg:149.68ms step:202/1480 train_time:28733ms step_avg:149.65ms step:203/1480 train_time:28878ms step_avg:149.63ms step:204/1480 train_time:29024ms step_avg:149.61ms step:205/1480 train_time:29170ms step_avg:149.59ms step:206/1480 train_time:29314ms step_avg:149.56ms step:207/1480 train_time:29460ms step_avg:149.54ms step:208/1480 train_time:29606ms step_avg:149.53ms step:209/1480 train_time:29752ms step_avg:149.51ms step:210/1480 train_time:29897ms step_avg:149.49ms step:211/1480 train_time:30043ms step_avg:149.47ms step:212/1480 train_time:30190ms step_avg:149.46ms step:213/1480 train_time:30334ms step_avg:149.43ms step:214/1480 train_time:30479ms step_avg:149.41ms step:215/1480 train_time:30626ms step_avg:149.40ms step:216/1480 train_time:30771ms step_avg:149.37ms step:217/1480 train_time:30916ms step_avg:149.35ms step:218/1480 train_time:31063ms step_avg:149.34ms step:219/1480 train_time:31209ms step_avg:149.33ms step:220/1480 train_time:31354ms step_avg:149.31ms step:221/1480 train_time:31891ms step_avg:151.14ms step:222/1480 train_time:32401ms step_avg:152.83ms step:223/1480 train_time:32508ms step_avg:152.62ms step:224/1480 train_time:32656ms step_avg:152.60ms step:225/1480 train_time:32804ms step_avg:152.58ms step:226/1480 train_time:32952ms step_avg:152.55ms step:227/1480 train_time:33099ms step_avg:152.53ms step:228/1480 train_time:33248ms step_avg:152.51ms step:229/1480 train_time:33395ms step_avg:152.49ms step:230/1480 train_time:33544ms step_avg:152.47ms step:231/1480 train_time:33693ms step_avg:152.46ms step:232/1480 train_time:33841ms step_avg:152.44ms step:233/1480 train_time:33990ms step_avg:152.42ms step:234/1480 train_time:34138ms step_avg:152.40ms step:235/1480 train_time:34288ms step_avg:152.39ms step:236/1480 train_time:34435ms step_avg:152.37ms step:237/1480 train_time:34585ms step_avg:152.35ms step:238/1480 train_time:34733ms step_avg:152.34ms step:239/1480 train_time:34881ms step_avg:152.32ms step:240/1480 train_time:35030ms step_avg:152.31ms step:241/1480 train_time:35178ms step_avg:152.28ms step:242/1480 train_time:35327ms step_avg:152.27ms step:243/1480 train_time:35475ms step_avg:152.25ms step:244/1480 train_time:35624ms step_avg:152.24ms step:245/1480 train_time:35773ms step_avg:152.22ms step:246/1480 train_time:35921ms step_avg:152.21ms step:247/1480 train_time:36070ms step_avg:152.19ms step:248/1480 train_time:36217ms step_avg:152.17ms step:249/1480 train_time:36366ms step_avg:152.16ms step:250/1480 train_time:36514ms step_avg:152.14ms step:250/1480 val_loss:3.9915 train_time:36580ms step_avg:152.42ms step:251/1480 train_time:36673ms step_avg:152.17ms step:252/1480 train_time:36822ms step_avg:152.16ms step:253/1480 train_time:36972ms step_avg:152.15ms step:254/1480 train_time:37119ms step_avg:152.13ms step:255/1480 train_time:37267ms step_avg:152.11ms step:256/1480 train_time:37415ms step_avg:152.09ms step:257/1480 train_time:37563ms step_avg:152.08ms step:258/1480 train_time:37713ms step_avg:152.07ms step:259/1480 train_time:37862ms step_avg:152.06ms step:260/1480 train_time:38012ms step_avg:152.05ms step:261/1480 train_time:38159ms step_avg:152.03ms step:262/1480 train_time:38309ms step_avg:152.02ms step:263/1480 train_time:38456ms step_avg:152.00ms step:264/1480 train_time:38603ms step_avg:151.98ms step:265/1480 train_time:38752ms step_avg:151.97ms step:266/1480 train_time:38901ms step_avg:151.96ms step:267/1480 train_time:39050ms step_avg:151.94ms step:268/1480 train_time:39197ms step_avg:151.93ms step:269/1480 train_time:39345ms step_avg:151.91ms step:270/1480 train_time:39495ms step_avg:151.90ms step:271/1480 train_time:39642ms step_avg:151.88ms step:272/1480 train_time:39791ms step_avg:151.87ms step:273/1480 train_time:39938ms step_avg:151.86ms step:274/1480 train_time:40088ms step_avg:151.85ms step:275/1480 train_time:40236ms step_avg:151.83ms step:276/1480 train_time:40384ms step_avg:151.82ms step:277/1480 train_time:40533ms step_avg:151.81ms step:278/1480 train_time:40682ms step_avg:151.80ms step:279/1480 train_time:40830ms step_avg:151.78ms step:280/1480 train_time:40978ms step_avg:151.77ms step:281/1480 train_time:41128ms step_avg:151.76ms step:282/1480 train_time:41276ms step_avg:151.75ms step:283/1480 train_time:41424ms step_avg:151.74ms step:284/1480 train_time:41574ms step_avg:151.73ms step:285/1480 train_time:41722ms step_avg:151.72ms step:286/1480 train_time:41871ms step_avg:151.71ms step:287/1480 train_time:42018ms step_avg:151.69ms step:288/1480 train_time:42168ms step_avg:151.68ms step:289/1480 train_time:42316ms step_avg:151.67ms step:290/1480 train_time:42465ms step_avg:151.66ms step:291/1480 train_time:42613ms step_avg:151.65ms step:292/1480 train_time:42762ms step_avg:151.64ms step:293/1480 train_time:42911ms step_avg:151.63ms step:294/1480 train_time:43058ms step_avg:151.61ms step:295/1480 train_time:43207ms step_avg:151.60ms step:296/1480 train_time:43355ms step_avg:151.59ms step:297/1480 train_time:43503ms step_avg:151.58ms step:298/1480 train_time:43651ms step_avg:151.57ms step:299/1480 train_time:43799ms step_avg:151.55ms step:300/1480 train_time:43948ms step_avg:151.54ms step:301/1480 train_time:44097ms step_avg:151.54ms step:302/1480 train_time:44244ms step_avg:151.52ms step:303/1480 train_time:44393ms step_avg:151.51ms step:304/1480 train_time:44541ms step_avg:151.50ms step:305/1480 train_time:44691ms step_avg:151.49ms step:306/1480 train_time:44839ms step_avg:151.48ms step:307/1480 train_time:44988ms step_avg:151.47ms step:308/1480 train_time:45136ms step_avg:151.46ms step:309/1480 train_time:45284ms step_avg:151.45ms step:310/1480 train_time:45433ms step_avg:151.44ms step:311/1480 train_time:45581ms step_avg:151.43ms step:312/1480 train_time:45730ms step_avg:151.42ms step:313/1480 train_time:45879ms step_avg:151.41ms step:314/1480 train_time:46028ms step_avg:151.41ms step:315/1480 train_time:46176ms step_avg:151.40ms step:316/1480 train_time:46324ms step_avg:151.39ms step:317/1480 train_time:46473ms step_avg:151.38ms step:318/1480 train_time:46621ms step_avg:151.37ms step:319/1480 train_time:46770ms step_avg:151.36ms step:320/1480 train_time:46918ms step_avg:151.35ms step:321/1480 train_time:47067ms step_avg:151.34ms step:322/1480 train_time:47216ms step_avg:151.33ms step:323/1480 train_time:47365ms step_avg:151.32ms step:324/1480 train_time:47514ms step_avg:151.32ms step:325/1480 train_time:47661ms step_avg:151.31ms step:326/1480 train_time:47810ms step_avg:151.30ms step:327/1480 train_time:47959ms step_avg:151.29ms step:328/1480 train_time:48108ms step_avg:151.28ms step:329/1480 train_time:48256ms step_avg:151.27ms step:330/1480 train_time:48405ms step_avg:151.26ms step:331/1480 train_time:48555ms step_avg:151.26ms step:332/1480 train_time:48706ms step_avg:151.26ms step:333/1480 train_time:48857ms step_avg:151.26ms step:334/1480 train_time:49008ms step_avg:151.26ms step:335/1480 train_time:49157ms step_avg:151.25ms step:336/1480 train_time:49310ms step_avg:151.26ms step:337/1480 train_time:49460ms step_avg:151.25ms step:338/1480 train_time:49612ms step_avg:151.26ms step:339/1480 train_time:49763ms step_avg:151.26ms step:340/1480 train_time:49914ms step_avg:151.25ms step:341/1480 train_time:50064ms step_avg:151.25ms step:342/1480 train_time:50215ms step_avg:151.25ms step:343/1480 train_time:50366ms step_avg:151.25ms step:344/1480 train_time:50517ms step_avg:151.25ms step:345/1480 train_time:50669ms step_avg:151.25ms step:346/1480 train_time:50819ms step_avg:151.25ms step:347/1480 train_time:50971ms step_avg:151.25ms step:348/1480 train_time:51123ms step_avg:151.25ms step:349/1480 train_time:51274ms step_avg:151.25ms step:350/1480 train_time:51424ms step_avg:151.25ms step:351/1480 train_time:51576ms step_avg:151.25ms step:352/1480 train_time:51727ms step_avg:151.25ms step:353/1480 train_time:51877ms step_avg:151.25ms step:354/1480 train_time:52028ms step_avg:151.24ms step:355/1480 train_time:52178ms step_avg:151.24ms step:356/1480 train_time:52329ms step_avg:151.24ms step:357/1480 train_time:52481ms step_avg:151.24ms step:358/1480 train_time:52632ms step_avg:151.24ms step:359/1480 train_time:52784ms step_avg:151.24ms step:360/1480 train_time:52936ms step_avg:151.25ms step:361/1480 train_time:53087ms step_avg:151.25ms step:362/1480 train_time:53239ms step_avg:151.25ms step:363/1480 train_time:53390ms step_avg:151.25ms step:364/1480 train_time:53539ms step_avg:151.24ms step:365/1480 train_time:53690ms step_avg:151.24ms step:366/1480 train_time:53841ms step_avg:151.24ms step:367/1480 train_time:53992ms step_avg:151.24ms step:368/1480 train_time:54142ms step_avg:151.23ms step:369/1480 train_time:54294ms step_avg:151.24ms step:370/1480 train_time:54444ms step_avg:151.23ms step:371/1480 train_time:54594ms step_avg:151.23ms step:372/1480 train_time:54744ms step_avg:151.23ms step:373/1480 train_time:54895ms step_avg:151.23ms step:374/1480 train_time:55046ms step_avg:151.22ms step:375/1480 train_time:55196ms step_avg:151.22ms step:375/1480 val_loss:3.8057 train_time:55264ms step_avg:151.41ms step:376/1480 train_time:55359ms step_avg:151.26ms step:377/1480 train_time:55504ms step_avg:151.24ms step:378/1480 train_time:55654ms step_avg:151.23ms step:379/1480 train_time:55818ms step_avg:151.27ms step:380/1480 train_time:55955ms step_avg:151.23ms step:381/1480 train_time:56106ms step_avg:151.23ms step:382/1480 train_time:56256ms step_avg:151.22ms step:383/1480 train_time:56408ms step_avg:151.23ms step:384/1480 train_time:56558ms step_avg:151.23ms step:385/1480 train_time:56709ms step_avg:151.22ms step:386/1480 train_time:56860ms step_avg:151.22ms step:387/1480 train_time:57011ms step_avg:151.22ms step:388/1480 train_time:57162ms step_avg:151.22ms step:389/1480 train_time:57313ms step_avg:151.22ms step:390/1480 train_time:57463ms step_avg:151.22ms step:391/1480 train_time:57613ms step_avg:151.22ms step:392/1480 train_time:57764ms step_avg:151.22ms step:393/1480 train_time:57915ms step_avg:151.21ms step:394/1480 train_time:58066ms step_avg:151.21ms step:395/1480 train_time:58216ms step_avg:151.21ms step:396/1480 train_time:58368ms step_avg:151.21ms step:397/1480 train_time:58518ms step_avg:151.21ms step:398/1480 train_time:58669ms step_avg:151.21ms step:399/1480 train_time:58820ms step_avg:151.21ms step:400/1480 train_time:58971ms step_avg:151.21ms step:401/1480 train_time:59122ms step_avg:151.21ms step:402/1480 train_time:59273ms step_avg:151.21ms step:403/1480 train_time:59426ms step_avg:151.21ms step:404/1480 train_time:59576ms step_avg:151.21ms step:405/1480 train_time:59727ms step_avg:151.21ms step:406/1480 train_time:59878ms step_avg:151.21ms step:407/1480 train_time:60029ms step_avg:151.21ms step:408/1480 train_time:60180ms step_avg:151.21ms step:409/1480 train_time:60331ms step_avg:151.21ms step:410/1480 train_time:60482ms step_avg:151.21ms step:411/1480 train_time:60633ms step_avg:151.20ms step:412/1480 train_time:60784ms step_avg:151.20ms step:413/1480 train_time:60934ms step_avg:151.20ms step:414/1480 train_time:61085ms step_avg:151.20ms step:415/1480 train_time:61235ms step_avg:151.20ms step:416/1480 train_time:61387ms step_avg:151.20ms step:417/1480 train_time:61539ms step_avg:151.20ms step:418/1480 train_time:61690ms step_avg:151.20ms step:419/1480 train_time:61841ms step_avg:151.20ms step:420/1480 train_time:61991ms step_avg:151.20ms step:421/1480 train_time:62141ms step_avg:151.20ms step:422/1480 train_time:62291ms step_avg:151.19ms step:423/1480 train_time:62442ms step_avg:151.19ms step:424/1480 train_time:62592ms step_avg:151.19ms step:425/1480 train_time:62743ms step_avg:151.19ms step:426/1480 train_time:62894ms step_avg:151.19ms step:427/1480 train_time:63045ms step_avg:151.19ms step:428/1480 train_time:63195ms step_avg:151.18ms step:429/1480 train_time:63346ms step_avg:151.18ms step:430/1480 train_time:63498ms step_avg:151.18ms step:431/1480 train_time:63649ms step_avg:151.18ms step:432/1480 train_time:63799ms step_avg:151.18ms step:433/1480 train_time:63950ms step_avg:151.18ms step:434/1480 train_time:64102ms step_avg:151.18ms step:435/1480 train_time:64253ms step_avg:151.18ms step:436/1480 train_time:64405ms step_avg:151.19ms step:437/1480 train_time:64555ms step_avg:151.18ms step:438/1480 train_time:64707ms step_avg:151.18ms step:439/1480 train_time:64857ms step_avg:151.18ms step:440/1480 train_time:65009ms step_avg:151.18ms step:441/1480 train_time:65162ms step_avg:151.19ms step:442/1480 train_time:65314ms step_avg:151.19ms step:443/1480 train_time:65467ms step_avg:151.19ms step:444/1480 train_time:65619ms step_avg:151.20ms step:445/1480 train_time:65772ms step_avg:151.20ms step:446/1480 train_time:65926ms step_avg:151.21ms step:447/1480 train_time:66078ms step_avg:151.21ms step:448/1480 train_time:66231ms step_avg:151.21ms step:449/1480 train_time:66385ms step_avg:151.22ms step:450/1480 train_time:66538ms step_avg:151.22ms step:451/1480 train_time:66691ms step_avg:151.23ms step:452/1480 train_time:66843ms step_avg:151.23ms step:453/1480 train_time:66996ms step_avg:151.23ms step:454/1480 train_time:67148ms step_avg:151.23ms step:455/1480 train_time:67302ms step_avg:151.24ms step:456/1480 train_time:67455ms step_avg:151.24ms step:457/1480 train_time:67609ms step_avg:151.25ms step:458/1480 train_time:67761ms step_avg:151.25ms step:459/1480 train_time:67913ms step_avg:151.25ms step:460/1480 train_time:68066ms step_avg:151.26ms step:461/1480 train_time:68218ms step_avg:151.26ms step:462/1480 train_time:68372ms step_avg:151.27ms step:463/1480 train_time:68526ms step_avg:151.27ms step:464/1480 train_time:68679ms step_avg:151.28ms step:465/1480 train_time:68832ms step_avg:151.28ms step:466/1480 train_time:68985ms step_avg:151.28ms step:467/1480 train_time:69138ms step_avg:151.29ms step:468/1480 train_time:69290ms step_avg:151.29ms step:469/1480 train_time:69442ms step_avg:151.29ms step:470/1480 train_time:69595ms step_avg:151.29ms step:471/1480 train_time:69748ms step_avg:151.30ms step:472/1480 train_time:69902ms step_avg:151.30ms step:473/1480 train_time:70055ms step_avg:151.31ms step:474/1480 train_time:70209ms step_avg:151.31ms step:475/1480 train_time:70362ms step_avg:151.32ms step:476/1480 train_time:70514ms step_avg:151.32ms step:477/1480 train_time:70668ms step_avg:151.32ms step:478/1480 train_time:70820ms step_avg:151.33ms step:479/1480 train_time:70973ms step_avg:151.33ms step:480/1480 train_time:71127ms step_avg:151.33ms step:481/1480 train_time:71280ms step_avg:151.34ms step:482/1480 train_time:71433ms step_avg:151.34ms step:483/1480 train_time:71587ms step_avg:151.35ms step:484/1480 train_time:71740ms step_avg:151.35ms step:485/1480 train_time:71893ms step_avg:151.35ms step:486/1480 train_time:72045ms step_avg:151.36ms step:487/1480 train_time:72199ms step_avg:151.36ms step:488/1480 train_time:72353ms step_avg:151.37ms step:489/1480 train_time:72506ms step_avg:151.37ms step:490/1480 train_time:72659ms step_avg:151.37ms step:491/1480 train_time:72810ms step_avg:151.37ms step:492/1480 train_time:72963ms step_avg:151.38ms step:493/1480 train_time:73115ms step_avg:151.38ms step:494/1480 train_time:73268ms step_avg:151.38ms step:495/1480 train_time:73422ms step_avg:151.39ms step:496/1480 train_time:73575ms step_avg:151.39ms step:497/1480 train_time:73728ms step_avg:151.39ms step:498/1480 train_time:73880ms step_avg:151.39ms step:499/1480 train_time:74032ms step_avg:151.40ms step:500/1480 train_time:74185ms step_avg:151.40ms step:500/1480 val_loss:3.6859 train_time:74254ms step_avg:151.54ms step:501/1480 train_time:74345ms step_avg:151.42ms step:502/1480 train_time:74497ms step_avg:151.42ms step:503/1480 train_time:74649ms step_avg:151.42ms step:504/1480 train_time:74801ms step_avg:151.42ms step:505/1480 train_time:74954ms step_avg:151.42ms step:506/1480 train_time:75107ms step_avg:151.42ms step:507/1480 train_time:75260ms step_avg:151.43ms step:508/1480 train_time:75415ms step_avg:151.43ms step:509/1480 train_time:75568ms step_avg:151.44ms step:510/1480 train_time:75721ms step_avg:151.44ms step:511/1480 train_time:75874ms step_avg:151.45ms step:512/1480 train_time:76028ms step_avg:151.45ms step:513/1480 train_time:76180ms step_avg:151.45ms step:514/1480 train_time:76334ms step_avg:151.46ms step:515/1480 train_time:76488ms step_avg:151.46ms step:516/1480 train_time:76642ms step_avg:151.47ms step:517/1480 train_time:76795ms step_avg:151.47ms step:518/1480 train_time:76948ms step_avg:151.47ms step:519/1480 train_time:77101ms step_avg:151.47ms step:520/1480 train_time:77253ms step_avg:151.48ms step:521/1480 train_time:77407ms step_avg:151.48ms step:522/1480 train_time:77560ms step_avg:151.48ms step:523/1480 train_time:77714ms step_avg:151.49ms step:524/1480 train_time:77867ms step_avg:151.49ms step:525/1480 train_time:78019ms step_avg:151.49ms step:526/1480 train_time:78173ms step_avg:151.50ms step:527/1480 train_time:78326ms step_avg:151.50ms step:528/1480 train_time:78478ms step_avg:151.50ms step:529/1480 train_time:78632ms step_avg:151.51ms step:530/1480 train_time:78785ms step_avg:151.51ms step:531/1480 train_time:78938ms step_avg:151.51ms step:532/1480 train_time:79092ms step_avg:151.52ms step:533/1480 train_time:79244ms step_avg:151.52ms step:534/1480 train_time:79397ms step_avg:151.52ms step:535/1480 train_time:79549ms step_avg:151.52ms step:536/1480 train_time:79703ms step_avg:151.53ms step:537/1480 train_time:79856ms step_avg:151.53ms step:538/1480 train_time:80010ms step_avg:151.53ms step:539/1480 train_time:80164ms step_avg:151.54ms step:540/1480 train_time:80318ms step_avg:151.54ms step:541/1480 train_time:80472ms step_avg:151.55ms step:542/1480 train_time:80623ms step_avg:151.55ms step:543/1480 train_time:80776ms step_avg:151.55ms step:544/1480 train_time:80928ms step_avg:151.55ms step:545/1480 train_time:81081ms step_avg:151.55ms step:546/1480 train_time:81234ms step_avg:151.56ms step:547/1480 train_time:81388ms step_avg:151.56ms step:548/1480 train_time:81540ms step_avg:151.56ms step:549/1480 train_time:81693ms step_avg:151.56ms step:550/1480 train_time:81847ms step_avg:151.57ms step:551/1480 train_time:82001ms step_avg:151.57ms step:552/1480 train_time:82155ms step_avg:151.58ms step:553/1480 train_time:82311ms step_avg:151.59ms step:554/1480 train_time:82466ms step_avg:151.59ms step:555/1480 train_time:82620ms step_avg:151.60ms step:556/1480 train_time:82775ms step_avg:151.60ms step:557/1480 train_time:82931ms step_avg:151.61ms step:558/1480 train_time:83086ms step_avg:151.62ms step:559/1480 train_time:83241ms step_avg:151.62ms step:560/1480 train_time:83395ms step_avg:151.63ms step:561/1480 train_time:83549ms step_avg:151.63ms step:562/1480 train_time:83705ms step_avg:151.64ms step:563/1480 train_time:83860ms step_avg:151.65ms step:564/1480 train_time:84014ms step_avg:151.65ms step:565/1480 train_time:84169ms step_avg:151.66ms step:566/1480 train_time:84324ms step_avg:151.66ms step:567/1480 train_time:84479ms step_avg:151.67ms step:568/1480 train_time:84633ms step_avg:151.67ms step:569/1480 train_time:84799ms step_avg:151.70ms step:570/1480 train_time:84943ms step_avg:151.68ms step:571/1480 train_time:85097ms step_avg:151.69ms step:572/1480 train_time:85251ms step_avg:151.69ms step:573/1480 train_time:85408ms step_avg:151.70ms step:574/1480 train_time:85564ms step_avg:151.71ms step:575/1480 train_time:85719ms step_avg:151.71ms step:576/1480 train_time:85874ms step_avg:151.72ms step:577/1480 train_time:86028ms step_avg:151.72ms step:578/1480 train_time:86182ms step_avg:151.73ms step:579/1480 train_time:86337ms step_avg:151.73ms step:580/1480 train_time:86491ms step_avg:151.74ms step:581/1480 train_time:86646ms step_avg:151.74ms step:582/1480 train_time:86800ms step_avg:151.75ms step:583/1480 train_time:86954ms step_avg:151.75ms step:584/1480 train_time:87110ms step_avg:151.76ms step:585/1480 train_time:87264ms step_avg:151.76ms step:586/1480 train_time:87418ms step_avg:151.77ms step:587/1480 train_time:87573ms step_avg:151.77ms step:588/1480 train_time:87727ms step_avg:151.78ms step:589/1480 train_time:87882ms step_avg:151.78ms step:590/1480 train_time:88037ms step_avg:151.79ms step:591/1480 train_time:88191ms step_avg:151.79ms step:592/1480 train_time:88347ms step_avg:151.80ms step:593/1480 train_time:88503ms step_avg:151.81ms step:594/1480 train_time:88658ms step_avg:151.81ms step:595/1480 train_time:88814ms step_avg:151.82ms step:596/1480 train_time:88971ms step_avg:151.83ms step:597/1480 train_time:89126ms step_avg:151.83ms step:598/1480 train_time:89281ms step_avg:151.84ms step:599/1480 train_time:89436ms step_avg:151.84ms step:600/1480 train_time:89591ms step_avg:151.85ms step:601/1480 train_time:89745ms step_avg:151.85ms step:602/1480 train_time:89900ms step_avg:151.86ms step:603/1480 train_time:90054ms step_avg:151.86ms step:604/1480 train_time:90209ms step_avg:151.87ms step:605/1480 train_time:90363ms step_avg:151.87ms step:606/1480 train_time:90517ms step_avg:151.87ms step:607/1480 train_time:90674ms step_avg:151.88ms step:608/1480 train_time:90829ms step_avg:151.89ms step:609/1480 train_time:90984ms step_avg:151.89ms step:610/1480 train_time:91140ms step_avg:151.90ms step:611/1480 train_time:91294ms step_avg:151.90ms step:612/1480 train_time:91448ms step_avg:151.91ms step:613/1480 train_time:91602ms step_avg:151.91ms step:614/1480 train_time:91757ms step_avg:151.92ms step:615/1480 train_time:91912ms step_avg:151.92ms step:616/1480 train_time:92066ms step_avg:151.92ms step:617/1480 train_time:92220ms step_avg:151.93ms step:618/1480 train_time:92374ms step_avg:151.93ms step:619/1480 train_time:92530ms step_avg:151.94ms step:620/1480 train_time:92684ms step_avg:151.94ms step:621/1480 train_time:92838ms step_avg:151.95ms step:622/1480 train_time:92994ms step_avg:151.95ms step:623/1480 train_time:93148ms step_avg:151.96ms step:624/1480 train_time:93303ms step_avg:151.96ms step:625/1480 train_time:93457ms step_avg:151.96ms step:625/1480 val_loss:3.6062 train_time:93528ms step_avg:152.08ms step:626/1480 train_time:93619ms step_avg:151.98ms step:627/1480 train_time:93774ms step_avg:151.98ms step:628/1480 train_time:93929ms step_avg:151.99ms step:629/1480 train_time:94083ms step_avg:151.99ms step:630/1480 train_time:94237ms step_avg:151.99ms step:631/1480 train_time:94391ms step_avg:152.00ms step:632/1480 train_time:94546ms step_avg:152.00ms step:633/1480 train_time:94701ms step_avg:152.01ms step:634/1480 train_time:94856ms step_avg:152.01ms step:635/1480 train_time:95011ms step_avg:152.02ms step:636/1480 train_time:95166ms step_avg:152.02ms step:637/1480 train_time:95321ms step_avg:152.03ms step:638/1480 train_time:95475ms step_avg:152.03ms step:639/1480 train_time:95629ms step_avg:152.03ms step:640/1480 train_time:95783ms step_avg:152.04ms step:641/1480 train_time:95937ms step_avg:152.04ms step:642/1480 train_time:96094ms step_avg:152.05ms step:643/1480 train_time:96250ms step_avg:152.05ms step:644/1480 train_time:96404ms step_avg:152.06ms step:645/1480 train_time:96559ms step_avg:152.06ms step:646/1480 train_time:96713ms step_avg:152.06ms step:647/1480 train_time:96868ms step_avg:152.07ms step:648/1480 train_time:97023ms step_avg:152.07ms step:649/1480 train_time:97179ms step_avg:152.08ms step:650/1480 train_time:97335ms step_avg:152.09ms step:651/1480 train_time:97490ms step_avg:152.09ms step:652/1480 train_time:97645ms step_avg:152.09ms step:653/1480 train_time:97799ms step_avg:152.10ms step:654/1480 train_time:97955ms step_avg:152.10ms step:655/1480 train_time:98110ms step_avg:152.11ms step:656/1480 train_time:98265ms step_avg:152.11ms step:657/1480 train_time:98420ms step_avg:152.12ms step:658/1480 train_time:98576ms step_avg:152.12ms step:659/1480 train_time:98731ms step_avg:152.13ms step:660/1480 train_time:98887ms step_avg:152.13ms step:661/1480 train_time:99043ms step_avg:152.14ms step:662/1480 train_time:99199ms step_avg:152.15ms step:663/1480 train_time:99355ms step_avg:152.15ms step:664/1480 train_time:99512ms step_avg:152.16ms step:665/1480 train_time:99670ms step_avg:152.17ms step:666/1480 train_time:99827ms step_avg:152.18ms step:667/1480 train_time:99984ms step_avg:152.18ms step:668/1480 train_time:100139ms step_avg:152.19ms step:669/1480 train_time:100298ms step_avg:152.20ms step:670/1480 train_time:100454ms step_avg:152.20ms step:671/1480 train_time:100609ms step_avg:152.21ms step:672/1480 train_time:100764ms step_avg:152.21ms step:673/1480 train_time:100920ms step_avg:152.22ms step:674/1480 train_time:101077ms step_avg:152.22ms step:675/1480 train_time:101234ms step_avg:152.23ms step:676/1480 train_time:101391ms step_avg:152.24ms step:677/1480 train_time:101548ms step_avg:152.25ms step:678/1480 train_time:101704ms step_avg:152.25ms step:679/1480 train_time:101860ms step_avg:152.26ms step:680/1480 train_time:102018ms step_avg:152.27ms step:681/1480 train_time:102174ms step_avg:152.27ms step:682/1480 train_time:102332ms step_avg:152.28ms step:683/1480 train_time:102489ms step_avg:152.29ms step:684/1480 train_time:102645ms step_avg:152.29ms step:685/1480 train_time:102802ms step_avg:152.30ms step:686/1480 train_time:102958ms step_avg:152.30ms step:687/1480 train_time:103113ms step_avg:152.31ms step:688/1480 train_time:103271ms step_avg:152.32ms step:689/1480 train_time:103430ms step_avg:152.33ms step:690/1480 train_time:103586ms step_avg:152.33ms step:691/1480 train_time:103742ms step_avg:152.34ms step:692/1480 train_time:103898ms step_avg:152.34ms step:693/1480 train_time:104055ms step_avg:152.35ms step:694/1480 train_time:104211ms step_avg:152.36ms step:695/1480 train_time:104367ms step_avg:152.36ms step:696/1480 train_time:104523ms step_avg:152.37ms step:697/1480 train_time:104679ms step_avg:152.37ms step:698/1480 train_time:104835ms step_avg:152.38ms step:699/1480 train_time:104991ms step_avg:152.38ms step:700/1480 train_time:105148ms step_avg:152.39ms step:701/1480 train_time:105304ms step_avg:152.39ms step:702/1480 train_time:105460ms step_avg:152.40ms step:703/1480 train_time:105617ms step_avg:152.41ms step:704/1480 train_time:105773ms step_avg:152.41ms step:705/1480 train_time:105929ms step_avg:152.42ms step:706/1480 train_time:106086ms step_avg:152.42ms step:707/1480 train_time:106242ms step_avg:152.43ms step:708/1480 train_time:106399ms step_avg:152.43ms step:709/1480 train_time:106556ms step_avg:152.44ms step:710/1480 train_time:106711ms step_avg:152.44ms step:711/1480 train_time:106869ms step_avg:152.45ms step:712/1480 train_time:107026ms step_avg:152.46ms step:713/1480 train_time:107183ms step_avg:152.46ms step:714/1480 train_time:107338ms step_avg:152.47ms step:715/1480 train_time:107496ms step_avg:152.48ms step:716/1480 train_time:107651ms step_avg:152.48ms step:717/1480 train_time:107806ms step_avg:152.48ms step:718/1480 train_time:107962ms step_avg:152.49ms step:719/1480 train_time:108117ms step_avg:152.49ms step:720/1480 train_time:108275ms step_avg:152.50ms step:721/1480 train_time:108433ms step_avg:152.51ms step:722/1480 train_time:108590ms step_avg:152.51ms step:723/1480 train_time:108746ms step_avg:152.52ms step:724/1480 train_time:108902ms step_avg:152.52ms step:725/1480 train_time:109060ms step_avg:152.53ms step:726/1480 train_time:109216ms step_avg:152.54ms step:727/1480 train_time:109376ms step_avg:152.55ms step:728/1480 train_time:109532ms step_avg:152.55ms step:729/1480 train_time:109689ms step_avg:152.56ms step:730/1480 train_time:109845ms step_avg:152.56ms step:731/1480 train_time:110002ms step_avg:152.57ms step:732/1480 train_time:110157ms step_avg:152.57ms step:733/1480 train_time:110314ms step_avg:152.58ms step:734/1480 train_time:110471ms step_avg:152.58ms step:735/1480 train_time:110628ms step_avg:152.59ms step:736/1480 train_time:110784ms step_avg:152.60ms step:737/1480 train_time:110940ms step_avg:152.60ms step:738/1480 train_time:111096ms step_avg:152.60ms step:739/1480 train_time:111254ms step_avg:152.61ms step:740/1480 train_time:111413ms step_avg:152.62ms step:741/1480 train_time:111571ms step_avg:152.63ms step:742/1480 train_time:111728ms step_avg:152.63ms step:743/1480 train_time:111884ms step_avg:152.64ms step:744/1480 train_time:112040ms step_avg:152.64ms step:745/1480 train_time:112198ms step_avg:152.65ms step:746/1480 train_time:112353ms step_avg:152.65ms step:747/1480 train_time:112509ms step_avg:152.66ms step:748/1480 train_time:112670ms step_avg:152.67ms step:749/1480 train_time:112827ms step_avg:152.68ms step:750/1480 train_time:112983ms step_avg:152.68ms step:750/1480 val_loss:3.5512 train_time:113053ms step_avg:152.77ms step:751/1480 train_time:113146ms step_avg:152.69ms step:752/1480 train_time:113300ms step_avg:152.70ms step:753/1480 train_time:113457ms step_avg:152.70ms step:754/1480 train_time:113614ms step_avg:152.71ms step:755/1480 train_time:113769ms step_avg:152.71ms step:756/1480 train_time:113924ms step_avg:152.71ms step:757/1480 train_time:114084ms step_avg:152.72ms step:758/1480 train_time:114241ms step_avg:152.73ms step:759/1480 train_time:114407ms step_avg:152.75ms step:760/1480 train_time:114555ms step_avg:152.74ms step:761/1480 train_time:114711ms step_avg:152.74ms step:762/1480 train_time:114868ms step_avg:152.75ms step:763/1480 train_time:115025ms step_avg:152.76ms step:764/1480 train_time:115181ms step_avg:152.76ms step:765/1480 train_time:115338ms step_avg:152.77ms step:766/1480 train_time:115495ms step_avg:152.77ms step:767/1480 train_time:115653ms step_avg:152.78ms step:768/1480 train_time:115810ms step_avg:152.78ms step:769/1480 train_time:115967ms step_avg:152.79ms step:770/1480 train_time:116124ms step_avg:152.79ms step:771/1480 train_time:116282ms step_avg:152.80ms step:772/1480 train_time:116439ms step_avg:152.81ms step:773/1480 train_time:116597ms step_avg:152.81ms step:774/1480 train_time:116754ms step_avg:152.82ms step:775/1480 train_time:116913ms step_avg:152.83ms step:776/1480 train_time:117071ms step_avg:152.83ms step:777/1480 train_time:117231ms step_avg:152.84ms step:778/1480 train_time:117389ms step_avg:152.85ms step:779/1480 train_time:117546ms step_avg:152.86ms step:780/1480 train_time:117705ms step_avg:152.86ms step:781/1480 train_time:117863ms step_avg:152.87ms step:782/1480 train_time:118020ms step_avg:152.88ms step:783/1480 train_time:118177ms step_avg:152.88ms step:784/1480 train_time:118337ms step_avg:152.89ms step:785/1480 train_time:118495ms step_avg:152.90ms step:786/1480 train_time:118654ms step_avg:152.90ms step:787/1480 train_time:118812ms step_avg:152.91ms step:788/1480 train_time:118971ms step_avg:152.92ms step:789/1480 train_time:119127ms step_avg:152.92ms step:790/1480 train_time:119285ms step_avg:152.93ms step:791/1480 train_time:119444ms step_avg:152.94ms step:792/1480 train_time:119601ms step_avg:152.94ms step:793/1480 train_time:119758ms step_avg:152.95ms step:794/1480 train_time:119917ms step_avg:152.96ms step:795/1480 train_time:120077ms step_avg:152.96ms step:796/1480 train_time:120238ms step_avg:152.97ms step:797/1480 train_time:120398ms step_avg:152.98ms step:798/1480 train_time:120557ms step_avg:152.99ms step:799/1480 train_time:120717ms step_avg:153.00ms step:800/1480 train_time:120876ms step_avg:153.01ms step:801/1480 train_time:121033ms step_avg:153.01ms step:802/1480 train_time:121192ms step_avg:153.02ms step:803/1480 train_time:121349ms step_avg:153.02ms step:804/1480 train_time:121506ms step_avg:153.03ms step:805/1480 train_time:121666ms step_avg:153.04ms step:806/1480 train_time:121822ms step_avg:153.04ms step:807/1480 train_time:121979ms step_avg:153.05ms step:808/1480 train_time:122138ms step_avg:153.05ms step:809/1480 train_time:122295ms step_avg:153.06ms step:810/1480 train_time:122453ms step_avg:153.07ms step:811/1480 train_time:122611ms step_avg:153.07ms step:812/1480 train_time:122768ms step_avg:153.08ms step:813/1480 train_time:122924ms step_avg:153.08ms step:814/1480 train_time:123082ms step_avg:153.09ms step:815/1480 train_time:123239ms step_avg:153.09ms step:816/1480 train_time:123398ms step_avg:153.10ms step:817/1480 train_time:123556ms step_avg:153.11ms step:818/1480 train_time:123715ms step_avg:153.11ms step:819/1480 train_time:123871ms step_avg:153.12ms step:820/1480 train_time:124029ms step_avg:153.12ms step:821/1480 train_time:124186ms step_avg:153.13ms step:822/1480 train_time:124344ms step_avg:153.13ms step:823/1480 train_time:124501ms step_avg:153.14ms step:824/1480 train_time:124659ms step_avg:153.14ms step:825/1480 train_time:124818ms step_avg:153.15ms step:826/1480 train_time:124978ms step_avg:153.16ms step:827/1480 train_time:125138ms step_avg:153.17ms step:828/1480 train_time:125296ms step_avg:153.17ms step:829/1480 train_time:125455ms step_avg:153.18ms step:830/1480 train_time:125615ms step_avg:153.19ms step:831/1480 train_time:125773ms step_avg:153.20ms step:832/1480 train_time:125931ms step_avg:153.20ms step:833/1480 train_time:126088ms step_avg:153.21ms step:834/1480 train_time:126249ms step_avg:153.21ms step:835/1480 train_time:126406ms step_avg:153.22ms step:836/1480 train_time:126565ms step_avg:153.23ms step:837/1480 train_time:126721ms step_avg:153.23ms step:838/1480 train_time:126880ms step_avg:153.24ms step:839/1480 train_time:127038ms step_avg:153.24ms step:840/1480 train_time:127196ms step_avg:153.25ms step:841/1480 train_time:127352ms step_avg:153.25ms step:842/1480 train_time:127510ms step_avg:153.26ms step:843/1480 train_time:127668ms step_avg:153.26ms step:844/1480 train_time:127824ms step_avg:153.27ms step:845/1480 train_time:127982ms step_avg:153.27ms step:846/1480 train_time:128140ms step_avg:153.28ms step:847/1480 train_time:128299ms step_avg:153.28ms step:848/1480 train_time:128457ms step_avg:153.29ms step:849/1480 train_time:128614ms step_avg:153.29ms step:850/1480 train_time:128772ms step_avg:153.30ms step:851/1480 train_time:128931ms step_avg:153.31ms step:852/1480 train_time:129090ms step_avg:153.31ms step:853/1480 train_time:129246ms step_avg:153.32ms step:854/1480 train_time:129404ms step_avg:153.32ms step:855/1480 train_time:129562ms step_avg:153.33ms step:856/1480 train_time:129719ms step_avg:153.33ms step:857/1480 train_time:129878ms step_avg:153.34ms step:858/1480 train_time:130038ms step_avg:153.35ms step:859/1480 train_time:130199ms step_avg:153.36ms step:860/1480 train_time:130358ms step_avg:153.36ms step:861/1480 train_time:130518ms step_avg:153.37ms step:862/1480 train_time:130681ms step_avg:153.38ms step:863/1480 train_time:130840ms step_avg:153.39ms step:864/1480 train_time:130998ms step_avg:153.39ms step:865/1480 train_time:131156ms step_avg:153.40ms step:866/1480 train_time:131315ms step_avg:153.41ms step:867/1480 train_time:131474ms step_avg:153.41ms step:868/1480 train_time:131632ms step_avg:153.42ms step:869/1480 train_time:131790ms step_avg:153.42ms step:870/1480 train_time:131948ms step_avg:153.43ms step:871/1480 train_time:132104ms step_avg:153.43ms step:872/1480 train_time:132262ms step_avg:153.44ms step:873/1480 train_time:132418ms step_avg:153.44ms step:874/1480 train_time:132579ms step_avg:153.45ms step:875/1480 train_time:132740ms step_avg:153.46ms step:875/1480 val_loss:3.5065 train_time:132813ms step_avg:153.54ms step:876/1480 train_time:132903ms step_avg:153.47ms step:877/1480 train_time:133061ms step_avg:153.47ms step:878/1480 train_time:133219ms step_avg:153.48ms step:879/1480 train_time:133377ms step_avg:153.48ms step:880/1480 train_time:133536ms step_avg:153.49ms step:881/1480 train_time:133693ms step_avg:153.49ms step:882/1480 train_time:133853ms step_avg:153.50ms step:883/1480 train_time:134013ms step_avg:153.51ms step:884/1480 train_time:134174ms step_avg:153.52ms step:885/1480 train_time:134335ms step_avg:153.53ms step:886/1480 train_time:134494ms step_avg:153.53ms step:887/1480 train_time:134655ms step_avg:153.54ms step:888/1480 train_time:134820ms step_avg:153.55ms step:889/1480 train_time:134981ms step_avg:153.56ms step:890/1480 train_time:135138ms step_avg:153.57ms step:891/1480 train_time:135296ms step_avg:153.57ms step:892/1480 train_time:135457ms step_avg:153.58ms step:893/1480 train_time:135615ms step_avg:153.58ms step:894/1480 train_time:135776ms step_avg:153.59ms step:895/1480 train_time:135940ms step_avg:153.60ms step:896/1480 train_time:136098ms step_avg:153.61ms step:897/1480 train_time:136260ms step_avg:153.62ms step:898/1480 train_time:136419ms step_avg:153.63ms step:899/1480 train_time:136579ms step_avg:153.63ms step:900/1480 train_time:136737ms step_avg:153.64ms step:901/1480 train_time:136897ms step_avg:153.64ms step:902/1480 train_time:137056ms step_avg:153.65ms step:903/1480 train_time:137218ms step_avg:153.66ms step:904/1480 train_time:137377ms step_avg:153.67ms step:905/1480 train_time:137536ms step_avg:153.67ms step:906/1480 train_time:137697ms step_avg:153.68ms step:907/1480 train_time:137859ms step_avg:153.69ms step:908/1480 train_time:138017ms step_avg:153.69ms step:909/1480 train_time:138177ms step_avg:153.70ms step:910/1480 train_time:138341ms step_avg:153.71ms step:911/1480 train_time:138499ms step_avg:153.72ms step:912/1480 train_time:138660ms step_avg:153.72ms step:913/1480 train_time:138821ms step_avg:153.73ms step:914/1480 train_time:138981ms step_avg:153.74ms step:915/1480 train_time:139143ms step_avg:153.75ms step:916/1480 train_time:139301ms step_avg:153.75ms step:917/1480 train_time:139460ms step_avg:153.76ms step:918/1480 train_time:139621ms step_avg:153.77ms step:919/1480 train_time:139782ms step_avg:153.78ms step:920/1480 train_time:139942ms step_avg:153.78ms step:921/1480 train_time:140101ms step_avg:153.79ms step:922/1480 train_time:140261ms step_avg:153.80ms step:923/1480 train_time:140420ms step_avg:153.80ms step:924/1480 train_time:140579ms step_avg:153.81ms step:925/1480 train_time:140739ms step_avg:153.81ms step:926/1480 train_time:140897ms step_avg:153.82ms step:927/1480 train_time:141056ms step_avg:153.82ms step:928/1480 train_time:141216ms step_avg:153.83ms step:929/1480 train_time:141376ms step_avg:153.84ms step:930/1480 train_time:141536ms step_avg:153.84ms step:931/1480 train_time:141696ms step_avg:153.85ms step:932/1480 train_time:141856ms step_avg:153.86ms step:933/1480 train_time:142016ms step_avg:153.86ms step:934/1480 train_time:142177ms step_avg:153.87ms step:935/1480 train_time:142339ms step_avg:153.88ms step:936/1480 train_time:142497ms step_avg:153.88ms step:937/1480 train_time:142659ms step_avg:153.89ms step:938/1480 train_time:142817ms step_avg:153.90ms step:939/1480 train_time:142979ms step_avg:153.91ms step:940/1480 train_time:143140ms step_avg:153.91ms step:941/1480 train_time:143298ms step_avg:153.92ms step:942/1480 train_time:143458ms step_avg:153.92ms step:943/1480 train_time:143617ms step_avg:153.93ms step:944/1480 train_time:143781ms step_avg:153.94ms step:945/1480 train_time:143941ms step_avg:153.95ms step:946/1480 train_time:144103ms step_avg:153.96ms step:947/1480 train_time:144262ms step_avg:153.96ms step:948/1480 train_time:144421ms step_avg:153.97ms step:949/1480 train_time:144595ms step_avg:153.99ms step:950/1480 train_time:144741ms step_avg:153.98ms step:951/1480 train_time:144903ms step_avg:153.99ms step:952/1480 train_time:145061ms step_avg:153.99ms step:953/1480 train_time:145221ms step_avg:154.00ms step:954/1480 train_time:145382ms step_avg:154.01ms step:955/1480 train_time:145539ms step_avg:154.01ms step:956/1480 train_time:145697ms step_avg:154.01ms step:957/1480 train_time:145856ms step_avg:154.02ms step:958/1480 train_time:146023ms step_avg:154.03ms step:959/1480 train_time:146182ms step_avg:154.04ms step:960/1480 train_time:146342ms step_avg:154.04ms step:961/1480 train_time:146501ms step_avg:154.05ms step:962/1480 train_time:146659ms step_avg:154.05ms step:963/1480 train_time:146820ms step_avg:154.06ms step:964/1480 train_time:146981ms step_avg:154.07ms step:965/1480 train_time:147140ms step_avg:154.07ms step:966/1480 train_time:147299ms step_avg:154.08ms step:967/1480 train_time:147457ms step_avg:154.08ms step:968/1480 train_time:147617ms step_avg:154.09ms step:969/1480 train_time:147778ms step_avg:154.10ms step:970/1480 train_time:147936ms step_avg:154.10ms step:971/1480 train_time:148096ms step_avg:154.11ms step:972/1480 train_time:148255ms step_avg:154.11ms step:973/1480 train_time:148414ms step_avg:154.12ms step:974/1480 train_time:148574ms step_avg:154.12ms step:975/1480 train_time:148735ms step_avg:154.13ms step:976/1480 train_time:148895ms step_avg:154.14ms step:977/1480 train_time:149055ms step_avg:154.14ms step:978/1480 train_time:149216ms step_avg:154.15ms step:979/1480 train_time:149377ms step_avg:154.16ms step:980/1480 train_time:149536ms step_avg:154.16ms step:981/1480 train_time:149699ms step_avg:154.17ms step:982/1480 train_time:149859ms step_avg:154.18ms step:983/1480 train_time:150019ms step_avg:154.18ms step:984/1480 train_time:150178ms step_avg:154.19ms step:985/1480 train_time:150340ms step_avg:154.20ms step:986/1480 train_time:150499ms step_avg:154.20ms step:987/1480 train_time:150658ms step_avg:154.20ms step:988/1480 train_time:150818ms step_avg:154.21ms step:989/1480 train_time:150976ms step_avg:154.21ms step:990/1480 train_time:151138ms step_avg:154.22ms step:991/1480 train_time:151299ms step_avg:154.23ms step:992/1480 train_time:151464ms step_avg:154.24ms step:993/1480 train_time:151631ms step_avg:154.25ms step:994/1480 train_time:151792ms step_avg:154.26ms step:995/1480 train_time:151951ms step_avg:154.26ms step:996/1480 train_time:152108ms step_avg:154.27ms step:997/1480 train_time:152267ms step_avg:154.27ms step:998/1480 train_time:152426ms step_avg:154.28ms step:999/1480 train_time:152586ms step_avg:154.28ms step:1000/1480 train_time:152746ms step_avg:154.29ms step:1000/1480 val_loss:3.4424 train_time:152818ms step_avg:154.36ms step:1001/1480 train_time:152914ms step_avg:154.30ms step:1002/1480 train_time:153070ms step_avg:154.30ms step:1003/1480 train_time:153233ms step_avg:154.31ms step:1004/1480 train_time:153395ms step_avg:154.32ms step:1005/1480 train_time:153554ms step_avg:154.33ms step:1006/1480 train_time:153715ms step_avg:154.33ms step:1007/1480 train_time:153876ms step_avg:154.34ms step:1008/1480 train_time:154036ms step_avg:154.34ms step:1009/1480 train_time:154201ms step_avg:154.36ms step:1010/1480 train_time:154360ms step_avg:154.36ms step:1011/1480 train_time:154519ms step_avg:154.36ms step:1012/1480 train_time:154677ms step_avg:154.37ms step:1013/1480 train_time:154837ms step_avg:154.37ms step:1014/1480 train_time:154997ms step_avg:154.38ms step:1015/1480 train_time:155159ms step_avg:154.39ms step:1016/1480 train_time:155318ms step_avg:154.39ms step:1017/1480 train_time:155479ms step_avg:154.40ms step:1018/1480 train_time:155639ms step_avg:154.40ms step:1019/1480 train_time:155799ms step_avg:154.41ms step:1020/1480 train_time:155960ms step_avg:154.42ms step:1021/1480 train_time:156119ms step_avg:154.42ms step:1022/1480 train_time:156278ms step_avg:154.42ms step:1023/1480 train_time:156439ms step_avg:154.43ms step:1024/1480 train_time:156599ms step_avg:154.44ms step:1025/1480 train_time:156762ms step_avg:154.45ms step:1026/1480 train_time:156921ms step_avg:154.45ms step:1027/1480 train_time:157080ms step_avg:154.45ms step:1028/1480 train_time:157245ms step_avg:154.46ms step:1029/1480 train_time:157410ms step_avg:154.48ms step:1030/1480 train_time:157571ms step_avg:154.48ms step:1031/1480 train_time:157730ms step_avg:154.49ms step:1032/1480 train_time:157897ms step_avg:154.50ms step:1033/1480 train_time:158058ms step_avg:154.50ms step:1034/1480 train_time:158218ms step_avg:154.51ms step:1035/1480 train_time:158378ms step_avg:154.52ms step:1036/1480 train_time:158538ms step_avg:154.52ms step:1037/1480 train_time:158698ms step_avg:154.53ms step:1038/1480 train_time:158856ms step_avg:154.53ms step:1039/1480 train_time:159018ms step_avg:154.54ms step:1040/1480 train_time:159178ms step_avg:154.54ms step:1041/1480 train_time:159338ms step_avg:154.55ms step:1042/1480 train_time:159497ms step_avg:154.55ms step:1043/1480 train_time:159656ms step_avg:154.56ms step:1044/1480 train_time:159816ms step_avg:154.56ms step:1045/1480 train_time:159976ms step_avg:154.57ms step:1046/1480 train_time:160135ms step_avg:154.57ms step:1047/1480 train_time:160294ms step_avg:154.58ms step:1048/1480 train_time:160457ms step_avg:154.58ms step:1049/1480 train_time:160618ms step_avg:154.59ms step:1050/1480 train_time:160779ms step_avg:154.59ms step:1051/1480 train_time:160939ms step_avg:154.60ms step:1052/1480 train_time:161098ms step_avg:154.60ms step:1053/1480 train_time:161258ms step_avg:154.61ms step:1054/1480 train_time:161419ms step_avg:154.62ms step:1055/1480 train_time:161579ms step_avg:154.62ms step:1056/1480 train_time:161738ms step_avg:154.63ms step:1057/1480 train_time:161897ms step_avg:154.63ms step:1058/1480 train_time:162059ms step_avg:154.64ms step:1059/1480 train_time:162221ms step_avg:154.64ms step:1060/1480 train_time:162383ms step_avg:154.65ms step:1061/1480 train_time:162541ms step_avg:154.65ms step:1062/1480 train_time:162700ms step_avg:154.66ms step:1063/1480 train_time:162859ms step_avg:154.66ms step:1064/1480 train_time:163016ms step_avg:154.66ms step:1065/1480 train_time:163177ms step_avg:154.67ms step:1066/1480 train_time:163339ms step_avg:154.68ms step:1067/1480 train_time:163504ms step_avg:154.69ms step:1068/1480 train_time:163665ms step_avg:154.69ms step:1069/1480 train_time:163829ms step_avg:154.70ms step:1070/1480 train_time:163990ms step_avg:154.71ms step:1071/1480 train_time:164152ms step_avg:154.71ms step:1072/1480 train_time:164312ms step_avg:154.72ms step:1073/1480 train_time:164472ms step_avg:154.72ms step:1074/1480 train_time:164632ms step_avg:154.73ms step:1075/1480 train_time:164794ms step_avg:154.74ms step:1076/1480 train_time:164953ms step_avg:154.74ms step:1077/1480 train_time:165113ms step_avg:154.75ms step:1078/1480 train_time:165277ms step_avg:154.75ms step:1079/1480 train_time:165440ms step_avg:154.76ms step:1080/1480 train_time:165601ms step_avg:154.77ms step:1081/1480 train_time:165761ms step_avg:154.77ms step:1082/1480 train_time:165921ms step_avg:154.78ms step:1083/1480 train_time:166081ms step_avg:154.78ms step:1084/1480 train_time:166240ms step_avg:154.79ms step:1085/1480 train_time:166401ms step_avg:154.79ms step:1086/1480 train_time:166561ms step_avg:154.80ms step:1087/1480 train_time:166720ms step_avg:154.80ms step:1088/1480 train_time:166882ms step_avg:154.81ms step:1089/1480 train_time:167046ms step_avg:154.82ms step:1090/1480 train_time:167210ms step_avg:154.82ms step:1091/1480 train_time:167372ms step_avg:154.83ms step:1092/1480 train_time:167533ms step_avg:154.84ms step:1093/1480 train_time:167694ms step_avg:154.84ms step:1094/1480 train_time:167854ms step_avg:154.85ms step:1095/1480 train_time:168015ms step_avg:154.85ms step:1096/1480 train_time:168178ms step_avg:154.86ms step:1097/1480 train_time:168338ms step_avg:154.86ms step:1098/1480 train_time:168500ms step_avg:154.87ms step:1099/1480 train_time:168662ms step_avg:154.88ms step:1100/1480 train_time:168825ms step_avg:154.88ms step:1101/1480 train_time:168988ms step_avg:154.89ms step:1102/1480 train_time:169150ms step_avg:154.90ms step:1103/1480 train_time:169319ms step_avg:154.91ms step:1104/1480 train_time:169479ms step_avg:154.92ms step:1105/1480 train_time:169640ms step_avg:154.92ms step:1106/1480 train_time:169801ms step_avg:154.93ms step:1107/1480 train_time:169961ms step_avg:154.93ms step:1108/1480 train_time:170120ms step_avg:154.94ms step:1109/1480 train_time:170280ms step_avg:154.94ms step:1110/1480 train_time:170443ms step_avg:154.95ms step:1111/1480 train_time:170603ms step_avg:154.95ms step:1112/1480 train_time:170764ms step_avg:154.96ms step:1113/1480 train_time:170933ms step_avg:154.97ms step:1114/1480 train_time:171096ms step_avg:154.98ms step:1115/1480 train_time:171257ms step_avg:154.98ms step:1116/1480 train_time:171417ms step_avg:154.99ms step:1117/1480 train_time:171581ms step_avg:155.00ms step:1118/1480 train_time:171746ms step_avg:155.01ms step:1119/1480 train_time:171907ms step_avg:155.01ms step:1120/1480 train_time:172069ms step_avg:155.02ms step:1121/1480 train_time:172231ms step_avg:155.02ms step:1122/1480 train_time:172393ms step_avg:155.03ms step:1123/1480 train_time:172554ms step_avg:155.03ms step:1124/1480 train_time:172718ms step_avg:155.04ms step:1125/1480 train_time:172879ms step_avg:155.05ms step:1125/1480 val_loss:3.3878 train_time:172953ms step_avg:155.12ms step:1126/1480 train_time:173044ms step_avg:155.06ms step:1127/1480 train_time:173207ms step_avg:155.06ms step:1128/1480 train_time:173369ms step_avg:155.07ms step:1129/1480 train_time:173531ms step_avg:155.08ms step:1130/1480 train_time:173690ms step_avg:155.08ms step:1131/1480 train_time:173858ms step_avg:155.09ms step:1132/1480 train_time:174019ms step_avg:155.10ms step:1133/1480 train_time:174183ms step_avg:155.11ms step:1134/1480 train_time:174347ms step_avg:155.11ms step:1135/1480 train_time:174508ms step_avg:155.12ms step:1136/1480 train_time:174670ms step_avg:155.12ms step:1137/1480 train_time:174830ms step_avg:155.13ms step:1138/1480 train_time:174996ms step_avg:155.14ms step:1139/1480 train_time:175167ms step_avg:155.15ms step:1140/1480 train_time:175322ms step_avg:155.15ms step:1141/1480 train_time:175486ms step_avg:155.16ms step:1142/1480 train_time:175647ms step_avg:155.17ms step:1143/1480 train_time:175811ms step_avg:155.17ms step:1144/1480 train_time:175972ms step_avg:155.18ms step:1145/1480 train_time:176131ms step_avg:155.18ms step:1146/1480 train_time:176295ms step_avg:155.19ms step:1147/1480 train_time:176455ms step_avg:155.19ms step:1148/1480 train_time:176618ms step_avg:155.20ms step:1149/1480 train_time:176784ms step_avg:155.21ms step:1150/1480 train_time:176945ms step_avg:155.21ms step:1151/1480 train_time:177108ms step_avg:155.22ms step:1152/1480 train_time:177271ms step_avg:155.23ms step:1153/1480 train_time:177435ms step_avg:155.24ms step:1154/1480 train_time:177596ms step_avg:155.24ms step:1155/1480 train_time:177757ms step_avg:155.25ms step:1156/1480 train_time:177926ms step_avg:155.26ms step:1157/1480 train_time:178090ms step_avg:155.27ms step:1158/1480 train_time:178249ms step_avg:155.27ms step:1159/1480 train_time:178409ms step_avg:155.27ms step:1160/1480 train_time:178568ms step_avg:155.28ms step:1161/1480 train_time:178731ms step_avg:155.28ms step:1162/1480 train_time:178893ms step_avg:155.29ms step:1163/1480 train_time:179056ms step_avg:155.30ms step:1164/1480 train_time:179221ms step_avg:155.30ms step:1165/1480 train_time:179381ms step_avg:155.31ms step:1166/1480 train_time:179544ms step_avg:155.31ms step:1167/1480 train_time:179703ms step_avg:155.32ms step:1168/1480 train_time:179865ms step_avg:155.32ms step:1169/1480 train_time:180027ms step_avg:155.33ms step:1170/1480 train_time:180187ms step_avg:155.33ms step:1171/1480 train_time:180349ms step_avg:155.34ms step:1172/1480 train_time:180509ms step_avg:155.34ms step:1173/1480 train_time:180670ms step_avg:155.35ms step:1174/1480 train_time:180841ms step_avg:155.36ms step:1175/1480 train_time:181004ms step_avg:155.37ms step:1176/1480 train_time:181168ms step_avg:155.38ms step:1177/1480 train_time:181335ms step_avg:155.39ms step:1178/1480 train_time:181497ms step_avg:155.39ms step:1179/1480 train_time:181656ms step_avg:155.39ms step:1180/1480 train_time:181826ms step_avg:155.41ms step:1181/1480 train_time:181989ms step_avg:155.41ms step:1182/1480 train_time:182149ms step_avg:155.42ms step:1183/1480 train_time:182311ms step_avg:155.42ms step:1184/1480 train_time:182473ms step_avg:155.43ms step:1185/1480 train_time:182637ms step_avg:155.44ms step:1186/1480 train_time:182801ms step_avg:155.44ms step:1187/1480 train_time:182971ms step_avg:155.46ms step:1188/1480 train_time:183130ms step_avg:155.46ms step:1189/1480 train_time:183291ms step_avg:155.46ms step:1190/1480 train_time:183452ms step_avg:155.47ms step:1191/1480 train_time:183614ms step_avg:155.47ms step:1192/1480 train_time:183774ms step_avg:155.48ms step:1193/1480 train_time:183935ms step_avg:155.48ms step:1194/1480 train_time:184097ms step_avg:155.49ms step:1195/1480 train_time:184260ms step_avg:155.49ms step:1196/1480 train_time:184431ms step_avg:155.51ms step:1197/1480 train_time:184593ms step_avg:155.51ms step:1198/1480 train_time:184763ms step_avg:155.52ms step:1199/1480 train_time:184926ms step_avg:155.53ms step:1200/1480 train_time:185088ms step_avg:155.54ms step:1201/1480 train_time:185248ms step_avg:155.54ms step:1202/1480 train_time:185418ms step_avg:155.55ms step:1203/1480 train_time:185584ms step_avg:155.56ms step:1204/1480 train_time:185749ms step_avg:155.57ms step:1205/1480 train_time:185910ms step_avg:155.57ms step:1206/1480 train_time:186070ms step_avg:155.58ms step:1207/1480 train_time:186230ms step_avg:155.58ms step:1208/1480 train_time:186390ms step_avg:155.58ms step:1209/1480 train_time:186553ms step_avg:155.59ms step:1210/1480 train_time:186720ms step_avg:155.60ms step:1211/1480 train_time:186884ms step_avg:155.61ms step:1212/1480 train_time:187047ms step_avg:155.61ms step:1213/1480 train_time:187210ms step_avg:155.62ms step:1214/1480 train_time:187376ms step_avg:155.63ms step:1215/1480 train_time:187541ms step_avg:155.64ms step:1216/1480 train_time:187702ms step_avg:155.64ms step:1217/1480 train_time:187867ms step_avg:155.65ms step:1218/1480 train_time:188030ms step_avg:155.65ms step:1219/1480 train_time:188199ms step_avg:155.67ms step:1220/1480 train_time:188363ms step_avg:155.67ms step:1221/1480 train_time:188525ms step_avg:155.68ms step:1222/1480 train_time:188686ms step_avg:155.68ms step:1223/1480 train_time:188849ms step_avg:155.69ms step:1224/1480 train_time:189013ms step_avg:155.69ms step:1225/1480 train_time:189177ms step_avg:155.70ms step:1226/1480 train_time:189343ms step_avg:155.71ms step:1227/1480 train_time:189508ms step_avg:155.72ms step:1228/1480 train_time:189669ms step_avg:155.72ms step:1229/1480 train_time:189831ms step_avg:155.73ms step:1230/1480 train_time:190002ms step_avg:155.74ms step:1231/1480 train_time:190169ms step_avg:155.75ms step:1232/1480 train_time:190334ms step_avg:155.76ms step:1233/1480 train_time:190494ms step_avg:155.76ms step:1234/1480 train_time:190657ms step_avg:155.77ms step:1235/1480 train_time:190823ms step_avg:155.77ms step:1236/1480 train_time:190984ms step_avg:155.78ms step:1237/1480 train_time:191146ms step_avg:155.78ms step:1238/1480 train_time:191317ms step_avg:155.80ms step:1239/1480 train_time:191481ms step_avg:155.80ms step:1240/1480 train_time:191645ms step_avg:155.81ms step:1241/1480 train_time:191811ms step_avg:155.82ms step:1242/1480 train_time:191972ms step_avg:155.82ms step:1243/1480 train_time:192135ms step_avg:155.83ms step:1244/1480 train_time:192297ms step_avg:155.83ms step:1245/1480 train_time:192462ms step_avg:155.84ms step:1246/1480 train_time:192624ms step_avg:155.84ms step:1247/1480 train_time:192786ms step_avg:155.85ms step:1248/1480 train_time:192948ms step_avg:155.85ms step:1249/1480 train_time:193108ms step_avg:155.86ms step:1250/1480 train_time:193270ms step_avg:155.86ms step:1250/1480 val_loss:3.3375 train_time:193345ms step_avg:155.92ms step:1251/1480 train_time:193439ms step_avg:155.87ms step:1252/1480 train_time:193601ms step_avg:155.88ms step:1253/1480 train_time:193762ms step_avg:155.88ms step:1254/1480 train_time:193922ms step_avg:155.89ms step:1255/1480 train_time:194094ms step_avg:155.90ms step:1256/1480 train_time:194258ms step_avg:155.91ms step:1257/1480 train_time:194420ms step_avg:155.91ms step:1258/1480 train_time:194583ms step_avg:155.92ms step:1259/1480 train_time:194747ms step_avg:155.92ms step:1260/1480 train_time:194907ms step_avg:155.93ms step:1261/1480 train_time:195069ms step_avg:155.93ms step:1262/1480 train_time:195234ms step_avg:155.94ms step:1263/1480 train_time:195400ms step_avg:155.95ms step:1264/1480 train_time:195560ms step_avg:155.95ms step:1265/1480 train_time:195720ms step_avg:155.95ms step:1266/1480 train_time:195883ms step_avg:155.96ms step:1267/1480 train_time:196042ms step_avg:155.96ms step:1268/1480 train_time:196203ms step_avg:155.96ms step:1269/1480 train_time:196368ms step_avg:155.97ms step:1270/1480 train_time:196530ms step_avg:155.98ms step:1271/1480 train_time:196695ms step_avg:155.98ms step:1272/1480 train_time:196856ms step_avg:155.99ms step:1273/1480 train_time:197021ms step_avg:155.99ms step:1274/1480 train_time:197185ms step_avg:156.00ms step:1275/1480 train_time:197346ms step_avg:156.00ms step:1276/1480 train_time:197505ms step_avg:156.01ms step:1277/1480 train_time:197668ms step_avg:156.01ms step:1278/1480 train_time:197828ms step_avg:156.02ms step:1279/1480 train_time:197992ms step_avg:156.02ms step:1280/1480 train_time:198158ms step_avg:156.03ms step:1281/1480 train_time:198320ms step_avg:156.03ms step:1282/1480 train_time:198479ms step_avg:156.04ms step:1283/1480 train_time:198641ms step_avg:156.04ms step:1284/1480 train_time:198804ms step_avg:156.05ms step:1285/1480 train_time:198967ms step_avg:156.05ms step:1286/1480 train_time:199128ms step_avg:156.06ms step:1287/1480 train_time:199292ms step_avg:156.06ms step:1288/1480 train_time:199455ms step_avg:156.07ms step:1289/1480 train_time:199623ms step_avg:156.08ms step:1290/1480 train_time:199793ms step_avg:156.09ms step:1291/1480 train_time:199958ms step_avg:156.09ms step:1292/1480 train_time:200121ms step_avg:156.10ms step:1293/1480 train_time:200288ms step_avg:156.11ms step:1294/1480 train_time:200452ms step_avg:156.12ms step:1295/1480 train_time:200617ms step_avg:156.12ms step:1296/1480 train_time:200780ms step_avg:156.13ms step:1297/1480 train_time:200943ms step_avg:156.13ms step:1298/1480 train_time:201105ms step_avg:156.14ms step:1299/1480 train_time:201266ms step_avg:156.14ms step:1300/1480 train_time:201426ms step_avg:156.14ms step:1301/1480 train_time:201590ms step_avg:156.15ms step:1302/1480 train_time:201755ms step_avg:156.16ms step:1303/1480 train_time:201922ms step_avg:156.17ms step:1304/1480 train_time:202087ms step_avg:156.17ms step:1305/1480 train_time:202249ms step_avg:156.18ms step:1306/1480 train_time:202415ms step_avg:156.18ms step:1307/1480 train_time:202577ms step_avg:156.19ms step:1308/1480 train_time:202739ms step_avg:156.19ms step:1309/1480 train_time:202903ms step_avg:156.20ms step:1310/1480 train_time:203064ms step_avg:156.20ms step:1311/1480 train_time:203225ms step_avg:156.21ms step:1312/1480 train_time:203390ms step_avg:156.21ms step:1313/1480 train_time:203554ms step_avg:156.22ms step:1314/1480 train_time:203720ms step_avg:156.23ms step:1315/1480 train_time:203883ms step_avg:156.23ms step:1316/1480 train_time:204042ms step_avg:156.23ms step:1317/1480 train_time:204204ms step_avg:156.24ms step:1318/1480 train_time:204371ms step_avg:156.25ms step:1319/1480 train_time:204538ms step_avg:156.26ms step:1320/1480 train_time:204705ms step_avg:156.26ms step:1321/1480 train_time:204869ms step_avg:156.27ms step:1322/1480 train_time:205040ms step_avg:156.28ms step:1323/1480 train_time:205202ms step_avg:156.29ms step:1324/1480 train_time:205366ms step_avg:156.29ms step:1325/1480 train_time:205536ms step_avg:156.30ms step:1326/1480 train_time:205703ms step_avg:156.31ms step:1327/1480 train_time:205864ms step_avg:156.31ms step:1328/1480 train_time:206026ms step_avg:156.32ms step:1329/1480 train_time:206209ms step_avg:156.34ms step:1330/1480 train_time:206374ms step_avg:156.34ms step:1331/1480 train_time:206538ms step_avg:156.35ms step:1332/1480 train_time:206701ms step_avg:156.35ms step:1333/1480 train_time:206865ms step_avg:156.36ms step:1334/1480 train_time:207028ms step_avg:156.37ms step:1335/1480 train_time:207188ms step_avg:156.37ms step:1336/1480 train_time:207358ms step_avg:156.38ms step:1337/1480 train_time:207524ms step_avg:156.39ms step:1338/1480 train_time:207688ms step_avg:156.39ms step:1339/1480 train_time:207852ms step_avg:156.40ms step:1340/1480 train_time:208017ms step_avg:156.40ms step:1341/1480 train_time:208179ms step_avg:156.41ms step:1342/1480 train_time:208344ms step_avg:156.41ms step:1343/1480 train_time:208505ms step_avg:156.42ms step:1344/1480 train_time:208666ms step_avg:156.42ms step:1345/1480 train_time:208835ms step_avg:156.43ms step:1346/1480 train_time:208997ms step_avg:156.44ms step:1347/1480 train_time:209159ms step_avg:156.44ms step:1348/1480 train_time:209323ms step_avg:156.44ms step:1349/1480 train_time:209485ms step_avg:156.45ms step:1350/1480 train_time:209651ms step_avg:156.46ms step:1351/1480 train_time:209814ms step_avg:156.46ms step:1352/1480 train_time:209978ms step_avg:156.47ms step:1353/1480 train_time:210143ms step_avg:156.47ms step:1354/1480 train_time:210306ms step_avg:156.48ms step:1355/1480 train_time:210467ms step_avg:156.48ms step:1356/1480 train_time:210631ms step_avg:156.49ms step:1357/1480 train_time:210797ms step_avg:156.49ms step:1358/1480 train_time:210961ms step_avg:156.50ms step:1359/1480 train_time:211125ms step_avg:156.50ms step:1360/1480 train_time:211291ms step_avg:156.51ms step:1361/1480 train_time:211458ms step_avg:156.52ms step:1362/1480 train_time:211622ms step_avg:156.53ms step:1363/1480 train_time:211790ms step_avg:156.53ms step:1364/1480 train_time:211952ms step_avg:156.54ms step:1365/1480 train_time:212114ms step_avg:156.54ms step:1366/1480 train_time:212278ms step_avg:156.55ms step:1367/1480 train_time:212440ms step_avg:156.55ms step:1368/1480 train_time:212605ms step_avg:156.56ms step:1369/1480 train_time:212775ms step_avg:156.57ms step:1370/1480 train_time:212940ms step_avg:156.57ms step:1371/1480 train_time:213103ms step_avg:156.58ms step:1372/1480 train_time:213270ms step_avg:156.59ms step:1373/1480 train_time:213432ms step_avg:156.59ms step:1374/1480 train_time:213600ms step_avg:156.60ms step:1375/1480 train_time:213761ms step_avg:156.60ms step:1375/1480 val_loss:3.2983 train_time:213835ms step_avg:156.66ms step:1376/1480 train_time:213929ms step_avg:156.61ms step:1377/1480 train_time:214091ms step_avg:156.61ms step:1378/1480 train_time:214254ms step_avg:156.62ms step:1379/1480 train_time:214418ms step_avg:156.62ms step:1380/1480 train_time:214582ms step_avg:156.63ms step:1381/1480 train_time:214751ms step_avg:156.64ms step:1382/1480 train_time:214914ms step_avg:156.64ms step:1383/1480 train_time:215078ms step_avg:156.65ms step:1384/1480 train_time:215244ms step_avg:156.65ms step:1385/1480 train_time:215405ms step_avg:156.66ms step:1386/1480 train_time:215567ms step_avg:156.66ms step:1387/1480 train_time:215733ms step_avg:156.67ms step:1388/1480 train_time:215895ms step_avg:156.67ms step:1389/1480 train_time:216060ms step_avg:156.68ms step:1390/1480 train_time:216221ms step_avg:156.68ms step:1391/1480 train_time:216385ms step_avg:156.69ms step:1392/1480 train_time:216549ms step_avg:156.69ms step:1393/1480 train_time:216712ms step_avg:156.70ms step:1394/1480 train_time:216874ms step_avg:156.70ms step:1395/1480 train_time:217037ms step_avg:156.71ms step:1396/1480 train_time:217198ms step_avg:156.71ms step:1397/1480 train_time:217357ms step_avg:156.71ms step:1398/1480 train_time:217518ms step_avg:156.71ms step:1399/1480 train_time:217678ms step_avg:156.72ms step:1400/1480 train_time:217847ms step_avg:156.72ms step:1401/1480 train_time:218009ms step_avg:156.73ms step:1402/1480 train_time:218171ms step_avg:156.73ms step:1403/1480 train_time:218338ms step_avg:156.74ms step:1404/1480 train_time:218501ms step_avg:156.74ms step:1405/1480 train_time:218668ms step_avg:156.75ms step:1406/1480 train_time:218833ms step_avg:156.76ms step:1407/1480 train_time:218995ms step_avg:156.76ms step:1408/1480 train_time:219156ms step_avg:156.76ms step:1409/1480 train_time:219329ms step_avg:156.78ms step:1410/1480 train_time:219491ms step_avg:156.78ms step:1411/1480 train_time:219653ms step_avg:156.78ms step:1412/1480 train_time:219815ms step_avg:156.79ms step:1413/1480 train_time:219978ms step_avg:156.79ms step:1414/1480 train_time:220142ms step_avg:156.80ms step:1415/1480 train_time:220307ms step_avg:156.80ms step:1416/1480 train_time:220481ms step_avg:156.81ms step:1417/1480 train_time:220647ms step_avg:156.82ms step:1418/1480 train_time:220811ms step_avg:156.83ms step:1419/1480 train_time:220976ms step_avg:156.83ms step:1420/1480 train_time:221140ms step_avg:156.84ms step:1421/1480 train_time:221305ms step_avg:156.84ms step:1422/1480 train_time:221471ms step_avg:156.85ms step:1423/1480 train_time:221633ms step_avg:156.85ms step:1424/1480 train_time:221799ms step_avg:156.86ms step:1425/1480 train_time:221969ms step_avg:156.87ms step:1426/1480 train_time:222134ms step_avg:156.87ms step:1427/1480 train_time:222298ms step_avg:156.88ms step:1428/1480 train_time:222459ms step_avg:156.88ms step:1429/1480 train_time:222620ms step_avg:156.89ms step:1430/1480 train_time:222785ms step_avg:156.89ms step:1431/1480 train_time:222953ms step_avg:156.90ms step:1432/1480 train_time:223120ms step_avg:156.91ms step:1433/1480 train_time:223289ms step_avg:156.91ms step:1434/1480 train_time:223458ms step_avg:156.92ms step:1435/1480 train_time:223623ms step_avg:156.93ms step:1436/1480 train_time:223788ms step_avg:156.93ms step:1437/1480 train_time:223951ms step_avg:156.94ms step:1438/1480 train_time:224113ms step_avg:156.94ms step:1439/1480 train_time:224278ms step_avg:156.95ms step:1440/1480 train_time:224441ms step_avg:156.95ms step:1441/1480 train_time:224604ms step_avg:156.96ms step:1442/1480 train_time:224771ms step_avg:156.96ms step:1443/1480 train_time:224943ms step_avg:156.97ms step:1444/1480 train_time:225108ms step_avg:156.98ms step:1445/1480 train_time:225270ms step_avg:156.98ms step:1446/1480 train_time:225437ms step_avg:156.99ms step:1447/1480 train_time:225604ms step_avg:157.00ms step:1448/1480 train_time:225767ms step_avg:157.00ms step:1449/1480 train_time:225933ms step_avg:157.01ms step:1450/1480 train_time:226097ms step_avg:157.01ms step:1451/1480 train_time:226259ms step_avg:157.02ms step:1452/1480 train_time:226426ms step_avg:157.02ms step:1453/1480 train_time:226590ms step_avg:157.03ms step:1454/1480 train_time:226753ms step_avg:157.03ms step:1455/1480 train_time:226920ms step_avg:157.04ms step:1456/1480 train_time:227084ms step_avg:157.04ms step:1457/1480 train_time:227247ms step_avg:157.05ms step:1458/1480 train_time:227409ms step_avg:157.05ms step:1459/1480 train_time:227575ms step_avg:157.06ms step:1460/1480 train_time:227738ms step_avg:157.06ms step:1461/1480 train_time:227902ms step_avg:157.07ms step:1462/1480 train_time:228068ms step_avg:157.07ms step:1463/1480 train_time:228235ms step_avg:157.08ms step:1464/1480 train_time:228400ms step_avg:157.08ms step:1465/1480 train_time:228563ms step_avg:157.09ms step:1466/1480 train_time:228726ms step_avg:157.09ms step:1467/1480 train_time:228890ms step_avg:157.10ms step:1468/1480 train_time:229055ms step_avg:157.10ms step:1469/1480 train_time:229217ms step_avg:157.11ms step:1470/1480 train_time:229385ms step_avg:157.11ms step:1471/1480 train_time:229557ms step_avg:157.12ms step:1472/1480 train_time:229729ms step_avg:157.13ms step:1473/1480 train_time:229893ms step_avg:157.14ms step:1474/1480 train_time:230059ms step_avg:157.14ms step:1475/1480 train_time:230228ms step_avg:157.15ms step:1476/1480 train_time:230392ms step_avg:157.16ms step:1477/1480 train_time:230559ms step_avg:157.16ms step:1478/1480 train_time:230731ms step_avg:157.17ms step:1479/1480 train_time:230895ms step_avg:157.18ms step:1480/1480 train_time:231057ms step_avg:157.18ms step:1480/1480 val_loss:3.2792 train_time:231133ms step_avg:157.23ms peak memory consumption: 34239 MiB