import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
import subprocess
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: 2D tensor to orthogonalize (asserted).
        steps: number of quintic iterations to run.
        eps: added to the norm before dividing, to avoid division by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # iterate on the wide orientation so that X @ X.T is the smaller of the two Gram matrices
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X


class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.

    The WORLD_SIZE and RANK environment variables must be set; they are read in __init__.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct element count, so that every group can share a set of
        # flat, equally sized all_gather receive buffers (one buffer per rank)
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """Apply one update; each rank orthogonalizes one parameter out of every world_size."""
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None

            def update_prev():
                # wait for the pending all_gather (if any) and apply the gathered updates to
                # the parameters it was issued for
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )

            for base_i in range(0, len(params), self.world_size):
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # the previous gather is only consumed here, after this rank's Newton-Schulz
                # call has been issued, so the communication can overlap with that compute
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))


class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))


class Rotary(torch.nn.Module):
    """Rotary position embedding over dim 3 of the input, with sequence positions on dim 1."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        # cos/sin tables are cached and only rebuilt when the sequence length changes
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention via flex_attention, with QK-norm, rotary embeddings and a
    learned mix of the projected values with an externally supplied value embedding."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y


class MLP(nn.Module):
    """Two-layer feed-forward network (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    """Transformer block: learned mix of the residual stream with x0, then attention and MLP
    residual branches, each applied to the RMS-normalized stream."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x


class ValueEmbedding(nn.Module):
    """Six token-embedding tables whose outputs are returned followed by the same outputs in
    reverse order (12 tensors in total)."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        # NOTE(review): the table count is fixed at 6, which lines up with GPT's encoder/decoder
        # split only for the default num_layers=12 -- confirm before changing num_layers.
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        ve += reversed(ve) # mirror the list: e0..e5, e5..e0
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768


class GPT(nn.Module):
    """GPT model with U-net style skip connections between its first and second half of
    blocks, per-layer token value embeddings and document-aware sliding-window attention."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """Return the cross-entropy loss of `targets` given the 1D token sequence `inputs`.

        `sliding_window_num_blocks` is the attention window, counted in BLOCK_SIZE-token blocks.
        The length of `inputs` must be a multiple of BLOCK_SIZE (required by the `.view` below).
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # running count of token id 50256 gives every position a document id
        # (presumably 50256 is the GPT-2 end-of-text delimiter -- confirm against the tokenizer)
        docs = (inputs == 50256).cumsum(0)
        # document id of the first / last token of every block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal and within the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # per row: number of selected blocks, and block indices with the selected ones first
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            # one index per BLOCK_SIZE-token block of the input; derived from the input length
            # so that the masks below broadcast for any sequence length (512 for 64*1024 tokens)
            kv_idx = block_idx = torch.arange(docs_low.size(0), dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # blocks that are attended but not fully unmasked still need the token-level mask_mod
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)


def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256 * 4 byte header into pinned memory."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens


class DistributedDataLoader:
    """Iterates over token shards; each process reads its own seq_len-token slice of every
    global batch of seq_len * num_processes tokens."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    logdir = Path("logs") / f"{run_id}"
    # parents=True: a fresh checkout has no logs/ directory yet
    logdir.mkdir(parents=True, exist_ok=True)
    logfile = Path("logs") / f"{run_id}.txt"
    print(logfile.stem)
    # create the log file
    with logfile.open("w") as f:
        # begin the log by printing this file (the Python code)
        print(code, file=f)
        print("=" * 100, file=f)

def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, print it."""
    if master_process:
        with logfile.open("a") as f:
            if not logonly:
                print(s)
            print(s, file=f)

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running python {sys.version}")
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# calculate the number of steps to take in the val loop.
assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0
val_steps = args.val_tokens // (args.sequence_length * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
inputs_train, targets_train = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768))
model = model.cuda().bfloat16()
# keep the CastedLinear weights in float32; they are cast to the activation dtype in forward
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()]
optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.blocks.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]

# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the learning-rate multiplier for iteration `it` (warmup, constant, cooldown)."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        if args.cooldown_iters == 0:
            # no cooldown configured: only reachable at it == num_iterations, where the
            # division below would raise ZeroDivisionError
            return 1.0
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda")
sw_num_blocks_prev = 1
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Linearly increase the sliding window size over training from 64 -> 1792 tokens, rounded to
    # the nearest whole number of 128-token blocks (1 -> 14 blocks). By @fernbear.bsky.social
    frac_done = step / args.num_iterations # training progress
    sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128)
    if sw_num_blocks != sw_num_blocks_prev:
        sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True)
        sw_num_blocks_prev = sw_num_blocks

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                inputs_val, targets_val = val_loader.next_batch()
                val_loss += model(inputs_val, targets_val, sliding_window_num_blocks)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        torch.save(log, logdir / f"state_step{step:06d}.pt")
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for i in range(1, train_accumulation_steps + 1):
        with contextlib.ExitStack() as stack:
            if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step
                stack.enter_context(model.no_sync())
            if step >= 5:
                stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True))
            model(inputs_train, targets_train, sliding_window_num_blocks).backward()
            inputs_train, targets_train = train_loader.next_batch()
    if train_accumulation_steps != 1:
        for p in model.parameters():
            p.grad /= train_accumulation_steps
    # momentum warmup for Muon
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()

# -----------------------------------------------------------------------------
# NOTE(review): nothing below this point is code. It is the start of the captured run log,
# which was fused onto the final source line. It is kept here verbatim as comments so that
# no log content is lost; the log continues in the uncommented lines that follow.
# ====================================================================================================
# Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]
# Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
# nvidia-smi:
# Wed Dec 11 10:20:03 2024
# +---------------------------------------------------------------------------------------+
# | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 |
# |-----------------------------------------+----------------------+----------------------+
# | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
# | | | MIG M.
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 31C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 37C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 39C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:28912ms step_avg:nanms step:2/1480 train_time:29018ms step_avg:nanms step:3/1480 train_time:29139ms step_avg:nanms step:4/1480 train_time:29280ms step_avg:nanms step:5/1480 train_time:29420ms step_avg:nanms step:6/1480 train_time:29560ms step_avg:nanms step:7/1480 train_time:29703ms step_avg:nanms step:8/1480 train_time:29848ms step_avg:nanms step:9/1480 train_time:29992ms step_avg:nanms step:10/1480 train_time:30136ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.72ms step:14/1480 train_time:568ms step_avg:141.91ms step:15/1480 train_time:711ms step_avg:142.22ms step:16/1480 train_time:855ms step_avg:142.55ms step:17/1480 train_time:997ms step_avg:142.49ms step:18/1480 train_time:1140ms step_avg:142.54ms step:19/1480 train_time:1282ms step_avg:142.50ms step:20/1480 train_time:1425ms step_avg:142.45ms step:21/1480 train_time:1567ms step_avg:142.50ms step:22/1480 train_time:1711ms step_avg:142.56ms step:23/1480 train_time:1855ms step_avg:142.70ms step:24/1480 train_time:1997ms step_avg:142.66ms step:25/1480 train_time:2140ms step_avg:142.68ms step:26/1480 train_time:2281ms step_avg:142.57ms step:27/1480 train_time:2423ms step_avg:142.54ms step:28/1480 train_time:2566ms step_avg:142.55ms 
step:29/1480 train_time:2710ms step_avg:142.62ms step:30/1480 train_time:2853ms step_avg:142.67ms step:31/1480 train_time:2997ms step_avg:142.70ms step:32/1480 train_time:3140ms step_avg:142.74ms step:33/1480 train_time:3282ms step_avg:142.70ms step:34/1480 train_time:3424ms step_avg:142.65ms step:35/1480 train_time:3565ms step_avg:142.58ms step:36/1480 train_time:3709ms step_avg:142.64ms step:37/1480 train_time:3852ms step_avg:142.68ms step:38/1480 train_time:3995ms step_avg:142.66ms step:39/1480 train_time:4138ms step_avg:142.70ms step:40/1480 train_time:4281ms step_avg:142.71ms step:41/1480 train_time:4424ms step_avg:142.72ms step:42/1480 train_time:4566ms step_avg:142.69ms step:43/1480 train_time:4710ms step_avg:142.71ms step:44/1480 train_time:4854ms step_avg:142.77ms step:45/1480 train_time:4997ms step_avg:142.78ms step:46/1480 train_time:5141ms step_avg:142.79ms step:47/1480 train_time:5283ms step_avg:142.78ms step:48/1480 train_time:5423ms step_avg:142.72ms step:49/1480 train_time:5566ms step_avg:142.71ms step:50/1480 train_time:5710ms step_avg:142.74ms step:51/1480 train_time:5853ms step_avg:142.75ms step:52/1480 train_time:5997ms step_avg:142.79ms step:53/1480 train_time:6139ms step_avg:142.77ms step:54/1480 train_time:6281ms step_avg:142.74ms step:55/1480 train_time:6424ms step_avg:142.76ms step:56/1480 train_time:6569ms step_avg:142.80ms step:57/1480 train_time:6713ms step_avg:142.84ms step:58/1480 train_time:6858ms step_avg:142.88ms step:59/1480 train_time:7000ms step_avg:142.86ms step:60/1480 train_time:7144ms step_avg:142.88ms step:61/1480 train_time:7285ms step_avg:142.85ms step:62/1480 train_time:7429ms step_avg:142.86ms step:63/1480 train_time:7572ms step_avg:142.86ms step:64/1480 train_time:7715ms step_avg:142.87ms step:65/1480 train_time:7859ms step_avg:142.89ms step:66/1480 train_time:8001ms step_avg:142.87ms step:67/1480 train_time:8142ms step_avg:142.85ms step:68/1480 train_time:8284ms step_avg:142.83ms step:69/1480 train_time:8427ms 
step_avg:142.83ms step:70/1480 train_time:8570ms step_avg:142.84ms step:71/1480 train_time:8713ms step_avg:142.83ms step:72/1480 train_time:8855ms step_avg:142.83ms step:73/1480 train_time:8998ms step_avg:142.83ms step:74/1480 train_time:9143ms step_avg:142.86ms step:75/1480 train_time:9284ms step_avg:142.83ms step:76/1480 train_time:9428ms step_avg:142.84ms step:77/1480 train_time:9570ms step_avg:142.84ms step:78/1480 train_time:9714ms step_avg:142.85ms step:79/1480 train_time:10223ms step_avg:148.16ms step:80/1480 train_time:10734ms step_avg:153.35ms step:81/1480 train_time:10833ms step_avg:152.58ms step:82/1480 train_time:10977ms step_avg:152.45ms step:83/1480 train_time:11119ms step_avg:152.31ms step:84/1480 train_time:11261ms step_avg:152.18ms step:85/1480 train_time:11402ms step_avg:152.03ms step:86/1480 train_time:11544ms step_avg:151.90ms step:87/1480 train_time:11689ms step_avg:151.81ms step:88/1480 train_time:11834ms step_avg:151.72ms step:89/1480 train_time:11979ms step_avg:151.64ms step:90/1480 train_time:12122ms step_avg:151.52ms step:91/1480 train_time:12263ms step_avg:151.39ms step:92/1480 train_time:12404ms step_avg:151.27ms step:93/1480 train_time:12547ms step_avg:151.17ms step:94/1480 train_time:12691ms step_avg:151.08ms step:95/1480 train_time:12834ms step_avg:150.99ms step:96/1480 train_time:13358ms step_avg:155.33ms step:97/1480 train_time:13462ms step_avg:154.73ms step:98/1480 train_time:13603ms step_avg:154.57ms step:99/1480 train_time:13745ms step_avg:154.44ms step:100/1480 train_time:13887ms step_avg:154.30ms step:101/1480 train_time:14032ms step_avg:154.20ms step:102/1480 train_time:14173ms step_avg:154.05ms step:103/1480 train_time:14317ms step_avg:153.94ms step:104/1480 train_time:14460ms step_avg:153.83ms step:105/1480 train_time:14602ms step_avg:153.71ms step:106/1480 train_time:14744ms step_avg:153.59ms step:107/1480 train_time:14887ms step_avg:153.47ms step:108/1480 train_time:15031ms step_avg:153.38ms step:109/1480 
train_time:15173ms step_avg:153.27ms step:110/1480 train_time:15316ms step_avg:153.16ms step:111/1480 train_time:15460ms step_avg:153.07ms step:112/1480 train_time:15604ms step_avg:152.98ms step:113/1480 train_time:15751ms step_avg:152.92ms step:114/1480 train_time:15896ms step_avg:152.85ms step:115/1480 train_time:16042ms step_avg:152.79ms step:116/1480 train_time:16187ms step_avg:152.71ms step:117/1480 train_time:16334ms step_avg:152.65ms step:118/1480 train_time:16480ms step_avg:152.60ms step:119/1480 train_time:16625ms step_avg:152.52ms step:120/1480 train_time:16770ms step_avg:152.46ms step:121/1480 train_time:16916ms step_avg:152.40ms step:122/1480 train_time:17062ms step_avg:152.34ms step:123/1480 train_time:17206ms step_avg:152.27ms step:124/1480 train_time:17354ms step_avg:152.23ms step:125/1480 train_time:17500ms step_avg:152.18ms step:125/1480 val_loss:4.4120 train_time:17565ms step_avg:152.74ms step:126/1480 train_time:17656ms step_avg:152.21ms step:127/1480 train_time:17802ms step_avg:152.15ms step:128/1480 train_time:17948ms step_avg:152.10ms step:129/1480 train_time:18094ms step_avg:152.05ms step:130/1480 train_time:18238ms step_avg:151.98ms step:131/1480 train_time:18383ms step_avg:151.93ms step:132/1480 train_time:18528ms step_avg:151.87ms step:133/1480 train_time:18674ms step_avg:151.82ms step:134/1480 train_time:18819ms step_avg:151.77ms step:135/1480 train_time:18966ms step_avg:151.73ms step:136/1480 train_time:19113ms step_avg:151.69ms step:137/1480 train_time:19256ms step_avg:151.63ms step:138/1480 train_time:19402ms step_avg:151.58ms step:139/1480 train_time:19547ms step_avg:151.53ms step:140/1480 train_time:19692ms step_avg:151.48ms step:141/1480 train_time:19836ms step_avg:151.42ms step:142/1480 train_time:19983ms step_avg:151.39ms step:143/1480 train_time:20130ms step_avg:151.36ms step:144/1480 train_time:20276ms step_avg:151.31ms step:145/1480 train_time:20421ms step_avg:151.27ms step:146/1480 train_time:20567ms step_avg:151.23ms 
step:147/1480 train_time:20713ms step_avg:151.19ms step:148/1480 train_time:20857ms step_avg:151.14ms step:149/1480 train_time:21003ms step_avg:151.10ms step:150/1480 train_time:21149ms step_avg:151.07ms step:151/1480 train_time:21295ms step_avg:151.03ms step:152/1480 train_time:21439ms step_avg:150.98ms step:153/1480 train_time:21586ms step_avg:150.95ms step:154/1480 train_time:21733ms step_avg:150.92ms step:155/1480 train_time:21878ms step_avg:150.88ms step:156/1480 train_time:22024ms step_avg:150.85ms step:157/1480 train_time:22170ms step_avg:150.82ms step:158/1480 train_time:22315ms step_avg:150.78ms step:159/1480 train_time:22460ms step_avg:150.74ms step:160/1480 train_time:22608ms step_avg:150.72ms step:161/1480 train_time:22753ms step_avg:150.68ms step:162/1480 train_time:22899ms step_avg:150.65ms step:163/1480 train_time:23045ms step_avg:150.62ms step:164/1480 train_time:23192ms step_avg:150.60ms step:165/1480 train_time:23336ms step_avg:150.56ms step:166/1480 train_time:23481ms step_avg:150.52ms step:167/1480 train_time:23627ms step_avg:150.49ms step:168/1480 train_time:23773ms step_avg:150.46ms step:169/1480 train_time:23917ms step_avg:150.42ms step:170/1480 train_time:24064ms step_avg:150.40ms step:171/1480 train_time:24211ms step_avg:150.38ms step:172/1480 train_time:24356ms step_avg:150.34ms step:173/1480 train_time:24502ms step_avg:150.32ms step:174/1480 train_time:24647ms step_avg:150.29ms step:175/1480 train_time:24794ms step_avg:150.27ms step:176/1480 train_time:24938ms step_avg:150.23ms step:177/1480 train_time:25084ms step_avg:150.20ms step:178/1480 train_time:25232ms step_avg:150.19ms step:179/1480 train_time:25377ms step_avg:150.16ms step:180/1480 train_time:25522ms step_avg:150.13ms step:181/1480 train_time:25668ms step_avg:150.10ms step:182/1480 train_time:25814ms step_avg:150.08ms step:183/1480 train_time:25958ms step_avg:150.05ms step:184/1480 train_time:26104ms step_avg:150.02ms step:185/1480 train_time:26250ms step_avg:150.00ms 
step:186/1480 train_time:26396ms step_avg:149.98ms step:187/1480 train_time:26541ms step_avg:149.95ms step:188/1480 train_time:26688ms step_avg:149.93ms step:189/1480 train_time:26855ms step_avg:150.03ms step:190/1480 train_time:26978ms step_avg:149.88ms step:191/1480 train_time:27124ms step_avg:149.86ms step:192/1480 train_time:27271ms step_avg:149.84ms step:193/1480 train_time:27416ms step_avg:149.81ms step:194/1480 train_time:27561ms step_avg:149.79ms step:195/1480 train_time:27708ms step_avg:149.77ms step:196/1480 train_time:27854ms step_avg:149.75ms step:197/1480 train_time:27999ms step_avg:149.72ms step:198/1480 train_time:28145ms step_avg:149.71ms step:199/1480 train_time:28291ms step_avg:149.69ms step:200/1480 train_time:28437ms step_avg:149.67ms step:201/1480 train_time:28587ms step_avg:149.67ms step:202/1480 train_time:28732ms step_avg:149.65ms step:203/1480 train_time:28877ms step_avg:149.62ms step:204/1480 train_time:29023ms step_avg:149.60ms step:205/1480 train_time:29169ms step_avg:149.59ms step:206/1480 train_time:29316ms step_avg:149.57ms step:207/1480 train_time:29460ms step_avg:149.54ms step:208/1480 train_time:29607ms step_avg:149.53ms step:209/1480 train_time:29753ms step_avg:149.51ms step:210/1480 train_time:29899ms step_avg:149.49ms step:211/1480 train_time:30045ms step_avg:149.48ms step:212/1480 train_time:30191ms step_avg:149.46ms step:213/1480 train_time:30335ms step_avg:149.44ms step:214/1480 train_time:30481ms step_avg:149.42ms step:215/1480 train_time:30628ms step_avg:149.41ms step:216/1480 train_time:30774ms step_avg:149.39ms step:217/1480 train_time:30919ms step_avg:149.36ms step:218/1480 train_time:31066ms step_avg:149.36ms step:219/1480 train_time:31213ms step_avg:149.34ms step:220/1480 train_time:31357ms step_avg:149.32ms step:221/1480 train_time:31914ms step_avg:151.25ms step:222/1480 train_time:32423ms step_avg:152.94ms step:223/1480 train_time:32535ms step_avg:152.74ms step:224/1480 train_time:32682ms step_avg:152.72ms 
step:225/1480 train_time:32831ms step_avg:152.70ms step:226/1480 train_time:32978ms step_avg:152.68ms step:227/1480 train_time:33128ms step_avg:152.66ms step:228/1480 train_time:33276ms step_avg:152.64ms step:229/1480 train_time:33424ms step_avg:152.62ms step:230/1480 train_time:33573ms step_avg:152.60ms step:231/1480 train_time:33721ms step_avg:152.58ms step:232/1480 train_time:33870ms step_avg:152.57ms step:233/1480 train_time:34018ms step_avg:152.55ms step:234/1480 train_time:34166ms step_avg:152.53ms step:235/1480 train_time:34315ms step_avg:152.51ms step:236/1480 train_time:34462ms step_avg:152.49ms step:237/1480 train_time:34612ms step_avg:152.48ms step:238/1480 train_time:34760ms step_avg:152.46ms step:239/1480 train_time:34908ms step_avg:152.44ms step:240/1480 train_time:35056ms step_avg:152.42ms step:241/1480 train_time:35205ms step_avg:152.40ms step:242/1480 train_time:35353ms step_avg:152.39ms step:243/1480 train_time:35501ms step_avg:152.37ms step:244/1480 train_time:35650ms step_avg:152.35ms step:245/1480 train_time:35798ms step_avg:152.33ms step:246/1480 train_time:35946ms step_avg:152.31ms step:247/1480 train_time:36095ms step_avg:152.30ms step:248/1480 train_time:36242ms step_avg:152.28ms step:249/1480 train_time:36391ms step_avg:152.26ms step:250/1480 train_time:36538ms step_avg:152.24ms step:250/1480 val_loss:3.9888 train_time:36606ms step_avg:152.52ms step:251/1480 train_time:36696ms step_avg:152.27ms step:252/1480 train_time:36844ms step_avg:152.25ms step:253/1480 train_time:36994ms step_avg:152.24ms step:254/1480 train_time:37143ms step_avg:152.22ms step:255/1480 train_time:37290ms step_avg:152.21ms step:256/1480 train_time:37438ms step_avg:152.19ms step:257/1480 train_time:37587ms step_avg:152.17ms step:258/1480 train_time:37735ms step_avg:152.16ms step:259/1480 train_time:37885ms step_avg:152.15ms step:260/1480 train_time:38034ms step_avg:152.13ms step:261/1480 train_time:38183ms step_avg:152.12ms step:262/1480 train_time:38332ms 
step_avg:152.11ms step:263/1480 train_time:38479ms step_avg:152.09ms step:264/1480 train_time:38628ms step_avg:152.08ms step:265/1480 train_time:38775ms step_avg:152.06ms step:266/1480 train_time:38925ms step_avg:152.05ms step:267/1480 train_time:39074ms step_avg:152.04ms step:268/1480 train_time:39222ms step_avg:152.02ms step:269/1480 train_time:39371ms step_avg:152.01ms step:270/1480 train_time:39519ms step_avg:152.00ms step:271/1480 train_time:39668ms step_avg:151.98ms step:272/1480 train_time:39815ms step_avg:151.97ms step:273/1480 train_time:39965ms step_avg:151.96ms step:274/1480 train_time:40113ms step_avg:151.94ms step:275/1480 train_time:40261ms step_avg:151.93ms step:276/1480 train_time:40410ms step_avg:151.92ms step:277/1480 train_time:40558ms step_avg:151.90ms step:278/1480 train_time:40708ms step_avg:151.89ms step:279/1480 train_time:40856ms step_avg:151.88ms step:280/1480 train_time:41005ms step_avg:151.87ms step:281/1480 train_time:41154ms step_avg:151.86ms step:282/1480 train_time:41301ms step_avg:151.84ms step:283/1480 train_time:41450ms step_avg:151.83ms step:284/1480 train_time:41598ms step_avg:151.82ms step:285/1480 train_time:41748ms step_avg:151.81ms step:286/1480 train_time:41896ms step_avg:151.80ms step:287/1480 train_time:42046ms step_avg:151.79ms step:288/1480 train_time:42194ms step_avg:151.78ms step:289/1480 train_time:42343ms step_avg:151.77ms step:290/1480 train_time:42491ms step_avg:151.75ms step:291/1480 train_time:42640ms step_avg:151.74ms step:292/1480 train_time:42788ms step_avg:151.73ms step:293/1480 train_time:42937ms step_avg:151.72ms step:294/1480 train_time:43087ms step_avg:151.71ms step:295/1480 train_time:43235ms step_avg:151.70ms step:296/1480 train_time:43385ms step_avg:151.70ms step:297/1480 train_time:43533ms step_avg:151.68ms step:298/1480 train_time:43681ms step_avg:151.67ms step:299/1480 train_time:43830ms step_avg:151.66ms step:300/1480 train_time:43978ms step_avg:151.65ms step:301/1480 train_time:44129ms 
step_avg:151.64ms step:302/1480 train_time:44276ms step_avg:151.63ms step:303/1480 train_time:44425ms step_avg:151.62ms step:304/1480 train_time:44573ms step_avg:151.61ms step:305/1480 train_time:44721ms step_avg:151.60ms step:306/1480 train_time:44870ms step_avg:151.59ms step:307/1480 train_time:45017ms step_avg:151.57ms step:308/1480 train_time:45166ms step_avg:151.57ms step:309/1480 train_time:45314ms step_avg:151.55ms step:310/1480 train_time:45463ms step_avg:151.54ms step:311/1480 train_time:45612ms step_avg:151.53ms step:312/1480 train_time:45759ms step_avg:151.52ms step:313/1480 train_time:45908ms step_avg:151.51ms step:314/1480 train_time:46055ms step_avg:151.50ms step:315/1480 train_time:46204ms step_avg:151.49ms step:316/1480 train_time:46353ms step_avg:151.48ms step:317/1480 train_time:46501ms step_avg:151.47ms step:318/1480 train_time:46651ms step_avg:151.46ms step:319/1480 train_time:46798ms step_avg:151.45ms step:320/1480 train_time:46948ms step_avg:151.45ms step:321/1480 train_time:47096ms step_avg:151.43ms step:322/1480 train_time:47246ms step_avg:151.43ms step:323/1480 train_time:47394ms step_avg:151.42ms step:324/1480 train_time:47544ms step_avg:151.41ms step:325/1480 train_time:47693ms step_avg:151.41ms step:326/1480 train_time:47841ms step_avg:151.39ms step:327/1480 train_time:47990ms step_avg:151.39ms step:328/1480 train_time:48138ms step_avg:151.38ms step:329/1480 train_time:48288ms step_avg:151.37ms step:330/1480 train_time:48436ms step_avg:151.36ms step:331/1480 train_time:48587ms step_avg:151.36ms step:332/1480 train_time:48737ms step_avg:151.36ms step:333/1480 train_time:48888ms step_avg:151.36ms step:334/1480 train_time:49040ms step_avg:151.36ms step:335/1480 train_time:49192ms step_avg:151.36ms step:336/1480 train_time:49342ms step_avg:151.36ms step:337/1480 train_time:49494ms step_avg:151.36ms step:338/1480 train_time:49645ms step_avg:151.36ms step:339/1480 train_time:49796ms step_avg:151.35ms step:340/1480 train_time:49948ms 
step_avg:151.36ms step:341/1480 train_time:50097ms step_avg:151.35ms step:342/1480 train_time:50249ms step_avg:151.35ms step:343/1480 train_time:50399ms step_avg:151.35ms step:344/1480 train_time:50551ms step_avg:151.35ms step:345/1480 train_time:50702ms step_avg:151.35ms step:346/1480 train_time:50853ms step_avg:151.35ms step:347/1480 train_time:51004ms step_avg:151.35ms step:348/1480 train_time:51155ms step_avg:151.35ms step:349/1480 train_time:51306ms step_avg:151.35ms step:350/1480 train_time:51457ms step_avg:151.34ms step:351/1480 train_time:51610ms step_avg:151.35ms step:352/1480 train_time:51759ms step_avg:151.34ms step:353/1480 train_time:51911ms step_avg:151.34ms step:354/1480 train_time:52060ms step_avg:151.34ms step:355/1480 train_time:52212ms step_avg:151.34ms step:356/1480 train_time:52362ms step_avg:151.33ms step:357/1480 train_time:52513ms step_avg:151.33ms step:358/1480 train_time:52663ms step_avg:151.33ms step:359/1480 train_time:52815ms step_avg:151.33ms step:360/1480 train_time:52967ms step_avg:151.33ms step:361/1480 train_time:53119ms step_avg:151.34ms step:362/1480 train_time:53270ms step_avg:151.34ms step:363/1480 train_time:53421ms step_avg:151.33ms step:364/1480 train_time:53573ms step_avg:151.34ms step:365/1480 train_time:53723ms step_avg:151.33ms step:366/1480 train_time:53874ms step_avg:151.33ms step:367/1480 train_time:54024ms step_avg:151.33ms step:368/1480 train_time:54175ms step_avg:151.33ms step:369/1480 train_time:54327ms step_avg:151.33ms step:370/1480 train_time:54477ms step_avg:151.33ms step:371/1480 train_time:54628ms step_avg:151.33ms step:372/1480 train_time:54778ms step_avg:151.32ms step:373/1480 train_time:54930ms step_avg:151.32ms step:374/1480 train_time:55080ms step_avg:151.32ms step:375/1480 train_time:55231ms step_avg:151.32ms step:375/1480 val_loss:3.8065 train_time:55298ms step_avg:151.50ms step:376/1480 train_time:55395ms step_avg:151.35ms step:377/1480 train_time:55539ms step_avg:151.33ms step:378/1480 
train_time:55691ms step_avg:151.33ms step:379/1480 train_time:55853ms step_avg:151.36ms step:380/1480 train_time:55992ms step_avg:151.33ms step:381/1480 train_time:56142ms step_avg:151.33ms step:382/1480 train_time:56292ms step_avg:151.32ms step:383/1480 train_time:56443ms step_avg:151.32ms step:384/1480 train_time:56595ms step_avg:151.32ms step:385/1480 train_time:56747ms step_avg:151.33ms step:386/1480 train_time:56896ms step_avg:151.32ms step:387/1480 train_time:57049ms step_avg:151.32ms step:388/1480 train_time:57199ms step_avg:151.32ms step:389/1480 train_time:57350ms step_avg:151.32ms step:390/1480 train_time:57500ms step_avg:151.32ms step:391/1480 train_time:57651ms step_avg:151.32ms step:392/1480 train_time:57801ms step_avg:151.31ms step:393/1480 train_time:57953ms step_avg:151.31ms step:394/1480 train_time:58102ms step_avg:151.31ms step:395/1480 train_time:58254ms step_avg:151.31ms step:396/1480 train_time:58404ms step_avg:151.31ms step:397/1480 train_time:58555ms step_avg:151.30ms step:398/1480 train_time:58706ms step_avg:151.30ms step:399/1480 train_time:58857ms step_avg:151.30ms step:400/1480 train_time:59009ms step_avg:151.30ms step:401/1480 train_time:59159ms step_avg:151.30ms step:402/1480 train_time:59311ms step_avg:151.30ms step:403/1480 train_time:59462ms step_avg:151.30ms step:404/1480 train_time:59613ms step_avg:151.30ms step:405/1480 train_time:59764ms step_avg:151.30ms step:406/1480 train_time:59917ms step_avg:151.30ms step:407/1480 train_time:60069ms step_avg:151.31ms step:408/1480 train_time:60219ms step_avg:151.30ms step:409/1480 train_time:60371ms step_avg:151.31ms step:410/1480 train_time:60521ms step_avg:151.30ms step:411/1480 train_time:60672ms step_avg:151.30ms step:412/1480 train_time:60821ms step_avg:151.30ms step:413/1480 train_time:60972ms step_avg:151.30ms step:414/1480 train_time:61123ms step_avg:151.30ms step:415/1480 train_time:61275ms step_avg:151.30ms step:416/1480 train_time:61427ms step_avg:151.30ms step:417/1480 
train_time:61577ms step_avg:151.30ms step:418/1480 train_time:61730ms step_avg:151.30ms step:419/1480 train_time:61879ms step_avg:151.29ms step:420/1480 train_time:62030ms step_avg:151.29ms step:421/1480 train_time:62180ms step_avg:151.29ms step:422/1480 train_time:62331ms step_avg:151.29ms step:423/1480 train_time:62481ms step_avg:151.28ms step:424/1480 train_time:62633ms step_avg:151.29ms step:425/1480 train_time:62784ms step_avg:151.29ms step:426/1480 train_time:62935ms step_avg:151.29ms step:427/1480 train_time:63086ms step_avg:151.29ms step:428/1480 train_time:63236ms step_avg:151.28ms step:429/1480 train_time:63389ms step_avg:151.29ms step:430/1480 train_time:63539ms step_avg:151.28ms step:431/1480 train_time:63690ms step_avg:151.28ms step:432/1480 train_time:63841ms step_avg:151.28ms step:433/1480 train_time:63992ms step_avg:151.28ms step:434/1480 train_time:64143ms step_avg:151.28ms step:435/1480 train_time:64295ms step_avg:151.28ms step:436/1480 train_time:64446ms step_avg:151.28ms step:437/1480 train_time:64596ms step_avg:151.28ms step:438/1480 train_time:64748ms step_avg:151.28ms step:439/1480 train_time:64898ms step_avg:151.28ms step:440/1480 train_time:65051ms step_avg:151.28ms step:441/1480 train_time:65203ms step_avg:151.28ms step:442/1480 train_time:65356ms step_avg:151.29ms step:443/1480 train_time:65509ms step_avg:151.29ms step:444/1480 train_time:65663ms step_avg:151.30ms step:445/1480 train_time:65816ms step_avg:151.30ms step:446/1480 train_time:65970ms step_avg:151.31ms step:447/1480 train_time:66123ms step_avg:151.31ms step:448/1480 train_time:66275ms step_avg:151.31ms step:449/1480 train_time:66430ms step_avg:151.32ms step:450/1480 train_time:66582ms step_avg:151.32ms step:451/1480 train_time:66736ms step_avg:151.33ms step:452/1480 train_time:66889ms step_avg:151.33ms step:453/1480 train_time:67042ms step_avg:151.34ms step:454/1480 train_time:67194ms step_avg:151.34ms step:455/1480 train_time:67348ms step_avg:151.34ms step:456/1480 
train_time:67500ms step_avg:151.34ms step:457/1480 train_time:67653ms step_avg:151.35ms step:458/1480 train_time:67805ms step_avg:151.35ms step:459/1480 train_time:67959ms step_avg:151.36ms step:460/1480 train_time:68111ms step_avg:151.36ms step:461/1480 train_time:68265ms step_avg:151.36ms step:462/1480 train_time:68419ms step_avg:151.37ms step:463/1480 train_time:68573ms step_avg:151.38ms step:464/1480 train_time:68726ms step_avg:151.38ms step:465/1480 train_time:68878ms step_avg:151.38ms step:466/1480 train_time:69032ms step_avg:151.39ms step:467/1480 train_time:69184ms step_avg:151.39ms step:468/1480 train_time:69338ms step_avg:151.39ms step:469/1480 train_time:69491ms step_avg:151.40ms step:470/1480 train_time:69644ms step_avg:151.40ms step:471/1480 train_time:69797ms step_avg:151.40ms step:472/1480 train_time:69951ms step_avg:151.41ms step:473/1480 train_time:70103ms step_avg:151.41ms step:474/1480 train_time:70256ms step_avg:151.41ms step:475/1480 train_time:70408ms step_avg:151.41ms step:476/1480 train_time:70561ms step_avg:151.42ms step:477/1480 train_time:70713ms step_avg:151.42ms step:478/1480 train_time:70867ms step_avg:151.43ms step:479/1480 train_time:71020ms step_avg:151.43ms step:480/1480 train_time:71174ms step_avg:151.43ms step:481/1480 train_time:71326ms step_avg:151.44ms step:482/1480 train_time:71478ms step_avg:151.44ms step:483/1480 train_time:71632ms step_avg:151.44ms step:484/1480 train_time:71784ms step_avg:151.44ms step:485/1480 train_time:71937ms step_avg:151.45ms step:486/1480 train_time:72091ms step_avg:151.45ms step:487/1480 train_time:72244ms step_avg:151.45ms step:488/1480 train_time:72396ms step_avg:151.46ms step:489/1480 train_time:72550ms step_avg:151.46ms step:490/1480 train_time:72703ms step_avg:151.46ms step:491/1480 train_time:72855ms step_avg:151.47ms step:492/1480 train_time:73007ms step_avg:151.47ms step:493/1480 train_time:73161ms step_avg:151.47ms step:494/1480 train_time:73314ms step_avg:151.47ms step:495/1480 
train_time:73468ms step_avg:151.48ms step:496/1480 train_time:73622ms step_avg:151.49ms step:497/1480 train_time:73775ms step_avg:151.49ms step:498/1480 train_time:73929ms step_avg:151.49ms step:499/1480 train_time:74080ms step_avg:151.49ms step:500/1480 train_time:74233ms step_avg:151.50ms step:500/1480 val_loss:3.6876 train_time:74301ms step_avg:151.63ms step:501/1480 train_time:74391ms step_avg:151.51ms step:502/1480 train_time:74543ms step_avg:151.51ms step:503/1480 train_time:74698ms step_avg:151.52ms step:504/1480 train_time:74851ms step_avg:151.52ms step:505/1480 train_time:75003ms step_avg:151.52ms step:506/1480 train_time:75156ms step_avg:151.52ms step:507/1480 train_time:75308ms step_avg:151.53ms step:508/1480 train_time:75462ms step_avg:151.53ms step:509/1480 train_time:75616ms step_avg:151.53ms step:510/1480 train_time:75769ms step_avg:151.54ms step:511/1480 train_time:75922ms step_avg:151.54ms step:512/1480 train_time:76077ms step_avg:151.55ms step:513/1480 train_time:76229ms step_avg:151.55ms step:514/1480 train_time:76382ms step_avg:151.55ms step:515/1480 train_time:76535ms step_avg:151.56ms step:516/1480 train_time:76689ms step_avg:151.56ms step:517/1480 train_time:76842ms step_avg:151.56ms step:518/1480 train_time:76998ms step_avg:151.57ms step:519/1480 train_time:77150ms step_avg:151.57ms step:520/1480 train_time:77304ms step_avg:151.58ms step:521/1480 train_time:77458ms step_avg:151.58ms step:522/1480 train_time:77611ms step_avg:151.58ms step:523/1480 train_time:77765ms step_avg:151.59ms step:524/1480 train_time:77918ms step_avg:151.59ms step:525/1480 train_time:78072ms step_avg:151.60ms step:526/1480 train_time:78225ms step_avg:151.60ms step:527/1480 train_time:78378ms step_avg:151.60ms step:528/1480 train_time:78531ms step_avg:151.60ms step:529/1480 train_time:78684ms step_avg:151.61ms step:530/1480 train_time:78839ms step_avg:151.61ms step:531/1480 train_time:78991ms step_avg:151.61ms step:532/1480 train_time:79145ms step_avg:151.62ms 
step:533/1480 train_time:79298ms step_avg:151.62ms step:534/1480 train_time:79450ms step_avg:151.62ms step:535/1480 train_time:79604ms step_avg:151.63ms step:536/1480 train_time:79757ms step_avg:151.63ms step:537/1480 train_time:79910ms step_avg:151.63ms step:538/1480 train_time:80063ms step_avg:151.63ms step:539/1480 train_time:80218ms step_avg:151.64ms step:540/1480 train_time:80371ms step_avg:151.64ms step:541/1480 train_time:80524ms step_avg:151.65ms step:542/1480 train_time:80677ms step_avg:151.65ms step:543/1480 train_time:80829ms step_avg:151.65ms step:544/1480 train_time:80982ms step_avg:151.65ms step:545/1480 train_time:81135ms step_avg:151.65ms step:546/1480 train_time:81287ms step_avg:151.65ms step:547/1480 train_time:81440ms step_avg:151.66ms step:548/1480 train_time:81593ms step_avg:151.66ms step:549/1480 train_time:81745ms step_avg:151.66ms step:550/1480 train_time:81901ms step_avg:151.67ms step:551/1480 train_time:82056ms step_avg:151.67ms step:552/1480 train_time:82210ms step_avg:151.68ms step:553/1480 train_time:82365ms step_avg:151.69ms step:554/1480 train_time:82519ms step_avg:151.69ms step:555/1480 train_time:82675ms step_avg:151.70ms step:556/1480 train_time:82829ms step_avg:151.70ms step:557/1480 train_time:82983ms step_avg:151.71ms step:558/1480 train_time:83138ms step_avg:151.71ms step:559/1480 train_time:83293ms step_avg:151.72ms step:560/1480 train_time:83447ms step_avg:151.72ms step:561/1480 train_time:83602ms step_avg:151.73ms step:562/1480 train_time:83758ms step_avg:151.73ms step:563/1480 train_time:83912ms step_avg:151.74ms step:564/1480 train_time:84068ms step_avg:151.75ms step:565/1480 train_time:84222ms step_avg:151.75ms step:566/1480 train_time:84378ms step_avg:151.76ms step:567/1480 train_time:84533ms step_avg:151.76ms step:568/1480 train_time:84687ms step_avg:151.77ms step:569/1480 train_time:84852ms step_avg:151.79ms step:570/1480 train_time:84997ms step_avg:151.78ms step:571/1480 train_time:85151ms step_avg:151.78ms 
step:572/1480 train_time:85305ms step_avg:151.79ms step:573/1480 train_time:85461ms step_avg:151.79ms step:574/1480 train_time:85618ms step_avg:151.80ms step:575/1480 train_time:85774ms step_avg:151.81ms step:576/1480 train_time:85930ms step_avg:151.82ms step:577/1480 train_time:86084ms step_avg:151.82ms step:578/1480 train_time:86239ms step_avg:151.83ms step:579/1480 train_time:86393ms step_avg:151.83ms step:580/1480 train_time:86549ms step_avg:151.84ms step:581/1480 train_time:86702ms step_avg:151.84ms step:582/1480 train_time:86859ms step_avg:151.85ms step:583/1480 train_time:87012ms step_avg:151.85ms step:584/1480 train_time:87167ms step_avg:151.86ms step:585/1480 train_time:87321ms step_avg:151.86ms step:586/1480 train_time:87478ms step_avg:151.87ms step:587/1480 train_time:87633ms step_avg:151.88ms step:588/1480 train_time:87787ms step_avg:151.88ms step:589/1480 train_time:87942ms step_avg:151.89ms step:590/1480 train_time:88098ms step_avg:151.89ms step:591/1480 train_time:88252ms step_avg:151.90ms step:592/1480 train_time:88406ms step_avg:151.90ms step:593/1480 train_time:88562ms step_avg:151.91ms step:594/1480 train_time:88716ms step_avg:151.91ms step:595/1480 train_time:88873ms step_avg:151.92ms step:596/1480 train_time:89031ms step_avg:151.93ms step:597/1480 train_time:89186ms step_avg:151.94ms step:598/1480 train_time:89341ms step_avg:151.94ms step:599/1480 train_time:89496ms step_avg:151.95ms step:600/1480 train_time:89651ms step_avg:151.95ms step:601/1480 train_time:89804ms step_avg:151.95ms step:602/1480 train_time:89960ms step_avg:151.96ms step:603/1480 train_time:90114ms step_avg:151.96ms step:604/1480 train_time:90269ms step_avg:151.97ms step:605/1480 train_time:90426ms step_avg:151.98ms step:606/1480 train_time:90582ms step_avg:151.98ms step:607/1480 train_time:90738ms step_avg:151.99ms step:608/1480 train_time:90892ms step_avg:151.99ms step:609/1480 train_time:91047ms step_avg:152.00ms step:610/1480 train_time:91201ms step_avg:152.00ms 
step:611/1480 train_time:91356ms step_avg:152.01ms step:612/1480 train_time:91511ms step_avg:152.01ms step:613/1480 train_time:91667ms step_avg:152.02ms step:614/1480 train_time:91822ms step_avg:152.02ms step:615/1480 train_time:91976ms step_avg:152.03ms step:616/1480 train_time:92129ms step_avg:152.03ms step:617/1480 train_time:92283ms step_avg:152.03ms step:618/1480 train_time:92438ms step_avg:152.04ms step:619/1480 train_time:92593ms step_avg:152.04ms step:620/1480 train_time:92747ms step_avg:152.04ms step:621/1480 train_time:92902ms step_avg:152.05ms step:622/1480 train_time:93058ms step_avg:152.06ms step:623/1480 train_time:93213ms step_avg:152.06ms step:624/1480 train_time:93368ms step_avg:152.06ms step:625/1480 train_time:93522ms step_avg:152.07ms step:625/1480 val_loss:3.6062 train_time:93594ms step_avg:152.19ms step:626/1480 train_time:93685ms step_avg:152.09ms step:627/1480 train_time:93838ms step_avg:152.09ms step:628/1480 train_time:93993ms step_avg:152.09ms step:629/1480 train_time:94148ms step_avg:152.10ms step:630/1480 train_time:94302ms step_avg:152.10ms step:631/1480 train_time:94456ms step_avg:152.10ms step:632/1480 train_time:94610ms step_avg:152.11ms step:633/1480 train_time:94767ms step_avg:152.11ms step:634/1480 train_time:94922ms step_avg:152.12ms step:635/1480 train_time:95076ms step_avg:152.12ms step:636/1480 train_time:95231ms step_avg:152.13ms step:637/1480 train_time:95386ms step_avg:152.13ms step:638/1480 train_time:95542ms step_avg:152.14ms step:639/1480 train_time:95696ms step_avg:152.14ms step:640/1480 train_time:95852ms step_avg:152.15ms step:641/1480 train_time:96006ms step_avg:152.15ms step:642/1480 train_time:96160ms step_avg:152.15ms step:643/1480 train_time:96314ms step_avg:152.16ms step:644/1480 train_time:96469ms step_avg:152.16ms step:645/1480 train_time:96624ms step_avg:152.16ms step:646/1480 train_time:96780ms step_avg:152.17ms step:647/1480 train_time:96934ms step_avg:152.17ms step:648/1480 train_time:97092ms 
step_avg:152.18ms step:649/1480 train_time:97247ms step_avg:152.19ms step:650/1480 train_time:97403ms step_avg:152.19ms step:651/1480 train_time:97557ms step_avg:152.20ms step:652/1480 train_time:97712ms step_avg:152.20ms step:653/1480 train_time:97866ms step_avg:152.20ms step:654/1480 train_time:98021ms step_avg:152.21ms step:655/1480 train_time:98176ms step_avg:152.21ms step:656/1480 train_time:98332ms step_avg:152.22ms step:657/1480 train_time:98486ms step_avg:152.22ms step:658/1480 train_time:98641ms step_avg:152.22ms step:659/1480 train_time:98797ms step_avg:152.23ms step:660/1480 train_time:98953ms step_avg:152.24ms step:661/1480 train_time:99108ms step_avg:152.24ms step:662/1480 train_time:99265ms step_avg:152.25ms step:663/1480 train_time:99421ms step_avg:152.25ms step:664/1480 train_time:99579ms step_avg:152.26ms step:665/1480 train_time:99736ms step_avg:152.27ms step:666/1480 train_time:99892ms step_avg:152.27ms step:667/1480 train_time:100049ms step_avg:152.28ms step:668/1480 train_time:100205ms step_avg:152.29ms step:669/1480 train_time:100363ms step_avg:152.30ms step:670/1480 train_time:100519ms step_avg:152.30ms step:671/1480 train_time:100676ms step_avg:152.31ms step:672/1480 train_time:100834ms step_avg:152.32ms step:673/1480 train_time:100991ms step_avg:152.32ms step:674/1480 train_time:101147ms step_avg:152.33ms step:675/1480 train_time:101305ms step_avg:152.34ms step:676/1480 train_time:101463ms step_avg:152.35ms step:677/1480 train_time:101621ms step_avg:152.35ms step:678/1480 train_time:101777ms step_avg:152.36ms step:679/1480 train_time:101934ms step_avg:152.37ms step:680/1480 train_time:102092ms step_avg:152.38ms step:681/1480 train_time:102248ms step_avg:152.38ms step:682/1480 train_time:102405ms step_avg:152.39ms step:683/1480 train_time:102563ms step_avg:152.40ms step:684/1480 train_time:102720ms step_avg:152.40ms step:685/1480 train_time:102876ms step_avg:152.41ms step:686/1480 train_time:103034ms step_avg:152.42ms step:687/1480 
train_time:103191ms step_avg:152.42ms step:688/1480 train_time:103349ms step_avg:152.43ms step:689/1480 train_time:103507ms step_avg:152.44ms step:690/1480 train_time:103664ms step_avg:152.45ms step:691/1480 train_time:103822ms step_avg:152.45ms step:692/1480 train_time:103978ms step_avg:152.46ms step:693/1480 train_time:104134ms step_avg:152.47ms step:694/1480 train_time:104292ms step_avg:152.47ms step:695/1480 train_time:104447ms step_avg:152.48ms step:696/1480 train_time:104602ms step_avg:152.48ms step:697/1480 train_time:104758ms step_avg:152.49ms step:698/1480 train_time:104915ms step_avg:152.49ms step:699/1480 train_time:105073ms step_avg:152.50ms step:700/1480 train_time:105229ms step_avg:152.51ms step:701/1480 train_time:105386ms step_avg:152.51ms step:702/1480 train_time:105541ms step_avg:152.52ms step:703/1480 train_time:105697ms step_avg:152.52ms step:704/1480 train_time:105853ms step_avg:152.53ms step:705/1480 train_time:106011ms step_avg:152.53ms step:706/1480 train_time:106169ms step_avg:152.54ms step:707/1480 train_time:106326ms step_avg:152.55ms step:708/1480 train_time:106481ms step_avg:152.55ms step:709/1480 train_time:106636ms step_avg:152.56ms step:710/1480 train_time:106793ms step_avg:152.56ms step:711/1480 train_time:106950ms step_avg:152.57ms step:712/1480 train_time:107107ms step_avg:152.57ms step:713/1480 train_time:107263ms step_avg:152.58ms step:714/1480 train_time:107420ms step_avg:152.59ms step:715/1480 train_time:107575ms step_avg:152.59ms step:716/1480 train_time:107732ms step_avg:152.59ms step:717/1480 train_time:107888ms step_avg:152.60ms step:718/1480 train_time:108045ms step_avg:152.61ms step:719/1480 train_time:108201ms step_avg:152.61ms step:720/1480 train_time:108360ms step_avg:152.62ms step:721/1480 train_time:108517ms step_avg:152.63ms step:722/1480 train_time:108674ms step_avg:152.63ms step:723/1480 train_time:108830ms step_avg:152.64ms step:724/1480 train_time:108985ms step_avg:152.64ms step:725/1480 train_time:109142ms 
step_avg:152.65ms step:726/1480 train_time:109298ms step_avg:152.65ms step:727/1480 train_time:109458ms step_avg:152.66ms step:728/1480 train_time:109614ms step_avg:152.67ms step:729/1480 train_time:109772ms step_avg:152.67ms step:730/1480 train_time:109929ms step_avg:152.68ms step:731/1480 train_time:110086ms step_avg:152.68ms step:732/1480 train_time:110241ms step_avg:152.69ms step:733/1480 train_time:110398ms step_avg:152.69ms step:734/1480 train_time:110555ms step_avg:152.70ms step:735/1480 train_time:110710ms step_avg:152.70ms step:736/1480 train_time:110868ms step_avg:152.71ms step:737/1480 train_time:111024ms step_avg:152.72ms step:738/1480 train_time:111179ms step_avg:152.72ms step:739/1480 train_time:111336ms step_avg:152.72ms step:740/1480 train_time:111494ms step_avg:152.73ms step:741/1480 train_time:111651ms step_avg:152.74ms step:742/1480 train_time:111807ms step_avg:152.74ms step:743/1480 train_time:111964ms step_avg:152.75ms step:744/1480 train_time:112120ms step_avg:152.75ms step:745/1480 train_time:112277ms step_avg:152.76ms step:746/1480 train_time:112433ms step_avg:152.76ms step:747/1480 train_time:112589ms step_avg:152.77ms step:748/1480 train_time:112750ms step_avg:152.78ms step:749/1480 train_time:112907ms step_avg:152.78ms step:750/1480 train_time:113063ms step_avg:152.79ms step:750/1480 val_loss:3.5506 train_time:113134ms step_avg:152.88ms step:751/1480 train_time:113225ms step_avg:152.80ms step:752/1480 train_time:113379ms step_avg:152.80ms step:753/1480 train_time:113536ms step_avg:152.81ms step:754/1480 train_time:113693ms step_avg:152.81ms step:755/1480 train_time:113848ms step_avg:152.82ms step:756/1480 train_time:114004ms step_avg:152.82ms step:757/1480 train_time:114163ms step_avg:152.83ms step:758/1480 train_time:114320ms step_avg:152.83ms step:759/1480 train_time:114485ms step_avg:152.85ms step:760/1480 train_time:114637ms step_avg:152.85ms step:761/1480 train_time:114795ms step_avg:152.86ms step:762/1480 train_time:114952ms 
step_avg:152.86ms step:763/1480 train_time:115109ms step_avg:152.87ms step:764/1480 train_time:115266ms step_avg:152.87ms step:765/1480 train_time:115423ms step_avg:152.88ms step:766/1480 train_time:115583ms step_avg:152.89ms step:767/1480 train_time:115739ms step_avg:152.89ms step:768/1480 train_time:115894ms step_avg:152.89ms step:769/1480 train_time:116053ms step_avg:152.90ms step:770/1480 train_time:116211ms step_avg:152.91ms step:771/1480 train_time:116368ms step_avg:152.91ms step:772/1480 train_time:116525ms step_avg:152.92ms step:773/1480 train_time:116685ms step_avg:152.93ms step:774/1480 train_time:116843ms step_avg:152.94ms step:775/1480 train_time:116999ms step_avg:152.94ms step:776/1480 train_time:117158ms step_avg:152.95ms step:777/1480 train_time:117319ms step_avg:152.96ms step:778/1480 train_time:117478ms step_avg:152.97ms step:779/1480 train_time:117637ms step_avg:152.97ms step:780/1480 train_time:117795ms step_avg:152.98ms step:781/1480 train_time:117954ms step_avg:152.99ms step:782/1480 train_time:118112ms step_avg:153.00ms step:783/1480 train_time:118269ms step_avg:153.00ms step:784/1480 train_time:118427ms step_avg:153.01ms step:785/1480 train_time:118585ms step_avg:153.01ms step:786/1480 train_time:118744ms step_avg:153.02ms step:787/1480 train_time:118901ms step_avg:153.03ms step:788/1480 train_time:119060ms step_avg:153.03ms step:789/1480 train_time:119218ms step_avg:153.04ms step:790/1480 train_time:119374ms step_avg:153.04ms step:791/1480 train_time:119534ms step_avg:153.05ms step:792/1480 train_time:119693ms step_avg:153.06ms step:793/1480 train_time:119852ms step_avg:153.07ms step:794/1480 train_time:120011ms step_avg:153.07ms step:795/1480 train_time:120172ms step_avg:153.08ms step:796/1480 train_time:120332ms step_avg:153.09ms step:797/1480 train_time:120491ms step_avg:153.10ms step:798/1480 train_time:120652ms step_avg:153.11ms step:799/1480 train_time:120813ms step_avg:153.12ms step:800/1480 train_time:120971ms step_avg:153.13ms 
step:801/1480 train_time:121129ms step_avg:153.13ms step:802/1480 train_time:121287ms step_avg:153.14ms step:803/1480 train_time:121444ms step_avg:153.15ms step:804/1480 train_time:121601ms step_avg:153.15ms step:805/1480 train_time:121762ms step_avg:153.16ms step:806/1480 train_time:121919ms step_avg:153.16ms step:807/1480 train_time:122074ms step_avg:153.17ms step:808/1480 train_time:122233ms step_avg:153.17ms step:809/1480 train_time:122391ms step_avg:153.18ms step:810/1480 train_time:122549ms step_avg:153.19ms step:811/1480 train_time:122706ms step_avg:153.19ms step:812/1480 train_time:122864ms step_avg:153.20ms step:813/1480 train_time:123020ms step_avg:153.20ms step:814/1480 train_time:123177ms step_avg:153.20ms step:815/1480 train_time:123334ms step_avg:153.21ms step:816/1480 train_time:123493ms step_avg:153.22ms step:817/1480 train_time:123652ms step_avg:153.22ms step:818/1480 train_time:123811ms step_avg:153.23ms step:819/1480 train_time:123969ms step_avg:153.24ms step:820/1480 train_time:124127ms step_avg:153.24ms step:821/1480 train_time:124284ms step_avg:153.25ms step:822/1480 train_time:124442ms step_avg:153.25ms step:823/1480 train_time:124599ms step_avg:153.26ms step:824/1480 train_time:124757ms step_avg:153.26ms step:825/1480 train_time:124917ms step_avg:153.27ms step:826/1480 train_time:125076ms step_avg:153.28ms step:827/1480 train_time:125236ms step_avg:153.29ms step:828/1480 train_time:125394ms step_avg:153.29ms step:829/1480 train_time:125553ms step_avg:153.30ms step:830/1480 train_time:125713ms step_avg:153.31ms step:831/1480 train_time:125870ms step_avg:153.31ms step:832/1480 train_time:126031ms step_avg:153.32ms step:833/1480 train_time:126188ms step_avg:153.33ms step:834/1480 train_time:126346ms step_avg:153.33ms step:835/1480 train_time:126504ms step_avg:153.34ms step:836/1480 train_time:126665ms step_avg:153.35ms step:837/1480 train_time:126823ms step_avg:153.35ms step:838/1480 train_time:126981ms step_avg:153.36ms step:839/1480 
train_time:127139ms step_avg:153.36ms step:840/1480 train_time:127297ms step_avg:153.37ms step:841/1480 train_time:127454ms step_avg:153.37ms step:842/1480 train_time:127613ms step_avg:153.38ms step:843/1480 train_time:127770ms step_avg:153.38ms step:844/1480 train_time:127928ms step_avg:153.39ms step:845/1480 train_time:128085ms step_avg:153.40ms step:846/1480 train_time:128246ms step_avg:153.40ms step:847/1480 train_time:128405ms step_avg:153.41ms step:848/1480 train_time:128563ms step_avg:153.42ms step:849/1480 train_time:128720ms step_avg:153.42ms step:850/1480 train_time:128879ms step_avg:153.43ms step:851/1480 train_time:129038ms step_avg:153.43ms step:852/1480 train_time:129196ms step_avg:153.44ms step:853/1480 train_time:129354ms step_avg:153.44ms step:854/1480 train_time:129513ms step_avg:153.45ms step:855/1480 train_time:129670ms step_avg:153.46ms step:856/1480 train_time:129828ms step_avg:153.46ms step:857/1480 train_time:129986ms step_avg:153.47ms step:858/1480 train_time:130148ms step_avg:153.48ms step:859/1480 train_time:130306ms step_avg:153.48ms step:860/1480 train_time:130464ms step_avg:153.49ms step:861/1480 train_time:130624ms step_avg:153.50ms step:862/1480 train_time:130785ms step_avg:153.50ms step:863/1480 train_time:130945ms step_avg:153.51ms step:864/1480 train_time:131103ms step_avg:153.52ms step:865/1480 train_time:131261ms step_avg:153.52ms step:866/1480 train_time:131421ms step_avg:153.53ms step:867/1480 train_time:131580ms step_avg:153.54ms step:868/1480 train_time:131738ms step_avg:153.54ms step:869/1480 train_time:131895ms step_avg:153.55ms step:870/1480 train_time:132054ms step_avg:153.55ms step:871/1480 train_time:132211ms step_avg:153.56ms step:872/1480 train_time:132369ms step_avg:153.56ms step:873/1480 train_time:132527ms step_avg:153.57ms step:874/1480 train_time:132687ms step_avg:153.57ms step:875/1480 train_time:132847ms step_avg:153.58ms step:875/1480 val_loss:3.5037 train_time:132919ms step_avg:153.66ms step:876/1480 
train_time:133013ms step_avg:153.60ms step:877/1480 train_time:133167ms step_avg:153.59ms step:878/1480 train_time:133324ms step_avg:153.60ms step:879/1480 train_time:133482ms step_avg:153.60ms step:880/1480 train_time:133641ms step_avg:153.61ms step:881/1480 train_time:133798ms step_avg:153.61ms step:882/1480 train_time:133958ms step_avg:153.62ms step:883/1480 train_time:134119ms step_avg:153.63ms step:884/1480 train_time:134280ms step_avg:153.64ms step:885/1480 train_time:134440ms step_avg:153.65ms step:886/1480 train_time:134600ms step_avg:153.65ms step:887/1480 train_time:134760ms step_avg:153.66ms step:888/1480 train_time:134924ms step_avg:153.67ms step:889/1480 train_time:135084ms step_avg:153.68ms step:890/1480 train_time:135242ms step_avg:153.68ms step:891/1480 train_time:135401ms step_avg:153.69ms step:892/1480 train_time:135561ms step_avg:153.70ms step:893/1480 train_time:135719ms step_avg:153.70ms step:894/1480 train_time:135878ms step_avg:153.71ms step:895/1480 train_time:136038ms step_avg:153.71ms step:896/1480 train_time:136196ms step_avg:153.72ms step:897/1480 train_time:136356ms step_avg:153.73ms step:898/1480 train_time:136517ms step_avg:153.74ms step:899/1480 train_time:136677ms step_avg:153.74ms step:900/1480 train_time:136837ms step_avg:153.75ms step:901/1480 train_time:136996ms step_avg:153.76ms step:902/1480 train_time:137154ms step_avg:153.76ms step:903/1480 train_time:137316ms step_avg:153.77ms step:904/1480 train_time:137476ms step_avg:153.78ms step:905/1480 train_time:137635ms step_avg:153.78ms step:906/1480 train_time:137794ms step_avg:153.79ms step:907/1480 train_time:137957ms step_avg:153.80ms step:908/1480 train_time:138116ms step_avg:153.80ms step:909/1480 train_time:138274ms step_avg:153.81ms step:910/1480 train_time:138437ms step_avg:153.82ms step:911/1480 train_time:138598ms step_avg:153.83ms step:912/1480 train_time:138757ms step_avg:153.83ms step:913/1480 train_time:138919ms step_avg:153.84ms step:914/1480 train_time:139079ms 
step_avg:153.85ms step:915/1480 train_time:139241ms step_avg:153.86ms step:916/1480 train_time:139400ms step_avg:153.86ms step:917/1480 train_time:139558ms step_avg:153.87ms step:918/1480 train_time:139721ms step_avg:153.88ms step:919/1480 train_time:139883ms step_avg:153.89ms step:920/1480 train_time:140042ms step_avg:153.89ms step:921/1480 train_time:140201ms step_avg:153.90ms step:922/1480 train_time:140359ms step_avg:153.90ms step:923/1480 train_time:140517ms step_avg:153.91ms step:924/1480 train_time:140675ms step_avg:153.91ms step:925/1480 train_time:140835ms step_avg:153.92ms step:926/1480 train_time:140994ms step_avg:153.92ms step:927/1480 train_time:141152ms step_avg:153.93ms step:928/1480 train_time:141311ms step_avg:153.93ms step:929/1480 train_time:141471ms step_avg:153.94ms step:930/1480 train_time:141632ms step_avg:153.95ms step:931/1480 train_time:141792ms step_avg:153.95ms step:932/1480 train_time:141952ms step_avg:153.96ms step:933/1480 train_time:142111ms step_avg:153.97ms step:934/1480 train_time:142270ms step_avg:153.97ms step:935/1480 train_time:142431ms step_avg:153.98ms step:936/1480 train_time:142590ms step_avg:153.99ms step:937/1480 train_time:142750ms step_avg:153.99ms step:938/1480 train_time:142909ms step_avg:154.00ms step:939/1480 train_time:143070ms step_avg:154.00ms step:940/1480 train_time:143231ms step_avg:154.01ms step:941/1480 train_time:143390ms step_avg:154.02ms step:942/1480 train_time:143548ms step_avg:154.02ms step:943/1480 train_time:143707ms step_avg:154.03ms step:944/1480 train_time:143869ms step_avg:154.04ms step:945/1480 train_time:144029ms step_avg:154.04ms step:946/1480 train_time:144190ms step_avg:154.05ms step:947/1480 train_time:144351ms step_avg:154.06ms step:948/1480 train_time:144514ms step_avg:154.07ms step:949/1480 train_time:144681ms step_avg:154.08ms step:950/1480 train_time:144835ms step_avg:154.08ms step:951/1480 train_time:144996ms step_avg:154.09ms step:952/1480 train_time:145156ms step_avg:154.09ms 
step:953/1480 train_time:145316ms step_avg:154.10ms step:954/1480 train_time:145476ms step_avg:154.11ms step:955/1480 train_time:145634ms step_avg:154.11ms step:956/1480 train_time:145794ms step_avg:154.12ms step:957/1480 train_time:145953ms step_avg:154.12ms step:958/1480 train_time:146120ms step_avg:154.13ms step:959/1480 train_time:146279ms step_avg:154.14ms step:960/1480 train_time:146440ms step_avg:154.15ms step:961/1480 train_time:146599ms step_avg:154.15ms step:962/1480 train_time:146757ms step_avg:154.16ms step:963/1480 train_time:146918ms step_avg:154.16ms step:964/1480 train_time:147081ms step_avg:154.17ms step:965/1480 train_time:147240ms step_avg:154.18ms step:966/1480 train_time:147399ms step_avg:154.18ms step:967/1480 train_time:147557ms step_avg:154.19ms step:968/1480 train_time:147716ms step_avg:154.19ms step:969/1480 train_time:147877ms step_avg:154.20ms step:970/1480 train_time:148037ms step_avg:154.20ms step:971/1480 train_time:148195ms step_avg:154.21ms step:972/1480 train_time:148352ms step_avg:154.21ms step:973/1480 train_time:148510ms step_avg:154.22ms step:974/1480 train_time:148670ms step_avg:154.22ms step:975/1480 train_time:148831ms step_avg:154.23ms step:976/1480 train_time:148991ms step_avg:154.24ms step:977/1480 train_time:149150ms step_avg:154.24ms step:978/1480 train_time:149309ms step_avg:154.24ms step:979/1480 train_time:149469ms step_avg:154.25ms step:980/1480 train_time:149628ms step_avg:154.26ms step:981/1480 train_time:149790ms step_avg:154.26ms step:982/1480 train_time:149950ms step_avg:154.27ms step:983/1480 train_time:150109ms step_avg:154.27ms step:984/1480 train_time:150267ms step_avg:154.28ms step:985/1480 train_time:150427ms step_avg:154.28ms step:986/1480 train_time:150586ms step_avg:154.29ms step:987/1480 train_time:150745ms step_avg:154.29ms step:988/1480 train_time:150905ms step_avg:154.30ms step:989/1480 train_time:151062ms step_avg:154.30ms step:990/1480 train_time:151223ms step_avg:154.31ms step:991/1480 
train_time:151383ms step_avg:154.31ms step:992/1480 train_time:151547ms step_avg:154.33ms step:993/1480 train_time:151716ms step_avg:154.34ms step:994/1480 train_time:151877ms step_avg:154.35ms step:995/1480 train_time:152038ms step_avg:154.35ms step:996/1480 train_time:152197ms step_avg:154.36ms step:997/1480 train_time:152355ms step_avg:154.36ms step:998/1480 train_time:152514ms step_avg:154.37ms step:999/1480 train_time:152674ms step_avg:154.37ms step:1000/1480 train_time:152833ms step_avg:154.38ms step:1000/1480 val_loss:3.4405 train_time:152906ms step_avg:154.45ms step:1001/1480 train_time:152997ms step_avg:154.39ms step:1002/1480 train_time:153158ms step_avg:154.39ms step:1003/1480 train_time:153323ms step_avg:154.40ms step:1004/1480 train_time:153484ms step_avg:154.41ms step:1005/1480 train_time:153644ms step_avg:154.42ms step:1006/1480 train_time:153805ms step_avg:154.42ms step:1007/1480 train_time:153966ms step_avg:154.43ms step:1008/1480 train_time:154126ms step_avg:154.44ms step:1009/1480 train_time:154291ms step_avg:154.45ms step:1010/1480 train_time:154449ms step_avg:154.45ms step:1011/1480 train_time:154611ms step_avg:154.46ms step:1012/1480 train_time:154770ms step_avg:154.46ms step:1013/1480 train_time:154930ms step_avg:154.47ms step:1014/1480 train_time:155091ms step_avg:154.47ms step:1015/1480 train_time:155252ms step_avg:154.48ms step:1016/1480 train_time:155413ms step_avg:154.49ms step:1017/1480 train_time:155575ms step_avg:154.49ms step:1018/1480 train_time:155735ms step_avg:154.50ms step:1019/1480 train_time:155898ms step_avg:154.51ms step:1020/1480 train_time:156059ms step_avg:154.51ms step:1021/1480 train_time:156218ms step_avg:154.52ms step:1022/1480 train_time:156378ms step_avg:154.52ms step:1023/1480 train_time:156542ms step_avg:154.53ms step:1024/1480 train_time:156703ms step_avg:154.54ms step:1025/1480 train_time:156865ms step_avg:154.55ms step:1026/1480 train_time:157025ms step_avg:154.55ms step:1027/1480 train_time:157185ms 
step_avg:154.56ms step:1028/1480 train_time:157346ms step_avg:154.56ms step:1029/1480 train_time:157512ms step_avg:154.57ms step:1030/1480 train_time:157672ms step_avg:154.58ms step:1031/1480 train_time:157831ms step_avg:154.58ms step:1032/1480 train_time:157995ms step_avg:154.59ms step:1033/1480 train_time:158154ms step_avg:154.60ms step:1034/1480 train_time:158314ms step_avg:154.60ms step:1035/1480 train_time:158476ms step_avg:154.61ms step:1036/1480 train_time:158636ms step_avg:154.62ms step:1037/1480 train_time:158798ms step_avg:154.62ms step:1038/1480 train_time:158957ms step_avg:154.63ms step:1039/1480 train_time:159119ms step_avg:154.63ms step:1040/1480 train_time:159278ms step_avg:154.64ms step:1041/1480 train_time:159440ms step_avg:154.65ms step:1042/1480 train_time:159599ms step_avg:154.65ms step:1043/1480 train_time:159758ms step_avg:154.65ms step:1044/1480 train_time:159917ms step_avg:154.66ms step:1045/1480 train_time:160078ms step_avg:154.67ms step:1046/1480 train_time:160240ms step_avg:154.67ms step:1047/1480 train_time:160399ms step_avg:154.68ms step:1048/1480 train_time:160559ms step_avg:154.68ms step:1049/1480 train_time:160719ms step_avg:154.69ms step:1050/1480 train_time:160882ms step_avg:154.69ms step:1051/1480 train_time:161042ms step_avg:154.70ms step:1052/1480 train_time:161206ms step_avg:154.71ms step:1053/1480 train_time:161368ms step_avg:154.72ms step:1054/1480 train_time:161528ms step_avg:154.72ms step:1055/1480 train_time:161688ms step_avg:154.73ms step:1056/1480 train_time:161847ms step_avg:154.73ms step:1057/1480 train_time:162008ms step_avg:154.74ms step:1058/1480 train_time:162170ms step_avg:154.74ms step:1059/1480 train_time:162332ms step_avg:154.75ms step:1060/1480 train_time:162493ms step_avg:154.76ms step:1061/1480 train_time:162651ms step_avg:154.76ms step:1062/1480 train_time:162812ms step_avg:154.76ms step:1063/1480 train_time:162972ms step_avg:154.77ms step:1064/1480 train_time:163131ms step_avg:154.77ms step:1065/1480 
train_time:163292ms step_avg:154.78ms step:1066/1480 train_time:163455ms step_avg:154.79ms step:1067/1480 train_time:163620ms step_avg:154.80ms step:1068/1480 train_time:163781ms step_avg:154.80ms step:1069/1480 train_time:163944ms step_avg:154.81ms step:1070/1480 train_time:164105ms step_avg:154.82ms step:1071/1480 train_time:164267ms step_avg:154.82ms step:1072/1480 train_time:164426ms step_avg:154.83ms step:1073/1480 train_time:164585ms step_avg:154.83ms step:1074/1480 train_time:164743ms step_avg:154.83ms step:1075/1480 train_time:164906ms step_avg:154.84ms step:1076/1480 train_time:165066ms step_avg:154.85ms step:1077/1480 train_time:165225ms step_avg:154.85ms step:1078/1480 train_time:165390ms step_avg:154.86ms step:1079/1480 train_time:165554ms step_avg:154.87ms step:1080/1480 train_time:165715ms step_avg:154.87ms step:1081/1480 train_time:165875ms step_avg:154.88ms step:1082/1480 train_time:166036ms step_avg:154.88ms step:1083/1480 train_time:166196ms step_avg:154.89ms step:1084/1480 train_time:166356ms step_avg:154.89ms step:1085/1480 train_time:166517ms step_avg:154.90ms step:1086/1480 train_time:166678ms step_avg:154.91ms step:1087/1480 train_time:166839ms step_avg:154.91ms step:1088/1480 train_time:167000ms step_avg:154.92ms step:1089/1480 train_time:167163ms step_avg:154.92ms step:1090/1480 train_time:167326ms step_avg:154.93ms step:1091/1480 train_time:167487ms step_avg:154.94ms step:1092/1480 train_time:167648ms step_avg:154.94ms step:1093/1480 train_time:167809ms step_avg:154.95ms step:1094/1480 train_time:167968ms step_avg:154.95ms step:1095/1480 train_time:168127ms step_avg:154.96ms step:1096/1480 train_time:168290ms step_avg:154.96ms step:1097/1480 train_time:168450ms step_avg:154.97ms step:1098/1480 train_time:168614ms step_avg:154.98ms step:1099/1480 train_time:168775ms step_avg:154.98ms step:1100/1480 train_time:168940ms step_avg:154.99ms step:1101/1480 train_time:169105ms step_avg:155.00ms step:1102/1480 train_time:169267ms step_avg:155.01ms 
step:1103/1480 train_time:169432ms step_avg:155.02ms step:1104/1480 train_time:169593ms step_avg:155.02ms step:1105/1480 train_time:169756ms step_avg:155.03ms step:1106/1480 train_time:169917ms step_avg:155.03ms step:1107/1480 train_time:170081ms step_avg:155.04ms step:1108/1480 train_time:170242ms step_avg:155.05ms step:1109/1480 train_time:170402ms step_avg:155.05ms step:1110/1480 train_time:170563ms step_avg:155.06ms step:1111/1480 train_time:170724ms step_avg:155.06ms step:1112/1480 train_time:170888ms step_avg:155.07ms step:1113/1480 train_time:171057ms step_avg:155.08ms step:1114/1480 train_time:171219ms step_avg:155.09ms step:1115/1480 train_time:171382ms step_avg:155.10ms step:1116/1480 train_time:171544ms step_avg:155.10ms step:1117/1480 train_time:171708ms step_avg:155.11ms step:1118/1480 train_time:171872ms step_avg:155.12ms step:1119/1480 train_time:172034ms step_avg:155.13ms step:1120/1480 train_time:172196ms step_avg:155.13ms step:1121/1480 train_time:172360ms step_avg:155.14ms step:1122/1480 train_time:172520ms step_avg:155.14ms step:1123/1480 train_time:172680ms step_avg:155.15ms step:1124/1480 train_time:172842ms step_avg:155.15ms step:1125/1480 train_time:173005ms step_avg:155.16ms step:1125/1480 val_loss:3.3858 train_time:173080ms step_avg:155.23ms step:1126/1480 train_time:173170ms step_avg:155.17ms step:1127/1480 train_time:173332ms step_avg:155.18ms step:1128/1480 train_time:173491ms step_avg:155.18ms step:1129/1480 train_time:173654ms step_avg:155.19ms step:1130/1480 train_time:173816ms step_avg:155.19ms step:1131/1480 train_time:173982ms step_avg:155.20ms step:1132/1480 train_time:174143ms step_avg:155.21ms step:1133/1480 train_time:174305ms step_avg:155.21ms step:1134/1480 train_time:174468ms step_avg:155.22ms step:1135/1480 train_time:174628ms step_avg:155.23ms step:1136/1480 train_time:174792ms step_avg:155.23ms step:1137/1480 train_time:174952ms step_avg:155.24ms step:1138/1480 train_time:175119ms step_avg:155.25ms step:1139/1480 
train_time:175286ms step_avg:155.26ms step:1140/1480 train_time:175442ms step_avg:155.26ms step:1141/1480 train_time:175604ms step_avg:155.26ms step:1142/1480 train_time:175767ms step_avg:155.27ms step:1143/1480 train_time:175931ms step_avg:155.28ms step:1144/1480 train_time:176093ms step_avg:155.28ms step:1145/1480 train_time:176252ms step_avg:155.29ms step:1146/1480 train_time:176416ms step_avg:155.30ms step:1147/1480 train_time:176577ms step_avg:155.30ms step:1148/1480 train_time:176739ms step_avg:155.31ms step:1149/1480 train_time:176902ms step_avg:155.31ms step:1150/1480 train_time:177062ms step_avg:155.32ms step:1151/1480 train_time:177225ms step_avg:155.32ms step:1152/1480 train_time:177388ms step_avg:155.33ms step:1153/1480 train_time:177553ms step_avg:155.34ms step:1154/1480 train_time:177715ms step_avg:155.35ms step:1155/1480 train_time:177878ms step_avg:155.35ms step:1156/1480 train_time:178046ms step_avg:155.36ms step:1157/1480 train_time:178207ms step_avg:155.37ms step:1158/1480 train_time:178368ms step_avg:155.37ms step:1159/1480 train_time:178528ms step_avg:155.38ms step:1160/1480 train_time:178689ms step_avg:155.38ms step:1161/1480 train_time:178851ms step_avg:155.39ms step:1162/1480 train_time:179014ms step_avg:155.39ms step:1163/1480 train_time:179178ms step_avg:155.40ms step:1164/1480 train_time:179341ms step_avg:155.41ms step:1165/1480 train_time:179501ms step_avg:155.41ms step:1166/1480 train_time:179663ms step_avg:155.42ms step:1167/1480 train_time:179824ms step_avg:155.42ms step:1168/1480 train_time:179985ms step_avg:155.43ms step:1169/1480 train_time:180147ms step_avg:155.43ms step:1170/1480 train_time:180307ms step_avg:155.44ms step:1171/1480 train_time:180471ms step_avg:155.44ms step:1172/1480 train_time:180632ms step_avg:155.45ms step:1173/1480 train_time:180796ms step_avg:155.46ms step:1174/1480 train_time:180966ms step_avg:155.47ms step:1175/1480 train_time:181129ms step_avg:155.48ms step:1176/1480 train_time:181295ms step_avg:155.48ms 
step:1177/1480 train_time:181464ms step_avg:155.50ms step:1178/1480 train_time:181624ms step_avg:155.50ms step:1179/1480 train_time:181784ms step_avg:155.50ms step:1180/1480 train_time:181951ms step_avg:155.51ms step:1181/1480 train_time:182115ms step_avg:155.52ms step:1182/1480 train_time:182276ms step_avg:155.53ms step:1183/1480 train_time:182438ms step_avg:155.53ms step:1184/1480 train_time:182601ms step_avg:155.54ms step:1185/1480 train_time:182768ms step_avg:155.55ms step:1186/1480 train_time:182929ms step_avg:155.55ms step:1187/1480 train_time:183102ms step_avg:155.57ms step:1188/1480 train_time:183262ms step_avg:155.57ms step:1189/1480 train_time:183424ms step_avg:155.58ms step:1190/1480 train_time:183586ms step_avg:155.58ms step:1191/1480 train_time:183749ms step_avg:155.59ms step:1192/1480 train_time:183909ms step_avg:155.59ms step:1193/1480 train_time:184069ms step_avg:155.59ms step:1194/1480 train_time:184230ms step_avg:155.60ms step:1195/1480 train_time:184392ms step_avg:155.60ms step:1196/1480 train_time:184561ms step_avg:155.62ms step:1197/1480 train_time:184724ms step_avg:155.62ms step:1198/1480 train_time:184893ms step_avg:155.63ms step:1199/1480 train_time:185056ms step_avg:155.64ms step:1200/1480 train_time:185219ms step_avg:155.65ms step:1201/1480 train_time:185380ms step_avg:155.65ms step:1202/1480 train_time:185549ms step_avg:155.66ms step:1203/1480 train_time:185715ms step_avg:155.67ms step:1204/1480 train_time:185879ms step_avg:155.68ms step:1205/1480 train_time:186042ms step_avg:155.68ms step:1206/1480 train_time:186202ms step_avg:155.69ms step:1207/1480 train_time:186364ms step_avg:155.69ms step:1208/1480 train_time:186525ms step_avg:155.70ms step:1209/1480 train_time:186688ms step_avg:155.70ms step:1210/1480 train_time:186855ms step_avg:155.71ms step:1211/1480 train_time:187019ms step_avg:155.72ms step:1212/1480 train_time:187182ms step_avg:155.73ms step:1213/1480 train_time:187346ms step_avg:155.73ms step:1214/1480 train_time:187512ms 
step_avg:155.74ms step:1215/1480 train_time:187675ms step_avg:155.75ms step:1216/1480 train_time:187837ms step_avg:155.75ms step:1217/1480 train_time:188000ms step_avg:155.76ms step:1218/1480 train_time:188163ms step_avg:155.76ms step:1219/1480 train_time:188330ms step_avg:155.77ms step:1220/1480 train_time:188492ms step_avg:155.78ms step:1221/1480 train_time:188653ms step_avg:155.78ms step:1222/1480 train_time:188815ms step_avg:155.79ms step:1223/1480 train_time:188978ms step_avg:155.79ms step:1224/1480 train_time:189145ms step_avg:155.80ms step:1225/1480 train_time:189308ms step_avg:155.81ms step:1226/1480 train_time:189472ms step_avg:155.82ms step:1227/1480 train_time:189637ms step_avg:155.82ms step:1228/1480 train_time:189801ms step_avg:155.83ms step:1229/1480 train_time:189963ms step_avg:155.84ms step:1230/1480 train_time:190131ms step_avg:155.84ms step:1231/1480 train_time:190297ms step_avg:155.85ms step:1232/1480 train_time:190463ms step_avg:155.86ms step:1233/1480 train_time:190624ms step_avg:155.87ms step:1234/1480 train_time:190785ms step_avg:155.87ms step:1235/1480 train_time:190950ms step_avg:155.88ms step:1236/1480 train_time:191110ms step_avg:155.88ms step:1237/1480 train_time:191272ms step_avg:155.89ms step:1238/1480 train_time:191447ms step_avg:155.90ms step:1239/1480 train_time:191609ms step_avg:155.91ms step:1240/1480 train_time:191774ms step_avg:155.91ms step:1241/1480 train_time:191941ms step_avg:155.92ms step:1242/1480 train_time:192102ms step_avg:155.93ms step:1243/1480 train_time:192267ms step_avg:155.93ms step:1244/1480 train_time:192427ms step_avg:155.94ms step:1245/1480 train_time:192589ms step_avg:155.94ms step:1246/1480 train_time:192751ms step_avg:155.95ms step:1247/1480 train_time:192915ms step_avg:155.95ms step:1248/1480 train_time:193076ms step_avg:155.96ms step:1249/1480 train_time:193238ms step_avg:155.96ms step:1250/1480 train_time:193402ms step_avg:155.97ms step:1250/1480 val_loss:3.3366 train_time:193477ms step_avg:156.03ms 
step:1251/1480 train_time:193573ms step_avg:155.98ms step:1252/1480 train_time:193737ms step_avg:155.99ms step:1253/1480 train_time:193898ms step_avg:155.99ms step:1254/1480 train_time:194059ms step_avg:156.00ms step:1255/1480 train_time:194231ms step_avg:156.01ms step:1256/1480 train_time:194397ms step_avg:156.02ms step:1257/1480 train_time:194558ms step_avg:156.02ms step:1258/1480 train_time:194723ms step_avg:156.03ms step:1259/1480 train_time:194886ms step_avg:156.03ms step:1260/1480 train_time:195046ms step_avg:156.04ms step:1261/1480 train_time:195210ms step_avg:156.04ms step:1262/1480 train_time:195375ms step_avg:156.05ms step:1263/1480 train_time:195541ms step_avg:156.06ms step:1264/1480 train_time:195699ms step_avg:156.06ms step:1265/1480 train_time:195859ms step_avg:156.06ms step:1266/1480 train_time:196023ms step_avg:156.07ms step:1267/1480 train_time:196184ms step_avg:156.07ms step:1268/1480 train_time:196349ms step_avg:156.08ms step:1269/1480 train_time:196515ms step_avg:156.09ms step:1270/1480 train_time:196678ms step_avg:156.09ms step:1271/1480 train_time:196839ms step_avg:156.10ms step:1272/1480 train_time:197000ms step_avg:156.10ms step:1273/1480 train_time:197161ms step_avg:156.11ms step:1274/1480 train_time:197324ms step_avg:156.11ms step:1275/1480 train_time:197486ms step_avg:156.12ms step:1276/1480 train_time:197647ms step_avg:156.12ms step:1277/1480 train_time:197810ms step_avg:156.12ms step:1278/1480 train_time:197972ms step_avg:156.13ms step:1279/1480 train_time:198133ms step_avg:156.13ms step:1280/1480 train_time:198301ms step_avg:156.14ms step:1281/1480 train_time:198462ms step_avg:156.15ms step:1282/1480 train_time:198621ms step_avg:156.15ms step:1283/1480 train_time:198785ms step_avg:156.15ms step:1284/1480 train_time:198949ms step_avg:156.16ms step:1285/1480 train_time:199112ms step_avg:156.17ms step:1286/1480 train_time:199274ms step_avg:156.17ms step:1287/1480 train_time:199436ms step_avg:156.18ms step:1288/1480 train_time:199598ms 
step_avg:156.18ms step:1289/1480 train_time:199767ms step_avg:156.19ms step:1290/1480 train_time:199938ms step_avg:156.20ms step:1291/1480 train_time:200101ms step_avg:156.21ms step:1292/1480 train_time:200264ms step_avg:156.21ms step:1293/1480 train_time:200434ms step_avg:156.22ms step:1294/1480 train_time:200596ms step_avg:156.23ms step:1295/1480 train_time:200759ms step_avg:156.23ms step:1296/1480 train_time:200922ms step_avg:156.24ms step:1297/1480 train_time:201087ms step_avg:156.24ms step:1298/1480 train_time:201251ms step_avg:156.25ms step:1299/1480 train_time:201415ms step_avg:156.26ms step:1300/1480 train_time:201576ms step_avg:156.26ms step:1301/1480 train_time:201737ms step_avg:156.26ms step:1302/1480 train_time:201902ms step_avg:156.27ms step:1303/1480 train_time:202069ms step_avg:156.28ms step:1304/1480 train_time:202235ms step_avg:156.29ms step:1305/1480 train_time:202397ms step_avg:156.29ms step:1306/1480 train_time:202560ms step_avg:156.30ms step:1307/1480 train_time:202721ms step_avg:156.30ms step:1308/1480 train_time:202884ms step_avg:156.30ms step:1309/1480 train_time:203050ms step_avg:156.31ms step:1310/1480 train_time:203214ms step_avg:156.32ms step:1311/1480 train_time:203375ms step_avg:156.32ms step:1312/1480 train_time:203540ms step_avg:156.33ms step:1313/1480 train_time:203702ms step_avg:156.33ms step:1314/1480 train_time:203867ms step_avg:156.34ms step:1315/1480 train_time:204031ms step_avg:156.35ms step:1316/1480 train_time:204192ms step_avg:156.35ms step:1317/1480 train_time:204353ms step_avg:156.35ms step:1318/1480 train_time:204521ms step_avg:156.36ms step:1319/1480 train_time:204687ms step_avg:156.37ms step:1320/1480 train_time:204854ms step_avg:156.38ms step:1321/1480 train_time:205018ms step_avg:156.38ms step:1322/1480 train_time:205188ms step_avg:156.39ms step:1323/1480 train_time:205352ms step_avg:156.40ms step:1324/1480 train_time:205516ms step_avg:156.40ms step:1325/1480 train_time:205685ms step_avg:156.41ms step:1326/1480 
train_time:205852ms step_avg:156.42ms step:1327/1480 train_time:206016ms step_avg:156.43ms step:1328/1480 train_time:206178ms step_avg:156.43ms step:1329/1480 train_time:206359ms step_avg:156.45ms step:1330/1480 train_time:206524ms step_avg:156.46ms step:1331/1480 train_time:206687ms step_avg:156.46ms step:1332/1480 train_time:206851ms step_avg:156.47ms step:1333/1480 train_time:207016ms step_avg:156.47ms step:1334/1480 train_time:207180ms step_avg:156.48ms step:1335/1480 train_time:207340ms step_avg:156.48ms step:1336/1480 train_time:207509ms step_avg:156.49ms step:1337/1480 train_time:207676ms step_avg:156.50ms step:1338/1480 train_time:207840ms step_avg:156.51ms step:1339/1480 train_time:208003ms step_avg:156.51ms step:1340/1480 train_time:208166ms step_avg:156.52ms step:1341/1480 train_time:208326ms step_avg:156.52ms step:1342/1480 train_time:208494ms step_avg:156.53ms step:1343/1480 train_time:208657ms step_avg:156.53ms step:1344/1480 train_time:208819ms step_avg:156.54ms step:1345/1480 train_time:208989ms step_avg:156.55ms step:1346/1480 train_time:209151ms step_avg:156.55ms step:1347/1480 train_time:209315ms step_avg:156.56ms step:1348/1480 train_time:209478ms step_avg:156.56ms step:1349/1480 train_time:209640ms step_avg:156.56ms step:1350/1480 train_time:209806ms step_avg:156.57ms step:1351/1480 train_time:209969ms step_avg:156.58ms step:1352/1480 train_time:210131ms step_avg:156.58ms step:1353/1480 train_time:210298ms step_avg:156.59ms step:1354/1480 train_time:210461ms step_avg:156.59ms step:1355/1480 train_time:210624ms step_avg:156.60ms step:1356/1480 train_time:210790ms step_avg:156.60ms step:1357/1480 train_time:210954ms step_avg:156.61ms step:1358/1480 train_time:211118ms step_avg:156.62ms step:1359/1480 train_time:211283ms step_avg:156.62ms step:1360/1480 train_time:211449ms step_avg:156.63ms step:1361/1480 train_time:211618ms step_avg:156.64ms step:1362/1480 train_time:211783ms step_avg:156.64ms step:1363/1480 train_time:211951ms step_avg:156.65ms 
step:1364/1480 train_time:212114ms step_avg:156.66ms step:1365/1480 train_time:212274ms step_avg:156.66ms step:1366/1480 train_time:212438ms step_avg:156.67ms step:1367/1480 train_time:212600ms step_avg:156.67ms step:1368/1480 train_time:212764ms step_avg:156.67ms step:1369/1480 train_time:212938ms step_avg:156.69ms step:1370/1480 train_time:213105ms step_avg:156.69ms step:1371/1480 train_time:213267ms step_avg:156.70ms step:1372/1480 train_time:213435ms step_avg:156.71ms step:1373/1480 train_time:213595ms step_avg:156.71ms step:1374/1480 train_time:213760ms step_avg:156.72ms step:1375/1480 train_time:213922ms step_avg:156.72ms step:1375/1480 val_loss:3.2980 train_time:213996ms step_avg:156.77ms step:1376/1480 train_time:214087ms step_avg:156.73ms step:1377/1480 train_time:214249ms step_avg:156.73ms step:1378/1480 train_time:214410ms step_avg:156.73ms step:1379/1480 train_time:214576ms step_avg:156.74ms step:1380/1480 train_time:214741ms step_avg:156.75ms step:1381/1480 train_time:214907ms step_avg:156.75ms step:1382/1480 train_time:215070ms step_avg:156.76ms step:1383/1480 train_time:215237ms step_avg:156.76ms step:1384/1480 train_time:215403ms step_avg:156.77ms step:1385/1480 train_time:215562ms step_avg:156.77ms step:1386/1480 train_time:215725ms step_avg:156.78ms step:1387/1480 train_time:215890ms step_avg:156.78ms step:1388/1480 train_time:216052ms step_avg:156.79ms step:1389/1480 train_time:216219ms step_avg:156.79ms step:1390/1480 train_time:216381ms step_avg:156.80ms step:1391/1480 train_time:216543ms step_avg:156.80ms step:1392/1480 train_time:216706ms step_avg:156.81ms step:1393/1480 train_time:216868ms step_avg:156.81ms step:1394/1480 train_time:217030ms step_avg:156.81ms step:1395/1480 train_time:217194ms step_avg:156.82ms step:1396/1480 train_time:217356ms step_avg:156.82ms step:1397/1480 train_time:217519ms step_avg:156.83ms step:1398/1480 train_time:217680ms step_avg:156.83ms step:1399/1480 train_time:217844ms step_avg:156.83ms step:1400/1480 
train_time:218010ms step_avg:156.84ms step:1401/1480 train_time:218171ms step_avg:156.84ms step:1402/1480 train_time:218334ms step_avg:156.85ms step:1403/1480 train_time:218501ms step_avg:156.86ms step:1404/1480 train_time:218665ms step_avg:156.86ms step:1405/1480 train_time:218828ms step_avg:156.87ms step:1406/1480 train_time:218993ms step_avg:156.87ms step:1407/1480 train_time:219156ms step_avg:156.88ms step:1408/1480 train_time:219319ms step_avg:156.88ms step:1409/1480 train_time:219490ms step_avg:156.89ms step:1410/1480 train_time:219655ms step_avg:156.90ms step:1411/1480 train_time:219814ms step_avg:156.90ms step:1412/1480 train_time:219978ms step_avg:156.90ms step:1413/1480 train_time:220141ms step_avg:156.91ms step:1414/1480 train_time:220305ms step_avg:156.91ms step:1415/1480 train_time:220469ms step_avg:156.92ms step:1416/1480 train_time:220644ms step_avg:156.93ms step:1417/1480 train_time:220808ms step_avg:156.94ms step:1418/1480 train_time:220970ms step_avg:156.94ms step:1419/1480 train_time:221135ms step_avg:156.94ms step:1420/1480 train_time:221301ms step_avg:156.95ms step:1421/1480 train_time:221465ms step_avg:156.96ms step:1422/1480 train_time:221628ms step_avg:156.96ms step:1423/1480 train_time:221789ms step_avg:156.96ms step:1424/1480 train_time:221957ms step_avg:156.97ms step:1425/1480 train_time:222127ms step_avg:156.98ms step:1426/1480 train_time:222290ms step_avg:156.98ms step:1427/1480 train_time:222457ms step_avg:156.99ms step:1428/1480 train_time:222621ms step_avg:157.00ms step:1429/1480 train_time:222781ms step_avg:157.00ms step:1430/1480 train_time:222946ms step_avg:157.00ms step:1431/1480 train_time:223110ms step_avg:157.01ms step:1432/1480 train_time:223278ms step_avg:157.02ms step:1433/1480 train_time:223447ms step_avg:157.03ms step:1434/1480 train_time:223617ms step_avg:157.03ms step:1435/1480 train_time:223783ms step_avg:157.04ms step:1436/1480 train_time:223947ms step_avg:157.05ms step:1437/1480 train_time:224108ms step_avg:157.05ms 
step:1438/1480 train_time:224270ms step_avg:157.05ms step:1439/1480 train_time:224438ms step_avg:157.06ms step:1440/1480 train_time:224601ms step_avg:157.06ms step:1441/1480 train_time:224765ms step_avg:157.07ms step:1442/1480 train_time:224931ms step_avg:157.07ms step:1443/1480 train_time:225105ms step_avg:157.09ms step:1444/1480 train_time:225268ms step_avg:157.09ms step:1445/1480 train_time:225432ms step_avg:157.10ms step:1446/1480 train_time:225599ms step_avg:157.10ms step:1447/1480 train_time:225766ms step_avg:157.11ms step:1448/1480 train_time:225929ms step_avg:157.11ms step:1449/1480 train_time:226092ms step_avg:157.12ms step:1450/1480 train_time:226258ms step_avg:157.12ms step:1451/1480 train_time:226422ms step_avg:157.13ms step:1452/1480 train_time:226586ms step_avg:157.13ms step:1453/1480 train_time:226748ms step_avg:157.14ms step:1454/1480 train_time:226909ms step_avg:157.14ms step:1455/1480 train_time:227080ms step_avg:157.15ms step:1456/1480 train_time:227245ms step_avg:157.15ms step:1457/1480 train_time:227407ms step_avg:157.16ms step:1458/1480 train_time:227571ms step_avg:157.16ms step:1459/1480 train_time:227739ms step_avg:157.17ms step:1460/1480 train_time:227902ms step_avg:157.17ms step:1461/1480 train_time:228067ms step_avg:157.18ms step:1462/1480 train_time:228230ms step_avg:157.18ms step:1463/1480 train_time:228397ms step_avg:157.19ms step:1464/1480 train_time:228563ms step_avg:157.20ms step:1465/1480 train_time:228727ms step_avg:157.20ms step:1466/1480 train_time:228888ms step_avg:157.20ms step:1467/1480 train_time:229053ms step_avg:157.21ms step:1468/1480 train_time:229216ms step_avg:157.21ms step:1469/1480 train_time:229379ms step_avg:157.22ms step:1470/1480 train_time:229546ms step_avg:157.22ms step:1471/1480 train_time:229717ms step_avg:157.23ms step:1472/1480 train_time:229887ms step_avg:157.24ms step:1473/1480 train_time:230049ms step_avg:157.24ms step:1474/1480 train_time:230217ms step_avg:157.25ms step:1475/1480 train_time:230386ms 
step_avg:157.26ms step:1476/1480 train_time:230549ms step_avg:157.26ms step:1477/1480 train_time:230717ms step_avg:157.27ms step:1478/1480 train_time:230886ms step_avg:157.28ms step:1479/1480 train_time:231050ms step_avg:157.28ms step:1480/1480 train_time:231213ms step_avg:157.29ms step:1480/1480 val_loss:3.2794 train_time:231289ms step_avg:157.34ms peak memory consumption: 34239 MiB