import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor (asserted).
        steps: Number of quintic iterations to run.
        eps: Added to the norm before dividing, to avoid division by zero.

    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # Work on the wide orientation so that X @ X.T is the smaller of the two Gram matrices.
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # Undo the transpose applied above so the result matches G's orientation.
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        # World size and rank are read from the environment (set by torchrun).
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One param group per distinct element count, so that every parameter in a group fits the
        # same flat all_gather buffers (one bfloat16 buffer per rank).
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group, parameters are processed in chunks of `world_size`: each rank
        orthogonalizes the update of the one parameter at its own index in the chunk, and the
        results are exchanged with an async all_gather. The gathered updates of the previous chunk
        are applied while the next chunk is being computed.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # Wait for the pending all_gather (if any) and apply the gathered updates.
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # scale the step up by sqrt(rows / cols) for tall matrices
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # The buffers are reused by the next all_gather, so the previous chunk must be
                # applied before launching it.
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq_len, heads, head_dim): dim 1 is the sequence, dim 3 is rotated.
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # Rebuild the tables only when the sequence length changes.
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention via FlexAttention with QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # learnable mixing weights between this layer's values and the token value embedding `vi`
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learnable mix with the initial embedding x0, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # initialised to [1, 0], i.e. the block starts out using only its own input x
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-embedding tables whose outputs are mirrored to give one value embedding per layer."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        # Append the same tensors in reverse order: 6 embeddings -> 12 entries (U-net style sharing).
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT model with a U-net layer layout, token value embeddings and document-aware sliding-window attention."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the cross-entropy loss for one flat token sequence.

        Arguments:
            inputs: 1D tensor of token ids (asserted); its length must be a multiple of BLOCK_SIZE.
            targets: Token ids to predict; flattened before the loss is computed.
            sliding_window_num_blocks: Scalar tensor giving the attention window size in blocks.

        Returns:
            The scalar cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        assert len(inputs) % BLOCK_SIZE == 0, "sequence length must be a multiple of BLOCK_SIZE"
        # Number of attention blocks, derived from the input instead of a hard-coded 512 so that
        # sequence lengths other than 512 * BLOCK_SIZE build consistent block masks.
        total_num_blocks = len(inputs) // BLOCK_SIZE
        # Document id per token: incremented at every token equal to 50256 (the document separator).
        docs = (inputs == 50256).cumsum(0)
        # First and last document id inside each block.
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # Token-level mask: attend only to earlier-or-equal positions in the same document.
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # Convert a dense boolean block mask into (count, ordered indices) per query block;
            # the stable descending argsort puts the selected kv blocks first, in original order.
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_num_blocks, dtype=torch.int32, device=inputs.device)
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # Blocks in nonzero_bm but not in full_bm are passed as the kv blocks that carry
            # `mask_mod`; blocks in full_bm are passed separately as the "full" kv blocks.
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a data shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256 * 4 byte header into a pinned CPU tensor."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Iterates over token shards, giving each process its own `seq_len`-token slice of every batch."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        # each process starts at its own offset within the shard
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) CUDA tensors of length `seq_len`; targets are inputs shifted by one."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. 
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 09:46:27 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:28787ms step_avg:nanms step:2/1480 train_time:28892ms step_avg:nanms step:3/1480 train_time:29021ms step_avg:nanms step:4/1480 train_time:29156ms step_avg:nanms step:5/1480 train_time:29297ms step_avg:nanms step:6/1480 train_time:29440ms step_avg:nanms step:7/1480 train_time:29582ms step_avg:nanms step:8/1480 train_time:29722ms step_avg:nanms step:9/1480 train_time:29863ms step_avg:nanms step:10/1480 train_time:30005ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:426ms step_avg:141.91ms step:14/1480 train_time:570ms step_avg:142.54ms step:15/1480 train_time:714ms step_avg:142.74ms step:16/1480 train_time:855ms step_avg:142.45ms step:17/1480 train_time:997ms step_avg:142.36ms step:18/1480 train_time:1138ms step_avg:142.29ms step:19/1480 train_time:1282ms step_avg:142.46ms step:20/1480 train_time:1426ms step_avg:142.56ms step:21/1480 train_time:1569ms step_avg:142.66ms step:22/1480 train_time:1713ms step_avg:142.73ms step:23/1480 train_time:1854ms step_avg:142.63ms step:24/1480 train_time:1996ms step_avg:142.59ms step:25/1480 train_time:2137ms step_avg:142.47ms step:26/1480 train_time:2280ms step_avg:142.52ms step:27/1480 train_time:2422ms step_avg:142.47ms step:28/1480 train_time:2568ms step_avg:142.68ms 
step:29/1480 train_time:2712ms step_avg:142.74ms step:30/1480 train_time:2855ms step_avg:142.73ms step:31/1480 train_time:2995ms step_avg:142.63ms step:32/1480 train_time:3138ms step_avg:142.61ms step:33/1480 train_time:3280ms step_avg:142.62ms step:34/1480 train_time:3424ms step_avg:142.68ms step:35/1480 train_time:3568ms step_avg:142.73ms step:36/1480 train_time:3711ms step_avg:142.72ms step:37/1480 train_time:3856ms step_avg:142.80ms step:38/1480 train_time:3998ms step_avg:142.77ms step:39/1480 train_time:4140ms step_avg:142.75ms step:40/1480 train_time:4284ms step_avg:142.79ms step:41/1480 train_time:4426ms step_avg:142.79ms step:42/1480 train_time:4570ms step_avg:142.80ms step:43/1480 train_time:4712ms step_avg:142.78ms step:44/1480 train_time:4855ms step_avg:142.79ms step:45/1480 train_time:4996ms step_avg:142.74ms step:46/1480 train_time:5138ms step_avg:142.71ms step:47/1480 train_time:5281ms step_avg:142.72ms step:48/1480 train_time:5423ms step_avg:142.72ms step:49/1480 train_time:5567ms step_avg:142.75ms step:50/1480 train_time:5710ms step_avg:142.75ms step:51/1480 train_time:5853ms step_avg:142.77ms step:52/1480 train_time:5995ms step_avg:142.74ms step:53/1480 train_time:6137ms step_avg:142.72ms step:54/1480 train_time:6278ms step_avg:142.68ms step:55/1480 train_time:6420ms step_avg:142.67ms step:56/1480 train_time:6561ms step_avg:142.64ms step:57/1480 train_time:6704ms step_avg:142.64ms step:58/1480 train_time:6847ms step_avg:142.65ms step:59/1480 train_time:6990ms step_avg:142.65ms step:60/1480 train_time:7132ms step_avg:142.63ms step:61/1480 train_time:7274ms step_avg:142.63ms step:62/1480 train_time:7416ms step_avg:142.62ms step:63/1480 train_time:7558ms step_avg:142.61ms step:64/1480 train_time:7702ms step_avg:142.63ms step:65/1480 train_time:7846ms step_avg:142.65ms step:66/1480 train_time:7990ms step_avg:142.68ms step:67/1480 train_time:8132ms step_avg:142.67ms step:68/1480 train_time:8275ms step_avg:142.68ms step:69/1480 train_time:8418ms 
step_avg:142.67ms step:70/1480 train_time:8559ms step_avg:142.66ms step:71/1480 train_time:8701ms step_avg:142.64ms step:72/1480 train_time:8845ms step_avg:142.66ms step:73/1480 train_time:8989ms step_avg:142.68ms step:74/1480 train_time:9131ms step_avg:142.68ms step:75/1480 train_time:9276ms step_avg:142.70ms step:76/1480 train_time:9416ms step_avg:142.67ms step:77/1480 train_time:9556ms step_avg:142.63ms step:78/1480 train_time:9698ms step_avg:142.62ms step:79/1480 train_time:10225ms step_avg:148.19ms step:80/1480 train_time:10329ms step_avg:147.56ms step:81/1480 train_time:10473ms step_avg:147.51ms step:82/1480 train_time:10613ms step_avg:147.41ms step:83/1480 train_time:10756ms step_avg:147.34ms step:84/1480 train_time:10897ms step_avg:147.25ms step:85/1480 train_time:11037ms step_avg:147.17ms step:86/1480 train_time:11181ms step_avg:147.12ms step:87/1480 train_time:11326ms step_avg:147.09ms step:88/1480 train_time:11471ms step_avg:147.06ms step:89/1480 train_time:11613ms step_avg:147.00ms step:90/1480 train_time:11755ms step_avg:146.94ms step:91/1480 train_time:11897ms step_avg:146.87ms step:92/1480 train_time:12037ms step_avg:146.80ms step:93/1480 train_time:12179ms step_avg:146.74ms step:94/1480 train_time:12322ms step_avg:146.70ms step:95/1480 train_time:12467ms step_avg:146.67ms step:96/1480 train_time:12992ms step_avg:151.07ms step:97/1480 train_time:13091ms step_avg:150.47ms step:98/1480 train_time:13617ms step_avg:154.74ms step:99/1480 train_time:13722ms step_avg:154.18ms step:100/1480 train_time:13865ms step_avg:154.05ms step:101/1480 train_time:14010ms step_avg:153.96ms step:102/1480 train_time:14151ms step_avg:153.82ms step:103/1480 train_time:14294ms step_avg:153.70ms step:104/1480 train_time:14434ms step_avg:153.56ms step:105/1480 train_time:14577ms step_avg:153.45ms step:106/1480 train_time:14719ms step_avg:153.33ms step:107/1480 train_time:14862ms step_avg:153.22ms step:108/1480 train_time:15006ms step_avg:153.12ms step:109/1480 
train_time:15149ms step_avg:153.02ms step:110/1480 train_time:15292ms step_avg:152.92ms step:111/1480 train_time:15435ms step_avg:152.82ms step:112/1480 train_time:15580ms step_avg:152.74ms step:113/1480 train_time:15725ms step_avg:152.67ms step:114/1480 train_time:15873ms step_avg:152.63ms step:115/1480 train_time:16018ms step_avg:152.55ms step:116/1480 train_time:16164ms step_avg:152.49ms step:117/1480 train_time:16312ms step_avg:152.45ms step:118/1480 train_time:16457ms step_avg:152.38ms step:119/1480 train_time:16602ms step_avg:152.31ms step:120/1480 train_time:16748ms step_avg:152.26ms step:121/1480 train_time:16894ms step_avg:152.20ms step:122/1480 train_time:17039ms step_avg:152.13ms step:123/1480 train_time:17185ms step_avg:152.08ms step:124/1480 train_time:17331ms step_avg:152.03ms step:125/1480 train_time:17477ms step_avg:151.97ms step:125/1480 val_loss:4.4114 train_time:17541ms step_avg:152.53ms step:126/1480 train_time:17636ms step_avg:152.04ms step:127/1480 train_time:17775ms step_avg:151.92ms step:128/1480 train_time:17922ms step_avg:151.88ms step:129/1480 train_time:18067ms step_avg:151.83ms step:130/1480 train_time:18213ms step_avg:151.77ms step:131/1480 train_time:18358ms step_avg:151.72ms step:132/1480 train_time:18505ms step_avg:151.68ms step:133/1480 train_time:18650ms step_avg:151.63ms step:134/1480 train_time:18795ms step_avg:151.57ms step:135/1480 train_time:18942ms step_avg:151.54ms step:136/1480 train_time:19088ms step_avg:151.49ms step:137/1480 train_time:19232ms step_avg:151.44ms step:138/1480 train_time:19377ms step_avg:151.39ms step:139/1480 train_time:19524ms step_avg:151.35ms step:140/1480 train_time:19669ms step_avg:151.30ms step:141/1480 train_time:19814ms step_avg:151.25ms step:142/1480 train_time:19962ms step_avg:151.23ms step:143/1480 train_time:20109ms step_avg:151.19ms step:144/1480 train_time:20253ms step_avg:151.14ms step:145/1480 train_time:20399ms step_avg:151.10ms step:146/1480 train_time:20545ms step_avg:151.07ms 
step:147/1480 train_time:20690ms step_avg:151.03ms step:148/1480 train_time:20835ms step_avg:150.98ms step:149/1480 train_time:20982ms step_avg:150.95ms step:150/1480 train_time:21128ms step_avg:150.91ms step:151/1480 train_time:21272ms step_avg:150.87ms step:152/1480 train_time:21420ms step_avg:150.84ms step:153/1480 train_time:21567ms step_avg:150.82ms step:154/1480 train_time:21712ms step_avg:150.78ms step:155/1480 train_time:21859ms step_avg:150.75ms step:156/1480 train_time:22005ms step_avg:150.72ms step:157/1480 train_time:22150ms step_avg:150.68ms step:158/1480 train_time:22296ms step_avg:150.65ms step:159/1480 train_time:22442ms step_avg:150.62ms step:160/1480 train_time:22588ms step_avg:150.59ms step:161/1480 train_time:22732ms step_avg:150.55ms step:162/1480 train_time:22877ms step_avg:150.51ms step:163/1480 train_time:23025ms step_avg:150.49ms step:164/1480 train_time:23169ms step_avg:150.45ms step:165/1480 train_time:23315ms step_avg:150.42ms step:166/1480 train_time:23461ms step_avg:150.39ms step:167/1480 train_time:23607ms step_avg:150.36ms step:168/1480 train_time:23752ms step_avg:150.33ms step:169/1480 train_time:23898ms step_avg:150.30ms step:170/1480 train_time:24045ms step_avg:150.28ms step:171/1480 train_time:24190ms step_avg:150.25ms step:172/1480 train_time:24336ms step_avg:150.22ms step:173/1480 train_time:24482ms step_avg:150.19ms step:174/1480 train_time:24628ms step_avg:150.17ms step:175/1480 train_time:24772ms step_avg:150.13ms step:176/1480 train_time:24918ms step_avg:150.11ms step:177/1480 train_time:25065ms step_avg:150.09ms step:178/1480 train_time:25210ms step_avg:150.06ms step:179/1480 train_time:25355ms step_avg:150.03ms step:180/1480 train_time:25500ms step_avg:150.00ms step:181/1480 train_time:25647ms step_avg:149.98ms step:182/1480 train_time:25791ms step_avg:149.95ms step:183/1480 train_time:25935ms step_avg:149.92ms step:184/1480 train_time:26083ms step_avg:149.90ms step:185/1480 train_time:26229ms step_avg:149.88ms 
step:186/1480 train_time:26373ms step_avg:149.85ms step:187/1480 train_time:26518ms step_avg:149.82ms step:188/1480 train_time:26664ms step_avg:149.80ms step:189/1480 train_time:26832ms step_avg:149.90ms step:190/1480 train_time:26955ms step_avg:149.75ms step:191/1480 train_time:27102ms step_avg:149.73ms step:192/1480 train_time:27248ms step_avg:149.71ms step:193/1480 train_time:27392ms step_avg:149.69ms step:194/1480 train_time:27538ms step_avg:149.66ms step:195/1480 train_time:27685ms step_avg:149.65ms step:196/1480 train_time:27830ms step_avg:149.62ms step:197/1480 train_time:27974ms step_avg:149.59ms step:198/1480 train_time:28120ms step_avg:149.58ms step:199/1480 train_time:28266ms step_avg:149.55ms step:200/1480 train_time:28411ms step_avg:149.53ms step:201/1480 train_time:28557ms step_avg:149.51ms step:202/1480 train_time:28703ms step_avg:149.50ms step:203/1480 train_time:28849ms step_avg:149.48ms step:204/1480 train_time:28993ms step_avg:149.45ms step:205/1480 train_time:29139ms step_avg:149.43ms step:206/1480 train_time:29286ms step_avg:149.42ms step:207/1480 train_time:29431ms step_avg:149.39ms step:208/1480 train_time:29576ms step_avg:149.37ms step:209/1480 train_time:29723ms step_avg:149.36ms step:210/1480 train_time:29869ms step_avg:149.34ms step:211/1480 train_time:30014ms step_avg:149.32ms step:212/1480 train_time:30160ms step_avg:149.31ms step:213/1480 train_time:30307ms step_avg:149.30ms step:214/1480 train_time:30451ms step_avg:149.27ms step:215/1480 train_time:30597ms step_avg:149.25ms step:216/1480 train_time:30744ms step_avg:149.24ms step:217/1480 train_time:30889ms step_avg:149.22ms step:218/1480 train_time:31033ms step_avg:149.20ms step:219/1480 train_time:31180ms step_avg:149.18ms step:220/1480 train_time:31326ms step_avg:149.17ms step:221/1480 train_time:31861ms step_avg:151.00ms step:222/1480 train_time:31970ms step_avg:150.80ms step:223/1480 train_time:32509ms step_avg:152.62ms step:224/1480 train_time:32614ms step_avg:152.40ms 
step:225/1480 train_time:32762ms step_avg:152.38ms step:226/1480 train_time:32910ms step_avg:152.36ms step:227/1480 train_time:33058ms step_avg:152.34ms step:228/1480 train_time:33207ms step_avg:152.33ms step:229/1480 train_time:33355ms step_avg:152.30ms step:230/1480 train_time:33504ms step_avg:152.29ms step:231/1480 train_time:33652ms step_avg:152.27ms step:232/1480 train_time:33800ms step_avg:152.25ms step:233/1480 train_time:33950ms step_avg:152.24ms step:234/1480 train_time:34096ms step_avg:152.22ms step:235/1480 train_time:34246ms step_avg:152.20ms step:236/1480 train_time:34394ms step_avg:152.18ms step:237/1480 train_time:34542ms step_avg:152.17ms step:238/1480 train_time:34691ms step_avg:152.15ms step:239/1480 train_time:34839ms step_avg:152.14ms step:240/1480 train_time:34988ms step_avg:152.12ms step:241/1480 train_time:35136ms step_avg:152.10ms step:242/1480 train_time:35285ms step_avg:152.09ms step:243/1480 train_time:35433ms step_avg:152.07ms step:244/1480 train_time:35582ms step_avg:152.06ms step:245/1480 train_time:35731ms step_avg:152.05ms step:246/1480 train_time:35878ms step_avg:152.02ms step:247/1480 train_time:36027ms step_avg:152.01ms step:248/1480 train_time:36174ms step_avg:151.99ms step:249/1480 train_time:36323ms step_avg:151.98ms step:250/1480 train_time:36471ms step_avg:151.96ms step:250/1480 val_loss:3.9934 train_time:36537ms step_avg:152.24ms step:251/1480 train_time:36630ms step_avg:151.99ms step:252/1480 train_time:36776ms step_avg:151.97ms step:253/1480 train_time:36926ms step_avg:151.96ms step:254/1480 train_time:37074ms step_avg:151.94ms step:255/1480 train_time:37222ms step_avg:151.93ms step:256/1480 train_time:37369ms step_avg:151.91ms step:257/1480 train_time:37518ms step_avg:151.89ms step:258/1480 train_time:37667ms step_avg:151.88ms step:259/1480 train_time:37815ms step_avg:151.87ms step:260/1480 train_time:37966ms step_avg:151.86ms step:261/1480 train_time:38113ms step_avg:151.85ms step:262/1480 train_time:38263ms 
step_avg:151.84ms step:263/1480 train_time:38410ms step_avg:151.82ms step:264/1480 train_time:38558ms step_avg:151.80ms step:265/1480 train_time:38707ms step_avg:151.79ms step:266/1480 train_time:38854ms step_avg:151.77ms step:267/1480 train_time:39004ms step_avg:151.77ms step:268/1480 train_time:39152ms step_avg:151.75ms step:269/1480 train_time:39302ms step_avg:151.74ms step:270/1480 train_time:39450ms step_avg:151.73ms step:271/1480 train_time:39597ms step_avg:151.71ms step:272/1480 train_time:39745ms step_avg:151.70ms step:273/1480 train_time:39893ms step_avg:151.68ms step:274/1480 train_time:40042ms step_avg:151.67ms step:275/1480 train_time:40190ms step_avg:151.66ms step:276/1480 train_time:40338ms step_avg:151.65ms step:277/1480 train_time:40487ms step_avg:151.64ms step:278/1480 train_time:40635ms step_avg:151.62ms step:279/1480 train_time:40784ms step_avg:151.61ms step:280/1480 train_time:40933ms step_avg:151.60ms step:281/1480 train_time:41082ms step_avg:151.59ms step:282/1480 train_time:41230ms step_avg:151.58ms step:283/1480 train_time:41379ms step_avg:151.57ms step:284/1480 train_time:41528ms step_avg:151.56ms step:285/1480 train_time:41675ms step_avg:151.55ms step:286/1480 train_time:41825ms step_avg:151.54ms step:287/1480 train_time:41972ms step_avg:151.52ms step:288/1480 train_time:42122ms step_avg:151.52ms step:289/1480 train_time:42270ms step_avg:151.51ms step:290/1480 train_time:42418ms step_avg:151.49ms step:291/1480 train_time:42567ms step_avg:151.48ms step:292/1480 train_time:42716ms step_avg:151.47ms step:293/1480 train_time:42865ms step_avg:151.47ms step:294/1480 train_time:43013ms step_avg:151.45ms step:295/1480 train_time:43163ms step_avg:151.45ms step:296/1480 train_time:43310ms step_avg:151.43ms step:297/1480 train_time:43457ms step_avg:151.42ms step:298/1480 train_time:43606ms step_avg:151.41ms step:299/1480 train_time:43753ms step_avg:151.39ms step:300/1480 train_time:43903ms step_avg:151.39ms step:301/1480 train_time:44051ms 
step_avg:151.38ms step:302/1480 train_time:44199ms step_avg:151.37ms step:303/1480 train_time:44349ms step_avg:151.36ms step:304/1480 train_time:44496ms step_avg:151.35ms step:305/1480 train_time:44645ms step_avg:151.34ms step:306/1480 train_time:44793ms step_avg:151.33ms step:307/1480 train_time:44942ms step_avg:151.32ms step:308/1480 train_time:45091ms step_avg:151.31ms step:309/1480 train_time:45239ms step_avg:151.30ms step:310/1480 train_time:45388ms step_avg:151.29ms step:311/1480 train_time:45535ms step_avg:151.28ms step:312/1480 train_time:45684ms step_avg:151.27ms step:313/1480 train_time:45832ms step_avg:151.26ms step:314/1480 train_time:45982ms step_avg:151.26ms step:315/1480 train_time:46129ms step_avg:151.24ms step:316/1480 train_time:46277ms step_avg:151.23ms step:317/1480 train_time:46427ms step_avg:151.23ms step:318/1480 train_time:46575ms step_avg:151.22ms step:319/1480 train_time:46723ms step_avg:151.21ms step:320/1480 train_time:46871ms step_avg:151.20ms step:321/1480 train_time:47020ms step_avg:151.19ms step:322/1480 train_time:47169ms step_avg:151.18ms step:323/1480 train_time:47316ms step_avg:151.17ms step:324/1480 train_time:47466ms step_avg:151.17ms step:325/1480 train_time:47615ms step_avg:151.16ms step:326/1480 train_time:47764ms step_avg:151.15ms step:327/1480 train_time:47912ms step_avg:151.14ms step:328/1480 train_time:48061ms step_avg:151.13ms step:329/1480 train_time:48209ms step_avg:151.13ms step:330/1480 train_time:48357ms step_avg:151.12ms step:331/1480 train_time:48509ms step_avg:151.12ms step:332/1480 train_time:48659ms step_avg:151.11ms step:333/1480 train_time:48810ms step_avg:151.11ms step:334/1480 train_time:48959ms step_avg:151.11ms step:335/1480 train_time:49110ms step_avg:151.11ms step:336/1480 train_time:49260ms step_avg:151.10ms step:337/1480 train_time:49411ms step_avg:151.10ms step:338/1480 train_time:49563ms step_avg:151.11ms step:339/1480 train_time:49713ms step_avg:151.10ms step:340/1480 train_time:49864ms 
step_avg:151.10ms step:341/1480 train_time:50015ms step_avg:151.10ms step:342/1480 train_time:50168ms step_avg:151.11ms step:343/1480 train_time:50318ms step_avg:151.11ms step:344/1480 train_time:50469ms step_avg:151.10ms step:345/1480 train_time:50619ms step_avg:151.10ms step:346/1480 train_time:50770ms step_avg:151.10ms step:347/1480 train_time:50922ms step_avg:151.10ms step:348/1480 train_time:51072ms step_avg:151.10ms step:349/1480 train_time:51223ms step_avg:151.10ms step:350/1480 train_time:51373ms step_avg:151.10ms step:351/1480 train_time:51525ms step_avg:151.10ms step:352/1480 train_time:51674ms step_avg:151.09ms step:353/1480 train_time:51825ms step_avg:151.09ms step:354/1480 train_time:51975ms step_avg:151.09ms step:355/1480 train_time:52128ms step_avg:151.09ms step:356/1480 train_time:52277ms step_avg:151.09ms step:357/1480 train_time:52429ms step_avg:151.09ms step:358/1480 train_time:52578ms step_avg:151.09ms step:359/1480 train_time:52730ms step_avg:151.09ms step:360/1480 train_time:52881ms step_avg:151.09ms step:361/1480 train_time:53033ms step_avg:151.09ms step:362/1480 train_time:53184ms step_avg:151.09ms step:363/1480 train_time:53334ms step_avg:151.09ms step:364/1480 train_time:53485ms step_avg:151.09ms step:365/1480 train_time:53637ms step_avg:151.09ms step:366/1480 train_time:53789ms step_avg:151.09ms step:367/1480 train_time:53940ms step_avg:151.09ms step:368/1480 train_time:54090ms step_avg:151.09ms step:369/1480 train_time:54242ms step_avg:151.09ms step:370/1480 train_time:54392ms step_avg:151.09ms step:371/1480 train_time:54544ms step_avg:151.09ms step:372/1480 train_time:54694ms step_avg:151.09ms step:373/1480 train_time:54845ms step_avg:151.09ms step:374/1480 train_time:54996ms step_avg:151.09ms step:375/1480 train_time:55147ms step_avg:151.09ms step:375/1480 val_loss:3.8038 train_time:55214ms step_avg:151.27ms step:376/1480 train_time:55306ms step_avg:151.11ms step:377/1480 train_time:55458ms step_avg:151.11ms step:378/1480 
train_time:55609ms step_avg:151.11ms step:379/1480 train_time:55783ms step_avg:151.17ms step:380/1480 train_time:55910ms step_avg:151.11ms step:381/1480 train_time:56059ms step_avg:151.10ms step:382/1480 train_time:56211ms step_avg:151.10ms step:383/1480 train_time:56362ms step_avg:151.10ms step:384/1480 train_time:56514ms step_avg:151.11ms step:385/1480 train_time:56664ms step_avg:151.10ms step:386/1480 train_time:56815ms step_avg:151.10ms step:387/1480 train_time:56964ms step_avg:151.10ms step:388/1480 train_time:57115ms step_avg:151.10ms step:389/1480 train_time:57266ms step_avg:151.10ms step:390/1480 train_time:57417ms step_avg:151.10ms step:391/1480 train_time:57567ms step_avg:151.10ms step:392/1480 train_time:57718ms step_avg:151.09ms step:393/1480 train_time:57870ms step_avg:151.10ms step:394/1480 train_time:58019ms step_avg:151.09ms step:395/1480 train_time:58171ms step_avg:151.09ms step:396/1480 train_time:58321ms step_avg:151.09ms step:397/1480 train_time:58473ms step_avg:151.09ms step:398/1480 train_time:58624ms step_avg:151.09ms step:399/1480 train_time:58776ms step_avg:151.09ms step:400/1480 train_time:58927ms step_avg:151.09ms step:401/1480 train_time:59078ms step_avg:151.09ms step:402/1480 train_time:59229ms step_avg:151.09ms step:403/1480 train_time:59380ms step_avg:151.09ms step:404/1480 train_time:59531ms step_avg:151.09ms step:405/1480 train_time:59681ms step_avg:151.09ms step:406/1480 train_time:59833ms step_avg:151.09ms step:407/1480 train_time:59983ms step_avg:151.09ms step:408/1480 train_time:60134ms step_avg:151.09ms step:409/1480 train_time:60284ms step_avg:151.09ms step:410/1480 train_time:60435ms step_avg:151.09ms step:411/1480 train_time:60585ms step_avg:151.08ms step:412/1480 train_time:60736ms step_avg:151.09ms step:413/1480 train_time:60888ms step_avg:151.09ms step:414/1480 train_time:61038ms step_avg:151.08ms step:415/1480 train_time:61189ms step_avg:151.08ms step:416/1480 train_time:61340ms step_avg:151.08ms step:417/1480 
train_time:61491ms step_avg:151.08ms step:418/1480 train_time:61641ms step_avg:151.08ms step:419/1480 train_time:61791ms step_avg:151.08ms step:420/1480 train_time:61942ms step_avg:151.08ms step:421/1480 train_time:62093ms step_avg:151.08ms step:422/1480 train_time:62243ms step_avg:151.08ms step:423/1480 train_time:62395ms step_avg:151.08ms step:424/1480 train_time:62546ms step_avg:151.08ms step:425/1480 train_time:62697ms step_avg:151.08ms step:426/1480 train_time:62848ms step_avg:151.08ms step:427/1480 train_time:62999ms step_avg:151.08ms step:428/1480 train_time:63151ms step_avg:151.08ms step:429/1480 train_time:63301ms step_avg:151.08ms step:430/1480 train_time:63453ms step_avg:151.08ms step:431/1480 train_time:63603ms step_avg:151.08ms step:432/1480 train_time:63755ms step_avg:151.08ms step:433/1480 train_time:63905ms step_avg:151.07ms step:434/1480 train_time:64056ms step_avg:151.07ms step:435/1480 train_time:64206ms step_avg:151.07ms step:436/1480 train_time:64357ms step_avg:151.07ms step:437/1480 train_time:64508ms step_avg:151.07ms step:438/1480 train_time:64659ms step_avg:151.07ms step:439/1480 train_time:64812ms step_avg:151.08ms step:440/1480 train_time:64962ms step_avg:151.07ms step:441/1480 train_time:65115ms step_avg:151.08ms step:442/1480 train_time:65267ms step_avg:151.08ms step:443/1480 train_time:65419ms step_avg:151.08ms step:444/1480 train_time:65573ms step_avg:151.09ms step:445/1480 train_time:65726ms step_avg:151.09ms step:446/1480 train_time:65878ms step_avg:151.10ms step:447/1480 train_time:66031ms step_avg:151.10ms step:448/1480 train_time:66184ms step_avg:151.10ms step:449/1480 train_time:66337ms step_avg:151.11ms step:450/1480 train_time:66490ms step_avg:151.11ms step:451/1480 train_time:66643ms step_avg:151.12ms step:452/1480 train_time:66796ms step_avg:151.12ms step:453/1480 train_time:66950ms step_avg:151.13ms step:454/1480 train_time:67102ms step_avg:151.13ms step:455/1480 train_time:67256ms step_avg:151.14ms step:456/1480 
train_time:67409ms step_avg:151.14ms step:457/1480 train_time:67561ms step_avg:151.14ms step:458/1480 train_time:67714ms step_avg:151.15ms step:459/1480 train_time:67866ms step_avg:151.15ms step:460/1480 train_time:68020ms step_avg:151.15ms step:461/1480 train_time:68173ms step_avg:151.16ms step:462/1480 train_time:68326ms step_avg:151.16ms step:463/1480 train_time:68479ms step_avg:151.17ms step:464/1480 train_time:68632ms step_avg:151.17ms step:465/1480 train_time:68784ms step_avg:151.17ms step:466/1480 train_time:68937ms step_avg:151.18ms step:467/1480 train_time:69090ms step_avg:151.18ms step:468/1480 train_time:69243ms step_avg:151.19ms step:469/1480 train_time:69396ms step_avg:151.19ms step:470/1480 train_time:69549ms step_avg:151.19ms step:471/1480 train_time:69702ms step_avg:151.20ms step:472/1480 train_time:69855ms step_avg:151.20ms step:473/1480 train_time:70007ms step_avg:151.20ms step:474/1480 train_time:70161ms step_avg:151.21ms step:475/1480 train_time:70314ms step_avg:151.21ms step:476/1480 train_time:70466ms step_avg:151.21ms step:477/1480 train_time:70619ms step_avg:151.22ms step:478/1480 train_time:70772ms step_avg:151.22ms step:479/1480 train_time:70926ms step_avg:151.23ms step:480/1480 train_time:71079ms step_avg:151.23ms step:481/1480 train_time:71232ms step_avg:151.24ms step:482/1480 train_time:71385ms step_avg:151.24ms step:483/1480 train_time:71537ms step_avg:151.24ms step:484/1480 train_time:71690ms step_avg:151.24ms step:485/1480 train_time:71844ms step_avg:151.25ms step:486/1480 train_time:71997ms step_avg:151.25ms step:487/1480 train_time:72151ms step_avg:151.26ms step:488/1480 train_time:72304ms step_avg:151.26ms step:489/1480 train_time:72458ms step_avg:151.27ms step:490/1480 train_time:72610ms step_avg:151.27ms step:491/1480 train_time:72762ms step_avg:151.27ms step:492/1480 train_time:72915ms step_avg:151.28ms step:493/1480 train_time:73067ms step_avg:151.28ms step:494/1480 train_time:73220ms step_avg:151.28ms step:495/1480 
train_time:73374ms step_avg:151.29ms step:496/1480 train_time:73528ms step_avg:151.29ms step:497/1480 train_time:73681ms step_avg:151.30ms step:498/1480 train_time:73834ms step_avg:151.30ms step:499/1480 train_time:73986ms step_avg:151.30ms step:500/1480 train_time:74140ms step_avg:151.31ms step:500/1480 val_loss:3.6857 train_time:74209ms step_avg:151.45ms step:501/1480 train_time:74304ms step_avg:151.33ms step:502/1480 train_time:74451ms step_avg:151.32ms step:503/1480 train_time:74604ms step_avg:151.33ms step:504/1480 train_time:74755ms step_avg:151.33ms step:505/1480 train_time:74907ms step_avg:151.33ms step:506/1480 train_time:75059ms step_avg:151.33ms step:507/1480 train_time:75212ms step_avg:151.33ms step:508/1480 train_time:75367ms step_avg:151.34ms step:509/1480 train_time:75521ms step_avg:151.34ms step:510/1480 train_time:75674ms step_avg:151.35ms step:511/1480 train_time:75828ms step_avg:151.35ms step:512/1480 train_time:75981ms step_avg:151.36ms step:513/1480 train_time:76133ms step_avg:151.36ms step:514/1480 train_time:76285ms step_avg:151.36ms step:515/1480 train_time:76440ms step_avg:151.37ms step:516/1480 train_time:76594ms step_avg:151.37ms step:517/1480 train_time:76749ms step_avg:151.38ms step:518/1480 train_time:76901ms step_avg:151.38ms step:519/1480 train_time:77053ms step_avg:151.38ms step:520/1480 train_time:77207ms step_avg:151.39ms step:521/1480 train_time:77359ms step_avg:151.39ms step:522/1480 train_time:77512ms step_avg:151.39ms step:523/1480 train_time:77667ms step_avg:151.40ms step:524/1480 train_time:77820ms step_avg:151.40ms step:525/1480 train_time:77972ms step_avg:151.40ms step:526/1480 train_time:78125ms step_avg:151.41ms step:527/1480 train_time:78276ms step_avg:151.40ms step:528/1480 train_time:78429ms step_avg:151.41ms step:529/1480 train_time:78582ms step_avg:151.41ms step:530/1480 train_time:78736ms step_avg:151.41ms step:531/1480 train_time:78889ms step_avg:151.42ms step:532/1480 train_time:79042ms step_avg:151.42ms 
step:533/1480 train_time:79194ms step_avg:151.42ms step:534/1480 train_time:79348ms step_avg:151.43ms step:535/1480 train_time:79500ms step_avg:151.43ms step:536/1480 train_time:79653ms step_avg:151.43ms step:537/1480 train_time:79806ms step_avg:151.44ms step:538/1480 train_time:79959ms step_avg:151.44ms step:539/1480 train_time:80113ms step_avg:151.44ms step:540/1480 train_time:80266ms step_avg:151.45ms step:541/1480 train_time:80419ms step_avg:151.45ms step:542/1480 train_time:80572ms step_avg:151.45ms step:543/1480 train_time:80725ms step_avg:151.45ms step:544/1480 train_time:80877ms step_avg:151.46ms step:545/1480 train_time:81030ms step_avg:151.46ms step:546/1480 train_time:81182ms step_avg:151.46ms step:547/1480 train_time:81335ms step_avg:151.46ms step:548/1480 train_time:81488ms step_avg:151.46ms step:549/1480 train_time:81641ms step_avg:151.47ms step:550/1480 train_time:81794ms step_avg:151.47ms step:551/1480 train_time:81948ms step_avg:151.48ms step:552/1480 train_time:82103ms step_avg:151.48ms step:553/1480 train_time:82258ms step_avg:151.49ms step:554/1480 train_time:82414ms step_avg:151.50ms step:555/1480 train_time:82568ms step_avg:151.50ms step:556/1480 train_time:82723ms step_avg:151.51ms step:557/1480 train_time:82878ms step_avg:151.51ms step:558/1480 train_time:83033ms step_avg:151.52ms step:559/1480 train_time:83186ms step_avg:151.52ms step:560/1480 train_time:83340ms step_avg:151.53ms step:561/1480 train_time:83496ms step_avg:151.54ms step:562/1480 train_time:83650ms step_avg:151.54ms step:563/1480 train_time:83805ms step_avg:151.55ms step:564/1480 train_time:83960ms step_avg:151.55ms step:565/1480 train_time:84115ms step_avg:151.56ms step:566/1480 train_time:84270ms step_avg:151.57ms step:567/1480 train_time:84425ms step_avg:151.57ms step:568/1480 train_time:84578ms step_avg:151.57ms step:569/1480 train_time:84753ms step_avg:151.62ms step:570/1480 train_time:84887ms step_avg:151.58ms step:571/1480 train_time:85042ms step_avg:151.59ms 
step:572/1480 train_time:85197ms step_avg:151.60ms step:573/1480 train_time:85351ms step_avg:151.60ms step:574/1480 train_time:85509ms step_avg:151.61ms step:575/1480 train_time:85664ms step_avg:151.62ms step:576/1480 train_time:85819ms step_avg:151.62ms step:577/1480 train_time:85973ms step_avg:151.63ms step:578/1480 train_time:86127ms step_avg:151.63ms step:579/1480 train_time:86280ms step_avg:151.63ms step:580/1480 train_time:86435ms step_avg:151.64ms step:581/1480 train_time:86589ms step_avg:151.64ms step:582/1480 train_time:86744ms step_avg:151.65ms step:583/1480 train_time:86899ms step_avg:151.66ms step:584/1480 train_time:87053ms step_avg:151.66ms step:585/1480 train_time:87207ms step_avg:151.66ms step:586/1480 train_time:87361ms step_avg:151.67ms step:587/1480 train_time:87516ms step_avg:151.67ms step:588/1480 train_time:87671ms step_avg:151.68ms step:589/1480 train_time:87826ms step_avg:151.69ms step:590/1480 train_time:87980ms step_avg:151.69ms step:591/1480 train_time:88135ms step_avg:151.70ms step:592/1480 train_time:88289ms step_avg:151.70ms step:593/1480 train_time:88445ms step_avg:151.71ms step:594/1480 train_time:88599ms step_avg:151.71ms step:595/1480 train_time:88755ms step_avg:151.72ms step:596/1480 train_time:88911ms step_avg:151.72ms step:597/1480 train_time:89065ms step_avg:151.73ms step:598/1480 train_time:89220ms step_avg:151.73ms step:599/1480 train_time:89376ms step_avg:151.74ms step:600/1480 train_time:89530ms step_avg:151.75ms step:601/1480 train_time:89685ms step_avg:151.75ms step:602/1480 train_time:89840ms step_avg:151.76ms step:603/1480 train_time:89994ms step_avg:151.76ms step:604/1480 train_time:90148ms step_avg:151.77ms step:605/1480 train_time:90304ms step_avg:151.77ms step:606/1480 train_time:90459ms step_avg:151.78ms step:607/1480 train_time:90615ms step_avg:151.78ms step:608/1480 train_time:90770ms step_avg:151.79ms step:609/1480 train_time:90925ms step_avg:151.79ms step:610/1480 train_time:91078ms step_avg:151.80ms 
step:611/1480 train_time:91233ms step_avg:151.80ms step:612/1480 train_time:91387ms step_avg:151.81ms step:613/1480 train_time:91543ms step_avg:151.81ms step:614/1480 train_time:91698ms step_avg:151.82ms step:615/1480 train_time:91853ms step_avg:151.82ms step:616/1480 train_time:92007ms step_avg:151.83ms step:617/1480 train_time:92161ms step_avg:151.83ms step:618/1480 train_time:92315ms step_avg:151.83ms step:619/1480 train_time:92470ms step_avg:151.84ms step:620/1480 train_time:92627ms step_avg:151.85ms step:621/1480 train_time:92781ms step_avg:151.85ms step:622/1480 train_time:92937ms step_avg:151.86ms step:623/1480 train_time:93091ms step_avg:151.86ms step:624/1480 train_time:93247ms step_avg:151.87ms step:625/1480 train_time:93401ms step_avg:151.87ms step:625/1480 val_loss:3.6036 train_time:93471ms step_avg:151.99ms step:626/1480 train_time:93566ms step_avg:151.89ms step:627/1480 train_time:93717ms step_avg:151.89ms step:628/1480 train_time:93872ms step_avg:151.90ms step:629/1480 train_time:94026ms step_avg:151.90ms step:630/1480 train_time:94180ms step_avg:151.90ms step:631/1480 train_time:94334ms step_avg:151.91ms step:632/1480 train_time:94487ms step_avg:151.91ms step:633/1480 train_time:94642ms step_avg:151.91ms step:634/1480 train_time:94797ms step_avg:151.92ms step:635/1480 train_time:94952ms step_avg:151.92ms step:636/1480 train_time:95107ms step_avg:151.93ms step:637/1480 train_time:95260ms step_avg:151.93ms step:638/1480 train_time:95415ms step_avg:151.93ms step:639/1480 train_time:95569ms step_avg:151.94ms step:640/1480 train_time:95724ms step_avg:151.94ms step:641/1480 train_time:95878ms step_avg:151.95ms step:642/1480 train_time:96032ms step_avg:151.95ms step:643/1480 train_time:96187ms step_avg:151.95ms step:644/1480 train_time:96341ms step_avg:151.96ms step:645/1480 train_time:96496ms step_avg:151.96ms step:646/1480 train_time:96651ms step_avg:151.97ms step:647/1480 train_time:96805ms step_avg:151.97ms step:648/1480 train_time:96960ms 
step_avg:151.97ms step:649/1480 train_time:97115ms step_avg:151.98ms step:650/1480 train_time:97270ms step_avg:151.98ms step:651/1480 train_time:97425ms step_avg:151.99ms step:652/1480 train_time:97580ms step_avg:151.99ms step:653/1480 train_time:97734ms step_avg:152.00ms step:654/1480 train_time:97888ms step_avg:152.00ms step:655/1480 train_time:98043ms step_avg:152.00ms step:656/1480 train_time:98197ms step_avg:152.01ms step:657/1480 train_time:98352ms step_avg:152.01ms step:658/1480 train_time:98507ms step_avg:152.02ms step:659/1480 train_time:98661ms step_avg:152.02ms step:660/1480 train_time:98818ms step_avg:152.03ms step:661/1480 train_time:98976ms step_avg:152.04ms step:662/1480 train_time:99133ms step_avg:152.04ms step:663/1480 train_time:99289ms step_avg:152.05ms step:664/1480 train_time:99446ms step_avg:152.06ms step:665/1480 train_time:99603ms step_avg:152.07ms step:666/1480 train_time:99759ms step_avg:152.07ms step:667/1480 train_time:99915ms step_avg:152.08ms step:668/1480 train_time:100070ms step_avg:152.08ms step:669/1480 train_time:100228ms step_avg:152.09ms step:670/1480 train_time:100384ms step_avg:152.10ms step:671/1480 train_time:100541ms step_avg:152.10ms step:672/1480 train_time:100698ms step_avg:152.11ms step:673/1480 train_time:100855ms step_avg:152.12ms step:674/1480 train_time:101012ms step_avg:152.13ms step:675/1480 train_time:101169ms step_avg:152.13ms step:676/1480 train_time:101326ms step_avg:152.14ms step:677/1480 train_time:101483ms step_avg:152.15ms step:678/1480 train_time:101639ms step_avg:152.15ms step:679/1480 train_time:101795ms step_avg:152.16ms step:680/1480 train_time:101953ms step_avg:152.17ms step:681/1480 train_time:102109ms step_avg:152.17ms step:682/1480 train_time:102265ms step_avg:152.18ms step:683/1480 train_time:102422ms step_avg:152.19ms step:684/1480 train_time:102578ms step_avg:152.19ms step:685/1480 train_time:102735ms step_avg:152.20ms step:686/1480 train_time:102892ms step_avg:152.21ms step:687/1480 
train_time:103048ms step_avg:152.21ms step:688/1480 train_time:103205ms step_avg:152.22ms step:689/1480 train_time:103362ms step_avg:152.23ms step:690/1480 train_time:103520ms step_avg:152.24ms step:691/1480 train_time:103678ms step_avg:152.24ms step:692/1480 train_time:103834ms step_avg:152.25ms step:693/1480 train_time:103991ms step_avg:152.26ms step:694/1480 train_time:104148ms step_avg:152.26ms step:695/1480 train_time:104304ms step_avg:152.27ms step:696/1480 train_time:104459ms step_avg:152.27ms step:697/1480 train_time:104617ms step_avg:152.28ms step:698/1480 train_time:104773ms step_avg:152.29ms step:699/1480 train_time:104930ms step_avg:152.29ms step:700/1480 train_time:105087ms step_avg:152.30ms step:701/1480 train_time:105243ms step_avg:152.30ms step:702/1480 train_time:105399ms step_avg:152.31ms step:703/1480 train_time:105554ms step_avg:152.32ms step:704/1480 train_time:105710ms step_avg:152.32ms step:705/1480 train_time:105868ms step_avg:152.33ms step:706/1480 train_time:106026ms step_avg:152.34ms step:707/1480 train_time:106181ms step_avg:152.34ms step:708/1480 train_time:106336ms step_avg:152.34ms step:709/1480 train_time:106493ms step_avg:152.35ms step:710/1480 train_time:106648ms step_avg:152.35ms step:711/1480 train_time:106805ms step_avg:152.36ms step:712/1480 train_time:106961ms step_avg:152.37ms step:713/1480 train_time:107119ms step_avg:152.37ms step:714/1480 train_time:107277ms step_avg:152.38ms step:715/1480 train_time:107432ms step_avg:152.39ms step:716/1480 train_time:107588ms step_avg:152.39ms step:717/1480 train_time:107745ms step_avg:152.40ms step:718/1480 train_time:107901ms step_avg:152.40ms step:719/1480 train_time:108056ms step_avg:152.41ms step:720/1480 train_time:108214ms step_avg:152.41ms step:721/1480 train_time:108373ms step_avg:152.42ms step:722/1480 train_time:108528ms step_avg:152.43ms step:723/1480 train_time:108685ms step_avg:152.43ms step:724/1480 train_time:108841ms step_avg:152.44ms step:725/1480 train_time:108997ms 
step_avg:152.44ms step:726/1480 train_time:109152ms step_avg:152.45ms step:727/1480 train_time:109309ms step_avg:152.45ms step:728/1480 train_time:109464ms step_avg:152.46ms step:729/1480 train_time:109622ms step_avg:152.46ms step:730/1480 train_time:109779ms step_avg:152.47ms step:731/1480 train_time:109936ms step_avg:152.48ms step:732/1480 train_time:110093ms step_avg:152.48ms step:733/1480 train_time:110250ms step_avg:152.49ms step:734/1480 train_time:110406ms step_avg:152.49ms step:735/1480 train_time:110563ms step_avg:152.50ms step:736/1480 train_time:110719ms step_avg:152.51ms step:737/1480 train_time:110875ms step_avg:152.51ms step:738/1480 train_time:111030ms step_avg:152.51ms step:739/1480 train_time:111186ms step_avg:152.52ms step:740/1480 train_time:111344ms step_avg:152.53ms step:741/1480 train_time:111503ms step_avg:152.53ms step:742/1480 train_time:111658ms step_avg:152.54ms step:743/1480 train_time:111813ms step_avg:152.54ms step:744/1480 train_time:111968ms step_avg:152.55ms step:745/1480 train_time:112126ms step_avg:152.55ms step:746/1480 train_time:112282ms step_avg:152.56ms step:747/1480 train_time:112437ms step_avg:152.56ms step:748/1480 train_time:112598ms step_avg:152.57ms step:749/1480 train_time:112757ms step_avg:152.58ms step:750/1480 train_time:112913ms step_avg:152.59ms step:750/1480 val_loss:3.5494 train_time:112985ms step_avg:152.68ms step:751/1480 train_time:113076ms step_avg:152.60ms step:752/1480 train_time:113231ms step_avg:152.60ms step:753/1480 train_time:113388ms step_avg:152.61ms step:754/1480 train_time:113545ms step_avg:152.61ms step:755/1480 train_time:113700ms step_avg:152.62ms step:756/1480 train_time:113856ms step_avg:152.62ms step:757/1480 train_time:114013ms step_avg:152.63ms step:758/1480 train_time:114170ms step_avg:152.63ms step:759/1480 train_time:114346ms step_avg:152.67ms step:760/1480 train_time:114488ms step_avg:152.65ms step:761/1480 train_time:114645ms step_avg:152.66ms step:762/1480 train_time:114801ms 
step_avg:152.66ms step:763/1480 train_time:114958ms step_avg:152.67ms step:764/1480 train_time:115116ms step_avg:152.67ms step:765/1480 train_time:115273ms step_avg:152.68ms step:766/1480 train_time:115429ms step_avg:152.68ms step:767/1480 train_time:115587ms step_avg:152.69ms step:768/1480 train_time:115743ms step_avg:152.70ms step:769/1480 train_time:115899ms step_avg:152.70ms step:770/1480 train_time:116057ms step_avg:152.71ms step:771/1480 train_time:116214ms step_avg:152.71ms step:772/1480 train_time:116371ms step_avg:152.72ms step:773/1480 train_time:116528ms step_avg:152.72ms step:774/1480 train_time:116685ms step_avg:152.73ms step:775/1480 train_time:116843ms step_avg:152.74ms step:776/1480 train_time:117001ms step_avg:152.74ms step:777/1480 train_time:117162ms step_avg:152.75ms step:778/1480 train_time:117322ms step_avg:152.76ms step:779/1480 train_time:117480ms step_avg:152.77ms step:780/1480 train_time:117637ms step_avg:152.78ms step:781/1480 train_time:117794ms step_avg:152.78ms step:782/1480 train_time:117951ms step_avg:152.79ms step:783/1480 train_time:118107ms step_avg:152.79ms step:784/1480 train_time:118265ms step_avg:152.80ms step:785/1480 train_time:118423ms step_avg:152.80ms step:786/1480 train_time:118580ms step_avg:152.81ms step:787/1480 train_time:118738ms step_avg:152.82ms step:788/1480 train_time:118897ms step_avg:152.82ms step:789/1480 train_time:119053ms step_avg:152.83ms step:790/1480 train_time:119210ms step_avg:152.83ms step:791/1480 train_time:119372ms step_avg:152.84ms step:792/1480 train_time:119528ms step_avg:152.85ms step:793/1480 train_time:119685ms step_avg:152.85ms step:794/1480 train_time:119843ms step_avg:152.86ms step:795/1480 train_time:120004ms step_avg:152.87ms step:796/1480 train_time:120165ms step_avg:152.88ms step:797/1480 train_time:120325ms step_avg:152.89ms step:798/1480 train_time:120485ms step_avg:152.90ms step:799/1480 train_time:120647ms step_avg:152.91ms step:800/1480 train_time:120806ms step_avg:152.92ms 
step:801/1480 train_time:120963ms step_avg:152.92ms step:802/1480 train_time:121121ms step_avg:152.93ms step:803/1480 train_time:121279ms step_avg:152.94ms step:804/1480 train_time:121436ms step_avg:152.94ms step:805/1480 train_time:121595ms step_avg:152.95ms step:806/1480 train_time:121752ms step_avg:152.95ms step:807/1480 train_time:121907ms step_avg:152.96ms step:808/1480 train_time:122064ms step_avg:152.96ms step:809/1480 train_time:122222ms step_avg:152.97ms step:810/1480 train_time:122380ms step_avg:152.97ms step:811/1480 train_time:122536ms step_avg:152.98ms step:812/1480 train_time:122693ms step_avg:152.98ms step:813/1480 train_time:122849ms step_avg:152.99ms step:814/1480 train_time:123005ms step_avg:152.99ms step:815/1480 train_time:123163ms step_avg:153.00ms step:816/1480 train_time:123322ms step_avg:153.01ms step:817/1480 train_time:123479ms step_avg:153.01ms step:818/1480 train_time:123637ms step_avg:153.02ms step:819/1480 train_time:123795ms step_avg:153.02ms step:820/1480 train_time:123953ms step_avg:153.03ms step:821/1480 train_time:124109ms step_avg:153.03ms step:822/1480 train_time:124267ms step_avg:153.04ms step:823/1480 train_time:124425ms step_avg:153.04ms step:824/1480 train_time:124583ms step_avg:153.05ms step:825/1480 train_time:124743ms step_avg:153.06ms step:826/1480 train_time:124903ms step_avg:153.07ms step:827/1480 train_time:125061ms step_avg:153.07ms step:828/1480 train_time:125219ms step_avg:153.08ms step:829/1480 train_time:125377ms step_avg:153.09ms step:830/1480 train_time:125535ms step_avg:153.09ms step:831/1480 train_time:125693ms step_avg:153.10ms step:832/1480 train_time:125849ms step_avg:153.10ms step:833/1480 train_time:126007ms step_avg:153.11ms step:834/1480 train_time:126167ms step_avg:153.11ms step:835/1480 train_time:126325ms step_avg:153.12ms step:836/1480 train_time:126486ms step_avg:153.13ms step:837/1480 train_time:126644ms step_avg:153.14ms step:838/1480 train_time:126801ms step_avg:153.14ms step:839/1480 
train_time:126958ms step_avg:153.15ms step:840/1480 train_time:127115ms step_avg:153.15ms step:841/1480 train_time:127272ms step_avg:153.16ms step:842/1480 train_time:127431ms step_avg:153.16ms step:843/1480 train_time:127588ms step_avg:153.17ms step:844/1480 train_time:127745ms step_avg:153.17ms step:845/1480 train_time:127902ms step_avg:153.18ms step:846/1480 train_time:128061ms step_avg:153.18ms step:847/1480 train_time:128220ms step_avg:153.19ms step:848/1480 train_time:128377ms step_avg:153.19ms step:849/1480 train_time:128534ms step_avg:153.20ms step:850/1480 train_time:128693ms step_avg:153.21ms step:851/1480 train_time:128851ms step_avg:153.21ms step:852/1480 train_time:129007ms step_avg:153.21ms step:853/1480 train_time:129164ms step_avg:153.22ms step:854/1480 train_time:129323ms step_avg:153.23ms step:855/1480 train_time:129481ms step_avg:153.23ms step:856/1480 train_time:129638ms step_avg:153.24ms step:857/1480 train_time:129795ms step_avg:153.24ms step:858/1480 train_time:129954ms step_avg:153.25ms step:859/1480 train_time:130112ms step_avg:153.25ms step:860/1480 train_time:130270ms step_avg:153.26ms step:861/1480 train_time:130427ms step_avg:153.26ms step:862/1480 train_time:130589ms step_avg:153.27ms step:863/1480 train_time:130749ms step_avg:153.28ms step:864/1480 train_time:130906ms step_avg:153.29ms step:865/1480 train_time:131064ms step_avg:153.29ms step:866/1480 train_time:131224ms step_avg:153.30ms step:867/1480 train_time:131383ms step_avg:153.31ms step:868/1480 train_time:131540ms step_avg:153.31ms step:869/1480 train_time:131697ms step_avg:153.31ms step:870/1480 train_time:131856ms step_avg:153.32ms step:871/1480 train_time:132012ms step_avg:153.32ms step:872/1480 train_time:132170ms step_avg:153.33ms step:873/1480 train_time:132327ms step_avg:153.33ms step:874/1480 train_time:132487ms step_avg:153.34ms step:875/1480 train_time:132648ms step_avg:153.35ms step:875/1480 val_loss:3.5054 train_time:132720ms step_avg:153.43ms step:876/1480 
train_time:132811ms step_avg:153.36ms step:877/1480 train_time:132968ms step_avg:153.37ms step:878/1480 train_time:133127ms step_avg:153.37ms step:879/1480 train_time:133286ms step_avg:153.38ms step:880/1480 train_time:133445ms step_avg:153.38ms step:881/1480 train_time:133603ms step_avg:153.39ms step:882/1480 train_time:133763ms step_avg:153.40ms step:883/1480 train_time:133923ms step_avg:153.41ms step:884/1480 train_time:134084ms step_avg:153.41ms step:885/1480 train_time:134245ms step_avg:153.42ms step:886/1480 train_time:134406ms step_avg:153.43ms step:887/1480 train_time:134565ms step_avg:153.44ms step:888/1480 train_time:134731ms step_avg:153.45ms step:889/1480 train_time:134894ms step_avg:153.46ms step:890/1480 train_time:135052ms step_avg:153.47ms step:891/1480 train_time:135212ms step_avg:153.48ms step:892/1480 train_time:135372ms step_avg:153.48ms step:893/1480 train_time:135531ms step_avg:153.49ms step:894/1480 train_time:135691ms step_avg:153.50ms step:895/1480 train_time:135852ms step_avg:153.50ms step:896/1480 train_time:136010ms step_avg:153.51ms step:897/1480 train_time:136170ms step_avg:153.52ms step:898/1480 train_time:136331ms step_avg:153.53ms step:899/1480 train_time:136491ms step_avg:153.53ms step:900/1480 train_time:136649ms step_avg:153.54ms step:901/1480 train_time:136809ms step_avg:153.55ms step:902/1480 train_time:136967ms step_avg:153.55ms step:903/1480 train_time:137131ms step_avg:153.56ms step:904/1480 train_time:137290ms step_avg:153.57ms step:905/1480 train_time:137449ms step_avg:153.57ms step:906/1480 train_time:137609ms step_avg:153.58ms step:907/1480 train_time:137771ms step_avg:153.59ms step:908/1480 train_time:137930ms step_avg:153.60ms step:909/1480 train_time:138089ms step_avg:153.60ms step:910/1480 train_time:138254ms step_avg:153.62ms step:911/1480 train_time:138413ms step_avg:153.62ms step:912/1480 train_time:138572ms step_avg:153.63ms step:913/1480 train_time:138733ms step_avg:153.64ms step:914/1480 train_time:138892ms 
step_avg:153.64ms step:915/1480 train_time:139054ms step_avg:153.65ms step:916/1480 train_time:139212ms step_avg:153.66ms step:917/1480 train_time:139370ms step_avg:153.66ms step:918/1480 train_time:139532ms step_avg:153.67ms step:919/1480 train_time:139695ms step_avg:153.68ms step:920/1480 train_time:139854ms step_avg:153.69ms step:921/1480 train_time:140012ms step_avg:153.69ms step:922/1480 train_time:140173ms step_avg:153.70ms step:923/1480 train_time:140330ms step_avg:153.70ms step:924/1480 train_time:140488ms step_avg:153.71ms step:925/1480 train_time:140650ms step_avg:153.72ms step:926/1480 train_time:140808ms step_avg:153.72ms step:927/1480 train_time:140966ms step_avg:153.73ms step:928/1480 train_time:141127ms step_avg:153.73ms step:929/1480 train_time:141287ms step_avg:153.74ms step:930/1480 train_time:141449ms step_avg:153.75ms step:931/1480 train_time:141610ms step_avg:153.76ms step:932/1480 train_time:141769ms step_avg:153.76ms step:933/1480 train_time:141929ms step_avg:153.77ms step:934/1480 train_time:142089ms step_avg:153.78ms step:935/1480 train_time:142251ms step_avg:153.79ms step:936/1480 train_time:142411ms step_avg:153.79ms step:937/1480 train_time:142573ms step_avg:153.80ms step:938/1480 train_time:142731ms step_avg:153.80ms step:939/1480 train_time:142891ms step_avg:153.81ms step:940/1480 train_time:143052ms step_avg:153.82ms step:941/1480 train_time:143211ms step_avg:153.83ms step:942/1480 train_time:143369ms step_avg:153.83ms step:943/1480 train_time:143531ms step_avg:153.84ms step:944/1480 train_time:143693ms step_avg:153.85ms step:945/1480 train_time:143852ms step_avg:153.85ms step:946/1480 train_time:144014ms step_avg:153.86ms step:947/1480 train_time:144175ms step_avg:153.87ms step:948/1480 train_time:144334ms step_avg:153.87ms step:949/1480 train_time:144508ms step_avg:153.90ms step:950/1480 train_time:144652ms step_avg:153.88ms step:951/1480 train_time:144813ms step_avg:153.89ms step:952/1480 train_time:144972ms step_avg:153.90ms 
step:953/1480 train_time:145133ms step_avg:153.91ms step:954/1480 train_time:145295ms step_avg:153.91ms step:955/1480 train_time:145453ms step_avg:153.92ms step:956/1480 train_time:145612ms step_avg:153.92ms step:957/1480 train_time:145773ms step_avg:153.93ms step:958/1480 train_time:145935ms step_avg:153.94ms step:959/1480 train_time:146094ms step_avg:153.94ms step:960/1480 train_time:146254ms step_avg:153.95ms step:961/1480 train_time:146413ms step_avg:153.96ms step:962/1480 train_time:146571ms step_avg:153.96ms step:963/1480 train_time:146732ms step_avg:153.97ms step:964/1480 train_time:146893ms step_avg:153.98ms step:965/1480 train_time:147052ms step_avg:153.98ms step:966/1480 train_time:147211ms step_avg:153.99ms step:967/1480 train_time:147369ms step_avg:153.99ms step:968/1480 train_time:147529ms step_avg:154.00ms step:969/1480 train_time:147689ms step_avg:154.00ms step:970/1480 train_time:147850ms step_avg:154.01ms step:971/1480 train_time:148008ms step_avg:154.01ms step:972/1480 train_time:148166ms step_avg:154.02ms step:973/1480 train_time:148325ms step_avg:154.02ms step:974/1480 train_time:148485ms step_avg:154.03ms step:975/1480 train_time:148647ms step_avg:154.04ms step:976/1480 train_time:148808ms step_avg:154.05ms step:977/1480 train_time:148969ms step_avg:154.05ms step:978/1480 train_time:149129ms step_avg:154.06ms step:979/1480 train_time:149290ms step_avg:154.07ms step:980/1480 train_time:149450ms step_avg:154.07ms step:981/1480 train_time:149611ms step_avg:154.08ms step:982/1480 train_time:149768ms step_avg:154.08ms step:983/1480 train_time:149928ms step_avg:154.09ms step:984/1480 train_time:150088ms step_avg:154.09ms step:985/1480 train_time:150252ms step_avg:154.10ms step:986/1480 train_time:150412ms step_avg:154.11ms step:987/1480 train_time:150570ms step_avg:154.11ms step:988/1480 train_time:150730ms step_avg:154.12ms step:989/1480 train_time:150890ms step_avg:154.13ms step:990/1480 train_time:151052ms step_avg:154.14ms step:991/1480 
train_time:151214ms step_avg:154.14ms step:992/1480 train_time:151377ms step_avg:154.15ms step:993/1480 train_time:151545ms step_avg:154.17ms step:994/1480 train_time:151705ms step_avg:154.17ms step:995/1480 train_time:151863ms step_avg:154.18ms step:996/1480 train_time:152020ms step_avg:154.18ms step:997/1480 train_time:152179ms step_avg:154.18ms step:998/1480 train_time:152341ms step_avg:154.19ms step:999/1480 train_time:152500ms step_avg:154.20ms step:1000/1480 train_time:152661ms step_avg:154.20ms step:1000/1480 val_loss:3.4427 train_time:152734ms step_avg:154.28ms step:1001/1480 train_time:152825ms step_avg:154.21ms step:1002/1480 train_time:152983ms step_avg:154.22ms step:1003/1480 train_time:153145ms step_avg:154.22ms step:1004/1480 train_time:153306ms step_avg:154.23ms step:1005/1480 train_time:153466ms step_avg:154.24ms step:1006/1480 train_time:153627ms step_avg:154.24ms step:1007/1480 train_time:153788ms step_avg:154.25ms step:1008/1480 train_time:153947ms step_avg:154.26ms step:1009/1480 train_time:154115ms step_avg:154.27ms step:1010/1480 train_time:154276ms step_avg:154.28ms step:1011/1480 train_time:154436ms step_avg:154.28ms step:1012/1480 train_time:154596ms step_avg:154.29ms step:1013/1480 train_time:154759ms step_avg:154.30ms step:1014/1480 train_time:154919ms step_avg:154.30ms step:1015/1480 train_time:155083ms step_avg:154.31ms step:1016/1480 train_time:155242ms step_avg:154.32ms step:1017/1480 train_time:155404ms step_avg:154.32ms step:1018/1480 train_time:155564ms step_avg:154.33ms step:1019/1480 train_time:155724ms step_avg:154.33ms step:1020/1480 train_time:155884ms step_avg:154.34ms step:1021/1480 train_time:156043ms step_avg:154.34ms step:1022/1480 train_time:156202ms step_avg:154.35ms step:1023/1480 train_time:156363ms step_avg:154.36ms step:1024/1480 train_time:156522ms step_avg:154.36ms step:1025/1480 train_time:156683ms step_avg:154.37ms step:1026/1480 train_time:156843ms step_avg:154.37ms step:1027/1480 train_time:157003ms 
step_avg:154.38ms step:1028/1480 train_time:157164ms step_avg:154.39ms step:1029/1480 train_time:157327ms step_avg:154.39ms step:1030/1480 train_time:157486ms step_avg:154.40ms step:1031/1480 train_time:157644ms step_avg:154.40ms step:1032/1480 train_time:157807ms step_avg:154.41ms step:1033/1480 train_time:157966ms step_avg:154.41ms step:1034/1480 train_time:158125ms step_avg:154.42ms step:1035/1480 train_time:158286ms step_avg:154.43ms step:1036/1480 train_time:158445ms step_avg:154.43ms step:1037/1480 train_time:158605ms step_avg:154.44ms step:1038/1480 train_time:158764ms step_avg:154.44ms step:1039/1480 train_time:158926ms step_avg:154.45ms step:1040/1480 train_time:159085ms step_avg:154.45ms step:1041/1480 train_time:159245ms step_avg:154.46ms step:1042/1480 train_time:159403ms step_avg:154.46ms step:1043/1480 train_time:159561ms step_avg:154.46ms step:1044/1480 train_time:159720ms step_avg:154.47ms step:1045/1480 train_time:159881ms step_avg:154.47ms step:1046/1480 train_time:160040ms step_avg:154.48ms step:1047/1480 train_time:160201ms step_avg:154.49ms step:1048/1480 train_time:160360ms step_avg:154.49ms step:1049/1480 train_time:160520ms step_avg:154.49ms step:1050/1480 train_time:160681ms step_avg:154.50ms step:1051/1480 train_time:160842ms step_avg:154.51ms step:1052/1480 train_time:161002ms step_avg:154.51ms step:1053/1480 train_time:161163ms step_avg:154.52ms step:1054/1480 train_time:161323ms step_avg:154.52ms step:1055/1480 train_time:161483ms step_avg:154.53ms step:1056/1480 train_time:161642ms step_avg:154.53ms step:1057/1480 train_time:161801ms step_avg:154.54ms step:1058/1480 train_time:161962ms step_avg:154.54ms step:1059/1480 train_time:162124ms step_avg:154.55ms step:1060/1480 train_time:162285ms step_avg:154.56ms step:1061/1480 train_time:162442ms step_avg:154.56ms step:1062/1480 train_time:162601ms step_avg:154.56ms step:1063/1480 train_time:162760ms step_avg:154.57ms step:1064/1480 train_time:162918ms step_avg:154.57ms step:1065/1480 
train_time:163080ms step_avg:154.58ms step:1066/1480 train_time:163241ms step_avg:154.58ms step:1067/1480 train_time:163403ms step_avg:154.59ms step:1068/1480 train_time:163562ms step_avg:154.60ms step:1069/1480 train_time:163724ms step_avg:154.60ms step:1070/1480 train_time:163883ms step_avg:154.61ms step:1071/1480 train_time:164046ms step_avg:154.61ms step:1072/1480 train_time:164204ms step_avg:154.62ms step:1073/1480 train_time:164363ms step_avg:154.62ms step:1074/1480 train_time:164521ms step_avg:154.63ms step:1075/1480 train_time:164682ms step_avg:154.63ms step:1076/1480 train_time:164840ms step_avg:154.63ms step:1077/1480 train_time:165000ms step_avg:154.64ms step:1078/1480 train_time:165165ms step_avg:154.65ms step:1079/1480 train_time:165328ms step_avg:154.66ms step:1080/1480 train_time:165490ms step_avg:154.66ms step:1081/1480 train_time:165651ms step_avg:154.67ms step:1082/1480 train_time:165811ms step_avg:154.67ms step:1083/1480 train_time:165971ms step_avg:154.68ms step:1084/1480 train_time:166132ms step_avg:154.69ms step:1085/1480 train_time:166292ms step_avg:154.69ms step:1086/1480 train_time:166453ms step_avg:154.70ms step:1087/1480 train_time:166615ms step_avg:154.70ms step:1088/1480 train_time:166776ms step_avg:154.71ms step:1089/1480 train_time:166939ms step_avg:154.72ms step:1090/1480 train_time:167103ms step_avg:154.73ms step:1091/1480 train_time:167263ms step_avg:154.73ms step:1092/1480 train_time:167423ms step_avg:154.73ms step:1093/1480 train_time:167583ms step_avg:154.74ms step:1094/1480 train_time:167741ms step_avg:154.74ms step:1095/1480 train_time:167901ms step_avg:154.75ms step:1096/1480 train_time:168062ms step_avg:154.75ms step:1097/1480 train_time:168223ms step_avg:154.76ms step:1098/1480 train_time:168385ms step_avg:154.77ms step:1099/1480 train_time:168546ms step_avg:154.77ms step:1100/1480 train_time:168709ms step_avg:154.78ms step:1101/1480 train_time:168871ms step_avg:154.79ms step:1102/1480 train_time:169035ms step_avg:154.79ms 
step:1103/1480 train_time:169203ms step_avg:154.81ms step:1104/1480 train_time:169364ms step_avg:154.81ms step:1105/1480 train_time:169526ms step_avg:154.82ms step:1106/1480 train_time:169687ms step_avg:154.82ms step:1107/1480 train_time:169848ms step_avg:154.83ms step:1108/1480 train_time:170006ms step_avg:154.83ms step:1109/1480 train_time:170165ms step_avg:154.84ms step:1110/1480 train_time:170328ms step_avg:154.84ms step:1111/1480 train_time:170489ms step_avg:154.85ms step:1112/1480 train_time:170652ms step_avg:154.86ms step:1113/1480 train_time:170821ms step_avg:154.87ms step:1114/1480 train_time:170983ms step_avg:154.88ms step:1115/1480 train_time:171145ms step_avg:154.88ms step:1116/1480 train_time:171305ms step_avg:154.89ms step:1117/1480 train_time:171468ms step_avg:154.89ms step:1118/1480 train_time:171633ms step_avg:154.90ms step:1119/1480 train_time:171795ms step_avg:154.91ms step:1120/1480 train_time:171957ms step_avg:154.92ms step:1121/1480 train_time:172119ms step_avg:154.92ms step:1122/1480 train_time:172280ms step_avg:154.93ms step:1123/1480 train_time:172440ms step_avg:154.93ms step:1124/1480 train_time:172603ms step_avg:154.94ms step:1125/1480 train_time:172764ms step_avg:154.95ms step:1125/1480 val_loss:3.3869 train_time:172838ms step_avg:155.01ms step:1126/1480 train_time:172934ms step_avg:154.96ms step:1127/1480 train_time:173087ms step_avg:154.96ms step:1128/1480 train_time:173248ms step_avg:154.96ms step:1129/1480 train_time:173410ms step_avg:154.97ms step:1130/1480 train_time:173570ms step_avg:154.97ms step:1131/1480 train_time:173738ms step_avg:154.99ms step:1132/1480 train_time:173900ms step_avg:154.99ms step:1133/1480 train_time:174063ms step_avg:155.00ms step:1134/1480 train_time:174226ms step_avg:155.01ms step:1135/1480 train_time:174386ms step_avg:155.01ms step:1136/1480 train_time:174548ms step_avg:155.02ms step:1137/1480 train_time:174708ms step_avg:155.02ms step:1138/1480 train_time:174873ms step_avg:155.03ms step:1139/1480 
train_time:175048ms step_avg:155.05ms step:1140/1480 train_time:175197ms step_avg:155.04ms step:1141/1480 train_time:175363ms step_avg:155.05ms step:1142/1480 train_time:175524ms step_avg:155.06ms step:1143/1480 train_time:175687ms step_avg:155.06ms step:1144/1480 train_time:175849ms step_avg:155.07ms step:1145/1480 train_time:176007ms step_avg:155.07ms step:1146/1480 train_time:176170ms step_avg:155.08ms step:1147/1480 train_time:176331ms step_avg:155.08ms step:1148/1480 train_time:176492ms step_avg:155.09ms step:1149/1480 train_time:176658ms step_avg:155.10ms step:1150/1480 train_time:176820ms step_avg:155.11ms step:1151/1480 train_time:176982ms step_avg:155.11ms step:1152/1480 train_time:177145ms step_avg:155.12ms step:1153/1480 train_time:177309ms step_avg:155.13ms step:1154/1480 train_time:177469ms step_avg:155.13ms step:1155/1480 train_time:177630ms step_avg:155.14ms step:1156/1480 train_time:177799ms step_avg:155.15ms step:1157/1480 train_time:177961ms step_avg:155.15ms step:1158/1480 train_time:178121ms step_avg:155.16ms step:1159/1480 train_time:178281ms step_avg:155.16ms step:1160/1480 train_time:178442ms step_avg:155.17ms step:1161/1480 train_time:178604ms step_avg:155.17ms step:1162/1480 train_time:178766ms step_avg:155.18ms step:1163/1480 train_time:178928ms step_avg:155.18ms step:1164/1480 train_time:179090ms step_avg:155.19ms step:1165/1480 train_time:179249ms step_avg:155.19ms step:1166/1480 train_time:179410ms step_avg:155.20ms step:1167/1480 train_time:179571ms step_avg:155.20ms step:1168/1480 train_time:179735ms step_avg:155.21ms step:1169/1480 train_time:179898ms step_avg:155.22ms step:1170/1480 train_time:180061ms step_avg:155.22ms step:1171/1480 train_time:180222ms step_avg:155.23ms step:1172/1480 train_time:180381ms step_avg:155.23ms step:1173/1480 train_time:180544ms step_avg:155.24ms step:1174/1480 train_time:180714ms step_avg:155.25ms step:1175/1480 train_time:180878ms step_avg:155.26ms step:1176/1480 train_time:181042ms step_avg:155.27ms 
step:1177/1480 train_time:181208ms step_avg:155.28ms step:1178/1480 train_time:181370ms step_avg:155.28ms step:1179/1480 train_time:181529ms step_avg:155.29ms step:1180/1480 train_time:181698ms step_avg:155.30ms step:1181/1480 train_time:181862ms step_avg:155.30ms step:1182/1480 train_time:182023ms step_avg:155.31ms step:1183/1480 train_time:182183ms step_avg:155.31ms step:1184/1480 train_time:182344ms step_avg:155.32ms step:1185/1480 train_time:182507ms step_avg:155.33ms step:1186/1480 train_time:182669ms step_avg:155.33ms step:1187/1480 train_time:182843ms step_avg:155.35ms step:1188/1480 train_time:183002ms step_avg:155.35ms step:1189/1480 train_time:183164ms step_avg:155.36ms step:1190/1480 train_time:183324ms step_avg:155.36ms step:1191/1480 train_time:183486ms step_avg:155.37ms step:1192/1480 train_time:183646ms step_avg:155.37ms step:1193/1480 train_time:183806ms step_avg:155.37ms step:1194/1480 train_time:183967ms step_avg:155.38ms step:1195/1480 train_time:184129ms step_avg:155.38ms step:1196/1480 train_time:184300ms step_avg:155.40ms step:1197/1480 train_time:184462ms step_avg:155.40ms step:1198/1480 train_time:184630ms step_avg:155.41ms step:1199/1480 train_time:184792ms step_avg:155.42ms step:1200/1480 train_time:184954ms step_avg:155.42ms step:1201/1480 train_time:185117ms step_avg:155.43ms step:1202/1480 train_time:185285ms step_avg:155.44ms step:1203/1480 train_time:185450ms step_avg:155.45ms step:1204/1480 train_time:185615ms step_avg:155.46ms step:1205/1480 train_time:185778ms step_avg:155.46ms step:1206/1480 train_time:185940ms step_avg:155.47ms step:1207/1480 train_time:186101ms step_avg:155.47ms step:1208/1480 train_time:186263ms step_avg:155.48ms step:1209/1480 train_time:186427ms step_avg:155.49ms step:1210/1480 train_time:186591ms step_avg:155.49ms step:1211/1480 train_time:186753ms step_avg:155.50ms step:1212/1480 train_time:186917ms step_avg:155.51ms step:1213/1480 train_time:187083ms step_avg:155.51ms step:1214/1480 train_time:187248ms 
step_avg:155.52ms step:1215/1480 train_time:187412ms step_avg:155.53ms step:1216/1480 train_time:187572ms step_avg:155.53ms step:1217/1480 train_time:187738ms step_avg:155.54ms step:1218/1480 train_time:187900ms step_avg:155.55ms step:1219/1480 train_time:188067ms step_avg:155.56ms step:1220/1480 train_time:188228ms step_avg:155.56ms step:1221/1480 train_time:188388ms step_avg:155.56ms step:1222/1480 train_time:188548ms step_avg:155.57ms step:1223/1480 train_time:188709ms step_avg:155.57ms step:1224/1480 train_time:188874ms step_avg:155.58ms step:1225/1480 train_time:189039ms step_avg:155.59ms step:1226/1480 train_time:189204ms step_avg:155.60ms step:1227/1480 train_time:189367ms step_avg:155.60ms step:1228/1480 train_time:189529ms step_avg:155.61ms step:1229/1480 train_time:189691ms step_avg:155.61ms step:1230/1480 train_time:189862ms step_avg:155.62ms step:1231/1480 train_time:190027ms step_avg:155.63ms step:1232/1480 train_time:190192ms step_avg:155.64ms step:1233/1480 train_time:190353ms step_avg:155.64ms step:1234/1480 train_time:190516ms step_avg:155.65ms step:1235/1480 train_time:190684ms step_avg:155.66ms step:1236/1480 train_time:190846ms step_avg:155.67ms step:1237/1480 train_time:191006ms step_avg:155.67ms step:1238/1480 train_time:191179ms step_avg:155.68ms step:1239/1480 train_time:191341ms step_avg:155.69ms step:1240/1480 train_time:191504ms step_avg:155.69ms step:1241/1480 train_time:191668ms step_avg:155.70ms step:1242/1480 train_time:191830ms step_avg:155.71ms step:1243/1480 train_time:191992ms step_avg:155.71ms step:1244/1480 train_time:192154ms step_avg:155.72ms step:1245/1480 train_time:192318ms step_avg:155.72ms step:1246/1480 train_time:192481ms step_avg:155.73ms step:1247/1480 train_time:192644ms step_avg:155.73ms step:1248/1480 train_time:192805ms step_avg:155.74ms step:1249/1480 train_time:192965ms step_avg:155.74ms step:1250/1480 train_time:193126ms step_avg:155.75ms step:1250/1480 val_loss:3.3379 train_time:193201ms step_avg:155.81ms 
step:1251/1480 train_time:193296ms step_avg:155.76ms step:1252/1480 train_time:193458ms step_avg:155.76ms step:1253/1480 train_time:193618ms step_avg:155.77ms step:1254/1480 train_time:193780ms step_avg:155.77ms step:1255/1480 train_time:193951ms step_avg:155.78ms step:1256/1480 train_time:194115ms step_avg:155.79ms step:1257/1480 train_time:194278ms step_avg:155.80ms step:1258/1480 train_time:194442ms step_avg:155.80ms step:1259/1480 train_time:194605ms step_avg:155.81ms step:1260/1480 train_time:194764ms step_avg:155.81ms step:1261/1480 train_time:194925ms step_avg:155.82ms step:1262/1480 train_time:195090ms step_avg:155.82ms step:1263/1480 train_time:195256ms step_avg:155.83ms step:1264/1480 train_time:195415ms step_avg:155.83ms step:1265/1480 train_time:195576ms step_avg:155.84ms step:1266/1480 train_time:195738ms step_avg:155.84ms step:1267/1480 train_time:195900ms step_avg:155.85ms step:1268/1480 train_time:196061ms step_avg:155.85ms step:1269/1480 train_time:196227ms step_avg:155.86ms step:1270/1480 train_time:196390ms step_avg:155.86ms step:1271/1480 train_time:196554ms step_avg:155.87ms step:1272/1480 train_time:196714ms step_avg:155.87ms step:1273/1480 train_time:196879ms step_avg:155.88ms step:1274/1480 train_time:197043ms step_avg:155.89ms step:1275/1480 train_time:197204ms step_avg:155.89ms step:1276/1480 train_time:197364ms step_avg:155.90ms step:1277/1480 train_time:197525ms step_avg:155.90ms step:1278/1480 train_time:197684ms step_avg:155.90ms step:1279/1480 train_time:197847ms step_avg:155.91ms step:1280/1480 train_time:198014ms step_avg:155.92ms step:1281/1480 train_time:198177ms step_avg:155.92ms step:1282/1480 train_time:198337ms step_avg:155.93ms step:1283/1480 train_time:198499ms step_avg:155.93ms step:1284/1480 train_time:198661ms step_avg:155.94ms step:1285/1480 train_time:198822ms step_avg:155.94ms step:1286/1480 train_time:198982ms step_avg:155.94ms step:1287/1480 train_time:199144ms step_avg:155.95ms step:1288/1480 train_time:199306ms 
step_avg:155.95ms step:1289/1480 train_time:199477ms step_avg:155.96ms step:1290/1480 train_time:199643ms step_avg:155.97ms step:1291/1480 train_time:199807ms step_avg:155.98ms step:1292/1480 train_time:199972ms step_avg:155.98ms step:1293/1480 train_time:200138ms step_avg:155.99ms step:1294/1480 train_time:200300ms step_avg:156.00ms step:1295/1480 train_time:200462ms step_avg:156.00ms step:1296/1480 train_time:200624ms step_avg:156.01ms step:1297/1480 train_time:200788ms step_avg:156.01ms step:1298/1480 train_time:200952ms step_avg:156.02ms step:1299/1480 train_time:201115ms step_avg:156.02ms step:1300/1480 train_time:201276ms step_avg:156.03ms step:1301/1480 train_time:201437ms step_avg:156.03ms step:1302/1480 train_time:201601ms step_avg:156.04ms step:1303/1480 train_time:201767ms step_avg:156.05ms step:1304/1480 train_time:201933ms step_avg:156.05ms step:1305/1480 train_time:202095ms step_avg:156.06ms step:1306/1480 train_time:202260ms step_avg:156.06ms step:1307/1480 train_time:202420ms step_avg:156.07ms step:1308/1480 train_time:202582ms step_avg:156.07ms step:1309/1480 train_time:202747ms step_avg:156.08ms step:1310/1480 train_time:202910ms step_avg:156.08ms step:1311/1480 train_time:203074ms step_avg:156.09ms step:1312/1480 train_time:203237ms step_avg:156.10ms step:1313/1480 train_time:203398ms step_avg:156.10ms step:1314/1480 train_time:203563ms step_avg:156.11ms step:1315/1480 train_time:203726ms step_avg:156.11ms step:1316/1480 train_time:203885ms step_avg:156.11ms step:1317/1480 train_time:204046ms step_avg:156.12ms step:1318/1480 train_time:204214ms step_avg:156.13ms step:1319/1480 train_time:204381ms step_avg:156.14ms step:1320/1480 train_time:204548ms step_avg:156.14ms step:1321/1480 train_time:204714ms step_avg:156.15ms step:1322/1480 train_time:204882ms step_avg:156.16ms step:1323/1480 train_time:205045ms step_avg:156.17ms step:1324/1480 train_time:205210ms step_avg:156.17ms step:1325/1480 train_time:205379ms step_avg:156.18ms step:1326/1480 
train_time:205544ms step_avg:156.19ms step:1327/1480 train_time:205706ms step_avg:156.19ms step:1328/1480 train_time:205869ms step_avg:156.20ms step:1329/1480 train_time:206060ms step_avg:156.22ms step:1330/1480 train_time:206218ms step_avg:156.23ms step:1331/1480 train_time:206380ms step_avg:156.23ms step:1332/1480 train_time:206542ms step_avg:156.23ms step:1333/1480 train_time:206707ms step_avg:156.24ms step:1334/1480 train_time:206872ms step_avg:156.25ms step:1335/1480 train_time:207035ms step_avg:156.25ms step:1336/1480 train_time:207205ms step_avg:156.26ms step:1337/1480 train_time:207374ms step_avg:156.27ms step:1338/1480 train_time:207538ms step_avg:156.28ms step:1339/1480 train_time:207701ms step_avg:156.28ms step:1340/1480 train_time:207864ms step_avg:156.29ms step:1341/1480 train_time:208025ms step_avg:156.29ms step:1342/1480 train_time:208190ms step_avg:156.30ms step:1343/1480 train_time:208354ms step_avg:156.30ms step:1344/1480 train_time:208516ms step_avg:156.31ms step:1345/1480 train_time:208685ms step_avg:156.32ms step:1346/1480 train_time:208847ms step_avg:156.32ms step:1347/1480 train_time:209012ms step_avg:156.33ms step:1348/1480 train_time:209175ms step_avg:156.33ms step:1349/1480 train_time:209337ms step_avg:156.34ms step:1350/1480 train_time:209502ms step_avg:156.35ms step:1351/1480 train_time:209664ms step_avg:156.35ms step:1352/1480 train_time:209826ms step_avg:156.35ms step:1353/1480 train_time:209992ms step_avg:156.36ms step:1354/1480 train_time:210156ms step_avg:156.37ms step:1355/1480 train_time:210317ms step_avg:156.37ms step:1356/1480 train_time:210483ms step_avg:156.38ms step:1357/1480 train_time:210646ms step_avg:156.38ms step:1358/1480 train_time:210810ms step_avg:156.39ms step:1359/1480 train_time:210976ms step_avg:156.39ms step:1360/1480 train_time:211141ms step_avg:156.40ms step:1361/1480 train_time:211307ms step_avg:156.41ms step:1362/1480 train_time:211473ms step_avg:156.41ms step:1363/1480 train_time:211640ms step_avg:156.42ms 
step:1364/1480 train_time:211801ms step_avg:156.43ms step:1365/1480 train_time:211962ms step_avg:156.43ms step:1366/1480 train_time:212124ms step_avg:156.43ms step:1367/1480 train_time:212286ms step_avg:156.44ms step:1368/1480 train_time:212453ms step_avg:156.45ms step:1369/1480 train_time:212622ms step_avg:156.45ms step:1370/1480 train_time:212787ms step_avg:156.46ms step:1371/1480 train_time:212950ms step_avg:156.47ms step:1372/1480 train_time:213119ms step_avg:156.48ms step:1373/1480 train_time:213280ms step_avg:156.48ms step:1374/1480 train_time:213447ms step_avg:156.49ms step:1375/1480 train_time:213611ms step_avg:156.49ms step:1375/1480 val_loss:3.2996 train_time:213686ms step_avg:156.55ms step:1376/1480 train_time:213778ms step_avg:156.50ms step:1377/1480 train_time:213942ms step_avg:156.50ms step:1378/1480 train_time:214104ms step_avg:156.51ms step:1379/1480 train_time:214270ms step_avg:156.52ms step:1380/1480 train_time:214432ms step_avg:156.52ms step:1381/1480 train_time:214602ms step_avg:156.53ms step:1382/1480 train_time:214767ms step_avg:156.54ms step:1383/1480 train_time:214929ms step_avg:156.54ms step:1384/1480 train_time:215095ms step_avg:156.55ms step:1385/1480 train_time:215255ms step_avg:156.55ms step:1386/1480 train_time:215419ms step_avg:156.55ms step:1387/1480 train_time:215585ms step_avg:156.56ms step:1388/1480 train_time:215746ms step_avg:156.56ms step:1389/1480 train_time:215911ms step_avg:156.57ms step:1390/1480 train_time:216072ms step_avg:156.57ms step:1391/1480 train_time:216235ms step_avg:156.58ms step:1392/1480 train_time:216402ms step_avg:156.59ms step:1393/1480 train_time:216565ms step_avg:156.59ms step:1394/1480 train_time:216727ms step_avg:156.59ms step:1395/1480 train_time:216889ms step_avg:156.60ms step:1396/1480 train_time:217051ms step_avg:156.60ms step:1397/1480 train_time:217211ms step_avg:156.60ms step:1398/1480 train_time:217371ms step_avg:156.61ms step:1399/1480 train_time:217531ms step_avg:156.61ms step:1400/1480 
train_time:217701ms step_avg:156.62ms step:1401/1480 train_time:217862ms step_avg:156.62ms step:1402/1480 train_time:218024ms step_avg:156.63ms step:1403/1480 train_time:218190ms step_avg:156.63ms step:1404/1480 train_time:218352ms step_avg:156.64ms step:1405/1480 train_time:218518ms step_avg:156.64ms step:1406/1480 train_time:218684ms step_avg:156.65ms step:1407/1480 train_time:218845ms step_avg:156.65ms step:1408/1480 train_time:219007ms step_avg:156.66ms step:1409/1480 train_time:219177ms step_avg:156.67ms step:1410/1480 train_time:219342ms step_avg:156.67ms step:1411/1480 train_time:219503ms step_avg:156.68ms step:1412/1480 train_time:219665ms step_avg:156.68ms step:1413/1480 train_time:219827ms step_avg:156.68ms step:1414/1480 train_time:219991ms step_avg:156.69ms step:1415/1480 train_time:220154ms step_avg:156.69ms step:1416/1480 train_time:220329ms step_avg:156.71ms step:1417/1480 train_time:220494ms step_avg:156.71ms step:1418/1480 train_time:220659ms step_avg:156.72ms step:1419/1480 train_time:220824ms step_avg:156.72ms step:1420/1480 train_time:220989ms step_avg:156.73ms step:1421/1480 train_time:221152ms step_avg:156.73ms step:1422/1480 train_time:221317ms step_avg:156.74ms step:1423/1480 train_time:221481ms step_avg:156.75ms step:1424/1480 train_time:221647ms step_avg:156.75ms step:1425/1480 train_time:221816ms step_avg:156.76ms step:1426/1480 train_time:221981ms step_avg:156.77ms step:1427/1480 train_time:222146ms step_avg:156.77ms step:1428/1480 train_time:222309ms step_avg:156.78ms step:1429/1480 train_time:222469ms step_avg:156.78ms step:1430/1480 train_time:222632ms step_avg:156.78ms step:1431/1480 train_time:222798ms step_avg:156.79ms step:1432/1480 train_time:222967ms step_avg:156.80ms step:1433/1480 train_time:223134ms step_avg:156.81ms step:1434/1480 train_time:223305ms step_avg:156.82ms step:1435/1480 train_time:223470ms step_avg:156.82ms step:1436/1480 train_time:223635ms step_avg:156.83ms step:1437/1480 train_time:223798ms step_avg:156.83ms 
step:1438/1480 train_time:223961ms step_avg:156.84ms step:1439/1480 train_time:224127ms step_avg:156.84ms step:1440/1480 train_time:224288ms step_avg:156.84ms step:1441/1480 train_time:224451ms step_avg:156.85ms step:1442/1480 train_time:224618ms step_avg:156.86ms step:1443/1480 train_time:224791ms step_avg:156.87ms step:1444/1480 train_time:224953ms step_avg:156.87ms step:1445/1480 train_time:225115ms step_avg:156.87ms step:1446/1480 train_time:225283ms step_avg:156.88ms step:1447/1480 train_time:225449ms step_avg:156.89ms step:1448/1480 train_time:225611ms step_avg:156.89ms step:1449/1480 train_time:225774ms step_avg:156.90ms step:1450/1480 train_time:225939ms step_avg:156.90ms step:1451/1480 train_time:226104ms step_avg:156.91ms step:1452/1480 train_time:226268ms step_avg:156.91ms step:1453/1480 train_time:226432ms step_avg:156.92ms step:1454/1480 train_time:226594ms step_avg:156.92ms step:1455/1480 train_time:226763ms step_avg:156.93ms step:1456/1480 train_time:226927ms step_avg:156.93ms step:1457/1480 train_time:227089ms step_avg:156.94ms step:1458/1480 train_time:227251ms step_avg:156.94ms step:1459/1480 train_time:227417ms step_avg:156.95ms step:1460/1480 train_time:227581ms step_avg:156.95ms step:1461/1480 train_time:227745ms step_avg:156.96ms step:1462/1480 train_time:227908ms step_avg:156.96ms step:1463/1480 train_time:228072ms step_avg:156.97ms step:1464/1480 train_time:228236ms step_avg:156.97ms step:1465/1480 train_time:228401ms step_avg:156.98ms step:1466/1480 train_time:228565ms step_avg:156.98ms step:1467/1480 train_time:228729ms step_avg:156.99ms step:1468/1480 train_time:228891ms step_avg:156.99ms step:1469/1480 train_time:229054ms step_avg:156.99ms step:1470/1480 train_time:229222ms step_avg:157.00ms step:1471/1480 train_time:229394ms step_avg:157.01ms step:1472/1480 train_time:229567ms step_avg:157.02ms step:1473/1480 train_time:229729ms step_avg:157.03ms step:1474/1480 train_time:229894ms step_avg:157.03ms step:1475/1480 train_time:230064ms 
step_avg:157.04ms step:1476/1480 train_time:230228ms step_avg:157.05ms step:1477/1480 train_time:230397ms step_avg:157.05ms step:1478/1480 train_time:230567ms step_avg:157.06ms step:1479/1480 train_time:230731ms step_avg:157.07ms step:1480/1480 train_time:230893ms step_avg:157.07ms step:1480/1480 val_loss:3.2810 train_time:230970ms step_avg:157.12ms peak memory consumption: 34239 MiB