import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: a 2D tensor (asserted).
        steps: number of Newton-Schulz iterations to run.
        eps: added to the norm of G before dividing, to avoid division by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # work on the wide orientation so that X @ X.T is the smaller Gram matrix
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose applied above
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        # reads WORLD_SIZE and RANK from the environment; raises KeyError if they are not set
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct parameter size, so that every parameter in a group
        # fits the group's flat all_gather buffers (one buffer per rank)
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are processed in chunks of `world_size`: each rank
        orthogonalizes the update of the parameter at its own offset in the chunk, and the
        results are exchanged with an asynchronous all_gather. The gathered updates of a chunk
        are applied only after the next chunk's update has been computed locally.
        Asserts that the number of parameters in each group is a multiple of `world_size`
        and that the parameter handled by this rank has a gradient.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # waits for the pending all_gather (if any) and applies its gathered updates;
                # reads `handle` and `params_world` from the enclosing scope at call time
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # step size is scaled by sqrt(rows / cols) when rows > cols
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                # NOTE: the nesterov branch modifies p.grad in place
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                update_prev() # apply the previous chunk before its buffers are overwritten
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev() # apply the final chunk of this group

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learned scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding. The cos/sin tables are cached and only recomputed
    when the sequence length (x.shape[1]) changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as 4D with the sequence on dim 1 and the rotated features on dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention using FlexAttention with a caller-supplied block mask,
    QK-norm, rotary embeddings, and a learned mix of the values with a value embedding `vi`.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # mixing weights for (computed values, value embedding)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    Transformer block: a learned mix of the running activation `x` with the initial
    embedding `x0`, followed by residual attention and residual MLP (both pre-normed).
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # mixing weights for (x, x0); initialised so that x0 contributes nothing
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """
    Six token embedding tables used as per-layer value embeddings.

    forward() returns a list of 12 tensors: the six lookups followed by the same
    six in reverse order.
    """

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """
    GPT model with a U-net layout: the first half of the blocks ("encoder") store their
    outputs, which the second half ("decoder") add back in, last-stored first, scaled by
    learned skip weights. forward() returns the cross-entropy loss, not logits.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the training loss for one sequence.

        Arguments:
            inputs: 1D tensor of token ids (asserted); its length must be a multiple of 128.
            targets: token ids to predict, flattened before the loss.
            sliding_window_num_blocks: attention window size, in blocks of 128 tokens.

        Returns the scalar cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # the number of attention blocks follows from the sequence length instead of being
        # hard-coded, so the mask stays consistent with docs_low/docs_high for any length
        assert inputs.size(0) % BLOCK_SIZE == 0, "sequence length must be a multiple of BLOCK_SIZE"
        total_num_blocks = inputs.size(0) // BLOCK_SIZE
        # running count of separator tokens gives each position a document id;
        # 50256 is presumably GPT-2's end-of-text token id -- TODO confirm
        docs = (inputs == 50256).cumsum(0)
        # document id at the first / last position of every block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # token-level mask: attend only to earlier-or-equal positions of the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # converts a dense boolean block mask into (count of set blocks per row,
            # block indices sorted so that the set ones come first)
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            # block-level mask combining causality, the sliding window and document boundaries
            kv_idx = block_idx = torch.arange(total_num_blocks, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # partial blocks (nonzero but not full) still need mask_mod; full blocks do not
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a data shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """
    Read `num_tokens` uint16 tokens that follow the 256 * 4 byte header of `path`
    into a pinned-memory tensor and return it. Asserts that the expected number of
    bytes was read.
    """
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy()) # read straight into the tensor's memory
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over the data shards matching a glob pattern (relative to the current
    working directory). Each process reads its own `seq_len`-token slice of every
    `seq_len * num_processes`-token global batch, moving to the next shard (cyclically)
    when the current one cannot supply another full batch.
    """

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        # each process starts at its own offset within the shard
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. 
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 09:01:34 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 37C P0 125W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 115W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 28C P0 111W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 37C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 117W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 35C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 29C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:28891ms step_avg:nanms step:2/1480 train_time:28996ms step_avg:nanms step:3/1480 train_time:29120ms step_avg:nanms step:4/1480 train_time:29261ms step_avg:nanms step:5/1480 train_time:29403ms step_avg:nanms step:6/1480 train_time:29546ms step_avg:nanms step:7/1480 train_time:29687ms step_avg:nanms step:8/1480 train_time:29829ms step_avg:nanms step:9/1480 train_time:29975ms step_avg:nanms step:10/1480 train_time:30114ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:282ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.51ms step:14/1480 train_time:568ms step_avg:142.01ms step:15/1480 train_time:710ms step_avg:142.04ms step:16/1480 train_time:852ms step_avg:142.01ms step:17/1480 train_time:994ms step_avg:141.99ms step:18/1480 train_time:1136ms step_avg:142.02ms step:19/1480 train_time:1277ms step_avg:141.93ms step:20/1480 train_time:1420ms step_avg:141.97ms step:21/1480 train_time:1562ms step_avg:142.03ms step:22/1480 train_time:1706ms step_avg:142.19ms step:23/1480 train_time:1848ms step_avg:142.19ms step:24/1480 train_time:1991ms step_avg:142.25ms step:25/1480 train_time:2134ms step_avg:142.26ms step:26/1480 train_time:2277ms step_avg:142.34ms step:27/1480 train_time:2419ms step_avg:142.30ms step:28/1480 train_time:2563ms step_avg:142.40ms 
step:29/1480 train_time:2706ms step_avg:142.43ms step:30/1480 train_time:2848ms step_avg:142.42ms step:31/1480 train_time:2992ms step_avg:142.48ms step:32/1480 train_time:3134ms step_avg:142.47ms step:33/1480 train_time:3276ms step_avg:142.45ms step:34/1480 train_time:3419ms step_avg:142.47ms step:35/1480 train_time:3563ms step_avg:142.53ms step:36/1480 train_time:3708ms step_avg:142.63ms step:37/1480 train_time:3851ms step_avg:142.61ms step:38/1480 train_time:3994ms step_avg:142.63ms step:39/1480 train_time:4136ms step_avg:142.63ms step:40/1480 train_time:4278ms step_avg:142.60ms step:41/1480 train_time:4420ms step_avg:142.59ms step:42/1480 train_time:4565ms step_avg:142.65ms step:43/1480 train_time:4708ms step_avg:142.67ms step:44/1480 train_time:4850ms step_avg:142.66ms step:45/1480 train_time:4992ms step_avg:142.64ms step:46/1480 train_time:5135ms step_avg:142.64ms step:47/1480 train_time:5277ms step_avg:142.63ms step:48/1480 train_time:5419ms step_avg:142.61ms step:49/1480 train_time:5562ms step_avg:142.62ms step:50/1480 train_time:5707ms step_avg:142.67ms step:51/1480 train_time:5850ms step_avg:142.68ms step:52/1480 train_time:5992ms step_avg:142.66ms step:53/1480 train_time:6135ms step_avg:142.68ms step:54/1480 train_time:6276ms step_avg:142.64ms step:55/1480 train_time:6418ms step_avg:142.63ms step:56/1480 train_time:6560ms step_avg:142.61ms step:57/1480 train_time:6704ms step_avg:142.64ms step:58/1480 train_time:6846ms step_avg:142.62ms step:59/1480 train_time:6989ms step_avg:142.62ms step:60/1480 train_time:7131ms step_avg:142.63ms step:61/1480 train_time:7274ms step_avg:142.63ms step:62/1480 train_time:7416ms step_avg:142.61ms step:63/1480 train_time:7559ms step_avg:142.62ms step:64/1480 train_time:7701ms step_avg:142.60ms step:65/1480 train_time:7844ms step_avg:142.61ms step:66/1480 train_time:7988ms step_avg:142.64ms step:67/1480 train_time:8130ms step_avg:142.63ms step:68/1480 train_time:8273ms step_avg:142.64ms step:69/1480 train_time:8415ms 
step_avg:142.63ms step:70/1480 train_time:8558ms step_avg:142.63ms step:71/1480 train_time:8702ms step_avg:142.65ms step:72/1480 train_time:8844ms step_avg:142.65ms step:73/1480 train_time:8988ms step_avg:142.67ms step:74/1480 train_time:9131ms step_avg:142.68ms step:75/1480 train_time:9276ms step_avg:142.71ms step:76/1480 train_time:9417ms step_avg:142.68ms step:77/1480 train_time:9561ms step_avg:142.71ms step:78/1480 train_time:9705ms step_avg:142.73ms step:79/1480 train_time:9848ms step_avg:142.72ms step:80/1480 train_time:10359ms step_avg:147.98ms step:81/1480 train_time:10873ms step_avg:153.14ms step:82/1480 train_time:10971ms step_avg:152.38ms step:83/1480 train_time:11115ms step_avg:152.25ms step:84/1480 train_time:11255ms step_avg:152.09ms step:85/1480 train_time:11397ms step_avg:151.96ms step:86/1480 train_time:11540ms step_avg:151.84ms step:87/1480 train_time:11681ms step_avg:151.70ms step:88/1480 train_time:11826ms step_avg:151.62ms step:89/1480 train_time:11971ms step_avg:151.53ms step:90/1480 train_time:12113ms step_avg:151.41ms step:91/1480 train_time:12254ms step_avg:151.29ms step:92/1480 train_time:12396ms step_avg:151.18ms step:93/1480 train_time:12538ms step_avg:151.06ms step:94/1480 train_time:12681ms step_avg:150.96ms step:95/1480 train_time:12823ms step_avg:150.86ms step:96/1480 train_time:12968ms step_avg:150.79ms step:97/1480 train_time:13492ms step_avg:155.08ms step:98/1480 train_time:13592ms step_avg:154.46ms step:99/1480 train_time:13737ms step_avg:154.34ms step:100/1480 train_time:13878ms step_avg:154.20ms step:101/1480 train_time:14023ms step_avg:154.09ms step:102/1480 train_time:14164ms step_avg:153.95ms step:103/1480 train_time:14306ms step_avg:153.83ms step:104/1480 train_time:14449ms step_avg:153.71ms step:105/1480 train_time:14594ms step_avg:153.62ms step:106/1480 train_time:14738ms step_avg:153.52ms step:107/1480 train_time:14881ms step_avg:153.41ms step:108/1480 train_time:15023ms step_avg:153.30ms step:109/1480 train_time:15166ms 
step_avg:153.19ms step:110/1480 train_time:15308ms step_avg:153.08ms step:111/1480 train_time:15450ms step_avg:152.97ms step:112/1480 train_time:15595ms step_avg:152.90ms step:113/1480 train_time:15740ms step_avg:152.82ms step:114/1480 train_time:15887ms step_avg:152.76ms step:115/1480 train_time:16033ms step_avg:152.69ms step:116/1480 train_time:16178ms step_avg:152.63ms step:117/1480 train_time:16326ms step_avg:152.58ms step:118/1480 train_time:16472ms step_avg:152.52ms step:119/1480 train_time:16617ms step_avg:152.45ms step:120/1480 train_time:16762ms step_avg:152.38ms step:121/1480 train_time:16909ms step_avg:152.33ms step:122/1480 train_time:17053ms step_avg:152.26ms step:123/1480 train_time:17199ms step_avg:152.20ms step:124/1480 train_time:17345ms step_avg:152.15ms step:125/1480 train_time:17491ms step_avg:152.10ms step:125/1480 val_loss:4.4142 train_time:17555ms step_avg:152.65ms step:126/1480 train_time:17651ms step_avg:152.16ms step:127/1480 train_time:17793ms step_avg:152.08ms step:128/1480 train_time:17939ms step_avg:152.03ms step:129/1480 train_time:18084ms step_avg:151.96ms step:130/1480 train_time:18230ms step_avg:151.92ms step:131/1480 train_time:18374ms step_avg:151.85ms step:132/1480 train_time:18519ms step_avg:151.80ms step:133/1480 train_time:18666ms step_avg:151.76ms step:134/1480 train_time:18812ms step_avg:151.71ms step:135/1480 train_time:18958ms step_avg:151.66ms step:136/1480 train_time:19104ms step_avg:151.62ms step:137/1480 train_time:19251ms step_avg:151.58ms step:138/1480 train_time:19394ms step_avg:151.52ms step:139/1480 train_time:19539ms step_avg:151.47ms step:140/1480 train_time:19686ms step_avg:151.43ms step:141/1480 train_time:19832ms step_avg:151.39ms step:142/1480 train_time:19977ms step_avg:151.34ms step:143/1480 train_time:20124ms step_avg:151.31ms step:144/1480 train_time:20270ms step_avg:151.27ms step:145/1480 train_time:20415ms step_avg:151.22ms step:146/1480 train_time:20560ms step_avg:151.17ms step:147/1480 
train_time:20707ms step_avg:151.14ms step:148/1480 train_time:20852ms step_avg:151.10ms step:149/1480 train_time:20997ms step_avg:151.06ms step:150/1480 train_time:21143ms step_avg:151.02ms step:151/1480 train_time:21288ms step_avg:150.98ms step:152/1480 train_time:21434ms step_avg:150.95ms step:153/1480 train_time:21580ms step_avg:150.91ms step:154/1480 train_time:21727ms step_avg:150.88ms step:155/1480 train_time:21872ms step_avg:150.84ms step:156/1480 train_time:22017ms step_avg:150.80ms step:157/1480 train_time:22162ms step_avg:150.76ms step:158/1480 train_time:22308ms step_avg:150.73ms step:159/1480 train_time:22453ms step_avg:150.69ms step:160/1480 train_time:22598ms step_avg:150.65ms step:161/1480 train_time:22744ms step_avg:150.62ms step:162/1480 train_time:22890ms step_avg:150.59ms step:163/1480 train_time:23035ms step_avg:150.56ms step:164/1480 train_time:23182ms step_avg:150.53ms step:165/1480 train_time:23329ms step_avg:150.51ms step:166/1480 train_time:23474ms step_avg:150.47ms step:167/1480 train_time:23620ms step_avg:150.44ms step:168/1480 train_time:23765ms step_avg:150.41ms step:169/1480 train_time:23911ms step_avg:150.39ms step:170/1480 train_time:24055ms step_avg:150.35ms step:171/1480 train_time:24201ms step_avg:150.32ms step:172/1480 train_time:24348ms step_avg:150.30ms step:173/1480 train_time:24493ms step_avg:150.26ms step:174/1480 train_time:24638ms step_avg:150.23ms step:175/1480 train_time:24784ms step_avg:150.21ms step:176/1480 train_time:24930ms step_avg:150.18ms step:177/1480 train_time:25075ms step_avg:150.15ms step:178/1480 train_time:25221ms step_avg:150.12ms step:179/1480 train_time:25762ms step_avg:152.44ms step:180/1480 train_time:25864ms step_avg:152.14ms step:181/1480 train_time:26012ms step_avg:152.12ms step:182/1480 train_time:26157ms step_avg:152.07ms step:183/1480 train_time:26302ms step_avg:152.04ms step:184/1480 train_time:26448ms step_avg:152.00ms step:185/1480 train_time:26592ms step_avg:151.96ms step:186/1480 
train_time:26740ms step_avg:151.93ms step:187/1480 train_time:26888ms step_avg:151.91ms step:188/1480 train_time:27034ms step_avg:151.88ms step:189/1480 train_time:27202ms step_avg:151.96ms step:190/1480 train_time:27326ms step_avg:151.81ms step:191/1480 train_time:27472ms step_avg:151.78ms step:192/1480 train_time:27617ms step_avg:151.74ms step:193/1480 train_time:27763ms step_avg:151.71ms step:194/1480 train_time:27911ms step_avg:151.69ms step:195/1480 train_time:28056ms step_avg:151.65ms step:196/1480 train_time:28202ms step_avg:151.62ms step:197/1480 train_time:28349ms step_avg:151.60ms step:198/1480 train_time:28493ms step_avg:151.56ms step:199/1480 train_time:28638ms step_avg:151.52ms step:200/1480 train_time:28784ms step_avg:151.50ms step:201/1480 train_time:28931ms step_avg:151.47ms step:202/1480 train_time:29075ms step_avg:151.43ms step:203/1480 train_time:29223ms step_avg:151.41ms step:204/1480 train_time:29370ms step_avg:151.39ms step:205/1480 train_time:29514ms step_avg:151.36ms step:206/1480 train_time:29660ms step_avg:151.32ms step:207/1480 train_time:29806ms step_avg:151.30ms step:208/1480 train_time:29952ms step_avg:151.27ms step:209/1480 train_time:30096ms step_avg:151.24ms step:210/1480 train_time:30242ms step_avg:151.21ms step:211/1480 train_time:30388ms step_avg:151.18ms step:212/1480 train_time:30534ms step_avg:151.16ms step:213/1480 train_time:30680ms step_avg:151.13ms step:214/1480 train_time:30826ms step_avg:151.11ms step:215/1480 train_time:30971ms step_avg:151.08ms step:216/1480 train_time:31116ms step_avg:151.05ms step:217/1480 train_time:31261ms step_avg:151.02ms step:218/1480 train_time:31408ms step_avg:151.00ms step:219/1480 train_time:31553ms step_avg:150.97ms step:220/1480 train_time:31698ms step_avg:150.94ms step:221/1480 train_time:32232ms step_avg:152.76ms step:222/1480 train_time:32341ms step_avg:152.55ms step:223/1480 train_time:32488ms step_avg:152.53ms step:224/1480 train_time:32636ms step_avg:152.50ms step:225/1480 
train_time:32783ms step_avg:152.48ms step:226/1480 train_time:32932ms step_avg:152.46ms step:227/1480 train_time:33079ms step_avg:152.44ms step:228/1480 train_time:33230ms step_avg:152.43ms step:229/1480 train_time:33378ms step_avg:152.41ms step:230/1480 train_time:33526ms step_avg:152.39ms step:231/1480 train_time:33674ms step_avg:152.37ms step:232/1480 train_time:33821ms step_avg:152.35ms step:233/1480 train_time:33970ms step_avg:152.33ms step:234/1480 train_time:34118ms step_avg:152.31ms step:235/1480 train_time:34267ms step_avg:152.30ms step:236/1480 train_time:34415ms step_avg:152.28ms step:237/1480 train_time:34564ms step_avg:152.27ms step:238/1480 train_time:34713ms step_avg:152.25ms step:239/1480 train_time:34860ms step_avg:152.23ms step:240/1480 train_time:35009ms step_avg:152.21ms step:241/1480 train_time:35156ms step_avg:152.19ms step:242/1480 train_time:35304ms step_avg:152.17ms step:243/1480 train_time:35453ms step_avg:152.16ms step:244/1480 train_time:35600ms step_avg:152.14ms step:245/1480 train_time:35749ms step_avg:152.12ms step:246/1480 train_time:35896ms step_avg:152.10ms step:247/1480 train_time:36045ms step_avg:152.09ms step:248/1480 train_time:36193ms step_avg:152.07ms step:249/1480 train_time:36341ms step_avg:152.05ms step:250/1480 train_time:36490ms step_avg:152.04ms step:250/1480 val_loss:4.0036 train_time:36557ms step_avg:152.32ms step:251/1480 train_time:36648ms step_avg:152.07ms step:252/1480 train_time:36796ms step_avg:152.05ms step:253/1480 train_time:36945ms step_avg:152.04ms step:254/1480 train_time:37093ms step_avg:152.02ms step:255/1480 train_time:37241ms step_avg:152.00ms step:256/1480 train_time:37389ms step_avg:151.99ms step:257/1480 train_time:37536ms step_avg:151.97ms step:258/1480 train_time:37684ms step_avg:151.95ms step:259/1480 train_time:37834ms step_avg:151.94ms step:260/1480 train_time:37982ms step_avg:151.93ms step:261/1480 train_time:38132ms step_avg:151.92ms step:262/1480 train_time:38279ms step_avg:151.90ms 
step:263/1480 train_time:38427ms step_avg:151.89ms step:264/1480 train_time:38575ms step_avg:151.87ms step:265/1480 train_time:38723ms step_avg:151.85ms step:266/1480 train_time:38872ms step_avg:151.84ms step:267/1480 train_time:39021ms step_avg:151.83ms step:268/1480 train_time:39170ms step_avg:151.82ms step:269/1480 train_time:39317ms step_avg:151.80ms step:270/1480 train_time:39466ms step_avg:151.79ms step:271/1480 train_time:39615ms step_avg:151.78ms step:272/1480 train_time:39763ms step_avg:151.77ms step:273/1480 train_time:39911ms step_avg:151.75ms step:274/1480 train_time:40060ms step_avg:151.74ms step:275/1480 train_time:40209ms step_avg:151.73ms step:276/1480 train_time:40357ms step_avg:151.72ms step:277/1480 train_time:40506ms step_avg:151.71ms step:278/1480 train_time:40654ms step_avg:151.70ms step:279/1480 train_time:40802ms step_avg:151.68ms step:280/1480 train_time:40952ms step_avg:151.67ms step:281/1480 train_time:41100ms step_avg:151.66ms step:282/1480 train_time:41248ms step_avg:151.65ms step:283/1480 train_time:41395ms step_avg:151.63ms step:284/1480 train_time:41544ms step_avg:151.62ms step:285/1480 train_time:41693ms step_avg:151.61ms step:286/1480 train_time:41841ms step_avg:151.60ms step:287/1480 train_time:41990ms step_avg:151.59ms step:288/1480 train_time:42138ms step_avg:151.57ms step:289/1480 train_time:42286ms step_avg:151.56ms step:290/1480 train_time:42435ms step_avg:151.55ms step:291/1480 train_time:42583ms step_avg:151.54ms step:292/1480 train_time:42732ms step_avg:151.53ms step:293/1480 train_time:42879ms step_avg:151.52ms step:294/1480 train_time:43028ms step_avg:151.51ms step:295/1480 train_time:43176ms step_avg:151.50ms step:296/1480 train_time:43325ms step_avg:151.48ms step:297/1480 train_time:43474ms step_avg:151.48ms step:298/1480 train_time:43620ms step_avg:151.46ms step:299/1480 train_time:43770ms step_avg:151.45ms step:300/1480 train_time:43918ms step_avg:151.44ms step:301/1480 train_time:44068ms step_avg:151.44ms 
step:302/1480 train_time:44215ms step_avg:151.42ms step:303/1480 train_time:44365ms step_avg:151.41ms step:304/1480 train_time:44515ms step_avg:151.41ms step:305/1480 train_time:44663ms step_avg:151.40ms step:306/1480 train_time:44810ms step_avg:151.39ms step:307/1480 train_time:44959ms step_avg:151.38ms step:308/1480 train_time:45108ms step_avg:151.37ms step:309/1480 train_time:45256ms step_avg:151.36ms step:310/1480 train_time:45404ms step_avg:151.35ms step:311/1480 train_time:45553ms step_avg:151.34ms step:312/1480 train_time:45702ms step_avg:151.33ms step:313/1480 train_time:45851ms step_avg:151.32ms step:314/1480 train_time:45999ms step_avg:151.31ms step:315/1480 train_time:46147ms step_avg:151.30ms step:316/1480 train_time:46296ms step_avg:151.29ms step:317/1480 train_time:46444ms step_avg:151.28ms step:318/1480 train_time:46593ms step_avg:151.28ms step:319/1480 train_time:46741ms step_avg:151.27ms step:320/1480 train_time:46891ms step_avg:151.26ms step:321/1480 train_time:47039ms step_avg:151.25ms step:322/1480 train_time:47188ms step_avg:151.24ms step:323/1480 train_time:47336ms step_avg:151.23ms step:324/1480 train_time:47485ms step_avg:151.23ms step:325/1480 train_time:47633ms step_avg:151.22ms step:326/1480 train_time:47781ms step_avg:151.21ms step:327/1480 train_time:47930ms step_avg:151.20ms step:328/1480 train_time:48078ms step_avg:151.19ms step:329/1480 train_time:48228ms step_avg:151.18ms step:330/1480 train_time:48377ms step_avg:151.18ms step:331/1480 train_time:48528ms step_avg:151.18ms step:332/1480 train_time:48678ms step_avg:151.17ms step:333/1480 train_time:48829ms step_avg:151.17ms step:334/1480 train_time:48978ms step_avg:151.17ms step:335/1480 train_time:49129ms step_avg:151.17ms step:336/1480 train_time:49279ms step_avg:151.16ms step:337/1480 train_time:49431ms step_avg:151.17ms step:338/1480 train_time:49580ms step_avg:151.16ms step:339/1480 train_time:49732ms step_avg:151.16ms step:340/1480 train_time:49881ms step_avg:151.16ms 
step:341/1480 train_time:50032ms step_avg:151.16ms step:342/1480 train_time:50182ms step_avg:151.15ms step:343/1480 train_time:50333ms step_avg:151.15ms step:344/1480 train_time:50483ms step_avg:151.15ms step:345/1480 train_time:50635ms step_avg:151.15ms step:346/1480 train_time:50784ms step_avg:151.14ms step:347/1480 train_time:50936ms step_avg:151.14ms step:348/1480 train_time:51087ms step_avg:151.15ms step:349/1480 train_time:51237ms step_avg:151.14ms step:350/1480 train_time:51388ms step_avg:151.14ms step:351/1480 train_time:51538ms step_avg:151.14ms step:352/1480 train_time:51690ms step_avg:151.14ms step:353/1480 train_time:51840ms step_avg:151.14ms step:354/1480 train_time:51992ms step_avg:151.14ms step:355/1480 train_time:52142ms step_avg:151.14ms step:356/1480 train_time:52294ms step_avg:151.14ms step:357/1480 train_time:52444ms step_avg:151.14ms step:358/1480 train_time:52595ms step_avg:151.14ms step:359/1480 train_time:52746ms step_avg:151.13ms step:360/1480 train_time:52897ms step_avg:151.14ms step:361/1480 train_time:53049ms step_avg:151.14ms step:362/1480 train_time:53199ms step_avg:151.13ms step:363/1480 train_time:53350ms step_avg:151.13ms step:364/1480 train_time:53499ms step_avg:151.13ms step:365/1480 train_time:53650ms step_avg:151.13ms step:366/1480 train_time:53801ms step_avg:151.13ms step:367/1480 train_time:53952ms step_avg:151.13ms step:368/1480 train_time:54103ms step_avg:151.13ms step:369/1480 train_time:54255ms step_avg:151.13ms step:370/1480 train_time:54405ms step_avg:151.12ms step:371/1480 train_time:54555ms step_avg:151.12ms step:372/1480 train_time:54706ms step_avg:151.12ms step:373/1480 train_time:54857ms step_avg:151.12ms step:374/1480 train_time:55008ms step_avg:151.12ms step:375/1480 train_time:55157ms step_avg:151.12ms step:375/1480 val_loss:3.8158 train_time:55225ms step_avg:151.30ms step:376/1480 train_time:55319ms step_avg:151.15ms step:377/1480 train_time:55465ms step_avg:151.13ms step:378/1480 train_time:55616ms 
step_avg:151.13ms step:379/1480 train_time:55786ms step_avg:151.18ms step:380/1480 train_time:55917ms step_avg:151.13ms step:381/1480 train_time:56068ms step_avg:151.13ms step:382/1480 train_time:56218ms step_avg:151.12ms step:383/1480 train_time:56370ms step_avg:151.13ms step:384/1480 train_time:56520ms step_avg:151.12ms step:385/1480 train_time:56672ms step_avg:151.12ms step:386/1480 train_time:56821ms step_avg:151.12ms step:387/1480 train_time:56973ms step_avg:151.12ms step:388/1480 train_time:57123ms step_avg:151.12ms step:389/1480 train_time:57274ms step_avg:151.12ms step:390/1480 train_time:57424ms step_avg:151.11ms step:391/1480 train_time:57575ms step_avg:151.12ms step:392/1480 train_time:57726ms step_avg:151.11ms step:393/1480 train_time:57877ms step_avg:151.12ms step:394/1480 train_time:58027ms step_avg:151.11ms step:395/1480 train_time:58177ms step_avg:151.11ms step:396/1480 train_time:58329ms step_avg:151.11ms step:397/1480 train_time:58478ms step_avg:151.11ms step:398/1480 train_time:58630ms step_avg:151.11ms step:399/1480 train_time:58780ms step_avg:151.11ms step:400/1480 train_time:58932ms step_avg:151.11ms step:401/1480 train_time:59082ms step_avg:151.10ms step:402/1480 train_time:59233ms step_avg:151.11ms step:403/1480 train_time:59383ms step_avg:151.10ms step:404/1480 train_time:59535ms step_avg:151.10ms step:405/1480 train_time:59684ms step_avg:151.10ms step:406/1480 train_time:59835ms step_avg:151.10ms step:407/1480 train_time:59986ms step_avg:151.10ms step:408/1480 train_time:60137ms step_avg:151.10ms step:409/1480 train_time:60288ms step_avg:151.10ms step:410/1480 train_time:60438ms step_avg:151.09ms step:411/1480 train_time:60589ms step_avg:151.09ms step:412/1480 train_time:60740ms step_avg:151.09ms step:413/1480 train_time:60890ms step_avg:151.09ms step:414/1480 train_time:61040ms step_avg:151.09ms step:415/1480 train_time:61193ms step_avg:151.09ms step:416/1480 train_time:61343ms step_avg:151.09ms step:417/1480 train_time:61494ms 
step_avg:151.09ms step:418/1480 train_time:61645ms step_avg:151.09ms step:419/1480 train_time:61796ms step_avg:151.09ms step:420/1480 train_time:61947ms step_avg:151.09ms step:421/1480 train_time:62098ms step_avg:151.09ms step:422/1480 train_time:62249ms step_avg:151.09ms step:423/1480 train_time:62399ms step_avg:151.09ms step:424/1480 train_time:62550ms step_avg:151.09ms step:425/1480 train_time:62700ms step_avg:151.08ms step:426/1480 train_time:62852ms step_avg:151.09ms step:427/1480 train_time:63001ms step_avg:151.08ms step:428/1480 train_time:63153ms step_avg:151.08ms step:429/1480 train_time:63304ms step_avg:151.08ms step:430/1480 train_time:63455ms step_avg:151.08ms step:431/1480 train_time:63605ms step_avg:151.08ms step:432/1480 train_time:63756ms step_avg:151.08ms step:433/1480 train_time:63907ms step_avg:151.08ms step:434/1480 train_time:64057ms step_avg:151.08ms step:435/1480 train_time:64209ms step_avg:151.08ms step:436/1480 train_time:64359ms step_avg:151.08ms step:437/1480 train_time:64510ms step_avg:151.08ms step:438/1480 train_time:64660ms step_avg:151.08ms step:439/1480 train_time:64811ms step_avg:151.07ms step:440/1480 train_time:64963ms step_avg:151.08ms step:441/1480 train_time:65116ms step_avg:151.08ms step:442/1480 train_time:65269ms step_avg:151.08ms step:443/1480 train_time:65422ms step_avg:151.09ms step:444/1480 train_time:65574ms step_avg:151.09ms step:445/1480 train_time:65727ms step_avg:151.10ms step:446/1480 train_time:65879ms step_avg:151.10ms step:447/1480 train_time:66031ms step_avg:151.10ms step:448/1480 train_time:66184ms step_avg:151.10ms step:449/1480 train_time:66337ms step_avg:151.11ms step:450/1480 train_time:66491ms step_avg:151.12ms step:451/1480 train_time:66645ms step_avg:151.12ms step:452/1480 train_time:66798ms step_avg:151.13ms step:453/1480 train_time:66951ms step_avg:151.13ms step:454/1480 train_time:67103ms step_avg:151.13ms step:455/1480 train_time:67256ms step_avg:151.14ms step:456/1480 train_time:67409ms 
step_avg:151.14ms step:457/1480 train_time:67560ms step_avg:151.14ms step:458/1480 train_time:67713ms step_avg:151.14ms step:459/1480 train_time:67867ms step_avg:151.15ms step:460/1480 train_time:68020ms step_avg:151.16ms step:461/1480 train_time:68174ms step_avg:151.16ms step:462/1480 train_time:68326ms step_avg:151.16ms step:463/1480 train_time:68479ms step_avg:151.17ms step:464/1480 train_time:68633ms step_avg:151.17ms step:465/1480 train_time:68786ms step_avg:151.18ms step:466/1480 train_time:68938ms step_avg:151.18ms step:467/1480 train_time:69091ms step_avg:151.18ms step:468/1480 train_time:69243ms step_avg:151.19ms step:469/1480 train_time:69396ms step_avg:151.19ms step:470/1480 train_time:69549ms step_avg:151.19ms step:471/1480 train_time:69701ms step_avg:151.19ms step:472/1480 train_time:69854ms step_avg:151.20ms step:473/1480 train_time:70007ms step_avg:151.20ms step:474/1480 train_time:70159ms step_avg:151.21ms step:475/1480 train_time:70312ms step_avg:151.21ms step:476/1480 train_time:70466ms step_avg:151.22ms step:477/1480 train_time:70620ms step_avg:151.22ms step:478/1480 train_time:70773ms step_avg:151.22ms step:479/1480 train_time:70926ms step_avg:151.23ms step:480/1480 train_time:71078ms step_avg:151.23ms step:481/1480 train_time:71231ms step_avg:151.23ms step:482/1480 train_time:71383ms step_avg:151.24ms step:483/1480 train_time:71536ms step_avg:151.24ms step:484/1480 train_time:71689ms step_avg:151.24ms step:485/1480 train_time:71842ms step_avg:151.25ms step:486/1480 train_time:71996ms step_avg:151.25ms step:487/1480 train_time:72149ms step_avg:151.25ms step:488/1480 train_time:72301ms step_avg:151.26ms step:489/1480 train_time:72454ms step_avg:151.26ms step:490/1480 train_time:72607ms step_avg:151.26ms step:491/1480 train_time:72759ms step_avg:151.27ms step:492/1480 train_time:72912ms step_avg:151.27ms step:493/1480 train_time:73065ms step_avg:151.27ms step:494/1480 train_time:73218ms step_avg:151.28ms step:495/1480 train_time:73372ms 
step_avg:151.28ms step:496/1480 train_time:73525ms step_avg:151.29ms step:497/1480 train_time:73678ms step_avg:151.29ms step:498/1480 train_time:73830ms step_avg:151.29ms step:499/1480 train_time:73982ms step_avg:151.29ms step:500/1480 train_time:74135ms step_avg:151.30ms step:500/1480 val_loss:3.6928 train_time:74204ms step_avg:151.44ms step:501/1480 train_time:74295ms step_avg:151.31ms step:502/1480 train_time:74448ms step_avg:151.32ms step:503/1480 train_time:74601ms step_avg:151.32ms step:504/1480 train_time:74753ms step_avg:151.32ms step:505/1480 train_time:74905ms step_avg:151.32ms step:506/1480 train_time:75059ms step_avg:151.33ms step:507/1480 train_time:75212ms step_avg:151.33ms step:508/1480 train_time:75366ms step_avg:151.34ms step:509/1480 train_time:75520ms step_avg:151.34ms step:510/1480 train_time:75673ms step_avg:151.35ms step:511/1480 train_time:75827ms step_avg:151.35ms step:512/1480 train_time:75980ms step_avg:151.35ms step:513/1480 train_time:76131ms step_avg:151.35ms step:514/1480 train_time:76284ms step_avg:151.36ms step:515/1480 train_time:76437ms step_avg:151.36ms step:516/1480 train_time:76590ms step_avg:151.36ms step:517/1480 train_time:76744ms step_avg:151.37ms step:518/1480 train_time:76896ms step_avg:151.37ms step:519/1480 train_time:77049ms step_avg:151.37ms step:520/1480 train_time:77202ms step_avg:151.38ms step:521/1480 train_time:77355ms step_avg:151.38ms step:522/1480 train_time:77508ms step_avg:151.38ms step:523/1480 train_time:77662ms step_avg:151.39ms step:524/1480 train_time:77815ms step_avg:151.39ms step:525/1480 train_time:77969ms step_avg:151.40ms step:526/1480 train_time:78122ms step_avg:151.40ms step:527/1480 train_time:78274ms step_avg:151.40ms step:528/1480 train_time:78427ms step_avg:151.40ms step:529/1480 train_time:78580ms step_avg:151.41ms step:530/1480 train_time:78732ms step_avg:151.41ms step:531/1480 train_time:78887ms step_avg:151.41ms step:532/1480 train_time:79040ms step_avg:151.42ms step:533/1480 
train_time:79192ms step_avg:151.42ms step:534/1480 train_time:79345ms step_avg:151.42ms step:535/1480 train_time:79497ms step_avg:151.42ms step:536/1480 train_time:79652ms step_avg:151.43ms step:537/1480 train_time:79804ms step_avg:151.43ms step:538/1480 train_time:79959ms step_avg:151.44ms step:539/1480 train_time:80113ms step_avg:151.44ms step:540/1480 train_time:80266ms step_avg:151.45ms step:541/1480 train_time:80418ms step_avg:151.45ms step:542/1480 train_time:80571ms step_avg:151.45ms step:543/1480 train_time:80725ms step_avg:151.45ms step:544/1480 train_time:80876ms step_avg:151.45ms step:545/1480 train_time:81029ms step_avg:151.46ms step:546/1480 train_time:81181ms step_avg:151.46ms step:547/1480 train_time:81335ms step_avg:151.46ms step:548/1480 train_time:81488ms step_avg:151.46ms step:549/1480 train_time:81642ms step_avg:151.47ms step:550/1480 train_time:81795ms step_avg:151.47ms step:551/1480 train_time:81949ms step_avg:151.48ms step:552/1480 train_time:82104ms step_avg:151.48ms step:553/1480 train_time:82260ms step_avg:151.49ms step:554/1480 train_time:82415ms step_avg:151.50ms step:555/1480 train_time:82569ms step_avg:151.50ms step:556/1480 train_time:82723ms step_avg:151.51ms step:557/1480 train_time:82878ms step_avg:151.51ms step:558/1480 train_time:83033ms step_avg:151.52ms step:559/1480 train_time:83187ms step_avg:151.52ms step:560/1480 train_time:83342ms step_avg:151.53ms step:561/1480 train_time:83497ms step_avg:151.54ms step:562/1480 train_time:83652ms step_avg:151.54ms step:563/1480 train_time:83807ms step_avg:151.55ms step:564/1480 train_time:83964ms step_avg:151.56ms step:565/1480 train_time:84119ms step_avg:151.57ms step:566/1480 train_time:84274ms step_avg:151.57ms step:567/1480 train_time:84428ms step_avg:151.58ms step:568/1480 train_time:84584ms step_avg:151.58ms step:569/1480 train_time:84757ms step_avg:151.62ms step:570/1480 train_time:84894ms step_avg:151.60ms step:571/1480 train_time:85048ms step_avg:151.60ms step:572/1480 
train_time:85204ms step_avg:151.61ms step:573/1480 train_time:85359ms step_avg:151.61ms step:574/1480 train_time:85513ms step_avg:151.62ms step:575/1480 train_time:85669ms step_avg:151.63ms step:576/1480 train_time:85823ms step_avg:151.63ms step:577/1480 train_time:85977ms step_avg:151.64ms step:578/1480 train_time:86132ms step_avg:151.64ms step:579/1480 train_time:86286ms step_avg:151.65ms step:580/1480 train_time:86442ms step_avg:151.65ms step:581/1480 train_time:86595ms step_avg:151.66ms step:582/1480 train_time:86750ms step_avg:151.66ms step:583/1480 train_time:86904ms step_avg:151.67ms step:584/1480 train_time:87060ms step_avg:151.67ms step:585/1480 train_time:87214ms step_avg:151.68ms step:586/1480 train_time:87370ms step_avg:151.68ms step:587/1480 train_time:87524ms step_avg:151.69ms step:588/1480 train_time:87679ms step_avg:151.69ms step:589/1480 train_time:87832ms step_avg:151.70ms step:590/1480 train_time:87987ms step_avg:151.70ms step:591/1480 train_time:88142ms step_avg:151.71ms step:592/1480 train_time:88298ms step_avg:151.71ms step:593/1480 train_time:88452ms step_avg:151.72ms step:594/1480 train_time:88606ms step_avg:151.72ms step:595/1480 train_time:88762ms step_avg:151.73ms step:596/1480 train_time:88919ms step_avg:151.74ms step:597/1480 train_time:89074ms step_avg:151.74ms step:598/1480 train_time:89228ms step_avg:151.75ms step:599/1480 train_time:89381ms step_avg:151.75ms step:600/1480 train_time:89535ms step_avg:151.75ms step:601/1480 train_time:89690ms step_avg:151.76ms step:602/1480 train_time:89845ms step_avg:151.77ms step:603/1480 train_time:89999ms step_avg:151.77ms step:604/1480 train_time:90154ms step_avg:151.77ms step:605/1480 train_time:90308ms step_avg:151.78ms step:606/1480 train_time:90465ms step_avg:151.79ms step:607/1480 train_time:90620ms step_avg:151.79ms step:608/1480 train_time:90775ms step_avg:151.80ms step:609/1480 train_time:90931ms step_avg:151.80ms step:610/1480 train_time:91086ms step_avg:151.81ms step:611/1480 
train_time:91241ms step_avg:151.81ms step:612/1480 train_time:91395ms step_avg:151.82ms step:613/1480 train_time:91550ms step_avg:151.82ms step:614/1480 train_time:91704ms step_avg:151.83ms step:615/1480 train_time:91859ms step_avg:151.83ms step:616/1480 train_time:92015ms step_avg:151.84ms step:617/1480 train_time:92171ms step_avg:151.85ms step:618/1480 train_time:92325ms step_avg:151.85ms step:619/1480 train_time:92479ms step_avg:151.85ms step:620/1480 train_time:92633ms step_avg:151.86ms step:621/1480 train_time:92787ms step_avg:151.86ms step:622/1480 train_time:92942ms step_avg:151.87ms step:623/1480 train_time:93098ms step_avg:151.87ms step:624/1480 train_time:93253ms step_avg:151.88ms step:625/1480 train_time:93406ms step_avg:151.88ms step:625/1480 val_loss:3.6121 train_time:93477ms step_avg:152.00ms step:626/1480 train_time:93571ms step_avg:151.90ms step:627/1480 train_time:93724ms step_avg:151.90ms step:628/1480 train_time:93879ms step_avg:151.91ms step:629/1480 train_time:94033ms step_avg:151.91ms step:630/1480 train_time:94186ms step_avg:151.91ms step:631/1480 train_time:94341ms step_avg:151.92ms step:632/1480 train_time:94495ms step_avg:151.92ms step:633/1480 train_time:94651ms step_avg:151.93ms step:634/1480 train_time:94805ms step_avg:151.93ms step:635/1480 train_time:94959ms step_avg:151.93ms step:636/1480 train_time:95113ms step_avg:151.94ms step:637/1480 train_time:95268ms step_avg:151.94ms step:638/1480 train_time:95423ms step_avg:151.95ms step:639/1480 train_time:95576ms step_avg:151.95ms step:640/1480 train_time:95731ms step_avg:151.95ms step:641/1480 train_time:95885ms step_avg:151.96ms step:642/1480 train_time:96041ms step_avg:151.96ms step:643/1480 train_time:96196ms step_avg:151.97ms step:644/1480 train_time:96349ms step_avg:151.97ms step:645/1480 train_time:96505ms step_avg:151.98ms step:646/1480 train_time:96659ms step_avg:151.98ms step:647/1480 train_time:96814ms step_avg:151.98ms step:648/1480 train_time:96970ms step_avg:151.99ms 
step:649/1480 train_time:97125ms step_avg:152.00ms step:650/1480 train_time:97280ms step_avg:152.00ms step:651/1480 train_time:97436ms step_avg:152.01ms step:652/1480 train_time:97591ms step_avg:152.01ms step:653/1480 train_time:97746ms step_avg:152.02ms step:654/1480 train_time:97900ms step_avg:152.02ms step:655/1480 train_time:98054ms step_avg:152.02ms step:656/1480 train_time:98209ms step_avg:152.03ms step:657/1480 train_time:98363ms step_avg:152.03ms step:658/1480 train_time:98518ms step_avg:152.03ms step:659/1480 train_time:98674ms step_avg:152.04ms step:660/1480 train_time:98830ms step_avg:152.05ms step:661/1480 train_time:98988ms step_avg:152.06ms step:662/1480 train_time:99144ms step_avg:152.06ms step:663/1480 train_time:99299ms step_avg:152.07ms step:664/1480 train_time:99454ms step_avg:152.07ms step:665/1480 train_time:99610ms step_avg:152.08ms step:666/1480 train_time:99767ms step_avg:152.08ms step:667/1480 train_time:99924ms step_avg:152.09ms step:668/1480 train_time:100081ms step_avg:152.10ms step:669/1480 train_time:100239ms step_avg:152.11ms step:670/1480 train_time:100394ms step_avg:152.11ms step:671/1480 train_time:100550ms step_avg:152.12ms step:672/1480 train_time:100707ms step_avg:152.13ms step:673/1480 train_time:100863ms step_avg:152.13ms step:674/1480 train_time:101021ms step_avg:152.14ms step:675/1480 train_time:101179ms step_avg:152.15ms step:676/1480 train_time:101337ms step_avg:152.16ms step:677/1480 train_time:101492ms step_avg:152.16ms step:678/1480 train_time:101649ms step_avg:152.17ms step:679/1480 train_time:101805ms step_avg:152.18ms step:680/1480 train_time:101961ms step_avg:152.18ms step:681/1480 train_time:102118ms step_avg:152.19ms step:682/1480 train_time:102274ms step_avg:152.19ms step:683/1480 train_time:102431ms step_avg:152.20ms step:684/1480 train_time:102587ms step_avg:152.21ms step:685/1480 train_time:102744ms step_avg:152.21ms step:686/1480 train_time:102900ms step_avg:152.22ms step:687/1480 train_time:103056ms 
step_avg:152.22ms step:688/1480 train_time:103213ms step_avg:152.23ms step:689/1480 train_time:103370ms step_avg:152.24ms step:690/1480 train_time:103527ms step_avg:152.25ms step:691/1480 train_time:103683ms step_avg:152.25ms step:692/1480 train_time:103839ms step_avg:152.26ms step:693/1480 train_time:103995ms step_avg:152.26ms step:694/1480 train_time:104151ms step_avg:152.27ms step:695/1480 train_time:104306ms step_avg:152.27ms step:696/1480 train_time:104464ms step_avg:152.28ms step:697/1480 train_time:104621ms step_avg:152.29ms step:698/1480 train_time:104777ms step_avg:152.29ms step:699/1480 train_time:104933ms step_avg:152.30ms step:700/1480 train_time:105088ms step_avg:152.30ms step:701/1480 train_time:105245ms step_avg:152.31ms step:702/1480 train_time:105403ms step_avg:152.32ms step:703/1480 train_time:105559ms step_avg:152.32ms step:704/1480 train_time:105714ms step_avg:152.33ms step:705/1480 train_time:105871ms step_avg:152.33ms step:706/1480 train_time:106028ms step_avg:152.34ms step:707/1480 train_time:106184ms step_avg:152.34ms step:708/1480 train_time:106340ms step_avg:152.35ms step:709/1480 train_time:106495ms step_avg:152.35ms step:710/1480 train_time:106650ms step_avg:152.36ms step:711/1480 train_time:106806ms step_avg:152.36ms step:712/1480 train_time:106963ms step_avg:152.37ms step:713/1480 train_time:107120ms step_avg:152.38ms step:714/1480 train_time:107278ms step_avg:152.38ms step:715/1480 train_time:107433ms step_avg:152.39ms step:716/1480 train_time:107588ms step_avg:152.39ms step:717/1480 train_time:107745ms step_avg:152.40ms step:718/1480 train_time:107901ms step_avg:152.40ms step:719/1480 train_time:108057ms step_avg:152.41ms step:720/1480 train_time:108213ms step_avg:152.41ms step:721/1480 train_time:108370ms step_avg:152.42ms step:722/1480 train_time:108528ms step_avg:152.43ms step:723/1480 train_time:108684ms step_avg:152.43ms step:724/1480 train_time:108840ms step_avg:152.44ms step:725/1480 train_time:108996ms step_avg:152.44ms 
step:726/1480 train_time:109153ms step_avg:152.45ms step:727/1480 train_time:109311ms step_avg:152.46ms step:728/1480 train_time:109467ms step_avg:152.46ms step:729/1480 train_time:109624ms step_avg:152.47ms step:730/1480 train_time:109781ms step_avg:152.47ms step:731/1480 train_time:109938ms step_avg:152.48ms step:732/1480 train_time:110093ms step_avg:152.48ms step:733/1480 train_time:110249ms step_avg:152.49ms step:734/1480 train_time:110406ms step_avg:152.49ms step:735/1480 train_time:110561ms step_avg:152.50ms step:736/1480 train_time:110717ms step_avg:152.50ms step:737/1480 train_time:110873ms step_avg:152.51ms step:738/1480 train_time:111029ms step_avg:152.51ms step:739/1480 train_time:111186ms step_avg:152.52ms step:740/1480 train_time:111346ms step_avg:152.53ms step:741/1480 train_time:111504ms step_avg:152.54ms step:742/1480 train_time:111659ms step_avg:152.54ms step:743/1480 train_time:111815ms step_avg:152.54ms step:744/1480 train_time:111971ms step_avg:152.55ms step:745/1480 train_time:112129ms step_avg:152.56ms step:746/1480 train_time:112284ms step_avg:152.56ms step:747/1480 train_time:112441ms step_avg:152.57ms step:748/1480 train_time:112601ms step_avg:152.58ms step:749/1480 train_time:112757ms step_avg:152.58ms step:750/1480 train_time:112912ms step_avg:152.58ms step:750/1480 val_loss:3.5555 train_time:112985ms step_avg:152.68ms step:751/1480 train_time:113078ms step_avg:152.60ms step:752/1480 train_time:113230ms step_avg:152.60ms step:753/1480 train_time:113387ms step_avg:152.61ms step:754/1480 train_time:113543ms step_avg:152.61ms step:755/1480 train_time:113699ms step_avg:152.62ms step:756/1480 train_time:113855ms step_avg:152.62ms step:757/1480 train_time:114012ms step_avg:152.63ms step:758/1480 train_time:114168ms step_avg:152.63ms step:759/1480 train_time:114340ms step_avg:152.66ms step:760/1480 train_time:114483ms step_avg:152.64ms step:761/1480 train_time:114639ms step_avg:152.65ms step:762/1480 train_time:114797ms step_avg:152.66ms 
step:763/1480 train_time:114954ms step_avg:152.66ms step:764/1480 train_time:115111ms step_avg:152.67ms step:765/1480 train_time:115269ms step_avg:152.67ms step:766/1480 train_time:115425ms step_avg:152.68ms step:767/1480 train_time:115582ms step_avg:152.68ms step:768/1480 train_time:115737ms step_avg:152.69ms step:769/1480 train_time:115896ms step_avg:152.70ms step:770/1480 train_time:116053ms step_avg:152.70ms step:771/1480 train_time:116210ms step_avg:152.71ms step:772/1480 train_time:116367ms step_avg:152.71ms step:773/1480 train_time:116524ms step_avg:152.72ms step:774/1480 train_time:116682ms step_avg:152.72ms step:775/1480 train_time:116838ms step_avg:152.73ms step:776/1480 train_time:116997ms step_avg:152.74ms step:777/1480 train_time:117157ms step_avg:152.75ms step:778/1480 train_time:117315ms step_avg:152.75ms step:779/1480 train_time:117473ms step_avg:152.76ms step:780/1480 train_time:117631ms step_avg:152.77ms step:781/1480 train_time:117788ms step_avg:152.77ms step:782/1480 train_time:117945ms step_avg:152.78ms step:783/1480 train_time:118102ms step_avg:152.78ms step:784/1480 train_time:118259ms step_avg:152.79ms step:785/1480 train_time:118417ms step_avg:152.80ms step:786/1480 train_time:118575ms step_avg:152.80ms step:787/1480 train_time:118733ms step_avg:152.81ms step:788/1480 train_time:118892ms step_avg:152.82ms step:789/1480 train_time:119049ms step_avg:152.82ms step:790/1480 train_time:119208ms step_avg:152.83ms step:791/1480 train_time:119366ms step_avg:152.84ms step:792/1480 train_time:119524ms step_avg:152.84ms step:793/1480 train_time:119681ms step_avg:152.85ms step:794/1480 train_time:119839ms step_avg:152.86ms step:795/1480 train_time:119998ms step_avg:152.86ms step:796/1480 train_time:120157ms step_avg:152.87ms step:797/1480 train_time:120316ms step_avg:152.88ms step:798/1480 train_time:120475ms step_avg:152.89ms step:799/1480 train_time:120635ms step_avg:152.90ms step:800/1480 train_time:120794ms step_avg:152.90ms step:801/1480 
train_time:120952ms step_avg:152.91ms step:802/1480 train_time:121113ms step_avg:152.92ms step:803/1480 train_time:121272ms step_avg:152.93ms step:804/1480 train_time:121428ms step_avg:152.93ms step:805/1480 train_time:121587ms step_avg:152.94ms step:806/1480 train_time:121743ms step_avg:152.94ms step:807/1480 train_time:121901ms step_avg:152.95ms step:808/1480 train_time:122058ms step_avg:152.95ms step:809/1480 train_time:122216ms step_avg:152.96ms step:810/1480 train_time:122373ms step_avg:152.97ms step:811/1480 train_time:122530ms step_avg:152.97ms step:812/1480 train_time:122686ms step_avg:152.98ms step:813/1480 train_time:122844ms step_avg:152.98ms step:814/1480 train_time:123001ms step_avg:152.99ms step:815/1480 train_time:123157ms step_avg:152.99ms step:816/1480 train_time:123317ms step_avg:153.00ms step:817/1480 train_time:123473ms step_avg:153.00ms step:818/1480 train_time:123631ms step_avg:153.01ms step:819/1480 train_time:123790ms step_avg:153.02ms step:820/1480 train_time:123950ms step_avg:153.02ms step:821/1480 train_time:124107ms step_avg:153.03ms step:822/1480 train_time:124265ms step_avg:153.04ms step:823/1480 train_time:124423ms step_avg:153.04ms step:824/1480 train_time:124579ms step_avg:153.05ms step:825/1480 train_time:124739ms step_avg:153.05ms step:826/1480 train_time:124898ms step_avg:153.06ms step:827/1480 train_time:125056ms step_avg:153.07ms step:828/1480 train_time:125214ms step_avg:153.07ms step:829/1480 train_time:125374ms step_avg:153.08ms step:830/1480 train_time:125533ms step_avg:153.09ms step:831/1480 train_time:125691ms step_avg:153.10ms step:832/1480 train_time:125851ms step_avg:153.10ms step:833/1480 train_time:126009ms step_avg:153.11ms step:834/1480 train_time:126166ms step_avg:153.11ms step:835/1480 train_time:126323ms step_avg:153.12ms step:836/1480 train_time:126482ms step_avg:153.13ms step:837/1480 train_time:126640ms step_avg:153.13ms step:838/1480 train_time:126798ms step_avg:153.14ms step:839/1480 train_time:126955ms 
step_avg:153.14ms step:840/1480 train_time:127113ms step_avg:153.15ms step:841/1480 train_time:127271ms step_avg:153.15ms step:842/1480 train_time:127429ms step_avg:153.16ms step:843/1480 train_time:127586ms step_avg:153.17ms step:844/1480 train_time:127743ms step_avg:153.17ms step:845/1480 train_time:127900ms step_avg:153.17ms step:846/1480 train_time:128057ms step_avg:153.18ms step:847/1480 train_time:128217ms step_avg:153.19ms step:848/1480 train_time:128376ms step_avg:153.19ms step:849/1480 train_time:128533ms step_avg:153.20ms step:850/1480 train_time:128691ms step_avg:153.20ms step:851/1480 train_time:128852ms step_avg:153.21ms step:852/1480 train_time:129010ms step_avg:153.22ms step:853/1480 train_time:129168ms step_avg:153.22ms step:854/1480 train_time:129325ms step_avg:153.23ms step:855/1480 train_time:129482ms step_avg:153.23ms step:856/1480 train_time:129639ms step_avg:153.24ms step:857/1480 train_time:129797ms step_avg:153.24ms step:858/1480 train_time:129957ms step_avg:153.25ms step:859/1480 train_time:130116ms step_avg:153.26ms step:860/1480 train_time:130275ms step_avg:153.27ms step:861/1480 train_time:130435ms step_avg:153.27ms step:862/1480 train_time:130597ms step_avg:153.28ms step:863/1480 train_time:130755ms step_avg:153.29ms step:864/1480 train_time:130914ms step_avg:153.30ms step:865/1480 train_time:131071ms step_avg:153.30ms step:866/1480 train_time:131230ms step_avg:153.31ms step:867/1480 train_time:131388ms step_avg:153.31ms step:868/1480 train_time:131545ms step_avg:153.32ms step:869/1480 train_time:131703ms step_avg:153.32ms step:870/1480 train_time:131861ms step_avg:153.33ms step:871/1480 train_time:132018ms step_avg:153.33ms step:872/1480 train_time:132176ms step_avg:153.34ms step:873/1480 train_time:132332ms step_avg:153.34ms step:874/1480 train_time:132494ms step_avg:153.35ms step:875/1480 train_time:132654ms step_avg:153.36ms step:875/1480 val_loss:3.5097 train_time:132726ms step_avg:153.44ms step:876/1480 train_time:132817ms 
step_avg:153.37ms step:877/1480 train_time:132974ms step_avg:153.37ms step:878/1480 train_time:133132ms step_avg:153.38ms step:879/1480 train_time:133290ms step_avg:153.38ms step:880/1480 train_time:133447ms step_avg:153.39ms step:881/1480 train_time:133605ms step_avg:153.39ms step:882/1480 train_time:133765ms step_avg:153.40ms step:883/1480 train_time:133923ms step_avg:153.41ms step:884/1480 train_time:134085ms step_avg:153.42ms step:885/1480 train_time:134244ms step_avg:153.42ms step:886/1480 train_time:134405ms step_avg:153.43ms step:887/1480 train_time:134564ms step_avg:153.44ms step:888/1480 train_time:134726ms step_avg:153.45ms step:889/1480 train_time:134886ms step_avg:153.45ms step:890/1480 train_time:135044ms step_avg:153.46ms step:891/1480 train_time:135203ms step_avg:153.46ms step:892/1480 train_time:135361ms step_avg:153.47ms step:893/1480 train_time:135518ms step_avg:153.47ms step:894/1480 train_time:135679ms step_avg:153.48ms step:895/1480 train_time:135838ms step_avg:153.49ms step:896/1480 train_time:135997ms step_avg:153.50ms step:897/1480 train_time:136158ms step_avg:153.50ms step:898/1480 train_time:136318ms step_avg:153.51ms step:899/1480 train_time:136479ms step_avg:153.52ms step:900/1480 train_time:136637ms step_avg:153.53ms step:901/1480 train_time:136796ms step_avg:153.53ms step:902/1480 train_time:136953ms step_avg:153.53ms step:903/1480 train_time:137115ms step_avg:153.54ms step:904/1480 train_time:137274ms step_avg:153.55ms step:905/1480 train_time:137432ms step_avg:153.56ms step:906/1480 train_time:137593ms step_avg:153.56ms step:907/1480 train_time:137756ms step_avg:153.57ms step:908/1480 train_time:137914ms step_avg:153.58ms step:909/1480 train_time:138073ms step_avg:153.59ms step:910/1480 train_time:138236ms step_avg:153.60ms step:911/1480 train_time:138396ms step_avg:153.60ms step:912/1480 train_time:138558ms step_avg:153.61ms step:913/1480 train_time:138719ms step_avg:153.62ms step:914/1480 train_time:138880ms step_avg:153.63ms 
step:915/1480 train_time:139042ms step_avg:153.64ms step:916/1480 train_time:139201ms step_avg:153.64ms step:917/1480 train_time:139360ms step_avg:153.65ms step:918/1480 train_time:139521ms step_avg:153.66ms step:919/1480 train_time:139683ms step_avg:153.67ms step:920/1480 train_time:139842ms step_avg:153.67ms step:921/1480 train_time:140002ms step_avg:153.68ms step:922/1480 train_time:140163ms step_avg:153.69ms step:923/1480 train_time:140320ms step_avg:153.69ms step:924/1480 train_time:140480ms step_avg:153.70ms step:925/1480 train_time:140639ms step_avg:153.70ms step:926/1480 train_time:140799ms step_avg:153.71ms step:927/1480 train_time:140957ms step_avg:153.72ms step:928/1480 train_time:141117ms step_avg:153.72ms step:929/1480 train_time:141277ms step_avg:153.73ms step:930/1480 train_time:141438ms step_avg:153.74ms step:931/1480 train_time:141597ms step_avg:153.74ms step:932/1480 train_time:141756ms step_avg:153.75ms step:933/1480 train_time:141916ms step_avg:153.76ms step:934/1480 train_time:142078ms step_avg:153.76ms step:935/1480 train_time:142239ms step_avg:153.77ms step:936/1480 train_time:142397ms step_avg:153.78ms step:937/1480 train_time:142559ms step_avg:153.79ms step:938/1480 train_time:142717ms step_avg:153.79ms step:939/1480 train_time:142880ms step_avg:153.80ms step:940/1480 train_time:143042ms step_avg:153.81ms step:941/1480 train_time:143200ms step_avg:153.81ms step:942/1480 train_time:143358ms step_avg:153.82ms step:943/1480 train_time:143519ms step_avg:153.82ms step:944/1480 train_time:143683ms step_avg:153.84ms step:945/1480 train_time:143841ms step_avg:153.84ms step:946/1480 train_time:144004ms step_avg:153.85ms step:947/1480 train_time:144164ms step_avg:153.86ms step:948/1480 train_time:144324ms step_avg:153.86ms step:949/1480 train_time:144503ms step_avg:153.89ms step:950/1480 train_time:144644ms step_avg:153.88ms step:951/1480 train_time:144804ms step_avg:153.88ms step:952/1480 train_time:144963ms step_avg:153.89ms step:953/1480 
train_time:145122ms step_avg:153.89ms step:954/1480 train_time:145285ms step_avg:153.90ms step:955/1480 train_time:145443ms step_avg:153.91ms step:956/1480 train_time:145602ms step_avg:153.91ms step:957/1480 train_time:145763ms step_avg:153.92ms step:958/1480 train_time:145925ms step_avg:153.93ms step:959/1480 train_time:146085ms step_avg:153.94ms step:960/1480 train_time:146245ms step_avg:153.94ms step:961/1480 train_time:146404ms step_avg:153.95ms step:962/1480 train_time:146563ms step_avg:153.95ms step:963/1480 train_time:146722ms step_avg:153.96ms step:964/1480 train_time:146883ms step_avg:153.97ms step:965/1480 train_time:147042ms step_avg:153.97ms step:966/1480 train_time:147201ms step_avg:153.98ms step:967/1480 train_time:147360ms step_avg:153.98ms step:968/1480 train_time:147519ms step_avg:153.99ms step:969/1480 train_time:147680ms step_avg:153.99ms step:970/1480 train_time:147838ms step_avg:154.00ms step:971/1480 train_time:147996ms step_avg:154.00ms step:972/1480 train_time:148156ms step_avg:154.01ms step:973/1480 train_time:148314ms step_avg:154.01ms step:974/1480 train_time:148477ms step_avg:154.02ms step:975/1480 train_time:148639ms step_avg:154.03ms step:976/1480 train_time:148799ms step_avg:154.04ms step:977/1480 train_time:148958ms step_avg:154.04ms step:978/1480 train_time:149118ms step_avg:154.05ms step:979/1480 train_time:149278ms step_avg:154.05ms step:980/1480 train_time:149439ms step_avg:154.06ms step:981/1480 train_time:149601ms step_avg:154.07ms step:982/1480 train_time:149760ms step_avg:154.07ms step:983/1480 train_time:149919ms step_avg:154.08ms step:984/1480 train_time:150078ms step_avg:154.08ms step:985/1480 train_time:150241ms step_avg:154.09ms step:986/1480 train_time:150400ms step_avg:154.10ms step:987/1480 train_time:150559ms step_avg:154.10ms step:988/1480 train_time:150717ms step_avg:154.11ms step:989/1480 train_time:150878ms step_avg:154.11ms step:990/1480 train_time:151040ms step_avg:154.12ms step:991/1480 train_time:151201ms 
step_avg:154.13ms step:992/1480 train_time:151366ms step_avg:154.14ms step:993/1480 train_time:151533ms step_avg:154.15ms step:994/1480 train_time:151692ms step_avg:154.16ms step:995/1480 train_time:151851ms step_avg:154.16ms step:996/1480 train_time:152008ms step_avg:154.17ms step:997/1480 train_time:152168ms step_avg:154.17ms step:998/1480 train_time:152325ms step_avg:154.18ms step:999/1480 train_time:152484ms step_avg:154.18ms step:1000/1480 train_time:152644ms step_avg:154.19ms step:1000/1480 val_loss:3.4458 train_time:152717ms step_avg:154.26ms step:1001/1480 train_time:152807ms step_avg:154.19ms step:1002/1480 train_time:152967ms step_avg:154.20ms step:1003/1480 train_time:153129ms step_avg:154.21ms step:1004/1480 train_time:153291ms step_avg:154.22ms step:1005/1480 train_time:153450ms step_avg:154.22ms step:1006/1480 train_time:153610ms step_avg:154.23ms step:1007/1480 train_time:153771ms step_avg:154.23ms step:1008/1480 train_time:153932ms step_avg:154.24ms step:1009/1480 train_time:154098ms step_avg:154.25ms step:1010/1480 train_time:154258ms step_avg:154.26ms step:1011/1480 train_time:154416ms step_avg:154.26ms step:1012/1480 train_time:154574ms step_avg:154.27ms step:1013/1480 train_time:154735ms step_avg:154.27ms step:1014/1480 train_time:154894ms step_avg:154.28ms step:1015/1480 train_time:155056ms step_avg:154.28ms step:1016/1480 train_time:155215ms step_avg:154.29ms step:1017/1480 train_time:155376ms step_avg:154.30ms step:1018/1480 train_time:155536ms step_avg:154.30ms step:1019/1480 train_time:155697ms step_avg:154.31ms step:1020/1480 train_time:155857ms step_avg:154.31ms step:1021/1480 train_time:156015ms step_avg:154.32ms step:1022/1480 train_time:156174ms step_avg:154.32ms step:1023/1480 train_time:156335ms step_avg:154.33ms step:1024/1480 train_time:156495ms step_avg:154.33ms step:1025/1480 train_time:156656ms step_avg:154.34ms step:1026/1480 train_time:156816ms step_avg:154.35ms step:1027/1480 train_time:156974ms step_avg:154.35ms 
step:1028/1480 train_time:157137ms step_avg:154.36ms step:1029/1480 train_time:157299ms step_avg:154.37ms step:1030/1480 train_time:157459ms step_avg:154.37ms step:1031/1480 train_time:157619ms step_avg:154.38ms step:1032/1480 train_time:157784ms step_avg:154.39ms step:1033/1480 train_time:157944ms step_avg:154.39ms step:1034/1480 train_time:158106ms step_avg:154.40ms step:1035/1480 train_time:158267ms step_avg:154.41ms step:1036/1480 train_time:158427ms step_avg:154.41ms step:1037/1480 train_time:158587ms step_avg:154.42ms step:1038/1480 train_time:158749ms step_avg:154.42ms step:1039/1480 train_time:158911ms step_avg:154.43ms step:1040/1480 train_time:159070ms step_avg:154.44ms step:1041/1480 train_time:159231ms step_avg:154.44ms step:1042/1480 train_time:159389ms step_avg:154.45ms step:1043/1480 train_time:159548ms step_avg:154.45ms step:1044/1480 train_time:159709ms step_avg:154.46ms step:1045/1480 train_time:159870ms step_avg:154.46ms step:1046/1480 train_time:160030ms step_avg:154.47ms step:1047/1480 train_time:160190ms step_avg:154.47ms step:1048/1480 train_time:160351ms step_avg:154.48ms step:1049/1480 train_time:160512ms step_avg:154.49ms step:1050/1480 train_time:160672ms step_avg:154.49ms step:1051/1480 train_time:160833ms step_avg:154.50ms step:1052/1480 train_time:160993ms step_avg:154.50ms step:1053/1480 train_time:161154ms step_avg:154.51ms step:1054/1480 train_time:161314ms step_avg:154.52ms step:1055/1480 train_time:161473ms step_avg:154.52ms step:1056/1480 train_time:161633ms step_avg:154.53ms step:1057/1480 train_time:161793ms step_avg:154.53ms step:1058/1480 train_time:161955ms step_avg:154.54ms step:1059/1480 train_time:162118ms step_avg:154.55ms step:1060/1480 train_time:162281ms step_avg:154.55ms step:1061/1480 train_time:162438ms step_avg:154.56ms step:1062/1480 train_time:162597ms step_avg:154.56ms step:1063/1480 train_time:162757ms step_avg:154.56ms step:1064/1480 train_time:162914ms step_avg:154.57ms step:1065/1480 train_time:163075ms 
step_avg:154.57ms step:1066/1480 train_time:163238ms step_avg:154.58ms step:1067/1480 train_time:163401ms step_avg:154.59ms step:1068/1480 train_time:163562ms step_avg:154.60ms step:1069/1480 train_time:163727ms step_avg:154.61ms step:1070/1480 train_time:163887ms step_avg:154.61ms step:1071/1480 train_time:164051ms step_avg:154.62ms step:1072/1480 train_time:164210ms step_avg:154.62ms step:1073/1480 train_time:164367ms step_avg:154.63ms step:1074/1480 train_time:164528ms step_avg:154.63ms step:1075/1480 train_time:164688ms step_avg:154.64ms step:1076/1480 train_time:164847ms step_avg:154.64ms step:1077/1480 train_time:165008ms step_avg:154.65ms step:1078/1480 train_time:165172ms step_avg:154.66ms step:1079/1480 train_time:165335ms step_avg:154.66ms step:1080/1480 train_time:165496ms step_avg:154.67ms step:1081/1480 train_time:165654ms step_avg:154.67ms step:1082/1480 train_time:165814ms step_avg:154.68ms step:1083/1480 train_time:165973ms step_avg:154.68ms step:1084/1480 train_time:166132ms step_avg:154.69ms step:1085/1480 train_time:166292ms step_avg:154.69ms step:1086/1480 train_time:166453ms step_avg:154.70ms step:1087/1480 train_time:166614ms step_avg:154.70ms step:1088/1480 train_time:166773ms step_avg:154.71ms step:1089/1480 train_time:166935ms step_avg:154.71ms step:1090/1480 train_time:167099ms step_avg:154.72ms step:1091/1480 train_time:167260ms step_avg:154.73ms step:1092/1480 train_time:167421ms step_avg:154.73ms step:1093/1480 train_time:167584ms step_avg:154.74ms step:1094/1480 train_time:167745ms step_avg:154.75ms step:1095/1480 train_time:167905ms step_avg:154.75ms step:1096/1480 train_time:168066ms step_avg:154.76ms step:1097/1480 train_time:168228ms step_avg:154.76ms step:1098/1480 train_time:168389ms step_avg:154.77ms step:1099/1480 train_time:168550ms step_avg:154.78ms step:1100/1480 train_time:168713ms step_avg:154.78ms step:1101/1480 train_time:168875ms step_avg:154.79ms step:1102/1480 train_time:169037ms step_avg:154.80ms step:1103/1480 
train_time:169204ms step_avg:154.81ms step:1104/1480 train_time:169366ms step_avg:154.81ms step:1105/1480 train_time:169530ms step_avg:154.82ms step:1106/1480 train_time:169691ms step_avg:154.83ms step:1107/1480 train_time:169852ms step_avg:154.83ms step:1108/1480 train_time:170012ms step_avg:154.84ms step:1109/1480 train_time:170172ms step_avg:154.84ms step:1110/1480 train_time:170334ms step_avg:154.85ms step:1111/1480 train_time:170496ms step_avg:154.86ms step:1112/1480 train_time:170658ms step_avg:154.86ms step:1113/1480 train_time:170827ms step_avg:154.87ms step:1114/1480 train_time:170990ms step_avg:154.88ms step:1115/1480 train_time:171152ms step_avg:154.89ms step:1116/1480 train_time:171312ms step_avg:154.89ms step:1117/1480 train_time:171476ms step_avg:154.90ms step:1118/1480 train_time:171640ms step_avg:154.91ms step:1119/1480 train_time:171802ms step_avg:154.92ms step:1120/1480 train_time:171963ms step_avg:154.92ms step:1121/1480 train_time:172126ms step_avg:154.93ms step:1122/1480 train_time:172288ms step_avg:154.93ms step:1123/1480 train_time:172449ms step_avg:154.94ms step:1124/1480 train_time:172612ms step_avg:154.95ms step:1125/1480 train_time:172774ms step_avg:154.95ms step:1125/1480 val_loss:3.3899 train_time:172848ms step_avg:155.02ms step:1126/1480 train_time:172943ms step_avg:154.97ms step:1127/1480 train_time:173098ms step_avg:154.97ms step:1128/1480 train_time:173259ms step_avg:154.97ms step:1129/1480 train_time:173420ms step_avg:154.98ms step:1130/1480 train_time:173580ms step_avg:154.98ms step:1131/1480 train_time:173749ms step_avg:154.99ms step:1132/1480 train_time:173911ms step_avg:155.00ms step:1133/1480 train_time:174075ms step_avg:155.01ms step:1134/1480 train_time:174237ms step_avg:155.01ms step:1135/1480 train_time:174396ms step_avg:155.02ms step:1136/1480 train_time:174558ms step_avg:155.03ms step:1137/1480 train_time:174718ms step_avg:155.03ms step:1138/1480 train_time:174886ms step_avg:155.04ms step:1139/1480 train_time:175061ms 
step_avg:155.06ms step:1140/1480 train_time:175211ms step_avg:155.05ms step:1141/1480 train_time:175375ms step_avg:155.06ms step:1142/1480 train_time:175536ms step_avg:155.07ms step:1143/1480 train_time:175700ms step_avg:155.07ms step:1144/1480 train_time:175861ms step_avg:155.08ms step:1145/1480 train_time:176019ms step_avg:155.08ms step:1146/1480 train_time:176182ms step_avg:155.09ms step:1147/1480 train_time:176345ms step_avg:155.10ms step:1148/1480 train_time:176509ms step_avg:155.10ms step:1149/1480 train_time:176672ms step_avg:155.11ms step:1150/1480 train_time:176832ms step_avg:155.12ms step:1151/1480 train_time:176995ms step_avg:155.12ms step:1152/1480 train_time:177159ms step_avg:155.13ms step:1153/1480 train_time:177323ms step_avg:155.14ms step:1154/1480 train_time:177484ms step_avg:155.14ms step:1155/1480 train_time:177648ms step_avg:155.15ms step:1156/1480 train_time:177814ms step_avg:155.16ms step:1157/1480 train_time:177977ms step_avg:155.17ms step:1158/1480 train_time:178138ms step_avg:155.17ms step:1159/1480 train_time:178298ms step_avg:155.18ms step:1160/1480 train_time:178457ms step_avg:155.18ms step:1161/1480 train_time:178618ms step_avg:155.19ms step:1162/1480 train_time:178780ms step_avg:155.19ms step:1163/1480 train_time:178943ms step_avg:155.20ms step:1164/1480 train_time:179107ms step_avg:155.21ms step:1165/1480 train_time:179268ms step_avg:155.21ms step:1166/1480 train_time:179431ms step_avg:155.22ms step:1167/1480 train_time:179591ms step_avg:155.22ms step:1168/1480 train_time:179752ms step_avg:155.23ms step:1169/1480 train_time:179913ms step_avg:155.23ms step:1170/1480 train_time:180074ms step_avg:155.24ms step:1171/1480 train_time:180236ms step_avg:155.24ms step:1172/1480 train_time:180396ms step_avg:155.25ms step:1173/1480 train_time:180558ms step_avg:155.25ms step:1174/1480 train_time:180729ms step_avg:155.27ms step:1175/1480 train_time:180891ms step_avg:155.27ms step:1176/1480 train_time:181055ms step_avg:155.28ms step:1177/1480 
train_time:181221ms step_avg:155.29ms step:1178/1480 train_time:181383ms step_avg:155.29ms step:1179/1480 train_time:181542ms step_avg:155.30ms step:1180/1480 train_time:181712ms step_avg:155.31ms step:1181/1480 train_time:181875ms step_avg:155.32ms step:1182/1480 train_time:182035ms step_avg:155.32ms step:1183/1480 train_time:182195ms step_avg:155.32ms step:1184/1480 train_time:182357ms step_avg:155.33ms step:1185/1480 train_time:182520ms step_avg:155.34ms step:1186/1480 train_time:182684ms step_avg:155.34ms step:1187/1480 train_time:182857ms step_avg:155.36ms step:1188/1480 train_time:183015ms step_avg:155.36ms step:1189/1480 train_time:183177ms step_avg:155.37ms step:1190/1480 train_time:183340ms step_avg:155.37ms step:1191/1480 train_time:183502ms step_avg:155.38ms step:1192/1480 train_time:183664ms step_avg:155.38ms step:1193/1480 train_time:183825ms step_avg:155.39ms step:1194/1480 train_time:183988ms step_avg:155.40ms step:1195/1480 train_time:184153ms step_avg:155.40ms step:1196/1480 train_time:184324ms step_avg:155.42ms step:1197/1480 train_time:184486ms step_avg:155.42ms step:1198/1480 train_time:184656ms step_avg:155.43ms step:1199/1480 train_time:184817ms step_avg:155.44ms step:1200/1480 train_time:184978ms step_avg:155.44ms step:1201/1480 train_time:185138ms step_avg:155.45ms step:1202/1480 train_time:185308ms step_avg:155.46ms step:1203/1480 train_time:185474ms step_avg:155.47ms step:1204/1480 train_time:185636ms step_avg:155.47ms step:1205/1480 train_time:185796ms step_avg:155.48ms step:1206/1480 train_time:185956ms step_avg:155.48ms step:1207/1480 train_time:186116ms step_avg:155.49ms step:1208/1480 train_time:186276ms step_avg:155.49ms step:1209/1480 train_time:186438ms step_avg:155.49ms step:1210/1480 train_time:186606ms step_avg:155.50ms step:1211/1480 train_time:186770ms step_avg:155.51ms step:1212/1480 train_time:186933ms step_avg:155.52ms step:1213/1480 train_time:187096ms step_avg:155.52ms step:1214/1480 train_time:187262ms step_avg:155.53ms 
step:1215/1480 train_time:187426ms step_avg:155.54ms step:1216/1480 train_time:187588ms step_avg:155.55ms step:1217/1480 train_time:187752ms step_avg:155.55ms step:1218/1480 train_time:187915ms step_avg:155.56ms step:1219/1480 train_time:188081ms step_avg:155.57ms step:1220/1480 train_time:188244ms step_avg:155.57ms step:1221/1480 train_time:188405ms step_avg:155.58ms step:1222/1480 train_time:188566ms step_avg:155.58ms step:1223/1480 train_time:188729ms step_avg:155.59ms step:1224/1480 train_time:188895ms step_avg:155.60ms step:1225/1480 train_time:189058ms step_avg:155.60ms step:1226/1480 train_time:189222ms step_avg:155.61ms step:1227/1480 train_time:189386ms step_avg:155.62ms step:1228/1480 train_time:189548ms step_avg:155.62ms step:1229/1480 train_time:189712ms step_avg:155.63ms step:1230/1480 train_time:189879ms step_avg:155.64ms step:1231/1480 train_time:190044ms step_avg:155.65ms step:1232/1480 train_time:190211ms step_avg:155.66ms step:1233/1480 train_time:190373ms step_avg:155.66ms step:1234/1480 train_time:190533ms step_avg:155.66ms step:1235/1480 train_time:190698ms step_avg:155.67ms step:1236/1480 train_time:190859ms step_avg:155.68ms step:1237/1480 train_time:191019ms step_avg:155.68ms step:1238/1480 train_time:191194ms step_avg:155.70ms step:1239/1480 train_time:191356ms step_avg:155.70ms step:1240/1480 train_time:191519ms step_avg:155.71ms step:1241/1480 train_time:191685ms step_avg:155.71ms step:1242/1480 train_time:191847ms step_avg:155.72ms step:1243/1480 train_time:192010ms step_avg:155.73ms step:1244/1480 train_time:192171ms step_avg:155.73ms step:1245/1480 train_time:192333ms step_avg:155.74ms step:1246/1480 train_time:192495ms step_avg:155.74ms step:1247/1480 train_time:192658ms step_avg:155.75ms step:1248/1480 train_time:192818ms step_avg:155.75ms step:1249/1480 train_time:192978ms step_avg:155.75ms step:1250/1480 train_time:193140ms step_avg:155.76ms step:1250/1480 val_loss:3.3392 train_time:193215ms step_avg:155.82ms step:1251/1480 
train_time:193310ms step_avg:155.77ms step:1252/1480 train_time:193475ms step_avg:155.78ms step:1253/1480 train_time:193636ms step_avg:155.78ms step:1254/1480 train_time:193796ms step_avg:155.78ms step:1255/1480 train_time:193964ms step_avg:155.79ms step:1256/1480 train_time:194129ms step_avg:155.80ms step:1257/1480 train_time:194292ms step_avg:155.81ms step:1258/1480 train_time:194457ms step_avg:155.82ms step:1259/1480 train_time:194619ms step_avg:155.82ms step:1260/1480 train_time:194779ms step_avg:155.82ms step:1261/1480 train_time:194940ms step_avg:155.83ms step:1262/1480 train_time:195105ms step_avg:155.83ms step:1263/1480 train_time:195271ms step_avg:155.84ms step:1264/1480 train_time:195431ms step_avg:155.85ms step:1265/1480 train_time:195593ms step_avg:155.85ms step:1266/1480 train_time:195755ms step_avg:155.86ms step:1267/1480 train_time:195917ms step_avg:155.86ms step:1268/1480 train_time:196079ms step_avg:155.87ms step:1269/1480 train_time:196244ms step_avg:155.87ms step:1270/1480 train_time:196406ms step_avg:155.88ms step:1271/1480 train_time:196571ms step_avg:155.88ms step:1272/1480 train_time:196731ms step_avg:155.89ms step:1273/1480 train_time:196895ms step_avg:155.89ms step:1274/1480 train_time:197058ms step_avg:155.90ms step:1275/1480 train_time:197219ms step_avg:155.90ms step:1276/1480 train_time:197378ms step_avg:155.91ms step:1277/1480 train_time:197540ms step_avg:155.91ms step:1278/1480 train_time:197700ms step_avg:155.91ms step:1279/1480 train_time:197861ms step_avg:155.92ms step:1280/1480 train_time:198027ms step_avg:155.93ms step:1281/1480 train_time:198191ms step_avg:155.93ms step:1282/1480 train_time:198351ms step_avg:155.94ms step:1283/1480 train_time:198515ms step_avg:155.94ms step:1284/1480 train_time:198678ms step_avg:155.95ms step:1285/1480 train_time:198839ms step_avg:155.95ms step:1286/1480 train_time:199000ms step_avg:155.96ms step:1287/1480 train_time:199162ms step_avg:155.96ms step:1288/1480 train_time:199324ms step_avg:155.97ms 
step:1289/1480 train_time:199495ms step_avg:155.98ms step:1290/1480 train_time:199661ms step_avg:155.98ms step:1291/1480 train_time:199824ms step_avg:155.99ms step:1292/1480 train_time:199989ms step_avg:156.00ms step:1293/1480 train_time:200155ms step_avg:156.01ms step:1294/1480 train_time:200317ms step_avg:156.01ms step:1295/1480 train_time:200479ms step_avg:156.02ms step:1296/1480 train_time:200641ms step_avg:156.02ms step:1297/1480 train_time:200804ms step_avg:156.02ms step:1298/1480 train_time:200968ms step_avg:156.03ms step:1299/1480 train_time:201132ms step_avg:156.04ms step:1300/1480 train_time:201293ms step_avg:156.04ms step:1301/1480 train_time:201453ms step_avg:156.04ms step:1302/1480 train_time:201618ms step_avg:156.05ms step:1303/1480 train_time:201785ms step_avg:156.06ms step:1304/1480 train_time:201950ms step_avg:156.07ms step:1305/1480 train_time:202113ms step_avg:156.07ms step:1306/1480 train_time:202277ms step_avg:156.08ms step:1307/1480 train_time:202438ms step_avg:156.08ms step:1308/1480 train_time:202598ms step_avg:156.08ms step:1309/1480 train_time:202761ms step_avg:156.09ms step:1310/1480 train_time:202922ms step_avg:156.09ms step:1311/1480 train_time:203083ms step_avg:156.10ms step:1312/1480 train_time:203249ms step_avg:156.11ms step:1313/1480 train_time:203412ms step_avg:156.11ms step:1314/1480 train_time:203576ms step_avg:156.12ms step:1315/1480 train_time:203738ms step_avg:156.12ms step:1316/1480 train_time:203897ms step_avg:156.12ms step:1317/1480 train_time:204057ms step_avg:156.13ms step:1318/1480 train_time:204224ms step_avg:156.13ms step:1319/1480 train_time:204390ms step_avg:156.14ms step:1320/1480 train_time:204555ms step_avg:156.15ms step:1321/1480 train_time:204719ms step_avg:156.16ms step:1322/1480 train_time:204889ms step_avg:156.17ms step:1323/1480 train_time:205053ms step_avg:156.17ms step:1324/1480 train_time:205217ms step_avg:156.18ms step:1325/1480 train_time:205385ms step_avg:156.19ms step:1326/1480 train_time:205552ms 
step_avg:156.19ms step:1327/1480 train_time:205715ms step_avg:156.20ms step:1328/1480 train_time:205877ms step_avg:156.20ms step:1329/1480 train_time:206069ms step_avg:156.23ms step:1330/1480 train_time:206224ms step_avg:156.23ms step:1331/1480 train_time:206388ms step_avg:156.24ms step:1332/1480 train_time:206550ms step_avg:156.24ms step:1333/1480 train_time:206716ms step_avg:156.25ms step:1334/1480 train_time:206879ms step_avg:156.25ms step:1335/1480 train_time:207040ms step_avg:156.26ms step:1336/1480 train_time:207210ms step_avg:156.27ms step:1337/1480 train_time:207376ms step_avg:156.27ms step:1338/1480 train_time:207539ms step_avg:156.28ms step:1339/1480 train_time:207703ms step_avg:156.29ms step:1340/1480 train_time:207867ms step_avg:156.29ms step:1341/1480 train_time:208028ms step_avg:156.29ms step:1342/1480 train_time:208194ms step_avg:156.30ms step:1343/1480 train_time:208356ms step_avg:156.31ms step:1344/1480 train_time:208519ms step_avg:156.31ms step:1345/1480 train_time:208688ms step_avg:156.32ms step:1346/1480 train_time:208849ms step_avg:156.32ms step:1347/1480 train_time:209013ms step_avg:156.33ms step:1348/1480 train_time:209176ms step_avg:156.33ms step:1349/1480 train_time:209339ms step_avg:156.34ms step:1350/1480 train_time:209506ms step_avg:156.35ms step:1351/1480 train_time:209670ms step_avg:156.35ms step:1352/1480 train_time:209833ms step_avg:156.36ms step:1353/1480 train_time:209999ms step_avg:156.37ms step:1354/1480 train_time:210162ms step_avg:156.37ms step:1355/1480 train_time:210323ms step_avg:156.37ms step:1356/1480 train_time:210489ms step_avg:156.38ms step:1357/1480 train_time:210654ms step_avg:156.39ms step:1358/1480 train_time:210818ms step_avg:156.39ms step:1359/1480 train_time:210982ms step_avg:156.40ms step:1360/1480 train_time:211147ms step_avg:156.41ms step:1361/1480 train_time:211315ms step_avg:156.41ms step:1362/1480 train_time:211479ms step_avg:156.42ms step:1363/1480 train_time:211647ms step_avg:156.43ms step:1364/1480 
train_time:211809ms step_avg:156.43ms step:1365/1480 train_time:211969ms step_avg:156.43ms step:1366/1480 train_time:212133ms step_avg:156.44ms step:1367/1480 train_time:212296ms step_avg:156.45ms step:1368/1480 train_time:212460ms step_avg:156.45ms step:1369/1480 train_time:212629ms step_avg:156.46ms step:1370/1480 train_time:212795ms step_avg:156.47ms step:1371/1480 train_time:212957ms step_avg:156.47ms step:1372/1480 train_time:213123ms step_avg:156.48ms step:1373/1480 train_time:213285ms step_avg:156.48ms step:1374/1480 train_time:213455ms step_avg:156.49ms step:1375/1480 train_time:213618ms step_avg:156.50ms step:1375/1480 val_loss:3.3010 train_time:213692ms step_avg:156.55ms step:1376/1480 train_time:213783ms step_avg:156.50ms step:1377/1480 train_time:213947ms step_avg:156.51ms step:1378/1480 train_time:214108ms step_avg:156.51ms step:1379/1480 train_time:214271ms step_avg:156.52ms step:1380/1480 train_time:214434ms step_avg:156.52ms step:1381/1480 train_time:214604ms step_avg:156.53ms step:1382/1480 train_time:214768ms step_avg:156.54ms step:1383/1480 train_time:214932ms step_avg:156.54ms step:1384/1480 train_time:215098ms step_avg:156.55ms step:1385/1480 train_time:215259ms step_avg:156.55ms step:1386/1480 train_time:215422ms step_avg:156.56ms step:1387/1480 train_time:215586ms step_avg:156.56ms step:1388/1480 train_time:215750ms step_avg:156.57ms step:1389/1480 train_time:215914ms step_avg:156.57ms step:1390/1480 train_time:216076ms step_avg:156.58ms step:1391/1480 train_time:216236ms step_avg:156.58ms step:1392/1480 train_time:216403ms step_avg:156.59ms step:1393/1480 train_time:216565ms step_avg:156.59ms step:1394/1480 train_time:216728ms step_avg:156.60ms step:1395/1480 train_time:216890ms step_avg:156.60ms step:1396/1480 train_time:217052ms step_avg:156.60ms step:1397/1480 train_time:217211ms step_avg:156.61ms step:1398/1480 train_time:217371ms step_avg:156.61ms step:1399/1480 train_time:217533ms step_avg:156.61ms step:1400/1480 train_time:217700ms 
step_avg:156.62ms step:1401/1480 train_time:217861ms step_avg:156.62ms step:1402/1480 train_time:218024ms step_avg:156.63ms step:1403/1480 train_time:218189ms step_avg:156.63ms step:1404/1480 train_time:218351ms step_avg:156.64ms step:1405/1480 train_time:218516ms step_avg:156.64ms step:1406/1480 train_time:218682ms step_avg:156.65ms step:1407/1480 train_time:218844ms step_avg:156.65ms step:1408/1480 train_time:219005ms step_avg:156.66ms step:1409/1480 train_time:219178ms step_avg:156.67ms step:1410/1480 train_time:219340ms step_avg:156.67ms step:1411/1480 train_time:219501ms step_avg:156.67ms step:1412/1480 train_time:219663ms step_avg:156.68ms step:1413/1480 train_time:219826ms step_avg:156.68ms step:1414/1480 train_time:219990ms step_avg:156.69ms step:1415/1480 train_time:220153ms step_avg:156.69ms step:1416/1480 train_time:220327ms step_avg:156.70ms step:1417/1480 train_time:220490ms step_avg:156.71ms step:1418/1480 train_time:220654ms step_avg:156.71ms step:1419/1480 train_time:220819ms step_avg:156.72ms step:1420/1480 train_time:220985ms step_avg:156.73ms step:1421/1480 train_time:221149ms step_avg:156.73ms step:1422/1480 train_time:221312ms step_avg:156.74ms step:1423/1480 train_time:221472ms step_avg:156.74ms step:1424/1480 train_time:221641ms step_avg:156.75ms step:1425/1480 train_time:221810ms step_avg:156.76ms step:1426/1480 train_time:221973ms step_avg:156.76ms step:1427/1480 train_time:222139ms step_avg:156.77ms step:1428/1480 train_time:222303ms step_avg:156.77ms step:1429/1480 train_time:222464ms step_avg:156.78ms step:1430/1480 train_time:222630ms step_avg:156.78ms step:1431/1480 train_time:222795ms step_avg:156.79ms step:1432/1480 train_time:222962ms step_avg:156.79ms step:1433/1480 train_time:223131ms step_avg:156.80ms step:1434/1480 train_time:223299ms step_avg:156.81ms step:1435/1480 train_time:223464ms step_avg:156.82ms step:1436/1480 train_time:223629ms step_avg:156.82ms step:1437/1480 train_time:223790ms step_avg:156.83ms step:1438/1480 
train_time:223953ms step_avg:156.83ms step:1439/1480 train_time:224121ms step_avg:156.84ms step:1440/1480 train_time:224284ms step_avg:156.84ms step:1441/1480 train_time:224448ms step_avg:156.85ms step:1442/1480 train_time:224613ms step_avg:156.85ms step:1443/1480 train_time:224786ms step_avg:156.86ms step:1444/1480 train_time:224949ms step_avg:156.87ms step:1445/1480 train_time:225111ms step_avg:156.87ms step:1446/1480 train_time:225277ms step_avg:156.88ms step:1447/1480 train_time:225445ms step_avg:156.89ms step:1448/1480 train_time:225609ms step_avg:156.89ms step:1449/1480 train_time:225771ms step_avg:156.89ms step:1450/1480 train_time:225935ms step_avg:156.90ms step:1451/1480 train_time:226098ms step_avg:156.90ms step:1452/1480 train_time:226262ms step_avg:156.91ms step:1453/1480 train_time:226426ms step_avg:156.91ms step:1454/1480 train_time:226588ms step_avg:156.92ms step:1455/1480 train_time:226756ms step_avg:156.92ms step:1456/1480 train_time:226922ms step_avg:156.93ms step:1457/1480 train_time:227084ms step_avg:156.93ms step:1458/1480 train_time:227248ms step_avg:156.94ms step:1459/1480 train_time:227413ms step_avg:156.94ms step:1460/1480 train_time:227576ms step_avg:156.95ms step:1461/1480 train_time:227740ms step_avg:156.95ms step:1462/1480 train_time:227904ms step_avg:156.96ms step:1463/1480 train_time:228069ms step_avg:156.96ms step:1464/1480 train_time:228234ms step_avg:156.97ms step:1465/1480 train_time:228397ms step_avg:156.97ms step:1466/1480 train_time:228560ms step_avg:156.98ms step:1467/1480 train_time:228725ms step_avg:156.98ms step:1468/1480 train_time:228889ms step_avg:156.99ms step:1469/1480 train_time:229051ms step_avg:156.99ms step:1470/1480 train_time:229221ms step_avg:157.00ms step:1471/1480 train_time:229392ms step_avg:157.01ms step:1472/1480 train_time:229562ms step_avg:157.02ms step:1473/1480 train_time:229725ms step_avg:157.02ms step:1474/1480 train_time:229891ms step_avg:157.03ms step:1475/1480 train_time:230060ms step_avg:157.04ms 
step:1476/1480 train_time:230223ms step_avg:157.04ms step:1477/1480 train_time:230389ms step_avg:157.05ms step:1478/1480 train_time:230558ms step_avg:157.06ms step:1479/1480 train_time:230724ms step_avg:157.06ms step:1480/1480 train_time:230886ms step_avg:157.07ms step:1480/1480 val_loss:3.2819 train_time:230962ms step_avg:157.12ms peak memory consumption: 34239 MiB