import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor (asserted).
        steps: Number of quintic iterations to run.
        eps: Added to the norm of G before dividing, to avoid division by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # work on the wide orientation so that X @ X.T is the smaller of the two Gram matrices
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose applied above
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        # WORLD_SIZE / RANK are read from the environment (set by the launcher)
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct element count, so that every parameter in a group
        # fits the group's flat all_gather buffers (one buffer per rank)
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter group.

        Within a group the parameters are processed in chunks of `world_size`: this rank
        orthogonalizes the update of `params[base_i + rank]`, the results of all ranks are
        exchanged with an asynchronous all_gather, and the gathered updates are applied to
        the whole chunk while the next chunk is being computed.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # applies the updates gathered for the previous chunk (no-op before the first one);
                # `handle` and `params_world` are read from the enclosing scope at call time
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # step size is lr * sqrt(max(1, rows / cols))
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # the buffers are about to be overwritten, so consume the previous chunk first
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            # apply the updates of the final chunk
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; cos/sin tables are cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # the sequence axis is dim 1 and the rotated (head) axis is dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # learnable mixing weights between this layer's values and the value embedding `vi`
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learnable mix of `x` with the initial embedding `x0`, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # initialised to [1, 0], i.e. the block initially ignores x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-embedding tables whose outputs are mirrored to produce twelve value embeddings."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        # append the same tensors in reverse order: [e0..e5, e5..e0]
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections, value embeddings and a document-aware sliding-window mask."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the cross-entropy loss of `targets` given the 1D token sequence `inputs`.

        `sliding_window_num_blocks` is the attention window size, expressed in blocks of
        BLOCK_SIZE tokens. The length of `inputs` must be a multiple of BLOCK_SIZE.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        assert inputs.size(0) % BLOCK_SIZE == 0, "sequence length must be a multiple of BLOCK_SIZE"
        # number of attention blocks in this sequence (512 for the default 64*1024 tokens)
        total_num_blocks = inputs.size(0) // BLOCK_SIZE
        # running count of token 50256 (presumably the GPT-2 end-of-text token), used as a document id
        docs = (inputs == 50256).cumsum(0)
        # document id of the first / last token of every block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal and restricted to the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # per query block: how many kv blocks are selected, and the kv block indices
            # ordered so that the selected ones come first (stable descending argsort)
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_num_blocks, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm  = causal_full_bm & window_full_bm & document_full_bm
            # blocks that are non-empty but not full still need the token-level mask_mod
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256 * 4 byte header into a pinned tensor."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Cycles through token shards; each process reads its own `seq_len` slice of every global batch."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        # start over from the first shard
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        batch_size = self.seq_len * self.num_processes
        # seq_len + 1 tokens so that inputs and targets can be offset by one
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. 
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 08:50:22 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 37C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:28726ms step_avg:nanms step:2/1480 train_time:28832ms step_avg:nanms step:3/1480 train_time:28954ms step_avg:nanms step:4/1480 train_time:29094ms step_avg:nanms step:5/1480 train_time:29235ms step_avg:nanms step:6/1480 train_time:29376ms step_avg:nanms step:7/1480 train_time:29519ms step_avg:nanms step:8/1480 train_time:29661ms step_avg:nanms step:9/1480 train_time:29803ms step_avg:nanms step:10/1480 train_time:29946ms step_avg:nanms step:11/1480 train_time:144ms step_avg:nanms step:12/1480 train_time:282ms step_avg:nanms step:13/1480 train_time:423ms step_avg:141.11ms step:14/1480 train_time:565ms step_avg:141.28ms step:15/1480 train_time:707ms step_avg:141.42ms step:16/1480 train_time:849ms step_avg:141.53ms step:17/1480 train_time:993ms step_avg:141.93ms step:18/1480 train_time:1137ms step_avg:142.11ms step:19/1480 train_time:1280ms step_avg:142.26ms step:20/1480 train_time:1421ms step_avg:142.15ms step:21/1480 train_time:1562ms step_avg:142.01ms step:22/1480 train_time:1705ms step_avg:142.05ms step:23/1480 train_time:1846ms step_avg:142.02ms step:24/1480 train_time:1989ms step_avg:142.08ms step:25/1480 train_time:2132ms step_avg:142.15ms step:26/1480 train_time:2277ms step_avg:142.29ms step:27/1480 train_time:2420ms step_avg:142.34ms step:28/1480 train_time:2561ms step_avg:142.30ms 
step:29/1480 train_time:2704ms step_avg:142.32ms step:30/1480 train_time:2845ms step_avg:142.23ms step:31/1480 train_time:2987ms step_avg:142.26ms step:32/1480 train_time:3132ms step_avg:142.35ms step:33/1480 train_time:3276ms step_avg:142.45ms step:34/1480 train_time:3419ms step_avg:142.48ms step:35/1480 train_time:3563ms step_avg:142.51ms step:36/1480 train_time:3706ms step_avg:142.54ms step:37/1480 train_time:3849ms step_avg:142.55ms step:38/1480 train_time:3991ms step_avg:142.54ms step:39/1480 train_time:4136ms step_avg:142.62ms step:40/1480 train_time:4282ms step_avg:142.72ms step:41/1480 train_time:4423ms step_avg:142.67ms step:42/1480 train_time:4564ms step_avg:142.64ms step:43/1480 train_time:4708ms step_avg:142.67ms step:44/1480 train_time:4849ms step_avg:142.62ms step:45/1480 train_time:4992ms step_avg:142.62ms step:46/1480 train_time:5135ms step_avg:142.63ms step:47/1480 train_time:5278ms step_avg:142.66ms step:48/1480 train_time:5421ms step_avg:142.67ms step:49/1480 train_time:5563ms step_avg:142.65ms step:50/1480 train_time:5707ms step_avg:142.66ms step:51/1480 train_time:5848ms step_avg:142.64ms step:52/1480 train_time:5992ms step_avg:142.67ms step:53/1480 train_time:6135ms step_avg:142.66ms step:54/1480 train_time:6278ms step_avg:142.67ms step:55/1480 train_time:6420ms step_avg:142.68ms step:56/1480 train_time:6564ms step_avg:142.69ms step:57/1480 train_time:6704ms step_avg:142.64ms step:58/1480 train_time:6846ms step_avg:142.62ms step:59/1480 train_time:6988ms step_avg:142.62ms step:60/1480 train_time:7131ms step_avg:142.62ms step:61/1480 train_time:7275ms step_avg:142.65ms step:62/1480 train_time:7417ms step_avg:142.64ms step:63/1480 train_time:7560ms step_avg:142.64ms step:64/1480 train_time:7702ms step_avg:142.64ms step:65/1480 train_time:7845ms step_avg:142.63ms step:66/1480 train_time:7985ms step_avg:142.59ms step:67/1480 train_time:8127ms step_avg:142.57ms step:68/1480 train_time:8271ms step_avg:142.60ms step:69/1480 train_time:8414ms 
step_avg:142.61ms step:70/1480 train_time:8557ms step_avg:142.62ms step:71/1480 train_time:8701ms step_avg:142.64ms step:72/1480 train_time:8843ms step_avg:142.62ms step:73/1480 train_time:8986ms step_avg:142.64ms step:74/1480 train_time:9127ms step_avg:142.61ms step:75/1480 train_time:9270ms step_avg:142.61ms step:76/1480 train_time:9414ms step_avg:142.64ms step:77/1480 train_time:9558ms step_avg:142.66ms step:78/1480 train_time:9701ms step_avg:142.66ms step:79/1480 train_time:10230ms step_avg:148.26ms step:80/1480 train_time:10745ms step_avg:153.51ms step:81/1480 train_time:10848ms step_avg:152.79ms step:82/1480 train_time:10991ms step_avg:152.66ms step:83/1480 train_time:11134ms step_avg:152.53ms step:84/1480 train_time:11277ms step_avg:152.39ms step:85/1480 train_time:11420ms step_avg:152.27ms step:86/1480 train_time:11564ms step_avg:152.15ms step:87/1480 train_time:11705ms step_avg:152.01ms step:88/1480 train_time:12233ms step_avg:156.83ms step:89/1480 train_time:12336ms step_avg:156.16ms step:90/1480 train_time:12480ms step_avg:155.99ms step:91/1480 train_time:12620ms step_avg:155.81ms step:92/1480 train_time:12762ms step_avg:155.64ms step:93/1480 train_time:12903ms step_avg:155.46ms step:94/1480 train_time:13045ms step_avg:155.30ms step:95/1480 train_time:13188ms step_avg:155.16ms step:96/1480 train_time:13334ms step_avg:155.04ms step:97/1480 train_time:13479ms step_avg:154.93ms step:98/1480 train_time:13621ms step_avg:154.79ms step:99/1480 train_time:13764ms step_avg:154.65ms step:100/1480 train_time:13906ms step_avg:154.51ms step:101/1480 train_time:14052ms step_avg:154.42ms step:102/1480 train_time:14191ms step_avg:154.25ms step:103/1480 train_time:14334ms step_avg:154.13ms step:104/1480 train_time:14478ms step_avg:154.02ms step:105/1480 train_time:14619ms step_avg:153.89ms step:106/1480 train_time:14763ms step_avg:153.78ms step:107/1480 train_time:14904ms step_avg:153.65ms step:108/1480 train_time:15047ms step_avg:153.54ms step:109/1480 
train_time:15189ms step_avg:153.43ms step:110/1480 train_time:15333ms step_avg:153.33ms step:111/1480 train_time:15480ms step_avg:153.27ms step:112/1480 train_time:15625ms step_avg:153.19ms step:113/1480 train_time:15770ms step_avg:153.11ms step:114/1480 train_time:15916ms step_avg:153.04ms step:115/1480 train_time:16062ms step_avg:152.97ms step:116/1480 train_time:16207ms step_avg:152.89ms step:117/1480 train_time:16352ms step_avg:152.82ms step:118/1480 train_time:16500ms step_avg:152.78ms step:119/1480 train_time:16645ms step_avg:152.71ms step:120/1480 train_time:16791ms step_avg:152.64ms step:121/1480 train_time:16937ms step_avg:152.59ms step:122/1480 train_time:17083ms step_avg:152.53ms step:123/1480 train_time:17227ms step_avg:152.45ms step:124/1480 train_time:17372ms step_avg:152.39ms step:125/1480 train_time:17518ms step_avg:152.33ms step:125/1480 val_loss:4.4145 train_time:17582ms step_avg:152.89ms step:126/1480 train_time:17672ms step_avg:152.35ms step:127/1480 train_time:17821ms step_avg:152.32ms step:128/1480 train_time:17967ms step_avg:152.26ms step:129/1480 train_time:18112ms step_avg:152.20ms step:130/1480 train_time:18257ms step_avg:152.14ms step:131/1480 train_time:18403ms step_avg:152.09ms step:132/1480 train_time:18548ms step_avg:152.03ms step:133/1480 train_time:18695ms step_avg:151.99ms step:134/1480 train_time:18842ms step_avg:151.95ms step:135/1480 train_time:18988ms step_avg:151.90ms step:136/1480 train_time:19132ms step_avg:151.84ms step:137/1480 train_time:19278ms step_avg:151.79ms step:138/1480 train_time:19424ms step_avg:151.75ms step:139/1480 train_time:19568ms step_avg:151.69ms step:140/1480 train_time:19715ms step_avg:151.65ms step:141/1480 train_time:19862ms step_avg:151.62ms step:142/1480 train_time:20008ms step_avg:151.58ms step:143/1480 train_time:20153ms step_avg:151.53ms step:144/1480 train_time:20299ms step_avg:151.48ms step:145/1480 train_time:20446ms step_avg:151.45ms step:146/1480 train_time:20590ms step_avg:151.40ms 
step:147/1480 train_time:20735ms step_avg:151.35ms step:148/1480 train_time:20883ms step_avg:151.32ms step:149/1480 train_time:21028ms step_avg:151.28ms step:150/1480 train_time:21172ms step_avg:151.23ms step:151/1480 train_time:21319ms step_avg:151.20ms step:152/1480 train_time:21465ms step_avg:151.16ms step:153/1480 train_time:21611ms step_avg:151.12ms step:154/1480 train_time:21756ms step_avg:151.09ms step:155/1480 train_time:21904ms step_avg:151.06ms step:156/1480 train_time:22049ms step_avg:151.02ms step:157/1480 train_time:22195ms step_avg:150.99ms step:158/1480 train_time:22342ms step_avg:150.96ms step:159/1480 train_time:22488ms step_avg:150.92ms step:160/1480 train_time:22632ms step_avg:150.88ms step:161/1480 train_time:22779ms step_avg:150.86ms step:162/1480 train_time:22926ms step_avg:150.83ms step:163/1480 train_time:23071ms step_avg:150.79ms step:164/1480 train_time:23218ms step_avg:150.77ms step:165/1480 train_time:23364ms step_avg:150.74ms step:166/1480 train_time:23510ms step_avg:150.70ms step:167/1480 train_time:23655ms step_avg:150.67ms step:168/1480 train_time:23802ms step_avg:150.65ms step:169/1480 train_time:23947ms step_avg:150.61ms step:170/1480 train_time:24093ms step_avg:150.58ms step:171/1480 train_time:24239ms step_avg:150.55ms step:172/1480 train_time:24385ms step_avg:150.53ms step:173/1480 train_time:24530ms step_avg:150.49ms step:174/1480 train_time:24676ms step_avg:150.46ms step:175/1480 train_time:24823ms step_avg:150.44ms step:176/1480 train_time:24968ms step_avg:150.41ms step:177/1480 train_time:25114ms step_avg:150.38ms step:178/1480 train_time:25261ms step_avg:150.36ms step:179/1480 train_time:25407ms step_avg:150.34ms step:180/1480 train_time:25936ms step_avg:152.56ms step:181/1480 train_time:26036ms step_avg:152.26ms step:182/1480 train_time:26183ms step_avg:152.23ms step:183/1480 train_time:26328ms step_avg:152.19ms step:184/1480 train_time:26473ms step_avg:152.14ms step:185/1480 train_time:26619ms step_avg:152.11ms 
step:186/1480 train_time:26765ms step_avg:152.07ms step:187/1480 train_time:26910ms step_avg:152.04ms step:188/1480 train_time:27057ms step_avg:152.01ms step:189/1480 train_time:27225ms step_avg:152.09ms step:190/1480 train_time:27349ms step_avg:151.94ms step:191/1480 train_time:27495ms step_avg:151.91ms step:192/1480 train_time:27642ms step_avg:151.88ms step:193/1480 train_time:27787ms step_avg:151.84ms step:194/1480 train_time:27932ms step_avg:151.80ms step:195/1480 train_time:28078ms step_avg:151.77ms step:196/1480 train_time:28225ms step_avg:151.75ms step:197/1480 train_time:28370ms step_avg:151.71ms step:198/1480 train_time:28516ms step_avg:151.68ms step:199/1480 train_time:28663ms step_avg:151.66ms step:200/1480 train_time:28809ms step_avg:151.63ms step:201/1480 train_time:28956ms step_avg:151.60ms step:202/1480 train_time:29101ms step_avg:151.57ms step:203/1480 train_time:29246ms step_avg:151.54ms step:204/1480 train_time:29391ms step_avg:151.50ms step:205/1480 train_time:29537ms step_avg:151.47ms step:206/1480 train_time:29683ms step_avg:151.45ms step:207/1480 train_time:29828ms step_avg:151.41ms step:208/1480 train_time:29973ms step_avg:151.38ms step:209/1480 train_time:30120ms step_avg:151.36ms step:210/1480 train_time:30267ms step_avg:151.34ms step:211/1480 train_time:30412ms step_avg:151.31ms step:212/1480 train_time:30558ms step_avg:151.28ms step:213/1480 train_time:30705ms step_avg:151.25ms step:214/1480 train_time:30849ms step_avg:151.22ms step:215/1480 train_time:30994ms step_avg:151.19ms step:216/1480 train_time:31143ms step_avg:151.18ms step:217/1480 train_time:31288ms step_avg:151.15ms step:218/1480 train_time:31434ms step_avg:151.13ms step:219/1480 train_time:31582ms step_avg:151.11ms step:220/1480 train_time:31727ms step_avg:151.08ms step:221/1480 train_time:32262ms step_avg:152.90ms step:222/1480 train_time:32370ms step_avg:152.69ms step:223/1480 train_time:32520ms step_avg:152.67ms step:224/1480 train_time:32668ms step_avg:152.65ms 
step:225/1480 train_time:32816ms step_avg:152.63ms step:226/1480 train_time:32964ms step_avg:152.61ms step:227/1480 train_time:33111ms step_avg:152.59ms step:228/1480 train_time:33259ms step_avg:152.57ms step:229/1480 train_time:33408ms step_avg:152.55ms step:230/1480 train_time:33558ms step_avg:152.54ms step:231/1480 train_time:33707ms step_avg:152.52ms step:232/1480 train_time:33854ms step_avg:152.50ms step:233/1480 train_time:34003ms step_avg:152.48ms step:234/1480 train_time:34150ms step_avg:152.46ms step:235/1480 train_time:34299ms step_avg:152.44ms step:236/1480 train_time:34448ms step_avg:152.42ms step:237/1480 train_time:34596ms step_avg:152.41ms step:238/1480 train_time:34745ms step_avg:152.39ms step:239/1480 train_time:34893ms step_avg:152.37ms step:240/1480 train_time:35041ms step_avg:152.35ms step:241/1480 train_time:35189ms step_avg:152.33ms step:242/1480 train_time:35337ms step_avg:152.31ms step:243/1480 train_time:35486ms step_avg:152.30ms step:244/1480 train_time:35633ms step_avg:152.28ms step:245/1480 train_time:35783ms step_avg:152.27ms step:246/1480 train_time:35930ms step_avg:152.25ms step:247/1480 train_time:36079ms step_avg:152.23ms step:248/1480 train_time:36228ms step_avg:152.22ms step:249/1480 train_time:36376ms step_avg:152.20ms step:250/1480 train_time:36526ms step_avg:152.19ms step:250/1480 val_loss:3.9950 train_time:36592ms step_avg:152.47ms step:251/1480 train_time:36682ms step_avg:152.21ms step:252/1480 train_time:36833ms step_avg:152.20ms step:253/1480 train_time:36981ms step_avg:152.19ms step:254/1480 train_time:37130ms step_avg:152.17ms step:255/1480 train_time:37278ms step_avg:152.15ms step:256/1480 train_time:37426ms step_avg:152.14ms step:257/1480 train_time:37573ms step_avg:152.12ms step:258/1480 train_time:37722ms step_avg:152.10ms step:259/1480 train_time:37872ms step_avg:152.10ms step:260/1480 train_time:38022ms step_avg:152.09ms step:261/1480 train_time:38171ms step_avg:152.08ms step:262/1480 train_time:38318ms 
step_avg:152.06ms step:263/1480 train_time:38467ms step_avg:152.05ms step:264/1480 train_time:38614ms step_avg:152.02ms step:265/1480 train_time:38764ms step_avg:152.01ms step:266/1480 train_time:38912ms step_avg:152.00ms step:267/1480 train_time:39060ms step_avg:151.99ms step:268/1480 train_time:39209ms step_avg:151.97ms step:269/1480 train_time:39357ms step_avg:151.96ms step:270/1480 train_time:39506ms step_avg:151.95ms step:271/1480 train_time:39653ms step_avg:151.93ms step:272/1480 train_time:39802ms step_avg:151.92ms step:273/1480 train_time:39951ms step_avg:151.91ms step:274/1480 train_time:40098ms step_avg:151.89ms step:275/1480 train_time:40248ms step_avg:151.88ms step:276/1480 train_time:40395ms step_avg:151.86ms step:277/1480 train_time:40545ms step_avg:151.85ms step:278/1480 train_time:40693ms step_avg:151.84ms step:279/1480 train_time:40843ms step_avg:151.83ms step:280/1480 train_time:40992ms step_avg:151.82ms step:281/1480 train_time:41140ms step_avg:151.81ms step:282/1480 train_time:41289ms step_avg:151.80ms step:283/1480 train_time:41438ms step_avg:151.79ms step:284/1480 train_time:41588ms step_avg:151.78ms step:285/1480 train_time:41735ms step_avg:151.76ms step:286/1480 train_time:41884ms step_avg:151.75ms step:287/1480 train_time:42032ms step_avg:151.74ms step:288/1480 train_time:42180ms step_avg:151.73ms step:289/1480 train_time:42329ms step_avg:151.72ms step:290/1480 train_time:42477ms step_avg:151.70ms step:291/1480 train_time:42627ms step_avg:151.70ms step:292/1480 train_time:42775ms step_avg:151.68ms step:293/1480 train_time:42924ms step_avg:151.67ms step:294/1480 train_time:43072ms step_avg:151.66ms step:295/1480 train_time:43220ms step_avg:151.65ms step:296/1480 train_time:43370ms step_avg:151.64ms step:297/1480 train_time:43518ms step_avg:151.63ms step:298/1480 train_time:43667ms step_avg:151.62ms step:299/1480 train_time:43815ms step_avg:151.61ms step:300/1480 train_time:43964ms step_avg:151.60ms step:301/1480 train_time:44117ms 
step_avg:151.60ms step:302/1480 train_time:44260ms step_avg:151.58ms step:303/1480 train_time:44410ms step_avg:151.57ms step:304/1480 train_time:44557ms step_avg:151.56ms step:305/1480 train_time:44708ms step_avg:151.55ms step:306/1480 train_time:44855ms step_avg:151.54ms step:307/1480 train_time:45004ms step_avg:151.53ms step:308/1480 train_time:45153ms step_avg:151.52ms step:309/1480 train_time:45301ms step_avg:151.51ms step:310/1480 train_time:45450ms step_avg:151.50ms step:311/1480 train_time:45598ms step_avg:151.49ms step:312/1480 train_time:45748ms step_avg:151.48ms step:313/1480 train_time:45896ms step_avg:151.47ms step:314/1480 train_time:46045ms step_avg:151.46ms step:315/1480 train_time:46192ms step_avg:151.45ms step:316/1480 train_time:46341ms step_avg:151.44ms step:317/1480 train_time:46490ms step_avg:151.43ms step:318/1480 train_time:46640ms step_avg:151.43ms step:319/1480 train_time:46789ms step_avg:151.42ms step:320/1480 train_time:46937ms step_avg:151.41ms step:321/1480 train_time:47086ms step_avg:151.40ms step:322/1480 train_time:47234ms step_avg:151.39ms step:323/1480 train_time:47383ms step_avg:151.38ms step:324/1480 train_time:47532ms step_avg:151.38ms step:325/1480 train_time:47680ms step_avg:151.36ms step:326/1480 train_time:47829ms step_avg:151.36ms step:327/1480 train_time:47977ms step_avg:151.35ms step:328/1480 train_time:48127ms step_avg:151.34ms step:329/1480 train_time:48274ms step_avg:151.33ms step:330/1480 train_time:48425ms step_avg:151.33ms step:331/1480 train_time:48575ms step_avg:151.32ms step:332/1480 train_time:48727ms step_avg:151.33ms step:333/1480 train_time:48876ms step_avg:151.32ms step:334/1480 train_time:49027ms step_avg:151.32ms step:335/1480 train_time:49177ms step_avg:151.31ms step:336/1480 train_time:49329ms step_avg:151.31ms step:337/1480 train_time:49479ms step_avg:151.31ms step:338/1480 train_time:49630ms step_avg:151.31ms step:339/1480 train_time:49780ms step_avg:151.31ms step:340/1480 train_time:49931ms 
step_avg:151.31ms step:341/1480 train_time:50081ms step_avg:151.30ms step:342/1480 train_time:50232ms step_avg:151.30ms step:343/1480 train_time:50384ms step_avg:151.30ms step:344/1480 train_time:50534ms step_avg:151.30ms step:345/1480 train_time:50686ms step_avg:151.30ms step:346/1480 train_time:50837ms step_avg:151.30ms step:347/1480 train_time:50988ms step_avg:151.30ms step:348/1480 train_time:51138ms step_avg:151.30ms step:349/1480 train_time:51289ms step_avg:151.29ms step:350/1480 train_time:51440ms step_avg:151.29ms step:351/1480 train_time:51590ms step_avg:151.29ms step:352/1480 train_time:51742ms step_avg:151.29ms step:353/1480 train_time:51892ms step_avg:151.29ms step:354/1480 train_time:52044ms step_avg:151.29ms step:355/1480 train_time:52194ms step_avg:151.29ms step:356/1480 train_time:52346ms step_avg:151.29ms step:357/1480 train_time:52497ms step_avg:151.29ms step:358/1480 train_time:52648ms step_avg:151.29ms step:359/1480 train_time:52798ms step_avg:151.28ms step:360/1480 train_time:52950ms step_avg:151.29ms step:361/1480 train_time:53101ms step_avg:151.29ms step:362/1480 train_time:53253ms step_avg:151.29ms step:363/1480 train_time:53404ms step_avg:151.29ms step:364/1480 train_time:53555ms step_avg:151.29ms step:365/1480 train_time:53706ms step_avg:151.29ms step:366/1480 train_time:53857ms step_avg:151.28ms step:367/1480 train_time:54009ms step_avg:151.28ms step:368/1480 train_time:54158ms step_avg:151.28ms step:369/1480 train_time:54311ms step_avg:151.28ms step:370/1480 train_time:54461ms step_avg:151.28ms step:371/1480 train_time:54612ms step_avg:151.28ms step:372/1480 train_time:54761ms step_avg:151.27ms step:373/1480 train_time:54912ms step_avg:151.27ms step:374/1480 train_time:55064ms step_avg:151.27ms step:375/1480 train_time:55214ms step_avg:151.27ms step:375/1480 val_loss:3.8072 train_time:55283ms step_avg:151.46ms step:376/1480 train_time:55377ms step_avg:151.30ms step:377/1480 train_time:55525ms step_avg:151.29ms step:378/1480 
train_time:55678ms step_avg:151.30ms step:379/1480 train_time:55845ms step_avg:151.34ms step:380/1480 train_time:55980ms step_avg:151.30ms step:381/1480 train_time:56130ms step_avg:151.29ms step:382/1480 train_time:56281ms step_avg:151.29ms step:383/1480 train_time:56433ms step_avg:151.29ms step:384/1480 train_time:56583ms step_avg:151.29ms step:385/1480 train_time:56735ms step_avg:151.29ms step:386/1480 train_time:56885ms step_avg:151.29ms step:387/1480 train_time:57036ms step_avg:151.29ms step:388/1480 train_time:57186ms step_avg:151.29ms step:389/1480 train_time:57338ms step_avg:151.29ms step:390/1480 train_time:57489ms step_avg:151.29ms step:391/1480 train_time:57640ms step_avg:151.29ms step:392/1480 train_time:57792ms step_avg:151.29ms step:393/1480 train_time:57943ms step_avg:151.29ms step:394/1480 train_time:58094ms step_avg:151.29ms step:395/1480 train_time:58244ms step_avg:151.28ms step:396/1480 train_time:58395ms step_avg:151.28ms step:397/1480 train_time:58545ms step_avg:151.28ms step:398/1480 train_time:58696ms step_avg:151.28ms step:399/1480 train_time:58846ms step_avg:151.28ms step:400/1480 train_time:58998ms step_avg:151.28ms step:401/1480 train_time:59149ms step_avg:151.28ms step:402/1480 train_time:59301ms step_avg:151.28ms step:403/1480 train_time:59452ms step_avg:151.28ms step:404/1480 train_time:59604ms step_avg:151.28ms step:405/1480 train_time:59755ms step_avg:151.28ms step:406/1480 train_time:59906ms step_avg:151.28ms step:407/1480 train_time:60059ms step_avg:151.28ms step:408/1480 train_time:60209ms step_avg:151.28ms step:409/1480 train_time:60361ms step_avg:151.28ms step:410/1480 train_time:60511ms step_avg:151.28ms step:411/1480 train_time:60661ms step_avg:151.28ms step:412/1480 train_time:60812ms step_avg:151.27ms step:413/1480 train_time:60962ms step_avg:151.27ms step:414/1480 train_time:61114ms step_avg:151.27ms step:415/1480 train_time:61265ms step_avg:151.27ms step:416/1480 train_time:61417ms step_avg:151.27ms step:417/1480 
train_time:61566ms step_avg:151.27ms step:418/1480 train_time:61717ms step_avg:151.27ms step:419/1480 train_time:61867ms step_avg:151.26ms step:420/1480 train_time:62018ms step_avg:151.26ms step:421/1480 train_time:62168ms step_avg:151.26ms step:422/1480 train_time:62319ms step_avg:151.26ms step:423/1480 train_time:62469ms step_avg:151.26ms step:424/1480 train_time:62620ms step_avg:151.26ms step:425/1480 train_time:62771ms step_avg:151.25ms step:426/1480 train_time:62922ms step_avg:151.25ms step:427/1480 train_time:63073ms step_avg:151.25ms step:428/1480 train_time:63223ms step_avg:151.25ms step:429/1480 train_time:63375ms step_avg:151.25ms step:430/1480 train_time:63526ms step_avg:151.25ms step:431/1480 train_time:63678ms step_avg:151.25ms step:432/1480 train_time:63828ms step_avg:151.25ms step:433/1480 train_time:63980ms step_avg:151.25ms step:434/1480 train_time:64131ms step_avg:151.25ms step:435/1480 train_time:64282ms step_avg:151.25ms step:436/1480 train_time:64434ms step_avg:151.25ms step:437/1480 train_time:64584ms step_avg:151.25ms step:438/1480 train_time:64735ms step_avg:151.25ms step:439/1480 train_time:64886ms step_avg:151.25ms step:440/1480 train_time:65038ms step_avg:151.25ms step:441/1480 train_time:65190ms step_avg:151.25ms step:442/1480 train_time:65343ms step_avg:151.26ms step:443/1480 train_time:65495ms step_avg:151.26ms step:444/1480 train_time:65647ms step_avg:151.26ms step:445/1480 train_time:65800ms step_avg:151.26ms step:446/1480 train_time:65953ms step_avg:151.27ms step:447/1480 train_time:66109ms step_avg:151.28ms step:448/1480 train_time:66261ms step_avg:151.28ms step:449/1480 train_time:66414ms step_avg:151.29ms step:450/1480 train_time:66567ms step_avg:151.29ms step:451/1480 train_time:66720ms step_avg:151.29ms step:452/1480 train_time:66872ms step_avg:151.29ms step:453/1480 train_time:67024ms step_avg:151.30ms step:454/1480 train_time:67178ms step_avg:151.30ms step:455/1480 train_time:67331ms step_avg:151.31ms step:456/1480 
train_time:67484ms step_avg:151.31ms step:457/1480 train_time:67637ms step_avg:151.31ms step:458/1480 train_time:67789ms step_avg:151.31ms step:459/1480 train_time:67942ms step_avg:151.32ms step:460/1480 train_time:68095ms step_avg:151.32ms step:461/1480 train_time:68247ms step_avg:151.32ms step:462/1480 train_time:68400ms step_avg:151.33ms step:463/1480 train_time:68553ms step_avg:151.33ms step:464/1480 train_time:68707ms step_avg:151.34ms step:465/1480 train_time:68860ms step_avg:151.34ms step:466/1480 train_time:69012ms step_avg:151.34ms step:467/1480 train_time:69165ms step_avg:151.35ms step:468/1480 train_time:69318ms step_avg:151.35ms step:469/1480 train_time:69469ms step_avg:151.35ms step:470/1480 train_time:69622ms step_avg:151.35ms step:471/1480 train_time:69776ms step_avg:151.36ms step:472/1480 train_time:69928ms step_avg:151.36ms step:473/1480 train_time:70082ms step_avg:151.36ms step:474/1480 train_time:70234ms step_avg:151.37ms step:475/1480 train_time:70386ms step_avg:151.37ms step:476/1480 train_time:70540ms step_avg:151.37ms step:477/1480 train_time:70692ms step_avg:151.38ms step:478/1480 train_time:70846ms step_avg:151.38ms step:479/1480 train_time:71000ms step_avg:151.38ms step:480/1480 train_time:71152ms step_avg:151.39ms step:481/1480 train_time:71305ms step_avg:151.39ms step:482/1480 train_time:71459ms step_avg:151.40ms step:483/1480 train_time:71611ms step_avg:151.40ms step:484/1480 train_time:71765ms step_avg:151.40ms step:485/1480 train_time:71919ms step_avg:151.41ms step:486/1480 train_time:72071ms step_avg:151.41ms step:487/1480 train_time:72224ms step_avg:151.41ms step:488/1480 train_time:72377ms step_avg:151.42ms step:489/1480 train_time:72530ms step_avg:151.42ms step:490/1480 train_time:72683ms step_avg:151.42ms step:491/1480 train_time:72836ms step_avg:151.43ms step:492/1480 train_time:72989ms step_avg:151.43ms step:493/1480 train_time:73142ms step_avg:151.43ms step:494/1480 train_time:73295ms step_avg:151.44ms step:495/1480 
train_time:73448ms step_avg:151.44ms step:496/1480 train_time:73601ms step_avg:151.44ms step:497/1480 train_time:73755ms step_avg:151.45ms step:498/1480 train_time:73908ms step_avg:151.45ms step:499/1480 train_time:74062ms step_avg:151.46ms step:500/1480 train_time:74215ms step_avg:151.46ms step:500/1480 val_loss:3.6880 train_time:74285ms step_avg:151.60ms step:501/1480 train_time:74376ms step_avg:151.48ms step:502/1480 train_time:74528ms step_avg:151.48ms step:503/1480 train_time:74681ms step_avg:151.48ms step:504/1480 train_time:74834ms step_avg:151.48ms step:505/1480 train_time:74985ms step_avg:151.49ms step:506/1480 train_time:75138ms step_avg:151.49ms step:507/1480 train_time:75291ms step_avg:151.49ms step:508/1480 train_time:75444ms step_avg:151.49ms step:509/1480 train_time:75597ms step_avg:151.50ms step:510/1480 train_time:75752ms step_avg:151.50ms step:511/1480 train_time:75905ms step_avg:151.51ms step:512/1480 train_time:76058ms step_avg:151.51ms step:513/1480 train_time:76211ms step_avg:151.51ms step:514/1480 train_time:76364ms step_avg:151.52ms step:515/1480 train_time:76518ms step_avg:151.52ms step:516/1480 train_time:76671ms step_avg:151.52ms step:517/1480 train_time:76825ms step_avg:151.53ms step:518/1480 train_time:76978ms step_avg:151.53ms step:519/1480 train_time:77131ms step_avg:151.53ms step:520/1480 train_time:77284ms step_avg:151.54ms step:521/1480 train_time:77437ms step_avg:151.54ms step:522/1480 train_time:77589ms step_avg:151.54ms step:523/1480 train_time:77742ms step_avg:151.54ms step:524/1480 train_time:77896ms step_avg:151.55ms step:525/1480 train_time:78050ms step_avg:151.55ms step:526/1480 train_time:78203ms step_avg:151.56ms step:527/1480 train_time:78357ms step_avg:151.56ms step:528/1480 train_time:78508ms step_avg:151.56ms step:529/1480 train_time:78661ms step_avg:151.56ms step:530/1480 train_time:78814ms step_avg:151.57ms step:531/1480 train_time:78967ms step_avg:151.57ms step:532/1480 train_time:79120ms step_avg:151.57ms 
step:533/1480 train_time:79273ms step_avg:151.57ms step:534/1480 train_time:79426ms step_avg:151.58ms step:535/1480 train_time:79579ms step_avg:151.58ms step:536/1480 train_time:79732ms step_avg:151.58ms step:537/1480 train_time:79884ms step_avg:151.58ms step:538/1480 train_time:80039ms step_avg:151.59ms step:539/1480 train_time:80192ms step_avg:151.59ms step:540/1480 train_time:80346ms step_avg:151.60ms step:541/1480 train_time:80498ms step_avg:151.60ms step:542/1480 train_time:80652ms step_avg:151.60ms step:543/1480 train_time:80805ms step_avg:151.60ms step:544/1480 train_time:80958ms step_avg:151.61ms step:545/1480 train_time:81110ms step_avg:151.61ms step:546/1480 train_time:81264ms step_avg:151.61ms step:547/1480 train_time:81417ms step_avg:151.61ms step:548/1480 train_time:81569ms step_avg:151.62ms step:549/1480 train_time:81722ms step_avg:151.62ms step:550/1480 train_time:81877ms step_avg:151.62ms step:551/1480 train_time:82031ms step_avg:151.63ms step:552/1480 train_time:82187ms step_avg:151.64ms step:553/1480 train_time:82342ms step_avg:151.64ms step:554/1480 train_time:82496ms step_avg:151.65ms step:555/1480 train_time:82651ms step_avg:151.65ms step:556/1480 train_time:82806ms step_avg:151.66ms step:557/1480 train_time:82961ms step_avg:151.66ms step:558/1480 train_time:83115ms step_avg:151.67ms step:559/1480 train_time:83269ms step_avg:151.67ms step:560/1480 train_time:83423ms step_avg:151.68ms step:561/1480 train_time:83578ms step_avg:151.68ms step:562/1480 train_time:83733ms step_avg:151.69ms step:563/1480 train_time:83888ms step_avg:151.70ms step:564/1480 train_time:84042ms step_avg:151.70ms step:565/1480 train_time:84196ms step_avg:151.70ms step:566/1480 train_time:84353ms step_avg:151.71ms step:567/1480 train_time:84508ms step_avg:151.72ms step:568/1480 train_time:84662ms step_avg:151.72ms step:569/1480 train_time:84832ms step_avg:151.76ms step:570/1480 train_time:84972ms step_avg:151.74ms step:571/1480 train_time:85127ms step_avg:151.74ms 
step:572/1480 train_time:85282ms step_avg:151.75ms step:573/1480 train_time:85437ms step_avg:151.75ms step:574/1480 train_time:85593ms step_avg:151.76ms step:575/1480 train_time:85749ms step_avg:151.77ms step:576/1480 train_time:85903ms step_avg:151.77ms step:577/1480 train_time:86058ms step_avg:151.78ms step:578/1480 train_time:86212ms step_avg:151.78ms step:579/1480 train_time:86366ms step_avg:151.79ms step:580/1480 train_time:86521ms step_avg:151.79ms step:581/1480 train_time:86676ms step_avg:151.80ms step:582/1480 train_time:86830ms step_avg:151.80ms step:583/1480 train_time:86985ms step_avg:151.81ms step:584/1480 train_time:87140ms step_avg:151.81ms step:585/1480 train_time:87295ms step_avg:151.82ms step:586/1480 train_time:87451ms step_avg:151.82ms step:587/1480 train_time:87607ms step_avg:151.83ms step:588/1480 train_time:87760ms step_avg:151.83ms step:589/1480 train_time:87916ms step_avg:151.84ms step:590/1480 train_time:88070ms step_avg:151.85ms step:591/1480 train_time:88226ms step_avg:151.85ms step:592/1480 train_time:88380ms step_avg:151.86ms step:593/1480 train_time:88536ms step_avg:151.86ms step:594/1480 train_time:88691ms step_avg:151.87ms step:595/1480 train_time:88847ms step_avg:151.88ms step:596/1480 train_time:89003ms step_avg:151.88ms step:597/1480 train_time:89159ms step_avg:151.89ms step:598/1480 train_time:89313ms step_avg:151.89ms step:599/1480 train_time:89469ms step_avg:151.90ms step:600/1480 train_time:89625ms step_avg:151.91ms step:601/1480 train_time:89780ms step_avg:151.91ms step:602/1480 train_time:89935ms step_avg:151.92ms step:603/1480 train_time:90089ms step_avg:151.92ms step:604/1480 train_time:90244ms step_avg:151.93ms step:605/1480 train_time:90398ms step_avg:151.93ms step:606/1480 train_time:90554ms step_avg:151.94ms step:607/1480 train_time:90709ms step_avg:151.94ms step:608/1480 train_time:90864ms step_avg:151.95ms step:609/1480 train_time:91019ms step_avg:151.95ms step:610/1480 train_time:91173ms step_avg:151.96ms 
step:611/1480 train_time:91327ms step_avg:151.96ms step:612/1480 train_time:91482ms step_avg:151.96ms step:613/1480 train_time:91637ms step_avg:151.97ms step:614/1480 train_time:91792ms step_avg:151.97ms step:615/1480 train_time:91946ms step_avg:151.98ms step:616/1480 train_time:92099ms step_avg:151.98ms step:617/1480 train_time:92255ms step_avg:151.99ms step:618/1480 train_time:92409ms step_avg:151.99ms step:619/1480 train_time:92564ms step_avg:151.99ms step:620/1480 train_time:92719ms step_avg:152.00ms step:621/1480 train_time:92874ms step_avg:152.00ms step:622/1480 train_time:93029ms step_avg:152.01ms step:623/1480 train_time:93184ms step_avg:152.01ms step:624/1480 train_time:93339ms step_avg:152.02ms step:625/1480 train_time:93493ms step_avg:152.02ms step:625/1480 val_loss:3.6066 train_time:93564ms step_avg:152.14ms step:626/1480 train_time:93655ms step_avg:152.04ms step:627/1480 train_time:93809ms step_avg:152.04ms step:628/1480 train_time:93964ms step_avg:152.05ms step:629/1480 train_time:94118ms step_avg:152.05ms step:630/1480 train_time:94273ms step_avg:152.05ms step:631/1480 train_time:94427ms step_avg:152.06ms step:632/1480 train_time:94581ms step_avg:152.06ms step:633/1480 train_time:94738ms step_avg:152.07ms step:634/1480 train_time:94894ms step_avg:152.07ms step:635/1480 train_time:95048ms step_avg:152.08ms step:636/1480 train_time:95204ms step_avg:152.08ms step:637/1480 train_time:95358ms step_avg:152.09ms step:638/1480 train_time:95514ms step_avg:152.09ms step:639/1480 train_time:95668ms step_avg:152.10ms step:640/1480 train_time:95823ms step_avg:152.10ms step:641/1480 train_time:95977ms step_avg:152.10ms step:642/1480 train_time:96132ms step_avg:152.11ms step:643/1480 train_time:96286ms step_avg:152.11ms step:644/1480 train_time:96441ms step_avg:152.12ms step:645/1480 train_time:96595ms step_avg:152.12ms step:646/1480 train_time:96750ms step_avg:152.12ms step:647/1480 train_time:96904ms step_avg:152.13ms step:648/1480 train_time:97063ms 
step_avg:152.14ms step:649/1480 train_time:97218ms step_avg:152.14ms step:650/1480 train_time:97373ms step_avg:152.14ms step:651/1480 train_time:97527ms step_avg:152.15ms step:652/1480 train_time:97682ms step_avg:152.15ms step:653/1480 train_time:97837ms step_avg:152.16ms step:654/1480 train_time:97993ms step_avg:152.16ms step:655/1480 train_time:98147ms step_avg:152.17ms step:656/1480 train_time:98303ms step_avg:152.17ms step:657/1480 train_time:98457ms step_avg:152.17ms step:658/1480 train_time:98612ms step_avg:152.18ms step:659/1480 train_time:98767ms step_avg:152.18ms step:660/1480 train_time:98923ms step_avg:152.19ms step:661/1480 train_time:99079ms step_avg:152.19ms step:662/1480 train_time:99235ms step_avg:152.20ms step:663/1480 train_time:99392ms step_avg:152.21ms step:664/1480 train_time:99547ms step_avg:152.21ms step:665/1480 train_time:99705ms step_avg:152.22ms step:666/1480 train_time:99861ms step_avg:152.23ms step:667/1480 train_time:100018ms step_avg:152.23ms step:668/1480 train_time:100174ms step_avg:152.24ms step:669/1480 train_time:100332ms step_avg:152.25ms step:670/1480 train_time:100487ms step_avg:152.25ms step:671/1480 train_time:100644ms step_avg:152.26ms step:672/1480 train_time:100801ms step_avg:152.27ms step:673/1480 train_time:100956ms step_avg:152.27ms step:674/1480 train_time:101113ms step_avg:152.28ms step:675/1480 train_time:101270ms step_avg:152.29ms step:676/1480 train_time:101427ms step_avg:152.29ms step:677/1480 train_time:101582ms step_avg:152.30ms step:678/1480 train_time:101739ms step_avg:152.30ms step:679/1480 train_time:101894ms step_avg:152.31ms step:680/1480 train_time:102051ms step_avg:152.32ms step:681/1480 train_time:102207ms step_avg:152.32ms step:682/1480 train_time:102364ms step_avg:152.33ms step:683/1480 train_time:102521ms step_avg:152.33ms step:684/1480 train_time:102676ms step_avg:152.34ms step:685/1480 train_time:102833ms step_avg:152.35ms step:686/1480 train_time:102989ms step_avg:152.35ms step:687/1480 
train_time:103145ms step_avg:152.36ms step:688/1480 train_time:103303ms step_avg:152.36ms step:689/1480 train_time:103460ms step_avg:152.37ms step:690/1480 train_time:103617ms step_avg:152.38ms step:691/1480 train_time:103774ms step_avg:152.39ms step:692/1480 train_time:103931ms step_avg:152.39ms step:693/1480 train_time:104086ms step_avg:152.40ms step:694/1480 train_time:104244ms step_avg:152.40ms step:695/1480 train_time:104399ms step_avg:152.41ms step:696/1480 train_time:104554ms step_avg:152.41ms step:697/1480 train_time:104712ms step_avg:152.42ms step:698/1480 train_time:104868ms step_avg:152.43ms step:699/1480 train_time:105025ms step_avg:152.43ms step:700/1480 train_time:105181ms step_avg:152.44ms step:701/1480 train_time:105338ms step_avg:152.44ms step:702/1480 train_time:105494ms step_avg:152.45ms step:703/1480 train_time:105650ms step_avg:152.45ms step:704/1480 train_time:105806ms step_avg:152.46ms step:705/1480 train_time:105963ms step_avg:152.46ms step:706/1480 train_time:106120ms step_avg:152.47ms step:707/1480 train_time:106277ms step_avg:152.48ms step:708/1480 train_time:106432ms step_avg:152.48ms step:709/1480 train_time:106587ms step_avg:152.49ms step:710/1480 train_time:106743ms step_avg:152.49ms step:711/1480 train_time:106899ms step_avg:152.49ms step:712/1480 train_time:107055ms step_avg:152.50ms step:713/1480 train_time:107213ms step_avg:152.51ms step:714/1480 train_time:107369ms step_avg:152.51ms step:715/1480 train_time:107526ms step_avg:152.52ms step:716/1480 train_time:107682ms step_avg:152.52ms step:717/1480 train_time:107839ms step_avg:152.53ms step:718/1480 train_time:107996ms step_avg:152.54ms step:719/1480 train_time:108150ms step_avg:152.54ms step:720/1480 train_time:108309ms step_avg:152.55ms step:721/1480 train_time:108466ms step_avg:152.55ms step:722/1480 train_time:108623ms step_avg:152.56ms step:723/1480 train_time:108778ms step_avg:152.56ms step:724/1480 train_time:108934ms step_avg:152.57ms step:725/1480 train_time:109090ms 
step_avg:152.57ms step:726/1480 train_time:109247ms step_avg:152.58ms step:727/1480 train_time:109404ms step_avg:152.59ms step:728/1480 train_time:109561ms step_avg:152.59ms step:729/1480 train_time:109718ms step_avg:152.60ms step:730/1480 train_time:109876ms step_avg:152.61ms step:731/1480 train_time:110033ms step_avg:152.61ms step:732/1480 train_time:110189ms step_avg:152.62ms step:733/1480 train_time:110346ms step_avg:152.62ms step:734/1480 train_time:110503ms step_avg:152.63ms step:735/1480 train_time:110659ms step_avg:152.63ms step:736/1480 train_time:110814ms step_avg:152.64ms step:737/1480 train_time:110969ms step_avg:152.64ms step:738/1480 train_time:111125ms step_avg:152.64ms step:739/1480 train_time:111283ms step_avg:152.65ms step:740/1480 train_time:111443ms step_avg:152.66ms step:741/1480 train_time:111600ms step_avg:152.67ms step:742/1480 train_time:111756ms step_avg:152.67ms step:743/1480 train_time:111912ms step_avg:152.68ms step:744/1480 train_time:112068ms step_avg:152.68ms step:745/1480 train_time:112227ms step_avg:152.69ms step:746/1480 train_time:112383ms step_avg:152.69ms step:747/1480 train_time:112540ms step_avg:152.70ms step:748/1480 train_time:112701ms step_avg:152.71ms step:749/1480 train_time:112856ms step_avg:152.71ms step:750/1480 train_time:113011ms step_avg:152.72ms step:750/1480 val_loss:3.5506 train_time:113083ms step_avg:152.82ms step:751/1480 train_time:113174ms step_avg:152.73ms step:752/1480 train_time:113331ms step_avg:152.74ms step:753/1480 train_time:113488ms step_avg:152.74ms step:754/1480 train_time:113644ms step_avg:152.75ms step:755/1480 train_time:113801ms step_avg:152.75ms step:756/1480 train_time:113957ms step_avg:152.76ms step:757/1480 train_time:114115ms step_avg:152.76ms step:758/1480 train_time:114272ms step_avg:152.77ms step:759/1480 train_time:114442ms step_avg:152.79ms step:760/1480 train_time:114584ms step_avg:152.78ms step:761/1480 train_time:114740ms step_avg:152.78ms step:762/1480 train_time:114897ms 
step_avg:152.79ms step:763/1480 train_time:115053ms step_avg:152.79ms step:764/1480 train_time:115211ms step_avg:152.80ms step:765/1480 train_time:115370ms step_avg:152.81ms step:766/1480 train_time:115527ms step_avg:152.81ms step:767/1480 train_time:115684ms step_avg:152.82ms step:768/1480 train_time:115841ms step_avg:152.82ms step:769/1480 train_time:115999ms step_avg:152.83ms step:770/1480 train_time:116156ms step_avg:152.84ms step:771/1480 train_time:116314ms step_avg:152.84ms step:772/1480 train_time:116472ms step_avg:152.85ms step:773/1480 train_time:116628ms step_avg:152.86ms step:774/1480 train_time:116786ms step_avg:152.86ms step:775/1480 train_time:116945ms step_avg:152.87ms step:776/1480 train_time:117104ms step_avg:152.88ms step:777/1480 train_time:117264ms step_avg:152.89ms step:778/1480 train_time:117422ms step_avg:152.89ms step:779/1480 train_time:117580ms step_avg:152.90ms step:780/1480 train_time:117740ms step_avg:152.91ms step:781/1480 train_time:117898ms step_avg:152.92ms step:782/1480 train_time:118055ms step_avg:152.92ms step:783/1480 train_time:118211ms step_avg:152.92ms step:784/1480 train_time:118370ms step_avg:152.93ms step:785/1480 train_time:118527ms step_avg:152.94ms step:786/1480 train_time:118686ms step_avg:152.95ms step:787/1480 train_time:118845ms step_avg:152.95ms step:788/1480 train_time:119002ms step_avg:152.96ms step:789/1480 train_time:119158ms step_avg:152.96ms step:790/1480 train_time:119316ms step_avg:152.97ms step:791/1480 train_time:119477ms step_avg:152.98ms step:792/1480 train_time:119634ms step_avg:152.99ms step:793/1480 train_time:119792ms step_avg:152.99ms step:794/1480 train_time:119951ms step_avg:153.00ms step:795/1480 train_time:120112ms step_avg:153.01ms step:796/1480 train_time:120273ms step_avg:153.02ms step:797/1480 train_time:120434ms step_avg:153.03ms step:798/1480 train_time:120593ms step_avg:153.04ms step:799/1480 train_time:120753ms step_avg:153.05ms step:800/1480 train_time:120912ms step_avg:153.05ms 
step:801/1480 train_time:121070ms step_avg:153.06ms step:802/1480 train_time:121229ms step_avg:153.07ms step:803/1480 train_time:121388ms step_avg:153.07ms step:804/1480 train_time:121546ms step_avg:153.08ms step:805/1480 train_time:121705ms step_avg:153.09ms step:806/1480 train_time:121861ms step_avg:153.09ms step:807/1480 train_time:122019ms step_avg:153.10ms step:808/1480 train_time:122177ms step_avg:153.10ms step:809/1480 train_time:122334ms step_avg:153.11ms step:810/1480 train_time:122491ms step_avg:153.11ms step:811/1480 train_time:122649ms step_avg:153.12ms step:812/1480 train_time:122806ms step_avg:153.13ms step:813/1480 train_time:122962ms step_avg:153.13ms step:814/1480 train_time:123120ms step_avg:153.13ms step:815/1480 train_time:123278ms step_avg:153.14ms step:816/1480 train_time:123438ms step_avg:153.15ms step:817/1480 train_time:123595ms step_avg:153.15ms step:818/1480 train_time:123752ms step_avg:153.16ms step:819/1480 train_time:123910ms step_avg:153.16ms step:820/1480 train_time:124067ms step_avg:153.17ms step:821/1480 train_time:124224ms step_avg:153.17ms step:822/1480 train_time:124383ms step_avg:153.18ms step:823/1480 train_time:124540ms step_avg:153.19ms step:824/1480 train_time:124698ms step_avg:153.19ms step:825/1480 train_time:124857ms step_avg:153.20ms step:826/1480 train_time:125016ms step_avg:153.21ms step:827/1480 train_time:125175ms step_avg:153.21ms step:828/1480 train_time:125332ms step_avg:153.22ms step:829/1480 train_time:125492ms step_avg:153.23ms step:830/1480 train_time:125652ms step_avg:153.23ms step:831/1480 train_time:125810ms step_avg:153.24ms step:832/1480 train_time:125969ms step_avg:153.25ms step:833/1480 train_time:126126ms step_avg:153.25ms step:834/1480 train_time:126285ms step_avg:153.26ms step:835/1480 train_time:126441ms step_avg:153.26ms step:836/1480 train_time:126601ms step_avg:153.27ms step:837/1480 train_time:126760ms step_avg:153.28ms step:838/1480 train_time:126918ms step_avg:153.28ms step:839/1480 
train_time:127075ms step_avg:153.29ms step:840/1480 train_time:127232ms step_avg:153.29ms step:841/1480 train_time:127388ms step_avg:153.30ms step:842/1480 train_time:127546ms step_avg:153.30ms step:843/1480 train_time:127704ms step_avg:153.31ms step:844/1480 train_time:127861ms step_avg:153.31ms step:845/1480 train_time:128019ms step_avg:153.32ms step:846/1480 train_time:128176ms step_avg:153.32ms step:847/1480 train_time:128333ms step_avg:153.33ms step:848/1480 train_time:128492ms step_avg:153.33ms step:849/1480 train_time:128650ms step_avg:153.34ms step:850/1480 train_time:128808ms step_avg:153.34ms step:851/1480 train_time:128967ms step_avg:153.35ms step:852/1480 train_time:129125ms step_avg:153.35ms step:853/1480 train_time:129282ms step_avg:153.36ms step:854/1480 train_time:129440ms step_avg:153.36ms step:855/1480 train_time:129598ms step_avg:153.37ms step:856/1480 train_time:129754ms step_avg:153.37ms step:857/1480 train_time:129913ms step_avg:153.38ms step:858/1480 train_time:130074ms step_avg:153.39ms step:859/1480 train_time:130233ms step_avg:153.40ms step:860/1480 train_time:130392ms step_avg:153.40ms step:861/1480 train_time:130551ms step_avg:153.41ms step:862/1480 train_time:130713ms step_avg:153.42ms step:863/1480 train_time:130873ms step_avg:153.43ms step:864/1480 train_time:131032ms step_avg:153.43ms step:865/1480 train_time:131190ms step_avg:153.44ms step:866/1480 train_time:131349ms step_avg:153.44ms step:867/1480 train_time:131507ms step_avg:153.45ms step:868/1480 train_time:131664ms step_avg:153.45ms step:869/1480 train_time:131821ms step_avg:153.46ms step:870/1480 train_time:131981ms step_avg:153.47ms step:871/1480 train_time:132138ms step_avg:153.47ms step:872/1480 train_time:132295ms step_avg:153.47ms step:873/1480 train_time:132450ms step_avg:153.48ms step:874/1480 train_time:132611ms step_avg:153.49ms step:875/1480 train_time:132774ms step_avg:153.50ms step:875/1480 val_loss:3.5050 train_time:132846ms step_avg:153.58ms step:876/1480 
train_time:132941ms step_avg:153.51ms step:877/1480 train_time:133095ms step_avg:153.51ms step:878/1480 train_time:133254ms step_avg:153.52ms step:879/1480 train_time:133413ms step_avg:153.52ms step:880/1480 train_time:133571ms step_avg:153.53ms step:881/1480 train_time:133728ms step_avg:153.53ms step:882/1480 train_time:133888ms step_avg:153.54ms step:883/1480 train_time:134047ms step_avg:153.55ms step:884/1480 train_time:134208ms step_avg:153.56ms step:885/1480 train_time:134368ms step_avg:153.56ms step:886/1480 train_time:134529ms step_avg:153.57ms step:887/1480 train_time:134688ms step_avg:153.58ms step:888/1480 train_time:134849ms step_avg:153.59ms step:889/1480 train_time:135010ms step_avg:153.59ms step:890/1480 train_time:135168ms step_avg:153.60ms step:891/1480 train_time:135326ms step_avg:153.61ms step:892/1480 train_time:135487ms step_avg:153.61ms step:893/1480 train_time:135645ms step_avg:153.62ms step:894/1480 train_time:135805ms step_avg:153.63ms step:895/1480 train_time:135965ms step_avg:153.63ms step:896/1480 train_time:136124ms step_avg:153.64ms step:897/1480 train_time:136284ms step_avg:153.65ms step:898/1480 train_time:136443ms step_avg:153.65ms step:899/1480 train_time:136603ms step_avg:153.66ms step:900/1480 train_time:136761ms step_avg:153.66ms step:901/1480 train_time:136920ms step_avg:153.67ms step:902/1480 train_time:137079ms step_avg:153.68ms step:903/1480 train_time:137240ms step_avg:153.68ms step:904/1480 train_time:137399ms step_avg:153.69ms step:905/1480 train_time:137558ms step_avg:153.70ms step:906/1480 train_time:137717ms step_avg:153.70ms step:907/1480 train_time:137880ms step_avg:153.71ms step:908/1480 train_time:138038ms step_avg:153.72ms step:909/1480 train_time:138197ms step_avg:153.72ms step:910/1480 train_time:138359ms step_avg:153.73ms step:911/1480 train_time:138519ms step_avg:153.74ms step:912/1480 train_time:138679ms step_avg:153.75ms step:913/1480 train_time:138840ms step_avg:153.75ms step:914/1480 train_time:139002ms 
step_avg:153.76ms step:915/1480 train_time:139162ms step_avg:153.77ms step:916/1480 train_time:139321ms step_avg:153.78ms step:917/1480 train_time:139480ms step_avg:153.78ms step:918/1480 train_time:139642ms step_avg:153.79ms step:919/1480 train_time:139804ms step_avg:153.80ms step:920/1480 train_time:139963ms step_avg:153.81ms step:921/1480 train_time:140121ms step_avg:153.81ms step:922/1480 train_time:140282ms step_avg:153.82ms step:923/1480 train_time:140439ms step_avg:153.82ms step:924/1480 train_time:140597ms step_avg:153.83ms step:925/1480 train_time:140756ms step_avg:153.83ms step:926/1480 train_time:140916ms step_avg:153.84ms step:927/1480 train_time:141075ms step_avg:153.84ms step:928/1480 train_time:141234ms step_avg:153.85ms step:929/1480 train_time:141392ms step_avg:153.85ms step:930/1480 train_time:141552ms step_avg:153.86ms step:931/1480 train_time:141708ms step_avg:153.86ms step:932/1480 train_time:141870ms step_avg:153.87ms step:933/1480 train_time:142028ms step_avg:153.88ms step:934/1480 train_time:142188ms step_avg:153.88ms step:935/1480 train_time:142350ms step_avg:153.89ms step:936/1480 train_time:142509ms step_avg:153.90ms step:937/1480 train_time:142669ms step_avg:153.90ms step:938/1480 train_time:142827ms step_avg:153.91ms step:939/1480 train_time:142988ms step_avg:153.92ms step:940/1480 train_time:143148ms step_avg:153.92ms step:941/1480 train_time:143306ms step_avg:153.93ms step:942/1480 train_time:143465ms step_avg:153.93ms step:943/1480 train_time:143623ms step_avg:153.94ms step:944/1480 train_time:143786ms step_avg:153.95ms step:945/1480 train_time:143945ms step_avg:153.95ms step:946/1480 train_time:144108ms step_avg:153.96ms step:947/1480 train_time:144269ms step_avg:153.97ms step:948/1480 train_time:144428ms step_avg:153.97ms step:949/1480 train_time:144600ms step_avg:153.99ms step:950/1480 train_time:144746ms step_avg:153.99ms step:951/1480 train_time:144909ms step_avg:154.00ms step:952/1480 train_time:145067ms step_avg:154.00ms 
step:953/1480 train_time:145229ms step_avg:154.01ms step:954/1480 train_time:145389ms step_avg:154.01ms step:955/1480 train_time:145547ms step_avg:154.02ms step:956/1480 train_time:145706ms step_avg:154.02ms step:957/1480 train_time:145866ms step_avg:154.03ms step:958/1480 train_time:146028ms step_avg:154.04ms step:959/1480 train_time:146187ms step_avg:154.04ms step:960/1480 train_time:146347ms step_avg:154.05ms step:961/1480 train_time:146506ms step_avg:154.05ms step:962/1480 train_time:146665ms step_avg:154.06ms step:963/1480 train_time:146825ms step_avg:154.07ms step:964/1480 train_time:146986ms step_avg:154.07ms step:965/1480 train_time:147145ms step_avg:154.08ms step:966/1480 train_time:147304ms step_avg:154.08ms step:967/1480 train_time:147462ms step_avg:154.09ms step:968/1480 train_time:147621ms step_avg:154.09ms step:969/1480 train_time:147782ms step_avg:154.10ms step:970/1480 train_time:147939ms step_avg:154.10ms step:971/1480 train_time:148097ms step_avg:154.11ms step:972/1480 train_time:148255ms step_avg:154.11ms step:973/1480 train_time:148414ms step_avg:154.12ms step:974/1480 train_time:148576ms step_avg:154.12ms step:975/1480 train_time:148737ms step_avg:154.13ms step:976/1480 train_time:148897ms step_avg:154.14ms step:977/1480 train_time:149057ms step_avg:154.14ms step:978/1480 train_time:149218ms step_avg:154.15ms step:979/1480 train_time:149379ms step_avg:154.16ms step:980/1480 train_time:149540ms step_avg:154.16ms step:981/1480 train_time:149701ms step_avg:154.17ms step:982/1480 train_time:149858ms step_avg:154.17ms step:983/1480 train_time:150019ms step_avg:154.18ms step:984/1480 train_time:150179ms step_avg:154.19ms step:985/1480 train_time:150341ms step_avg:154.20ms step:986/1480 train_time:150501ms step_avg:154.20ms step:987/1480 train_time:150660ms step_avg:154.21ms step:988/1480 train_time:150818ms step_avg:154.21ms step:989/1480 train_time:150977ms step_avg:154.22ms step:990/1480 train_time:151140ms step_avg:154.22ms step:991/1480 
train_time:151302ms step_avg:154.23ms step:992/1480 train_time:151465ms step_avg:154.24ms step:993/1480 train_time:151635ms step_avg:154.26ms step:994/1480 train_time:151795ms step_avg:154.26ms step:995/1480 train_time:151955ms step_avg:154.27ms step:996/1480 train_time:152112ms step_avg:154.27ms step:997/1480 train_time:152272ms step_avg:154.28ms step:998/1480 train_time:152430ms step_avg:154.28ms step:999/1480 train_time:152589ms step_avg:154.29ms step:1000/1480 train_time:152755ms step_avg:154.30ms step:1000/1480 val_loss:3.4412 train_time:152828ms step_avg:154.37ms step:1001/1480 train_time:152919ms step_avg:154.31ms step:1002/1480 train_time:153081ms step_avg:154.32ms step:1003/1480 train_time:153246ms step_avg:154.33ms step:1004/1480 train_time:153407ms step_avg:154.33ms step:1005/1480 train_time:153568ms step_avg:154.34ms step:1006/1480 train_time:153728ms step_avg:154.35ms step:1007/1480 train_time:153888ms step_avg:154.35ms step:1008/1480 train_time:154049ms step_avg:154.36ms step:1009/1480 train_time:154215ms step_avg:154.37ms step:1010/1480 train_time:154373ms step_avg:154.37ms step:1011/1480 train_time:154532ms step_avg:154.38ms step:1012/1480 train_time:154691ms step_avg:154.38ms step:1013/1480 train_time:154852ms step_avg:154.39ms step:1014/1480 train_time:155014ms step_avg:154.40ms step:1015/1480 train_time:155177ms step_avg:154.40ms step:1016/1480 train_time:155337ms step_avg:154.41ms step:1017/1480 train_time:155498ms step_avg:154.42ms step:1018/1480 train_time:155658ms step_avg:154.42ms step:1019/1480 train_time:155819ms step_avg:154.43ms step:1020/1480 train_time:155978ms step_avg:154.43ms step:1021/1480 train_time:156139ms step_avg:154.44ms step:1022/1480 train_time:156299ms step_avg:154.45ms step:1023/1480 train_time:156459ms step_avg:154.45ms step:1024/1480 train_time:156619ms step_avg:154.46ms step:1025/1480 train_time:156779ms step_avg:154.46ms step:1026/1480 train_time:156938ms step_avg:154.47ms step:1027/1480 train_time:157097ms 
step_avg:154.47ms step:1028/1480 train_time:157258ms step_avg:154.48ms step:1029/1480 train_time:157422ms step_avg:154.49ms step:1030/1480 train_time:157584ms step_avg:154.49ms step:1031/1480 train_time:157742ms step_avg:154.50ms step:1032/1480 train_time:157907ms step_avg:154.51ms step:1033/1480 train_time:158068ms step_avg:154.51ms step:1034/1480 train_time:158230ms step_avg:154.52ms step:1035/1480 train_time:158389ms step_avg:154.53ms step:1036/1480 train_time:158549ms step_avg:154.53ms step:1037/1480 train_time:158710ms step_avg:154.54ms step:1038/1480 train_time:158870ms step_avg:154.54ms step:1039/1480 train_time:159032ms step_avg:154.55ms step:1040/1480 train_time:159194ms step_avg:154.56ms step:1041/1480 train_time:159353ms step_avg:154.56ms step:1042/1480 train_time:159512ms step_avg:154.57ms step:1043/1480 train_time:159670ms step_avg:154.57ms step:1044/1480 train_time:159829ms step_avg:154.57ms step:1045/1480 train_time:159991ms step_avg:154.58ms step:1046/1480 train_time:160150ms step_avg:154.59ms step:1047/1480 train_time:160312ms step_avg:154.59ms step:1048/1480 train_time:160472ms step_avg:154.60ms step:1049/1480 train_time:160632ms step_avg:154.60ms step:1050/1480 train_time:160793ms step_avg:154.61ms step:1051/1480 train_time:160954ms step_avg:154.62ms step:1052/1480 train_time:161116ms step_avg:154.62ms step:1053/1480 train_time:161276ms step_avg:154.63ms step:1054/1480 train_time:161438ms step_avg:154.63ms step:1055/1480 train_time:161597ms step_avg:154.64ms step:1056/1480 train_time:161758ms step_avg:154.64ms step:1057/1480 train_time:161919ms step_avg:154.65ms step:1058/1480 train_time:162082ms step_avg:154.66ms step:1059/1480 train_time:162244ms step_avg:154.67ms step:1060/1480 train_time:162406ms step_avg:154.67ms step:1061/1480 train_time:162565ms step_avg:154.68ms step:1062/1480 train_time:162724ms step_avg:154.68ms step:1063/1480 train_time:162884ms step_avg:154.69ms step:1064/1480 train_time:163042ms step_avg:154.69ms step:1065/1480 
train_time:163203ms step_avg:154.70ms step:1066/1480 train_time:163366ms step_avg:154.70ms step:1067/1480 train_time:163530ms step_avg:154.71ms step:1068/1480 train_time:163692ms step_avg:154.72ms step:1069/1480 train_time:163855ms step_avg:154.73ms step:1070/1480 train_time:164014ms step_avg:154.73ms step:1071/1480 train_time:164176ms step_avg:154.74ms step:1072/1480 train_time:164336ms step_avg:154.74ms step:1073/1480 train_time:164493ms step_avg:154.74ms step:1074/1480 train_time:164652ms step_avg:154.75ms step:1075/1480 train_time:164813ms step_avg:154.75ms step:1076/1480 train_time:164971ms step_avg:154.76ms step:1077/1480 train_time:165132ms step_avg:154.76ms step:1078/1480 train_time:165297ms step_avg:154.77ms step:1079/1480 train_time:165461ms step_avg:154.78ms step:1080/1480 train_time:165622ms step_avg:154.79ms step:1081/1480 train_time:165782ms step_avg:154.79ms step:1082/1480 train_time:165941ms step_avg:154.80ms step:1083/1480 train_time:166100ms step_avg:154.80ms step:1084/1480 train_time:166260ms step_avg:154.80ms step:1085/1480 train_time:166420ms step_avg:154.81ms step:1086/1480 train_time:166580ms step_avg:154.81ms step:1087/1480 train_time:166742ms step_avg:154.82ms step:1088/1480 train_time:166903ms step_avg:154.83ms step:1089/1480 train_time:167069ms step_avg:154.84ms step:1090/1480 train_time:167233ms step_avg:154.85ms step:1091/1480 train_time:167393ms step_avg:154.85ms step:1092/1480 train_time:167555ms step_avg:154.86ms step:1093/1480 train_time:167717ms step_avg:154.86ms step:1094/1480 train_time:167876ms step_avg:154.87ms step:1095/1480 train_time:168036ms step_avg:154.87ms step:1096/1480 train_time:168198ms step_avg:154.88ms step:1097/1480 train_time:168359ms step_avg:154.88ms step:1098/1480 train_time:168519ms step_avg:154.89ms step:1099/1480 train_time:168681ms step_avg:154.90ms step:1100/1480 train_time:168845ms step_avg:154.90ms step:1101/1480 train_time:169009ms step_avg:154.91ms step:1102/1480 train_time:169172ms step_avg:154.92ms 
step:1103/1480 train_time:169337ms step_avg:154.93ms step:1104/1480 train_time:169498ms step_avg:154.93ms step:1105/1480 train_time:169659ms step_avg:154.94ms step:1106/1480 train_time:169820ms step_avg:154.95ms step:1107/1480 train_time:169980ms step_avg:154.95ms step:1108/1480 train_time:170139ms step_avg:154.95ms step:1109/1480 train_time:170299ms step_avg:154.96ms step:1110/1480 train_time:170460ms step_avg:154.96ms step:1111/1480 train_time:170621ms step_avg:154.97ms step:1112/1480 train_time:170783ms step_avg:154.98ms step:1113/1480 train_time:170952ms step_avg:154.99ms step:1114/1480 train_time:171115ms step_avg:155.00ms step:1115/1480 train_time:171276ms step_avg:155.00ms step:1116/1480 train_time:171436ms step_avg:155.01ms step:1117/1480 train_time:171599ms step_avg:155.01ms step:1118/1480 train_time:171764ms step_avg:155.02ms step:1119/1480 train_time:171923ms step_avg:155.03ms step:1120/1480 train_time:172087ms step_avg:155.03ms step:1121/1480 train_time:172250ms step_avg:155.04ms step:1122/1480 train_time:172410ms step_avg:155.05ms step:1123/1480 train_time:172571ms step_avg:155.05ms step:1124/1480 train_time:172735ms step_avg:155.06ms step:1125/1480 train_time:172897ms step_avg:155.06ms step:1125/1480 val_loss:3.3856 train_time:172972ms step_avg:155.13ms step:1126/1480 train_time:173067ms step_avg:155.08ms step:1127/1480 train_time:173222ms step_avg:155.08ms step:1128/1480 train_time:173383ms step_avg:155.08ms step:1129/1480 train_time:173546ms step_avg:155.09ms step:1130/1480 train_time:173706ms step_avg:155.09ms step:1131/1480 train_time:173872ms step_avg:155.10ms step:1132/1480 train_time:174032ms step_avg:155.11ms step:1133/1480 train_time:174196ms step_avg:155.12ms step:1134/1480 train_time:174361ms step_avg:155.13ms step:1135/1480 train_time:174522ms step_avg:155.13ms step:1136/1480 train_time:174684ms step_avg:155.14ms step:1137/1480 train_time:174845ms step_avg:155.14ms step:1138/1480 train_time:175008ms step_avg:155.15ms step:1139/1480 
train_time:175180ms step_avg:155.16ms step:1140/1480 train_time:175328ms step_avg:155.16ms step:1141/1480 train_time:175492ms step_avg:155.17ms step:1142/1480 train_time:175655ms step_avg:155.17ms step:1143/1480 train_time:175820ms step_avg:155.18ms step:1144/1480 train_time:175982ms step_avg:155.19ms step:1145/1480 train_time:176141ms step_avg:155.19ms step:1146/1480 train_time:176303ms step_avg:155.20ms step:1147/1480 train_time:176464ms step_avg:155.20ms step:1148/1480 train_time:176623ms step_avg:155.20ms step:1149/1480 train_time:176786ms step_avg:155.21ms step:1150/1480 train_time:176946ms step_avg:155.22ms step:1151/1480 train_time:177111ms step_avg:155.22ms step:1152/1480 train_time:177274ms step_avg:155.23ms step:1153/1480 train_time:177441ms step_avg:155.24ms step:1154/1480 train_time:177602ms step_avg:155.25ms step:1155/1480 train_time:177763ms step_avg:155.25ms step:1156/1480 train_time:177928ms step_avg:155.26ms step:1157/1480 train_time:178091ms step_avg:155.27ms step:1158/1480 train_time:178252ms step_avg:155.27ms step:1159/1480 train_time:178413ms step_avg:155.28ms step:1160/1480 train_time:178575ms step_avg:155.28ms step:1161/1480 train_time:178738ms step_avg:155.29ms step:1162/1480 train_time:178902ms step_avg:155.30ms step:1163/1480 train_time:179065ms step_avg:155.30ms step:1164/1480 train_time:179226ms step_avg:155.31ms step:1165/1480 train_time:179385ms step_avg:155.31ms step:1166/1480 train_time:179546ms step_avg:155.32ms step:1167/1480 train_time:179706ms step_avg:155.32ms step:1168/1480 train_time:179867ms step_avg:155.33ms step:1169/1480 train_time:180028ms step_avg:155.33ms step:1170/1480 train_time:180189ms step_avg:155.34ms step:1171/1480 train_time:180350ms step_avg:155.34ms step:1172/1480 train_time:180511ms step_avg:155.34ms step:1173/1480 train_time:180673ms step_avg:155.35ms step:1174/1480 train_time:180844ms step_avg:155.36ms step:1175/1480 train_time:181006ms step_avg:155.37ms step:1176/1480 train_time:181169ms step_avg:155.38ms 
step:1177/1480 train_time:181338ms step_avg:155.39ms step:1178/1480 train_time:181499ms step_avg:155.39ms step:1179/1480 train_time:181659ms step_avg:155.40ms step:1180/1480 train_time:181827ms step_avg:155.41ms step:1181/1480 train_time:181989ms step_avg:155.41ms step:1182/1480 train_time:182149ms step_avg:155.42ms step:1183/1480 train_time:182310ms step_avg:155.42ms step:1184/1480 train_time:182473ms step_avg:155.43ms step:1185/1480 train_time:182640ms step_avg:155.44ms step:1186/1480 train_time:182803ms step_avg:155.44ms step:1187/1480 train_time:182973ms step_avg:155.46ms step:1188/1480 train_time:183133ms step_avg:155.46ms step:1189/1480 train_time:183295ms step_avg:155.47ms step:1190/1480 train_time:183457ms step_avg:155.47ms step:1191/1480 train_time:183620ms step_avg:155.48ms step:1192/1480 train_time:183782ms step_avg:155.48ms step:1193/1480 train_time:183941ms step_avg:155.49ms step:1194/1480 train_time:184101ms step_avg:155.49ms step:1195/1480 train_time:184265ms step_avg:155.50ms step:1196/1480 train_time:184435ms step_avg:155.51ms step:1197/1480 train_time:184597ms step_avg:155.52ms step:1198/1480 train_time:184767ms step_avg:155.53ms step:1199/1480 train_time:184928ms step_avg:155.53ms step:1200/1480 train_time:185090ms step_avg:155.54ms step:1201/1480 train_time:185250ms step_avg:155.54ms step:1202/1480 train_time:185420ms step_avg:155.55ms step:1203/1480 train_time:185587ms step_avg:155.56ms step:1204/1480 train_time:185751ms step_avg:155.57ms step:1205/1480 train_time:185913ms step_avg:155.58ms step:1206/1480 train_time:186074ms step_avg:155.58ms step:1207/1480 train_time:186236ms step_avg:155.59ms step:1208/1480 train_time:186397ms step_avg:155.59ms step:1209/1480 train_time:186563ms step_avg:155.60ms step:1210/1480 train_time:186727ms step_avg:155.61ms step:1211/1480 train_time:186889ms step_avg:155.61ms step:1212/1480 train_time:187052ms step_avg:155.62ms step:1213/1480 train_time:187216ms step_avg:155.62ms step:1214/1480 train_time:187384ms 
step_avg:155.63ms step:1215/1480 train_time:187547ms step_avg:155.64ms step:1216/1480 train_time:187707ms step_avg:155.64ms step:1217/1480 train_time:187871ms step_avg:155.65ms step:1218/1480 train_time:188031ms step_avg:155.65ms step:1219/1480 train_time:188201ms step_avg:155.67ms step:1220/1480 train_time:188363ms step_avg:155.67ms step:1221/1480 train_time:188523ms step_avg:155.68ms step:1222/1480 train_time:188685ms step_avg:155.68ms step:1223/1480 train_time:188847ms step_avg:155.69ms step:1224/1480 train_time:189011ms step_avg:155.69ms step:1225/1480 train_time:189174ms step_avg:155.70ms step:1226/1480 train_time:189341ms step_avg:155.71ms step:1227/1480 train_time:189506ms step_avg:155.72ms step:1228/1480 train_time:189667ms step_avg:155.72ms step:1229/1480 train_time:189829ms step_avg:155.72ms step:1230/1480 train_time:189998ms step_avg:155.74ms step:1231/1480 train_time:190165ms step_avg:155.75ms step:1232/1480 train_time:190329ms step_avg:155.75ms step:1233/1480 train_time:190490ms step_avg:155.76ms step:1234/1480 train_time:190651ms step_avg:155.76ms step:1235/1480 train_time:190820ms step_avg:155.77ms step:1236/1480 train_time:190982ms step_avg:155.78ms step:1237/1480 train_time:191143ms step_avg:155.78ms step:1238/1480 train_time:191316ms step_avg:155.79ms step:1239/1480 train_time:191480ms step_avg:155.80ms step:1240/1480 train_time:191643ms step_avg:155.81ms step:1241/1480 train_time:191807ms step_avg:155.81ms step:1242/1480 train_time:191968ms step_avg:155.82ms step:1243/1480 train_time:192131ms step_avg:155.82ms step:1244/1480 train_time:192292ms step_avg:155.83ms step:1245/1480 train_time:192455ms step_avg:155.83ms step:1246/1480 train_time:192619ms step_avg:155.84ms step:1247/1480 train_time:192782ms step_avg:155.85ms step:1248/1480 train_time:192943ms step_avg:155.85ms step:1249/1480 train_time:193104ms step_avg:155.85ms step:1250/1480 train_time:193266ms step_avg:155.86ms step:1250/1480 val_loss:3.3362 train_time:193341ms step_avg:155.92ms 
step:1251/1480 train_time:193436ms step_avg:155.87ms step:1252/1480 train_time:193596ms step_avg:155.87ms step:1253/1480 train_time:193757ms step_avg:155.88ms step:1254/1480 train_time:193919ms step_avg:155.88ms step:1255/1480 train_time:194089ms step_avg:155.90ms step:1256/1480 train_time:194254ms step_avg:155.90ms step:1257/1480 train_time:194414ms step_avg:155.91ms step:1258/1480 train_time:194579ms step_avg:155.91ms step:1259/1480 train_time:194744ms step_avg:155.92ms step:1260/1480 train_time:194904ms step_avg:155.92ms step:1261/1480 train_time:195066ms step_avg:155.93ms step:1262/1480 train_time:195230ms step_avg:155.93ms step:1263/1480 train_time:195396ms step_avg:155.94ms step:1264/1480 train_time:195555ms step_avg:155.94ms step:1265/1480 train_time:195715ms step_avg:155.95ms step:1266/1480 train_time:195879ms step_avg:155.95ms step:1267/1480 train_time:196041ms step_avg:155.96ms step:1268/1480 train_time:196204ms step_avg:155.97ms step:1269/1480 train_time:196369ms step_avg:155.97ms step:1270/1480 train_time:196530ms step_avg:155.98ms step:1271/1480 train_time:196693ms step_avg:155.98ms step:1272/1480 train_time:196853ms step_avg:155.98ms step:1273/1480 train_time:197016ms step_avg:155.99ms step:1274/1480 train_time:197181ms step_avg:156.00ms step:1275/1480 train_time:197343ms step_avg:156.00ms step:1276/1480 train_time:197504ms step_avg:156.01ms step:1277/1480 train_time:197665ms step_avg:156.01ms step:1278/1480 train_time:197826ms step_avg:156.01ms step:1279/1480 train_time:197987ms step_avg:156.02ms step:1280/1480 train_time:198155ms step_avg:156.03ms step:1281/1480 train_time:198317ms step_avg:156.03ms step:1282/1480 train_time:198477ms step_avg:156.04ms step:1283/1480 train_time:198640ms step_avg:156.04ms step:1284/1480 train_time:198804ms step_avg:156.05ms step:1285/1480 train_time:198966ms step_avg:156.05ms step:1286/1480 train_time:199127ms step_avg:156.06ms step:1287/1480 train_time:199288ms step_avg:156.06ms step:1288/1480 train_time:199450ms 
step_avg:156.06ms step:1289/1480 train_time:199620ms step_avg:156.08ms step:1290/1480 train_time:199788ms step_avg:156.08ms step:1291/1480 train_time:199951ms step_avg:156.09ms step:1292/1480 train_time:200114ms step_avg:156.09ms step:1293/1480 train_time:200282ms step_avg:156.10ms step:1294/1480 train_time:200446ms step_avg:156.11ms step:1295/1480 train_time:200609ms step_avg:156.12ms step:1296/1480 train_time:200771ms step_avg:156.12ms step:1297/1480 train_time:200934ms step_avg:156.13ms step:1298/1480 train_time:201098ms step_avg:156.13ms step:1299/1480 train_time:201261ms step_avg:156.14ms step:1300/1480 train_time:201423ms step_avg:156.14ms step:1301/1480 train_time:201584ms step_avg:156.15ms step:1302/1480 train_time:201748ms step_avg:156.15ms step:1303/1480 train_time:201914ms step_avg:156.16ms step:1304/1480 train_time:202081ms step_avg:156.17ms step:1305/1480 train_time:202244ms step_avg:156.17ms step:1306/1480 train_time:202409ms step_avg:156.18ms step:1307/1480 train_time:202569ms step_avg:156.18ms step:1308/1480 train_time:202731ms step_avg:156.19ms step:1309/1480 train_time:202893ms step_avg:156.19ms step:1310/1480 train_time:203056ms step_avg:156.20ms step:1311/1480 train_time:203218ms step_avg:156.20ms step:1312/1480 train_time:203383ms step_avg:156.21ms step:1313/1480 train_time:203546ms step_avg:156.21ms step:1314/1480 train_time:203711ms step_avg:156.22ms step:1315/1480 train_time:203873ms step_avg:156.22ms step:1316/1480 train_time:204032ms step_avg:156.23ms step:1317/1480 train_time:204194ms step_avg:156.23ms step:1318/1480 train_time:204361ms step_avg:156.24ms step:1319/1480 train_time:204527ms step_avg:156.25ms step:1320/1480 train_time:204695ms step_avg:156.26ms step:1321/1480 train_time:204859ms step_avg:156.26ms step:1322/1480 train_time:205030ms step_avg:156.27ms step:1323/1480 train_time:205193ms step_avg:156.28ms step:1324/1480 train_time:205358ms step_avg:156.28ms step:1325/1480 train_time:205528ms step_avg:156.29ms step:1326/1480 
train_time:205693ms step_avg:156.30ms step:1327/1480 train_time:205855ms step_avg:156.31ms step:1328/1480 train_time:206017ms step_avg:156.31ms step:1329/1480 train_time:206200ms step_avg:156.33ms step:1330/1480 train_time:206364ms step_avg:156.34ms step:1331/1480 train_time:206528ms step_avg:156.34ms step:1332/1480 train_time:206689ms step_avg:156.35ms step:1333/1480 train_time:206854ms step_avg:156.35ms step:1334/1480 train_time:207018ms step_avg:156.36ms step:1335/1480 train_time:207180ms step_avg:156.36ms step:1336/1480 train_time:207349ms step_avg:156.37ms step:1337/1480 train_time:207517ms step_avg:156.38ms step:1338/1480 train_time:207682ms step_avg:156.39ms step:1339/1480 train_time:207846ms step_avg:156.39ms step:1340/1480 train_time:208012ms step_avg:156.40ms step:1341/1480 train_time:208172ms step_avg:156.40ms step:1342/1480 train_time:208337ms step_avg:156.41ms step:1343/1480 train_time:208500ms step_avg:156.41ms step:1344/1480 train_time:208662ms step_avg:156.42ms step:1345/1480 train_time:208831ms step_avg:156.43ms step:1346/1480 train_time:208993ms step_avg:156.43ms step:1347/1480 train_time:209156ms step_avg:156.44ms step:1348/1480 train_time:209318ms step_avg:156.44ms step:1349/1480 train_time:209480ms step_avg:156.45ms step:1350/1480 train_time:209646ms step_avg:156.45ms step:1351/1480 train_time:209809ms step_avg:156.46ms step:1352/1480 train_time:209970ms step_avg:156.46ms step:1353/1480 train_time:210136ms step_avg:156.47ms step:1354/1480 train_time:210301ms step_avg:156.47ms step:1355/1480 train_time:210462ms step_avg:156.48ms step:1356/1480 train_time:210629ms step_avg:156.48ms step:1357/1480 train_time:210792ms step_avg:156.49ms step:1358/1480 train_time:210956ms step_avg:156.50ms step:1359/1480 train_time:211120ms step_avg:156.50ms step:1360/1480 train_time:211286ms step_avg:156.51ms step:1361/1480 train_time:211452ms step_avg:156.52ms step:1362/1480 train_time:211616ms step_avg:156.52ms step:1363/1480 train_time:211785ms step_avg:156.53ms 
step:1364/1480 train_time:211947ms step_avg:156.53ms step:1365/1480 train_time:212108ms step_avg:156.54ms step:1366/1480 train_time:212271ms step_avg:156.54ms step:1367/1480 train_time:212434ms step_avg:156.55ms step:1368/1480 train_time:212600ms step_avg:156.55ms step:1369/1480 train_time:212769ms step_avg:156.56ms step:1370/1480 train_time:212935ms step_avg:156.57ms step:1371/1480 train_time:213098ms step_avg:156.57ms step:1372/1480 train_time:213265ms step_avg:156.58ms step:1373/1480 train_time:213426ms step_avg:156.59ms step:1374/1480 train_time:213590ms step_avg:156.59ms step:1375/1480 train_time:213751ms step_avg:156.59ms step:1375/1480 val_loss:3.2969 train_time:213826ms step_avg:156.65ms step:1376/1480 train_time:213917ms step_avg:156.60ms step:1377/1480 train_time:214080ms step_avg:156.61ms step:1378/1480 train_time:214243ms step_avg:156.61ms step:1379/1480 train_time:214408ms step_avg:156.62ms step:1380/1480 train_time:214571ms step_avg:156.62ms step:1381/1480 train_time:214739ms step_avg:156.63ms step:1382/1480 train_time:214904ms step_avg:156.64ms step:1383/1480 train_time:215067ms step_avg:156.64ms step:1384/1480 train_time:215233ms step_avg:156.65ms step:1385/1480 train_time:215392ms step_avg:156.65ms step:1386/1480 train_time:215555ms step_avg:156.65ms step:1387/1480 train_time:215721ms step_avg:156.66ms step:1388/1480 train_time:215881ms step_avg:156.66ms step:1389/1480 train_time:216046ms step_avg:156.67ms step:1390/1480 train_time:216207ms step_avg:156.67ms step:1391/1480 train_time:216369ms step_avg:156.68ms step:1392/1480 train_time:216532ms step_avg:156.68ms step:1393/1480 train_time:216695ms step_avg:156.68ms step:1394/1480 train_time:216859ms step_avg:156.69ms step:1395/1480 train_time:217022ms step_avg:156.69ms step:1396/1480 train_time:217184ms step_avg:156.70ms step:1397/1480 train_time:217346ms step_avg:156.70ms step:1398/1480 train_time:217506ms step_avg:156.70ms step:1399/1480 train_time:217672ms step_avg:156.71ms step:1400/1480 
train_time:217840ms step_avg:156.72ms step:1401/1480 train_time:218000ms step_avg:156.72ms step:1402/1480 train_time:218164ms step_avg:156.73ms step:1403/1480 train_time:218330ms step_avg:156.73ms step:1404/1480 train_time:218492ms step_avg:156.74ms step:1405/1480 train_time:218657ms step_avg:156.74ms step:1406/1480 train_time:218823ms step_avg:156.75ms step:1407/1480 train_time:218986ms step_avg:156.75ms step:1408/1480 train_time:219148ms step_avg:156.76ms step:1409/1480 train_time:219319ms step_avg:156.77ms step:1410/1480 train_time:219482ms step_avg:156.77ms step:1411/1480 train_time:219643ms step_avg:156.78ms step:1412/1480 train_time:219806ms step_avg:156.78ms step:1413/1480 train_time:219969ms step_avg:156.78ms step:1414/1480 train_time:220133ms step_avg:156.79ms step:1415/1480 train_time:220299ms step_avg:156.80ms step:1416/1480 train_time:220472ms step_avg:156.81ms step:1417/1480 train_time:220638ms step_avg:156.81ms step:1418/1480 train_time:220803ms step_avg:156.82ms step:1419/1480 train_time:220969ms step_avg:156.83ms step:1420/1480 train_time:221133ms step_avg:156.83ms step:1421/1480 train_time:221297ms step_avg:156.84ms step:1422/1480 train_time:221463ms step_avg:156.84ms step:1423/1480 train_time:221627ms step_avg:156.85ms step:1424/1480 train_time:221792ms step_avg:156.85ms step:1425/1480 train_time:221963ms step_avg:156.86ms step:1426/1480 train_time:222128ms step_avg:156.87ms step:1427/1480 train_time:222293ms step_avg:156.88ms step:1428/1480 train_time:222455ms step_avg:156.88ms step:1429/1480 train_time:222614ms step_avg:156.88ms step:1430/1480 train_time:222781ms step_avg:156.89ms step:1431/1480 train_time:222947ms step_avg:156.89ms step:1432/1480 train_time:223115ms step_avg:156.90ms step:1433/1480 train_time:223285ms step_avg:156.91ms step:1434/1480 train_time:223453ms step_avg:156.92ms step:1435/1480 train_time:223620ms step_avg:156.93ms step:1436/1480 train_time:223786ms step_avg:156.93ms step:1437/1480 train_time:223948ms step_avg:156.94ms 
step:1438/1480 train_time:224110ms step_avg:156.94ms step:1439/1480 train_time:224275ms step_avg:156.95ms step:1440/1480 train_time:224438ms step_avg:156.95ms step:1441/1480 train_time:224603ms step_avg:156.96ms step:1442/1480 train_time:224769ms step_avg:156.96ms step:1443/1480 train_time:224944ms step_avg:156.97ms step:1444/1480 train_time:225107ms step_avg:156.98ms step:1445/1480 train_time:225270ms step_avg:156.98ms step:1446/1480 train_time:225436ms step_avg:156.99ms step:1447/1480 train_time:225605ms step_avg:157.00ms step:1448/1480 train_time:225769ms step_avg:157.00ms step:1449/1480 train_time:225931ms step_avg:157.01ms step:1450/1480 train_time:226094ms step_avg:157.01ms step:1451/1480 train_time:226257ms step_avg:157.01ms step:1452/1480 train_time:226425ms step_avg:157.02ms step:1453/1480 train_time:226589ms step_avg:157.03ms step:1454/1480 train_time:226751ms step_avg:157.03ms step:1455/1480 train_time:226919ms step_avg:157.04ms step:1456/1480 train_time:227083ms step_avg:157.04ms step:1457/1480 train_time:227245ms step_avg:157.05ms step:1458/1480 train_time:227409ms step_avg:157.05ms step:1459/1480 train_time:227573ms step_avg:157.06ms step:1460/1480 train_time:227738ms step_avg:157.06ms step:1461/1480 train_time:227902ms step_avg:157.07ms step:1462/1480 train_time:228066ms step_avg:157.07ms step:1463/1480 train_time:228231ms step_avg:157.08ms step:1464/1480 train_time:228394ms step_avg:157.08ms step:1465/1480 train_time:228559ms step_avg:157.09ms step:1466/1480 train_time:228723ms step_avg:157.09ms step:1467/1480 train_time:228887ms step_avg:157.09ms step:1468/1480 train_time:229050ms step_avg:157.10ms step:1469/1480 train_time:229213ms step_avg:157.10ms step:1470/1480 train_time:229382ms step_avg:157.11ms step:1471/1480 train_time:229553ms step_avg:157.12ms step:1472/1480 train_time:229725ms step_avg:157.13ms step:1473/1480 train_time:229889ms step_avg:157.14ms step:1474/1480 train_time:230055ms step_avg:157.14ms step:1475/1480 train_time:230225ms 
step_avg:157.15ms step:1476/1480 train_time:230389ms step_avg:157.15ms step:1477/1480 train_time:230557ms step_avg:157.16ms step:1478/1480 train_time:230730ms step_avg:157.17ms step:1479/1480 train_time:230893ms step_avg:157.18ms step:1480/1480 train_time:231057ms step_avg:157.18ms step:1480/1480 val_loss:3.2777 train_time:231133ms step_avg:157.23ms peak memory consumption: 34241 MiB