import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor (asserted).
        steps: Number of Newton-Schulz iterations to run.
        eps: Added to the norm before dividing, to avoid division by zero.

    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # Work on the wide orientation so that X @ X.T is the smaller Gram matrix.
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # Undo the transpose applied above.
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        # World size and rank are read from the environment (set by the launcher).
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One param group per distinct element count, so each group can share a set of
        # equally sized flat bfloat16 buffers (one per rank) as all_gather outputs.
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are processed in chunks of `world_size`: this rank
        orthogonalizes the update of the parameter at offset `rank` in the chunk, then the
        updates of the whole chunk are exchanged with an asynchronous all_gather. The gathered
        updates of a chunk are applied only after the next chunk's local update has been
        computed (or after the loop, for the last chunk).
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # Wait for the pending all_gather (if any) and apply its gathered updates.
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # step size is scaled by sqrt(rows / cols) when rows > cols
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                # Note: the Nesterov branch overwrites p.grad in place.
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # The buffers are reused by the next all_gather, so consume the previous result first.
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): dim 1 is the sequence, dim 3 is rotated.
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # (Re)build the tables only when the sequence length changes.
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # learnable mixing weights between this layer's values and the token value embedding
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        """
        Arguments:
            x: Input of shape (1, T, dim); batch size 1 is asserted.
            vi: Token value embedding, viewed to the same shape as the per-head values.
            block_mask: BlockMask passed through to flex_attention.

        Returns:
            A tensor with the same shape as x.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward network (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learnable mix of x with x0, then attention and MLP residual branches."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # initialised to [1, 0], i.e. the block input starts out as x alone
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token embedding tables whose outputs are mirrored to give twelve value embeddings."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        # NOTE(review): the table count is hard-coded to 6, so forward() returns 12 tensors,
        # which matches GPTConfig's default num_layers = 12.
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        # Append the same tensors in reverse order: [e0, ..., e5, e5, ..., e0].
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections that returns the cross-entropy loss."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Arguments:
            inputs: 1D tensor of token ids (asserted); its length must be a multiple of 128.
            targets: Target token ids, flattened for the cross-entropy loss.
            sliding_window_num_blocks: Tensor holding the attention window size, in 128-token blocks.

        Returns:
            The scalar cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # Running count of token id 50256 gives a document id per position.
        docs = (inputs == 50256).cumsum(0)
        # First and last document id found in each block of BLOCK_SIZE tokens.
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # Token-level mask: causal and restricted to the same document.
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # Convert a dense boolean block mask into (count, indices with the set blocks first).
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            # NOTE(review): 512 blocks is hard-coded; 512 * 128 = 65536 tokens, which equals
            # Hyperparameters.sequence_length. Other sequence lengths would need this changed.
            kv_idx = block_idx = torch.arange(512, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # Blocks that are non-empty but not full still need mask_mod applied per token.
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens from a shard into a pinned-memory tensor."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header of 256 int32 values
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Cycles through token shards, handing each rank its own `seq_len` slice of every batch."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        # each rank starts at its own offset within the shard
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. 
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 08:27:57 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 115W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 28C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 117W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:28821ms step_avg:nanms step:2/1480 train_time:28925ms step_avg:nanms step:3/1480 train_time:29048ms step_avg:nanms step:4/1480 train_time:29189ms step_avg:nanms step:5/1480 train_time:29330ms step_avg:nanms step:6/1480 train_time:29472ms step_avg:nanms step:7/1480 train_time:29615ms step_avg:nanms step:8/1480 train_time:29756ms step_avg:nanms step:9/1480 train_time:29898ms step_avg:nanms step:10/1480 train_time:30041ms step_avg:nanms step:11/1480 train_time:149ms step_avg:nanms step:12/1480 train_time:285ms step_avg:nanms step:13/1480 train_time:428ms step_avg:142.56ms step:14/1480 train_time:571ms step_avg:142.84ms step:15/1480 train_time:712ms step_avg:142.49ms step:16/1480 train_time:854ms step_avg:142.35ms step:17/1480 train_time:998ms step_avg:142.59ms step:18/1480 train_time:1141ms step_avg:142.64ms step:19/1480 train_time:1284ms step_avg:142.68ms step:20/1480 train_time:1428ms step_avg:142.81ms step:21/1480 train_time:1572ms step_avg:142.87ms step:22/1480 train_time:1713ms step_avg:142.78ms step:23/1480 train_time:1855ms step_avg:142.67ms step:24/1480 train_time:1997ms step_avg:142.65ms step:25/1480 train_time:2141ms step_avg:142.70ms step:26/1480 train_time:2284ms step_avg:142.77ms step:27/1480 train_time:2427ms step_avg:142.75ms step:28/1480 train_time:2570ms step_avg:142.77ms 
step:29/1480 train_time:2713ms step_avg:142.79ms step:30/1480 train_time:2855ms step_avg:142.73ms step:31/1480 train_time:2996ms step_avg:142.69ms step:32/1480 train_time:3139ms step_avg:142.67ms step:33/1480 train_time:3282ms step_avg:142.70ms step:34/1480 train_time:3425ms step_avg:142.72ms step:35/1480 train_time:3568ms step_avg:142.73ms step:36/1480 train_time:3710ms step_avg:142.71ms step:37/1480 train_time:3852ms step_avg:142.65ms step:38/1480 train_time:3995ms step_avg:142.67ms step:39/1480 train_time:4137ms step_avg:142.64ms step:40/1480 train_time:4281ms step_avg:142.71ms step:41/1480 train_time:4426ms step_avg:142.78ms step:42/1480 train_time:4570ms step_avg:142.81ms step:43/1480 train_time:4712ms step_avg:142.80ms step:44/1480 train_time:4854ms step_avg:142.77ms step:45/1480 train_time:4997ms step_avg:142.76ms step:46/1480 train_time:5139ms step_avg:142.76ms step:47/1480 train_time:5282ms step_avg:142.75ms step:48/1480 train_time:5425ms step_avg:142.77ms step:49/1480 train_time:5569ms step_avg:142.79ms step:50/1480 train_time:5711ms step_avg:142.77ms step:51/1480 train_time:5852ms step_avg:142.73ms step:52/1480 train_time:5994ms step_avg:142.72ms step:53/1480 train_time:6137ms step_avg:142.71ms step:54/1480 train_time:6280ms step_avg:142.72ms step:55/1480 train_time:6422ms step_avg:142.72ms step:56/1480 train_time:6566ms step_avg:142.74ms step:57/1480 train_time:6708ms step_avg:142.73ms step:58/1480 train_time:6852ms step_avg:142.75ms step:59/1480 train_time:6995ms step_avg:142.76ms step:60/1480 train_time:7137ms step_avg:142.73ms step:61/1480 train_time:7280ms step_avg:142.74ms step:62/1480 train_time:7422ms step_avg:142.72ms step:63/1480 train_time:7566ms step_avg:142.75ms step:64/1480 train_time:7709ms step_avg:142.76ms step:65/1480 train_time:7852ms step_avg:142.76ms step:66/1480 train_time:7995ms step_avg:142.76ms step:67/1480 train_time:8136ms step_avg:142.74ms step:68/1480 train_time:8279ms step_avg:142.74ms step:69/1480 train_time:8421ms 
step_avg:142.73ms step:70/1480 train_time:8564ms step_avg:142.74ms step:71/1480 train_time:8707ms step_avg:142.73ms step:72/1480 train_time:8850ms step_avg:142.74ms step:73/1480 train_time:8993ms step_avg:142.75ms step:74/1480 train_time:9136ms step_avg:142.75ms step:75/1480 train_time:9279ms step_avg:142.76ms step:76/1480 train_time:9423ms step_avg:142.77ms step:77/1480 train_time:9567ms step_avg:142.79ms step:78/1480 train_time:9709ms step_avg:142.79ms step:79/1480 train_time:9852ms step_avg:142.78ms step:80/1480 train_time:9993ms step_avg:142.76ms step:81/1480 train_time:10515ms step_avg:148.10ms step:82/1480 train_time:10617ms step_avg:147.46ms step:83/1480 train_time:10758ms step_avg:147.37ms step:84/1480 train_time:10900ms step_avg:147.30ms step:85/1480 train_time:11042ms step_avg:147.23ms step:86/1480 train_time:11186ms step_avg:147.18ms step:87/1480 train_time:11329ms step_avg:147.13ms step:88/1480 train_time:11472ms step_avg:147.08ms step:89/1480 train_time:11618ms step_avg:147.06ms step:90/1480 train_time:11762ms step_avg:147.03ms step:91/1480 train_time:11907ms step_avg:147.00ms step:92/1480 train_time:12048ms step_avg:146.93ms step:93/1480 train_time:12191ms step_avg:146.88ms step:94/1480 train_time:12332ms step_avg:146.81ms step:95/1480 train_time:12475ms step_avg:146.76ms step:96/1480 train_time:12988ms step_avg:151.03ms step:97/1480 train_time:13091ms step_avg:150.47ms step:98/1480 train_time:13622ms step_avg:154.79ms step:99/1480 train_time:13726ms step_avg:154.22ms step:100/1480 train_time:13869ms step_avg:154.10ms step:101/1480 train_time:14016ms step_avg:154.02ms step:102/1480 train_time:14152ms step_avg:153.82ms step:103/1480 train_time:14293ms step_avg:153.69ms step:104/1480 train_time:14435ms step_avg:153.56ms step:105/1480 train_time:14577ms step_avg:153.45ms step:106/1480 train_time:14722ms step_avg:153.36ms step:107/1480 train_time:14866ms step_avg:153.25ms step:108/1480 train_time:15008ms step_avg:153.14ms step:109/1480 train_time:15149ms 
step_avg:153.02ms step:110/1480 train_time:15291ms step_avg:152.91ms step:111/1480 train_time:15434ms step_avg:152.81ms step:112/1480 train_time:15579ms step_avg:152.73ms step:113/1480 train_time:15725ms step_avg:152.67ms step:114/1480 train_time:15871ms step_avg:152.60ms step:115/1480 train_time:16015ms step_avg:152.53ms step:116/1480 train_time:16161ms step_avg:152.46ms step:117/1480 train_time:16307ms step_avg:152.41ms step:118/1480 train_time:16452ms step_avg:152.33ms step:119/1480 train_time:16598ms step_avg:152.27ms step:120/1480 train_time:16745ms step_avg:152.23ms step:121/1480 train_time:16891ms step_avg:152.17ms step:122/1480 train_time:17036ms step_avg:152.11ms step:123/1480 train_time:17182ms step_avg:152.05ms step:124/1480 train_time:17329ms step_avg:152.01ms step:125/1480 train_time:17474ms step_avg:151.95ms step:125/1480 val_loss:4.4220 train_time:17538ms step_avg:152.51ms step:126/1480 train_time:17631ms step_avg:151.99ms step:127/1480 train_time:17773ms step_avg:151.91ms step:128/1480 train_time:17919ms step_avg:151.86ms step:129/1480 train_time:18064ms step_avg:151.80ms step:130/1480 train_time:18209ms step_avg:151.74ms step:131/1480 train_time:18356ms step_avg:151.70ms step:132/1480 train_time:18500ms step_avg:151.64ms step:133/1480 train_time:18645ms step_avg:151.59ms step:134/1480 train_time:18793ms step_avg:151.55ms step:135/1480 train_time:18939ms step_avg:151.52ms step:136/1480 train_time:19085ms step_avg:151.47ms step:137/1480 train_time:19231ms step_avg:151.43ms step:138/1480 train_time:19378ms step_avg:151.39ms step:139/1480 train_time:19522ms step_avg:151.34ms step:140/1480 train_time:19668ms step_avg:151.29ms step:141/1480 train_time:19814ms step_avg:151.25ms step:142/1480 train_time:19960ms step_avg:151.21ms step:143/1480 train_time:20105ms step_avg:151.17ms step:144/1480 train_time:20253ms step_avg:151.14ms step:145/1480 train_time:20399ms step_avg:151.10ms step:146/1480 train_time:20545ms step_avg:151.07ms step:147/1480 
train_time:20692ms step_avg:151.04ms step:148/1480 train_time:20838ms step_avg:151.00ms step:149/1480 train_time:20983ms step_avg:150.96ms step:150/1480 train_time:21129ms step_avg:150.92ms step:151/1480 train_time:21276ms step_avg:150.89ms step:152/1480 train_time:21420ms step_avg:150.85ms step:153/1480 train_time:21565ms step_avg:150.80ms step:154/1480 train_time:21711ms step_avg:150.77ms step:155/1480 train_time:21857ms step_avg:150.74ms step:156/1480 train_time:22002ms step_avg:150.70ms step:157/1480 train_time:22148ms step_avg:150.67ms step:158/1480 train_time:22295ms step_avg:150.64ms step:159/1480 train_time:22440ms step_avg:150.60ms step:160/1480 train_time:22585ms step_avg:150.57ms step:161/1480 train_time:22731ms step_avg:150.54ms step:162/1480 train_time:22877ms step_avg:150.51ms step:163/1480 train_time:23021ms step_avg:150.47ms step:164/1480 train_time:23168ms step_avg:150.44ms step:165/1480 train_time:23314ms step_avg:150.41ms step:166/1480 train_time:23459ms step_avg:150.38ms step:167/1480 train_time:23604ms step_avg:150.34ms step:168/1480 train_time:23750ms step_avg:150.32ms step:169/1480 train_time:23896ms step_avg:150.29ms step:170/1480 train_time:24041ms step_avg:150.26ms step:171/1480 train_time:24187ms step_avg:150.23ms step:172/1480 train_time:24333ms step_avg:150.20ms step:173/1480 train_time:24479ms step_avg:150.18ms step:174/1480 train_time:24625ms step_avg:150.15ms step:175/1480 train_time:24771ms step_avg:150.13ms step:176/1480 train_time:24917ms step_avg:150.10ms step:177/1480 train_time:25062ms step_avg:150.07ms step:178/1480 train_time:25207ms step_avg:150.04ms step:179/1480 train_time:25354ms step_avg:150.02ms step:180/1480 train_time:25499ms step_avg:149.99ms step:181/1480 train_time:25645ms step_avg:149.97ms step:182/1480 train_time:25792ms step_avg:149.95ms step:183/1480 train_time:26316ms step_avg:152.11ms step:184/1480 train_time:26420ms step_avg:151.84ms step:185/1480 train_time:26566ms step_avg:151.81ms step:186/1480 
train_time:26712ms step_avg:151.77ms step:187/1480 train_time:26858ms step_avg:151.74ms step:188/1480 train_time:27003ms step_avg:151.70ms step:189/1480 train_time:27167ms step_avg:151.77ms step:190/1480 train_time:27294ms step_avg:151.64ms step:191/1480 train_time:27441ms step_avg:151.61ms step:192/1480 train_time:27586ms step_avg:151.57ms step:193/1480 train_time:27733ms step_avg:151.55ms step:194/1480 train_time:27880ms step_avg:151.52ms step:195/1480 train_time:28027ms step_avg:151.50ms step:196/1480 train_time:28174ms step_avg:151.47ms step:197/1480 train_time:28320ms step_avg:151.44ms step:198/1480 train_time:28465ms step_avg:151.41ms step:199/1480 train_time:28611ms step_avg:151.38ms step:200/1480 train_time:28759ms step_avg:151.36ms step:201/1480 train_time:28905ms step_avg:151.34ms step:202/1480 train_time:29051ms step_avg:151.31ms step:203/1480 train_time:29197ms step_avg:151.28ms step:204/1480 train_time:29342ms step_avg:151.25ms step:205/1480 train_time:29488ms step_avg:151.22ms step:206/1480 train_time:29634ms step_avg:151.19ms step:207/1480 train_time:29780ms step_avg:151.17ms step:208/1480 train_time:29926ms step_avg:151.14ms step:209/1480 train_time:30071ms step_avg:151.11ms step:210/1480 train_time:30219ms step_avg:151.09ms step:211/1480 train_time:30363ms step_avg:151.06ms step:212/1480 train_time:30507ms step_avg:151.03ms step:213/1480 train_time:30654ms step_avg:151.01ms step:214/1480 train_time:30799ms step_avg:150.98ms step:215/1480 train_time:30945ms step_avg:150.95ms step:216/1480 train_time:31093ms step_avg:150.94ms step:217/1480 train_time:31239ms step_avg:150.92ms step:218/1480 train_time:31385ms step_avg:150.89ms step:219/1480 train_time:31533ms step_avg:150.87ms step:220/1480 train_time:31680ms step_avg:150.86ms step:221/1480 train_time:32214ms step_avg:152.67ms step:222/1480 train_time:32324ms step_avg:152.47ms step:223/1480 train_time:32471ms step_avg:152.45ms step:224/1480 train_time:32620ms step_avg:152.43ms step:225/1480 
train_time:32767ms step_avg:152.40ms step:226/1480 train_time:32915ms step_avg:152.38ms step:227/1480 train_time:33062ms step_avg:152.36ms step:228/1480 train_time:33213ms step_avg:152.35ms step:229/1480 train_time:33362ms step_avg:152.34ms step:230/1480 train_time:33510ms step_avg:152.32ms step:231/1480 train_time:33659ms step_avg:152.30ms step:232/1480 train_time:33807ms step_avg:152.28ms step:233/1480 train_time:33956ms step_avg:152.27ms step:234/1480 train_time:34103ms step_avg:152.25ms step:235/1480 train_time:34253ms step_avg:152.23ms step:236/1480 train_time:34401ms step_avg:152.22ms step:237/1480 train_time:34549ms step_avg:152.20ms step:238/1480 train_time:34697ms step_avg:152.18ms step:239/1480 train_time:34847ms step_avg:152.17ms step:240/1480 train_time:34995ms step_avg:152.15ms step:241/1480 train_time:35143ms step_avg:152.14ms step:242/1480 train_time:35292ms step_avg:152.12ms step:243/1480 train_time:35441ms step_avg:152.11ms step:244/1480 train_time:35589ms step_avg:152.09ms step:245/1480 train_time:35739ms step_avg:152.08ms step:246/1480 train_time:35887ms step_avg:152.06ms step:247/1480 train_time:36036ms step_avg:152.05ms step:248/1480 train_time:36183ms step_avg:152.03ms step:249/1480 train_time:36331ms step_avg:152.01ms step:250/1480 train_time:36479ms step_avg:152.00ms step:250/1480 val_loss:3.9886 train_time:36546ms step_avg:152.28ms step:251/1480 train_time:36639ms step_avg:152.03ms step:252/1480 train_time:36786ms step_avg:152.01ms step:253/1480 train_time:36934ms step_avg:151.99ms step:254/1480 train_time:37083ms step_avg:151.98ms step:255/1480 train_time:37230ms step_avg:151.96ms step:256/1480 train_time:37378ms step_avg:151.94ms step:257/1480 train_time:37526ms step_avg:151.93ms step:258/1480 train_time:37676ms step_avg:151.92ms step:259/1480 train_time:37825ms step_avg:151.91ms step:260/1480 train_time:37976ms step_avg:151.90ms step:261/1480 train_time:38125ms step_avg:151.89ms step:262/1480 train_time:38274ms step_avg:151.88ms 
step:263/1480 train_time:38422ms step_avg:151.87ms step:264/1480 train_time:38571ms step_avg:151.86ms step:265/1480 train_time:38721ms step_avg:151.85ms step:266/1480 train_time:38870ms step_avg:151.83ms step:267/1480 train_time:39019ms step_avg:151.82ms step:268/1480 train_time:39167ms step_avg:151.81ms step:269/1480 train_time:39316ms step_avg:151.80ms step:270/1480 train_time:39465ms step_avg:151.79ms step:271/1480 train_time:39612ms step_avg:151.77ms step:272/1480 train_time:39760ms step_avg:151.76ms step:273/1480 train_time:39909ms step_avg:151.75ms step:274/1480 train_time:40058ms step_avg:151.74ms step:275/1480 train_time:40207ms step_avg:151.73ms step:276/1480 train_time:40356ms step_avg:151.71ms step:277/1480 train_time:40505ms step_avg:151.70ms step:278/1480 train_time:40653ms step_avg:151.69ms step:279/1480 train_time:40802ms step_avg:151.68ms step:280/1480 train_time:40950ms step_avg:151.67ms step:281/1480 train_time:41098ms step_avg:151.65ms step:282/1480 train_time:41247ms step_avg:151.64ms step:283/1480 train_time:41395ms step_avg:151.63ms step:284/1480 train_time:41545ms step_avg:151.62ms step:285/1480 train_time:41692ms step_avg:151.61ms step:286/1480 train_time:41841ms step_avg:151.60ms step:287/1480 train_time:41989ms step_avg:151.59ms step:288/1480 train_time:42137ms step_avg:151.57ms step:289/1480 train_time:42286ms step_avg:151.56ms step:290/1480 train_time:42435ms step_avg:151.55ms step:291/1480 train_time:42584ms step_avg:151.54ms step:292/1480 train_time:42732ms step_avg:151.53ms step:293/1480 train_time:42882ms step_avg:151.53ms step:294/1480 train_time:43031ms step_avg:151.52ms step:295/1480 train_time:43180ms step_avg:151.51ms step:296/1480 train_time:43328ms step_avg:151.50ms step:297/1480 train_time:43477ms step_avg:151.49ms step:298/1480 train_time:43625ms step_avg:151.48ms step:299/1480 train_time:43776ms step_avg:151.48ms step:300/1480 train_time:43925ms step_avg:151.47ms step:301/1480 train_time:44074ms step_avg:151.46ms 
step:302/1480 train_time:44223ms step_avg:151.45ms step:303/1480 train_time:44370ms step_avg:151.43ms step:304/1480 train_time:44519ms step_avg:151.42ms step:305/1480 train_time:44668ms step_avg:151.42ms step:306/1480 train_time:44816ms step_avg:151.41ms step:307/1480 train_time:44966ms step_avg:151.40ms step:308/1480 train_time:45113ms step_avg:151.39ms step:309/1480 train_time:45262ms step_avg:151.38ms step:310/1480 train_time:45410ms step_avg:151.37ms step:311/1480 train_time:45559ms step_avg:151.36ms step:312/1480 train_time:45708ms step_avg:151.35ms step:313/1480 train_time:45857ms step_avg:151.34ms step:314/1480 train_time:46005ms step_avg:151.33ms step:315/1480 train_time:46154ms step_avg:151.32ms step:316/1480 train_time:46303ms step_avg:151.32ms step:317/1480 train_time:46450ms step_avg:151.30ms step:318/1480 train_time:46598ms step_avg:151.29ms step:319/1480 train_time:46747ms step_avg:151.29ms step:320/1480 train_time:46895ms step_avg:151.27ms step:321/1480 train_time:47044ms step_avg:151.27ms step:322/1480 train_time:47192ms step_avg:151.26ms step:323/1480 train_time:47341ms step_avg:151.25ms step:324/1480 train_time:47489ms step_avg:151.24ms step:325/1480 train_time:47638ms step_avg:151.23ms step:326/1480 train_time:47788ms step_avg:151.23ms step:327/1480 train_time:47935ms step_avg:151.22ms step:328/1480 train_time:48085ms step_avg:151.21ms step:329/1480 train_time:48233ms step_avg:151.20ms step:330/1480 train_time:48384ms step_avg:151.20ms step:331/1480 train_time:48534ms step_avg:151.20ms step:332/1480 train_time:48685ms step_avg:151.20ms step:333/1480 train_time:48835ms step_avg:151.19ms step:334/1480 train_time:48987ms step_avg:151.19ms step:335/1480 train_time:49138ms step_avg:151.19ms step:336/1480 train_time:49289ms step_avg:151.19ms step:337/1480 train_time:49440ms step_avg:151.19ms step:338/1480 train_time:49590ms step_avg:151.19ms step:339/1480 train_time:49741ms step_avg:151.19ms step:340/1480 train_time:49891ms step_avg:151.19ms 
step:341/1480 train_time:50041ms step_avg:151.18ms step:342/1480 train_time:50191ms step_avg:151.18ms step:343/1480 train_time:50344ms step_avg:151.18ms step:344/1480 train_time:50494ms step_avg:151.18ms step:345/1480 train_time:50646ms step_avg:151.18ms step:346/1480 train_time:50796ms step_avg:151.18ms step:347/1480 train_time:50947ms step_avg:151.18ms step:348/1480 train_time:51097ms step_avg:151.18ms step:349/1480 train_time:51249ms step_avg:151.18ms step:350/1480 train_time:51398ms step_avg:151.17ms step:351/1480 train_time:51549ms step_avg:151.17ms step:352/1480 train_time:51701ms step_avg:151.17ms step:353/1480 train_time:51851ms step_avg:151.17ms step:354/1480 train_time:52004ms step_avg:151.17ms step:355/1480 train_time:52154ms step_avg:151.17ms step:356/1480 train_time:52306ms step_avg:151.17ms step:357/1480 train_time:52457ms step_avg:151.17ms step:358/1480 train_time:52608ms step_avg:151.17ms step:359/1480 train_time:52759ms step_avg:151.17ms step:360/1480 train_time:52911ms step_avg:151.17ms step:361/1480 train_time:53063ms step_avg:151.18ms step:362/1480 train_time:53213ms step_avg:151.17ms step:363/1480 train_time:53364ms step_avg:151.17ms step:364/1480 train_time:53514ms step_avg:151.17ms step:365/1480 train_time:53666ms step_avg:151.17ms step:366/1480 train_time:53816ms step_avg:151.17ms step:367/1480 train_time:53967ms step_avg:151.17ms step:368/1480 train_time:54118ms step_avg:151.17ms step:369/1480 train_time:54269ms step_avg:151.17ms step:370/1480 train_time:54419ms step_avg:151.17ms step:371/1480 train_time:54569ms step_avg:151.16ms step:372/1480 train_time:54720ms step_avg:151.16ms step:373/1480 train_time:54871ms step_avg:151.16ms step:374/1480 train_time:55021ms step_avg:151.16ms step:375/1480 train_time:55172ms step_avg:151.16ms step:375/1480 val_loss:3.8025 train_time:55240ms step_avg:151.34ms step:376/1480 train_time:55335ms step_avg:151.19ms step:377/1480 train_time:55481ms step_avg:151.17ms step:378/1480 train_time:55632ms 
step_avg:151.17ms step:379/1480 train_time:55805ms step_avg:151.23ms step:380/1480 train_time:55932ms step_avg:151.17ms step:381/1480 train_time:56083ms step_avg:151.17ms step:382/1480 train_time:56233ms step_avg:151.16ms step:383/1480 train_time:56385ms step_avg:151.17ms step:384/1480 train_time:56536ms step_avg:151.16ms step:385/1480 train_time:56687ms step_avg:151.17ms step:386/1480 train_time:56838ms step_avg:151.17ms step:387/1480 train_time:56989ms step_avg:151.16ms step:388/1480 train_time:57140ms step_avg:151.16ms step:389/1480 train_time:57292ms step_avg:151.17ms step:390/1480 train_time:57443ms step_avg:151.17ms step:391/1480 train_time:57595ms step_avg:151.17ms step:392/1480 train_time:57746ms step_avg:151.17ms step:393/1480 train_time:57897ms step_avg:151.17ms step:394/1480 train_time:58047ms step_avg:151.16ms step:395/1480 train_time:58199ms step_avg:151.17ms step:396/1480 train_time:58350ms step_avg:151.17ms step:397/1480 train_time:58501ms step_avg:151.17ms step:398/1480 train_time:58652ms step_avg:151.17ms step:399/1480 train_time:58803ms step_avg:151.16ms step:400/1480 train_time:58953ms step_avg:151.16ms step:401/1480 train_time:59105ms step_avg:151.16ms step:402/1480 train_time:59256ms step_avg:151.16ms step:403/1480 train_time:59406ms step_avg:151.16ms step:404/1480 train_time:59557ms step_avg:151.16ms step:405/1480 train_time:59707ms step_avg:151.16ms step:406/1480 train_time:59858ms step_avg:151.16ms step:407/1480 train_time:60008ms step_avg:151.15ms step:408/1480 train_time:60159ms step_avg:151.15ms step:409/1480 train_time:60308ms step_avg:151.15ms step:410/1480 train_time:60460ms step_avg:151.15ms step:411/1480 train_time:60609ms step_avg:151.15ms step:412/1480 train_time:60761ms step_avg:151.15ms step:413/1480 train_time:60911ms step_avg:151.14ms step:414/1480 train_time:61062ms step_avg:151.14ms step:415/1480 train_time:61213ms step_avg:151.14ms step:416/1480 train_time:61364ms step_avg:151.14ms step:417/1480 train_time:61516ms 
step_avg:151.14ms step:418/1480 train_time:61666ms step_avg:151.14ms step:419/1480 train_time:61818ms step_avg:151.14ms step:420/1480 train_time:61968ms step_avg:151.14ms step:421/1480 train_time:62119ms step_avg:151.14ms step:422/1480 train_time:62271ms step_avg:151.14ms step:423/1480 train_time:62422ms step_avg:151.14ms step:424/1480 train_time:62573ms step_avg:151.14ms step:425/1480 train_time:62724ms step_avg:151.14ms step:426/1480 train_time:62876ms step_avg:151.14ms step:427/1480 train_time:63027ms step_avg:151.14ms step:428/1480 train_time:63178ms step_avg:151.14ms step:429/1480 train_time:63328ms step_avg:151.14ms step:430/1480 train_time:63479ms step_avg:151.14ms step:431/1480 train_time:63630ms step_avg:151.14ms step:432/1480 train_time:63781ms step_avg:151.14ms step:433/1480 train_time:63931ms step_avg:151.14ms step:434/1480 train_time:64082ms step_avg:151.14ms step:435/1480 train_time:64232ms step_avg:151.13ms step:436/1480 train_time:64383ms step_avg:151.13ms step:437/1480 train_time:64534ms step_avg:151.13ms step:438/1480 train_time:64684ms step_avg:151.13ms step:439/1480 train_time:64836ms step_avg:151.13ms step:440/1480 train_time:64988ms step_avg:151.14ms step:441/1480 train_time:65141ms step_avg:151.14ms step:442/1480 train_time:65295ms step_avg:151.15ms step:443/1480 train_time:65449ms step_avg:151.15ms step:444/1480 train_time:65602ms step_avg:151.16ms step:445/1480 train_time:65755ms step_avg:151.16ms step:446/1480 train_time:65906ms step_avg:151.16ms step:447/1480 train_time:66059ms step_avg:151.16ms step:448/1480 train_time:66211ms step_avg:151.17ms step:449/1480 train_time:66364ms step_avg:151.17ms step:450/1480 train_time:66517ms step_avg:151.18ms step:451/1480 train_time:66670ms step_avg:151.18ms step:452/1480 train_time:66823ms step_avg:151.18ms step:453/1480 train_time:66977ms step_avg:151.19ms step:454/1480 train_time:67129ms step_avg:151.19ms step:455/1480 train_time:67282ms step_avg:151.19ms step:456/1480 train_time:67434ms 
step_avg:151.20ms step:457/1480 train_time:67587ms step_avg:151.20ms step:458/1480 train_time:67740ms step_avg:151.21ms step:459/1480 train_time:67896ms step_avg:151.22ms step:460/1480 train_time:68049ms step_avg:151.22ms step:461/1480 train_time:68202ms step_avg:151.22ms step:462/1480 train_time:68355ms step_avg:151.23ms step:463/1480 train_time:68507ms step_avg:151.23ms step:464/1480 train_time:68661ms step_avg:151.23ms step:465/1480 train_time:68812ms step_avg:151.24ms step:466/1480 train_time:68965ms step_avg:151.24ms step:467/1480 train_time:69119ms step_avg:151.24ms step:468/1480 train_time:69271ms step_avg:151.25ms step:469/1480 train_time:69424ms step_avg:151.25ms step:470/1480 train_time:69577ms step_avg:151.25ms step:471/1480 train_time:69729ms step_avg:151.26ms step:472/1480 train_time:69882ms step_avg:151.26ms step:473/1480 train_time:70035ms step_avg:151.26ms step:474/1480 train_time:70187ms step_avg:151.26ms step:475/1480 train_time:70340ms step_avg:151.27ms step:476/1480 train_time:70493ms step_avg:151.27ms step:477/1480 train_time:70646ms step_avg:151.28ms step:478/1480 train_time:70801ms step_avg:151.28ms step:479/1480 train_time:70954ms step_avg:151.29ms step:480/1480 train_time:71106ms step_avg:151.29ms step:481/1480 train_time:71259ms step_avg:151.29ms step:482/1480 train_time:71412ms step_avg:151.30ms step:483/1480 train_time:71564ms step_avg:151.30ms step:484/1480 train_time:71718ms step_avg:151.30ms step:485/1480 train_time:71870ms step_avg:151.31ms step:486/1480 train_time:72024ms step_avg:151.31ms step:487/1480 train_time:72176ms step_avg:151.31ms step:488/1480 train_time:72328ms step_avg:151.31ms step:489/1480 train_time:72481ms step_avg:151.32ms step:490/1480 train_time:72634ms step_avg:151.32ms step:491/1480 train_time:72787ms step_avg:151.32ms step:492/1480 train_time:72940ms step_avg:151.33ms step:493/1480 train_time:73094ms step_avg:151.33ms step:494/1480 train_time:73247ms step_avg:151.34ms step:495/1480 train_time:73401ms 
step_avg:151.34ms step:496/1480 train_time:73554ms step_avg:151.35ms step:497/1480 train_time:73707ms step_avg:151.35ms step:498/1480 train_time:73860ms step_avg:151.35ms step:499/1480 train_time:74013ms step_avg:151.36ms step:500/1480 train_time:74166ms step_avg:151.36ms step:500/1480 val_loss:3.6773 train_time:74237ms step_avg:151.50ms step:501/1480 train_time:74329ms step_avg:151.38ms step:502/1480 train_time:74480ms step_avg:151.38ms step:503/1480 train_time:74633ms step_avg:151.39ms step:504/1480 train_time:74785ms step_avg:151.39ms step:505/1480 train_time:74938ms step_avg:151.39ms step:506/1480 train_time:75090ms step_avg:151.39ms step:507/1480 train_time:75242ms step_avg:151.39ms step:508/1480 train_time:75397ms step_avg:151.40ms step:509/1480 train_time:75550ms step_avg:151.40ms step:510/1480 train_time:75704ms step_avg:151.41ms step:511/1480 train_time:75858ms step_avg:151.41ms step:512/1480 train_time:76011ms step_avg:151.42ms step:513/1480 train_time:76164ms step_avg:151.42ms step:514/1480 train_time:76318ms step_avg:151.42ms step:515/1480 train_time:76472ms step_avg:151.43ms step:516/1480 train_time:76625ms step_avg:151.43ms step:517/1480 train_time:76780ms step_avg:151.44ms step:518/1480 train_time:76932ms step_avg:151.44ms step:519/1480 train_time:77085ms step_avg:151.44ms step:520/1480 train_time:77237ms step_avg:151.45ms step:521/1480 train_time:77390ms step_avg:151.45ms step:522/1480 train_time:77543ms step_avg:151.45ms step:523/1480 train_time:77697ms step_avg:151.46ms step:524/1480 train_time:77849ms step_avg:151.46ms step:525/1480 train_time:78002ms step_avg:151.46ms step:526/1480 train_time:78155ms step_avg:151.46ms step:527/1480 train_time:78306ms step_avg:151.46ms step:528/1480 train_time:78459ms step_avg:151.47ms step:529/1480 train_time:78611ms step_avg:151.47ms step:530/1480 train_time:78764ms step_avg:151.47ms step:531/1480 train_time:78918ms step_avg:151.47ms step:532/1480 train_time:79071ms step_avg:151.48ms step:533/1480 
train_time:79223ms step_avg:151.48ms step:534/1480 train_time:79377ms step_avg:151.48ms step:535/1480 train_time:79529ms step_avg:151.48ms step:536/1480 train_time:79682ms step_avg:151.49ms step:537/1480 train_time:79835ms step_avg:151.49ms step:538/1480 train_time:79989ms step_avg:151.49ms step:539/1480 train_time:80143ms step_avg:151.50ms step:540/1480 train_time:80297ms step_avg:151.50ms step:541/1480 train_time:80449ms step_avg:151.51ms step:542/1480 train_time:80602ms step_avg:151.51ms step:543/1480 train_time:80755ms step_avg:151.51ms step:544/1480 train_time:80906ms step_avg:151.51ms step:545/1480 train_time:81059ms step_avg:151.51ms step:546/1480 train_time:81214ms step_avg:151.52ms step:547/1480 train_time:81366ms step_avg:151.52ms step:548/1480 train_time:81520ms step_avg:151.52ms step:549/1480 train_time:81673ms step_avg:151.53ms step:550/1480 train_time:81827ms step_avg:151.53ms step:551/1480 train_time:81981ms step_avg:151.54ms step:552/1480 train_time:82135ms step_avg:151.54ms step:553/1480 train_time:82290ms step_avg:151.55ms step:554/1480 train_time:82445ms step_avg:151.55ms step:555/1480 train_time:82601ms step_avg:151.56ms step:556/1480 train_time:82755ms step_avg:151.57ms step:557/1480 train_time:82910ms step_avg:151.57ms step:558/1480 train_time:83064ms step_avg:151.58ms step:559/1480 train_time:83219ms step_avg:151.58ms step:560/1480 train_time:83374ms step_avg:151.59ms step:561/1480 train_time:83529ms step_avg:151.60ms step:562/1480 train_time:83683ms step_avg:151.60ms step:563/1480 train_time:83838ms step_avg:151.61ms step:564/1480 train_time:83996ms step_avg:151.62ms step:565/1480 train_time:84152ms step_avg:151.62ms step:566/1480 train_time:84307ms step_avg:151.63ms step:567/1480 train_time:84461ms step_avg:151.63ms step:568/1480 train_time:84616ms step_avg:151.64ms step:569/1480 train_time:84791ms step_avg:151.68ms step:570/1480 train_time:84925ms step_avg:151.65ms step:571/1480 train_time:85080ms step_avg:151.66ms step:572/1480 
train_time:85234ms step_avg:151.66ms step:573/1480 train_time:85389ms step_avg:151.67ms step:574/1480 train_time:85546ms step_avg:151.68ms step:575/1480 train_time:85700ms step_avg:151.68ms step:576/1480 train_time:85854ms step_avg:151.69ms step:577/1480 train_time:86009ms step_avg:151.69ms step:578/1480 train_time:86163ms step_avg:151.70ms step:579/1480 train_time:86317ms step_avg:151.70ms step:580/1480 train_time:86471ms step_avg:151.70ms step:581/1480 train_time:86625ms step_avg:151.71ms step:582/1480 train_time:86780ms step_avg:151.71ms step:583/1480 train_time:86934ms step_avg:151.72ms step:584/1480 train_time:87089ms step_avg:151.72ms step:585/1480 train_time:87244ms step_avg:151.73ms step:586/1480 train_time:87399ms step_avg:151.73ms step:587/1480 train_time:87553ms step_avg:151.74ms step:588/1480 train_time:87707ms step_avg:151.74ms step:589/1480 train_time:87862ms step_avg:151.75ms step:590/1480 train_time:88017ms step_avg:151.75ms step:591/1480 train_time:88172ms step_avg:151.76ms step:592/1480 train_time:88327ms step_avg:151.76ms step:593/1480 train_time:88482ms step_avg:151.77ms step:594/1480 train_time:88636ms step_avg:151.77ms step:595/1480 train_time:88791ms step_avg:151.78ms step:596/1480 train_time:88947ms step_avg:151.79ms step:597/1480 train_time:89102ms step_avg:151.79ms step:598/1480 train_time:89256ms step_avg:151.80ms step:599/1480 train_time:89409ms step_avg:151.80ms step:600/1480 train_time:89564ms step_avg:151.80ms step:601/1480 train_time:89719ms step_avg:151.81ms step:602/1480 train_time:89874ms step_avg:151.81ms step:603/1480 train_time:90029ms step_avg:151.82ms step:604/1480 train_time:90183ms step_avg:151.82ms step:605/1480 train_time:90337ms step_avg:151.83ms step:606/1480 train_time:90494ms step_avg:151.83ms step:607/1480 train_time:90649ms step_avg:151.84ms step:608/1480 train_time:90805ms step_avg:151.85ms step:609/1480 train_time:90961ms step_avg:151.85ms step:610/1480 train_time:91114ms step_avg:151.86ms step:611/1480 
train_time:91268ms step_avg:151.86ms step:612/1480 train_time:91423ms step_avg:151.87ms step:613/1480 train_time:91579ms step_avg:151.87ms step:614/1480 train_time:91734ms step_avg:151.88ms step:615/1480 train_time:91889ms step_avg:151.88ms step:616/1480 train_time:92042ms step_avg:151.89ms step:617/1480 train_time:92198ms step_avg:151.89ms step:618/1480 train_time:92352ms step_avg:151.89ms step:619/1480 train_time:92506ms step_avg:151.90ms step:620/1480 train_time:92661ms step_avg:151.90ms step:621/1480 train_time:92817ms step_avg:151.91ms step:622/1480 train_time:92971ms step_avg:151.91ms step:623/1480 train_time:93127ms step_avg:151.92ms step:624/1480 train_time:93282ms step_avg:151.92ms step:625/1480 train_time:93435ms step_avg:151.93ms step:625/1480 val_loss:3.5984 train_time:93506ms step_avg:152.04ms step:626/1480 train_time:93602ms step_avg:151.95ms step:627/1480 train_time:93751ms step_avg:151.95ms step:628/1480 train_time:93904ms step_avg:151.95ms step:629/1480 train_time:94058ms step_avg:151.95ms step:630/1480 train_time:94212ms step_avg:151.95ms step:631/1480 train_time:94366ms step_avg:151.96ms step:632/1480 train_time:94520ms step_avg:151.96ms step:633/1480 train_time:94675ms step_avg:151.97ms step:634/1480 train_time:94828ms step_avg:151.97ms step:635/1480 train_time:94984ms step_avg:151.97ms step:636/1480 train_time:95139ms step_avg:151.98ms step:637/1480 train_time:95294ms step_avg:151.98ms step:638/1480 train_time:95449ms step_avg:151.99ms step:639/1480 train_time:95603ms step_avg:151.99ms step:640/1480 train_time:95758ms step_avg:152.00ms step:641/1480 train_time:95912ms step_avg:152.00ms step:642/1480 train_time:96067ms step_avg:152.00ms step:643/1480 train_time:96221ms step_avg:152.01ms step:644/1480 train_time:96377ms step_avg:152.01ms step:645/1480 train_time:96531ms step_avg:152.02ms step:646/1480 train_time:96687ms step_avg:152.02ms step:647/1480 train_time:96842ms step_avg:152.03ms step:648/1480 train_time:96997ms step_avg:152.03ms 
step:649/1480 train_time:97151ms step_avg:152.04ms step:650/1480 train_time:97307ms step_avg:152.04ms step:651/1480 train_time:97463ms step_avg:152.05ms step:652/1480 train_time:97618ms step_avg:152.05ms step:653/1480 train_time:97772ms step_avg:152.06ms step:654/1480 train_time:97927ms step_avg:152.06ms step:655/1480 train_time:98080ms step_avg:152.06ms step:656/1480 train_time:98233ms step_avg:152.06ms step:657/1480 train_time:98389ms step_avg:152.07ms step:658/1480 train_time:98543ms step_avg:152.07ms step:659/1480 train_time:98698ms step_avg:152.08ms step:660/1480 train_time:98855ms step_avg:152.08ms step:661/1480 train_time:99011ms step_avg:152.09ms step:662/1480 train_time:99167ms step_avg:152.10ms step:663/1480 train_time:99322ms step_avg:152.10ms step:664/1480 train_time:99478ms step_avg:152.11ms step:665/1480 train_time:99635ms step_avg:152.11ms step:666/1480 train_time:99791ms step_avg:152.12ms step:667/1480 train_time:99948ms step_avg:152.13ms step:668/1480 train_time:100104ms step_avg:152.13ms step:669/1480 train_time:100261ms step_avg:152.14ms step:670/1480 train_time:100416ms step_avg:152.14ms step:671/1480 train_time:100572ms step_avg:152.15ms step:672/1480 train_time:100727ms step_avg:152.16ms step:673/1480 train_time:100885ms step_avg:152.16ms step:674/1480 train_time:101043ms step_avg:152.17ms step:675/1480 train_time:101200ms step_avg:152.18ms step:676/1480 train_time:101357ms step_avg:152.19ms step:677/1480 train_time:101513ms step_avg:152.19ms step:678/1480 train_time:101670ms step_avg:152.20ms step:679/1480 train_time:101826ms step_avg:152.21ms step:680/1480 train_time:101984ms step_avg:152.21ms step:681/1480 train_time:102141ms step_avg:152.22ms step:682/1480 train_time:102297ms step_avg:152.23ms step:683/1480 train_time:102454ms step_avg:152.23ms step:684/1480 train_time:102610ms step_avg:152.24ms step:685/1480 train_time:102766ms step_avg:152.25ms step:686/1480 train_time:102923ms step_avg:152.25ms step:687/1480 train_time:103080ms 
step_avg:152.26ms step:688/1480 train_time:103238ms step_avg:152.27ms step:689/1480 train_time:103396ms step_avg:152.28ms step:690/1480 train_time:103553ms step_avg:152.28ms step:691/1480 train_time:103709ms step_avg:152.29ms step:692/1480 train_time:103866ms step_avg:152.30ms step:693/1480 train_time:104023ms step_avg:152.30ms step:694/1480 train_time:104181ms step_avg:152.31ms step:695/1480 train_time:104336ms step_avg:152.32ms step:696/1480 train_time:104492ms step_avg:152.32ms step:697/1480 train_time:104649ms step_avg:152.33ms step:698/1480 train_time:104804ms step_avg:152.33ms step:699/1480 train_time:104960ms step_avg:152.34ms step:700/1480 train_time:105116ms step_avg:152.34ms step:701/1480 train_time:105272ms step_avg:152.35ms step:702/1480 train_time:105429ms step_avg:152.35ms step:703/1480 train_time:105587ms step_avg:152.36ms step:704/1480 train_time:105744ms step_avg:152.37ms step:705/1480 train_time:105899ms step_avg:152.37ms step:706/1480 train_time:106056ms step_avg:152.38ms step:707/1480 train_time:106212ms step_avg:152.38ms step:708/1480 train_time:106369ms step_avg:152.39ms step:709/1480 train_time:106525ms step_avg:152.40ms step:710/1480 train_time:106681ms step_avg:152.40ms step:711/1480 train_time:106837ms step_avg:152.41ms step:712/1480 train_time:106993ms step_avg:152.41ms step:713/1480 train_time:107151ms step_avg:152.42ms step:714/1480 train_time:107308ms step_avg:152.43ms step:715/1480 train_time:107463ms step_avg:152.43ms step:716/1480 train_time:107618ms step_avg:152.43ms step:717/1480 train_time:107774ms step_avg:152.44ms step:718/1480 train_time:107930ms step_avg:152.44ms step:719/1480 train_time:108087ms step_avg:152.45ms step:720/1480 train_time:108244ms step_avg:152.46ms step:721/1480 train_time:108402ms step_avg:152.46ms step:722/1480 train_time:108559ms step_avg:152.47ms step:723/1480 train_time:108714ms step_avg:152.47ms step:724/1480 train_time:108871ms step_avg:152.48ms step:725/1480 train_time:109027ms step_avg:152.49ms 
step:726/1480 train_time:109182ms step_avg:152.49ms step:727/1480 train_time:109339ms step_avg:152.50ms step:728/1480 train_time:109495ms step_avg:152.50ms step:729/1480 train_time:109651ms step_avg:152.51ms step:730/1480 train_time:109809ms step_avg:152.51ms step:731/1480 train_time:109965ms step_avg:152.52ms step:732/1480 train_time:110122ms step_avg:152.52ms step:733/1480 train_time:110278ms step_avg:152.53ms step:734/1480 train_time:110435ms step_avg:152.53ms step:735/1480 train_time:110591ms step_avg:152.54ms step:736/1480 train_time:110749ms step_avg:152.55ms step:737/1480 train_time:110903ms step_avg:152.55ms step:738/1480 train_time:111059ms step_avg:152.55ms step:739/1480 train_time:111215ms step_avg:152.56ms step:740/1480 train_time:111373ms step_avg:152.57ms step:741/1480 train_time:111530ms step_avg:152.57ms step:742/1480 train_time:111686ms step_avg:152.58ms step:743/1480 train_time:111843ms step_avg:152.58ms step:744/1480 train_time:112000ms step_avg:152.59ms step:745/1480 train_time:112158ms step_avg:152.60ms step:746/1480 train_time:112314ms step_avg:152.60ms step:747/1480 train_time:112471ms step_avg:152.61ms step:748/1480 train_time:112629ms step_avg:152.61ms step:749/1480 train_time:112786ms step_avg:152.62ms step:750/1480 train_time:112942ms step_avg:152.62ms step:750/1480 val_loss:3.5443 train_time:113014ms step_avg:152.72ms step:751/1480 train_time:113110ms step_avg:152.64ms step:752/1480 train_time:113261ms step_avg:152.64ms step:753/1480 train_time:113417ms step_avg:152.65ms step:754/1480 train_time:113573ms step_avg:152.65ms step:755/1480 train_time:113729ms step_avg:152.66ms step:756/1480 train_time:113884ms step_avg:152.66ms step:757/1480 train_time:114042ms step_avg:152.67ms step:758/1480 train_time:114198ms step_avg:152.67ms step:759/1480 train_time:114376ms step_avg:152.71ms step:760/1480 train_time:114512ms step_avg:152.68ms step:761/1480 train_time:114669ms step_avg:152.69ms step:762/1480 train_time:114825ms step_avg:152.69ms 
step:763/1480 train_time:114983ms step_avg:152.70ms step:764/1480 train_time:115140ms step_avg:152.71ms step:765/1480 train_time:115298ms step_avg:152.71ms step:766/1480 train_time:115455ms step_avg:152.72ms step:767/1480 train_time:115612ms step_avg:152.72ms step:768/1480 train_time:115769ms step_avg:152.73ms step:769/1480 train_time:115926ms step_avg:152.74ms step:770/1480 train_time:116084ms step_avg:152.74ms step:771/1480 train_time:116242ms step_avg:152.75ms step:772/1480 train_time:116400ms step_avg:152.76ms step:773/1480 train_time:116558ms step_avg:152.76ms step:774/1480 train_time:116715ms step_avg:152.77ms step:775/1480 train_time:116873ms step_avg:152.78ms step:776/1480 train_time:117032ms step_avg:152.78ms step:777/1480 train_time:117191ms step_avg:152.79ms step:778/1480 train_time:117350ms step_avg:152.80ms step:779/1480 train_time:117506ms step_avg:152.80ms step:780/1480 train_time:117666ms step_avg:152.81ms step:781/1480 train_time:117824ms step_avg:152.82ms step:782/1480 train_time:117982ms step_avg:152.83ms step:783/1480 train_time:118140ms step_avg:152.83ms step:784/1480 train_time:118297ms step_avg:152.84ms step:785/1480 train_time:118455ms step_avg:152.85ms step:786/1480 train_time:118612ms step_avg:152.85ms step:787/1480 train_time:118769ms step_avg:152.86ms step:788/1480 train_time:118929ms step_avg:152.87ms step:789/1480 train_time:119086ms step_avg:152.87ms step:790/1480 train_time:119244ms step_avg:152.88ms step:791/1480 train_time:119405ms step_avg:152.89ms step:792/1480 train_time:119565ms step_avg:152.90ms step:793/1480 train_time:119725ms step_avg:152.91ms step:794/1480 train_time:119883ms step_avg:152.91ms step:795/1480 train_time:120043ms step_avg:152.92ms step:796/1480 train_time:120203ms step_avg:152.93ms step:797/1480 train_time:120362ms step_avg:152.94ms step:798/1480 train_time:120521ms step_avg:152.95ms step:799/1480 train_time:120681ms step_avg:152.95ms step:800/1480 train_time:120841ms step_avg:152.96ms step:801/1480 
train_time:120998ms step_avg:152.97ms step:802/1480 train_time:121159ms step_avg:152.98ms step:803/1480 train_time:121317ms step_avg:152.98ms step:804/1480 train_time:121473ms step_avg:152.99ms step:805/1480 train_time:121633ms step_avg:153.00ms step:806/1480 train_time:121790ms step_avg:153.00ms step:807/1480 train_time:121947ms step_avg:153.01ms step:808/1480 train_time:122105ms step_avg:153.01ms step:809/1480 train_time:122265ms step_avg:153.02ms step:810/1480 train_time:122424ms step_avg:153.03ms step:811/1480 train_time:122581ms step_avg:153.03ms step:812/1480 train_time:122739ms step_avg:153.04ms step:813/1480 train_time:122896ms step_avg:153.05ms step:814/1480 train_time:123053ms step_avg:153.05ms step:815/1480 train_time:123210ms step_avg:153.06ms step:816/1480 train_time:123369ms step_avg:153.06ms step:817/1480 train_time:123528ms step_avg:153.07ms step:818/1480 train_time:123684ms step_avg:153.07ms step:819/1480 train_time:123844ms step_avg:153.08ms step:820/1480 train_time:124002ms step_avg:153.09ms step:821/1480 train_time:124160ms step_avg:153.09ms step:822/1480 train_time:124316ms step_avg:153.10ms step:823/1480 train_time:124474ms step_avg:153.10ms step:824/1480 train_time:124630ms step_avg:153.11ms step:825/1480 train_time:124790ms step_avg:153.12ms step:826/1480 train_time:124950ms step_avg:153.12ms step:827/1480 train_time:125108ms step_avg:153.13ms step:828/1480 train_time:125268ms step_avg:153.14ms step:829/1480 train_time:125426ms step_avg:153.15ms step:830/1480 train_time:125586ms step_avg:153.15ms step:831/1480 train_time:125744ms step_avg:153.16ms step:832/1480 train_time:125902ms step_avg:153.17ms step:833/1480 train_time:126060ms step_avg:153.17ms step:834/1480 train_time:126220ms step_avg:153.18ms step:835/1480 train_time:126377ms step_avg:153.18ms step:836/1480 train_time:126535ms step_avg:153.19ms step:837/1480 train_time:126692ms step_avg:153.19ms step:838/1480 train_time:126850ms step_avg:153.20ms step:839/1480 train_time:127007ms 
step_avg:153.20ms step:840/1480 train_time:127166ms step_avg:153.21ms step:841/1480 train_time:127322ms step_avg:153.22ms step:842/1480 train_time:127479ms step_avg:153.22ms step:843/1480 train_time:127637ms step_avg:153.23ms step:844/1480 train_time:127793ms step_avg:153.23ms step:845/1480 train_time:127951ms step_avg:153.23ms step:846/1480 train_time:128110ms step_avg:153.24ms step:847/1480 train_time:128269ms step_avg:153.25ms step:848/1480 train_time:128428ms step_avg:153.25ms step:849/1480 train_time:128586ms step_avg:153.26ms step:850/1480 train_time:128744ms step_avg:153.27ms step:851/1480 train_time:128903ms step_avg:153.27ms step:852/1480 train_time:129062ms step_avg:153.28ms step:853/1480 train_time:129220ms step_avg:153.29ms step:854/1480 train_time:129377ms step_avg:153.29ms step:855/1480 train_time:129536ms step_avg:153.30ms step:856/1480 train_time:129693ms step_avg:153.30ms step:857/1480 train_time:129851ms step_avg:153.31ms step:858/1480 train_time:130011ms step_avg:153.32ms step:859/1480 train_time:130170ms step_avg:153.32ms step:860/1480 train_time:130328ms step_avg:153.33ms step:861/1480 train_time:130487ms step_avg:153.33ms step:862/1480 train_time:130647ms step_avg:153.34ms step:863/1480 train_time:130806ms step_avg:153.35ms step:864/1480 train_time:130966ms step_avg:153.36ms step:865/1480 train_time:131124ms step_avg:153.36ms step:866/1480 train_time:131282ms step_avg:153.37ms step:867/1480 train_time:131442ms step_avg:153.37ms step:868/1480 train_time:131600ms step_avg:153.38ms step:869/1480 train_time:131758ms step_avg:153.39ms step:870/1480 train_time:131915ms step_avg:153.39ms step:871/1480 train_time:132072ms step_avg:153.39ms step:872/1480 train_time:132230ms step_avg:153.40ms step:873/1480 train_time:132387ms step_avg:153.40ms step:874/1480 train_time:132548ms step_avg:153.41ms step:875/1480 train_time:132707ms step_avg:153.42ms step:875/1480 val_loss:3.4998 train_time:132781ms step_avg:153.50ms step:876/1480 train_time:132876ms 
step_avg:153.44ms step:877/1480 train_time:133027ms step_avg:153.43ms step:878/1480 train_time:133185ms step_avg:153.44ms step:879/1480 train_time:133343ms step_avg:153.44ms step:880/1480 train_time:133502ms step_avg:153.45ms step:881/1480 train_time:133660ms step_avg:153.46ms step:882/1480 train_time:133819ms step_avg:153.46ms step:883/1480 train_time:133980ms step_avg:153.47ms step:884/1480 train_time:134142ms step_avg:153.48ms step:885/1480 train_time:134303ms step_avg:153.49ms step:886/1480 train_time:134463ms step_avg:153.50ms step:887/1480 train_time:134623ms step_avg:153.50ms step:888/1480 train_time:134787ms step_avg:153.52ms step:889/1480 train_time:134948ms step_avg:153.52ms step:890/1480 train_time:135106ms step_avg:153.53ms step:891/1480 train_time:135265ms step_avg:153.54ms step:892/1480 train_time:135425ms step_avg:153.54ms step:893/1480 train_time:135582ms step_avg:153.55ms step:894/1480 train_time:135743ms step_avg:153.56ms step:895/1480 train_time:135902ms step_avg:153.56ms step:896/1480 train_time:136060ms step_avg:153.57ms step:897/1480 train_time:136221ms step_avg:153.57ms step:898/1480 train_time:136382ms step_avg:153.58ms step:899/1480 train_time:136543ms step_avg:153.59ms step:900/1480 train_time:136702ms step_avg:153.60ms step:901/1480 train_time:136861ms step_avg:153.60ms step:902/1480 train_time:137019ms step_avg:153.61ms step:903/1480 train_time:137180ms step_avg:153.62ms step:904/1480 train_time:137340ms step_avg:153.62ms step:905/1480 train_time:137499ms step_avg:153.63ms step:906/1480 train_time:137659ms step_avg:153.64ms step:907/1480 train_time:137823ms step_avg:153.65ms step:908/1480 train_time:137981ms step_avg:153.65ms step:909/1480 train_time:138140ms step_avg:153.66ms step:910/1480 train_time:138304ms step_avg:153.67ms step:911/1480 train_time:138464ms step_avg:153.68ms step:912/1480 train_time:138624ms step_avg:153.69ms step:913/1480 train_time:138784ms step_avg:153.69ms step:914/1480 train_time:138944ms step_avg:153.70ms 
step:915/1480 train_time:139104ms step_avg:153.71ms step:916/1480 train_time:139265ms step_avg:153.71ms step:917/1480 train_time:139423ms step_avg:153.72ms step:918/1480 train_time:139584ms step_avg:153.73ms step:919/1480 train_time:139747ms step_avg:153.74ms step:920/1480 train_time:139906ms step_avg:153.74ms step:921/1480 train_time:140066ms step_avg:153.75ms step:922/1480 train_time:140228ms step_avg:153.76ms step:923/1480 train_time:140387ms step_avg:153.76ms step:924/1480 train_time:140547ms step_avg:153.77ms step:925/1480 train_time:140707ms step_avg:153.78ms step:926/1480 train_time:140865ms step_avg:153.78ms step:927/1480 train_time:141022ms step_avg:153.79ms step:928/1480 train_time:141181ms step_avg:153.79ms step:929/1480 train_time:141341ms step_avg:153.80ms step:930/1480 train_time:141501ms step_avg:153.81ms step:931/1480 train_time:141661ms step_avg:153.81ms step:932/1480 train_time:141821ms step_avg:153.82ms step:933/1480 train_time:141980ms step_avg:153.82ms step:934/1480 train_time:142138ms step_avg:153.83ms step:935/1480 train_time:142301ms step_avg:153.84ms step:936/1480 train_time:142460ms step_avg:153.84ms step:937/1480 train_time:142622ms step_avg:153.85ms step:938/1480 train_time:142781ms step_avg:153.86ms step:939/1480 train_time:142944ms step_avg:153.87ms step:940/1480 train_time:143107ms step_avg:153.88ms step:941/1480 train_time:143265ms step_avg:153.88ms step:942/1480 train_time:143424ms step_avg:153.89ms step:943/1480 train_time:143584ms step_avg:153.89ms step:944/1480 train_time:143747ms step_avg:153.90ms step:945/1480 train_time:143906ms step_avg:153.91ms step:946/1480 train_time:144068ms step_avg:153.92ms step:947/1480 train_time:144229ms step_avg:153.93ms step:948/1480 train_time:144386ms step_avg:153.93ms step:949/1480 train_time:144556ms step_avg:153.95ms step:950/1480 train_time:144705ms step_avg:153.94ms step:951/1480 train_time:144867ms step_avg:153.95ms step:952/1480 train_time:145026ms step_avg:153.96ms step:953/1480 
train_time:145185ms step_avg:153.96ms step:954/1480 train_time:145349ms step_avg:153.97ms step:955/1480 train_time:145508ms step_avg:153.98ms step:956/1480 train_time:145667ms step_avg:153.98ms step:957/1480 train_time:145829ms step_avg:153.99ms step:958/1480 train_time:145991ms step_avg:154.00ms step:959/1480 train_time:146149ms step_avg:154.00ms step:960/1480 train_time:146309ms step_avg:154.01ms step:961/1480 train_time:146468ms step_avg:154.01ms step:962/1480 train_time:146628ms step_avg:154.02ms step:963/1480 train_time:146787ms step_avg:154.03ms step:964/1480 train_time:146948ms step_avg:154.03ms step:965/1480 train_time:147107ms step_avg:154.04ms step:966/1480 train_time:147267ms step_avg:154.04ms step:967/1480 train_time:147425ms step_avg:154.05ms step:968/1480 train_time:147584ms step_avg:154.05ms step:969/1480 train_time:147744ms step_avg:154.06ms step:970/1480 train_time:147902ms step_avg:154.06ms step:971/1480 train_time:148061ms step_avg:154.07ms step:972/1480 train_time:148220ms step_avg:154.08ms step:973/1480 train_time:148379ms step_avg:154.08ms step:974/1480 train_time:148541ms step_avg:154.09ms step:975/1480 train_time:148703ms step_avg:154.10ms step:976/1480 train_time:148864ms step_avg:154.10ms step:977/1480 train_time:149022ms step_avg:154.11ms step:978/1480 train_time:149181ms step_avg:154.11ms step:979/1480 train_time:149343ms step_avg:154.12ms step:980/1480 train_time:149505ms step_avg:154.13ms step:981/1480 train_time:149666ms step_avg:154.14ms step:982/1480 train_time:149824ms step_avg:154.14ms step:983/1480 train_time:149983ms step_avg:154.14ms step:984/1480 train_time:150142ms step_avg:154.15ms step:985/1480 train_time:150303ms step_avg:154.16ms step:986/1480 train_time:150463ms step_avg:154.16ms step:987/1480 train_time:150623ms step_avg:154.17ms step:988/1480 train_time:150781ms step_avg:154.17ms step:989/1480 train_time:150941ms step_avg:154.18ms step:990/1480 train_time:151103ms step_avg:154.19ms step:991/1480 train_time:151265ms 
step_avg:154.19ms step:992/1480 train_time:151430ms step_avg:154.21ms step:993/1480 train_time:151598ms step_avg:154.22ms step:994/1480 train_time:151757ms step_avg:154.22ms step:995/1480 train_time:151914ms step_avg:154.23ms step:996/1480 train_time:152072ms step_avg:154.23ms step:997/1480 train_time:152230ms step_avg:154.23ms step:998/1480 train_time:152389ms step_avg:154.24ms step:999/1480 train_time:152549ms step_avg:154.25ms step:1000/1480 train_time:152709ms step_avg:154.25ms step:1000/1480 val_loss:3.4359 train_time:152782ms step_avg:154.33ms step:1001/1480 train_time:152879ms step_avg:154.27ms step:1002/1480 train_time:153033ms step_avg:154.27ms step:1003/1480 train_time:153198ms step_avg:154.28ms step:1004/1480 train_time:153360ms step_avg:154.29ms step:1005/1480 train_time:153520ms step_avg:154.29ms step:1006/1480 train_time:153681ms step_avg:154.30ms step:1007/1480 train_time:153841ms step_avg:154.30ms step:1008/1480 train_time:154002ms step_avg:154.31ms step:1009/1480 train_time:154167ms step_avg:154.32ms step:1010/1480 train_time:154327ms step_avg:154.33ms step:1011/1480 train_time:154487ms step_avg:154.33ms step:1012/1480 train_time:154645ms step_avg:154.34ms step:1013/1480 train_time:154807ms step_avg:154.34ms step:1014/1480 train_time:154968ms step_avg:154.35ms step:1015/1480 train_time:155130ms step_avg:154.36ms step:1016/1480 train_time:155290ms step_avg:154.36ms step:1017/1480 train_time:155451ms step_avg:154.37ms step:1018/1480 train_time:155612ms step_avg:154.38ms step:1019/1480 train_time:155773ms step_avg:154.38ms step:1020/1480 train_time:155932ms step_avg:154.39ms step:1021/1480 train_time:156092ms step_avg:154.39ms step:1022/1480 train_time:156252ms step_avg:154.40ms step:1023/1480 train_time:156413ms step_avg:154.41ms step:1024/1480 train_time:156572ms step_avg:154.41ms step:1025/1480 train_time:156733ms step_avg:154.42ms step:1026/1480 train_time:156892ms step_avg:154.42ms step:1027/1480 train_time:157050ms step_avg:154.42ms 
step:1028/1480 train_time:157212ms step_avg:154.43ms step:1029/1480 train_time:157377ms step_avg:154.44ms step:1030/1480 train_time:157537ms step_avg:154.45ms step:1031/1480 train_time:157695ms step_avg:154.45ms step:1032/1480 train_time:157860ms step_avg:154.46ms step:1033/1480 train_time:158019ms step_avg:154.47ms step:1034/1480 train_time:158182ms step_avg:154.47ms step:1035/1480 train_time:158341ms step_avg:154.48ms step:1036/1480 train_time:158501ms step_avg:154.48ms step:1037/1480 train_time:158662ms step_avg:154.49ms step:1038/1480 train_time:158823ms step_avg:154.50ms step:1039/1480 train_time:158986ms step_avg:154.51ms step:1040/1480 train_time:159147ms step_avg:154.51ms step:1041/1480 train_time:159309ms step_avg:154.52ms step:1042/1480 train_time:159468ms step_avg:154.52ms step:1043/1480 train_time:159628ms step_avg:154.53ms step:1044/1480 train_time:159788ms step_avg:154.53ms step:1045/1480 train_time:159948ms step_avg:154.54ms step:1046/1480 train_time:160109ms step_avg:154.55ms step:1047/1480 train_time:160270ms step_avg:154.55ms step:1048/1480 train_time:160430ms step_avg:154.56ms step:1049/1480 train_time:160590ms step_avg:154.56ms step:1050/1480 train_time:160751ms step_avg:154.57ms step:1051/1480 train_time:160911ms step_avg:154.57ms step:1052/1480 train_time:161072ms step_avg:154.58ms step:1053/1480 train_time:161232ms step_avg:154.58ms step:1054/1480 train_time:161394ms step_avg:154.59ms step:1055/1480 train_time:161554ms step_avg:154.60ms step:1056/1480 train_time:161715ms step_avg:154.60ms step:1057/1480 train_time:161875ms step_avg:154.61ms step:1058/1480 train_time:162040ms step_avg:154.62ms step:1059/1480 train_time:162204ms step_avg:154.63ms step:1060/1480 train_time:162366ms step_avg:154.63ms step:1061/1480 train_time:162524ms step_avg:154.64ms step:1062/1480 train_time:162683ms step_avg:154.64ms step:1063/1480 train_time:162842ms step_avg:154.65ms step:1064/1480 train_time:163002ms step_avg:154.65ms step:1065/1480 train_time:163164ms 
step_avg:154.66ms step:1066/1480 train_time:163325ms step_avg:154.66ms step:1067/1480 train_time:163486ms step_avg:154.67ms step:1068/1480 train_time:163646ms step_avg:154.68ms step:1069/1480 train_time:163809ms step_avg:154.68ms step:1070/1480 train_time:163969ms step_avg:154.69ms step:1071/1480 train_time:164132ms step_avg:154.70ms step:1072/1480 train_time:164291ms step_avg:154.70ms step:1073/1480 train_time:164449ms step_avg:154.70ms step:1074/1480 train_time:164609ms step_avg:154.71ms step:1075/1480 train_time:164770ms step_avg:154.71ms step:1076/1480 train_time:164929ms step_avg:154.72ms step:1077/1480 train_time:165090ms step_avg:154.72ms step:1078/1480 train_time:165254ms step_avg:154.73ms step:1079/1480 train_time:165417ms step_avg:154.74ms step:1080/1480 train_time:165578ms step_avg:154.75ms step:1081/1480 train_time:165741ms step_avg:154.75ms step:1082/1480 train_time:165901ms step_avg:154.76ms step:1083/1480 train_time:166062ms step_avg:154.76ms step:1084/1480 train_time:166223ms step_avg:154.77ms step:1085/1480 train_time:166383ms step_avg:154.78ms step:1086/1480 train_time:166544ms step_avg:154.78ms step:1087/1480 train_time:166705ms step_avg:154.79ms step:1088/1480 train_time:166865ms step_avg:154.79ms step:1089/1480 train_time:167030ms step_avg:154.80ms step:1090/1480 train_time:167194ms step_avg:154.81ms step:1091/1480 train_time:167355ms step_avg:154.81ms step:1092/1480 train_time:167515ms step_avg:154.82ms step:1093/1480 train_time:167676ms step_avg:154.83ms step:1094/1480 train_time:167836ms step_avg:154.83ms step:1095/1480 train_time:167997ms step_avg:154.84ms step:1096/1480 train_time:168161ms step_avg:154.84ms step:1097/1480 train_time:168323ms step_avg:154.85ms step:1098/1480 train_time:168486ms step_avg:154.86ms step:1099/1480 train_time:168646ms step_avg:154.86ms step:1100/1480 train_time:168811ms step_avg:154.87ms step:1101/1480 train_time:168974ms step_avg:154.88ms step:1102/1480 train_time:169137ms step_avg:154.89ms step:1103/1480 
train_time:169303ms step_avg:154.90ms step:1104/1480 train_time:169466ms step_avg:154.90ms step:1105/1480 train_time:169628ms step_avg:154.91ms step:1106/1480 train_time:169789ms step_avg:154.92ms step:1107/1480 train_time:169950ms step_avg:154.92ms step:1108/1480 train_time:170109ms step_avg:154.93ms step:1109/1480 train_time:170271ms step_avg:154.93ms step:1110/1480 train_time:170430ms step_avg:154.94ms step:1111/1480 train_time:170591ms step_avg:154.94ms step:1112/1480 train_time:170753ms step_avg:154.95ms step:1113/1480 train_time:170923ms step_avg:154.96ms step:1114/1480 train_time:171087ms step_avg:154.97ms step:1115/1480 train_time:171248ms step_avg:154.98ms step:1116/1480 train_time:171408ms step_avg:154.98ms step:1117/1480 train_time:171571ms step_avg:154.99ms step:1118/1480 train_time:171735ms step_avg:155.00ms step:1119/1480 train_time:171897ms step_avg:155.00ms step:1120/1480 train_time:172058ms step_avg:155.01ms step:1121/1480 train_time:172219ms step_avg:155.01ms step:1122/1480 train_time:172380ms step_avg:155.02ms step:1123/1480 train_time:172540ms step_avg:155.02ms step:1124/1480 train_time:172703ms step_avg:155.03ms step:1125/1480 train_time:172866ms step_avg:155.04ms step:1125/1480 val_loss:3.3815 train_time:172941ms step_avg:155.10ms step:1126/1480 train_time:173035ms step_avg:155.05ms step:1127/1480 train_time:173192ms step_avg:155.05ms step:1128/1480 train_time:173354ms step_avg:155.06ms step:1129/1480 train_time:173517ms step_avg:155.06ms step:1130/1480 train_time:173679ms step_avg:155.07ms step:1131/1480 train_time:173846ms step_avg:155.08ms step:1132/1480 train_time:174005ms step_avg:155.08ms step:1133/1480 train_time:174168ms step_avg:155.09ms step:1134/1480 train_time:174330ms step_avg:155.10ms step:1135/1480 train_time:174493ms step_avg:155.10ms step:1136/1480 train_time:174656ms step_avg:155.11ms step:1137/1480 train_time:174818ms step_avg:155.12ms step:1138/1480 train_time:174982ms step_avg:155.13ms step:1139/1480 train_time:175161ms 
step_avg:155.15ms step:1140/1480 train_time:175304ms step_avg:155.14ms step:1141/1480 train_time:175469ms step_avg:155.15ms step:1142/1480 train_time:175629ms step_avg:155.15ms step:1143/1480 train_time:175793ms step_avg:155.16ms step:1144/1480 train_time:175955ms step_avg:155.16ms step:1145/1480 train_time:176115ms step_avg:155.17ms step:1146/1480 train_time:176278ms step_avg:155.17ms step:1147/1480 train_time:176439ms step_avg:155.18ms step:1148/1480 train_time:176599ms step_avg:155.18ms step:1149/1480 train_time:176762ms step_avg:155.19ms step:1150/1480 train_time:176922ms step_avg:155.19ms step:1151/1480 train_time:177086ms step_avg:155.20ms step:1152/1480 train_time:177248ms step_avg:155.21ms step:1153/1480 train_time:177412ms step_avg:155.22ms step:1154/1480 train_time:177573ms step_avg:155.22ms step:1155/1480 train_time:177736ms step_avg:155.23ms step:1156/1480 train_time:177901ms step_avg:155.24ms step:1157/1480 train_time:178064ms step_avg:155.24ms step:1158/1480 train_time:178224ms step_avg:155.25ms step:1159/1480 train_time:178385ms step_avg:155.25ms step:1160/1480 train_time:178545ms step_avg:155.26ms step:1161/1480 train_time:178706ms step_avg:155.26ms step:1162/1480 train_time:178868ms step_avg:155.27ms step:1163/1480 train_time:179032ms step_avg:155.27ms step:1164/1480 train_time:179195ms step_avg:155.28ms step:1165/1480 train_time:179356ms step_avg:155.29ms step:1166/1480 train_time:179518ms step_avg:155.29ms step:1167/1480 train_time:179679ms step_avg:155.30ms step:1168/1480 train_time:179840ms step_avg:155.30ms step:1169/1480 train_time:180002ms step_avg:155.31ms step:1170/1480 train_time:180163ms step_avg:155.31ms step:1171/1480 train_time:180324ms step_avg:155.32ms step:1172/1480 train_time:180483ms step_avg:155.32ms step:1173/1480 train_time:180645ms step_avg:155.33ms step:1174/1480 train_time:180816ms step_avg:155.34ms step:1175/1480 train_time:180978ms step_avg:155.35ms step:1176/1480 train_time:181141ms step_avg:155.35ms step:1177/1480 
train_time:181309ms step_avg:155.36ms step:1178/1480 train_time:181470ms step_avg:155.37ms step:1179/1480 train_time:181631ms step_avg:155.37ms step:1180/1480 train_time:181799ms step_avg:155.38ms step:1181/1480 train_time:181962ms step_avg:155.39ms step:1182/1480 train_time:182122ms step_avg:155.39ms step:1183/1480 train_time:182283ms step_avg:155.40ms step:1184/1480 train_time:182445ms step_avg:155.40ms step:1185/1480 train_time:182609ms step_avg:155.41ms step:1186/1480 train_time:182772ms step_avg:155.42ms step:1187/1480 train_time:182944ms step_avg:155.43ms step:1188/1480 train_time:183103ms step_avg:155.44ms step:1189/1480 train_time:183265ms step_avg:155.44ms step:1190/1480 train_time:183426ms step_avg:155.45ms step:1191/1480 train_time:183589ms step_avg:155.45ms step:1192/1480 train_time:183750ms step_avg:155.46ms step:1193/1480 train_time:183910ms step_avg:155.46ms step:1194/1480 train_time:184072ms step_avg:155.47ms step:1195/1480 train_time:184237ms step_avg:155.47ms step:1196/1480 train_time:184407ms step_avg:155.49ms step:1197/1480 train_time:184568ms step_avg:155.49ms step:1198/1480 train_time:184739ms step_avg:155.50ms step:1199/1480 train_time:184901ms step_avg:155.51ms step:1200/1480 train_time:185064ms step_avg:155.52ms step:1201/1480 train_time:185225ms step_avg:155.52ms step:1202/1480 train_time:185394ms step_avg:155.53ms step:1203/1480 train_time:185561ms step_avg:155.54ms step:1204/1480 train_time:185723ms step_avg:155.55ms step:1205/1480 train_time:185884ms step_avg:155.55ms step:1206/1480 train_time:186046ms step_avg:155.56ms step:1207/1480 train_time:186205ms step_avg:155.56ms step:1208/1480 train_time:186365ms step_avg:155.56ms step:1209/1480 train_time:186527ms step_avg:155.57ms step:1210/1480 train_time:186694ms step_avg:155.58ms step:1211/1480 train_time:186859ms step_avg:155.59ms step:1212/1480 train_time:187021ms step_avg:155.59ms step:1213/1480 train_time:187185ms step_avg:155.60ms step:1214/1480 train_time:187350ms step_avg:155.61ms 
step:1215/1480 train_time:187514ms step_avg:155.61ms step:1216/1480 train_time:187675ms step_avg:155.62ms step:1217/1480 train_time:187838ms step_avg:155.62ms step:1218/1480 train_time:188001ms step_avg:155.63ms step:1219/1480 train_time:188169ms step_avg:155.64ms step:1220/1480 train_time:188333ms step_avg:155.65ms step:1221/1480 train_time:188494ms step_avg:155.65ms step:1222/1480 train_time:188655ms step_avg:155.66ms step:1223/1480 train_time:188819ms step_avg:155.66ms step:1224/1480 train_time:188984ms step_avg:155.67ms step:1225/1480 train_time:189146ms step_avg:155.68ms step:1226/1480 train_time:189311ms step_avg:155.68ms step:1227/1480 train_time:189477ms step_avg:155.69ms step:1228/1480 train_time:189641ms step_avg:155.70ms step:1229/1480 train_time:189803ms step_avg:155.70ms step:1230/1480 train_time:189970ms step_avg:155.71ms step:1231/1480 train_time:190138ms step_avg:155.72ms step:1232/1480 train_time:190302ms step_avg:155.73ms step:1233/1480 train_time:190463ms step_avg:155.73ms step:1234/1480 train_time:190624ms step_avg:155.74ms step:1235/1480 train_time:190791ms step_avg:155.75ms step:1236/1480 train_time:190952ms step_avg:155.75ms step:1237/1480 train_time:191114ms step_avg:155.76ms step:1238/1480 train_time:191286ms step_avg:155.77ms step:1239/1480 train_time:191450ms step_avg:155.78ms step:1240/1480 train_time:191615ms step_avg:155.78ms step:1241/1480 train_time:191781ms step_avg:155.79ms step:1242/1480 train_time:191942ms step_avg:155.80ms step:1243/1480 train_time:192105ms step_avg:155.80ms step:1244/1480 train_time:192266ms step_avg:155.81ms step:1245/1480 train_time:192428ms step_avg:155.81ms step:1246/1480 train_time:192591ms step_avg:155.82ms step:1247/1480 train_time:192754ms step_avg:155.82ms step:1248/1480 train_time:192916ms step_avg:155.83ms step:1249/1480 train_time:193079ms step_avg:155.83ms step:1250/1480 train_time:193240ms step_avg:155.84ms step:1250/1480 val_loss:3.3312 train_time:193315ms step_avg:155.90ms step:1251/1480 
train_time:193408ms step_avg:155.85ms step:1252/1480 train_time:193571ms step_avg:155.85ms step:1253/1480 train_time:193732ms step_avg:155.86ms step:1254/1480 train_time:193893ms step_avg:155.86ms step:1255/1480 train_time:194064ms step_avg:155.87ms step:1256/1480 train_time:194228ms step_avg:155.88ms step:1257/1480 train_time:194389ms step_avg:155.89ms step:1258/1480 train_time:194553ms step_avg:155.89ms step:1259/1480 train_time:194716ms step_avg:155.90ms step:1260/1480 train_time:194877ms step_avg:155.90ms step:1261/1480 train_time:195041ms step_avg:155.91ms step:1262/1480 train_time:195206ms step_avg:155.92ms step:1263/1480 train_time:195371ms step_avg:155.92ms step:1264/1480 train_time:195530ms step_avg:155.92ms step:1265/1480 train_time:195690ms step_avg:155.93ms step:1266/1480 train_time:195852ms step_avg:155.93ms step:1267/1480 train_time:196013ms step_avg:155.94ms step:1268/1480 train_time:196177ms step_avg:155.94ms step:1269/1480 train_time:196343ms step_avg:155.95ms step:1270/1480 train_time:196507ms step_avg:155.96ms step:1271/1480 train_time:196670ms step_avg:155.96ms step:1272/1480 train_time:196830ms step_avg:155.97ms step:1273/1480 train_time:196994ms step_avg:155.97ms step:1274/1480 train_time:197158ms step_avg:155.98ms step:1275/1480 train_time:197319ms step_avg:155.98ms step:1276/1480 train_time:197480ms step_avg:155.99ms step:1277/1480 train_time:197643ms step_avg:155.99ms step:1278/1480 train_time:197804ms step_avg:156.00ms step:1279/1480 train_time:197966ms step_avg:156.00ms step:1280/1480 train_time:198132ms step_avg:156.01ms step:1281/1480 train_time:198294ms step_avg:156.01ms step:1282/1480 train_time:198453ms step_avg:156.02ms step:1283/1480 train_time:198615ms step_avg:156.02ms step:1284/1480 train_time:198781ms step_avg:156.03ms step:1285/1480 train_time:198944ms step_avg:156.03ms step:1286/1480 train_time:199106ms step_avg:156.04ms step:1287/1480 train_time:199268ms step_avg:156.04ms step:1288/1480 train_time:199431ms step_avg:156.05ms 
step:1289/1480 train_time:199603ms step_avg:156.06ms step:1290/1480 train_time:199771ms step_avg:156.07ms step:1291/1480 train_time:199935ms step_avg:156.08ms step:1292/1480 train_time:200100ms step_avg:156.08ms step:1293/1480 train_time:200268ms step_avg:156.09ms step:1294/1480 train_time:200431ms step_avg:156.10ms step:1295/1480 train_time:200592ms step_avg:156.10ms step:1296/1480 train_time:200755ms step_avg:156.11ms step:1297/1480 train_time:200919ms step_avg:156.11ms step:1298/1480 train_time:201082ms step_avg:156.12ms step:1299/1480 train_time:201246ms step_avg:156.13ms step:1300/1480 train_time:201407ms step_avg:156.13ms step:1301/1480 train_time:201568ms step_avg:156.13ms step:1302/1480 train_time:201734ms step_avg:156.14ms step:1303/1480 train_time:201903ms step_avg:156.15ms step:1304/1480 train_time:202069ms step_avg:156.16ms step:1305/1480 train_time:202230ms step_avg:156.16ms step:1306/1480 train_time:202396ms step_avg:156.17ms step:1307/1480 train_time:202558ms step_avg:156.17ms step:1308/1480 train_time:202720ms step_avg:156.18ms step:1309/1480 train_time:202886ms step_avg:156.19ms step:1310/1480 train_time:203048ms step_avg:156.19ms step:1311/1480 train_time:203209ms step_avg:156.19ms step:1312/1480 train_time:203376ms step_avg:156.20ms step:1313/1480 train_time:203539ms step_avg:156.21ms step:1314/1480 train_time:203704ms step_avg:156.21ms step:1315/1480 train_time:203869ms step_avg:156.22ms step:1316/1480 train_time:204028ms step_avg:156.22ms step:1317/1480 train_time:204190ms step_avg:156.23ms step:1318/1480 train_time:204357ms step_avg:156.24ms step:1319/1480 train_time:204524ms step_avg:156.24ms step:1320/1480 train_time:204692ms step_avg:156.25ms step:1321/1480 train_time:204856ms step_avg:156.26ms step:1322/1480 train_time:205029ms step_avg:156.27ms step:1323/1480 train_time:205192ms step_avg:156.28ms step:1324/1480 train_time:205354ms step_avg:156.28ms step:1325/1480 train_time:205524ms step_avg:156.29ms step:1326/1480 train_time:205690ms 
step_avg:156.30ms step:1327/1480 train_time:205852ms step_avg:156.30ms step:1328/1480 train_time:206013ms step_avg:156.31ms step:1329/1480 train_time:206207ms step_avg:156.34ms step:1330/1480 train_time:206364ms step_avg:156.34ms step:1331/1480 train_time:206527ms step_avg:156.34ms step:1332/1480 train_time:206691ms step_avg:156.35ms step:1333/1480 train_time:206857ms step_avg:156.35ms step:1334/1480 train_time:207019ms step_avg:156.36ms step:1335/1480 train_time:207180ms step_avg:156.36ms step:1336/1480 train_time:207349ms step_avg:156.37ms step:1337/1480 train_time:207516ms step_avg:156.38ms step:1338/1480 train_time:207679ms step_avg:156.39ms step:1339/1480 train_time:207844ms step_avg:156.39ms step:1340/1480 train_time:208009ms step_avg:156.40ms step:1341/1480 train_time:208170ms step_avg:156.40ms step:1342/1480 train_time:208336ms step_avg:156.41ms step:1343/1480 train_time:208498ms step_avg:156.41ms step:1344/1480 train_time:208659ms step_avg:156.42ms step:1345/1480 train_time:208829ms step_avg:156.43ms step:1346/1480 train_time:208991ms step_avg:156.43ms step:1347/1480 train_time:209152ms step_avg:156.43ms step:1348/1480 train_time:209314ms step_avg:156.44ms step:1349/1480 train_time:209478ms step_avg:156.44ms step:1350/1480 train_time:209643ms step_avg:156.45ms step:1351/1480 train_time:209808ms step_avg:156.46ms step:1352/1480 train_time:209970ms step_avg:156.46ms step:1353/1480 train_time:210137ms step_avg:156.47ms step:1354/1480 train_time:210301ms step_avg:156.47ms step:1355/1480 train_time:210464ms step_avg:156.48ms step:1356/1480 train_time:210627ms step_avg:156.48ms step:1357/1480 train_time:210792ms step_avg:156.49ms step:1358/1480 train_time:210956ms step_avg:156.50ms step:1359/1480 train_time:211120ms step_avg:156.50ms step:1360/1480 train_time:211286ms step_avg:156.51ms step:1361/1480 train_time:211451ms step_avg:156.51ms step:1362/1480 train_time:211616ms step_avg:156.52ms step:1363/1480 train_time:211784ms step_avg:156.53ms step:1364/1480 
train_time:211945ms step_avg:156.53ms step:1365/1480 train_time:212108ms step_avg:156.54ms step:1366/1480 train_time:212272ms step_avg:156.54ms step:1367/1480 train_time:212435ms step_avg:156.55ms step:1368/1480 train_time:212602ms step_avg:156.56ms step:1369/1480 train_time:212771ms step_avg:156.56ms step:1370/1480 train_time:212936ms step_avg:156.57ms step:1371/1480 train_time:213100ms step_avg:156.58ms step:1372/1480 train_time:213267ms step_avg:156.58ms step:1373/1480 train_time:213428ms step_avg:156.59ms step:1374/1480 train_time:213594ms step_avg:156.59ms step:1375/1480 train_time:213757ms step_avg:156.60ms step:1375/1480 val_loss:3.2928 train_time:213832ms step_avg:156.65ms step:1376/1480 train_time:213924ms step_avg:156.61ms step:1377/1480 train_time:214088ms step_avg:156.61ms step:1378/1480 train_time:214250ms step_avg:156.62ms step:1379/1480 train_time:214414ms step_avg:156.62ms step:1380/1480 train_time:214578ms step_avg:156.63ms step:1381/1480 train_time:214748ms step_avg:156.64ms step:1382/1480 train_time:214911ms step_avg:156.64ms step:1383/1480 train_time:215075ms step_avg:156.65ms step:1384/1480 train_time:215242ms step_avg:156.65ms step:1385/1480 train_time:215404ms step_avg:156.66ms step:1386/1480 train_time:215565ms step_avg:156.66ms step:1387/1480 train_time:215729ms step_avg:156.67ms step:1388/1480 train_time:215889ms step_avg:156.67ms step:1389/1480 train_time:216054ms step_avg:156.67ms step:1390/1480 train_time:216216ms step_avg:156.68ms step:1391/1480 train_time:216380ms step_avg:156.68ms step:1392/1480 train_time:216544ms step_avg:156.69ms step:1393/1480 train_time:216707ms step_avg:156.69ms step:1394/1480 train_time:216869ms step_avg:156.70ms step:1395/1480 train_time:217032ms step_avg:156.70ms step:1396/1480 train_time:217194ms step_avg:156.71ms step:1397/1480 train_time:217354ms step_avg:156.71ms step:1398/1480 train_time:217516ms step_avg:156.71ms step:1399/1480 train_time:217678ms step_avg:156.72ms step:1400/1480 train_time:217846ms 
step_avg:156.72ms step:1401/1480 train_time:218007ms step_avg:156.73ms step:1402/1480 train_time:218169ms step_avg:156.73ms step:1403/1480 train_time:218335ms step_avg:156.74ms step:1404/1480 train_time:218500ms step_avg:156.74ms step:1405/1480 train_time:218665ms step_avg:156.75ms step:1406/1480 train_time:218829ms step_avg:156.75ms step:1407/1480 train_time:218990ms step_avg:156.76ms step:1408/1480 train_time:219152ms step_avg:156.76ms step:1409/1480 train_time:219325ms step_avg:156.77ms step:1410/1480 train_time:219486ms step_avg:156.78ms step:1411/1480 train_time:219647ms step_avg:156.78ms step:1412/1480 train_time:219808ms step_avg:156.78ms step:1413/1480 train_time:219972ms step_avg:156.79ms step:1414/1480 train_time:220135ms step_avg:156.79ms step:1415/1480 train_time:220301ms step_avg:156.80ms step:1416/1480 train_time:220474ms step_avg:156.81ms step:1417/1480 train_time:220640ms step_avg:156.82ms step:1418/1480 train_time:220803ms step_avg:156.82ms step:1419/1480 train_time:220968ms step_avg:156.83ms step:1420/1480 train_time:221133ms step_avg:156.83ms step:1421/1480 train_time:221297ms step_avg:156.84ms step:1422/1480 train_time:221464ms step_avg:156.84ms step:1423/1480 train_time:221626ms step_avg:156.85ms step:1424/1480 train_time:221791ms step_avg:156.85ms step:1425/1480 train_time:221963ms step_avg:156.86ms step:1426/1480 train_time:222127ms step_avg:156.87ms step:1427/1480 train_time:222292ms step_avg:156.87ms step:1428/1480 train_time:222453ms step_avg:156.88ms step:1429/1480 train_time:222614ms step_avg:156.88ms step:1430/1480 train_time:222778ms step_avg:156.89ms step:1431/1480 train_time:222945ms step_avg:156.89ms step:1432/1480 train_time:223112ms step_avg:156.90ms step:1433/1480 train_time:223282ms step_avg:156.91ms step:1434/1480 train_time:223450ms step_avg:156.92ms step:1435/1480 train_time:223616ms step_avg:156.92ms step:1436/1480 train_time:223781ms step_avg:156.93ms step:1437/1480 train_time:223944ms step_avg:156.93ms step:1438/1480 
train_time:224107ms step_avg:156.94ms step:1439/1480 train_time:224271ms step_avg:156.94ms step:1440/1480 train_time:224433ms step_avg:156.95ms step:1441/1480 train_time:224600ms step_avg:156.95ms step:1442/1480 train_time:224766ms step_avg:156.96ms step:1443/1480 train_time:224939ms step_avg:156.97ms step:1444/1480 train_time:225104ms step_avg:156.98ms step:1445/1480 train_time:225267ms step_avg:156.98ms step:1446/1480 train_time:225432ms step_avg:156.99ms step:1447/1480 train_time:225602ms step_avg:156.99ms step:1448/1480 train_time:225765ms step_avg:157.00ms step:1449/1480 train_time:225928ms step_avg:157.00ms step:1450/1480 train_time:226092ms step_avg:157.01ms step:1451/1480 train_time:226256ms step_avg:157.01ms step:1452/1480 train_time:226422ms step_avg:157.02ms step:1453/1480 train_time:226585ms step_avg:157.02ms step:1454/1480 train_time:226747ms step_avg:157.03ms step:1455/1480 train_time:226914ms step_avg:157.03ms step:1456/1480 train_time:227078ms step_avg:157.04ms step:1457/1480 train_time:227242ms step_avg:157.04ms step:1458/1480 train_time:227405ms step_avg:157.05ms step:1459/1480 train_time:227571ms step_avg:157.05ms step:1460/1480 train_time:227734ms step_avg:157.06ms step:1461/1480 train_time:227898ms step_avg:157.06ms step:1462/1480 train_time:228064ms step_avg:157.07ms step:1463/1480 train_time:228230ms step_avg:157.07ms step:1464/1480 train_time:228396ms step_avg:157.08ms step:1465/1480 train_time:228559ms step_avg:157.09ms step:1466/1480 train_time:228723ms step_avg:157.09ms step:1467/1480 train_time:228886ms step_avg:157.09ms step:1468/1480 train_time:229051ms step_avg:157.10ms step:1469/1480 train_time:229215ms step_avg:157.10ms step:1470/1480 train_time:229383ms step_avg:157.11ms step:1471/1480 train_time:229554ms step_avg:157.12ms step:1472/1480 train_time:229726ms step_avg:157.13ms step:1473/1480 train_time:229890ms step_avg:157.14ms step:1474/1480 train_time:230056ms step_avg:157.14ms step:1475/1480 train_time:230226ms step_avg:157.15ms 
step:1476/1480 train_time:230389ms step_avg:157.15ms step:1477/1480 train_time:230557ms step_avg:157.16ms step:1478/1480 train_time:230728ms step_avg:157.17ms step:1479/1480 train_time:230892ms step_avg:157.18ms step:1480/1480 train_time:231054ms step_avg:157.18ms step:1480/1480 val_loss:3.2740 train_time:231131ms step_avg:157.23ms peak memory consumption: 34238 MiB