import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time import contextlib from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB # ----------------------------------------------------------------------------- # Muon optimizer @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. """ assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) ns_steps: The number of Newton-Schulz iteration steps to use. """ def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5): self.world_size = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ['RANK']) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps) params = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { 'params': [p for p in params if p.numel() == size], 'update_buffer': [ torch.empty(size, device='cuda', dtype=torch.bfloat16) for _ in range(self.world_size) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr = group['lr'] momentum = group['momentum'] nesterov = group['nesterov'] ns_steps = group['ns_steps'] update_buffers = group['update_buffer'] # generate weight updates in distributed fashion params = group['params'] assert len(params) % self.world_size == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.world_size]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if 'momentum_buffer' not in state: state['momentum_buffer'] = torch.zeros_like(g) buf = state['momentum_buffer'] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.world_size] update_prev() # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, num_heads): super().__init__() assert dim % num_heads == 0 self.num_heads = num_heads self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x, vi, block_mask): B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q = self.c_q(x).view(B, T, self.num_heads, -1) k = self.c_k(x).view(B, T, self.num_heads, -1) v = self.c_v(x).view(B, T, self.num_heads, -1) v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977 q, k = norm(q), norm(k) # QK norm @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x): x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.model_dim, config.num_heads) self.mlp = MLP(config.model_dim) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x, vi, x0, block_mask): x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x class ValueEmbedding(nn.Module): def __init__(self, config: "GPTConfig"): super().__init__() self.__setattr__ self.embed = nn.ModuleList([ nn.Embedding(config.vocab_size, config.model_dim) for _ in range(6) ]) def forward(self, inputs) -> "list[torch.Tensor]": ve = [emb(inputs) for emb in self.embed] ve += reversed(ve) return ve # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 num_layers : int = 12 num_heads : int = 6 # head dim 128 suggested by @Grad62304977 model_dim : int = 768 class GPT(nn.Module): def __init__(self, config: GPTConfig): super().__init__() self.num_layers = config.num_layers # U-net design by @brendanh0gan self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection weights for decoder layers self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers)) self.embed = nn.Embedding(config.vocab_size, config.model_dim) self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)]) # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning # U-net structure on token value embeddings by @leloykun self.value_embeds = ValueEmbedding(config) self.lm_head = CastedLinear(config.model_dim, config.vocab_size) self.lm_head.weight.data.zero_() # @Grad62304977 def forward( self, inputs: torch.Tensor, targets: torch.Tensor, sliding_window_num_blocks: torch.Tensor, ): BLOCK_SIZE = 128 assert inputs.ndim == 1 docs = (inputs == 50256).cumsum(0) docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous() docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous() def document_causal(b, h, q_idx, kv_idx): causal_mask = q_idx >= kv_idx document_mask = docs[q_idx] == docs[kv_idx] return causal_mask & document_mask def dense_to_ordered(dense_mask: torch.Tensor): num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32) indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32) return num_blocks[None, None].contiguous(), indices[None, None].contiguous() def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor): kv_idx = block_idx = torch.arange(512, dtype=torch.int32, device="cuda") q_idx = block_idx[:, None] causal_bm = q_idx >= kv_idx causal_full_bm = q_idx > kv_idx window_bm = q_idx - kv_idx < sliding_window_num_blocks window_full_bm = window_bm # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx]) document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None]) document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None]) nonzero_bm = causal_bm & window_bm & document_bm full_bm = causal_full_bm & window_full_bm & document_full_bm kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm) full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm) return BlockMask.from_kv_blocks( kv_num_blocks, kv_indices, full_kv_num_blocks, full_kv_indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_causal, ) block_mask = create_doc_swc_block_mask(sliding_window_num_blocks) # forward the GPT model itself x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim) x = norm(x) # @Grad62304977 x0 = x ve = self.value_embeds(inputs) ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:] # Store outputs for U-Net skip connections skip_connections = [] # Encoder pass - process only the first half of the blocks for i in range(self.num_encoder_layers): x = self.blocks[i](x, ve_enc[i], x0, block_mask) skip_connections.append(x) # Decoder pass - process the remaining blocks with weighted skip connections for i in range(self.num_decoder_layers): x = x + self.skip_weights[i] * skip_connections.pop() # U-net structure on token value embeddings by @leloykun x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask) x = norm(x) logits = self.lm_head(x) logits = 30 * torch.tanh(logits / 30) # @Grad62304977 logits = logits.float() loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) return loss # ----------------------------------------------------------------------------- # Our own simple Distributed Data Loader def _peek_data_shard(file: Path): # only reads the header, returns header data # header is 256 int32 header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) assert header[0] == 20240520, "magic number mismatch in the data .bin file" assert header[1] == 1, "unsupported version" return int(header[2]) # number of tokens (claimed) def _load_data_shard(path: Path, num_tokens): with path.open("rb", buffering=0) as f: tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) f.seek(256 * 4) nbytes = f.readinto(tokens.numpy()) assert nbytes == 2 * num_tokens, "number of tokens read does not match header?" return tokens class DistributedDataLoader: def __init__(self, filename_pattern, seq_len, process_rank, num_processes): self.process_rank = process_rank self.num_processes = num_processes self.seq_len = seq_len # glob files that match the pattern self.files = sorted(Path.cwd().glob(filename_pattern)) assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" # load and validate all data shards, count number of tokens in total self.files_num_tokens = [_peek_data_shard(file) for file in self.files] assert min(self.files_num_tokens) >= num_processes * seq_len + 1 self.total_num_tokens = sum(self.files_num_tokens) self.reset() def reset(self): self.current_shard = -1 self.advance() def advance(self): # advance to next data shard self.current_shard = (self.current_shard + 1) % len(self.files) self.current_position = self.process_rank * self.seq_len self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard]) def next_batch(self): batch_size = self.seq_len * self.num_processes buf = self.tokens[self.current_position:self.current_position+self.seq_len+1] # host side async is sufficient; # no performance improvement was observed when introducing a separate stream. inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets # advance current position and load next shard if necessary self.current_position += batch_size if self.current_position + batch_size + 1 >= len(self.tokens): self.advance() return inputs, targets # ----------------------------------------------------------------------------- # int main @dataclass class Hyperparameters: # data hyperparams input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on # optimization hyperparams batch_size : int = 8 # batch size, in sequences, across all devices sequence_length : int = 64*1024 # sequence length, in tokens num_iterations : int = 1480 # number of iterations to run warmup_iters : int = 0 cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule weight_decay : float = 0 # evaluation and logging hyperparams val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end args = Hyperparameters() # set up DDP (distributed data parallel). torchrun sets this env variable ddp_rank = int(os.environ['RANK']) ddp_local_rank = int(os.environ['LOCAL_RANK']) ddp_world_size = int(os.environ['WORLD_SIZE']) assert torch.cuda.is_available() device = torch.device(f"cuda:{ddp_local_rank}") torch.cuda.set_device(device) print(f"using device: {device}") dist.init_process_group(backend='nccl', device_id=device) dist.barrier() master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc. # begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 07:20:41 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 28C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:28893ms step_avg:nanms step:2/1480 train_time:28997ms step_avg:nanms step:3/1480 train_time:29121ms step_avg:nanms step:4/1480 train_time:29262ms step_avg:nanms step:5/1480 train_time:29402ms step_avg:nanms step:6/1480 train_time:29550ms step_avg:nanms step:7/1480 train_time:29690ms step_avg:nanms step:8/1480 train_time:29830ms step_avg:nanms step:9/1480 train_time:29976ms step_avg:nanms step:10/1480 train_time:30121ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:286ms step_avg:nanms step:13/1480 train_time:427ms step_avg:142.41ms step:14/1480 train_time:570ms step_avg:142.62ms step:15/1480 train_time:711ms step_avg:142.24ms step:16/1480 train_time:855ms step_avg:142.46ms step:17/1480 train_time:998ms step_avg:142.53ms step:18/1480 train_time:1142ms step_avg:142.70ms step:19/1480 train_time:1286ms step_avg:142.90ms step:20/1480 train_time:1428ms step_avg:142.82ms step:21/1480 train_time:1570ms step_avg:142.74ms step:22/1480 train_time:1713ms step_avg:142.74ms step:23/1480 train_time:1856ms step_avg:142.75ms step:24/1480 train_time:2000ms step_avg:142.85ms step:25/1480 train_time:2143ms step_avg:142.83ms step:26/1480 train_time:2286ms step_avg:142.86ms step:27/1480 train_time:2428ms step_avg:142.81ms step:28/1480 train_time:2571ms step_avg:142.84ms step:29/1480 train_time:2713ms step_avg:142.77ms step:30/1480 train_time:2856ms step_avg:142.81ms step:31/1480 train_time:2998ms step_avg:142.77ms step:32/1480 train_time:3143ms step_avg:142.86ms step:33/1480 train_time:3286ms step_avg:142.87ms step:34/1480 train_time:3427ms step_avg:142.80ms step:35/1480 train_time:3570ms step_avg:142.80ms step:36/1480 train_time:3713ms step_avg:142.79ms step:37/1480 train_time:3855ms step_avg:142.77ms step:38/1480 train_time:4366ms step_avg:155.93ms step:39/1480 train_time:4874ms step_avg:168.08ms step:40/1480 train_time:4978ms step_avg:165.94ms step:41/1480 train_time:5121ms step_avg:165.19ms step:42/1480 train_time:5263ms step_avg:164.47ms step:43/1480 train_time:5406ms step_avg:163.80ms step:44/1480 train_time:5547ms step_avg:163.15ms step:45/1480 train_time:5689ms step_avg:162.54ms step:46/1480 train_time:5831ms step_avg:161.96ms step:47/1480 train_time:5974ms step_avg:161.47ms step:48/1480 train_time:6118ms step_avg:161.00ms step:49/1480 train_time:6262ms step_avg:160.56ms step:50/1480 train_time:6407ms step_avg:160.17ms step:51/1480 train_time:6550ms step_avg:159.76ms step:52/1480 train_time:6692ms step_avg:159.34ms step:53/1480 train_time:6833ms step_avg:158.91ms step:54/1480 train_time:6976ms step_avg:158.54ms step:55/1480 train_time:7118ms step_avg:158.18ms step:56/1480 train_time:7261ms step_avg:157.86ms step:57/1480 train_time:7405ms step_avg:157.55ms step:58/1480 train_time:7547ms step_avg:157.24ms step:59/1480 train_time:7690ms step_avg:156.94ms step:60/1480 train_time:7832ms step_avg:156.64ms step:61/1480 train_time:7976ms step_avg:156.39ms step:62/1480 train_time:8119ms step_avg:156.14ms step:63/1480 train_time:8264ms step_avg:155.92ms step:64/1480 train_time:8408ms step_avg:155.70ms step:65/1480 train_time:8550ms step_avg:155.45ms step:66/1480 train_time:8691ms step_avg:155.20ms step:67/1480 train_time:8834ms step_avg:154.98ms step:68/1480 train_time:8976ms step_avg:154.75ms step:69/1480 train_time:9119ms step_avg:154.55ms step:70/1480 train_time:9262ms step_avg:154.37ms step:71/1480 train_time:9406ms step_avg:154.19ms step:72/1480 train_time:9549ms step_avg:154.01ms step:73/1480 train_time:9690ms step_avg:153.82ms step:74/1480 train_time:9833ms step_avg:153.64ms step:75/1480 train_time:9975ms step_avg:153.46ms step:76/1480 train_time:10117ms step_avg:153.29ms step:77/1480 train_time:10261ms step_avg:153.14ms step:78/1480 train_time:10404ms step_avg:152.99ms step:79/1480 train_time:10546ms step_avg:152.84ms step:80/1480 train_time:10689ms step_avg:152.70ms step:81/1480 train_time:10831ms step_avg:152.54ms step:82/1480 train_time:10973ms step_avg:152.41ms step:83/1480 train_time:11115ms step_avg:152.27ms step:84/1480 train_time:11258ms step_avg:152.13ms step:85/1480 train_time:11401ms step_avg:152.02ms step:86/1480 train_time:11544ms step_avg:151.90ms step:87/1480 train_time:11687ms step_avg:151.78ms step:88/1480 train_time:11830ms step_avg:151.66ms step:89/1480 train_time:11972ms step_avg:151.54ms step:90/1480 train_time:12115ms step_avg:151.44ms step:91/1480 train_time:12258ms step_avg:151.33ms step:92/1480 train_time:12400ms step_avg:151.22ms step:93/1480 train_time:12546ms step_avg:151.15ms step:94/1480 train_time:12688ms step_avg:151.05ms step:95/1480 train_time:12829ms step_avg:150.93ms step:96/1480 train_time:12973ms step_avg:150.84ms step:97/1480 train_time:13115ms step_avg:150.75ms step:98/1480 train_time:13258ms step_avg:150.66ms step:99/1480 train_time:13401ms step_avg:150.57ms step:100/1480 train_time:13544ms step_avg:150.49ms step:101/1480 train_time:13689ms step_avg:150.43ms step:102/1480 train_time:13828ms step_avg:150.30ms step:103/1480 train_time:13971ms step_avg:150.23ms step:104/1480 train_time:14114ms step_avg:150.15ms step:105/1480 train_time:14256ms step_avg:150.06ms step:106/1480 train_time:14400ms step_avg:150.00ms step:107/1480 train_time:14544ms step_avg:149.94ms step:108/1480 train_time:14689ms step_avg:149.88ms step:109/1480 train_time:14830ms step_avg:149.80ms step:110/1480 train_time:14973ms step_avg:149.73ms step:111/1480 train_time:15117ms step_avg:149.67ms step:112/1480 train_time:15263ms step_avg:149.64ms step:113/1480 train_time:15409ms step_avg:149.60ms step:114/1480 train_time:15554ms step_avg:149.55ms step:115/1480 train_time:15700ms step_avg:149.52ms step:116/1480 train_time:15846ms step_avg:149.49ms step:117/1480 train_time:15990ms step_avg:149.44ms step:118/1480 train_time:16137ms step_avg:149.42ms step:119/1480 train_time:16284ms step_avg:149.39ms step:120/1480 train_time:16429ms step_avg:149.35ms step:121/1480 train_time:16575ms step_avg:149.33ms step:122/1480 train_time:16722ms step_avg:149.30ms step:123/1480 train_time:16868ms step_avg:149.28ms step:124/1480 train_time:17012ms step_avg:149.23ms step:125/1480 train_time:17159ms step_avg:149.21ms step:125/1480 val_loss:4.4165 train_time:17225ms step_avg:149.78ms step:126/1480 train_time:17317ms step_avg:149.28ms step:127/1480 train_time:17461ms step_avg:149.24ms step:128/1480 train_time:17606ms step_avg:149.21ms step:129/1480 train_time:17751ms step_avg:149.16ms step:130/1480 train_time:17896ms step_avg:149.14ms step:131/1480 train_time:18042ms step_avg:149.11ms step:132/1480 train_time:18187ms step_avg:149.07ms step:133/1480 train_time:18331ms step_avg:149.04ms step:134/1480 train_time:18479ms step_avg:149.02ms step:135/1480 train_time:18626ms step_avg:149.01ms step:136/1480 train_time:18769ms step_avg:148.96ms step:137/1480 train_time:18915ms step_avg:148.94ms step:138/1480 train_time:19061ms step_avg:148.92ms step:139/1480 train_time:19206ms step_avg:148.88ms step:140/1480 train_time:19351ms step_avg:148.85ms step:141/1480 train_time:19497ms step_avg:148.83ms step:142/1480 train_time:19644ms step_avg:148.81ms step:143/1480 train_time:19788ms step_avg:148.78ms step:144/1480 train_time:19933ms step_avg:148.75ms step:145/1480 train_time:20079ms step_avg:148.73ms step:146/1480 train_time:20225ms step_avg:148.71ms step:147/1480 train_time:20369ms step_avg:148.68ms step:148/1480 train_time:20515ms step_avg:148.66ms step:149/1480 train_time:20661ms step_avg:148.64ms step:150/1480 train_time:20806ms step_avg:148.62ms step:151/1480 train_time:20951ms step_avg:148.59ms step:152/1480 train_time:21097ms step_avg:148.57ms step:153/1480 train_time:21244ms step_avg:148.56ms step:154/1480 train_time:21389ms step_avg:148.53ms step:155/1480 train_time:21534ms step_avg:148.51ms step:156/1480 train_time:21680ms step_avg:148.49ms step:157/1480 train_time:21826ms step_avg:148.48ms step:158/1480 train_time:21970ms step_avg:148.45ms step:159/1480 train_time:22117ms step_avg:148.43ms step:160/1480 train_time:22262ms step_avg:148.42ms step:161/1480 train_time:22407ms step_avg:148.39ms step:162/1480 train_time:22552ms step_avg:148.37ms step:163/1480 train_time:22699ms step_avg:148.36ms step:164/1480 train_time:22845ms step_avg:148.35ms step:165/1480 train_time:22993ms step_avg:148.34ms step:166/1480 train_time:23140ms step_avg:148.33ms step:167/1480 train_time:23284ms step_avg:148.31ms step:168/1480 train_time:23429ms step_avg:148.28ms step:169/1480 train_time:23574ms step_avg:148.27ms step:170/1480 train_time:23722ms step_avg:148.26ms step:171/1480 train_time:23866ms step_avg:148.24ms step:172/1480 train_time:24013ms step_avg:148.23ms step:173/1480 train_time:24159ms step_avg:148.22ms step:174/1480 train_time:24305ms step_avg:148.20ms step:175/1480 train_time:24449ms step_avg:148.18ms step:176/1480 train_time:24595ms step_avg:148.16ms step:177/1480 train_time:24741ms step_avg:148.15ms step:178/1480 train_time:25264ms step_avg:150.38ms step:179/1480 train_time:25370ms step_avg:150.12ms step:180/1480 train_time:25899ms step_avg:152.35ms step:181/1480 train_time:26006ms step_avg:152.08ms step:182/1480 train_time:26153ms step_avg:152.05ms step:183/1480 train_time:26299ms step_avg:152.02ms step:184/1480 train_time:26445ms step_avg:151.98ms step:185/1480 train_time:26590ms step_avg:151.94ms step:186/1480 train_time:26735ms step_avg:151.90ms step:187/1480 train_time:26882ms step_avg:151.88ms step:188/1480 train_time:27028ms step_avg:151.84ms step:189/1480 train_time:27204ms step_avg:151.98ms step:190/1480 train_time:27321ms step_avg:151.78ms step:191/1480 train_time:27467ms step_avg:151.75ms step:192/1480 train_time:27613ms step_avg:151.72ms step:193/1480 train_time:27759ms step_avg:151.69ms step:194/1480 train_time:27905ms step_avg:151.66ms step:195/1480 train_time:28050ms step_avg:151.62ms step:196/1480 train_time:28197ms step_avg:151.59ms step:197/1480 train_time:28343ms step_avg:151.57ms step:198/1480 train_time:28489ms step_avg:151.53ms step:199/1480 train_time:28634ms step_avg:151.50ms step:200/1480 train_time:28781ms step_avg:151.48ms step:201/1480 train_time:28932ms step_avg:151.48ms step:202/1480 train_time:29072ms step_avg:151.42ms step:203/1480 train_time:29219ms step_avg:151.39ms step:204/1480 train_time:29364ms step_avg:151.36ms step:205/1480 train_time:29510ms step_avg:151.33ms step:206/1480 train_time:29656ms step_avg:151.31ms step:207/1480 train_time:29804ms step_avg:151.29ms step:208/1480 train_time:29950ms step_avg:151.26ms step:209/1480 train_time:30096ms step_avg:151.24ms step:210/1480 train_time:30244ms step_avg:151.22ms step:211/1480 train_time:30388ms step_avg:151.18ms step:212/1480 train_time:30533ms step_avg:151.16ms step:213/1480 train_time:30681ms step_avg:151.14ms step:214/1480 train_time:30827ms step_avg:151.11ms step:215/1480 train_time:30973ms step_avg:151.09ms step:216/1480 train_time:31119ms step_avg:151.06ms step:217/1480 train_time:31265ms step_avg:151.04ms step:218/1480 train_time:31410ms step_avg:151.01ms step:219/1480 train_time:31556ms step_avg:150.99ms step:220/1480 train_time:31703ms step_avg:150.97ms step:221/1480 train_time:32232ms step_avg:152.76ms step:222/1480 train_time:32345ms step_avg:152.57ms step:223/1480 train_time:32491ms step_avg:152.54ms step:224/1480 train_time:32640ms step_avg:152.52ms step:225/1480 train_time:32788ms step_avg:152.50ms step:226/1480 train_time:32937ms step_avg:152.48ms step:227/1480 train_time:33085ms step_avg:152.47ms step:228/1480 train_time:33233ms step_avg:152.45ms step:229/1480 train_time:33384ms step_avg:152.44ms step:230/1480 train_time:33531ms step_avg:152.41ms step:231/1480 train_time:33680ms step_avg:152.40ms step:232/1480 train_time:33828ms step_avg:152.38ms step:233/1480 train_time:33977ms step_avg:152.36ms step:234/1480 train_time:34126ms step_avg:152.35ms step:235/1480 train_time:34273ms step_avg:152.33ms step:236/1480 train_time:34423ms step_avg:152.31ms step:237/1480 train_time:34570ms step_avg:152.29ms step:238/1480 train_time:34719ms step_avg:152.28ms step:239/1480 train_time:34867ms step_avg:152.26ms step:240/1480 train_time:35016ms step_avg:152.24ms step:241/1480 train_time:35164ms step_avg:152.23ms step:242/1480 train_time:35312ms step_avg:152.21ms step:243/1480 train_time:35461ms step_avg:152.19ms step:244/1480 train_time:35610ms step_avg:152.18ms step:245/1480 train_time:35758ms step_avg:152.16ms step:246/1480 train_time:35908ms step_avg:152.15ms step:247/1480 train_time:36055ms step_avg:152.13ms step:248/1480 train_time:36205ms step_avg:152.12ms step:249/1480 train_time:36352ms step_avg:152.10ms step:250/1480 train_time:36501ms step_avg:152.09ms step:250/1480 val_loss:3.9919 train_time:36567ms step_avg:152.36ms step:251/1480 train_time:36663ms step_avg:152.13ms step:252/1480 train_time:36807ms step_avg:152.10ms step:253/1480 train_time:36955ms step_avg:152.08ms step:254/1480 train_time:37104ms step_avg:152.06ms step:255/1480 train_time:37251ms step_avg:152.04ms step:256/1480 train_time:37400ms step_avg:152.03ms step:257/1480 train_time:37547ms step_avg:152.01ms step:258/1480 train_time:37695ms step_avg:152.00ms step:259/1480 train_time:37846ms step_avg:151.99ms step:260/1480 train_time:37994ms step_avg:151.98ms step:261/1480 train_time:38144ms step_avg:151.97ms step:262/1480 train_time:38291ms step_avg:151.95ms step:263/1480 train_time:38441ms step_avg:151.94ms step:264/1480 train_time:38588ms step_avg:151.92ms step:265/1480 train_time:38738ms step_avg:151.92ms step:266/1480 train_time:38887ms step_avg:151.90ms step:267/1480 train_time:39036ms step_avg:151.89ms step:268/1480 train_time:39185ms step_avg:151.88ms step:269/1480 train_time:39333ms step_avg:151.86ms step:270/1480 train_time:39482ms step_avg:151.85ms step:271/1480 train_time:39629ms step_avg:151.84ms step:272/1480 train_time:39779ms step_avg:151.83ms step:273/1480 train_time:39927ms step_avg:151.81ms step:274/1480 train_time:40076ms step_avg:151.80ms step:275/1480 train_time:40225ms step_avg:151.79ms step:276/1480 train_time:40373ms step_avg:151.78ms step:277/1480 train_time:40521ms step_avg:151.77ms step:278/1480 train_time:40669ms step_avg:151.75ms step:279/1480 train_time:40817ms step_avg:151.74ms step:280/1480 train_time:40967ms step_avg:151.73ms step:281/1480 train_time:41115ms step_avg:151.71ms step:282/1480 train_time:41264ms step_avg:151.71ms step:283/1480 train_time:41412ms step_avg:151.69ms step:284/1480 train_time:41560ms step_avg:151.68ms step:285/1480 train_time:41707ms step_avg:151.66ms step:286/1480 train_time:41856ms step_avg:151.65ms step:287/1480 train_time:42005ms step_avg:151.64ms step:288/1480 train_time:42152ms step_avg:151.63ms step:289/1480 train_time:42301ms step_avg:151.62ms step:290/1480 train_time:42450ms step_avg:151.61ms step:291/1480 train_time:42599ms step_avg:151.60ms step:292/1480 train_time:42747ms step_avg:151.59ms step:293/1480 train_time:42895ms step_avg:151.57ms step:294/1480 train_time:43044ms step_avg:151.56ms step:295/1480 train_time:43192ms step_avg:151.55ms step:296/1480 train_time:43341ms step_avg:151.54ms step:297/1480 train_time:43489ms step_avg:151.53ms step:298/1480 train_time:43638ms step_avg:151.52ms step:299/1480 train_time:43786ms step_avg:151.51ms step:300/1480 train_time:43935ms step_avg:151.50ms step:301/1480 train_time:44084ms step_avg:151.49ms step:302/1480 train_time:44231ms step_avg:151.47ms step:303/1480 train_time:44380ms step_avg:151.47ms step:304/1480 train_time:44528ms step_avg:151.46ms step:305/1480 train_time:44677ms step_avg:151.45ms step:306/1480 train_time:44826ms step_avg:151.44ms step:307/1480 train_time:44974ms step_avg:151.43ms step:308/1480 train_time:45123ms step_avg:151.42ms step:309/1480 train_time:45272ms step_avg:151.41ms step:310/1480 train_time:45420ms step_avg:151.40ms step:311/1480 train_time:45568ms step_avg:151.39ms step:312/1480 train_time:45717ms step_avg:151.38ms step:313/1480 train_time:45866ms step_avg:151.37ms step:314/1480 train_time:46014ms step_avg:151.36ms step:315/1480 train_time:46163ms step_avg:151.35ms step:316/1480 train_time:46310ms step_avg:151.34ms step:317/1480 train_time:46460ms step_avg:151.34ms step:318/1480 train_time:46608ms step_avg:151.32ms step:319/1480 train_time:46756ms step_avg:151.32ms step:320/1480 train_time:46906ms step_avg:151.31ms step:321/1480 train_time:47053ms step_avg:151.30ms step:322/1480 train_time:47203ms step_avg:151.29ms step:323/1480 train_time:47350ms step_avg:151.28ms step:324/1480 train_time:47499ms step_avg:151.27ms step:325/1480 train_time:47648ms step_avg:151.26ms step:326/1480 train_time:47797ms step_avg:151.26ms step:327/1480 train_time:47946ms step_avg:151.25ms step:328/1480 train_time:48094ms step_avg:151.24ms step:329/1480 train_time:48243ms step_avg:151.23ms step:330/1480 train_time:48391ms step_avg:151.22ms step:331/1480 train_time:48542ms step_avg:151.22ms step:332/1480 train_time:48692ms step_avg:151.22ms step:333/1480 train_time:48843ms step_avg:151.22ms step:334/1480 train_time:48993ms step_avg:151.21ms step:335/1480 train_time:49144ms step_avg:151.21ms step:336/1480 train_time:49295ms step_avg:151.21ms step:337/1480 train_time:49446ms step_avg:151.21ms step:338/1480 train_time:49597ms step_avg:151.21ms step:339/1480 train_time:49749ms step_avg:151.21ms step:340/1480 train_time:49900ms step_avg:151.21ms step:341/1480 train_time:50050ms step_avg:151.21ms step:342/1480 train_time:50201ms step_avg:151.21ms step:343/1480 train_time:50351ms step_avg:151.20ms step:344/1480 train_time:50501ms step_avg:151.20ms step:345/1480 train_time:50651ms step_avg:151.20ms step:346/1480 train_time:50803ms step_avg:151.20ms step:347/1480 train_time:50953ms step_avg:151.20ms step:348/1480 train_time:51104ms step_avg:151.20ms step:349/1480 train_time:51254ms step_avg:151.19ms step:350/1480 train_time:51405ms step_avg:151.19ms step:351/1480 train_time:51554ms step_avg:151.19ms step:352/1480 train_time:51705ms step_avg:151.19ms step:353/1480 train_time:51855ms step_avg:151.18ms step:354/1480 train_time:52007ms step_avg:151.18ms step:355/1480 train_time:52157ms step_avg:151.18ms step:356/1480 train_time:52308ms step_avg:151.18ms step:357/1480 train_time:52459ms step_avg:151.18ms step:358/1480 train_time:52609ms step_avg:151.18ms step:359/1480 train_time:52761ms step_avg:151.18ms step:360/1480 train_time:52911ms step_avg:151.17ms step:361/1480 train_time:53064ms step_avg:151.18ms step:362/1480 train_time:53214ms step_avg:151.18ms step:363/1480 train_time:53366ms step_avg:151.18ms step:364/1480 train_time:53518ms step_avg:151.18ms step:365/1480 train_time:53669ms step_avg:151.18ms step:366/1480 train_time:53820ms step_avg:151.18ms step:367/1480 train_time:53971ms step_avg:151.18ms step:368/1480 train_time:54122ms step_avg:151.18ms step:369/1480 train_time:54272ms step_avg:151.18ms step:370/1480 train_time:54423ms step_avg:151.18ms step:371/1480 train_time:54573ms step_avg:151.17ms step:372/1480 train_time:54724ms step_avg:151.17ms step:373/1480 train_time:54874ms step_avg:151.17ms step:374/1480 train_time:55026ms step_avg:151.17ms step:375/1480 train_time:55176ms step_avg:151.17ms step:375/1480 val_loss:3.8082 train_time:55244ms step_avg:151.35ms step:376/1480 train_time:55335ms step_avg:151.19ms step:377/1480 train_time:55484ms step_avg:151.18ms step:378/1480 train_time:55635ms step_avg:151.18ms step:379/1480 train_time:55809ms step_avg:151.24ms step:380/1480 train_time:55935ms step_avg:151.18ms step:381/1480 train_time:56085ms step_avg:151.17ms step:382/1480 train_time:56235ms step_avg:151.17ms step:383/1480 train_time:56386ms step_avg:151.17ms step:384/1480 train_time:56536ms step_avg:151.17ms step:385/1480 train_time:56688ms step_avg:151.17ms step:386/1480 train_time:56839ms step_avg:151.17ms step:387/1480 train_time:56989ms step_avg:151.16ms step:388/1480 train_time:57140ms step_avg:151.17ms step:389/1480 train_time:57292ms step_avg:151.17ms step:390/1480 train_time:57444ms step_avg:151.17ms step:391/1480 train_time:57595ms step_avg:151.17ms step:392/1480 train_time:57746ms step_avg:151.17ms step:393/1480 train_time:57896ms step_avg:151.17ms step:394/1480 train_time:58048ms step_avg:151.17ms step:395/1480 train_time:58197ms step_avg:151.16ms step:396/1480 train_time:58349ms step_avg:151.16ms step:397/1480 train_time:58498ms step_avg:151.16ms step:398/1480 train_time:58650ms step_avg:151.16ms step:399/1480 train_time:58800ms step_avg:151.16ms step:400/1480 train_time:58952ms step_avg:151.16ms step:401/1480 train_time:59107ms step_avg:151.17ms step:402/1480 train_time:59255ms step_avg:151.16ms step:403/1480 train_time:59407ms step_avg:151.16ms step:404/1480 train_time:59556ms step_avg:151.16ms step:405/1480 train_time:59708ms step_avg:151.16ms step:406/1480 train_time:59858ms step_avg:151.16ms step:407/1480 train_time:60010ms step_avg:151.16ms step:408/1480 train_time:60160ms step_avg:151.16ms step:409/1480 train_time:60311ms step_avg:151.16ms step:410/1480 train_time:60462ms step_avg:151.15ms step:411/1480 train_time:60612ms step_avg:151.15ms step:412/1480 train_time:60762ms step_avg:151.15ms step:413/1480 train_time:60913ms step_avg:151.15ms step:414/1480 train_time:61064ms step_avg:151.15ms step:415/1480 train_time:61215ms step_avg:151.15ms step:416/1480 train_time:61367ms step_avg:151.15ms step:417/1480 train_time:61517ms step_avg:151.15ms step:418/1480 train_time:61668ms step_avg:151.15ms step:419/1480 train_time:61818ms step_avg:151.14ms step:420/1480 train_time:61969ms step_avg:151.15ms step:421/1480 train_time:62119ms step_avg:151.14ms step:422/1480 train_time:62271ms step_avg:151.14ms step:423/1480 train_time:62422ms step_avg:151.14ms step:424/1480 train_time:62573ms step_avg:151.14ms step:425/1480 train_time:62724ms step_avg:151.14ms step:426/1480 train_time:62874ms step_avg:151.14ms step:427/1480 train_time:63026ms step_avg:151.14ms step:428/1480 train_time:63175ms step_avg:151.14ms step:429/1480 train_time:63327ms step_avg:151.14ms step:430/1480 train_time:63477ms step_avg:151.14ms step:431/1480 train_time:63629ms step_avg:151.14ms step:432/1480 train_time:63779ms step_avg:151.13ms step:433/1480 train_time:63930ms step_avg:151.13ms step:434/1480 train_time:64080ms step_avg:151.13ms step:435/1480 train_time:64231ms step_avg:151.13ms step:436/1480 train_time:64381ms step_avg:151.13ms step:437/1480 train_time:64532ms step_avg:151.13ms step:438/1480 train_time:64683ms step_avg:151.13ms step:439/1480 train_time:64834ms step_avg:151.13ms step:440/1480 train_time:64985ms step_avg:151.13ms step:441/1480 train_time:65137ms step_avg:151.13ms step:442/1480 train_time:65291ms step_avg:151.14ms step:443/1480 train_time:65445ms step_avg:151.14ms step:444/1480 train_time:65597ms step_avg:151.15ms step:445/1480 train_time:65750ms step_avg:151.15ms step:446/1480 train_time:65904ms step_avg:151.15ms step:447/1480 train_time:66056ms step_avg:151.16ms step:448/1480 train_time:66209ms step_avg:151.16ms step:449/1480 train_time:66360ms step_avg:151.16ms step:450/1480 train_time:66514ms step_avg:151.17ms step:451/1480 train_time:66668ms step_avg:151.17ms step:452/1480 train_time:66820ms step_avg:151.18ms step:453/1480 train_time:66973ms step_avg:151.18ms step:454/1480 train_time:67127ms step_avg:151.19ms step:455/1480 train_time:67279ms step_avg:151.19ms step:456/1480 train_time:67432ms step_avg:151.19ms step:457/1480 train_time:67585ms step_avg:151.20ms step:458/1480 train_time:67737ms step_avg:151.20ms step:459/1480 train_time:67890ms step_avg:151.20ms step:460/1480 train_time:68045ms step_avg:151.21ms step:461/1480 train_time:68197ms step_avg:151.21ms step:462/1480 train_time:68350ms step_avg:151.22ms step:463/1480 train_time:68503ms step_avg:151.22ms step:464/1480 train_time:68657ms step_avg:151.23ms step:465/1480 train_time:68810ms step_avg:151.23ms step:466/1480 train_time:68961ms step_avg:151.23ms step:467/1480 train_time:69115ms step_avg:151.24ms step:468/1480 train_time:69268ms step_avg:151.24ms step:469/1480 train_time:69420ms step_avg:151.24ms step:470/1480 train_time:69573ms step_avg:151.24ms step:471/1480 train_time:69726ms step_avg:151.25ms step:472/1480 train_time:69879ms step_avg:151.25ms step:473/1480 train_time:70032ms step_avg:151.26ms step:474/1480 train_time:70184ms step_avg:151.26ms step:475/1480 train_time:70337ms step_avg:151.26ms step:476/1480 train_time:70491ms step_avg:151.27ms step:477/1480 train_time:70645ms step_avg:151.27ms step:478/1480 train_time:70799ms step_avg:151.28ms step:479/1480 train_time:70952ms step_avg:151.28ms step:480/1480 train_time:71105ms step_avg:151.29ms step:481/1480 train_time:71258ms step_avg:151.29ms step:482/1480 train_time:71411ms step_avg:151.29ms step:483/1480 train_time:71562ms step_avg:151.29ms step:484/1480 train_time:71715ms step_avg:151.30ms step:485/1480 train_time:71869ms step_avg:151.30ms step:486/1480 train_time:72022ms step_avg:151.31ms step:487/1480 train_time:72175ms step_avg:151.31ms step:488/1480 train_time:72328ms step_avg:151.31ms step:489/1480 train_time:72480ms step_avg:151.32ms step:490/1480 train_time:72633ms step_avg:151.32ms step:491/1480 train_time:72785ms step_avg:151.32ms step:492/1480 train_time:72937ms step_avg:151.32ms step:493/1480 train_time:73090ms step_avg:151.33ms step:494/1480 train_time:73246ms step_avg:151.33ms step:495/1480 train_time:73399ms step_avg:151.34ms step:496/1480 train_time:73552ms step_avg:151.34ms step:497/1480 train_time:73704ms step_avg:151.34ms step:498/1480 train_time:73856ms step_avg:151.34ms step:499/1480 train_time:74010ms step_avg:151.35ms step:500/1480 train_time:74162ms step_avg:151.35ms step:500/1480 val_loss:3.6898 train_time:74231ms step_avg:151.49ms step:501/1480 train_time:74323ms step_avg:151.37ms step:502/1480 train_time:74473ms step_avg:151.37ms step:503/1480 train_time:74626ms step_avg:151.37ms step:504/1480 train_time:74778ms step_avg:151.37ms step:505/1480 train_time:74930ms step_avg:151.37ms step:506/1480 train_time:75082ms step_avg:151.38ms step:507/1480 train_time:75234ms step_avg:151.38ms step:508/1480 train_time:75387ms step_avg:151.38ms step:509/1480 train_time:75542ms step_avg:151.39ms step:510/1480 train_time:75695ms step_avg:151.39ms step:511/1480 train_time:75848ms step_avg:151.39ms step:512/1480 train_time:76002ms step_avg:151.40ms step:513/1480 train_time:76154ms step_avg:151.40ms step:514/1480 train_time:76307ms step_avg:151.40ms step:515/1480 train_time:76459ms step_avg:151.40ms step:516/1480 train_time:76613ms step_avg:151.41ms step:517/1480 train_time:76766ms step_avg:151.41ms step:518/1480 train_time:76920ms step_avg:151.42ms step:519/1480 train_time:77073ms step_avg:151.42ms step:520/1480 train_time:77226ms step_avg:151.42ms step:521/1480 train_time:77378ms step_avg:151.42ms step:522/1480 train_time:77531ms step_avg:151.43ms step:523/1480 train_time:77684ms step_avg:151.43ms step:524/1480 train_time:77838ms step_avg:151.44ms step:525/1480 train_time:77991ms step_avg:151.44ms step:526/1480 train_time:78145ms step_avg:151.44ms step:527/1480 train_time:78297ms step_avg:151.44ms step:528/1480 train_time:78449ms step_avg:151.45ms step:529/1480 train_time:78602ms step_avg:151.45ms step:530/1480 train_time:78754ms step_avg:151.45ms step:531/1480 train_time:78906ms step_avg:151.45ms step:532/1480 train_time:79059ms step_avg:151.45ms step:533/1480 train_time:79211ms step_avg:151.46ms step:534/1480 train_time:79363ms step_avg:151.46ms step:535/1480 train_time:79517ms step_avg:151.46ms step:536/1480 train_time:79671ms step_avg:151.47ms step:537/1480 train_time:79824ms step_avg:151.47ms step:538/1480 train_time:79978ms step_avg:151.47ms step:539/1480 train_time:80131ms step_avg:151.48ms step:540/1480 train_time:80284ms step_avg:151.48ms step:541/1480 train_time:80438ms step_avg:151.48ms step:542/1480 train_time:80590ms step_avg:151.49ms step:543/1480 train_time:80744ms step_avg:151.49ms step:544/1480 train_time:80896ms step_avg:151.49ms step:545/1480 train_time:81049ms step_avg:151.49ms step:546/1480 train_time:81202ms step_avg:151.50ms step:547/1480 train_time:81354ms step_avg:151.50ms step:548/1480 train_time:81507ms step_avg:151.50ms step:549/1480 train_time:81661ms step_avg:151.50ms step:550/1480 train_time:81813ms step_avg:151.51ms step:551/1480 train_time:81968ms step_avg:151.51ms step:552/1480 train_time:82123ms step_avg:151.52ms step:553/1480 train_time:82277ms step_avg:151.52ms step:554/1480 train_time:82434ms step_avg:151.53ms step:555/1480 train_time:82588ms step_avg:151.54ms step:556/1480 train_time:82743ms step_avg:151.54ms step:557/1480 train_time:82898ms step_avg:151.55ms step:558/1480 train_time:83053ms step_avg:151.56ms step:559/1480 train_time:83206ms step_avg:151.56ms step:560/1480 train_time:83362ms step_avg:151.57ms step:561/1480 train_time:83515ms step_avg:151.57ms step:562/1480 train_time:83669ms step_avg:151.57ms step:563/1480 train_time:83823ms step_avg:151.58ms step:564/1480 train_time:83979ms step_avg:151.59ms step:565/1480 train_time:84134ms step_avg:151.59ms step:566/1480 train_time:84289ms step_avg:151.60ms step:567/1480 train_time:84444ms step_avg:151.60ms step:568/1480 train_time:84597ms step_avg:151.61ms step:569/1480 train_time:84772ms step_avg:151.65ms step:570/1480 train_time:84906ms step_avg:151.62ms step:571/1480 train_time:85060ms step_avg:151.62ms step:572/1480 train_time:85215ms step_avg:151.63ms step:573/1480 train_time:85370ms step_avg:151.63ms step:574/1480 train_time:85526ms step_avg:151.64ms step:575/1480 train_time:85680ms step_avg:151.65ms step:576/1480 train_time:85834ms step_avg:151.65ms step:577/1480 train_time:85989ms step_avg:151.66ms step:578/1480 train_time:86143ms step_avg:151.66ms step:579/1480 train_time:86297ms step_avg:151.66ms step:580/1480 train_time:86452ms step_avg:151.67ms step:581/1480 train_time:86606ms step_avg:151.68ms step:582/1480 train_time:86761ms step_avg:151.68ms step:583/1480 train_time:86916ms step_avg:151.69ms step:584/1480 train_time:87072ms step_avg:151.69ms step:585/1480 train_time:87226ms step_avg:151.70ms step:586/1480 train_time:87380ms step_avg:151.70ms step:587/1480 train_time:87535ms step_avg:151.71ms step:588/1480 train_time:87689ms step_avg:151.71ms step:589/1480 train_time:87844ms step_avg:151.72ms step:590/1480 train_time:87999ms step_avg:151.72ms step:591/1480 train_time:88153ms step_avg:151.73ms step:592/1480 train_time:88308ms step_avg:151.73ms step:593/1480 train_time:88463ms step_avg:151.74ms step:594/1480 train_time:88618ms step_avg:151.74ms step:595/1480 train_time:88774ms step_avg:151.75ms step:596/1480 train_time:88929ms step_avg:151.76ms step:597/1480 train_time:89083ms step_avg:151.76ms step:598/1480 train_time:89239ms step_avg:151.77ms step:599/1480 train_time:89394ms step_avg:151.77ms step:600/1480 train_time:89549ms step_avg:151.78ms step:601/1480 train_time:89704ms step_avg:151.78ms step:602/1480 train_time:89859ms step_avg:151.79ms step:603/1480 train_time:90014ms step_avg:151.79ms step:604/1480 train_time:90169ms step_avg:151.80ms step:605/1480 train_time:90324ms step_avg:151.81ms step:606/1480 train_time:90479ms step_avg:151.81ms step:607/1480 train_time:90635ms step_avg:151.82ms step:608/1480 train_time:90790ms step_avg:151.82ms step:609/1480 train_time:90945ms step_avg:151.83ms step:610/1480 train_time:91099ms step_avg:151.83ms step:611/1480 train_time:91253ms step_avg:151.83ms step:612/1480 train_time:91407ms step_avg:151.84ms step:613/1480 train_time:91563ms step_avg:151.85ms step:614/1480 train_time:91719ms step_avg:151.85ms step:615/1480 train_time:91873ms step_avg:151.86ms step:616/1480 train_time:92027ms step_avg:151.86ms step:617/1480 train_time:92182ms step_avg:151.86ms step:618/1480 train_time:92337ms step_avg:151.87ms step:619/1480 train_time:92492ms step_avg:151.88ms step:620/1480 train_time:92647ms step_avg:151.88ms step:621/1480 train_time:92803ms step_avg:151.89ms step:622/1480 train_time:92957ms step_avg:151.89ms step:623/1480 train_time:93112ms step_avg:151.90ms step:624/1480 train_time:93267ms step_avg:151.90ms step:625/1480 train_time:93421ms step_avg:151.90ms step:625/1480 val_loss:3.6084 train_time:93491ms step_avg:152.02ms step:626/1480 train_time:93583ms step_avg:151.92ms step:627/1480 train_time:93736ms step_avg:151.92ms step:628/1480 train_time:93890ms step_avg:151.93ms step:629/1480 train_time:94045ms step_avg:151.93ms step:630/1480 train_time:94199ms step_avg:151.93ms step:631/1480 train_time:94352ms step_avg:151.94ms step:632/1480 train_time:94507ms step_avg:151.94ms step:633/1480 train_time:94662ms step_avg:151.95ms step:634/1480 train_time:94817ms step_avg:151.95ms step:635/1480 train_time:94971ms step_avg:151.95ms step:636/1480 train_time:95126ms step_avg:151.96ms step:637/1480 train_time:95281ms step_avg:151.96ms step:638/1480 train_time:95436ms step_avg:151.97ms step:639/1480 train_time:95590ms step_avg:151.97ms step:640/1480 train_time:95745ms step_avg:151.98ms step:641/1480 train_time:95900ms step_avg:151.98ms step:642/1480 train_time:96054ms step_avg:151.98ms step:643/1480 train_time:96209ms step_avg:151.99ms step:644/1480 train_time:96364ms step_avg:151.99ms step:645/1480 train_time:96518ms step_avg:152.00ms step:646/1480 train_time:96673ms step_avg:152.00ms step:647/1480 train_time:96827ms step_avg:152.00ms step:648/1480 train_time:96984ms step_avg:152.01ms step:649/1480 train_time:97139ms step_avg:152.02ms step:650/1480 train_time:97295ms step_avg:152.02ms step:651/1480 train_time:97450ms step_avg:152.03ms step:652/1480 train_time:97605ms step_avg:152.03ms step:653/1480 train_time:97760ms step_avg:152.04ms step:654/1480 train_time:97914ms step_avg:152.04ms step:655/1480 train_time:98069ms step_avg:152.05ms step:656/1480 train_time:98224ms step_avg:152.05ms step:657/1480 train_time:98378ms step_avg:152.05ms step:658/1480 train_time:98533ms step_avg:152.06ms step:659/1480 train_time:98689ms step_avg:152.06ms step:660/1480 train_time:98844ms step_avg:152.07ms step:661/1480 train_time:99001ms step_avg:152.08ms step:662/1480 train_time:99158ms step_avg:152.08ms step:663/1480 train_time:99314ms step_avg:152.09ms step:664/1480 train_time:99469ms step_avg:152.09ms step:665/1480 train_time:99627ms step_avg:152.10ms step:666/1480 train_time:99783ms step_avg:152.11ms step:667/1480 train_time:99939ms step_avg:152.11ms step:668/1480 train_time:100095ms step_avg:152.12ms step:669/1480 train_time:100253ms step_avg:152.13ms step:670/1480 train_time:100409ms step_avg:152.14ms step:671/1480 train_time:100564ms step_avg:152.14ms step:672/1480 train_time:100721ms step_avg:152.15ms step:673/1480 train_time:100877ms step_avg:152.15ms step:674/1480 train_time:101034ms step_avg:152.16ms step:675/1480 train_time:101191ms step_avg:152.17ms step:676/1480 train_time:101349ms step_avg:152.18ms step:677/1480 train_time:101505ms step_avg:152.18ms step:678/1480 train_time:101661ms step_avg:152.19ms step:679/1480 train_time:101817ms step_avg:152.19ms step:680/1480 train_time:101974ms step_avg:152.20ms step:681/1480 train_time:102129ms step_avg:152.20ms step:682/1480 train_time:102288ms step_avg:152.21ms step:683/1480 train_time:102445ms step_avg:152.22ms step:684/1480 train_time:102601ms step_avg:152.23ms step:685/1480 train_time:102756ms step_avg:152.23ms step:686/1480 train_time:102913ms step_avg:152.24ms step:687/1480 train_time:103069ms step_avg:152.24ms step:688/1480 train_time:103227ms step_avg:152.25ms step:689/1480 train_time:103385ms step_avg:152.26ms step:690/1480 train_time:103542ms step_avg:152.27ms step:691/1480 train_time:103699ms step_avg:152.27ms step:692/1480 train_time:103855ms step_avg:152.28ms step:693/1480 train_time:104011ms step_avg:152.29ms step:694/1480 train_time:104167ms step_avg:152.29ms step:695/1480 train_time:104322ms step_avg:152.29ms step:696/1480 train_time:104478ms step_avg:152.30ms step:697/1480 train_time:104633ms step_avg:152.30ms step:698/1480 train_time:104790ms step_avg:152.31ms step:699/1480 train_time:104946ms step_avg:152.32ms step:700/1480 train_time:105103ms step_avg:152.32ms step:701/1480 train_time:105259ms step_avg:152.33ms step:702/1480 train_time:105415ms step_avg:152.33ms step:703/1480 train_time:105570ms step_avg:152.34ms step:704/1480 train_time:105727ms step_avg:152.34ms step:705/1480 train_time:105883ms step_avg:152.35ms step:706/1480 train_time:106041ms step_avg:152.36ms step:707/1480 train_time:106199ms step_avg:152.37ms step:708/1480 train_time:106356ms step_avg:152.37ms step:709/1480 train_time:106511ms step_avg:152.38ms step:710/1480 train_time:106667ms step_avg:152.38ms step:711/1480 train_time:106824ms step_avg:152.39ms step:712/1480 train_time:106980ms step_avg:152.39ms step:713/1480 train_time:107137ms step_avg:152.40ms step:714/1480 train_time:107293ms step_avg:152.40ms step:715/1480 train_time:107449ms step_avg:152.41ms step:716/1480 train_time:107605ms step_avg:152.42ms step:717/1480 train_time:107761ms step_avg:152.42ms step:718/1480 train_time:107917ms step_avg:152.42ms step:719/1480 train_time:108071ms step_avg:152.43ms step:720/1480 train_time:108230ms step_avg:152.44ms step:721/1480 train_time:108387ms step_avg:152.44ms step:722/1480 train_time:108544ms step_avg:152.45ms step:723/1480 train_time:108701ms step_avg:152.46ms step:724/1480 train_time:108858ms step_avg:152.46ms step:725/1480 train_time:109015ms step_avg:152.47ms step:726/1480 train_time:109171ms step_avg:152.47ms step:727/1480 train_time:109330ms step_avg:152.48ms step:728/1480 train_time:109486ms step_avg:152.49ms step:729/1480 train_time:109643ms step_avg:152.49ms step:730/1480 train_time:109800ms step_avg:152.50ms step:731/1480 train_time:109958ms step_avg:152.51ms step:732/1480 train_time:110114ms step_avg:152.51ms step:733/1480 train_time:110271ms step_avg:152.52ms step:734/1480 train_time:110428ms step_avg:152.53ms step:735/1480 train_time:110584ms step_avg:152.53ms step:736/1480 train_time:110739ms step_avg:152.53ms step:737/1480 train_time:110895ms step_avg:152.54ms step:738/1480 train_time:111050ms step_avg:152.54ms step:739/1480 train_time:111206ms step_avg:152.55ms step:740/1480 train_time:111365ms step_avg:152.55ms step:741/1480 train_time:111523ms step_avg:152.56ms step:742/1480 train_time:111679ms step_avg:152.57ms step:743/1480 train_time:111834ms step_avg:152.57ms step:744/1480 train_time:111991ms step_avg:152.58ms step:745/1480 train_time:112148ms step_avg:152.58ms step:746/1480 train_time:112304ms step_avg:152.59ms step:747/1480 train_time:112460ms step_avg:152.59ms step:748/1480 train_time:112619ms step_avg:152.60ms step:749/1480 train_time:112775ms step_avg:152.61ms step:750/1480 train_time:112931ms step_avg:152.61ms step:750/1480 val_loss:3.5514 train_time:113003ms step_avg:152.71ms step:751/1480 train_time:113094ms step_avg:152.62ms step:752/1480 train_time:113250ms step_avg:152.63ms step:753/1480 train_time:113406ms step_avg:152.63ms step:754/1480 train_time:113563ms step_avg:152.64ms step:755/1480 train_time:113718ms step_avg:152.64ms step:756/1480 train_time:113874ms step_avg:152.65ms step:757/1480 train_time:114032ms step_avg:152.65ms step:758/1480 train_time:114188ms step_avg:152.66ms step:759/1480 train_time:114365ms step_avg:152.69ms step:760/1480 train_time:114505ms step_avg:152.67ms step:761/1480 train_time:114661ms step_avg:152.68ms step:762/1480 train_time:114817ms step_avg:152.68ms step:763/1480 train_time:114974ms step_avg:152.69ms step:764/1480 train_time:115131ms step_avg:152.69ms step:765/1480 train_time:115288ms step_avg:152.70ms step:766/1480 train_time:115445ms step_avg:152.71ms step:767/1480 train_time:115603ms step_avg:152.71ms step:768/1480 train_time:115759ms step_avg:152.72ms step:769/1480 train_time:115916ms step_avg:152.72ms step:770/1480 train_time:116074ms step_avg:152.73ms step:771/1480 train_time:116231ms step_avg:152.73ms step:772/1480 train_time:116389ms step_avg:152.74ms step:773/1480 train_time:116547ms step_avg:152.75ms step:774/1480 train_time:116704ms step_avg:152.75ms step:775/1480 train_time:116861ms step_avg:152.76ms step:776/1480 train_time:117020ms step_avg:152.77ms step:777/1480 train_time:117180ms step_avg:152.78ms step:778/1480 train_time:117339ms step_avg:152.78ms step:779/1480 train_time:117496ms step_avg:152.79ms step:780/1480 train_time:117655ms step_avg:152.80ms step:781/1480 train_time:117813ms step_avg:152.81ms step:782/1480 train_time:117971ms step_avg:152.81ms step:783/1480 train_time:118128ms step_avg:152.82ms step:784/1480 train_time:118286ms step_avg:152.82ms step:785/1480 train_time:118442ms step_avg:152.83ms step:786/1480 train_time:118600ms step_avg:152.83ms step:787/1480 train_time:118759ms step_avg:152.84ms step:788/1480 train_time:118917ms step_avg:152.85ms step:789/1480 train_time:119074ms step_avg:152.85ms step:790/1480 train_time:119231ms step_avg:152.86ms step:791/1480 train_time:119392ms step_avg:152.87ms step:792/1480 train_time:119551ms step_avg:152.88ms step:793/1480 train_time:119708ms step_avg:152.88ms step:794/1480 train_time:119866ms step_avg:152.89ms step:795/1480 train_time:120025ms step_avg:152.90ms step:796/1480 train_time:120185ms step_avg:152.91ms step:797/1480 train_time:120343ms step_avg:152.91ms step:798/1480 train_time:120501ms step_avg:152.92ms step:799/1480 train_time:120664ms step_avg:152.93ms step:800/1480 train_time:120821ms step_avg:152.94ms step:801/1480 train_time:120978ms step_avg:152.94ms step:802/1480 train_time:121136ms step_avg:152.95ms step:803/1480 train_time:121295ms step_avg:152.96ms step:804/1480 train_time:121453ms step_avg:152.96ms step:805/1480 train_time:121611ms step_avg:152.97ms step:806/1480 train_time:121768ms step_avg:152.97ms step:807/1480 train_time:121924ms step_avg:152.98ms step:808/1480 train_time:122083ms step_avg:152.99ms step:809/1480 train_time:122239ms step_avg:152.99ms step:810/1480 train_time:122396ms step_avg:152.99ms step:811/1480 train_time:122552ms step_avg:153.00ms step:812/1480 train_time:122709ms step_avg:153.00ms step:813/1480 train_time:122865ms step_avg:153.01ms step:814/1480 train_time:123022ms step_avg:153.01ms step:815/1480 train_time:123179ms step_avg:153.02ms step:816/1480 train_time:123338ms step_avg:153.02ms step:817/1480 train_time:123495ms step_avg:153.03ms step:818/1480 train_time:123653ms step_avg:153.04ms step:819/1480 train_time:123811ms step_avg:153.04ms step:820/1480 train_time:123969ms step_avg:153.05ms step:821/1480 train_time:124126ms step_avg:153.05ms step:822/1480 train_time:124284ms step_avg:153.06ms step:823/1480 train_time:124441ms step_avg:153.06ms step:824/1480 train_time:124598ms step_avg:153.07ms step:825/1480 train_time:124757ms step_avg:153.08ms step:826/1480 train_time:124917ms step_avg:153.08ms step:827/1480 train_time:125077ms step_avg:153.09ms step:828/1480 train_time:125236ms step_avg:153.10ms step:829/1480 train_time:125395ms step_avg:153.11ms step:830/1480 train_time:125554ms step_avg:153.11ms step:831/1480 train_time:125711ms step_avg:153.12ms step:832/1480 train_time:125870ms step_avg:153.13ms step:833/1480 train_time:126027ms step_avg:153.13ms step:834/1480 train_time:126185ms step_avg:153.14ms step:835/1480 train_time:126343ms step_avg:153.14ms step:836/1480 train_time:126502ms step_avg:153.15ms step:837/1480 train_time:126661ms step_avg:153.16ms step:838/1480 train_time:126817ms step_avg:153.16ms step:839/1480 train_time:126976ms step_avg:153.17ms step:840/1480 train_time:127134ms step_avg:153.17ms step:841/1480 train_time:127290ms step_avg:153.18ms step:842/1480 train_time:127447ms step_avg:153.18ms step:843/1480 train_time:127604ms step_avg:153.19ms step:844/1480 train_time:127760ms step_avg:153.19ms step:845/1480 train_time:127918ms step_avg:153.19ms step:846/1480 train_time:128077ms step_avg:153.20ms step:847/1480 train_time:128237ms step_avg:153.21ms step:848/1480 train_time:128395ms step_avg:153.22ms step:849/1480 train_time:128552ms step_avg:153.22ms step:850/1480 train_time:128709ms step_avg:153.22ms step:851/1480 train_time:128868ms step_avg:153.23ms step:852/1480 train_time:129026ms step_avg:153.24ms step:853/1480 train_time:129183ms step_avg:153.24ms step:854/1480 train_time:129339ms step_avg:153.25ms step:855/1480 train_time:129496ms step_avg:153.25ms step:856/1480 train_time:129654ms step_avg:153.26ms step:857/1480 train_time:129811ms step_avg:153.26ms step:858/1480 train_time:129970ms step_avg:153.27ms step:859/1480 train_time:130129ms step_avg:153.27ms step:860/1480 train_time:130286ms step_avg:153.28ms step:861/1480 train_time:130445ms step_avg:153.28ms step:862/1480 train_time:130607ms step_avg:153.29ms step:863/1480 train_time:130767ms step_avg:153.30ms step:864/1480 train_time:130925ms step_avg:153.31ms step:865/1480 train_time:131083ms step_avg:153.31ms step:866/1480 train_time:131241ms step_avg:153.32ms step:867/1480 train_time:131400ms step_avg:153.33ms step:868/1480 train_time:131557ms step_avg:153.33ms step:869/1480 train_time:131714ms step_avg:153.33ms step:870/1480 train_time:131874ms step_avg:153.34ms step:871/1480 train_time:132031ms step_avg:153.35ms step:872/1480 train_time:132189ms step_avg:153.35ms step:873/1480 train_time:132346ms step_avg:153.36ms step:874/1480 train_time:132506ms step_avg:153.36ms step:875/1480 train_time:132665ms step_avg:153.37ms step:875/1480 val_loss:3.5064 train_time:132737ms step_avg:153.45ms step:876/1480 train_time:132829ms step_avg:153.38ms step:877/1480 train_time:132984ms step_avg:153.38ms step:878/1480 train_time:133142ms step_avg:153.39ms step:879/1480 train_time:133301ms step_avg:153.40ms step:880/1480 train_time:133459ms step_avg:153.40ms step:881/1480 train_time:133617ms step_avg:153.41ms step:882/1480 train_time:133774ms step_avg:153.41ms step:883/1480 train_time:133936ms step_avg:153.42ms step:884/1480 train_time:134099ms step_avg:153.43ms step:885/1480 train_time:134260ms step_avg:153.44ms step:886/1480 train_time:134421ms step_avg:153.45ms step:887/1480 train_time:134580ms step_avg:153.45ms step:888/1480 train_time:134744ms step_avg:153.47ms step:889/1480 train_time:134905ms step_avg:153.48ms step:890/1480 train_time:135064ms step_avg:153.48ms step:891/1480 train_time:135223ms step_avg:153.49ms step:892/1480 train_time:135383ms step_avg:153.50ms step:893/1480 train_time:135542ms step_avg:153.50ms step:894/1480 train_time:135701ms step_avg:153.51ms step:895/1480 train_time:135863ms step_avg:153.52ms step:896/1480 train_time:136021ms step_avg:153.52ms step:897/1480 train_time:136181ms step_avg:153.53ms step:898/1480 train_time:136342ms step_avg:153.54ms step:899/1480 train_time:136502ms step_avg:153.55ms step:900/1480 train_time:136663ms step_avg:153.55ms step:901/1480 train_time:136823ms step_avg:153.56ms step:902/1480 train_time:136979ms step_avg:153.56ms step:903/1480 train_time:137141ms step_avg:153.57ms step:904/1480 train_time:137300ms step_avg:153.58ms step:905/1480 train_time:137458ms step_avg:153.58ms step:906/1480 train_time:137619ms step_avg:153.59ms step:907/1480 train_time:137783ms step_avg:153.60ms step:908/1480 train_time:137940ms step_avg:153.61ms step:909/1480 train_time:138099ms step_avg:153.61ms step:910/1480 train_time:138263ms step_avg:153.63ms step:911/1480 train_time:138422ms step_avg:153.63ms step:912/1480 train_time:138581ms step_avg:153.64ms step:913/1480 train_time:138743ms step_avg:153.65ms step:914/1480 train_time:138904ms step_avg:153.66ms step:915/1480 train_time:139066ms step_avg:153.66ms step:916/1480 train_time:139225ms step_avg:153.67ms step:917/1480 train_time:139382ms step_avg:153.67ms step:918/1480 train_time:139545ms step_avg:153.68ms step:919/1480 train_time:139705ms step_avg:153.69ms step:920/1480 train_time:139865ms step_avg:153.70ms step:921/1480 train_time:140024ms step_avg:153.70ms step:922/1480 train_time:140184ms step_avg:153.71ms step:923/1480 train_time:140342ms step_avg:153.72ms step:924/1480 train_time:140500ms step_avg:153.72ms step:925/1480 train_time:140661ms step_avg:153.73ms step:926/1480 train_time:140821ms step_avg:153.73ms step:927/1480 train_time:140979ms step_avg:153.74ms step:928/1480 train_time:141139ms step_avg:153.75ms step:929/1480 train_time:141299ms step_avg:153.75ms step:930/1480 train_time:141458ms step_avg:153.76ms step:931/1480 train_time:141618ms step_avg:153.77ms step:932/1480 train_time:141776ms step_avg:153.77ms step:933/1480 train_time:141937ms step_avg:153.78ms step:934/1480 train_time:142098ms step_avg:153.79ms step:935/1480 train_time:142259ms step_avg:153.79ms step:936/1480 train_time:142418ms step_avg:153.80ms step:937/1480 train_time:142579ms step_avg:153.81ms step:938/1480 train_time:142736ms step_avg:153.81ms step:939/1480 train_time:142899ms step_avg:153.82ms step:940/1480 train_time:143061ms step_avg:153.83ms step:941/1480 train_time:143220ms step_avg:153.84ms step:942/1480 train_time:143378ms step_avg:153.84ms step:943/1480 train_time:143541ms step_avg:153.85ms step:944/1480 train_time:143703ms step_avg:153.86ms step:945/1480 train_time:143862ms step_avg:153.86ms step:946/1480 train_time:144023ms step_avg:153.87ms step:947/1480 train_time:144185ms step_avg:153.88ms step:948/1480 train_time:144344ms step_avg:153.89ms step:949/1480 train_time:144522ms step_avg:153.91ms step:950/1480 train_time:144663ms step_avg:153.90ms step:951/1480 train_time:144826ms step_avg:153.91ms step:952/1480 train_time:144984ms step_avg:153.91ms step:953/1480 train_time:145143ms step_avg:153.92ms step:954/1480 train_time:145305ms step_avg:153.92ms step:955/1480 train_time:145464ms step_avg:153.93ms step:956/1480 train_time:145622ms step_avg:153.93ms step:957/1480 train_time:145782ms step_avg:153.94ms step:958/1480 train_time:145946ms step_avg:153.95ms step:959/1480 train_time:146104ms step_avg:153.96ms step:960/1480 train_time:146265ms step_avg:153.96ms step:961/1480 train_time:146424ms step_avg:153.97ms step:962/1480 train_time:146582ms step_avg:153.97ms step:963/1480 train_time:146742ms step_avg:153.98ms step:964/1480 train_time:146903ms step_avg:153.99ms step:965/1480 train_time:147062ms step_avg:153.99ms step:966/1480 train_time:147221ms step_avg:154.00ms step:967/1480 train_time:147379ms step_avg:154.00ms step:968/1480 train_time:147542ms step_avg:154.01ms step:969/1480 train_time:147702ms step_avg:154.02ms step:970/1480 train_time:147860ms step_avg:154.02ms step:971/1480 train_time:148019ms step_avg:154.03ms step:972/1480 train_time:148178ms step_avg:154.03ms step:973/1480 train_time:148336ms step_avg:154.03ms step:974/1480 train_time:148496ms step_avg:154.04ms step:975/1480 train_time:148657ms step_avg:154.05ms step:976/1480 train_time:148818ms step_avg:154.06ms step:977/1480 train_time:148977ms step_avg:154.06ms step:978/1480 train_time:149137ms step_avg:154.07ms step:979/1480 train_time:149298ms step_avg:154.07ms step:980/1480 train_time:149459ms step_avg:154.08ms step:981/1480 train_time:149621ms step_avg:154.09ms step:982/1480 train_time:149778ms step_avg:154.09ms step:983/1480 train_time:149939ms step_avg:154.10ms step:984/1480 train_time:150098ms step_avg:154.11ms step:985/1480 train_time:150261ms step_avg:154.11ms step:986/1480 train_time:150422ms step_avg:154.12ms step:987/1480 train_time:150581ms step_avg:154.13ms step:988/1480 train_time:150739ms step_avg:154.13ms step:989/1480 train_time:150898ms step_avg:154.13ms step:990/1480 train_time:151061ms step_avg:154.14ms step:991/1480 train_time:151223ms step_avg:154.15ms step:992/1480 train_time:151387ms step_avg:154.16ms step:993/1480 train_time:151555ms step_avg:154.18ms step:994/1480 train_time:151713ms step_avg:154.18ms step:995/1480 train_time:151872ms step_avg:154.18ms step:996/1480 train_time:152029ms step_avg:154.19ms step:997/1480 train_time:152189ms step_avg:154.19ms step:998/1480 train_time:152347ms step_avg:154.20ms step:999/1480 train_time:152505ms step_avg:154.20ms step:1000/1480 train_time:152666ms step_avg:154.21ms step:1000/1480 val_loss:3.4435 train_time:152739ms step_avg:154.28ms step:1001/1480 train_time:152835ms step_avg:154.22ms step:1002/1480 train_time:152988ms step_avg:154.22ms step:1003/1480 train_time:153152ms step_avg:154.23ms step:1004/1480 train_time:153314ms step_avg:154.24ms step:1005/1480 train_time:153474ms step_avg:154.25ms step:1006/1480 train_time:153635ms step_avg:154.25ms step:1007/1480 train_time:153796ms step_avg:154.26ms step:1008/1480 train_time:153956ms step_avg:154.26ms step:1009/1480 train_time:154123ms step_avg:154.28ms step:1010/1480 train_time:154281ms step_avg:154.28ms step:1011/1480 train_time:154442ms step_avg:154.29ms step:1012/1480 train_time:154599ms step_avg:154.29ms step:1013/1480 train_time:154762ms step_avg:154.30ms step:1014/1480 train_time:154922ms step_avg:154.30ms step:1015/1480 train_time:155083ms step_avg:154.31ms step:1016/1480 train_time:155242ms step_avg:154.32ms step:1017/1480 train_time:155405ms step_avg:154.33ms step:1018/1480 train_time:155567ms step_avg:154.33ms step:1019/1480 train_time:155730ms step_avg:154.34ms step:1020/1480 train_time:155891ms step_avg:154.35ms step:1021/1480 train_time:156051ms step_avg:154.35ms step:1022/1480 train_time:156211ms step_avg:154.36ms step:1023/1480 train_time:156373ms step_avg:154.37ms step:1024/1480 train_time:156532ms step_avg:154.37ms step:1025/1480 train_time:156693ms step_avg:154.38ms step:1026/1480 train_time:156853ms step_avg:154.38ms step:1027/1480 train_time:157014ms step_avg:154.39ms step:1028/1480 train_time:157176ms step_avg:154.40ms step:1029/1480 train_time:157339ms step_avg:154.41ms step:1030/1480 train_time:157499ms step_avg:154.41ms step:1031/1480 train_time:157658ms step_avg:154.41ms step:1032/1480 train_time:157821ms step_avg:154.42ms step:1033/1480 train_time:157980ms step_avg:154.43ms step:1034/1480 train_time:158141ms step_avg:154.43ms step:1035/1480 train_time:158302ms step_avg:154.44ms step:1036/1480 train_time:158460ms step_avg:154.44ms step:1037/1480 train_time:158620ms step_avg:154.45ms step:1038/1480 train_time:158779ms step_avg:154.45ms step:1039/1480 train_time:158941ms step_avg:154.46ms step:1040/1480 train_time:159100ms step_avg:154.47ms step:1041/1480 train_time:159260ms step_avg:154.47ms step:1042/1480 train_time:159417ms step_avg:154.47ms step:1043/1480 train_time:159577ms step_avg:154.48ms step:1044/1480 train_time:159737ms step_avg:154.48ms step:1045/1480 train_time:159899ms step_avg:154.49ms step:1046/1480 train_time:160058ms step_avg:154.50ms step:1047/1480 train_time:160218ms step_avg:154.50ms step:1048/1480 train_time:160380ms step_avg:154.51ms step:1049/1480 train_time:160540ms step_avg:154.51ms step:1050/1480 train_time:160701ms step_avg:154.52ms step:1051/1480 train_time:160862ms step_avg:154.53ms step:1052/1480 train_time:161021ms step_avg:154.53ms step:1053/1480 train_time:161182ms step_avg:154.54ms step:1054/1480 train_time:161343ms step_avg:154.54ms step:1055/1480 train_time:161502ms step_avg:154.55ms step:1056/1480 train_time:161660ms step_avg:154.55ms step:1057/1480 train_time:161819ms step_avg:154.56ms step:1058/1480 train_time:161981ms step_avg:154.56ms step:1059/1480 train_time:162145ms step_avg:154.57ms step:1060/1480 train_time:162308ms step_avg:154.58ms step:1061/1480 train_time:162465ms step_avg:154.58ms step:1062/1480 train_time:162625ms step_avg:154.59ms step:1063/1480 train_time:162785ms step_avg:154.59ms step:1064/1480 train_time:162943ms step_avg:154.60ms step:1065/1480 train_time:163105ms step_avg:154.60ms step:1066/1480 train_time:163267ms step_avg:154.61ms step:1067/1480 train_time:163432ms step_avg:154.62ms step:1068/1480 train_time:163593ms step_avg:154.62ms step:1069/1480 train_time:163756ms step_avg:154.63ms step:1070/1480 train_time:163916ms step_avg:154.64ms step:1071/1480 train_time:164077ms step_avg:154.64ms step:1072/1480 train_time:164236ms step_avg:154.65ms step:1073/1480 train_time:164395ms step_avg:154.65ms step:1074/1480 train_time:164553ms step_avg:154.66ms step:1075/1480 train_time:164715ms step_avg:154.66ms step:1076/1480 train_time:164874ms step_avg:154.67ms step:1077/1480 train_time:165033ms step_avg:154.67ms step:1078/1480 train_time:165198ms step_avg:154.68ms step:1079/1480 train_time:165361ms step_avg:154.69ms step:1080/1480 train_time:165521ms step_avg:154.69ms step:1081/1480 train_time:165682ms step_avg:154.70ms step:1082/1480 train_time:165842ms step_avg:154.70ms step:1083/1480 train_time:166002ms step_avg:154.71ms step:1084/1480 train_time:166161ms step_avg:154.71ms step:1085/1480 train_time:166322ms step_avg:154.72ms step:1086/1480 train_time:166483ms step_avg:154.72ms step:1087/1480 train_time:166642ms step_avg:154.73ms step:1088/1480 train_time:166804ms step_avg:154.73ms step:1089/1480 train_time:166968ms step_avg:154.74ms step:1090/1480 train_time:167132ms step_avg:154.75ms step:1091/1480 train_time:167293ms step_avg:154.76ms step:1092/1480 train_time:167454ms step_avg:154.76ms step:1093/1480 train_time:167616ms step_avg:154.77ms step:1094/1480 train_time:167776ms step_avg:154.78ms step:1095/1480 train_time:167936ms step_avg:154.78ms step:1096/1480 train_time:168100ms step_avg:154.79ms step:1097/1480 train_time:168261ms step_avg:154.79ms step:1098/1480 train_time:168421ms step_avg:154.80ms step:1099/1480 train_time:168584ms step_avg:154.81ms step:1100/1480 train_time:168747ms step_avg:154.81ms step:1101/1480 train_time:168911ms step_avg:154.82ms step:1102/1480 train_time:169073ms step_avg:154.83ms step:1103/1480 train_time:169238ms step_avg:154.84ms step:1104/1480 train_time:169400ms step_avg:154.84ms step:1105/1480 train_time:169561ms step_avg:154.85ms step:1106/1480 train_time:169722ms step_avg:154.86ms step:1107/1480 train_time:169883ms step_avg:154.86ms step:1108/1480 train_time:170042ms step_avg:154.87ms step:1109/1480 train_time:170201ms step_avg:154.87ms step:1110/1480 train_time:170361ms step_avg:154.87ms step:1111/1480 train_time:170521ms step_avg:154.88ms step:1112/1480 train_time:170683ms step_avg:154.88ms step:1113/1480 train_time:170852ms step_avg:154.90ms step:1114/1480 train_time:171015ms step_avg:154.91ms step:1115/1480 train_time:171176ms step_avg:154.91ms step:1116/1480 train_time:171336ms step_avg:154.92ms step:1117/1480 train_time:171499ms step_avg:154.92ms step:1118/1480 train_time:171666ms step_avg:154.93ms step:1119/1480 train_time:171827ms step_avg:154.94ms step:1120/1480 train_time:171989ms step_avg:154.95ms step:1121/1480 train_time:172152ms step_avg:154.95ms step:1122/1480 train_time:172313ms step_avg:154.96ms step:1123/1480 train_time:172473ms step_avg:154.96ms step:1124/1480 train_time:172635ms step_avg:154.97ms step:1125/1480 train_time:172798ms step_avg:154.98ms step:1125/1480 val_loss:3.3879 train_time:172872ms step_avg:155.04ms step:1126/1480 train_time:172963ms step_avg:154.98ms step:1127/1480 train_time:173122ms step_avg:154.99ms step:1128/1480 train_time:173284ms step_avg:154.99ms step:1129/1480 train_time:173447ms step_avg:155.00ms step:1130/1480 train_time:173608ms step_avg:155.01ms step:1131/1480 train_time:173776ms step_avg:155.02ms step:1132/1480 train_time:173936ms step_avg:155.02ms step:1133/1480 train_time:174098ms step_avg:155.03ms step:1134/1480 train_time:174260ms step_avg:155.04ms step:1135/1480 train_time:174420ms step_avg:155.04ms step:1136/1480 train_time:174582ms step_avg:155.05ms step:1137/1480 train_time:174745ms step_avg:155.05ms step:1138/1480 train_time:174909ms step_avg:155.06ms step:1139/1480 train_time:175091ms step_avg:155.09ms step:1140/1480 train_time:175233ms step_avg:155.07ms step:1141/1480 train_time:175396ms step_avg:155.08ms step:1142/1480 train_time:175557ms step_avg:155.09ms step:1143/1480 train_time:175721ms step_avg:155.09ms step:1144/1480 train_time:175882ms step_avg:155.10ms step:1145/1480 train_time:176042ms step_avg:155.10ms step:1146/1480 train_time:176206ms step_avg:155.11ms step:1147/1480 train_time:176366ms step_avg:155.12ms step:1148/1480 train_time:176528ms step_avg:155.12ms step:1149/1480 train_time:176694ms step_avg:155.13ms step:1150/1480 train_time:176855ms step_avg:155.14ms step:1151/1480 train_time:177019ms step_avg:155.14ms step:1152/1480 train_time:177182ms step_avg:155.15ms step:1153/1480 train_time:177346ms step_avg:155.16ms step:1154/1480 train_time:177507ms step_avg:155.16ms step:1155/1480 train_time:177668ms step_avg:155.17ms step:1156/1480 train_time:177836ms step_avg:155.18ms step:1157/1480 train_time:177999ms step_avg:155.19ms step:1158/1480 train_time:178160ms step_avg:155.19ms step:1159/1480 train_time:178322ms step_avg:155.20ms step:1160/1480 train_time:178481ms step_avg:155.20ms step:1161/1480 train_time:178643ms step_avg:155.21ms step:1162/1480 train_time:178804ms step_avg:155.21ms step:1163/1480 train_time:178966ms step_avg:155.22ms step:1164/1480 train_time:179130ms step_avg:155.23ms step:1165/1480 train_time:179291ms step_avg:155.23ms step:1166/1480 train_time:179454ms step_avg:155.24ms step:1167/1480 train_time:179615ms step_avg:155.24ms step:1168/1480 train_time:179777ms step_avg:155.25ms step:1169/1480 train_time:179938ms step_avg:155.25ms step:1170/1480 train_time:180097ms step_avg:155.26ms step:1171/1480 train_time:180259ms step_avg:155.26ms step:1172/1480 train_time:180421ms step_avg:155.27ms step:1173/1480 train_time:180582ms step_avg:155.27ms step:1174/1480 train_time:180755ms step_avg:155.29ms step:1175/1480 train_time:180917ms step_avg:155.29ms step:1176/1480 train_time:181081ms step_avg:155.30ms step:1177/1480 train_time:181247ms step_avg:155.31ms step:1178/1480 train_time:181408ms step_avg:155.32ms step:1179/1480 train_time:181567ms step_avg:155.32ms step:1180/1480 train_time:181737ms step_avg:155.33ms step:1181/1480 train_time:181899ms step_avg:155.34ms step:1182/1480 train_time:182059ms step_avg:155.34ms step:1183/1480 train_time:182221ms step_avg:155.35ms step:1184/1480 train_time:182382ms step_avg:155.35ms step:1185/1480 train_time:182546ms step_avg:155.36ms step:1186/1480 train_time:182710ms step_avg:155.37ms step:1187/1480 train_time:182882ms step_avg:155.38ms step:1188/1480 train_time:183042ms step_avg:155.38ms step:1189/1480 train_time:183202ms step_avg:155.39ms step:1190/1480 train_time:183363ms step_avg:155.39ms step:1191/1480 train_time:183527ms step_avg:155.40ms step:1192/1480 train_time:183688ms step_avg:155.40ms step:1193/1480 train_time:183850ms step_avg:155.41ms step:1194/1480 train_time:184011ms step_avg:155.41ms step:1195/1480 train_time:184172ms step_avg:155.42ms step:1196/1480 train_time:184343ms step_avg:155.43ms step:1197/1480 train_time:184504ms step_avg:155.44ms step:1198/1480 train_time:184674ms step_avg:155.45ms step:1199/1480 train_time:184837ms step_avg:155.46ms step:1200/1480 train_time:184996ms step_avg:155.46ms step:1201/1480 train_time:185158ms step_avg:155.46ms step:1202/1480 train_time:185327ms step_avg:155.48ms step:1203/1480 train_time:185494ms step_avg:155.48ms step:1204/1480 train_time:185657ms step_avg:155.49ms step:1205/1480 train_time:185818ms step_avg:155.50ms step:1206/1480 train_time:185978ms step_avg:155.50ms step:1207/1480 train_time:186140ms step_avg:155.51ms step:1208/1480 train_time:186300ms step_avg:155.51ms step:1209/1480 train_time:186464ms step_avg:155.52ms step:1210/1480 train_time:186632ms step_avg:155.53ms step:1211/1480 train_time:186794ms step_avg:155.53ms step:1212/1480 train_time:186957ms step_avg:155.54ms step:1213/1480 train_time:187121ms step_avg:155.55ms step:1214/1480 train_time:187286ms step_avg:155.55ms step:1215/1480 train_time:187452ms step_avg:155.56ms step:1216/1480 train_time:187614ms step_avg:155.57ms step:1217/1480 train_time:187776ms step_avg:155.57ms step:1218/1480 train_time:187937ms step_avg:155.58ms step:1219/1480 train_time:188102ms step_avg:155.58ms step:1220/1480 train_time:188264ms step_avg:155.59ms step:1221/1480 train_time:188426ms step_avg:155.60ms step:1222/1480 train_time:188588ms step_avg:155.60ms step:1223/1480 train_time:188751ms step_avg:155.61ms step:1224/1480 train_time:188917ms step_avg:155.62ms step:1225/1480 train_time:189081ms step_avg:155.62ms step:1226/1480 train_time:189246ms step_avg:155.63ms step:1227/1480 train_time:189411ms step_avg:155.64ms step:1228/1480 train_time:189573ms step_avg:155.64ms step:1229/1480 train_time:189737ms step_avg:155.65ms step:1230/1480 train_time:189904ms step_avg:155.66ms step:1231/1480 train_time:190069ms step_avg:155.67ms step:1232/1480 train_time:190235ms step_avg:155.68ms step:1233/1480 train_time:190396ms step_avg:155.68ms step:1234/1480 train_time:190558ms step_avg:155.68ms step:1235/1480 train_time:190722ms step_avg:155.69ms step:1236/1480 train_time:190882ms step_avg:155.70ms step:1237/1480 train_time:191045ms step_avg:155.70ms step:1238/1480 train_time:191218ms step_avg:155.72ms step:1239/1480 train_time:191379ms step_avg:155.72ms step:1240/1480 train_time:191543ms step_avg:155.73ms step:1241/1480 train_time:191708ms step_avg:155.73ms step:1242/1480 train_time:191869ms step_avg:155.74ms step:1243/1480 train_time:192035ms step_avg:155.75ms step:1244/1480 train_time:192195ms step_avg:155.75ms step:1245/1480 train_time:192357ms step_avg:155.75ms step:1246/1480 train_time:192520ms step_avg:155.76ms step:1247/1480 train_time:192683ms step_avg:155.77ms step:1248/1480 train_time:192844ms step_avg:155.77ms step:1249/1480 train_time:193005ms step_avg:155.77ms step:1250/1480 train_time:193166ms step_avg:155.78ms step:1250/1480 val_loss:3.3374 train_time:193242ms step_avg:155.84ms step:1251/1480 train_time:193338ms step_avg:155.79ms step:1252/1480 train_time:193497ms step_avg:155.79ms step:1253/1480 train_time:193658ms step_avg:155.80ms step:1254/1480 train_time:193818ms step_avg:155.80ms step:1255/1480 train_time:193991ms step_avg:155.82ms step:1256/1480 train_time:194156ms step_avg:155.82ms step:1257/1480 train_time:194317ms step_avg:155.83ms step:1258/1480 train_time:194482ms step_avg:155.83ms step:1259/1480 train_time:194647ms step_avg:155.84ms step:1260/1480 train_time:194807ms step_avg:155.85ms step:1261/1480 train_time:194970ms step_avg:155.85ms step:1262/1480 train_time:195134ms step_avg:155.86ms step:1263/1480 train_time:195300ms step_avg:155.87ms step:1264/1480 train_time:195459ms step_avg:155.87ms step:1265/1480 train_time:195618ms step_avg:155.87ms step:1266/1480 train_time:195782ms step_avg:155.88ms step:1267/1480 train_time:195943ms step_avg:155.88ms step:1268/1480 train_time:196105ms step_avg:155.89ms step:1269/1480 train_time:196273ms step_avg:155.90ms step:1270/1480 train_time:196435ms step_avg:155.90ms step:1271/1480 train_time:196597ms step_avg:155.91ms step:1272/1480 train_time:196759ms step_avg:155.91ms step:1273/1480 train_time:196921ms step_avg:155.92ms step:1274/1480 train_time:197086ms step_avg:155.92ms step:1275/1480 train_time:197248ms step_avg:155.93ms step:1276/1480 train_time:197408ms step_avg:155.93ms step:1277/1480 train_time:197571ms step_avg:155.94ms step:1278/1480 train_time:197731ms step_avg:155.94ms step:1279/1480 train_time:197893ms step_avg:155.94ms step:1280/1480 train_time:198058ms step_avg:155.95ms step:1281/1480 train_time:198219ms step_avg:155.96ms step:1282/1480 train_time:198379ms step_avg:155.96ms step:1283/1480 train_time:198541ms step_avg:155.96ms step:1284/1480 train_time:198703ms step_avg:155.97ms step:1285/1480 train_time:198866ms step_avg:155.97ms step:1286/1480 train_time:199029ms step_avg:155.98ms step:1287/1480 train_time:199192ms step_avg:155.98ms step:1288/1480 train_time:199353ms step_avg:155.99ms step:1289/1480 train_time:199522ms step_avg:156.00ms step:1290/1480 train_time:199691ms step_avg:156.01ms step:1291/1480 train_time:199856ms step_avg:156.02ms step:1292/1480 train_time:200018ms step_avg:156.02ms step:1293/1480 train_time:200183ms step_avg:156.03ms step:1294/1480 train_time:200348ms step_avg:156.03ms step:1295/1480 train_time:200512ms step_avg:156.04ms step:1296/1480 train_time:200676ms step_avg:156.05ms step:1297/1480 train_time:200838ms step_avg:156.05ms step:1298/1480 train_time:200999ms step_avg:156.06ms step:1299/1480 train_time:201162ms step_avg:156.06ms step:1300/1480 train_time:201323ms step_avg:156.06ms step:1301/1480 train_time:201485ms step_avg:156.07ms step:1302/1480 train_time:201652ms step_avg:156.08ms step:1303/1480 train_time:201818ms step_avg:156.08ms step:1304/1480 train_time:201982ms step_avg:156.09ms step:1305/1480 train_time:202143ms step_avg:156.09ms step:1306/1480 train_time:202308ms step_avg:156.10ms step:1307/1480 train_time:202471ms step_avg:156.11ms step:1308/1480 train_time:202634ms step_avg:156.11ms step:1309/1480 train_time:202799ms step_avg:156.12ms step:1310/1480 train_time:202961ms step_avg:156.12ms step:1311/1480 train_time:203121ms step_avg:156.13ms step:1312/1480 train_time:203287ms step_avg:156.13ms step:1313/1480 train_time:203450ms step_avg:156.14ms step:1314/1480 train_time:203615ms step_avg:156.15ms step:1315/1480 train_time:203778ms step_avg:156.15ms step:1316/1480 train_time:203939ms step_avg:156.16ms step:1317/1480 train_time:204100ms step_avg:156.16ms step:1318/1480 train_time:204268ms step_avg:156.17ms step:1319/1480 train_time:204434ms step_avg:156.18ms step:1320/1480 train_time:204602ms step_avg:156.18ms step:1321/1480 train_time:204766ms step_avg:156.19ms step:1322/1480 train_time:204937ms step_avg:156.20ms step:1323/1480 train_time:205100ms step_avg:156.21ms step:1324/1480 train_time:205267ms step_avg:156.22ms step:1325/1480 train_time:205435ms step_avg:156.22ms step:1326/1480 train_time:205600ms step_avg:156.23ms step:1327/1480 train_time:205762ms step_avg:156.24ms step:1328/1480 train_time:205923ms step_avg:156.24ms step:1329/1480 train_time:206114ms step_avg:156.27ms step:1330/1480 train_time:206269ms step_avg:156.26ms step:1331/1480 train_time:206432ms step_avg:156.27ms step:1332/1480 train_time:206596ms step_avg:156.28ms step:1333/1480 train_time:206759ms step_avg:156.28ms step:1334/1480 train_time:206923ms step_avg:156.29ms step:1335/1480 train_time:207087ms step_avg:156.29ms step:1336/1480 train_time:207257ms step_avg:156.30ms step:1337/1480 train_time:207425ms step_avg:156.31ms step:1338/1480 train_time:207588ms step_avg:156.32ms step:1339/1480 train_time:207753ms step_avg:156.32ms step:1340/1480 train_time:207916ms step_avg:156.33ms step:1341/1480 train_time:208077ms step_avg:156.33ms step:1342/1480 train_time:208241ms step_avg:156.34ms step:1343/1480 train_time:208402ms step_avg:156.34ms step:1344/1480 train_time:208565ms step_avg:156.35ms step:1345/1480 train_time:208732ms step_avg:156.35ms step:1346/1480 train_time:208894ms step_avg:156.36ms step:1347/1480 train_time:209057ms step_avg:156.36ms step:1348/1480 train_time:209218ms step_avg:156.37ms step:1349/1480 train_time:209381ms step_avg:156.37ms step:1350/1480 train_time:209548ms step_avg:156.38ms step:1351/1480 train_time:209712ms step_avg:156.38ms step:1352/1480 train_time:209874ms step_avg:156.39ms step:1353/1480 train_time:210039ms step_avg:156.40ms step:1354/1480 train_time:210202ms step_avg:156.40ms step:1355/1480 train_time:210363ms step_avg:156.40ms step:1356/1480 train_time:210527ms step_avg:156.41ms step:1357/1480 train_time:210693ms step_avg:156.42ms step:1358/1480 train_time:210856ms step_avg:156.42ms step:1359/1480 train_time:211020ms step_avg:156.43ms step:1360/1480 train_time:211187ms step_avg:156.43ms step:1361/1480 train_time:211355ms step_avg:156.44ms step:1362/1480 train_time:211518ms step_avg:156.45ms step:1363/1480 train_time:211687ms step_avg:156.46ms step:1364/1480 train_time:211850ms step_avg:156.46ms step:1365/1480 train_time:212009ms step_avg:156.46ms step:1366/1480 train_time:212174ms step_avg:156.47ms step:1367/1480 train_time:212336ms step_avg:156.47ms step:1368/1480 train_time:212499ms step_avg:156.48ms step:1369/1480 train_time:212669ms step_avg:156.49ms step:1370/1480 train_time:212834ms step_avg:156.50ms step:1371/1480 train_time:212998ms step_avg:156.50ms step:1372/1480 train_time:213166ms step_avg:156.51ms step:1373/1480 train_time:213328ms step_avg:156.51ms step:1374/1480 train_time:213495ms step_avg:156.52ms step:1375/1480 train_time:213658ms step_avg:156.53ms step:1375/1480 val_loss:3.2989 train_time:213732ms step_avg:156.58ms step:1376/1480 train_time:213824ms step_avg:156.53ms step:1377/1480 train_time:213985ms step_avg:156.54ms step:1378/1480 train_time:214146ms step_avg:156.54ms step:1379/1480 train_time:214311ms step_avg:156.55ms step:1380/1480 train_time:214475ms step_avg:156.55ms step:1381/1480 train_time:214644ms step_avg:156.56ms step:1382/1480 train_time:214807ms step_avg:156.56ms step:1383/1480 train_time:214970ms step_avg:156.57ms step:1384/1480 train_time:215138ms step_avg:156.58ms step:1385/1480 train_time:215299ms step_avg:156.58ms step:1386/1480 train_time:215461ms step_avg:156.58ms step:1387/1480 train_time:215627ms step_avg:156.59ms step:1388/1480 train_time:215789ms step_avg:156.60ms step:1389/1480 train_time:215954ms step_avg:156.60ms step:1390/1480 train_time:216116ms step_avg:156.61ms step:1391/1480 train_time:216279ms step_avg:156.61ms step:1392/1480 train_time:216443ms step_avg:156.62ms step:1393/1480 train_time:216604ms step_avg:156.62ms step:1394/1480 train_time:216766ms step_avg:156.62ms step:1395/1480 train_time:216928ms step_avg:156.63ms step:1396/1480 train_time:217090ms step_avg:156.63ms step:1397/1480 train_time:217249ms step_avg:156.63ms step:1398/1480 train_time:217410ms step_avg:156.64ms step:1399/1480 train_time:217571ms step_avg:156.64ms step:1400/1480 train_time:217739ms step_avg:156.65ms step:1401/1480 train_time:217901ms step_avg:156.65ms step:1402/1480 train_time:218062ms step_avg:156.65ms step:1403/1480 train_time:218226ms step_avg:156.66ms step:1404/1480 train_time:218388ms step_avg:156.66ms step:1405/1480 train_time:218554ms step_avg:156.67ms step:1406/1480 train_time:218720ms step_avg:156.68ms step:1407/1480 train_time:218882ms step_avg:156.68ms step:1408/1480 train_time:219043ms step_avg:156.68ms step:1409/1480 train_time:219216ms step_avg:156.69ms step:1410/1480 train_time:219379ms step_avg:156.70ms step:1411/1480 train_time:219540ms step_avg:156.70ms step:1412/1480 train_time:219702ms step_avg:156.71ms step:1413/1480 train_time:219863ms step_avg:156.71ms step:1414/1480 train_time:220027ms step_avg:156.71ms step:1415/1480 train_time:220191ms step_avg:156.72ms step:1416/1480 train_time:220365ms step_avg:156.73ms step:1417/1480 train_time:220530ms step_avg:156.74ms step:1418/1480 train_time:220694ms step_avg:156.74ms step:1419/1480 train_time:220858ms step_avg:156.75ms step:1420/1480 train_time:221024ms step_avg:156.75ms step:1421/1480 train_time:221187ms step_avg:156.76ms step:1422/1480 train_time:221351ms step_avg:156.76ms step:1423/1480 train_time:221512ms step_avg:156.77ms step:1424/1480 train_time:221681ms step_avg:156.78ms step:1425/1480 train_time:221850ms step_avg:156.78ms step:1426/1480 train_time:222013ms step_avg:156.79ms step:1427/1480 train_time:222180ms step_avg:156.80ms step:1428/1480 train_time:222343ms step_avg:156.80ms step:1429/1480 train_time:222505ms step_avg:156.80ms step:1430/1480 train_time:222668ms step_avg:156.81ms step:1431/1480 train_time:222832ms step_avg:156.81ms step:1432/1480 train_time:223001ms step_avg:156.82ms step:1433/1480 train_time:223169ms step_avg:156.83ms step:1434/1480 train_time:223338ms step_avg:156.84ms step:1435/1480 train_time:223504ms step_avg:156.84ms step:1436/1480 train_time:223667ms step_avg:156.85ms step:1437/1480 train_time:223828ms step_avg:156.85ms step:1438/1480 train_time:223990ms step_avg:156.86ms step:1439/1480 train_time:224156ms step_avg:156.86ms step:1440/1480 train_time:224320ms step_avg:156.87ms step:1441/1480 train_time:224483ms step_avg:156.87ms step:1442/1480 train_time:224649ms step_avg:156.88ms step:1443/1480 train_time:224825ms step_avg:156.89ms step:1444/1480 train_time:224987ms step_avg:156.89ms step:1445/1480 train_time:225150ms step_avg:156.90ms step:1446/1480 train_time:225317ms step_avg:156.91ms step:1447/1480 train_time:225484ms step_avg:156.91ms step:1448/1480 train_time:225646ms step_avg:156.92ms step:1449/1480 train_time:225811ms step_avg:156.92ms step:1450/1480 train_time:225976ms step_avg:156.93ms step:1451/1480 train_time:226139ms step_avg:156.93ms step:1452/1480 train_time:226304ms step_avg:156.94ms step:1453/1480 train_time:226465ms step_avg:156.94ms step:1454/1480 train_time:226627ms step_avg:156.94ms step:1455/1480 train_time:226797ms step_avg:156.95ms step:1456/1480 train_time:226960ms step_avg:156.96ms step:1457/1480 train_time:227122ms step_avg:156.96ms step:1458/1480 train_time:227284ms step_avg:156.96ms step:1459/1480 train_time:227449ms step_avg:156.97ms step:1460/1480 train_time:227611ms step_avg:156.97ms step:1461/1480 train_time:227776ms step_avg:156.98ms step:1462/1480 train_time:227942ms step_avg:156.98ms step:1463/1480 train_time:228107ms step_avg:156.99ms step:1464/1480 train_time:228271ms step_avg:157.00ms step:1465/1480 train_time:228435ms step_avg:157.00ms step:1466/1480 train_time:228598ms step_avg:157.00ms step:1467/1480 train_time:228762ms step_avg:157.01ms step:1468/1480 train_time:228926ms step_avg:157.01ms step:1469/1480 train_time:229090ms step_avg:157.02ms step:1470/1480 train_time:229257ms step_avg:157.03ms step:1471/1480 train_time:229427ms step_avg:157.03ms step:1472/1480 train_time:229598ms step_avg:157.04ms step:1473/1480 train_time:229762ms step_avg:157.05ms step:1474/1480 train_time:229927ms step_avg:157.05ms step:1475/1480 train_time:230097ms step_avg:157.06ms step:1476/1480 train_time:230261ms step_avg:157.07ms step:1477/1480 train_time:230428ms step_avg:157.07ms step:1478/1480 train_time:230600ms step_avg:157.08ms step:1479/1480 train_time:230765ms step_avg:157.09ms step:1480/1480 train_time:230927ms step_avg:157.09ms step:1480/1480 val_loss:3.2797 train_time:231003ms step_avg:157.14ms peak memory consumption: 34238 MiB