import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    # RMS-normalize over the last dimension; no weight is passed to rms_norm.
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied to the two halves of the last dimension.

    The cos/sin tables are cached and rebuilt only when the sequence length (dim 1 of the
    input) differs from the cached one.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as 4-D here: sequence on dim 1, rotated features on dim 3.
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention with QK-norm, rotary embeddings, a learned mix between the
    layer's own values and an externally supplied value embedding `vi`, and FlexAttention
    driven by a caller-provided BlockMask.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim), hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc   = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: a learned blend of the running activation `x` with the initial
    embedding `x0`, followed by pre-norm attention and pre-norm MLP residual branches.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialized to [1, 0]: the block starts out passing x through and ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) via tanh in GPT.forward

class GPT(nn.Module):
    """
    GPT-style language model arranged as a U-net over layers: the first half of the blocks
    ("encoder") push their outputs onto a stack, and the second half ("decoder") add them
    back in reverse order through learned per-layer skip weights.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Return the cross-entropy loss of predicting `target` from the 1-D token sequence `idx`.

        Attention is causal, confined to a single document, and limited to the previous
        `sliding_window` tokens. len(idx) must be a multiple of BLOCK_SIZE, since the
        document ids are reshaped into rows of that width.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running count of token id 50256, used as a per-token document id.
        # NOTE(review): 50256 is presumably the GPT-2 end-of-text token - confirm against the tokenizer.
        docs = (idx == 50256).cumsum(0)
        # first / last document id inside each BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal AND same document AND within the window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the mask above: a (query block, kv block) pair is kept
            # when the blocks are causally ordered, their document-id ranges overlap, and
            # they are within the window rounded up to whole blocks.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort of a 0/1 mask: kept kv blocks come first, in index order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding slice per encoder layer; the decoder reuses them in reverse order
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    # Reads `ntok` uint16 tokens that follow the 256 * 4 byte header into a pinned
    # host tensor and returns it.
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Streams (input, target) token windows of length T from sharded .bin files.

    Each process starts at offset process_rank * T inside a shard and advances by
    T * num_processes per batch. Shards are visited in sorted order and wrap around.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # -1 so that advance() lands on shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        # Returns (x, y) on the GPU: x is T tokens as int32, y is the same window shifted
        # by one token as int64.
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # The log file is written directly under logs/, so that directory has to exist before
    # open() is called; on a fresh checkout it does not.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """
    Append `s` plus a newline to the run's log file, on the master process only.

    Unless `logonly` is true the message is also printed to stdout. On every other rank
    this is a no-op.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop runs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches each following one after backward
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 07:39:26 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 131W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 113W / 700W | 37MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22786ms step_avg:nanms step:2/1480 train_time:22906ms step_avg:nanms step:3/1480 train_time:23045ms step_avg:nanms step:4/1480 train_time:23185ms step_avg:nanms step:5/1480 train_time:23326ms step_avg:nanms step:6/1480 train_time:23467ms step_avg:nanms step:7/1480 train_time:23609ms step_avg:nanms step:8/1480 train_time:23751ms step_avg:nanms step:9/1480 train_time:23896ms step_avg:nanms step:10/1480 train_time:24038ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.63ms step:14/1480 train_time:567ms step_avg:141.75ms step:15/1480 train_time:709ms step_avg:141.86ms step:16/1480 train_time:855ms step_avg:142.54ms step:17/1480 train_time:998ms step_avg:142.54ms step:18/1480 train_time:1140ms step_avg:142.45ms step:19/1480 train_time:1282ms step_avg:142.46ms step:20/1480 train_time:1424ms step_avg:142.39ms step:21/1480 train_time:1566ms step_avg:142.33ms step:22/1480 train_time:1710ms step_avg:142.50ms step:23/1480 train_time:1856ms step_avg:142.77ms step:24/1480 train_time:1999ms step_avg:142.82ms step:25/1480 train_time:2142ms step_avg:142.79ms step:26/1480 train_time:2283ms step_avg:142.69ms step:27/1480 train_time:2424ms step_avg:142.59ms step:28/1480 train_time:2566ms step_avg:142.57ms step:29/1480 
train_time:2710ms step_avg:142.63ms step:30/1480 train_time:2856ms step_avg:142.81ms step:31/1480 train_time:3000ms step_avg:142.84ms step:32/1480 train_time:3143ms step_avg:142.86ms step:33/1480 train_time:3285ms step_avg:142.83ms step:34/1480 train_time:3426ms step_avg:142.76ms step:35/1480 train_time:3570ms step_avg:142.80ms step:36/1480 train_time:3715ms step_avg:142.87ms step:37/1480 train_time:3859ms step_avg:142.91ms step:38/1480 train_time:4002ms step_avg:142.92ms step:39/1480 train_time:4145ms step_avg:142.93ms step:40/1480 train_time:4288ms step_avg:142.92ms step:41/1480 train_time:4429ms step_avg:142.86ms step:42/1480 train_time:4572ms step_avg:142.88ms step:43/1480 train_time:4717ms step_avg:142.94ms step:44/1480 train_time:4860ms step_avg:142.95ms step:45/1480 train_time:5004ms step_avg:142.97ms step:46/1480 train_time:5146ms step_avg:142.96ms step:47/1480 train_time:5291ms step_avg:143.00ms step:48/1480 train_time:5434ms step_avg:143.01ms step:49/1480 train_time:5578ms step_avg:143.02ms step:50/1480 train_time:5721ms step_avg:143.02ms step:51/1480 train_time:5863ms step_avg:142.99ms step:52/1480 train_time:6005ms step_avg:142.99ms step:53/1480 train_time:6148ms step_avg:142.98ms step:54/1480 train_time:6291ms step_avg:142.98ms step:55/1480 train_time:6434ms step_avg:142.99ms step:56/1480 train_time:6578ms step_avg:143.01ms step:57/1480 train_time:6721ms step_avg:142.99ms step:58/1480 train_time:6864ms step_avg:142.99ms step:59/1480 train_time:7008ms step_avg:143.02ms step:60/1480 train_time:7153ms step_avg:143.07ms step:61/1480 train_time:7298ms step_avg:143.10ms step:62/1480 train_time:7440ms step_avg:143.07ms step:63/1480 train_time:7583ms step_avg:143.07ms step:64/1480 train_time:7726ms step_avg:143.07ms step:65/1480 train_time:7869ms step_avg:143.08ms step:66/1480 train_time:8013ms step_avg:143.09ms step:67/1480 train_time:8157ms step_avg:143.10ms step:68/1480 train_time:8300ms step_avg:143.10ms step:69/1480 train_time:8443ms step_avg:143.09ms 
step:70/1480 train_time:8585ms step_avg:143.08ms step:71/1480 train_time:8725ms step_avg:143.04ms step:72/1480 train_time:8867ms step_avg:143.02ms step:73/1480 train_time:9011ms step_avg:143.04ms step:74/1480 train_time:9155ms step_avg:143.05ms step:75/1480 train_time:9299ms step_avg:143.06ms step:76/1480 train_time:9440ms step_avg:143.03ms step:77/1480 train_time:9581ms step_avg:143.00ms step:78/1480 train_time:9722ms step_avg:142.98ms step:79/1480 train_time:9863ms step_avg:142.95ms step:80/1480 train_time:10007ms step_avg:142.95ms step:81/1480 train_time:10152ms step_avg:142.99ms step:82/1480 train_time:10296ms step_avg:143.01ms step:83/1480 train_time:10440ms step_avg:143.01ms step:84/1480 train_time:10582ms step_avg:143.00ms step:85/1480 train_time:10723ms step_avg:142.97ms step:86/1480 train_time:10865ms step_avg:142.96ms step:87/1480 train_time:11008ms step_avg:142.96ms step:88/1480 train_time:11153ms step_avg:142.98ms step:89/1480 train_time:11297ms step_avg:143.00ms step:90/1480 train_time:11440ms step_avg:143.00ms step:91/1480 train_time:11582ms step_avg:142.98ms step:92/1480 train_time:11723ms step_avg:142.96ms step:93/1480 train_time:11864ms step_avg:142.94ms step:94/1480 train_time:12007ms step_avg:142.94ms step:95/1480 train_time:12151ms step_avg:142.96ms step:96/1480 train_time:12295ms step_avg:142.97ms step:97/1480 train_time:12439ms step_avg:142.98ms step:98/1480 train_time:12580ms step_avg:142.96ms step:99/1480 train_time:12721ms step_avg:142.94ms step:100/1480 train_time:12862ms step_avg:142.91ms step:101/1480 train_time:13004ms step_avg:142.90ms step:102/1480 train_time:13147ms step_avg:142.90ms step:103/1480 train_time:13291ms step_avg:142.92ms step:104/1480 train_time:13435ms step_avg:142.93ms step:105/1480 train_time:13578ms step_avg:142.93ms step:106/1480 train_time:13720ms step_avg:142.92ms step:107/1480 train_time:13860ms step_avg:142.89ms step:108/1480 train_time:14003ms step_avg:142.89ms step:109/1480 train_time:14144ms step_avg:142.87ms 
step:110/1480 train_time:14288ms step_avg:142.88ms step:111/1480 train_time:14432ms step_avg:142.89ms step:112/1480 train_time:14581ms step_avg:142.95ms step:113/1480 train_time:14725ms step_avg:142.96ms step:114/1480 train_time:14872ms step_avg:143.00ms step:115/1480 train_time:15020ms step_avg:143.05ms step:116/1480 train_time:15165ms step_avg:143.07ms step:117/1480 train_time:15314ms step_avg:143.12ms step:118/1480 train_time:15460ms step_avg:143.15ms step:119/1480 train_time:15607ms step_avg:143.18ms step:120/1480 train_time:15754ms step_avg:143.22ms step:121/1480 train_time:15902ms step_avg:143.26ms step:122/1480 train_time:16047ms step_avg:143.28ms step:123/1480 train_time:16196ms step_avg:143.33ms step:124/1480 train_time:16344ms step_avg:143.37ms step:125/1480 train_time:16490ms step_avg:143.39ms step:125/1480 val_loss:4.4183 train_time:16547ms step_avg:143.88ms step:126/1480 train_time:16642ms step_avg:143.47ms step:127/1480 train_time:16791ms step_avg:143.51ms step:128/1480 train_time:16938ms step_avg:143.54ms step:129/1480 train_time:17083ms step_avg:143.55ms step:130/1480 train_time:17228ms step_avg:143.57ms step:131/1480 train_time:17374ms step_avg:143.59ms step:132/1480 train_time:17520ms step_avg:143.61ms step:133/1480 train_time:17669ms step_avg:143.65ms step:134/1480 train_time:17818ms step_avg:143.69ms step:135/1480 train_time:17964ms step_avg:143.71ms step:136/1480 train_time:18113ms step_avg:143.75ms step:137/1480 train_time:18259ms step_avg:143.77ms step:138/1480 train_time:18406ms step_avg:143.79ms step:139/1480 train_time:18554ms step_avg:143.83ms step:140/1480 train_time:18700ms step_avg:143.85ms step:141/1480 train_time:18848ms step_avg:143.88ms step:142/1480 train_time:18996ms step_avg:143.91ms step:143/1480 train_time:19141ms step_avg:143.92ms step:144/1480 train_time:19289ms step_avg:143.95ms step:145/1480 train_time:19437ms step_avg:143.97ms step:146/1480 train_time:19583ms step_avg:143.99ms step:147/1480 train_time:19731ms 
step_avg:144.02ms step:148/1480 train_time:19878ms step_avg:144.05ms step:149/1480 train_time:20025ms step_avg:144.07ms step:150/1480 train_time:20173ms step_avg:144.09ms step:151/1480 train_time:20319ms step_avg:144.11ms step:152/1480 train_time:20466ms step_avg:144.13ms step:153/1480 train_time:20614ms step_avg:144.15ms step:154/1480 train_time:20760ms step_avg:144.17ms step:155/1480 train_time:20908ms step_avg:144.20ms step:156/1480 train_time:21056ms step_avg:144.22ms step:157/1480 train_time:21202ms step_avg:144.23ms step:158/1480 train_time:21350ms step_avg:144.25ms step:159/1480 train_time:21496ms step_avg:144.27ms step:160/1480 train_time:21641ms step_avg:144.28ms step:161/1480 train_time:21788ms step_avg:144.29ms step:162/1480 train_time:21935ms step_avg:144.31ms step:163/1480 train_time:22080ms step_avg:144.32ms step:164/1480 train_time:22228ms step_avg:144.34ms step:165/1480 train_time:22375ms step_avg:144.35ms step:166/1480 train_time:22521ms step_avg:144.37ms step:167/1480 train_time:22669ms step_avg:144.39ms step:168/1480 train_time:22818ms step_avg:144.42ms step:169/1480 train_time:22965ms step_avg:144.43ms step:170/1480 train_time:23113ms step_avg:144.45ms step:171/1480 train_time:23259ms step_avg:144.47ms step:172/1480 train_time:23405ms step_avg:144.48ms step:173/1480 train_time:23553ms step_avg:144.50ms step:174/1480 train_time:23699ms step_avg:144.51ms step:175/1480 train_time:23846ms step_avg:144.52ms step:176/1480 train_time:23994ms step_avg:144.54ms step:177/1480 train_time:24140ms step_avg:144.55ms step:178/1480 train_time:24286ms step_avg:144.56ms step:179/1480 train_time:24432ms step_avg:144.57ms step:180/1480 train_time:24579ms step_avg:144.58ms step:181/1480 train_time:24724ms step_avg:144.59ms step:182/1480 train_time:24872ms step_avg:144.60ms step:183/1480 train_time:25019ms step_avg:144.62ms step:184/1480 train_time:25163ms step_avg:144.62ms step:185/1480 train_time:25312ms step_avg:144.64ms step:186/1480 train_time:25459ms 
step_avg:144.65ms step:187/1480 train_time:25604ms step_avg:144.66ms step:188/1480 train_time:25753ms step_avg:144.68ms step:189/1480 train_time:25899ms step_avg:144.69ms step:190/1480 train_time:26046ms step_avg:144.70ms step:191/1480 train_time:26193ms step_avg:144.71ms step:192/1480 train_time:26340ms step_avg:144.72ms step:193/1480 train_time:26487ms step_avg:144.74ms step:194/1480 train_time:26635ms step_avg:144.75ms step:195/1480 train_time:26781ms step_avg:144.76ms step:196/1480 train_time:26927ms step_avg:144.77ms step:197/1480 train_time:27074ms step_avg:144.78ms step:198/1480 train_time:27221ms step_avg:144.79ms step:199/1480 train_time:27366ms step_avg:144.79ms step:200/1480 train_time:27514ms step_avg:144.81ms step:201/1480 train_time:27660ms step_avg:144.81ms step:202/1480 train_time:27806ms step_avg:144.82ms step:203/1480 train_time:27955ms step_avg:144.84ms step:204/1480 train_time:28101ms step_avg:144.85ms step:205/1480 train_time:28247ms step_avg:144.86ms step:206/1480 train_time:28395ms step_avg:144.87ms step:207/1480 train_time:28541ms step_avg:144.88ms step:208/1480 train_time:28688ms step_avg:144.89ms step:209/1480 train_time:28836ms step_avg:144.90ms step:210/1480 train_time:28981ms step_avg:144.91ms step:211/1480 train_time:29127ms step_avg:144.91ms step:212/1480 train_time:29275ms step_avg:144.92ms step:213/1480 train_time:29421ms step_avg:144.93ms step:214/1480 train_time:29567ms step_avg:144.93ms step:215/1480 train_time:29715ms step_avg:144.95ms step:216/1480 train_time:29861ms step_avg:144.96ms step:217/1480 train_time:30009ms step_avg:144.97ms step:218/1480 train_time:30157ms step_avg:144.99ms step:219/1480 train_time:30302ms step_avg:144.99ms step:220/1480 train_time:30451ms step_avg:145.00ms step:221/1480 train_time:30599ms step_avg:145.02ms step:222/1480 train_time:30752ms step_avg:145.06ms step:223/1480 train_time:30903ms step_avg:145.08ms step:224/1480 train_time:31055ms step_avg:145.12ms step:225/1480 train_time:31205ms 
step_avg:145.14ms step:226/1480 train_time:31357ms step_avg:145.17ms step:227/1480 train_time:31506ms step_avg:145.19ms step:228/1480 train_time:31656ms step_avg:145.21ms step:229/1480 train_time:31806ms step_avg:145.23ms step:230/1480 train_time:31958ms step_avg:145.26ms step:231/1480 train_time:32107ms step_avg:145.28ms step:232/1480 train_time:32258ms step_avg:145.31ms step:233/1480 train_time:32408ms step_avg:145.33ms step:234/1480 train_time:32560ms step_avg:145.36ms step:235/1480 train_time:32712ms step_avg:145.39ms step:236/1480 train_time:32862ms step_avg:145.41ms step:237/1480 train_time:33014ms step_avg:145.43ms step:238/1480 train_time:33163ms step_avg:145.45ms step:239/1480 train_time:33314ms step_avg:145.48ms step:240/1480 train_time:33463ms step_avg:145.49ms step:241/1480 train_time:33614ms step_avg:145.51ms step:242/1480 train_time:33764ms step_avg:145.54ms step:243/1480 train_time:33916ms step_avg:145.56ms step:244/1480 train_time:34066ms step_avg:145.58ms step:245/1480 train_time:34218ms step_avg:145.61ms step:246/1480 train_time:34367ms step_avg:145.62ms step:247/1480 train_time:34518ms step_avg:145.65ms step:248/1480 train_time:34667ms step_avg:145.66ms step:249/1480 train_time:34819ms step_avg:145.69ms step:250/1480 train_time:34969ms step_avg:145.70ms step:250/1480 val_loss:3.9916 train_time:35030ms step_avg:145.96ms step:251/1480 train_time:35128ms step_avg:145.76ms step:252/1480 train_time:35281ms step_avg:145.79ms step:253/1480 train_time:35431ms step_avg:145.81ms step:254/1480 train_time:35579ms step_avg:145.82ms step:255/1480 train_time:35729ms step_avg:145.83ms step:256/1480 train_time:35878ms step_avg:145.84ms step:257/1480 train_time:36029ms step_avg:145.87ms step:258/1480 train_time:36179ms step_avg:145.88ms step:259/1480 train_time:36333ms step_avg:145.91ms step:260/1480 train_time:36484ms step_avg:145.93ms step:261/1480 train_time:36634ms step_avg:145.95ms step:262/1480 train_time:36783ms step_avg:145.97ms step:263/1480 
train_time:36934ms step_avg:145.98ms step:264/1480 train_time:37083ms step_avg:146.00ms step:265/1480 train_time:37234ms step_avg:146.02ms step:266/1480 train_time:37384ms step_avg:146.03ms step:267/1480 train_time:37535ms step_avg:146.05ms step:268/1480 train_time:37685ms step_avg:146.07ms step:269/1480 train_time:37836ms step_avg:146.09ms step:270/1480 train_time:37986ms step_avg:146.10ms step:271/1480 train_time:38136ms step_avg:146.12ms step:272/1480 train_time:38286ms step_avg:146.13ms step:273/1480 train_time:38436ms step_avg:146.15ms step:274/1480 train_time:38587ms step_avg:146.16ms step:275/1480 train_time:38737ms step_avg:146.18ms step:276/1480 train_time:38889ms step_avg:146.20ms step:277/1480 train_time:39038ms step_avg:146.21ms step:278/1480 train_time:39189ms step_avg:146.23ms step:279/1480 train_time:39338ms step_avg:146.24ms step:280/1480 train_time:39490ms step_avg:146.26ms step:281/1480 train_time:39639ms step_avg:146.27ms step:282/1480 train_time:39790ms step_avg:146.29ms step:283/1480 train_time:39940ms step_avg:146.30ms step:284/1480 train_time:40091ms step_avg:146.32ms step:285/1480 train_time:40242ms step_avg:146.34ms step:286/1480 train_time:40394ms step_avg:146.35ms step:287/1480 train_time:40544ms step_avg:146.37ms step:288/1480 train_time:40693ms step_avg:146.38ms step:289/1480 train_time:40843ms step_avg:146.39ms step:290/1480 train_time:40994ms step_avg:146.41ms step:291/1480 train_time:41144ms step_avg:146.42ms step:292/1480 train_time:41294ms step_avg:146.43ms step:293/1480 train_time:41446ms step_avg:146.45ms step:294/1480 train_time:41596ms step_avg:146.46ms step:295/1480 train_time:41746ms step_avg:146.48ms step:296/1480 train_time:41897ms step_avg:146.49ms step:297/1480 train_time:42049ms step_avg:146.51ms step:298/1480 train_time:42199ms step_avg:146.52ms step:299/1480 train_time:42351ms step_avg:146.54ms step:300/1480 train_time:42500ms step_avg:146.55ms step:301/1480 train_time:42651ms step_avg:146.57ms step:302/1480 
train_time:42800ms step_avg:146.58ms step:303/1480 train_time:42953ms step_avg:146.60ms step:304/1480 train_time:43104ms step_avg:146.61ms step:305/1480 train_time:43254ms step_avg:146.62ms step:306/1480 train_time:43403ms step_avg:146.63ms step:307/1480 train_time:43553ms step_avg:146.64ms step:308/1480 train_time:43703ms step_avg:146.66ms step:309/1480 train_time:43854ms step_avg:146.67ms step:310/1480 train_time:44005ms step_avg:146.68ms step:311/1480 train_time:44156ms step_avg:146.70ms step:312/1480 train_time:44308ms step_avg:146.71ms step:313/1480 train_time:44458ms step_avg:146.73ms step:314/1480 train_time:44609ms step_avg:146.74ms step:315/1480 train_time:44759ms step_avg:146.75ms step:316/1480 train_time:44909ms step_avg:146.76ms step:317/1480 train_time:45058ms step_avg:146.77ms step:318/1480 train_time:45208ms step_avg:146.78ms step:319/1480 train_time:45358ms step_avg:146.79ms step:320/1480 train_time:45510ms step_avg:146.81ms step:321/1480 train_time:45660ms step_avg:146.82ms step:322/1480 train_time:45811ms step_avg:146.83ms step:323/1480 train_time:45962ms step_avg:146.84ms step:324/1480 train_time:46113ms step_avg:146.86ms step:325/1480 train_time:46264ms step_avg:146.87ms step:326/1480 train_time:46414ms step_avg:146.88ms step:327/1480 train_time:46565ms step_avg:146.89ms step:328/1480 train_time:46715ms step_avg:146.90ms step:329/1480 train_time:46868ms step_avg:146.92ms step:330/1480 train_time:47020ms step_avg:146.94ms step:331/1480 train_time:47174ms step_avg:146.96ms step:332/1480 train_time:47327ms step_avg:146.98ms step:333/1480 train_time:47479ms step_avg:146.99ms step:334/1480 train_time:47634ms step_avg:147.02ms step:335/1480 train_time:47788ms step_avg:147.04ms step:336/1480 train_time:47944ms step_avg:147.07ms step:337/1480 train_time:48098ms step_avg:147.09ms step:338/1480 train_time:48251ms step_avg:147.11ms step:339/1480 train_time:48405ms step_avg:147.13ms step:340/1480 train_time:48559ms step_avg:147.15ms step:341/1480 
train_time:48712ms step_avg:147.17ms step:342/1480 train_time:48866ms step_avg:147.19ms step:343/1480 train_time:49020ms step_avg:147.21ms step:344/1480 train_time:49174ms step_avg:147.23ms step:345/1480 train_time:49329ms step_avg:147.25ms step:346/1480 train_time:49484ms step_avg:147.28ms step:347/1480 train_time:49639ms step_avg:147.30ms step:348/1480 train_time:49792ms step_avg:147.31ms step:349/1480 train_time:49946ms step_avg:147.33ms step:350/1480 train_time:50100ms step_avg:147.35ms step:351/1480 train_time:50256ms step_avg:147.38ms step:352/1480 train_time:50408ms step_avg:147.39ms step:353/1480 train_time:50563ms step_avg:147.41ms step:354/1480 train_time:50716ms step_avg:147.43ms step:355/1480 train_time:50871ms step_avg:147.45ms step:356/1480 train_time:51025ms step_avg:147.47ms step:357/1480 train_time:51179ms step_avg:147.49ms step:358/1480 train_time:51333ms step_avg:147.51ms step:359/1480 train_time:51487ms step_avg:147.53ms step:360/1480 train_time:51642ms step_avg:147.55ms step:361/1480 train_time:51796ms step_avg:147.57ms step:362/1480 train_time:51950ms step_avg:147.59ms step:363/1480 train_time:52104ms step_avg:147.60ms step:364/1480 train_time:52259ms step_avg:147.62ms step:365/1480 train_time:52413ms step_avg:147.64ms step:366/1480 train_time:52567ms step_avg:147.66ms step:367/1480 train_time:52720ms step_avg:147.68ms step:368/1480 train_time:52875ms step_avg:147.69ms step:369/1480 train_time:53029ms step_avg:147.71ms step:370/1480 train_time:53183ms step_avg:147.73ms step:371/1480 train_time:53338ms step_avg:147.75ms step:372/1480 train_time:53491ms step_avg:147.77ms step:373/1480 train_time:53644ms step_avg:147.78ms step:374/1480 train_time:53797ms step_avg:147.79ms step:375/1480 train_time:53951ms step_avg:147.81ms step:375/1480 val_loss:3.8044 train_time:54010ms step_avg:147.97ms step:376/1480 train_time:54109ms step_avg:147.84ms step:377/1480 train_time:54266ms step_avg:147.86ms step:378/1480 train_time:54419ms step_avg:147.88ms 
step:379/1480 train_time:54572ms step_avg:147.89ms step:380/1480 train_time:54724ms step_avg:147.90ms step:381/1480 train_time:54875ms step_avg:147.91ms step:382/1480 train_time:55029ms step_avg:147.93ms step:383/1480 train_time:55184ms step_avg:147.95ms step:384/1480 train_time:55337ms step_avg:147.96ms step:385/1480 train_time:55491ms step_avg:147.98ms step:386/1480 train_time:55644ms step_avg:147.99ms step:387/1480 train_time:55797ms step_avg:148.00ms step:388/1480 train_time:55951ms step_avg:148.02ms step:389/1480 train_time:56105ms step_avg:148.03ms step:390/1480 train_time:56259ms step_avg:148.05ms step:391/1480 train_time:56412ms step_avg:148.06ms step:392/1480 train_time:56565ms step_avg:148.07ms step:393/1480 train_time:56718ms step_avg:148.09ms step:394/1480 train_time:56872ms step_avg:148.11ms step:395/1480 train_time:57025ms step_avg:148.12ms step:396/1480 train_time:57178ms step_avg:148.13ms step:397/1480 train_time:57333ms step_avg:148.15ms step:398/1480 train_time:57487ms step_avg:148.16ms step:399/1480 train_time:57641ms step_avg:148.18ms step:400/1480 train_time:57794ms step_avg:148.19ms step:401/1480 train_time:57950ms step_avg:148.21ms step:402/1480 train_time:58102ms step_avg:148.22ms step:403/1480 train_time:58255ms step_avg:148.23ms step:404/1480 train_time:58409ms step_avg:148.25ms step:405/1480 train_time:58563ms step_avg:148.26ms step:406/1480 train_time:58717ms step_avg:148.27ms step:407/1480 train_time:58871ms step_avg:148.29ms step:408/1480 train_time:59024ms step_avg:148.30ms step:409/1480 train_time:59179ms step_avg:148.32ms step:410/1480 train_time:59333ms step_avg:148.33ms step:411/1480 train_time:59486ms step_avg:148.35ms step:412/1480 train_time:59639ms step_avg:148.36ms step:413/1480 train_time:59792ms step_avg:148.37ms step:414/1480 train_time:59947ms step_avg:148.38ms step:415/1480 train_time:60101ms step_avg:148.40ms step:416/1480 train_time:60255ms step_avg:148.41ms step:417/1480 train_time:60409ms step_avg:148.43ms 
step:418/1480 train_time:60563ms step_avg:148.44ms step:419/1480 train_time:60717ms step_avg:148.45ms step:420/1480 train_time:60871ms step_avg:148.47ms step:421/1480 train_time:61025ms step_avg:148.48ms step:422/1480 train_time:61178ms step_avg:148.49ms step:423/1480 train_time:61332ms step_avg:148.50ms step:424/1480 train_time:61486ms step_avg:148.52ms step:425/1480 train_time:61642ms step_avg:148.53ms step:426/1480 train_time:61796ms step_avg:148.55ms step:427/1480 train_time:61949ms step_avg:148.56ms step:428/1480 train_time:62103ms step_avg:148.57ms step:429/1480 train_time:62257ms step_avg:148.59ms step:430/1480 train_time:62412ms step_avg:148.60ms step:431/1480 train_time:62567ms step_avg:148.62ms step:432/1480 train_time:62721ms step_avg:148.63ms step:433/1480 train_time:62875ms step_avg:148.64ms step:434/1480 train_time:63030ms step_avg:148.66ms step:435/1480 train_time:63184ms step_avg:148.67ms step:436/1480 train_time:63338ms step_avg:148.68ms step:437/1480 train_time:63492ms step_avg:148.69ms step:438/1480 train_time:63646ms step_avg:148.71ms step:439/1480 train_time:63799ms step_avg:148.71ms step:440/1480 train_time:63955ms step_avg:148.73ms step:441/1480 train_time:64111ms step_avg:148.75ms step:442/1480 train_time:64270ms step_avg:148.77ms step:443/1480 train_time:64427ms step_avg:148.79ms step:444/1480 train_time:64583ms step_avg:148.81ms step:445/1480 train_time:64738ms step_avg:148.82ms step:446/1480 train_time:64895ms step_avg:148.84ms step:447/1480 train_time:65052ms step_avg:148.86ms step:448/1480 train_time:65210ms step_avg:148.88ms step:449/1480 train_time:65368ms step_avg:148.90ms step:450/1480 train_time:65527ms step_avg:148.92ms step:451/1480 train_time:65684ms step_avg:148.94ms step:452/1480 train_time:65840ms step_avg:148.96ms step:453/1480 train_time:65996ms step_avg:148.98ms step:454/1480 train_time:66154ms step_avg:148.99ms step:455/1480 train_time:66311ms step_avg:149.01ms step:456/1480 train_time:66470ms step_avg:149.04ms 
step:457/1480 train_time:66628ms step_avg:149.06ms step:458/1480 train_time:66784ms step_avg:149.07ms step:459/1480 train_time:66943ms step_avg:149.09ms step:460/1480 train_time:67099ms step_avg:149.11ms step:461/1480 train_time:67257ms step_avg:149.13ms step:462/1480 train_time:67413ms step_avg:149.14ms step:463/1480 train_time:67571ms step_avg:149.16ms step:464/1480 train_time:67729ms step_avg:149.18ms step:465/1480 train_time:67886ms step_avg:149.20ms step:466/1480 train_time:68043ms step_avg:149.22ms step:467/1480 train_time:68199ms step_avg:149.23ms step:468/1480 train_time:68356ms step_avg:149.25ms step:469/1480 train_time:68512ms step_avg:149.26ms step:470/1480 train_time:68670ms step_avg:149.28ms step:471/1480 train_time:68828ms step_avg:149.30ms step:472/1480 train_time:68985ms step_avg:149.32ms step:473/1480 train_time:69141ms step_avg:149.33ms step:474/1480 train_time:69298ms step_avg:149.35ms step:475/1480 train_time:69454ms step_avg:149.36ms step:476/1480 train_time:69610ms step_avg:149.38ms step:477/1480 train_time:69768ms step_avg:149.40ms step:478/1480 train_time:69925ms step_avg:149.41ms step:479/1480 train_time:70081ms step_avg:149.43ms step:480/1480 train_time:70240ms step_avg:149.45ms step:481/1480 train_time:70396ms step_avg:149.46ms step:482/1480 train_time:70553ms step_avg:149.48ms step:483/1480 train_time:70709ms step_avg:149.49ms step:484/1480 train_time:70867ms step_avg:149.51ms step:485/1480 train_time:71025ms step_avg:149.53ms step:486/1480 train_time:71183ms step_avg:149.54ms step:487/1480 train_time:71340ms step_avg:149.56ms step:488/1480 train_time:71496ms step_avg:149.57ms step:489/1480 train_time:71653ms step_avg:149.59ms step:490/1480 train_time:71809ms step_avg:149.60ms step:491/1480 train_time:71966ms step_avg:149.62ms step:492/1480 train_time:72121ms step_avg:149.63ms step:493/1480 train_time:72278ms step_avg:149.64ms step:494/1480 train_time:72436ms step_avg:149.66ms step:495/1480 train_time:72593ms step_avg:149.68ms 
step:496/1480 train_time:72750ms step_avg:149.69ms step:497/1480 train_time:72908ms step_avg:149.71ms step:498/1480 train_time:73066ms step_avg:149.73ms step:499/1480 train_time:73225ms step_avg:149.74ms step:500/1480 train_time:73381ms step_avg:149.76ms step:500/1480 val_loss:3.6888 train_time:73444ms step_avg:149.89ms step:501/1480 train_time:73543ms step_avg:149.78ms step:502/1480 train_time:73700ms step_avg:149.80ms step:503/1480 train_time:73856ms step_avg:149.81ms step:504/1480 train_time:74011ms step_avg:149.82ms step:505/1480 train_time:74166ms step_avg:149.83ms step:506/1480 train_time:74323ms step_avg:149.84ms step:507/1480 train_time:74479ms step_avg:149.86ms step:508/1480 train_time:74637ms step_avg:149.87ms step:509/1480 train_time:74795ms step_avg:149.89ms step:510/1480 train_time:74952ms step_avg:149.90ms step:511/1480 train_time:75110ms step_avg:149.92ms step:512/1480 train_time:75266ms step_avg:149.93ms step:513/1480 train_time:75422ms step_avg:149.94ms step:514/1480 train_time:75578ms step_avg:149.96ms step:515/1480 train_time:75735ms step_avg:149.97ms step:516/1480 train_time:75894ms step_avg:149.99ms step:517/1480 train_time:76052ms step_avg:150.00ms step:518/1480 train_time:76208ms step_avg:150.02ms step:519/1480 train_time:76365ms step_avg:150.03ms step:520/1480 train_time:76522ms step_avg:150.04ms step:521/1480 train_time:76678ms step_avg:150.06ms step:522/1480 train_time:76836ms step_avg:150.07ms step:523/1480 train_time:76995ms step_avg:150.09ms step:524/1480 train_time:77153ms step_avg:150.10ms step:525/1480 train_time:77312ms step_avg:150.12ms step:526/1480 train_time:77470ms step_avg:150.14ms step:527/1480 train_time:77627ms step_avg:150.15ms step:528/1480 train_time:77783ms step_avg:150.16ms step:529/1480 train_time:77940ms step_avg:150.17ms step:530/1480 train_time:78097ms step_avg:150.19ms step:531/1480 train_time:78255ms step_avg:150.20ms step:532/1480 train_time:78413ms step_avg:150.22ms step:533/1480 train_time:78570ms 
step_avg:150.23ms step:534/1480 train_time:78725ms step_avg:150.24ms step:535/1480 train_time:78881ms step_avg:150.25ms step:536/1480 train_time:79039ms step_avg:150.26ms step:537/1480 train_time:79196ms step_avg:150.28ms step:538/1480 train_time:79354ms step_avg:150.29ms step:539/1480 train_time:79514ms step_avg:150.31ms step:540/1480 train_time:79672ms step_avg:150.32ms step:541/1480 train_time:79828ms step_avg:150.34ms step:542/1480 train_time:79984ms step_avg:150.35ms step:543/1480 train_time:80140ms step_avg:150.36ms step:544/1480 train_time:80296ms step_avg:150.37ms step:545/1480 train_time:80454ms step_avg:150.38ms step:546/1480 train_time:80609ms step_avg:150.39ms step:547/1480 train_time:80765ms step_avg:150.40ms step:548/1480 train_time:80924ms step_avg:150.42ms step:549/1480 train_time:81080ms step_avg:150.43ms step:550/1480 train_time:81239ms step_avg:150.44ms step:551/1480 train_time:81398ms step_avg:150.46ms step:552/1480 train_time:81559ms step_avg:150.48ms step:553/1480 train_time:81719ms step_avg:150.50ms step:554/1480 train_time:81878ms step_avg:150.51ms step:555/1480 train_time:82039ms step_avg:150.53ms step:556/1480 train_time:82197ms step_avg:150.54ms step:557/1480 train_time:82359ms step_avg:150.57ms step:558/1480 train_time:82520ms step_avg:150.58ms step:559/1480 train_time:82679ms step_avg:150.60ms step:560/1480 train_time:82839ms step_avg:150.62ms step:561/1480 train_time:82998ms step_avg:150.63ms step:562/1480 train_time:83158ms step_avg:150.65ms step:563/1480 train_time:83317ms step_avg:150.66ms step:564/1480 train_time:83476ms step_avg:150.68ms step:565/1480 train_time:83637ms step_avg:150.70ms step:566/1480 train_time:83797ms step_avg:150.71ms step:567/1480 train_time:83957ms step_avg:150.73ms step:568/1480 train_time:84116ms step_avg:150.75ms step:569/1480 train_time:84274ms step_avg:150.76ms step:570/1480 train_time:84434ms step_avg:150.78ms step:571/1480 train_time:84594ms step_avg:150.79ms step:572/1480 train_time:84756ms 
step_avg:150.81ms step:573/1480 train_time:84917ms step_avg:150.83ms step:574/1480 train_time:85078ms step_avg:150.85ms step:575/1480 train_time:85239ms step_avg:150.87ms step:576/1480 train_time:85398ms step_avg:150.88ms step:577/1480 train_time:85557ms step_avg:150.89ms step:578/1480 train_time:85717ms step_avg:150.91ms step:579/1480 train_time:85877ms step_avg:150.93ms step:580/1480 train_time:86036ms step_avg:150.94ms step:581/1480 train_time:86197ms step_avg:150.96ms step:582/1480 train_time:86358ms step_avg:150.98ms step:583/1480 train_time:86517ms step_avg:150.99ms step:584/1480 train_time:86677ms step_avg:151.01ms step:585/1480 train_time:86836ms step_avg:151.02ms step:586/1480 train_time:86997ms step_avg:151.04ms step:587/1480 train_time:87157ms step_avg:151.05ms step:588/1480 train_time:87317ms step_avg:151.07ms step:589/1480 train_time:87478ms step_avg:151.08ms step:590/1480 train_time:87638ms step_avg:151.10ms step:591/1480 train_time:87798ms step_avg:151.12ms step:592/1480 train_time:87958ms step_avg:151.13ms step:593/1480 train_time:88119ms step_avg:151.15ms step:594/1480 train_time:88279ms step_avg:151.16ms step:595/1480 train_time:88441ms step_avg:151.18ms step:596/1480 train_time:88602ms step_avg:151.20ms step:597/1480 train_time:88761ms step_avg:151.21ms step:598/1480 train_time:88919ms step_avg:151.22ms step:599/1480 train_time:89078ms step_avg:151.24ms step:600/1480 train_time:89239ms step_avg:151.25ms step:601/1480 train_time:89398ms step_avg:151.27ms step:602/1480 train_time:89559ms step_avg:151.28ms step:603/1480 train_time:89719ms step_avg:151.30ms step:604/1480 train_time:89878ms step_avg:151.31ms step:605/1480 train_time:90038ms step_avg:151.32ms step:606/1480 train_time:90200ms step_avg:151.34ms step:607/1480 train_time:90361ms step_avg:151.36ms step:608/1480 train_time:90520ms step_avg:151.37ms step:609/1480 train_time:90679ms step_avg:151.38ms step:610/1480 train_time:90838ms step_avg:151.40ms step:611/1480 train_time:90998ms 
step_avg:151.41ms step:612/1480 train_time:91158ms step_avg:151.43ms step:613/1480 train_time:91319ms step_avg:151.44ms step:614/1480 train_time:91479ms step_avg:151.46ms step:615/1480 train_time:91638ms step_avg:151.47ms step:616/1480 train_time:91796ms step_avg:151.48ms step:617/1480 train_time:91957ms step_avg:151.49ms step:618/1480 train_time:92116ms step_avg:151.51ms step:619/1480 train_time:92276ms step_avg:151.52ms step:620/1480 train_time:92437ms step_avg:151.54ms step:621/1480 train_time:92596ms step_avg:151.55ms step:622/1480 train_time:92757ms step_avg:151.56ms step:623/1480 train_time:92918ms step_avg:151.58ms step:624/1480 train_time:93078ms step_avg:151.59ms step:625/1480 train_time:93238ms step_avg:151.61ms step:625/1480 val_loss:3.6062 train_time:93300ms step_avg:151.71ms step:626/1480 train_time:93400ms step_avg:151.62ms step:627/1480 train_time:93560ms step_avg:151.64ms step:628/1480 train_time:93719ms step_avg:151.65ms step:629/1480 train_time:93879ms step_avg:151.66ms step:630/1480 train_time:94037ms step_avg:151.67ms step:631/1480 train_time:94195ms step_avg:151.68ms step:632/1480 train_time:94354ms step_avg:151.69ms step:633/1480 train_time:94514ms step_avg:151.71ms step:634/1480 train_time:94673ms step_avg:151.72ms step:635/1480 train_time:94831ms step_avg:151.73ms step:636/1480 train_time:94991ms step_avg:151.74ms step:637/1480 train_time:95149ms step_avg:151.75ms step:638/1480 train_time:95307ms step_avg:151.76ms step:639/1480 train_time:95465ms step_avg:151.77ms step:640/1480 train_time:95625ms step_avg:151.79ms step:641/1480 train_time:95784ms step_avg:151.80ms step:642/1480 train_time:95943ms step_avg:151.81ms step:643/1480 train_time:96104ms step_avg:151.82ms step:644/1480 train_time:96264ms step_avg:151.84ms step:645/1480 train_time:96423ms step_avg:151.85ms step:646/1480 train_time:96584ms step_avg:151.86ms step:647/1480 train_time:96743ms step_avg:151.87ms step:648/1480 train_time:96904ms step_avg:151.89ms step:649/1480 
train_time:97064ms step_avg:151.90ms step:650/1480 train_time:97224ms step_avg:151.91ms step:651/1480 train_time:97384ms step_avg:151.93ms step:652/1480 train_time:97544ms step_avg:151.94ms step:653/1480 train_time:97704ms step_avg:151.95ms step:654/1480 train_time:97864ms step_avg:151.96ms step:655/1480 train_time:98024ms step_avg:151.98ms step:656/1480 train_time:98184ms step_avg:151.99ms step:657/1480 train_time:98344ms step_avg:152.00ms step:658/1480 train_time:98505ms step_avg:152.01ms step:659/1480 train_time:98666ms step_avg:152.03ms step:660/1480 train_time:98828ms step_avg:152.04ms step:661/1480 train_time:98989ms step_avg:152.06ms step:662/1480 train_time:99149ms step_avg:152.07ms step:663/1480 train_time:99309ms step_avg:152.08ms step:664/1480 train_time:99470ms step_avg:152.09ms step:665/1480 train_time:99632ms step_avg:152.11ms step:666/1480 train_time:99792ms step_avg:152.12ms step:667/1480 train_time:99952ms step_avg:152.13ms step:668/1480 train_time:100115ms step_avg:152.15ms step:669/1480 train_time:100277ms step_avg:152.17ms step:670/1480 train_time:100439ms step_avg:152.18ms step:671/1480 train_time:100600ms step_avg:152.19ms step:672/1480 train_time:100763ms step_avg:152.21ms step:673/1480 train_time:100927ms step_avg:152.23ms step:674/1480 train_time:101089ms step_avg:152.24ms step:675/1480 train_time:101251ms step_avg:152.26ms step:676/1480 train_time:101412ms step_avg:152.27ms step:677/1480 train_time:101574ms step_avg:152.28ms step:678/1480 train_time:101735ms step_avg:152.30ms step:679/1480 train_time:101897ms step_avg:152.31ms step:680/1480 train_time:102059ms step_avg:152.33ms step:681/1480 train_time:102220ms step_avg:152.34ms step:682/1480 train_time:102384ms step_avg:152.36ms step:683/1480 train_time:102547ms step_avg:152.37ms step:684/1480 train_time:102708ms step_avg:152.39ms step:685/1480 train_time:102869ms step_avg:152.40ms step:686/1480 train_time:103030ms step_avg:152.41ms step:687/1480 train_time:103189ms step_avg:152.42ms 
step:688/1480 train_time:103352ms step_avg:152.44ms step:689/1480 train_time:103515ms step_avg:152.45ms step:690/1480 train_time:103681ms step_avg:152.47ms step:691/1480 train_time:103843ms step_avg:152.49ms step:692/1480 train_time:104006ms step_avg:152.50ms step:693/1480 train_time:104167ms step_avg:152.51ms step:694/1480 train_time:104328ms step_avg:152.53ms step:695/1480 train_time:104488ms step_avg:152.54ms step:696/1480 train_time:104649ms step_avg:152.55ms step:697/1480 train_time:104812ms step_avg:152.57ms step:698/1480 train_time:104972ms step_avg:152.58ms step:699/1480 train_time:105135ms step_avg:152.59ms step:700/1480 train_time:105297ms step_avg:152.61ms step:701/1480 train_time:105458ms step_avg:152.62ms step:702/1480 train_time:105618ms step_avg:152.63ms step:703/1480 train_time:105779ms step_avg:152.64ms step:704/1480 train_time:105940ms step_avg:152.65ms step:705/1480 train_time:106105ms step_avg:152.67ms step:706/1480 train_time:106267ms step_avg:152.68ms step:707/1480 train_time:106429ms step_avg:152.70ms step:708/1480 train_time:106589ms step_avg:152.71ms step:709/1480 train_time:106750ms step_avg:152.72ms step:710/1480 train_time:106911ms step_avg:152.73ms step:711/1480 train_time:107072ms step_avg:152.74ms step:712/1480 train_time:107237ms step_avg:152.76ms step:713/1480 train_time:107402ms step_avg:152.78ms step:714/1480 train_time:107564ms step_avg:152.79ms step:715/1480 train_time:107725ms step_avg:152.80ms step:716/1480 train_time:107885ms step_avg:152.81ms step:717/1480 train_time:108047ms step_avg:152.83ms step:718/1480 train_time:108207ms step_avg:152.84ms step:719/1480 train_time:108367ms step_avg:152.85ms step:720/1480 train_time:108530ms step_avg:152.86ms step:721/1480 train_time:108691ms step_avg:152.87ms step:722/1480 train_time:108852ms step_avg:152.88ms step:723/1480 train_time:109012ms step_avg:152.89ms step:724/1480 train_time:109173ms step_avg:152.90ms step:725/1480 train_time:109336ms step_avg:152.92ms step:726/1480 
train_time:109502ms step_avg:152.94ms step:727/1480 train_time:109665ms step_avg:152.95ms step:728/1480 train_time:109827ms step_avg:152.96ms step:729/1480 train_time:109988ms step_avg:152.97ms step:730/1480 train_time:110150ms step_avg:152.99ms step:731/1480 train_time:110311ms step_avg:153.00ms step:732/1480 train_time:110470ms step_avg:153.01ms step:733/1480 train_time:110632ms step_avg:153.02ms step:734/1480 train_time:110793ms step_avg:153.03ms step:735/1480 train_time:110953ms step_avg:153.04ms step:736/1480 train_time:111115ms step_avg:153.05ms step:737/1480 train_time:111278ms step_avg:153.06ms step:738/1480 train_time:111439ms step_avg:153.08ms step:739/1480 train_time:111599ms step_avg:153.09ms step:740/1480 train_time:111765ms step_avg:153.10ms step:741/1480 train_time:111928ms step_avg:153.12ms step:742/1480 train_time:112089ms step_avg:153.13ms step:743/1480 train_time:112249ms step_avg:153.14ms step:744/1480 train_time:112413ms step_avg:153.15ms step:745/1480 train_time:112577ms step_avg:153.17ms step:746/1480 train_time:112737ms step_avg:153.18ms step:747/1480 train_time:112898ms step_avg:153.19ms step:748/1480 train_time:113063ms step_avg:153.20ms step:749/1480 train_time:113227ms step_avg:153.22ms step:750/1480 train_time:113387ms step_avg:153.23ms step:750/1480 val_loss:3.5514 train_time:113450ms step_avg:153.31ms step:751/1480 train_time:113551ms step_avg:153.24ms step:752/1480 train_time:113712ms step_avg:153.25ms step:753/1480 train_time:113873ms step_avg:153.26ms step:754/1480 train_time:114033ms step_avg:153.27ms step:755/1480 train_time:114195ms step_avg:153.28ms step:756/1480 train_time:114357ms step_avg:153.29ms step:757/1480 train_time:114521ms step_avg:153.31ms step:758/1480 train_time:114684ms step_avg:153.32ms step:759/1480 train_time:114847ms step_avg:153.33ms step:760/1480 train_time:115008ms step_avg:153.34ms step:761/1480 train_time:115171ms step_avg:153.36ms step:762/1480 train_time:115332ms step_avg:153.37ms step:763/1480 
train_time:115494ms step_avg:153.38ms step:764/1480 train_time:115655ms step_avg:153.39ms step:765/1480 train_time:115817ms step_avg:153.40ms step:766/1480 train_time:115982ms step_avg:153.42ms step:767/1480 train_time:116145ms step_avg:153.43ms step:768/1480 train_time:116308ms step_avg:153.44ms step:769/1480 train_time:116472ms step_avg:153.45ms step:770/1480 train_time:116633ms step_avg:153.46ms step:771/1480 train_time:116796ms step_avg:153.48ms step:772/1480 train_time:116957ms step_avg:153.49ms step:773/1480 train_time:117120ms step_avg:153.50ms step:774/1480 train_time:117284ms step_avg:153.51ms step:775/1480 train_time:117447ms step_avg:153.53ms step:776/1480 train_time:117612ms step_avg:153.54ms step:777/1480 train_time:117777ms step_avg:153.56ms step:778/1480 train_time:117940ms step_avg:153.57ms step:779/1480 train_time:118104ms step_avg:153.58ms step:780/1480 train_time:118269ms step_avg:153.60ms step:781/1480 train_time:118431ms step_avg:153.61ms step:782/1480 train_time:118595ms step_avg:153.62ms step:783/1480 train_time:118756ms step_avg:153.63ms step:784/1480 train_time:118920ms step_avg:153.64ms step:785/1480 train_time:119082ms step_avg:153.65ms step:786/1480 train_time:119249ms step_avg:153.67ms step:787/1480 train_time:119413ms step_avg:153.68ms step:788/1480 train_time:119577ms step_avg:153.70ms step:789/1480 train_time:119738ms step_avg:153.71ms step:790/1480 train_time:119904ms step_avg:153.72ms step:791/1480 train_time:120071ms step_avg:153.74ms step:792/1480 train_time:120234ms step_avg:153.75ms step:793/1480 train_time:120395ms step_avg:153.76ms step:794/1480 train_time:120559ms step_avg:153.77ms step:795/1480 train_time:120725ms step_avg:153.79ms step:796/1480 train_time:120892ms step_avg:153.81ms step:797/1480 train_time:121056ms step_avg:153.82ms step:798/1480 train_time:121219ms step_avg:153.83ms step:799/1480 train_time:121387ms step_avg:153.85ms step:800/1480 train_time:121551ms step_avg:153.86ms step:801/1480 train_time:121713ms 
step_avg:153.87ms step:802/1480 train_time:121880ms step_avg:153.89ms step:803/1480 train_time:122043ms step_avg:153.90ms step:804/1480 train_time:122205ms step_avg:153.91ms step:805/1480 train_time:122371ms step_avg:153.93ms step:806/1480 train_time:122531ms step_avg:153.93ms step:807/1480 train_time:122694ms step_avg:153.95ms step:808/1480 train_time:122860ms step_avg:153.96ms step:809/1480 train_time:123022ms step_avg:153.97ms step:810/1480 train_time:123184ms step_avg:153.98ms step:811/1480 train_time:123348ms step_avg:153.99ms step:812/1480 train_time:123512ms step_avg:154.00ms step:813/1480 train_time:123672ms step_avg:154.01ms step:814/1480 train_time:123836ms step_avg:154.02ms step:815/1480 train_time:123998ms step_avg:154.03ms step:816/1480 train_time:124166ms step_avg:154.05ms step:817/1480 train_time:124329ms step_avg:154.06ms step:818/1480 train_time:124492ms step_avg:154.07ms step:819/1480 train_time:124654ms step_avg:154.08ms step:820/1480 train_time:124819ms step_avg:154.10ms step:821/1480 train_time:124982ms step_avg:154.11ms step:822/1480 train_time:125145ms step_avg:154.12ms step:823/1480 train_time:125308ms step_avg:154.13ms step:824/1480 train_time:125470ms step_avg:154.14ms step:825/1480 train_time:125633ms step_avg:154.15ms step:826/1480 train_time:125799ms step_avg:154.17ms step:827/1480 train_time:125964ms step_avg:154.18ms step:828/1480 train_time:126127ms step_avg:154.19ms step:829/1480 train_time:126292ms step_avg:154.20ms step:830/1480 train_time:126457ms step_avg:154.22ms step:831/1480 train_time:126621ms step_avg:154.23ms step:832/1480 train_time:126786ms step_avg:154.24ms step:833/1480 train_time:126951ms step_avg:154.25ms step:834/1480 train_time:127114ms step_avg:154.27ms step:835/1480 train_time:127277ms step_avg:154.27ms step:836/1480 train_time:127443ms step_avg:154.29ms step:837/1480 train_time:127606ms step_avg:154.30ms step:838/1480 train_time:127771ms step_avg:154.31ms step:839/1480 train_time:127933ms step_avg:154.32ms 
step:840/1480 train_time:128094ms step_avg:154.33ms step:841/1480 train_time:128256ms step_avg:154.34ms step:842/1480 train_time:128420ms step_avg:154.35ms step:843/1480 train_time:128583ms step_avg:154.36ms step:844/1480 train_time:128746ms step_avg:154.37ms step:845/1480 train_time:128910ms step_avg:154.38ms step:846/1480 train_time:129076ms step_avg:154.40ms step:847/1480 train_time:129240ms step_avg:154.41ms step:848/1480 train_time:129402ms step_avg:154.42ms step:849/1480 train_time:129565ms step_avg:154.43ms step:850/1480 train_time:129728ms step_avg:154.44ms step:851/1480 train_time:129893ms step_avg:154.45ms step:852/1480 train_time:130054ms step_avg:154.46ms step:853/1480 train_time:130216ms step_avg:154.47ms step:854/1480 train_time:130382ms step_avg:154.48ms step:855/1480 train_time:130547ms step_avg:154.49ms step:856/1480 train_time:130709ms step_avg:154.50ms step:857/1480 train_time:130875ms step_avg:154.52ms step:858/1480 train_time:131041ms step_avg:154.53ms step:859/1480 train_time:131205ms step_avg:154.54ms step:860/1480 train_time:131368ms step_avg:154.55ms step:861/1480 train_time:131533ms step_avg:154.56ms step:862/1480 train_time:131703ms step_avg:154.58ms step:863/1480 train_time:131872ms step_avg:154.60ms step:864/1480 train_time:132035ms step_avg:154.61ms step:865/1480 train_time:132196ms step_avg:154.62ms step:866/1480 train_time:132365ms step_avg:154.63ms step:867/1480 train_time:132528ms step_avg:154.64ms step:868/1480 train_time:132690ms step_avg:154.65ms step:869/1480 train_time:132851ms step_avg:154.66ms step:870/1480 train_time:133015ms step_avg:154.67ms step:871/1480 train_time:133177ms step_avg:154.68ms step:872/1480 train_time:133343ms step_avg:154.69ms step:873/1480 train_time:133507ms step_avg:154.70ms step:874/1480 train_time:133673ms step_avg:154.71ms step:875/1480 train_time:133838ms step_avg:154.73ms step:875/1480 val_loss:3.5053 train_time:133905ms step_avg:154.80ms step:876/1480 train_time:134004ms step_avg:154.74ms 
step:877/1480 train_time:134169ms step_avg:154.75ms step:878/1480 train_time:134332ms step_avg:154.76ms step:879/1480 train_time:134498ms step_avg:154.77ms step:880/1480 train_time:134661ms step_avg:154.78ms step:881/1480 train_time:134823ms step_avg:154.79ms step:882/1480 train_time:134988ms step_avg:154.80ms step:883/1480 train_time:135153ms step_avg:154.81ms step:884/1480 train_time:135321ms step_avg:154.83ms step:885/1480 train_time:135486ms step_avg:154.84ms step:886/1480 train_time:135653ms step_avg:154.86ms step:887/1480 train_time:135822ms step_avg:154.87ms step:888/1480 train_time:135994ms step_avg:154.89ms step:889/1480 train_time:136163ms step_avg:154.91ms step:890/1480 train_time:136326ms step_avg:154.92ms step:891/1480 train_time:136491ms step_avg:154.93ms step:892/1480 train_time:136657ms step_avg:154.94ms step:893/1480 train_time:136820ms step_avg:154.95ms step:894/1480 train_time:136986ms step_avg:154.96ms step:895/1480 train_time:137154ms step_avg:154.98ms step:896/1480 train_time:137320ms step_avg:154.99ms step:897/1480 train_time:137484ms step_avg:155.00ms step:898/1480 train_time:137651ms step_avg:155.01ms step:899/1480 train_time:137816ms step_avg:155.02ms step:900/1480 train_time:137980ms step_avg:155.03ms step:901/1480 train_time:138144ms step_avg:155.04ms step:902/1480 train_time:138307ms step_avg:155.05ms step:903/1480 train_time:138479ms step_avg:155.07ms step:904/1480 train_time:138644ms step_avg:155.08ms step:905/1480 train_time:138806ms step_avg:155.09ms step:906/1480 train_time:138971ms step_avg:155.10ms step:907/1480 train_time:139140ms step_avg:155.12ms step:908/1480 train_time:139303ms step_avg:155.13ms step:909/1480 train_time:139469ms step_avg:155.14ms step:910/1480 train_time:139642ms step_avg:155.16ms step:911/1480 train_time:139806ms step_avg:155.17ms step:912/1480 train_time:139973ms step_avg:155.18ms step:913/1480 train_time:140142ms step_avg:155.20ms step:914/1480 train_time:140309ms step_avg:155.21ms step:915/1480 
train_time:140479ms step_avg:155.23ms step:916/1480 train_time:140644ms step_avg:155.24ms step:917/1480 train_time:140807ms step_avg:155.24ms step:918/1480 train_time:140976ms step_avg:155.26ms step:919/1480 train_time:141146ms step_avg:155.28ms step:920/1480 train_time:141311ms step_avg:155.29ms step:921/1480 train_time:141478ms step_avg:155.30ms step:922/1480 train_time:141645ms step_avg:155.31ms step:923/1480 train_time:141807ms step_avg:155.32ms step:924/1480 train_time:141972ms step_avg:155.33ms step:925/1480 train_time:142139ms step_avg:155.34ms step:926/1480 train_time:142302ms step_avg:155.35ms step:927/1480 train_time:142465ms step_avg:155.36ms step:928/1480 train_time:142631ms step_avg:155.37ms step:929/1480 train_time:142797ms step_avg:155.38ms step:930/1480 train_time:142963ms step_avg:155.39ms step:931/1480 train_time:143126ms step_avg:155.40ms step:932/1480 train_time:143292ms step_avg:155.41ms step:933/1480 train_time:143462ms step_avg:155.43ms step:934/1480 train_time:143627ms step_avg:155.44ms step:935/1480 train_time:143799ms step_avg:155.46ms step:936/1480 train_time:143966ms step_avg:155.47ms step:937/1480 train_time:144137ms step_avg:155.49ms step:938/1480 train_time:144300ms step_avg:155.50ms step:939/1480 train_time:144469ms step_avg:155.51ms step:940/1480 train_time:144636ms step_avg:155.52ms step:941/1480 train_time:144801ms step_avg:155.53ms step:942/1480 train_time:144965ms step_avg:155.54ms step:943/1480 train_time:145135ms step_avg:155.56ms step:944/1480 train_time:145308ms step_avg:155.58ms step:945/1480 train_time:145471ms step_avg:155.58ms step:946/1480 train_time:145642ms step_avg:155.60ms step:947/1480 train_time:145810ms step_avg:155.61ms step:948/1480 train_time:145976ms step_avg:155.62ms step:949/1480 train_time:146142ms step_avg:155.64ms step:950/1480 train_time:146306ms step_avg:155.64ms step:951/1480 train_time:146474ms step_avg:155.66ms step:952/1480 train_time:146641ms step_avg:155.67ms step:953/1480 train_time:146808ms 
step_avg:155.68ms step:954/1480 train_time:146978ms step_avg:155.70ms step:955/1480 train_time:147141ms step_avg:155.71ms step:956/1480 train_time:147306ms step_avg:155.71ms step:957/1480 train_time:147474ms step_avg:155.73ms step:958/1480 train_time:147644ms step_avg:155.74ms step:959/1480 train_time:147808ms step_avg:155.75ms step:960/1480 train_time:147977ms step_avg:155.77ms step:961/1480 train_time:148143ms step_avg:155.78ms step:962/1480 train_time:148306ms step_avg:155.78ms step:963/1480 train_time:148470ms step_avg:155.79ms step:964/1480 train_time:148639ms step_avg:155.81ms step:965/1480 train_time:148804ms step_avg:155.82ms step:966/1480 train_time:148968ms step_avg:155.82ms step:967/1480 train_time:149133ms step_avg:155.83ms step:968/1480 train_time:149299ms step_avg:155.84ms step:969/1480 train_time:149465ms step_avg:155.85ms step:970/1480 train_time:149628ms step_avg:155.86ms step:971/1480 train_time:149792ms step_avg:155.87ms step:972/1480 train_time:149959ms step_avg:155.88ms step:973/1480 train_time:150123ms step_avg:155.89ms step:974/1480 train_time:150291ms step_avg:155.90ms step:975/1480 train_time:150457ms step_avg:155.91ms step:976/1480 train_time:150622ms step_avg:155.92ms step:977/1480 train_time:150785ms step_avg:155.93ms step:978/1480 train_time:150950ms step_avg:155.94ms step:979/1480 train_time:151115ms step_avg:155.95ms step:980/1480 train_time:151281ms step_avg:155.96ms step:981/1480 train_time:151451ms step_avg:155.97ms step:982/1480 train_time:151616ms step_avg:155.98ms step:983/1480 train_time:151781ms step_avg:155.99ms step:984/1480 train_time:151946ms step_avg:156.00ms step:985/1480 train_time:152115ms step_avg:156.02ms step:986/1480 train_time:152281ms step_avg:156.03ms step:987/1480 train_time:152444ms step_avg:156.03ms step:988/1480 train_time:152611ms step_avg:156.04ms step:989/1480 train_time:152777ms step_avg:156.05ms step:990/1480 train_time:152947ms step_avg:156.07ms step:991/1480 train_time:153114ms step_avg:156.08ms 
step:992/1480 train_time:153288ms step_avg:156.10ms step:993/1480 train_time:153465ms step_avg:156.12ms step:994/1480 train_time:153631ms step_avg:156.13ms step:995/1480 train_time:153797ms step_avg:156.14ms step:996/1480 train_time:153960ms step_avg:156.15ms step:997/1480 train_time:154125ms step_avg:156.15ms step:998/1480 train_time:154288ms step_avg:156.16ms step:999/1480 train_time:154456ms step_avg:156.17ms step:1000/1480 train_time:154624ms step_avg:156.19ms step:1000/1480 val_loss:3.4419 train_time:154691ms step_avg:156.25ms step:1001/1480 train_time:154794ms step_avg:156.20ms step:1002/1480 train_time:154959ms step_avg:156.21ms step:1003/1480 train_time:155130ms step_avg:156.22ms step:1004/1480 train_time:155299ms step_avg:156.24ms step:1005/1480 train_time:155466ms step_avg:156.25ms step:1006/1480 train_time:155634ms step_avg:156.26ms step:1007/1480 train_time:155799ms step_avg:156.27ms step:1008/1480 train_time:155966ms step_avg:156.28ms step:1009/1480 train_time:156140ms step_avg:156.30ms step:1010/1480 train_time:156305ms step_avg:156.31ms step:1011/1480 train_time:156473ms step_avg:156.32ms step:1012/1480 train_time:156639ms step_avg:156.33ms step:1013/1480 train_time:156809ms step_avg:156.34ms step:1014/1480 train_time:156976ms step_avg:156.35ms step:1015/1480 train_time:157145ms step_avg:156.36ms step:1016/1480 train_time:157314ms step_avg:156.38ms step:1017/1480 train_time:157485ms step_avg:156.39ms step:1018/1480 train_time:157655ms step_avg:156.40ms step:1019/1480 train_time:157822ms step_avg:156.41ms step:1020/1480 train_time:157992ms step_avg:156.43ms step:1021/1480 train_time:158157ms step_avg:156.44ms step:1022/1480 train_time:158324ms step_avg:156.45ms step:1023/1480 train_time:158491ms step_avg:156.46ms step:1024/1480 train_time:158658ms step_avg:156.47ms step:1025/1480 train_time:158827ms step_avg:156.48ms step:1026/1480 train_time:158993ms step_avg:156.49ms step:1027/1480 train_time:159158ms step_avg:156.50ms step:1028/1480 
train_time:159330ms step_avg:156.51ms step:1029/1480 train_time:159507ms step_avg:156.53ms step:1030/1480 train_time:159675ms step_avg:156.54ms step:1031/1480 train_time:159839ms step_avg:156.55ms step:1032/1480 train_time:160013ms step_avg:156.57ms step:1033/1480 train_time:160180ms step_avg:156.58ms step:1034/1480 train_time:160348ms step_avg:156.59ms step:1035/1480 train_time:160516ms step_avg:156.60ms step:1036/1480 train_time:160681ms step_avg:156.61ms step:1037/1480 train_time:160848ms step_avg:156.62ms step:1038/1480 train_time:161019ms step_avg:156.63ms step:1039/1480 train_time:161192ms step_avg:156.65ms step:1040/1480 train_time:161359ms step_avg:156.66ms step:1041/1480 train_time:161524ms step_avg:156.67ms step:1042/1480 train_time:161688ms step_avg:156.67ms step:1043/1480 train_time:161855ms step_avg:156.68ms step:1044/1480 train_time:162019ms step_avg:156.69ms step:1045/1480 train_time:162187ms step_avg:156.70ms step:1046/1480 train_time:162357ms step_avg:156.71ms step:1047/1480 train_time:162522ms step_avg:156.72ms step:1048/1480 train_time:162688ms step_avg:156.73ms step:1049/1480 train_time:162854ms step_avg:156.74ms step:1050/1480 train_time:163021ms step_avg:156.75ms step:1051/1480 train_time:163191ms step_avg:156.76ms step:1052/1480 train_time:163361ms step_avg:156.78ms step:1053/1480 train_time:163528ms step_avg:156.79ms step:1054/1480 train_time:163696ms step_avg:156.80ms step:1055/1480 train_time:163862ms step_avg:156.81ms step:1056/1480 train_time:164027ms step_avg:156.81ms step:1057/1480 train_time:164194ms step_avg:156.82ms step:1058/1480 train_time:164363ms step_avg:156.83ms step:1059/1480 train_time:164535ms step_avg:156.85ms step:1060/1480 train_time:164704ms step_avg:156.86ms step:1061/1480 train_time:164868ms step_avg:156.87ms step:1062/1480 train_time:165036ms step_avg:156.88ms step:1063/1480 train_time:165200ms step_avg:156.89ms step:1064/1480 train_time:165364ms step_avg:156.89ms step:1065/1480 train_time:165531ms step_avg:156.90ms 
step:1066/1480 train_time:165698ms step_avg:156.91ms step:1067/1480 train_time:165867ms step_avg:156.92ms step:1068/1480 train_time:166034ms step_avg:156.93ms step:1069/1480 train_time:166204ms step_avg:156.94ms step:1070/1480 train_time:166371ms step_avg:156.95ms step:1071/1480 train_time:166544ms step_avg:156.97ms step:1072/1480 train_time:166709ms step_avg:156.98ms step:1073/1480 train_time:166874ms step_avg:156.98ms step:1074/1480 train_time:167040ms step_avg:156.99ms step:1075/1480 train_time:167211ms step_avg:157.01ms step:1076/1480 train_time:167379ms step_avg:157.02ms step:1077/1480 train_time:167545ms step_avg:157.02ms step:1078/1480 train_time:167720ms step_avg:157.04ms step:1079/1480 train_time:167893ms step_avg:157.06ms step:1080/1480 train_time:168063ms step_avg:157.07ms step:1081/1480 train_time:168228ms step_avg:157.08ms step:1082/1480 train_time:168397ms step_avg:157.09ms step:1083/1480 train_time:168564ms step_avg:157.10ms step:1084/1480 train_time:168730ms step_avg:157.10ms step:1085/1480 train_time:168899ms step_avg:157.12ms step:1086/1480 train_time:169065ms step_avg:157.12ms step:1087/1480 train_time:169232ms step_avg:157.13ms step:1088/1480 train_time:169402ms step_avg:157.14ms step:1089/1480 train_time:169575ms step_avg:157.16ms step:1090/1480 train_time:169745ms step_avg:157.17ms step:1091/1480 train_time:169912ms step_avg:157.18ms step:1092/1480 train_time:170080ms step_avg:157.19ms step:1093/1480 train_time:170249ms step_avg:157.20ms step:1094/1480 train_time:170416ms step_avg:157.21ms step:1095/1480 train_time:170580ms step_avg:157.22ms step:1096/1480 train_time:170749ms step_avg:157.23ms step:1097/1480 train_time:170917ms step_avg:157.24ms step:1098/1480 train_time:171087ms step_avg:157.25ms step:1099/1480 train_time:171259ms step_avg:157.26ms step:1100/1480 train_time:171433ms step_avg:157.28ms step:1101/1480 train_time:171602ms step_avg:157.29ms step:1102/1480 train_time:171774ms step_avg:157.30ms step:1103/1480 train_time:171949ms 
step_avg:157.32ms step:1104/1480 train_time:172117ms step_avg:157.33ms step:1105/1480 train_time:172286ms step_avg:157.34ms step:1106/1480 train_time:172455ms step_avg:157.35ms step:1107/1480 train_time:172622ms step_avg:157.36ms step:1108/1480 train_time:172787ms step_avg:157.37ms step:1109/1480 train_time:172955ms step_avg:157.37ms step:1110/1480 train_time:173119ms step_avg:157.38ms step:1111/1480 train_time:173286ms step_avg:157.39ms step:1112/1480 train_time:173457ms step_avg:157.40ms step:1113/1480 train_time:173636ms step_avg:157.42ms step:1114/1480 train_time:173808ms step_avg:157.43ms step:1115/1480 train_time:173981ms step_avg:157.45ms step:1116/1480 train_time:174149ms step_avg:157.46ms step:1117/1480 train_time:174320ms step_avg:157.47ms step:1118/1480 train_time:174495ms step_avg:157.49ms step:1119/1480 train_time:174662ms step_avg:157.49ms step:1120/1480 train_time:174830ms step_avg:157.50ms step:1121/1480 train_time:175000ms step_avg:157.52ms step:1122/1480 train_time:175166ms step_avg:157.52ms step:1123/1480 train_time:175333ms step_avg:157.53ms step:1124/1480 train_time:175500ms step_avg:157.54ms step:1125/1480 train_time:175668ms step_avg:157.55ms step:1125/1480 val_loss:3.3876 train_time:175736ms step_avg:157.61ms step:1126/1480 train_time:175839ms step_avg:157.56ms step:1127/1480 train_time:176012ms step_avg:157.58ms step:1128/1480 train_time:176183ms step_avg:157.59ms step:1129/1480 train_time:176356ms step_avg:157.60ms step:1130/1480 train_time:176526ms step_avg:157.61ms step:1131/1480 train_time:176704ms step_avg:157.63ms step:1132/1480 train_time:176871ms step_avg:157.64ms step:1133/1480 train_time:177043ms step_avg:157.65ms step:1134/1480 train_time:177214ms step_avg:157.66ms step:1135/1480 train_time:177383ms step_avg:157.67ms step:1136/1480 train_time:177553ms step_avg:157.68ms step:1137/1480 train_time:177721ms step_avg:157.69ms step:1138/1480 train_time:177894ms step_avg:157.71ms step:1139/1480 train_time:178063ms step_avg:157.72ms 
step:1140/1480 train_time:178231ms step_avg:157.73ms step:1141/1480 train_time:178404ms step_avg:157.74ms step:1142/1480 train_time:178571ms step_avg:157.75ms step:1143/1480 train_time:178743ms step_avg:157.76ms step:1144/1480 train_time:178913ms step_avg:157.77ms step:1145/1480 train_time:179078ms step_avg:157.78ms step:1146/1480 train_time:179249ms step_avg:157.79ms step:1147/1480 train_time:179416ms step_avg:157.80ms step:1148/1480 train_time:179586ms step_avg:157.81ms step:1149/1480 train_time:179755ms step_avg:157.82ms step:1150/1480 train_time:179924ms step_avg:157.83ms step:1151/1480 train_time:180097ms step_avg:157.84ms step:1152/1480 train_time:180269ms step_avg:157.85ms step:1153/1480 train_time:180443ms step_avg:157.87ms step:1154/1480 train_time:180610ms step_avg:157.88ms step:1155/1480 train_time:180781ms step_avg:157.89ms step:1156/1480 train_time:180960ms step_avg:157.91ms step:1157/1480 train_time:181129ms step_avg:157.92ms step:1158/1480 train_time:181296ms step_avg:157.92ms step:1159/1480 train_time:181465ms step_avg:157.93ms step:1160/1480 train_time:181631ms step_avg:157.94ms step:1161/1480 train_time:181800ms step_avg:157.95ms step:1162/1480 train_time:181971ms step_avg:157.96ms step:1163/1480 train_time:182139ms step_avg:157.97ms step:1164/1480 train_time:182309ms step_avg:157.98ms step:1165/1480 train_time:182474ms step_avg:157.99ms step:1166/1480 train_time:182645ms step_avg:158.00ms step:1167/1480 train_time:182814ms step_avg:158.01ms step:1168/1480 train_time:182982ms step_avg:158.02ms step:1169/1480 train_time:183150ms step_avg:158.02ms step:1170/1480 train_time:183319ms step_avg:158.03ms step:1171/1480 train_time:183487ms step_avg:158.04ms step:1172/1480 train_time:183653ms step_avg:158.05ms step:1173/1480 train_time:183825ms step_avg:158.06ms step:1174/1480 train_time:184009ms step_avg:158.08ms step:1175/1480 train_time:184179ms step_avg:158.09ms step:1176/1480 train_time:184351ms step_avg:158.11ms step:1177/1480 train_time:184527ms 
step_avg:158.12ms step:1178/1480 train_time:184695ms step_avg:158.13ms step:1179/1480 train_time:184861ms step_avg:158.14ms step:1180/1480 train_time:185040ms step_avg:158.15ms step:1181/1480 train_time:185210ms step_avg:158.16ms step:1182/1480 train_time:185376ms step_avg:158.17ms step:1183/1480 train_time:185548ms step_avg:158.18ms step:1184/1480 train_time:185716ms step_avg:158.19ms step:1185/1480 train_time:185890ms step_avg:158.20ms step:1186/1480 train_time:186061ms step_avg:158.21ms step:1187/1480 train_time:186245ms step_avg:158.24ms step:1188/1480 train_time:186412ms step_avg:158.24ms step:1189/1480 train_time:186583ms step_avg:158.26ms step:1190/1480 train_time:186751ms step_avg:158.26ms step:1191/1480 train_time:186923ms step_avg:158.28ms step:1192/1480 train_time:187090ms step_avg:158.28ms step:1193/1480 train_time:187256ms step_avg:158.29ms step:1194/1480 train_time:187426ms step_avg:158.30ms step:1195/1480 train_time:187600ms step_avg:158.31ms step:1196/1480 train_time:187783ms step_avg:158.33ms step:1197/1480 train_time:187953ms step_avg:158.34ms step:1198/1480 train_time:188134ms step_avg:158.36ms step:1199/1480 train_time:188306ms step_avg:158.37ms step:1200/1480 train_time:188473ms step_avg:158.38ms step:1201/1480 train_time:188641ms step_avg:158.39ms step:1202/1480 train_time:188821ms step_avg:158.41ms step:1203/1480 train_time:188996ms step_avg:158.42ms step:1204/1480 train_time:189171ms step_avg:158.43ms step:1205/1480 train_time:189339ms step_avg:158.44ms step:1206/1480 train_time:189509ms step_avg:158.45ms step:1207/1480 train_time:189677ms step_avg:158.46ms step:1208/1480 train_time:189845ms step_avg:158.47ms step:1209/1480 train_time:190020ms step_avg:158.48ms step:1210/1480 train_time:190195ms step_avg:158.50ms step:1211/1480 train_time:190368ms step_avg:158.51ms step:1212/1480 train_time:190540ms step_avg:158.52ms step:1213/1480 train_time:190713ms step_avg:158.53ms step:1214/1480 train_time:190890ms step_avg:158.55ms step:1215/1480 
train_time:191062ms step_avg:158.56ms step:1216/1480 train_time:191233ms step_avg:158.57ms step:1217/1480 train_time:191407ms step_avg:158.58ms step:1218/1480 train_time:191575ms step_avg:158.59ms step:1219/1480 train_time:191753ms step_avg:158.60ms step:1220/1480 train_time:191922ms step_avg:158.61ms step:1221/1480 train_time:192092ms step_avg:158.62ms step:1222/1480 train_time:192259ms step_avg:158.63ms step:1223/1480 train_time:192430ms step_avg:158.64ms step:1224/1480 train_time:192610ms step_avg:158.66ms step:1225/1480 train_time:192782ms step_avg:158.67ms step:1226/1480 train_time:192953ms step_avg:158.68ms step:1227/1480 train_time:193125ms step_avg:158.69ms step:1228/1480 train_time:193294ms step_avg:158.70ms step:1229/1480 train_time:193468ms step_avg:158.71ms step:1230/1480 train_time:193648ms step_avg:158.73ms step:1231/1480 train_time:193824ms step_avg:158.74ms step:1232/1480 train_time:193999ms step_avg:158.76ms step:1233/1480 train_time:194171ms step_avg:158.77ms step:1234/1480 train_time:194340ms step_avg:158.77ms step:1235/1480 train_time:194515ms step_avg:158.79ms step:1236/1480 train_time:194684ms step_avg:158.80ms step:1237/1480 train_time:194854ms step_avg:158.81ms step:1238/1480 train_time:195039ms step_avg:158.83ms step:1239/1480 train_time:195211ms step_avg:158.84ms step:1240/1480 train_time:195380ms step_avg:158.85ms step:1241/1480 train_time:195552ms step_avg:158.86ms step:1242/1480 train_time:195722ms step_avg:158.87ms step:1243/1480 train_time:195897ms step_avg:158.88ms step:1244/1480 train_time:196064ms step_avg:158.88ms step:1245/1480 train_time:196234ms step_avg:158.89ms step:1246/1480 train_time:196406ms step_avg:158.90ms step:1247/1480 train_time:196574ms step_avg:158.91ms step:1248/1480 train_time:196742ms step_avg:158.92ms step:1249/1480 train_time:196911ms step_avg:158.93ms step:1250/1480 train_time:197080ms step_avg:158.94ms step:1250/1480 val_loss:3.3363 train_time:197154ms step_avg:158.99ms step:1251/1480 train_time:197263ms 
step_avg:158.95ms step:1252/1480 train_time:197433ms step_avg:158.96ms step:1253/1480 train_time:197602ms step_avg:158.97ms step:1254/1480 train_time:197773ms step_avg:158.98ms step:1255/1480 train_time:197959ms step_avg:159.00ms step:1256/1480 train_time:198134ms step_avg:159.02ms step:1257/1480 train_time:198304ms step_avg:159.03ms step:1258/1480 train_time:198481ms step_avg:159.04ms step:1259/1480 train_time:198652ms step_avg:159.05ms step:1260/1480 train_time:198820ms step_avg:159.06ms step:1261/1480 train_time:198992ms step_avg:159.07ms step:1262/1480 train_time:199168ms step_avg:159.08ms step:1263/1480 train_time:199343ms step_avg:159.09ms step:1264/1480 train_time:199509ms step_avg:159.10ms step:1265/1480 train_time:199676ms step_avg:159.10ms step:1266/1480 train_time:199846ms step_avg:159.11ms step:1267/1480 train_time:200018ms step_avg:159.12ms step:1268/1480 train_time:200188ms step_avg:159.13ms step:1269/1480 train_time:200365ms step_avg:159.15ms step:1270/1480 train_time:200536ms step_avg:159.16ms step:1271/1480 train_time:200705ms step_avg:159.16ms step:1272/1480 train_time:200873ms step_avg:159.17ms step:1273/1480 train_time:201043ms step_avg:159.18ms step:1274/1480 train_time:201216ms step_avg:159.19ms step:1275/1480 train_time:201383ms step_avg:159.20ms step:1276/1480 train_time:201548ms step_avg:159.20ms step:1277/1480 train_time:201722ms step_avg:159.21ms step:1278/1480 train_time:201890ms step_avg:159.22ms step:1279/1480 train_time:202062ms step_avg:159.23ms step:1280/1480 train_time:202241ms step_avg:159.25ms step:1281/1480 train_time:202410ms step_avg:159.25ms step:1282/1480 train_time:202576ms step_avg:159.26ms step:1283/1480 train_time:202747ms step_avg:159.27ms step:1284/1480 train_time:202919ms step_avg:159.28ms step:1285/1480 train_time:203088ms step_avg:159.28ms step:1286/1480 train_time:203259ms step_avg:159.29ms step:1287/1480 train_time:203431ms step_avg:159.30ms step:1288/1480 train_time:203603ms step_avg:159.31ms step:1289/1480 
train_time:203785ms step_avg:159.33ms step:1290/1480 train_time:203964ms step_avg:159.35ms step:1291/1480 train_time:204138ms step_avg:159.36ms step:1292/1480 train_time:204311ms step_avg:159.37ms step:1293/1480 train_time:204487ms step_avg:159.38ms step:1294/1480 train_time:204659ms step_avg:159.39ms step:1295/1480 train_time:204830ms step_avg:159.40ms step:1296/1480 train_time:205005ms step_avg:159.41ms step:1297/1480 train_time:205178ms step_avg:159.42ms step:1298/1480 train_time:205347ms step_avg:159.43ms step:1299/1480 train_time:205519ms step_avg:159.44ms step:1300/1480 train_time:205687ms step_avg:159.45ms step:1301/1480 train_time:205855ms step_avg:159.45ms step:1302/1480 train_time:206027ms step_avg:159.46ms step:1303/1480 train_time:206203ms step_avg:159.48ms step:1304/1480 train_time:206378ms step_avg:159.49ms step:1305/1480 train_time:206547ms step_avg:159.50ms step:1306/1480 train_time:206723ms step_avg:159.51ms step:1307/1480 train_time:206890ms step_avg:159.51ms step:1308/1480 train_time:207060ms step_avg:159.52ms step:1309/1480 train_time:207232ms step_avg:159.53ms step:1310/1480 train_time:207401ms step_avg:159.54ms step:1311/1480 train_time:207569ms step_avg:159.55ms step:1312/1480 train_time:207743ms step_avg:159.56ms step:1313/1480 train_time:207912ms step_avg:159.56ms step:1314/1480 train_time:208084ms step_avg:159.57ms step:1315/1480 train_time:208254ms step_avg:159.58ms step:1316/1480 train_time:208421ms step_avg:159.59ms step:1317/1480 train_time:208593ms step_avg:159.60ms step:1318/1480 train_time:208773ms step_avg:159.61ms step:1319/1480 train_time:208948ms step_avg:159.62ms step:1320/1480 train_time:209126ms step_avg:159.64ms step:1321/1480 train_time:209300ms step_avg:159.65ms step:1322/1480 train_time:209480ms step_avg:159.66ms step:1323/1480 train_time:209652ms step_avg:159.67ms step:1324/1480 train_time:209826ms step_avg:159.68ms step:1325/1480 train_time:210007ms step_avg:159.70ms step:1326/1480 train_time:210182ms step_avg:159.71ms 
step:1327/1480 train_time:210352ms step_avg:159.72ms step:1328/1480 train_time:210524ms step_avg:159.73ms step:1329/1480 train_time:210722ms step_avg:159.76ms step:1330/1480 train_time:210903ms step_avg:159.77ms step:1331/1480 train_time:211073ms step_avg:159.78ms step:1332/1480 train_time:211246ms step_avg:159.79ms step:1333/1480 train_time:211423ms step_avg:159.81ms step:1334/1480 train_time:211594ms step_avg:159.81ms step:1335/1480 train_time:211764ms step_avg:159.82ms step:1336/1480 train_time:211947ms step_avg:159.84ms step:1337/1480 train_time:212123ms step_avg:159.85ms step:1338/1480 train_time:212295ms step_avg:159.86ms step:1339/1480 train_time:212468ms step_avg:159.87ms step:1340/1480 train_time:212642ms step_avg:159.88ms step:1341/1480 train_time:212811ms step_avg:159.89ms step:1342/1480 train_time:212983ms step_avg:159.90ms step:1343/1480 train_time:213153ms step_avg:159.90ms step:1344/1480 train_time:213324ms step_avg:159.91ms step:1345/1480 train_time:213503ms step_avg:159.93ms step:1346/1480 train_time:213673ms step_avg:159.94ms step:1347/1480 train_time:213843ms step_avg:159.94ms step:1348/1480 train_time:214014ms step_avg:159.95ms step:1349/1480 train_time:214184ms step_avg:159.96ms step:1350/1480 train_time:214361ms step_avg:159.97ms step:1351/1480 train_time:214531ms step_avg:159.98ms step:1352/1480 train_time:214702ms step_avg:159.99ms step:1353/1480 train_time:214880ms step_avg:160.00ms step:1354/1480 train_time:215050ms step_avg:160.01ms step:1355/1480 train_time:215219ms step_avg:160.01ms step:1356/1480 train_time:215390ms step_avg:160.02ms step:1357/1480 train_time:215565ms step_avg:160.03ms step:1358/1480 train_time:215738ms step_avg:160.04ms step:1359/1480 train_time:215909ms step_avg:160.05ms step:1360/1480 train_time:216084ms step_avg:160.06ms step:1361/1480 train_time:216262ms step_avg:160.08ms step:1362/1480 train_time:216439ms step_avg:160.09ms step:1363/1480 train_time:216619ms step_avg:160.10ms step:1364/1480 train_time:216787ms 
step_avg:160.11ms step:1365/1480 train_time:216956ms step_avg:160.11ms step:1366/1480 train_time:217127ms step_avg:160.12ms step:1367/1480 train_time:217300ms step_avg:160.13ms step:1368/1480 train_time:217474ms step_avg:160.14ms step:1369/1480 train_time:217655ms step_avg:160.16ms step:1370/1480 train_time:217834ms step_avg:160.17ms step:1371/1480 train_time:218004ms step_avg:160.18ms step:1372/1480 train_time:218180ms step_avg:160.19ms step:1373/1480 train_time:218350ms step_avg:160.20ms step:1374/1480 train_time:218525ms step_avg:160.21ms step:1375/1480 train_time:218697ms step_avg:160.22ms step:1375/1480 val_loss:3.2976 train_time:218765ms step_avg:160.27ms step:1376/1480 train_time:218870ms step_avg:160.23ms step:1377/1480 train_time:219043ms step_avg:160.24ms step:1378/1480 train_time:219212ms step_avg:160.24ms step:1379/1480 train_time:219388ms step_avg:160.25ms step:1380/1480 train_time:219561ms step_avg:160.26ms step:1381/1480 train_time:219744ms step_avg:160.28ms step:1382/1480 train_time:219915ms step_avg:160.29ms step:1383/1480 train_time:220088ms step_avg:160.30ms step:1384/1480 train_time:220264ms step_avg:160.31ms step:1385/1480 train_time:220430ms step_avg:160.31ms step:1386/1480 train_time:220599ms step_avg:160.32ms step:1387/1480 train_time:220770ms step_avg:160.33ms step:1388/1480 train_time:220938ms step_avg:160.33ms step:1389/1480 train_time:221111ms step_avg:160.34ms step:1390/1480 train_time:221279ms step_avg:160.35ms step:1391/1480 train_time:221450ms step_avg:160.36ms step:1392/1480 train_time:221624ms step_avg:160.36ms step:1393/1480 train_time:221794ms step_avg:160.37ms step:1394/1480 train_time:221964ms step_avg:160.38ms step:1395/1480 train_time:222134ms step_avg:160.39ms step:1396/1480 train_time:222302ms step_avg:160.39ms step:1397/1480 train_time:222470ms step_avg:160.40ms step:1398/1480 train_time:222637ms step_avg:160.40ms step:1399/1480 train_time:222807ms step_avg:160.41ms step:1400/1480 train_time:222984ms step_avg:160.42ms 
step:1401/1480 train_time:223151ms step_avg:160.42ms step:1402/1480 train_time:223323ms step_avg:160.43ms step:1403/1480 train_time:223499ms step_avg:160.44ms step:1404/1480 train_time:223670ms step_avg:160.45ms step:1405/1480 train_time:223844ms step_avg:160.46ms step:1406/1480 train_time:224017ms step_avg:160.47ms step:1407/1480 train_time:224186ms step_avg:160.48ms step:1408/1480 train_time:224355ms step_avg:160.48ms step:1409/1480 train_time:224540ms step_avg:160.50ms step:1410/1480 train_time:224709ms step_avg:160.51ms step:1411/1480 train_time:224877ms step_avg:160.51ms step:1412/1480 train_time:225048ms step_avg:160.52ms step:1413/1480 train_time:225217ms step_avg:160.53ms step:1414/1480 train_time:225388ms step_avg:160.53ms step:1415/1480 train_time:225561ms step_avg:160.54ms step:1416/1480 train_time:225748ms step_avg:160.56ms step:1417/1480 train_time:225920ms step_avg:160.57ms step:1418/1480 train_time:226090ms step_avg:160.58ms step:1419/1480 train_time:226265ms step_avg:160.59ms step:1420/1480 train_time:226439ms step_avg:160.59ms step:1421/1480 train_time:226612ms step_avg:160.60ms step:1422/1480 train_time:226784ms step_avg:160.61ms step:1423/1480 train_time:226953ms step_avg:160.62ms step:1424/1480 train_time:227131ms step_avg:160.63ms step:1425/1480 train_time:227313ms step_avg:160.65ms step:1426/1480 train_time:227484ms step_avg:160.65ms step:1427/1480 train_time:227658ms step_avg:160.66ms step:1428/1480 train_time:227830ms step_avg:160.67ms step:1429/1480 train_time:227998ms step_avg:160.68ms step:1430/1480 train_time:228173ms step_avg:160.69ms step:1431/1480 train_time:228351ms step_avg:160.70ms step:1432/1480 train_time:228528ms step_avg:160.71ms step:1433/1480 train_time:228708ms step_avg:160.72ms step:1434/1480 train_time:228888ms step_avg:160.74ms step:1435/1480 train_time:229063ms step_avg:160.75ms step:1436/1480 train_time:229237ms step_avg:160.76ms step:1437/1480 train_time:229409ms step_avg:160.76ms step:1438/1480 train_time:229578ms 
step_avg:160.77ms step:1439/1480 train_time:229752ms step_avg:160.78ms step:1440/1480 train_time:229921ms step_avg:160.78ms step:1441/1480 train_time:230092ms step_avg:160.79ms step:1442/1480 train_time:230270ms step_avg:160.80ms step:1443/1480 train_time:230458ms step_avg:160.82ms step:1444/1480 train_time:230629ms step_avg:160.83ms step:1445/1480 train_time:230800ms step_avg:160.84ms step:1446/1480 train_time:230976ms step_avg:160.85ms step:1447/1480 train_time:231154ms step_avg:160.86ms step:1448/1480 train_time:231327ms step_avg:160.87ms step:1449/1480 train_time:231500ms step_avg:160.88ms step:1450/1480 train_time:231673ms step_avg:160.88ms step:1451/1480 train_time:231845ms step_avg:160.89ms step:1452/1480 train_time:232016ms step_avg:160.90ms step:1453/1480 train_time:232186ms step_avg:160.90ms step:1454/1480 train_time:232357ms step_avg:160.91ms step:1455/1480 train_time:232536ms step_avg:160.92ms step:1456/1480 train_time:232708ms step_avg:160.93ms step:1457/1480 train_time:232879ms step_avg:160.94ms step:1458/1480 train_time:233049ms step_avg:160.95ms step:1459/1480 train_time:233224ms step_avg:160.96ms step:1460/1480 train_time:233394ms step_avg:160.96ms step:1461/1480 train_time:233570ms step_avg:160.97ms step:1462/1480 train_time:233741ms step_avg:160.98ms step:1463/1480 train_time:233917ms step_avg:160.99ms step:1464/1480 train_time:234092ms step_avg:161.00ms step:1465/1480 train_time:234267ms step_avg:161.01ms step:1466/1480 train_time:234437ms step_avg:161.01ms step:1467/1480 train_time:234613ms step_avg:161.02ms step:1468/1480 train_time:234785ms step_avg:161.03ms step:1469/1480 train_time:234959ms step_avg:161.04ms step:1470/1480 train_time:235139ms step_avg:161.05ms step:1471/1480 train_time:235327ms step_avg:161.07ms step:1472/1480 train_time:235507ms step_avg:161.09ms step:1473/1480 train_time:235679ms step_avg:161.09ms step:1474/1480 train_time:235857ms step_avg:161.10ms step:1475/1480 train_time:236037ms step_avg:161.12ms step:1476/1480 
train_time:236210ms step_avg:161.13ms step:1477/1480 train_time:236392ms step_avg:161.14ms step:1478/1480 train_time:236575ms step_avg:161.15ms step:1479/1480 train_time:236750ms step_avg:161.16ms step:1480/1480 train_time:236922ms step_avg:161.17ms step:1480/1480 val_loss:3.2788 train_time:236993ms step_avg:161.22ms