import os
import sys
# Snapshot this script's own source right away so the exact code that ran can be
# written into the log file (and into the checkpoint dict) later on.
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G exactly through its SVD: with G = U S V^T, return U V^T.

    `steps` is not used; it is accepted so that this function can be called the same
    way as the iterative backend (`backend(g, steps=...)`).
    """
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor (asserted). It is converted with `.bfloat16()` before iterating.
        steps: Number of quintic iterations to run.
        eps: Added to the norm before dividing, so an all-zero G does not divide by zero.

    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # Work on the orientation with fewer rows so that A = X @ X.T is the smaller of the
    # two possible Gram matrices; the transpose is undone after the loop.
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X # i.e. X <- a*X + b*(X X^T) X + c*(X X^T)^2 X
    if G.size(0) > G.size(1):
        X = X.T
    return X

# Maps the `backend` option of Muon to the function that orthogonalizes an update.
zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Distributed behaviour (as implemented below):
    - The constructor reads the WORLD_SIZE and RANK environment variables.
    - Parameters are put into one param group per distinct `numel()`. Each group owns
      WORLD_SIZE flat bfloat16 CUDA buffers of that size, used as the all_gather output.
    - `step()` asserts that the number of parameters in every group is a multiple of
      WORLD_SIZE; within each chunk of WORLD_SIZE parameters, rank r computes the update
      for the r-th one and the results are exchanged with `dist.all_gather`.

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        self.num_process = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ["RANK"])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        params: "list[torch.Tensor]" = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # Group by element count (not by shape): every parameter of a group fits the same
        # flat buffer and is reshaped back with view_as() when the update is applied.
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                "params": [p for p in params if p.numel() == size],
                # one receive buffer per rank for dist.all_gather
                "update_buffer": [
                    torch.empty(size, device="cuda", dtype=torch.bfloat16)
                    for _ in range(self.num_process)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Takes no closure and returns nothing. Every parameter this rank is responsible
        for must already have a gradient (asserted).
        """
        for group in self.param_groups:
            lr: float = group["lr"]
            momentum: float = group["momentum"]
            nesterov: bool = group["nesterov"]
            zeropower_backend = zeropower_backends[group["backend"]]
            backend_steps: int = group["backend_steps"]
            update_buffers: "list[torch.Tensor]" = group["update_buffer"]
            # generate weight updates in distributed fashion
            params: "list[torch.Tensor]" = group["params"]
            assert len(params) % self.num_process == 0
            # State of the all_gather that is still in flight (None before the first one).
            handle = None
            params_world = None
            def update_prev():
                # Finish the pending all_gather and apply its results. Reads `handle` and
                # `params_world` from the enclosing scope at call time.
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                # update_buffers[i] holds the update computed by rank i, which belongs to
                # params_world[i].
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # step size is lr * sqrt(max(1, rows / cols))
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            # Walk the group in chunks of num_process parameters.
            for base_i in range(len(params))[::self.num_process]:
                p = params[base_i + self.rank] # the parameter this rank handles in this chunk
                g = p.grad
                assert g is not None
                state = self.state[p]
                if "momentum_buffer" not in state:
                    state["momentum_buffer"] = torch.zeros_like(g)
                buf: torch.Tensor = state["momentum_buffer"]
                buf.lerp_(g, 1 - momentum) # buf <- momentum * buf + (1 - momentum) * g
                # Nesterov: blend the gradient towards the buffer (this overwrites p.grad in
                # place); otherwise use the buffer itself.
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_backend(g, steps=backend_steps).flatten()
                # Apply the previous chunk only now, so its communication could overlap with
                # the computation above. It must also complete before update_buffers is
                # handed to the next all_gather.
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.num_process]
            # flush the last chunk of this group
            update_prev()

#
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable scale is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free nn.Linear whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied to a 4D tensor.

    forward() takes the sequence length from dim 1 and rotates the two halves of dim 3
    against each other. The call site in this file passes (B, T, n_head, head_dim).
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # inv_freq[j] = base ** (-2j / dim) for j = 0 .. dim/2 - 1
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        # cos/sin tables are built lazily and reused while the sequence length is unchanged
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        # NOTE(review): the cache is keyed on seq_len only, not on x.device.
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # add broadcast axes for dims 0 and 2 of x
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention computed with FlexAttention. Queries and keys are
    RMS-normalized and rotary-embedded; values are mixed with an external value
    embedding `vi` using two learnable scalars.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        # split the channel dimension into heads: (B, T, n_head, head_dim)
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # v <- lambdas[0] * v + lambdas[1] * vi, with vi viewed in v's shape
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim), hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: re-mix the stream with the initial embedding x0, then add the
    attention output and the MLP output, each computed on an RMS-normalized input.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # weights of (x, x0) in the mix; initialised to pass x through unchanged
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    # Model hyperparameters read by Block and GPT.
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed into (-softcap, softcap) via tanh in GPT.forward

class GPT(nn.Module):
    """
    GPT-style language model with a U-net layout: outputs of the first half of the layers
    are added back, with learnable weights, to the inputs of the second half in reverse
    order. forward() returns the cross-entropy loss, not the logits.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # NOTE(review): for an odd n_layer there is one more decoder layer than encoder
        # layers, and forward() would pop from an empty skip list - confirm only even
        # n_layer is intended.
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            # (one n_embd-wide slice per encoder layer; split with chunk() in forward)
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the mean cross-entropy loss for one token sequence.

        idx must be 1D (asserted) and its length must be divisible by BLOCK_SIZE, because
        it is reshaped to (-1, BLOCK_SIZE). `sliding_window` is the attention window in
        tokens; the training script passes a 0-dim int32 CUDA tensor.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running count of token id 50256 gives a per-token document id.
        # (50256 is presumably the GPT-2 end-of-text token - TODO confirm.)
        docs = (idx == 50256).cumsum(0)
        # document id of the first / last token of every BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal, same document, and strictly inside the window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Build the BlockMask directly from a block-level mask. A (query block, kv block)
            # pair is kept when it is causal, the document-id ranges of the two blocks
            # overlap, and the blocks are within the window rounded up to whole blocks.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            # per query block: how many kv blocks are kept ...
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # ... and their indices; a stable descending sort of the 0/1 mask lists the kept
            # kv blocks first, in increasing order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton dims (presumably batch and head - see BlockMask docs)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x # kept so that every Block can mix the initial embedding back in
        # one value embedding per encoder layer, split along the channel dimension
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() pairs decoder layer i with the most recent encoder output not yet used
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            # (value embeddings are reused in mirrored order)
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float() # the loss is computed in float32
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate a shard's header and return the number of tokens it claims to hold."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256 * 4-byte header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        # read straight into the tensor's memory through its numpy view
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (input, target) token windows of length T from a sorted list of shard files.

    Each process reads at an offset of process_rank * T and all processes advance by
    T * num_processes tokens per batch. After the last shard the loader wraps around to
    the first.
    """
    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern (relative to the current working directory)
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full global batch plus one extra target token
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (x, y) CUDA tensors, where y is x shifted by one token."""
        batch_size = self.T * self.num_processes
        # T + 1 tokens: T inputs plus the one extra token needed for the last target
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
# (RANK, LOCAL_RANK and WORLD_SIZE are all read from the environment below)
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
# Only the master process owns a log file; on every other rank `logfile` stays None
# and print0() is a no-op.
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id # per-run directory name; the directory itself is not created here
    logfile = 'logs/%s.txt' % run_id
    # open(..., "w") does not create missing parent directories, so make sure the
    # directory that will hold the log file exists before the first write.
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append the string `s` plus a newline to the log file, on the master process only.

    Unless `logonly` is true, `s` is also printed to stdout. Does nothing on
    non-master ranks.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop below runs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
# Cast the whole model to bfloat16, then turn the CastedLinear modules back to float32.
model = model.cuda().bfloat16()
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
# only set the inductor option if this torch build has it
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# 1: token and value embeddings, 2: output head, 3: 2D weights of the transformer blocks
# (Muon), 4: the remaining <2D block parameters plus the U-net skip weights.
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the multiplier applied to each optimizer's base learning rate at iteration `it`.

    NOTE(review): with cooldown_iters == 0 the final branch divides by zero once
    it == num_iterations; the configured value (600) avoids this.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown (reaches 0 at it == num_iterations)
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# The window size is a CUDA tensor that is updated in place with copy_() in the loop.
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    # divisor for the step_avg figures; NaN until enough timed steps have accumulated
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # (linear ramp from 64 to 1792 tokens over the run, rounded down to a multiple of 64)
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset() # always evaluate on the same tokens
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average over ranks, then over validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE(review): the dict is built but the torch.save call below is commented out,
        # so nothing is written to disk.
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon (0.85 -> 0.95 over the first 300 steps)
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # (no synchronize here, so this is host-side elapsed time - hence "approx")
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 09:43:06 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 131W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 119W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23128ms step_avg:nanms step:2/1480 train_time:23216ms step_avg:nanms step:3/1480 train_time:23358ms step_avg:nanms step:4/1480 train_time:23498ms step_avg:nanms step:5/1480 train_time:23639ms step_avg:nanms step:6/1480 train_time:23781ms step_avg:nanms step:7/1480 train_time:23923ms step_avg:nanms step:8/1480 train_time:24066ms step_avg:nanms step:9/1480 train_time:24211ms step_avg:nanms step:10/1480 train_time:24355ms step_avg:nanms step:11/1480 train_time:140ms step_avg:nanms step:12/1480 train_time:282ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.64ms step:14/1480 train_time:567ms step_avg:141.71ms step:15/1480 train_time:710ms step_avg:141.99ms step:16/1480 train_time:855ms step_avg:142.43ms step:17/1480 train_time:997ms step_avg:142.45ms step:18/1480 train_time:1138ms step_avg:142.27ms step:19/1480 train_time:1280ms step_avg:142.22ms step:20/1480 train_time:1421ms step_avg:142.07ms step:21/1480 train_time:1562ms step_avg:142.01ms step:22/1480 train_time:1705ms step_avg:142.12ms step:23/1480 train_time:1848ms step_avg:142.15ms step:24/1480 train_time:1992ms step_avg:142.30ms step:25/1480 train_time:2135ms step_avg:142.33ms step:26/1480 train_time:2276ms step_avg:142.26ms step:27/1480 train_time:2419ms step_avg:142.29ms step:28/1480 train_time:2561ms step_avg:142.26ms step:29/1480 
train_time:2705ms step_avg:142.36ms step:30/1480 train_time:2849ms step_avg:142.46ms step:31/1480 train_time:2994ms step_avg:142.55ms step:32/1480 train_time:3136ms step_avg:142.57ms step:33/1480 train_time:3278ms step_avg:142.53ms step:34/1480 train_time:3420ms step_avg:142.50ms step:35/1480 train_time:3561ms step_avg:142.44ms step:36/1480 train_time:3702ms step_avg:142.38ms step:37/1480 train_time:3844ms step_avg:142.36ms step:38/1480 train_time:3987ms step_avg:142.41ms step:39/1480 train_time:4133ms step_avg:142.50ms step:40/1480 train_time:4275ms step_avg:142.50ms step:41/1480 train_time:4417ms step_avg:142.48ms step:42/1480 train_time:4559ms step_avg:142.46ms step:43/1480 train_time:4701ms step_avg:142.45ms step:44/1480 train_time:4841ms step_avg:142.39ms step:45/1480 train_time:4984ms step_avg:142.41ms step:46/1480 train_time:5128ms step_avg:142.45ms step:47/1480 train_time:5272ms step_avg:142.48ms step:48/1480 train_time:5414ms step_avg:142.48ms step:49/1480 train_time:5557ms step_avg:142.48ms step:50/1480 train_time:5699ms step_avg:142.47ms step:51/1480 train_time:5840ms step_avg:142.43ms step:52/1480 train_time:5982ms step_avg:142.43ms step:53/1480 train_time:6125ms step_avg:142.44ms step:54/1480 train_time:6270ms step_avg:142.50ms step:55/1480 train_time:6414ms step_avg:142.54ms step:56/1480 train_time:6557ms step_avg:142.54ms step:57/1480 train_time:6699ms step_avg:142.53ms step:58/1480 train_time:6841ms step_avg:142.52ms step:59/1480 train_time:6983ms step_avg:142.51ms step:60/1480 train_time:7124ms step_avg:142.48ms step:61/1480 train_time:7267ms step_avg:142.49ms step:62/1480 train_time:7410ms step_avg:142.50ms step:63/1480 train_time:7553ms step_avg:142.51ms step:64/1480 train_time:7696ms step_avg:142.51ms step:65/1480 train_time:7838ms step_avg:142.50ms step:66/1480 train_time:7981ms step_avg:142.53ms step:67/1480 train_time:8125ms step_avg:142.54ms step:68/1480 train_time:8268ms step_avg:142.55ms step:69/1480 train_time:8411ms step_avg:142.56ms 
step:70/1480 train_time:8555ms step_avg:142.59ms step:71/1480 train_time:8698ms step_avg:142.58ms step:72/1480 train_time:8839ms step_avg:142.56ms step:73/1480 train_time:8979ms step_avg:142.53ms step:74/1480 train_time:9121ms step_avg:142.52ms step:75/1480 train_time:9263ms step_avg:142.51ms step:76/1480 train_time:9407ms step_avg:142.54ms step:77/1480 train_time:9550ms step_avg:142.54ms step:78/1480 train_time:9694ms step_avg:142.56ms step:79/1480 train_time:9837ms step_avg:142.56ms step:80/1480 train_time:9978ms step_avg:142.55ms step:81/1480 train_time:10121ms step_avg:142.54ms step:82/1480 train_time:10262ms step_avg:142.53ms step:83/1480 train_time:10404ms step_avg:142.52ms step:84/1480 train_time:10545ms step_avg:142.51ms step:85/1480 train_time:10688ms step_avg:142.51ms step:86/1480 train_time:10831ms step_avg:142.51ms step:87/1480 train_time:10973ms step_avg:142.51ms step:88/1480 train_time:11115ms step_avg:142.50ms step:89/1480 train_time:11257ms step_avg:142.49ms step:90/1480 train_time:11399ms step_avg:142.49ms step:91/1480 train_time:11541ms step_avg:142.49ms step:92/1480 train_time:11683ms step_avg:142.48ms step:93/1480 train_time:11826ms step_avg:142.48ms step:94/1480 train_time:11970ms step_avg:142.50ms step:95/1480 train_time:12113ms step_avg:142.50ms step:96/1480 train_time:12254ms step_avg:142.49ms step:97/1480 train_time:12398ms step_avg:142.50ms step:98/1480 train_time:12539ms step_avg:142.49ms step:99/1480 train_time:12681ms step_avg:142.48ms step:100/1480 train_time:12826ms step_avg:142.51ms step:101/1480 train_time:12969ms step_avg:142.52ms step:102/1480 train_time:13114ms step_avg:142.54ms step:103/1480 train_time:13255ms step_avg:142.53ms step:104/1480 train_time:13398ms step_avg:142.53ms step:105/1480 train_time:13539ms step_avg:142.52ms step:106/1480 train_time:13680ms step_avg:142.50ms step:107/1480 train_time:13821ms step_avg:142.48ms step:108/1480 train_time:13963ms step_avg:142.48ms step:109/1480 train_time:14106ms step_avg:142.49ms 
step:110/1480 train_time:14249ms step_avg:142.49ms step:111/1480 train_time:14394ms step_avg:142.51ms step:112/1480 train_time:14540ms step_avg:142.55ms step:113/1480 train_time:14684ms step_avg:142.57ms step:114/1480 train_time:14831ms step_avg:142.61ms step:115/1480 train_time:14979ms step_avg:142.65ms step:116/1480 train_time:15125ms step_avg:142.69ms step:117/1480 train_time:15273ms step_avg:142.74ms step:118/1480 train_time:15420ms step_avg:142.78ms step:119/1480 train_time:15565ms step_avg:142.80ms step:120/1480 train_time:15712ms step_avg:142.84ms step:121/1480 train_time:15859ms step_avg:142.87ms step:122/1480 train_time:16005ms step_avg:142.90ms step:123/1480 train_time:16152ms step_avg:142.94ms step:124/1480 train_time:16300ms step_avg:142.98ms step:125/1480 train_time:16447ms step_avg:143.02ms step:125/1480 val_loss:4.4176 train_time:16503ms step_avg:143.51ms step:126/1480 train_time:16598ms step_avg:143.09ms step:127/1480 train_time:16747ms step_avg:143.14ms step:128/1480 train_time:16892ms step_avg:143.15ms step:129/1480 train_time:17038ms step_avg:143.18ms step:130/1480 train_time:17185ms step_avg:143.21ms step:131/1480 train_time:17331ms step_avg:143.23ms step:132/1480 train_time:17477ms step_avg:143.25ms step:133/1480 train_time:17627ms step_avg:143.31ms step:134/1480 train_time:17773ms step_avg:143.33ms step:135/1480 train_time:17920ms step_avg:143.36ms step:136/1480 train_time:18067ms step_avg:143.39ms step:137/1480 train_time:18213ms step_avg:143.41ms step:138/1480 train_time:18361ms step_avg:143.44ms step:139/1480 train_time:18508ms step_avg:143.47ms step:140/1480 train_time:18654ms step_avg:143.49ms step:141/1480 train_time:18801ms step_avg:143.52ms step:142/1480 train_time:18949ms step_avg:143.56ms step:143/1480 train_time:19095ms step_avg:143.57ms step:144/1480 train_time:19241ms step_avg:143.59ms step:145/1480 train_time:19388ms step_avg:143.61ms step:146/1480 train_time:19534ms step_avg:143.63ms step:147/1480 train_time:19681ms 
step_avg:143.66ms step:148/1480 train_time:19829ms step_avg:143.69ms step:149/1480 train_time:19974ms step_avg:143.70ms step:150/1480 train_time:20122ms step_avg:143.73ms step:151/1480 train_time:20269ms step_avg:143.75ms step:152/1480 train_time:20415ms step_avg:143.77ms step:153/1480 train_time:20562ms step_avg:143.79ms step:154/1480 train_time:20709ms step_avg:143.81ms step:155/1480 train_time:20856ms step_avg:143.83ms step:156/1480 train_time:21002ms step_avg:143.85ms step:157/1480 train_time:21149ms step_avg:143.87ms step:158/1480 train_time:21294ms step_avg:143.88ms step:159/1480 train_time:21442ms step_avg:143.91ms step:160/1480 train_time:21589ms step_avg:143.93ms step:161/1480 train_time:21735ms step_avg:143.94ms step:162/1480 train_time:21882ms step_avg:143.96ms step:163/1480 train_time:22030ms step_avg:143.99ms step:164/1480 train_time:22175ms step_avg:143.99ms step:165/1480 train_time:22322ms step_avg:144.01ms step:166/1480 train_time:22470ms step_avg:144.04ms step:167/1480 train_time:22616ms step_avg:144.05ms step:168/1480 train_time:22764ms step_avg:144.08ms step:169/1480 train_time:22911ms step_avg:144.09ms step:170/1480 train_time:23057ms step_avg:144.11ms step:171/1480 train_time:23204ms step_avg:144.12ms step:172/1480 train_time:23350ms step_avg:144.14ms step:173/1480 train_time:23498ms step_avg:144.16ms step:174/1480 train_time:23645ms step_avg:144.18ms step:175/1480 train_time:23791ms step_avg:144.19ms step:176/1480 train_time:23938ms step_avg:144.21ms step:177/1480 train_time:24086ms step_avg:144.23ms step:178/1480 train_time:24232ms step_avg:144.24ms step:179/1480 train_time:24379ms step_avg:144.25ms step:180/1480 train_time:24526ms step_avg:144.27ms step:181/1480 train_time:24672ms step_avg:144.28ms step:182/1480 train_time:24819ms step_avg:144.29ms step:183/1480 train_time:24967ms step_avg:144.32ms step:184/1480 train_time:25113ms step_avg:144.33ms step:185/1480 train_time:25260ms step_avg:144.35ms step:186/1480 train_time:25408ms 
step_avg:144.36ms step:187/1480 train_time:25555ms step_avg:144.38ms step:188/1480 train_time:25702ms step_avg:144.40ms step:189/1480 train_time:25849ms step_avg:144.41ms step:190/1480 train_time:25995ms step_avg:144.42ms step:191/1480 train_time:26143ms step_avg:144.44ms step:192/1480 train_time:26290ms step_avg:144.45ms step:193/1480 train_time:26436ms step_avg:144.46ms step:194/1480 train_time:26582ms step_avg:144.47ms step:195/1480 train_time:26729ms step_avg:144.48ms step:196/1480 train_time:26875ms step_avg:144.49ms step:197/1480 train_time:27023ms step_avg:144.51ms step:198/1480 train_time:27170ms step_avg:144.52ms step:199/1480 train_time:27315ms step_avg:144.53ms step:200/1480 train_time:27463ms step_avg:144.54ms step:201/1480 train_time:27609ms step_avg:144.55ms step:202/1480 train_time:27755ms step_avg:144.56ms step:203/1480 train_time:27903ms step_avg:144.57ms step:204/1480 train_time:28050ms step_avg:144.59ms step:205/1480 train_time:28194ms step_avg:144.59ms step:206/1480 train_time:28341ms step_avg:144.60ms step:207/1480 train_time:28488ms step_avg:144.61ms step:208/1480 train_time:28633ms step_avg:144.61ms step:209/1480 train_time:28779ms step_avg:144.62ms step:210/1480 train_time:28926ms step_avg:144.63ms step:211/1480 train_time:29071ms step_avg:144.63ms step:212/1480 train_time:29220ms step_avg:144.65ms step:213/1480 train_time:29367ms step_avg:144.67ms step:214/1480 train_time:29513ms step_avg:144.67ms step:215/1480 train_time:29660ms step_avg:144.68ms step:216/1480 train_time:29807ms step_avg:144.69ms step:217/1480 train_time:29954ms step_avg:144.70ms step:218/1480 train_time:30101ms step_avg:144.72ms step:219/1480 train_time:30249ms step_avg:144.73ms step:220/1480 train_time:30395ms step_avg:144.74ms step:221/1480 train_time:30544ms step_avg:144.76ms step:222/1480 train_time:30694ms step_avg:144.78ms step:223/1480 train_time:30845ms step_avg:144.81ms step:224/1480 train_time:30995ms step_avg:144.84ms step:225/1480 train_time:31145ms 
step_avg:144.86ms step:226/1480 train_time:31295ms step_avg:144.88ms step:227/1480 train_time:31445ms step_avg:144.91ms step:228/1480 train_time:31595ms step_avg:144.93ms step:229/1480 train_time:31746ms step_avg:144.96ms step:230/1480 train_time:31895ms step_avg:144.98ms step:231/1480 train_time:32047ms step_avg:145.01ms step:232/1480 train_time:32197ms step_avg:145.03ms step:233/1480 train_time:32348ms step_avg:145.06ms step:234/1480 train_time:32498ms step_avg:145.08ms step:235/1480 train_time:32649ms step_avg:145.11ms step:236/1480 train_time:32800ms step_avg:145.13ms step:237/1480 train_time:32950ms step_avg:145.15ms step:238/1480 train_time:33101ms step_avg:145.18ms step:239/1480 train_time:33252ms step_avg:145.21ms step:240/1480 train_time:33403ms step_avg:145.23ms step:241/1480 train_time:33553ms step_avg:145.25ms step:242/1480 train_time:33705ms step_avg:145.28ms step:243/1480 train_time:33855ms step_avg:145.30ms step:244/1480 train_time:34006ms step_avg:145.32ms step:245/1480 train_time:34157ms step_avg:145.35ms step:246/1480 train_time:34308ms step_avg:145.37ms step:247/1480 train_time:34458ms step_avg:145.39ms step:248/1480 train_time:34609ms step_avg:145.41ms step:249/1480 train_time:34759ms step_avg:145.43ms step:250/1480 train_time:34909ms step_avg:145.45ms step:250/1480 val_loss:3.9986 train_time:34967ms step_avg:145.70ms step:251/1480 train_time:35063ms step_avg:145.49ms step:252/1480 train_time:35215ms step_avg:145.52ms step:253/1480 train_time:35364ms step_avg:145.53ms step:254/1480 train_time:35513ms step_avg:145.54ms step:255/1480 train_time:35662ms step_avg:145.56ms step:256/1480 train_time:35810ms step_avg:145.57ms step:257/1480 train_time:35961ms step_avg:145.59ms step:258/1480 train_time:36113ms step_avg:145.62ms step:259/1480 train_time:36265ms step_avg:145.64ms step:260/1480 train_time:36417ms step_avg:145.67ms step:261/1480 train_time:36567ms step_avg:145.69ms step:262/1480 train_time:36718ms step_avg:145.71ms step:263/1480 
train_time:36868ms step_avg:145.72ms step:264/1480 train_time:37018ms step_avg:145.74ms step:265/1480 train_time:37169ms step_avg:145.76ms step:266/1480 train_time:37319ms step_avg:145.78ms step:267/1480 train_time:37470ms step_avg:145.80ms step:268/1480 train_time:37621ms step_avg:145.82ms step:269/1480 train_time:37771ms step_avg:145.84ms step:270/1480 train_time:37921ms step_avg:145.85ms step:271/1480 train_time:38071ms step_avg:145.87ms step:272/1480 train_time:38222ms step_avg:145.89ms step:273/1480 train_time:38374ms step_avg:145.91ms step:274/1480 train_time:38525ms step_avg:145.93ms step:275/1480 train_time:38676ms step_avg:145.95ms step:276/1480 train_time:38826ms step_avg:145.96ms step:277/1480 train_time:38978ms step_avg:145.98ms step:278/1480 train_time:39128ms step_avg:146.00ms step:279/1480 train_time:39279ms step_avg:146.02ms step:280/1480 train_time:39432ms step_avg:146.04ms step:281/1480 train_time:39582ms step_avg:146.06ms step:282/1480 train_time:39735ms step_avg:146.08ms step:283/1480 train_time:39885ms step_avg:146.10ms step:284/1480 train_time:40035ms step_avg:146.11ms step:285/1480 train_time:40185ms step_avg:146.13ms step:286/1480 train_time:40337ms step_avg:146.15ms step:287/1480 train_time:40487ms step_avg:146.16ms step:288/1480 train_time:40638ms step_avg:146.18ms step:289/1480 train_time:40789ms step_avg:146.20ms step:290/1480 train_time:40940ms step_avg:146.21ms step:291/1480 train_time:41089ms step_avg:146.22ms step:292/1480 train_time:41239ms step_avg:146.24ms step:293/1480 train_time:41389ms step_avg:146.25ms step:294/1480 train_time:41540ms step_avg:146.27ms step:295/1480 train_time:41689ms step_avg:146.28ms step:296/1480 train_time:41840ms step_avg:146.29ms step:297/1480 train_time:41990ms step_avg:146.31ms step:298/1480 train_time:42141ms step_avg:146.32ms step:299/1480 train_time:42291ms step_avg:146.33ms step:300/1480 train_time:42443ms step_avg:146.36ms step:301/1480 train_time:42593ms step_avg:146.37ms step:302/1480 
train_time:42744ms step_avg:146.38ms step:303/1480 train_time:42893ms step_avg:146.39ms step:304/1480 train_time:43044ms step_avg:146.41ms step:305/1480 train_time:43195ms step_avg:146.43ms step:306/1480 train_time:43345ms step_avg:146.44ms step:307/1480 train_time:43497ms step_avg:146.45ms step:308/1480 train_time:43648ms step_avg:146.47ms step:309/1480 train_time:43799ms step_avg:146.48ms step:310/1480 train_time:43948ms step_avg:146.49ms step:311/1480 train_time:44099ms step_avg:146.51ms step:312/1480 train_time:44249ms step_avg:146.52ms step:313/1480 train_time:44399ms step_avg:146.53ms step:314/1480 train_time:44549ms step_avg:146.54ms step:315/1480 train_time:44700ms step_avg:146.56ms step:316/1480 train_time:44850ms step_avg:146.57ms step:317/1480 train_time:45000ms step_avg:146.58ms step:318/1480 train_time:45150ms step_avg:146.59ms step:319/1480 train_time:45301ms step_avg:146.61ms step:320/1480 train_time:45452ms step_avg:146.62ms step:321/1480 train_time:45602ms step_avg:146.63ms step:322/1480 train_time:45754ms step_avg:146.65ms step:323/1480 train_time:45904ms step_avg:146.66ms step:324/1480 train_time:46056ms step_avg:146.67ms step:325/1480 train_time:46205ms step_avg:146.68ms step:326/1480 train_time:46356ms step_avg:146.70ms step:327/1480 train_time:46506ms step_avg:146.71ms step:328/1480 train_time:46657ms step_avg:146.72ms step:329/1480 train_time:46807ms step_avg:146.73ms step:330/1480 train_time:46960ms step_avg:146.75ms step:331/1480 train_time:47113ms step_avg:146.77ms step:332/1480 train_time:47268ms step_avg:146.80ms step:333/1480 train_time:47420ms step_avg:146.81ms step:334/1480 train_time:47575ms step_avg:146.84ms step:335/1480 train_time:47728ms step_avg:146.86ms step:336/1480 train_time:47882ms step_avg:146.88ms step:337/1480 train_time:48035ms step_avg:146.90ms step:338/1480 train_time:48188ms step_avg:146.92ms step:339/1480 train_time:48341ms step_avg:146.93ms step:340/1480 train_time:48494ms step_avg:146.95ms step:341/1480 
train_time:48647ms step_avg:146.97ms step:342/1480 train_time:48800ms step_avg:146.99ms step:343/1480 train_time:48954ms step_avg:147.01ms step:344/1480 train_time:49108ms step_avg:147.03ms step:345/1480 train_time:49262ms step_avg:147.05ms step:346/1480 train_time:49416ms step_avg:147.07ms step:347/1480 train_time:49570ms step_avg:147.09ms step:348/1480 train_time:49724ms step_avg:147.11ms step:349/1480 train_time:49879ms step_avg:147.14ms step:350/1480 train_time:50033ms step_avg:147.16ms step:351/1480 train_time:50186ms step_avg:147.17ms step:352/1480 train_time:50341ms step_avg:147.19ms step:353/1480 train_time:50494ms step_avg:147.21ms step:354/1480 train_time:50648ms step_avg:147.23ms step:355/1480 train_time:50801ms step_avg:147.25ms step:356/1480 train_time:50955ms step_avg:147.27ms step:357/1480 train_time:51110ms step_avg:147.29ms step:358/1480 train_time:51264ms step_avg:147.31ms step:359/1480 train_time:51419ms step_avg:147.33ms step:360/1480 train_time:51573ms step_avg:147.35ms step:361/1480 train_time:51726ms step_avg:147.37ms step:362/1480 train_time:51880ms step_avg:147.39ms step:363/1480 train_time:52033ms step_avg:147.40ms step:364/1480 train_time:52186ms step_avg:147.42ms step:365/1480 train_time:52343ms step_avg:147.44ms step:366/1480 train_time:52495ms step_avg:147.46ms step:367/1480 train_time:52648ms step_avg:147.47ms step:368/1480 train_time:52801ms step_avg:147.49ms step:369/1480 train_time:52955ms step_avg:147.51ms step:370/1480 train_time:53109ms step_avg:147.52ms step:371/1480 train_time:53263ms step_avg:147.54ms step:372/1480 train_time:53416ms step_avg:147.56ms step:373/1480 train_time:53569ms step_avg:147.57ms step:374/1480 train_time:53722ms step_avg:147.59ms step:375/1480 train_time:53876ms step_avg:147.61ms step:375/1480 val_loss:3.8129 train_time:53936ms step_avg:147.77ms step:376/1480 train_time:54035ms step_avg:147.64ms step:377/1480 train_time:54190ms step_avg:147.66ms step:378/1480 train_time:54342ms step_avg:147.67ms 
step:379/1480 train_time:54495ms step_avg:147.68ms step:380/1480 train_time:54646ms step_avg:147.69ms step:381/1480 train_time:54799ms step_avg:147.71ms step:382/1480 train_time:54953ms step_avg:147.72ms step:383/1480 train_time:55108ms step_avg:147.74ms step:384/1480 train_time:55265ms step_avg:147.77ms step:385/1480 train_time:55417ms step_avg:147.78ms step:386/1480 train_time:55569ms step_avg:147.79ms step:387/1480 train_time:55723ms step_avg:147.81ms step:388/1480 train_time:55876ms step_avg:147.82ms step:389/1480 train_time:56029ms step_avg:147.83ms step:390/1480 train_time:56183ms step_avg:147.85ms step:391/1480 train_time:56337ms step_avg:147.87ms step:392/1480 train_time:56491ms step_avg:147.88ms step:393/1480 train_time:56645ms step_avg:147.90ms step:394/1480 train_time:56799ms step_avg:147.91ms step:395/1480 train_time:56953ms step_avg:147.93ms step:396/1480 train_time:57107ms step_avg:147.95ms step:397/1480 train_time:57261ms step_avg:147.96ms step:398/1480 train_time:57414ms step_avg:147.97ms step:399/1480 train_time:57569ms step_avg:147.99ms step:400/1480 train_time:57722ms step_avg:148.01ms step:401/1480 train_time:57876ms step_avg:148.02ms step:402/1480 train_time:58030ms step_avg:148.04ms step:403/1480 train_time:58184ms step_avg:148.05ms step:404/1480 train_time:58337ms step_avg:148.06ms step:405/1480 train_time:58491ms step_avg:148.08ms step:406/1480 train_time:58645ms step_avg:148.09ms step:407/1480 train_time:58798ms step_avg:148.11ms step:408/1480 train_time:58951ms step_avg:148.12ms step:409/1480 train_time:59104ms step_avg:148.13ms step:410/1480 train_time:59259ms step_avg:148.15ms step:411/1480 train_time:59413ms step_avg:148.16ms step:412/1480 train_time:59567ms step_avg:148.18ms step:413/1480 train_time:59720ms step_avg:148.19ms step:414/1480 train_time:59875ms step_avg:148.21ms step:415/1480 train_time:60030ms step_avg:148.22ms step:416/1480 train_time:60183ms step_avg:148.23ms step:417/1480 train_time:60338ms step_avg:148.25ms 
step:418/1480 train_time:60492ms step_avg:148.27ms step:419/1480 train_time:60646ms step_avg:148.28ms step:420/1480 train_time:60799ms step_avg:148.29ms step:421/1480 train_time:60952ms step_avg:148.30ms step:422/1480 train_time:61106ms step_avg:148.31ms step:423/1480 train_time:61259ms step_avg:148.33ms step:424/1480 train_time:61414ms step_avg:148.34ms step:425/1480 train_time:61569ms step_avg:148.36ms step:426/1480 train_time:61724ms step_avg:148.37ms step:427/1480 train_time:61877ms step_avg:148.39ms step:428/1480 train_time:62030ms step_avg:148.40ms step:429/1480 train_time:62182ms step_avg:148.41ms step:430/1480 train_time:62336ms step_avg:148.42ms step:431/1480 train_time:62490ms step_avg:148.43ms step:432/1480 train_time:62644ms step_avg:148.45ms step:433/1480 train_time:62799ms step_avg:148.46ms step:434/1480 train_time:62952ms step_avg:148.47ms step:435/1480 train_time:63107ms step_avg:148.49ms step:436/1480 train_time:63261ms step_avg:148.50ms step:437/1480 train_time:63415ms step_avg:148.51ms step:438/1480 train_time:63570ms step_avg:148.53ms step:439/1480 train_time:63724ms step_avg:148.54ms step:440/1480 train_time:63878ms step_avg:148.55ms step:441/1480 train_time:64035ms step_avg:148.57ms step:442/1480 train_time:64194ms step_avg:148.60ms step:443/1480 train_time:64351ms step_avg:148.62ms step:444/1480 train_time:64507ms step_avg:148.63ms step:445/1480 train_time:64662ms step_avg:148.65ms step:446/1480 train_time:64817ms step_avg:148.66ms step:447/1480 train_time:64973ms step_avg:148.68ms step:448/1480 train_time:65130ms step_avg:148.70ms step:449/1480 train_time:65287ms step_avg:148.72ms step:450/1480 train_time:65443ms step_avg:148.74ms step:451/1480 train_time:65601ms step_avg:148.76ms step:452/1480 train_time:65758ms step_avg:148.77ms step:453/1480 train_time:65915ms step_avg:148.79ms step:454/1480 train_time:66073ms step_avg:148.81ms step:455/1480 train_time:66230ms step_avg:148.83ms step:456/1480 train_time:66386ms step_avg:148.85ms 
step:457/1480 train_time:66542ms step_avg:148.86ms step:458/1480 train_time:66698ms step_avg:148.88ms step:459/1480 train_time:66856ms step_avg:148.90ms step:460/1480 train_time:67013ms step_avg:148.92ms step:461/1480 train_time:67173ms step_avg:148.94ms step:462/1480 train_time:67331ms step_avg:148.96ms step:463/1480 train_time:67487ms step_avg:148.98ms step:464/1480 train_time:67643ms step_avg:148.99ms step:465/1480 train_time:67799ms step_avg:149.01ms step:466/1480 train_time:67956ms step_avg:149.03ms step:467/1480 train_time:68114ms step_avg:149.05ms step:468/1480 train_time:68272ms step_avg:149.07ms step:469/1480 train_time:68429ms step_avg:149.08ms step:470/1480 train_time:68586ms step_avg:149.10ms step:471/1480 train_time:68743ms step_avg:149.12ms step:472/1480 train_time:68899ms step_avg:149.13ms step:473/1480 train_time:69056ms step_avg:149.15ms step:474/1480 train_time:69213ms step_avg:149.17ms step:475/1480 train_time:69370ms step_avg:149.18ms step:476/1480 train_time:69527ms step_avg:149.20ms step:477/1480 train_time:69684ms step_avg:149.22ms step:478/1480 train_time:69840ms step_avg:149.23ms step:479/1480 train_time:69997ms step_avg:149.25ms step:480/1480 train_time:70153ms step_avg:149.26ms step:481/1480 train_time:70309ms step_avg:149.28ms step:482/1480 train_time:70465ms step_avg:149.29ms step:483/1480 train_time:70620ms step_avg:149.30ms step:484/1480 train_time:70779ms step_avg:149.32ms step:485/1480 train_time:70936ms step_avg:149.34ms step:486/1480 train_time:71094ms step_avg:149.36ms step:487/1480 train_time:71250ms step_avg:149.37ms step:488/1480 train_time:71406ms step_avg:149.39ms step:489/1480 train_time:71562ms step_avg:149.40ms step:490/1480 train_time:71719ms step_avg:149.42ms step:491/1480 train_time:71877ms step_avg:149.43ms step:492/1480 train_time:72035ms step_avg:149.45ms step:493/1480 train_time:72194ms step_avg:149.47ms step:494/1480 train_time:72353ms step_avg:149.49ms step:495/1480 train_time:72512ms step_avg:149.51ms 
step:496/1480 train_time:72670ms step_avg:149.53ms step:497/1480 train_time:72826ms step_avg:149.54ms step:498/1480 train_time:72982ms step_avg:149.55ms step:499/1480 train_time:73140ms step_avg:149.57ms step:500/1480 train_time:73297ms step_avg:149.59ms step:500/1480 val_loss:3.6942 train_time:73359ms step_avg:149.71ms step:501/1480 train_time:73460ms step_avg:149.61ms step:502/1480 train_time:73619ms step_avg:149.63ms step:503/1480 train_time:73775ms step_avg:149.64ms step:504/1480 train_time:73931ms step_avg:149.66ms step:505/1480 train_time:74086ms step_avg:149.67ms step:506/1480 train_time:74242ms step_avg:149.68ms step:507/1480 train_time:74398ms step_avg:149.70ms step:508/1480 train_time:74558ms step_avg:149.71ms step:509/1480 train_time:74714ms step_avg:149.73ms step:510/1480 train_time:74870ms step_avg:149.74ms step:511/1480 train_time:75027ms step_avg:149.75ms step:512/1480 train_time:75185ms step_avg:149.77ms step:513/1480 train_time:75341ms step_avg:149.78ms step:514/1480 train_time:75498ms step_avg:149.80ms step:515/1480 train_time:75654ms step_avg:149.81ms step:516/1480 train_time:75812ms step_avg:149.83ms step:517/1480 train_time:75969ms step_avg:149.84ms step:518/1480 train_time:76125ms step_avg:149.85ms step:519/1480 train_time:76282ms step_avg:149.87ms step:520/1480 train_time:76441ms step_avg:149.88ms step:521/1480 train_time:76598ms step_avg:149.90ms step:522/1480 train_time:76757ms step_avg:149.92ms step:523/1480 train_time:76913ms step_avg:149.93ms step:524/1480 train_time:77069ms step_avg:149.94ms step:525/1480 train_time:77226ms step_avg:149.95ms step:526/1480 train_time:77383ms step_avg:149.97ms step:527/1480 train_time:77539ms step_avg:149.98ms step:528/1480 train_time:77696ms step_avg:149.99ms step:529/1480 train_time:77853ms step_avg:150.01ms step:530/1480 train_time:78009ms step_avg:150.02ms step:531/1480 train_time:78167ms step_avg:150.03ms step:532/1480 train_time:78324ms step_avg:150.05ms step:533/1480 train_time:78481ms 
step_avg:150.06ms step:534/1480 train_time:78638ms step_avg:150.07ms step:535/1480 train_time:78794ms step_avg:150.08ms step:536/1480 train_time:78950ms step_avg:150.10ms step:537/1480 train_time:79107ms step_avg:150.11ms step:538/1480 train_time:79265ms step_avg:150.12ms step:539/1480 train_time:79424ms step_avg:150.14ms step:540/1480 train_time:79581ms step_avg:150.15ms step:541/1480 train_time:79738ms step_avg:150.16ms step:542/1480 train_time:79894ms step_avg:150.18ms step:543/1480 train_time:80051ms step_avg:150.19ms step:544/1480 train_time:80207ms step_avg:150.20ms step:545/1480 train_time:80364ms step_avg:150.21ms step:546/1480 train_time:80522ms step_avg:150.23ms step:547/1480 train_time:80679ms step_avg:150.24ms step:548/1480 train_time:80839ms step_avg:150.26ms step:549/1480 train_time:80996ms step_avg:150.27ms step:550/1480 train_time:81153ms step_avg:150.28ms step:551/1480 train_time:81310ms step_avg:150.30ms step:552/1480 train_time:81468ms step_avg:150.31ms step:553/1480 train_time:81627ms step_avg:150.33ms step:554/1480 train_time:81787ms step_avg:150.34ms step:555/1480 train_time:81946ms step_avg:150.36ms step:556/1480 train_time:82104ms step_avg:150.37ms step:557/1480 train_time:82265ms step_avg:150.39ms step:558/1480 train_time:82425ms step_avg:150.41ms step:559/1480 train_time:82583ms step_avg:150.42ms step:560/1480 train_time:82744ms step_avg:150.44ms step:561/1480 train_time:82903ms step_avg:150.46ms step:562/1480 train_time:83063ms step_avg:150.48ms step:563/1480 train_time:83223ms step_avg:150.49ms step:564/1480 train_time:83382ms step_avg:150.51ms step:565/1480 train_time:83543ms step_avg:150.53ms step:566/1480 train_time:83704ms step_avg:150.55ms step:567/1480 train_time:83864ms step_avg:150.56ms step:568/1480 train_time:84023ms step_avg:150.58ms step:569/1480 train_time:84183ms step_avg:150.60ms step:570/1480 train_time:84343ms step_avg:150.61ms step:571/1480 train_time:84502ms step_avg:150.63ms step:572/1480 train_time:84662ms 
step_avg:150.64ms step:573/1480 train_time:84823ms step_avg:150.66ms step:574/1480 train_time:84984ms step_avg:150.68ms step:575/1480 train_time:85144ms step_avg:150.70ms step:576/1480 train_time:85303ms step_avg:150.71ms step:577/1480 train_time:85464ms step_avg:150.73ms step:578/1480 train_time:85622ms step_avg:150.74ms step:579/1480 train_time:85782ms step_avg:150.76ms step:580/1480 train_time:85943ms step_avg:150.78ms step:581/1480 train_time:86103ms step_avg:150.79ms step:582/1480 train_time:86264ms step_avg:150.81ms step:583/1480 train_time:86424ms step_avg:150.83ms step:584/1480 train_time:86583ms step_avg:150.84ms step:585/1480 train_time:86743ms step_avg:150.86ms step:586/1480 train_time:86903ms step_avg:150.87ms step:587/1480 train_time:87061ms step_avg:150.89ms step:588/1480 train_time:87223ms step_avg:150.90ms step:589/1480 train_time:87383ms step_avg:150.92ms step:590/1480 train_time:87543ms step_avg:150.94ms step:591/1480 train_time:87703ms step_avg:150.95ms step:592/1480 train_time:87863ms step_avg:150.97ms step:593/1480 train_time:88024ms step_avg:150.98ms step:594/1480 train_time:88184ms step_avg:151.00ms step:595/1480 train_time:88346ms step_avg:151.02ms step:596/1480 train_time:88506ms step_avg:151.03ms step:597/1480 train_time:88665ms step_avg:151.05ms step:598/1480 train_time:88823ms step_avg:151.06ms step:599/1480 train_time:88981ms step_avg:151.07ms step:600/1480 train_time:89142ms step_avg:151.09ms step:601/1480 train_time:89300ms step_avg:151.10ms step:602/1480 train_time:89461ms step_avg:151.12ms step:603/1480 train_time:89622ms step_avg:151.13ms step:604/1480 train_time:89782ms step_avg:151.15ms step:605/1480 train_time:89942ms step_avg:151.16ms step:606/1480 train_time:90104ms step_avg:151.18ms step:607/1480 train_time:90267ms step_avg:151.20ms step:608/1480 train_time:90426ms step_avg:151.21ms step:609/1480 train_time:90585ms step_avg:151.23ms step:610/1480 train_time:90744ms step_avg:151.24ms step:611/1480 train_time:90903ms 
step_avg:151.25ms step:612/1480 train_time:91064ms step_avg:151.27ms step:613/1480 train_time:91224ms step_avg:151.28ms step:614/1480 train_time:91384ms step_avg:151.30ms step:615/1480 train_time:91544ms step_avg:151.31ms step:616/1480 train_time:91703ms step_avg:151.32ms step:617/1480 train_time:91863ms step_avg:151.34ms step:618/1480 train_time:92023ms step_avg:151.35ms step:619/1480 train_time:92183ms step_avg:151.37ms step:620/1480 train_time:92343ms step_avg:151.38ms step:621/1480 train_time:92504ms step_avg:151.40ms step:622/1480 train_time:92665ms step_avg:151.41ms step:623/1480 train_time:92825ms step_avg:151.43ms step:624/1480 train_time:92984ms step_avg:151.44ms step:625/1480 train_time:93144ms step_avg:151.45ms step:625/1480 val_loss:3.6099 train_time:93207ms step_avg:151.56ms step:626/1480 train_time:93306ms step_avg:151.47ms step:627/1480 train_time:93466ms step_avg:151.48ms step:628/1480 train_time:93625ms step_avg:151.50ms step:629/1480 train_time:93784ms step_avg:151.51ms step:630/1480 train_time:93943ms step_avg:151.52ms step:631/1480 train_time:94102ms step_avg:151.53ms step:632/1480 train_time:94261ms step_avg:151.54ms step:633/1480 train_time:94421ms step_avg:151.56ms step:634/1480 train_time:94583ms step_avg:151.58ms step:635/1480 train_time:94744ms step_avg:151.59ms step:636/1480 train_time:94903ms step_avg:151.60ms step:637/1480 train_time:95063ms step_avg:151.62ms step:638/1480 train_time:95224ms step_avg:151.63ms step:639/1480 train_time:95384ms step_avg:151.64ms step:640/1480 train_time:95545ms step_avg:151.66ms step:641/1480 train_time:95706ms step_avg:151.67ms step:642/1480 train_time:95865ms step_avg:151.69ms step:643/1480 train_time:96026ms step_avg:151.70ms step:644/1480 train_time:96185ms step_avg:151.71ms step:645/1480 train_time:96344ms step_avg:151.72ms step:646/1480 train_time:96504ms step_avg:151.74ms step:647/1480 train_time:96664ms step_avg:151.75ms step:648/1480 train_time:96827ms step_avg:151.77ms step:649/1480 
train_time:96986ms step_avg:151.78ms step:650/1480 train_time:97145ms step_avg:151.79ms step:651/1480 train_time:97306ms step_avg:151.80ms step:652/1480 train_time:97466ms step_avg:151.82ms step:653/1480 train_time:97625ms step_avg:151.83ms step:654/1480 train_time:97786ms step_avg:151.84ms step:655/1480 train_time:97946ms step_avg:151.85ms step:656/1480 train_time:98106ms step_avg:151.87ms step:657/1480 train_time:98265ms step_avg:151.88ms step:658/1480 train_time:98425ms step_avg:151.89ms step:659/1480 train_time:98587ms step_avg:151.91ms step:660/1480 train_time:98748ms step_avg:151.92ms step:661/1480 train_time:98910ms step_avg:151.94ms step:662/1480 train_time:99070ms step_avg:151.95ms step:663/1480 train_time:99229ms step_avg:151.96ms step:664/1480 train_time:99391ms step_avg:151.97ms step:665/1480 train_time:99552ms step_avg:151.99ms step:666/1480 train_time:99711ms step_avg:152.00ms step:667/1480 train_time:99873ms step_avg:152.01ms step:668/1480 train_time:100034ms step_avg:152.03ms step:669/1480 train_time:100197ms step_avg:152.04ms step:670/1480 train_time:100358ms step_avg:152.06ms step:671/1480 train_time:100519ms step_avg:152.07ms step:672/1480 train_time:100681ms step_avg:152.09ms step:673/1480 train_time:100845ms step_avg:152.10ms step:674/1480 train_time:101008ms step_avg:152.12ms step:675/1480 train_time:101169ms step_avg:152.13ms step:676/1480 train_time:101331ms step_avg:152.15ms step:677/1480 train_time:101491ms step_avg:152.16ms step:678/1480 train_time:101651ms step_avg:152.17ms step:679/1480 train_time:101812ms step_avg:152.19ms step:680/1480 train_time:101973ms step_avg:152.20ms step:681/1480 train_time:102134ms step_avg:152.21ms step:682/1480 train_time:102299ms step_avg:152.23ms step:683/1480 train_time:102461ms step_avg:152.25ms step:684/1480 train_time:102623ms step_avg:152.26ms step:685/1480 train_time:102788ms step_avg:152.28ms step:686/1480 train_time:102949ms step_avg:152.29ms step:687/1480 train_time:103110ms step_avg:152.30ms 
step:688/1480 train_time:103273ms step_avg:152.32ms step:689/1480 train_time:103434ms step_avg:152.33ms step:690/1480 train_time:103599ms step_avg:152.35ms step:691/1480 train_time:103762ms step_avg:152.37ms step:692/1480 train_time:103925ms step_avg:152.38ms step:693/1480 train_time:104088ms step_avg:152.40ms step:694/1480 train_time:104249ms step_avg:152.41ms step:695/1480 train_time:104410ms step_avg:152.42ms step:696/1480 train_time:104570ms step_avg:152.43ms step:697/1480 train_time:104732ms step_avg:152.45ms step:698/1480 train_time:104893ms step_avg:152.46ms step:699/1480 train_time:105055ms step_avg:152.47ms step:700/1480 train_time:105217ms step_avg:152.49ms step:701/1480 train_time:105376ms step_avg:152.50ms step:702/1480 train_time:105537ms step_avg:152.51ms step:703/1480 train_time:105698ms step_avg:152.52ms step:704/1480 train_time:105861ms step_avg:152.54ms step:705/1480 train_time:106025ms step_avg:152.55ms step:706/1480 train_time:106189ms step_avg:152.57ms step:707/1480 train_time:106352ms step_avg:152.59ms step:708/1480 train_time:106512ms step_avg:152.60ms step:709/1480 train_time:106672ms step_avg:152.61ms step:710/1480 train_time:106833ms step_avg:152.62ms step:711/1480 train_time:106994ms step_avg:152.63ms step:712/1480 train_time:107159ms step_avg:152.65ms step:713/1480 train_time:107324ms step_avg:152.67ms step:714/1480 train_time:107486ms step_avg:152.68ms step:715/1480 train_time:107647ms step_avg:152.69ms step:716/1480 train_time:107807ms step_avg:152.70ms step:717/1480 train_time:107969ms step_avg:152.71ms step:718/1480 train_time:108129ms step_avg:152.72ms step:719/1480 train_time:108288ms step_avg:152.73ms step:720/1480 train_time:108450ms step_avg:152.75ms step:721/1480 train_time:108612ms step_avg:152.76ms step:722/1480 train_time:108772ms step_avg:152.77ms step:723/1480 train_time:108932ms step_avg:152.78ms step:724/1480 train_time:109094ms step_avg:152.79ms step:725/1480 train_time:109256ms step_avg:152.81ms step:726/1480 
train_time:109420ms step_avg:152.82ms step:727/1480 train_time:109584ms step_avg:152.84ms step:728/1480 train_time:109745ms step_avg:152.85ms step:729/1480 train_time:109907ms step_avg:152.86ms step:730/1480 train_time:110069ms step_avg:152.87ms step:731/1480 train_time:110230ms step_avg:152.89ms step:732/1480 train_time:110390ms step_avg:152.89ms step:733/1480 train_time:110551ms step_avg:152.91ms step:734/1480 train_time:110713ms step_avg:152.92ms step:735/1480 train_time:110873ms step_avg:152.93ms step:736/1480 train_time:111035ms step_avg:152.94ms step:737/1480 train_time:111195ms step_avg:152.95ms step:738/1480 train_time:111356ms step_avg:152.96ms step:739/1480 train_time:111516ms step_avg:152.97ms step:740/1480 train_time:111682ms step_avg:152.99ms step:741/1480 train_time:111845ms step_avg:153.00ms step:742/1480 train_time:112008ms step_avg:153.02ms step:743/1480 train_time:112168ms step_avg:153.03ms step:744/1480 train_time:112331ms step_avg:153.04ms step:745/1480 train_time:112495ms step_avg:153.06ms step:746/1480 train_time:112656ms step_avg:153.07ms step:747/1480 train_time:112819ms step_avg:153.08ms step:748/1480 train_time:112986ms step_avg:153.10ms step:749/1480 train_time:113150ms step_avg:153.11ms step:750/1480 train_time:113310ms step_avg:153.12ms step:750/1480 val_loss:3.5568 train_time:113374ms step_avg:153.21ms step:751/1480 train_time:113474ms step_avg:153.14ms step:752/1480 train_time:113635ms step_avg:153.15ms step:753/1480 train_time:113795ms step_avg:153.16ms step:754/1480 train_time:113955ms step_avg:153.17ms step:755/1480 train_time:114115ms step_avg:153.17ms step:756/1480 train_time:114278ms step_avg:153.19ms step:757/1480 train_time:114445ms step_avg:153.21ms step:758/1480 train_time:114606ms step_avg:153.22ms step:759/1480 train_time:114770ms step_avg:153.23ms step:760/1480 train_time:114931ms step_avg:153.24ms step:761/1480 train_time:115093ms step_avg:153.25ms step:762/1480 train_time:115254ms step_avg:153.26ms step:763/1480 
train_time:115417ms step_avg:153.28ms step:764/1480 train_time:115577ms step_avg:153.29ms step:765/1480 train_time:115737ms step_avg:153.29ms step:766/1480 train_time:115898ms step_avg:153.30ms step:767/1480 train_time:116062ms step_avg:153.32ms step:768/1480 train_time:116223ms step_avg:153.33ms step:769/1480 train_time:116389ms step_avg:153.34ms step:770/1480 train_time:116551ms step_avg:153.36ms step:771/1480 train_time:116714ms step_avg:153.37ms step:772/1480 train_time:116876ms step_avg:153.38ms step:773/1480 train_time:117037ms step_avg:153.39ms step:774/1480 train_time:117199ms step_avg:153.40ms step:775/1480 train_time:117361ms step_avg:153.41ms step:776/1480 train_time:117528ms step_avg:153.43ms step:777/1480 train_time:117694ms step_avg:153.45ms step:778/1480 train_time:117857ms step_avg:153.46ms step:779/1480 train_time:118020ms step_avg:153.47ms step:780/1480 train_time:118185ms step_avg:153.49ms step:781/1480 train_time:118349ms step_avg:153.50ms step:782/1480 train_time:118513ms step_avg:153.51ms step:783/1480 train_time:118675ms step_avg:153.52ms step:784/1480 train_time:118838ms step_avg:153.54ms step:785/1480 train_time:118999ms step_avg:153.55ms step:786/1480 train_time:119165ms step_avg:153.56ms step:787/1480 train_time:119329ms step_avg:153.58ms step:788/1480 train_time:119492ms step_avg:153.59ms step:789/1480 train_time:119653ms step_avg:153.60ms step:790/1480 train_time:119818ms step_avg:153.61ms step:791/1480 train_time:119986ms step_avg:153.63ms step:792/1480 train_time:120152ms step_avg:153.65ms step:793/1480 train_time:120313ms step_avg:153.66ms step:794/1480 train_time:120477ms step_avg:153.67ms step:795/1480 train_time:120642ms step_avg:153.68ms step:796/1480 train_time:120809ms step_avg:153.70ms step:797/1480 train_time:120973ms step_avg:153.71ms step:798/1480 train_time:121136ms step_avg:153.73ms step:799/1480 train_time:121303ms step_avg:153.74ms step:800/1480 train_time:121468ms step_avg:153.76ms step:801/1480 train_time:121632ms 
step_avg:153.77ms step:802/1480 train_time:121798ms step_avg:153.79ms step:803/1480 train_time:121961ms step_avg:153.80ms step:804/1480 train_time:122123ms step_avg:153.81ms step:805/1480 train_time:122288ms step_avg:153.82ms step:806/1480 train_time:122451ms step_avg:153.83ms step:807/1480 train_time:122612ms step_avg:153.84ms step:808/1480 train_time:122775ms step_avg:153.85ms step:809/1480 train_time:122937ms step_avg:153.86ms step:810/1480 train_time:123098ms step_avg:153.87ms step:811/1480 train_time:123263ms step_avg:153.89ms step:812/1480 train_time:123426ms step_avg:153.90ms step:813/1480 train_time:123588ms step_avg:153.91ms step:814/1480 train_time:123751ms step_avg:153.92ms step:815/1480 train_time:123913ms step_avg:153.93ms step:816/1480 train_time:124079ms step_avg:153.94ms step:817/1480 train_time:124242ms step_avg:153.96ms step:818/1480 train_time:124402ms step_avg:153.96ms step:819/1480 train_time:124567ms step_avg:153.98ms step:820/1480 train_time:124731ms step_avg:153.99ms step:821/1480 train_time:124892ms step_avg:154.00ms step:822/1480 train_time:125056ms step_avg:154.01ms step:823/1480 train_time:125219ms step_avg:154.02ms step:824/1480 train_time:125381ms step_avg:154.03ms step:825/1480 train_time:125548ms step_avg:154.05ms step:826/1480 train_time:125714ms step_avg:154.06ms step:827/1480 train_time:125878ms step_avg:154.07ms step:828/1480 train_time:126041ms step_avg:154.08ms step:829/1480 train_time:126207ms step_avg:154.10ms step:830/1480 train_time:126372ms step_avg:154.11ms step:831/1480 train_time:126534ms step_avg:154.12ms step:832/1480 train_time:126697ms step_avg:154.13ms step:833/1480 train_time:126863ms step_avg:154.15ms step:834/1480 train_time:127027ms step_avg:154.16ms step:835/1480 train_time:127192ms step_avg:154.17ms step:836/1480 train_time:127357ms step_avg:154.18ms step:837/1480 train_time:127520ms step_avg:154.20ms step:838/1480 train_time:127685ms step_avg:154.21ms step:839/1480 train_time:127848ms step_avg:154.22ms 
step:840/1480 train_time:128010ms step_avg:154.23ms step:841/1480 train_time:128171ms step_avg:154.24ms step:842/1480 train_time:128333ms step_avg:154.25ms step:843/1480 train_time:128495ms step_avg:154.26ms step:844/1480 train_time:128657ms step_avg:154.26ms step:845/1480 train_time:128820ms step_avg:154.28ms step:846/1480 train_time:128989ms step_avg:154.29ms step:847/1480 train_time:129152ms step_avg:154.30ms step:848/1480 train_time:129314ms step_avg:154.31ms step:849/1480 train_time:129477ms step_avg:154.32ms step:850/1480 train_time:129639ms step_avg:154.33ms step:851/1480 train_time:129806ms step_avg:154.35ms step:852/1480 train_time:129969ms step_avg:154.36ms step:853/1480 train_time:130131ms step_avg:154.37ms step:854/1480 train_time:130296ms step_avg:154.38ms step:855/1480 train_time:130459ms step_avg:154.39ms step:856/1480 train_time:130622ms step_avg:154.40ms step:857/1480 train_time:130790ms step_avg:154.42ms step:858/1480 train_time:130954ms step_avg:154.43ms step:859/1480 train_time:131116ms step_avg:154.44ms step:860/1480 train_time:131279ms step_avg:154.45ms step:861/1480 train_time:131447ms step_avg:154.46ms step:862/1480 train_time:131614ms step_avg:154.48ms step:863/1480 train_time:131783ms step_avg:154.49ms step:864/1480 train_time:131948ms step_avg:154.51ms step:865/1480 train_time:132110ms step_avg:154.51ms step:866/1480 train_time:132277ms step_avg:154.53ms step:867/1480 train_time:132440ms step_avg:154.54ms step:868/1480 train_time:132601ms step_avg:154.55ms step:869/1480 train_time:132765ms step_avg:154.56ms step:870/1480 train_time:132929ms step_avg:154.57ms step:871/1480 train_time:133092ms step_avg:154.58ms step:872/1480 train_time:133255ms step_avg:154.59ms step:873/1480 train_time:133417ms step_avg:154.60ms step:874/1480 train_time:133583ms step_avg:154.61ms step:875/1480 train_time:133749ms step_avg:154.62ms step:875/1480 val_loss:3.5107 train_time:133814ms step_avg:154.70ms step:876/1480 train_time:133915ms step_avg:154.64ms 
step:877/1480 train_time:134081ms step_avg:154.65ms step:878/1480 train_time:134244ms step_avg:154.66ms step:879/1480 train_time:134408ms step_avg:154.67ms step:880/1480 train_time:134571ms step_avg:154.68ms step:881/1480 train_time:134735ms step_avg:154.69ms step:882/1480 train_time:134900ms step_avg:154.70ms step:883/1480 train_time:135066ms step_avg:154.71ms step:884/1480 train_time:135234ms step_avg:154.73ms step:885/1480 train_time:135400ms step_avg:154.74ms step:886/1480 train_time:135565ms step_avg:154.75ms step:887/1480 train_time:135732ms step_avg:154.77ms step:888/1480 train_time:135906ms step_avg:154.79ms step:889/1480 train_time:136075ms step_avg:154.81ms step:890/1480 train_time:136238ms step_avg:154.82ms step:891/1480 train_time:136404ms step_avg:154.83ms step:892/1480 train_time:136569ms step_avg:154.84ms step:893/1480 train_time:136730ms step_avg:154.85ms step:894/1480 train_time:136898ms step_avg:154.86ms step:895/1480 train_time:137063ms step_avg:154.87ms step:896/1480 train_time:137228ms step_avg:154.88ms step:897/1480 train_time:137395ms step_avg:154.90ms step:898/1480 train_time:137564ms step_avg:154.91ms step:899/1480 train_time:137727ms step_avg:154.92ms step:900/1480 train_time:137889ms step_avg:154.93ms step:901/1480 train_time:138053ms step_avg:154.94ms step:902/1480 train_time:138217ms step_avg:154.95ms step:903/1480 train_time:138388ms step_avg:154.97ms step:904/1480 train_time:138554ms step_avg:154.98ms step:905/1480 train_time:138718ms step_avg:154.99ms step:906/1480 train_time:138884ms step_avg:155.00ms step:907/1480 train_time:139053ms step_avg:155.02ms step:908/1480 train_time:139217ms step_avg:155.03ms step:909/1480 train_time:139383ms step_avg:155.04ms step:910/1480 train_time:139554ms step_avg:155.06ms step:911/1480 train_time:139721ms step_avg:155.07ms step:912/1480 train_time:139885ms step_avg:155.08ms step:913/1480 train_time:140051ms step_avg:155.10ms step:914/1480 train_time:140218ms step_avg:155.11ms step:915/1480 
train_time:140387ms step_avg:155.12ms step:916/1480 train_time:140552ms step_avg:155.14ms step:917/1480 train_time:140716ms step_avg:155.14ms step:918/1480 train_time:140885ms step_avg:155.16ms step:919/1480 train_time:141054ms step_avg:155.17ms step:920/1480 train_time:141220ms step_avg:155.19ms step:921/1480 train_time:141385ms step_avg:155.20ms step:922/1480 train_time:141553ms step_avg:155.21ms step:923/1480 train_time:141716ms step_avg:155.22ms step:924/1480 train_time:141881ms step_avg:155.23ms step:925/1480 train_time:142045ms step_avg:155.24ms step:926/1480 train_time:142209ms step_avg:155.25ms step:927/1480 train_time:142373ms step_avg:155.26ms step:928/1480 train_time:142539ms step_avg:155.27ms step:929/1480 train_time:142704ms step_avg:155.28ms step:930/1480 train_time:142868ms step_avg:155.29ms step:931/1480 train_time:143031ms step_avg:155.30ms step:932/1480 train_time:143196ms step_avg:155.31ms step:933/1480 train_time:143365ms step_avg:155.32ms step:934/1480 train_time:143531ms step_avg:155.34ms step:935/1480 train_time:143702ms step_avg:155.35ms step:936/1480 train_time:143868ms step_avg:155.37ms step:937/1480 train_time:144038ms step_avg:155.38ms step:938/1480 train_time:144201ms step_avg:155.39ms step:939/1480 train_time:144369ms step_avg:155.40ms step:940/1480 train_time:144537ms step_avg:155.42ms step:941/1480 train_time:144701ms step_avg:155.43ms step:942/1480 train_time:144865ms step_avg:155.43ms step:943/1480 train_time:145034ms step_avg:155.45ms step:944/1480 train_time:145207ms step_avg:155.47ms step:945/1480 train_time:145370ms step_avg:155.48ms step:946/1480 train_time:145541ms step_avg:155.49ms step:947/1480 train_time:145708ms step_avg:155.51ms step:948/1480 train_time:145875ms step_avg:155.52ms step:949/1480 train_time:146041ms step_avg:155.53ms step:950/1480 train_time:146205ms step_avg:155.54ms step:951/1480 train_time:146374ms step_avg:155.55ms step:952/1480 train_time:146541ms step_avg:155.56ms step:953/1480 train_time:146709ms 
step_avg:155.58ms step:954/1480 train_time:146879ms step_avg:155.59ms step:955/1480 train_time:147042ms step_avg:155.60ms step:956/1480 train_time:147207ms step_avg:155.61ms step:957/1480 train_time:147376ms step_avg:155.62ms step:958/1480 train_time:147545ms step_avg:155.64ms step:959/1480 train_time:147710ms step_avg:155.65ms step:960/1480 train_time:147877ms step_avg:155.66ms step:961/1480 train_time:148043ms step_avg:155.67ms step:962/1480 train_time:148206ms step_avg:155.68ms step:963/1480 train_time:148371ms step_avg:155.69ms step:964/1480 train_time:148541ms step_avg:155.70ms step:965/1480 train_time:148705ms step_avg:155.71ms step:966/1480 train_time:148869ms step_avg:155.72ms step:967/1480 train_time:149033ms step_avg:155.73ms step:968/1480 train_time:149198ms step_avg:155.74ms step:969/1480 train_time:149364ms step_avg:155.75ms step:970/1480 train_time:149527ms step_avg:155.76ms step:971/1480 train_time:149691ms step_avg:155.77ms step:972/1480 train_time:149855ms step_avg:155.77ms step:973/1480 train_time:150021ms step_avg:155.79ms step:974/1480 train_time:150189ms step_avg:155.80ms step:975/1480 train_time:150355ms step_avg:155.81ms step:976/1480 train_time:150521ms step_avg:155.82ms step:977/1480 train_time:150684ms step_avg:155.83ms step:978/1480 train_time:150850ms step_avg:155.84ms step:979/1480 train_time:151017ms step_avg:155.85ms step:980/1480 train_time:151183ms step_avg:155.86ms step:981/1480 train_time:151353ms step_avg:155.87ms step:982/1480 train_time:151518ms step_avg:155.88ms step:983/1480 train_time:151683ms step_avg:155.89ms step:984/1480 train_time:151846ms step_avg:155.90ms step:985/1480 train_time:152013ms step_avg:155.91ms step:986/1480 train_time:152180ms step_avg:155.92ms step:987/1480 train_time:152345ms step_avg:155.93ms step:988/1480 train_time:152511ms step_avg:155.94ms step:989/1480 train_time:152677ms step_avg:155.95ms step:990/1480 train_time:152848ms step_avg:155.97ms step:991/1480 train_time:153016ms step_avg:155.98ms 
step:992/1480 train_time:153188ms step_avg:156.00ms step:993/1480 train_time:153365ms step_avg:156.02ms step:994/1480 train_time:153529ms step_avg:156.03ms step:995/1480 train_time:153693ms step_avg:156.03ms step:996/1480 train_time:153855ms step_avg:156.04ms step:997/1480 train_time:154020ms step_avg:156.05ms step:998/1480 train_time:154184ms step_avg:156.06ms step:999/1480 train_time:154350ms step_avg:156.07ms step:1000/1480 train_time:154521ms step_avg:156.08ms step:1000/1480 val_loss:3.4473 train_time:154587ms step_avg:156.15ms step:1001/1480 train_time:154688ms step_avg:156.09ms step:1002/1480 train_time:154857ms step_avg:156.11ms step:1003/1480 train_time:155026ms step_avg:156.12ms step:1004/1480 train_time:155195ms step_avg:156.13ms step:1005/1480 train_time:155364ms step_avg:156.14ms step:1006/1480 train_time:155532ms step_avg:156.16ms step:1007/1480 train_time:155698ms step_avg:156.17ms step:1008/1480 train_time:155866ms step_avg:156.18ms step:1009/1480 train_time:156040ms step_avg:156.20ms step:1010/1480 train_time:156204ms step_avg:156.20ms step:1011/1480 train_time:156371ms step_avg:156.21ms step:1012/1480 train_time:156538ms step_avg:156.23ms step:1013/1480 train_time:156708ms step_avg:156.24ms step:1014/1480 train_time:156875ms step_avg:156.25ms step:1015/1480 train_time:157043ms step_avg:156.26ms step:1016/1480 train_time:157211ms step_avg:156.27ms step:1017/1480 train_time:157383ms step_avg:156.29ms step:1018/1480 train_time:157550ms step_avg:156.30ms step:1019/1480 train_time:157718ms step_avg:156.31ms step:1020/1480 train_time:157886ms step_avg:156.32ms step:1021/1480 train_time:158052ms step_avg:156.33ms step:1022/1480 train_time:158219ms step_avg:156.34ms step:1023/1480 train_time:158387ms step_avg:156.35ms step:1024/1480 train_time:158554ms step_avg:156.37ms step:1025/1480 train_time:158724ms step_avg:156.38ms step:1026/1480 train_time:158889ms step_avg:156.39ms step:1027/1480 train_time:159057ms step_avg:156.40ms step:1028/1480 
train_time:159227ms step_avg:156.41ms step:1029/1480 train_time:159403ms step_avg:156.43ms step:1030/1480 train_time:159572ms step_avg:156.44ms step:1031/1480 train_time:159737ms step_avg:156.45ms step:1032/1480 train_time:159906ms step_avg:156.46ms step:1033/1480 train_time:160072ms step_avg:156.47ms step:1034/1480 train_time:160241ms step_avg:156.49ms step:1035/1480 train_time:160410ms step_avg:156.50ms step:1036/1480 train_time:160577ms step_avg:156.51ms step:1037/1480 train_time:160744ms step_avg:156.52ms step:1038/1480 train_time:160913ms step_avg:156.53ms step:1039/1480 train_time:161082ms step_avg:156.54ms step:1040/1480 train_time:161249ms step_avg:156.55ms step:1041/1480 train_time:161417ms step_avg:156.56ms step:1042/1480 train_time:161581ms step_avg:156.57ms step:1043/1480 train_time:161746ms step_avg:156.58ms step:1044/1480 train_time:161913ms step_avg:156.59ms step:1045/1480 train_time:162082ms step_avg:156.60ms step:1046/1480 train_time:162250ms step_avg:156.61ms step:1047/1480 train_time:162417ms step_avg:156.62ms step:1048/1480 train_time:162583ms step_avg:156.63ms step:1049/1480 train_time:162748ms step_avg:156.64ms step:1050/1480 train_time:162919ms step_avg:156.65ms step:1051/1480 train_time:163088ms step_avg:156.66ms step:1052/1480 train_time:163257ms step_avg:156.68ms step:1053/1480 train_time:163423ms step_avg:156.69ms step:1054/1480 train_time:163591ms step_avg:156.70ms step:1055/1480 train_time:163758ms step_avg:156.71ms step:1056/1480 train_time:163922ms step_avg:156.71ms step:1057/1480 train_time:164089ms step_avg:156.72ms step:1058/1480 train_time:164260ms step_avg:156.74ms step:1059/1480 train_time:164432ms step_avg:156.75ms step:1060/1480 train_time:164601ms step_avg:156.76ms step:1061/1480 train_time:164764ms step_avg:156.77ms step:1062/1480 train_time:164930ms step_avg:156.78ms step:1063/1480 train_time:165097ms step_avg:156.79ms step:1064/1480 train_time:165261ms step_avg:156.79ms step:1065/1480 train_time:165427ms step_avg:156.80ms 
step:1066/1480 train_time:165595ms step_avg:156.81ms step:1067/1480 train_time:165763ms step_avg:156.82ms step:1068/1480 train_time:165928ms step_avg:156.83ms step:1069/1480 train_time:166099ms step_avg:156.84ms step:1070/1480 train_time:166265ms step_avg:156.85ms step:1071/1480 train_time:166439ms step_avg:156.87ms step:1072/1480 train_time:166605ms step_avg:156.88ms step:1073/1480 train_time:166769ms step_avg:156.89ms step:1074/1480 train_time:166937ms step_avg:156.90ms step:1075/1480 train_time:167109ms step_avg:156.91ms step:1076/1480 train_time:167278ms step_avg:156.92ms step:1077/1480 train_time:167444ms step_avg:156.93ms step:1078/1480 train_time:167619ms step_avg:156.95ms step:1079/1480 train_time:167791ms step_avg:156.96ms step:1080/1480 train_time:167961ms step_avg:156.97ms step:1081/1480 train_time:168127ms step_avg:156.98ms step:1082/1480 train_time:168293ms step_avg:156.99ms step:1083/1480 train_time:168461ms step_avg:157.00ms step:1084/1480 train_time:168627ms step_avg:157.01ms step:1085/1480 train_time:168795ms step_avg:157.02ms step:1086/1480 train_time:168964ms step_avg:157.03ms step:1087/1480 train_time:169131ms step_avg:157.04ms step:1088/1480 train_time:169302ms step_avg:157.05ms step:1089/1480 train_time:169477ms step_avg:157.07ms step:1090/1480 train_time:169648ms step_avg:157.08ms step:1091/1480 train_time:169815ms step_avg:157.09ms step:1092/1480 train_time:169983ms step_avg:157.10ms step:1093/1480 train_time:170149ms step_avg:157.11ms step:1094/1480 train_time:170316ms step_avg:157.12ms step:1095/1480 train_time:170482ms step_avg:157.13ms step:1096/1480 train_time:170651ms step_avg:157.14ms step:1097/1480 train_time:170819ms step_avg:157.15ms step:1098/1480 train_time:170988ms step_avg:157.16ms step:1099/1480 train_time:171161ms step_avg:157.17ms step:1100/1480 train_time:171333ms step_avg:157.19ms step:1101/1480 train_time:171503ms step_avg:157.20ms step:1102/1480 train_time:171675ms step_avg:157.21ms step:1103/1480 train_time:171849ms 
step_avg:157.23ms step:1104/1480 train_time:172016ms step_avg:157.24ms step:1105/1480 train_time:172185ms step_avg:157.25ms step:1106/1480 train_time:172355ms step_avg:157.26ms step:1107/1480 train_time:172523ms step_avg:157.27ms step:1108/1480 train_time:172687ms step_avg:157.27ms step:1109/1480 train_time:172855ms step_avg:157.28ms step:1110/1480 train_time:173020ms step_avg:157.29ms step:1111/1480 train_time:173187ms step_avg:157.30ms step:1112/1480 train_time:173358ms step_avg:157.31ms step:1113/1480 train_time:173536ms step_avg:157.33ms step:1114/1480 train_time:173707ms step_avg:157.34ms step:1115/1480 train_time:173880ms step_avg:157.36ms step:1116/1480 train_time:174046ms step_avg:157.37ms step:1117/1480 train_time:174219ms step_avg:157.38ms step:1118/1480 train_time:174392ms step_avg:157.39ms step:1119/1480 train_time:174559ms step_avg:157.40ms step:1120/1480 train_time:174727ms step_avg:157.41ms step:1121/1480 train_time:174898ms step_avg:157.42ms step:1122/1480 train_time:175064ms step_avg:157.43ms step:1123/1480 train_time:175231ms step_avg:157.44ms step:1124/1480 train_time:175400ms step_avg:157.45ms step:1125/1480 train_time:175566ms step_avg:157.46ms step:1125/1480 val_loss:3.3918 train_time:175634ms step_avg:157.52ms step:1126/1480 train_time:175737ms step_avg:157.47ms step:1127/1480 train_time:175908ms step_avg:157.48ms step:1128/1480 train_time:176080ms step_avg:157.50ms step:1129/1480 train_time:176253ms step_avg:157.51ms step:1130/1480 train_time:176422ms step_avg:157.52ms step:1131/1480 train_time:176602ms step_avg:157.54ms step:1132/1480 train_time:176767ms step_avg:157.55ms step:1133/1480 train_time:176941ms step_avg:157.56ms step:1134/1480 train_time:177111ms step_avg:157.57ms step:1135/1480 train_time:177281ms step_avg:157.58ms step:1136/1480 train_time:177450ms step_avg:157.59ms step:1137/1480 train_time:177620ms step_avg:157.60ms step:1138/1480 train_time:177790ms step_avg:157.62ms step:1139/1480 train_time:177960ms step_avg:157.63ms 
step:1140/1480 train_time:178127ms step_avg:157.63ms step:1141/1480 train_time:178301ms step_avg:157.65ms step:1142/1480 train_time:178468ms step_avg:157.66ms step:1143/1480 train_time:178640ms step_avg:157.67ms step:1144/1480 train_time:178807ms step_avg:157.68ms step:1145/1480 train_time:178973ms step_avg:157.69ms step:1146/1480 train_time:179142ms step_avg:157.70ms step:1147/1480 train_time:179309ms step_avg:157.70ms step:1148/1480 train_time:179479ms step_avg:157.71ms step:1149/1480 train_time:179648ms step_avg:157.72ms step:1150/1480 train_time:179817ms step_avg:157.73ms step:1151/1480 train_time:179989ms step_avg:157.75ms step:1152/1480 train_time:180162ms step_avg:157.76ms step:1153/1480 train_time:180335ms step_avg:157.77ms step:1154/1480 train_time:180502ms step_avg:157.78ms step:1155/1480 train_time:180673ms step_avg:157.79ms step:1156/1480 train_time:180850ms step_avg:157.81ms step:1157/1480 train_time:181020ms step_avg:157.82ms step:1158/1480 train_time:181187ms step_avg:157.83ms step:1159/1480 train_time:181356ms step_avg:157.84ms step:1160/1480 train_time:181521ms step_avg:157.84ms step:1161/1480 train_time:181691ms step_avg:157.85ms step:1162/1480 train_time:181861ms step_avg:157.87ms step:1163/1480 train_time:182030ms step_avg:157.87ms step:1164/1480 train_time:182199ms step_avg:157.88ms step:1165/1480 train_time:182364ms step_avg:157.89ms step:1166/1480 train_time:182534ms step_avg:157.90ms step:1167/1480 train_time:182704ms step_avg:157.91ms step:1168/1480 train_time:182874ms step_avg:157.92ms step:1169/1480 train_time:183042ms step_avg:157.93ms step:1170/1480 train_time:183210ms step_avg:157.94ms step:1171/1480 train_time:183378ms step_avg:157.95ms step:1172/1480 train_time:183544ms step_avg:157.96ms step:1173/1480 train_time:183716ms step_avg:157.97ms step:1174/1480 train_time:183900ms step_avg:157.99ms step:1175/1480 train_time:184069ms step_avg:158.00ms step:1176/1480 train_time:184241ms step_avg:158.01ms step:1177/1480 train_time:184417ms 
step_avg:158.03ms step:1178/1480 train_time:184586ms step_avg:158.04ms step:1179/1480 train_time:184751ms step_avg:158.04ms step:1180/1480 train_time:184930ms step_avg:158.06ms step:1181/1480 train_time:185101ms step_avg:158.07ms step:1182/1480 train_time:185267ms step_avg:158.08ms step:1183/1480 train_time:185437ms step_avg:158.09ms step:1184/1480 train_time:185605ms step_avg:158.10ms step:1185/1480 train_time:185781ms step_avg:158.11ms step:1186/1480 train_time:185953ms step_avg:158.12ms step:1187/1480 train_time:186138ms step_avg:158.15ms step:1188/1480 train_time:186305ms step_avg:158.15ms step:1189/1480 train_time:186476ms step_avg:158.16ms step:1190/1480 train_time:186643ms step_avg:158.17ms step:1191/1480 train_time:186815ms step_avg:158.18ms step:1192/1480 train_time:186982ms step_avg:158.19ms step:1193/1480 train_time:187148ms step_avg:158.20ms step:1194/1480 train_time:187318ms step_avg:158.21ms step:1195/1480 train_time:187493ms step_avg:158.22ms step:1196/1480 train_time:187675ms step_avg:158.24ms step:1197/1480 train_time:187845ms step_avg:158.25ms step:1198/1480 train_time:188028ms step_avg:158.27ms step:1199/1480 train_time:188198ms step_avg:158.28ms step:1200/1480 train_time:188366ms step_avg:158.29ms step:1201/1480 train_time:188533ms step_avg:158.30ms step:1202/1480 train_time:188714ms step_avg:158.32ms step:1203/1480 train_time:188888ms step_avg:158.33ms step:1204/1480 train_time:189061ms step_avg:158.34ms step:1205/1480 train_time:189229ms step_avg:158.35ms step:1206/1480 train_time:189397ms step_avg:158.36ms step:1207/1480 train_time:189567ms step_avg:158.37ms step:1208/1480 train_time:189734ms step_avg:158.38ms step:1209/1480 train_time:189908ms step_avg:158.39ms step:1210/1480 train_time:190084ms step_avg:158.40ms step:1211/1480 train_time:190258ms step_avg:158.42ms step:1212/1480 train_time:190431ms step_avg:158.43ms step:1213/1480 train_time:190604ms step_avg:158.44ms step:1214/1480 train_time:190782ms step_avg:158.46ms step:1215/1480 
train_time:190955ms step_avg:158.47ms step:1216/1480 train_time:191124ms step_avg:158.48ms step:1217/1480 train_time:191299ms step_avg:158.49ms step:1218/1480 train_time:191467ms step_avg:158.50ms step:1219/1480 train_time:191646ms step_avg:158.52ms step:1220/1480 train_time:191817ms step_avg:158.53ms step:1221/1480 train_time:191987ms step_avg:158.54ms step:1222/1480 train_time:192155ms step_avg:158.54ms step:1223/1480 train_time:192324ms step_avg:158.55ms step:1224/1480 train_time:192503ms step_avg:158.57ms step:1225/1480 train_time:192675ms step_avg:158.58ms step:1226/1480 train_time:192848ms step_avg:158.59ms step:1227/1480 train_time:193021ms step_avg:158.60ms step:1228/1480 train_time:193191ms step_avg:158.61ms step:1229/1480 train_time:193364ms step_avg:158.62ms step:1230/1480 train_time:193543ms step_avg:158.64ms step:1231/1480 train_time:193718ms step_avg:158.66ms step:1232/1480 train_time:193893ms step_avg:158.67ms step:1233/1480 train_time:194063ms step_avg:158.68ms step:1234/1480 train_time:194232ms step_avg:158.69ms step:1235/1480 train_time:194406ms step_avg:158.70ms step:1236/1480 train_time:194575ms step_avg:158.71ms step:1237/1480 train_time:194747ms step_avg:158.72ms step:1238/1480 train_time:194933ms step_avg:158.74ms step:1239/1480 train_time:195106ms step_avg:158.75ms step:1240/1480 train_time:195277ms step_avg:158.76ms step:1241/1480 train_time:195450ms step_avg:158.77ms step:1242/1480 train_time:195621ms step_avg:158.78ms step:1243/1480 train_time:195796ms step_avg:158.80ms step:1244/1480 train_time:195962ms step_avg:158.80ms step:1245/1480 train_time:196130ms step_avg:158.81ms step:1246/1480 train_time:196301ms step_avg:158.82ms step:1247/1480 train_time:196470ms step_avg:158.83ms step:1248/1480 train_time:196639ms step_avg:158.84ms step:1249/1480 train_time:196806ms step_avg:158.84ms step:1250/1480 train_time:196977ms step_avg:158.85ms step:1250/1480 val_loss:3.3413 train_time:197049ms step_avg:158.91ms step:1251/1480 train_time:197156ms 
step_avg:158.87ms step:1252/1480 train_time:197325ms step_avg:158.88ms step:1253/1480 train_time:197492ms step_avg:158.88ms step:1254/1480 train_time:197663ms step_avg:158.89ms step:1255/1480 train_time:197849ms step_avg:158.91ms step:1256/1480 train_time:198025ms step_avg:158.93ms step:1257/1480 train_time:198194ms step_avg:158.94ms step:1258/1480 train_time:198369ms step_avg:158.95ms step:1259/1480 train_time:198540ms step_avg:158.96ms step:1260/1480 train_time:198707ms step_avg:158.97ms step:1261/1480 train_time:198879ms step_avg:158.98ms step:1262/1480 train_time:199057ms step_avg:158.99ms step:1263/1480 train_time:199231ms step_avg:159.00ms step:1264/1480 train_time:199398ms step_avg:159.01ms step:1265/1480 train_time:199565ms step_avg:159.02ms step:1266/1480 train_time:199735ms step_avg:159.03ms step:1267/1480 train_time:199908ms step_avg:159.04ms step:1268/1480 train_time:200078ms step_avg:159.04ms step:1269/1480 train_time:200254ms step_avg:159.06ms step:1270/1480 train_time:200424ms step_avg:159.07ms step:1271/1480 train_time:200592ms step_avg:159.07ms step:1272/1480 train_time:200759ms step_avg:159.08ms step:1273/1480 train_time:200930ms step_avg:159.09ms step:1274/1480 train_time:201104ms step_avg:159.10ms step:1275/1480 train_time:201270ms step_avg:159.11ms step:1276/1480 train_time:201435ms step_avg:159.11ms step:1277/1480 train_time:201608ms step_avg:159.12ms step:1278/1480 train_time:201775ms step_avg:159.13ms step:1279/1480 train_time:201949ms step_avg:159.14ms step:1280/1480 train_time:202128ms step_avg:159.16ms step:1281/1480 train_time:202297ms step_avg:159.16ms step:1282/1480 train_time:202464ms step_avg:159.17ms step:1283/1480 train_time:202634ms step_avg:159.18ms step:1284/1480 train_time:202804ms step_avg:159.19ms step:1285/1480 train_time:202973ms step_avg:159.19ms step:1286/1480 train_time:203144ms step_avg:159.20ms step:1287/1480 train_time:203317ms step_avg:159.21ms step:1288/1480 train_time:203488ms step_avg:159.22ms step:1289/1480 
train_time:203671ms step_avg:159.24ms step:1290/1480 train_time:203851ms step_avg:159.26ms step:1291/1480 train_time:204026ms step_avg:159.27ms step:1292/1480 train_time:204200ms step_avg:159.28ms step:1293/1480 train_time:204375ms step_avg:159.29ms step:1294/1480 train_time:204547ms step_avg:159.30ms step:1295/1480 train_time:204717ms step_avg:159.31ms step:1296/1480 train_time:204889ms step_avg:159.32ms step:1297/1480 train_time:205061ms step_avg:159.33ms step:1298/1480 train_time:205231ms step_avg:159.34ms step:1299/1480 train_time:205402ms step_avg:159.35ms step:1300/1480 train_time:205568ms step_avg:159.36ms step:1301/1480 train_time:205737ms step_avg:159.36ms step:1302/1480 train_time:205912ms step_avg:159.37ms step:1303/1480 train_time:206089ms step_avg:159.39ms step:1304/1480 train_time:206263ms step_avg:159.40ms step:1305/1480 train_time:206432ms step_avg:159.41ms step:1306/1480 train_time:206607ms step_avg:159.42ms step:1307/1480 train_time:206775ms step_avg:159.43ms step:1308/1480 train_time:206945ms step_avg:159.43ms step:1309/1480 train_time:207116ms step_avg:159.44ms step:1310/1480 train_time:207287ms step_avg:159.45ms step:1311/1480 train_time:207455ms step_avg:159.46ms step:1312/1480 train_time:207630ms step_avg:159.47ms step:1313/1480 train_time:207800ms step_avg:159.48ms step:1314/1480 train_time:207972ms step_avg:159.49ms step:1315/1480 train_time:208142ms step_avg:159.50ms step:1316/1480 train_time:208309ms step_avg:159.50ms step:1317/1480 train_time:208480ms step_avg:159.51ms step:1318/1480 train_time:208662ms step_avg:159.53ms step:1319/1480 train_time:208838ms step_avg:159.54ms step:1320/1480 train_time:209014ms step_avg:159.55ms step:1321/1480 train_time:209187ms step_avg:159.56ms step:1322/1480 train_time:209367ms step_avg:159.58ms step:1323/1480 train_time:209538ms step_avg:159.59ms step:1324/1480 train_time:209715ms step_avg:159.60ms step:1325/1480 train_time:209896ms step_avg:159.62ms step:1326/1480 train_time:210071ms step_avg:159.63ms 
step:1327/1480 train_time:210240ms step_avg:159.64ms step:1328/1480 train_time:210411ms step_avg:159.64ms step:1329/1480 train_time:210608ms step_avg:159.67ms step:1330/1480 train_time:210788ms step_avg:159.69ms step:1331/1480 train_time:210959ms step_avg:159.70ms step:1332/1480 train_time:211132ms step_avg:159.71ms step:1333/1480 train_time:211307ms step_avg:159.72ms step:1334/1480 train_time:211477ms step_avg:159.73ms step:1335/1480 train_time:211646ms step_avg:159.73ms step:1336/1480 train_time:211830ms step_avg:159.75ms step:1337/1480 train_time:212007ms step_avg:159.76ms step:1338/1480 train_time:212178ms step_avg:159.77ms step:1339/1480 train_time:212352ms step_avg:159.78ms step:1340/1480 train_time:212525ms step_avg:159.79ms step:1341/1480 train_time:212692ms step_avg:159.80ms step:1342/1480 train_time:212866ms step_avg:159.81ms step:1343/1480 train_time:213034ms step_avg:159.82ms step:1344/1480 train_time:213207ms step_avg:159.83ms step:1345/1480 train_time:213384ms step_avg:159.84ms step:1346/1480 train_time:213552ms step_avg:159.84ms step:1347/1480 train_time:213723ms step_avg:159.85ms step:1348/1480 train_time:213892ms step_avg:159.86ms step:1349/1480 train_time:214062ms step_avg:159.87ms step:1350/1480 train_time:214235ms step_avg:159.88ms step:1351/1480 train_time:214407ms step_avg:159.89ms step:1352/1480 train_time:214576ms step_avg:159.89ms step:1353/1480 train_time:214751ms step_avg:159.90ms step:1354/1480 train_time:214923ms step_avg:159.91ms step:1355/1480 train_time:215091ms step_avg:159.92ms step:1356/1480 train_time:215265ms step_avg:159.93ms step:1357/1480 train_time:215439ms step_avg:159.94ms step:1358/1480 train_time:215610ms step_avg:159.95ms step:1359/1480 train_time:215782ms step_avg:159.96ms step:1360/1480 train_time:215957ms step_avg:159.97ms step:1361/1480 train_time:216134ms step_avg:159.98ms step:1362/1480 train_time:216309ms step_avg:159.99ms step:1363/1480 train_time:216489ms step_avg:160.01ms step:1364/1480 train_time:216659ms 
step_avg:160.01ms step:1365/1480 train_time:216826ms step_avg:160.02ms step:1366/1480 train_time:216997ms step_avg:160.03ms step:1367/1480 train_time:217168ms step_avg:160.04ms step:1368/1480 train_time:217341ms step_avg:160.04ms step:1369/1480 train_time:217522ms step_avg:160.06ms step:1370/1480 train_time:217701ms step_avg:160.07ms step:1371/1480 train_time:217872ms step_avg:160.08ms step:1372/1480 train_time:218050ms step_avg:160.10ms step:1373/1480 train_time:218220ms step_avg:160.10ms step:1374/1480 train_time:218396ms step_avg:160.11ms step:1375/1480 train_time:218567ms step_avg:160.12ms step:1375/1480 val_loss:3.3024 train_time:218634ms step_avg:160.17ms step:1376/1480 train_time:218742ms step_avg:160.13ms step:1377/1480 train_time:218912ms step_avg:160.14ms step:1378/1480 train_time:219081ms step_avg:160.15ms step:1379/1480 train_time:219255ms step_avg:160.16ms step:1380/1480 train_time:219428ms step_avg:160.17ms step:1381/1480 train_time:219609ms step_avg:160.18ms step:1382/1480 train_time:219780ms step_avg:160.19ms step:1383/1480 train_time:219950ms step_avg:160.20ms step:1384/1480 train_time:220129ms step_avg:160.21ms step:1385/1480 train_time:220295ms step_avg:160.21ms step:1386/1480 train_time:220465ms step_avg:160.22ms step:1387/1480 train_time:220639ms step_avg:160.23ms step:1388/1480 train_time:220808ms step_avg:160.24ms step:1389/1480 train_time:220983ms step_avg:160.25ms step:1390/1480 train_time:221151ms step_avg:160.25ms step:1391/1480 train_time:221323ms step_avg:160.26ms step:1392/1480 train_time:221497ms step_avg:160.27ms step:1393/1480 train_time:221668ms step_avg:160.28ms step:1394/1480 train_time:221841ms step_avg:160.29ms step:1395/1480 train_time:222009ms step_avg:160.30ms step:1396/1480 train_time:222178ms step_avg:160.30ms step:1397/1480 train_time:222346ms step_avg:160.31ms step:1398/1480 train_time:222514ms step_avg:160.31ms step:1399/1480 train_time:222683ms step_avg:160.32ms step:1400/1480 train_time:222861ms step_avg:160.33ms 
step:1401/1480 train_time:223026ms step_avg:160.34ms step:1402/1480 train_time:223199ms step_avg:160.34ms step:1403/1480 train_time:223376ms step_avg:160.36ms step:1404/1480 train_time:223547ms step_avg:160.36ms step:1405/1480 train_time:223723ms step_avg:160.38ms step:1406/1480 train_time:223897ms step_avg:160.38ms step:1407/1480 train_time:224065ms step_avg:160.39ms step:1408/1480 train_time:224232ms step_avg:160.40ms step:1409/1480 train_time:224415ms step_avg:160.41ms step:1410/1480 train_time:224584ms step_avg:160.42ms step:1411/1480 train_time:224750ms step_avg:160.42ms step:1412/1480 train_time:224922ms step_avg:160.43ms step:1413/1480 train_time:225093ms step_avg:160.44ms step:1414/1480 train_time:225265ms step_avg:160.45ms step:1415/1480 train_time:225442ms step_avg:160.46ms step:1416/1480 train_time:225629ms step_avg:160.48ms step:1417/1480 train_time:225804ms step_avg:160.49ms step:1418/1480 train_time:225977ms step_avg:160.49ms step:1419/1480 train_time:226150ms step_avg:160.50ms step:1420/1480 train_time:226326ms step_avg:160.51ms step:1421/1480 train_time:226500ms step_avg:160.52ms step:1422/1480 train_time:226670ms step_avg:160.53ms step:1423/1480 train_time:226840ms step_avg:160.54ms step:1424/1480 train_time:227014ms step_avg:160.55ms step:1425/1480 train_time:227193ms step_avg:160.56ms step:1426/1480 train_time:227365ms step_avg:160.57ms step:1427/1480 train_time:227542ms step_avg:160.58ms step:1428/1480 train_time:227712ms step_avg:160.59ms step:1429/1480 train_time:227881ms step_avg:160.59ms step:1430/1480 train_time:228054ms step_avg:160.60ms step:1431/1480 train_time:228230ms step_avg:160.61ms step:1432/1480 train_time:228406ms step_avg:160.62ms step:1433/1480 train_time:228586ms step_avg:160.64ms step:1434/1480 train_time:228766ms step_avg:160.65ms step:1435/1480 train_time:228941ms step_avg:160.66ms step:1436/1480 train_time:229112ms step_avg:160.67ms step:1437/1480 train_time:229283ms step_avg:160.67ms step:1438/1480 train_time:229451ms 
step_avg:160.68ms step:1439/1480 train_time:229626ms step_avg:160.69ms step:1440/1480 train_time:229795ms step_avg:160.70ms step:1441/1480 train_time:229965ms step_avg:160.70ms step:1442/1480 train_time:230144ms step_avg:160.72ms step:1443/1480 train_time:230330ms step_avg:160.73ms step:1444/1480 train_time:230501ms step_avg:160.74ms step:1445/1480 train_time:230672ms step_avg:160.75ms step:1446/1480 train_time:230848ms step_avg:160.76ms step:1447/1480 train_time:231026ms step_avg:160.77ms step:1448/1480 train_time:231196ms step_avg:160.78ms step:1449/1480 train_time:231370ms step_avg:160.79ms step:1450/1480 train_time:231543ms step_avg:160.79ms step:1451/1480 train_time:231713ms step_avg:160.80ms step:1452/1480 train_time:231885ms step_avg:160.81ms step:1453/1480 train_time:232053ms step_avg:160.81ms step:1454/1480 train_time:232227ms step_avg:160.82ms step:1455/1480 train_time:232405ms step_avg:160.83ms step:1456/1480 train_time:232579ms step_avg:160.84ms step:1457/1480 train_time:232751ms step_avg:160.85ms step:1458/1480 train_time:232923ms step_avg:160.86ms step:1459/1480 train_time:233100ms step_avg:160.87ms step:1460/1480 train_time:233269ms step_avg:160.88ms step:1461/1480 train_time:233444ms step_avg:160.89ms step:1462/1480 train_time:233614ms step_avg:160.89ms step:1463/1480 train_time:233791ms step_avg:160.90ms step:1464/1480 train_time:233966ms step_avg:160.91ms step:1465/1480 train_time:234141ms step_avg:160.92ms step:1466/1480 train_time:234311ms step_avg:160.93ms step:1467/1480 train_time:234484ms step_avg:160.94ms step:1468/1480 train_time:234654ms step_avg:160.94ms step:1469/1480 train_time:234828ms step_avg:160.95ms step:1470/1480 train_time:235008ms step_avg:160.96ms step:1471/1480 train_time:235192ms step_avg:160.98ms step:1472/1480 train_time:235371ms step_avg:160.99ms step:1473/1480 train_time:235543ms step_avg:161.00ms step:1474/1480 train_time:235720ms step_avg:161.01ms step:1475/1480 train_time:235898ms step_avg:161.02ms step:1476/1480 
train_time:236070ms step_avg:161.03ms step:1477/1480 train_time:236254ms step_avg:161.05ms step:1478/1480 train_time:236438ms step_avg:161.06ms step:1479/1480 train_time:236612ms step_avg:161.07ms step:1480/1480 train_time:236784ms step_avg:161.08ms step:1480/1480 val_loss:3.2835 train_time:236856ms step_avg:161.13ms