import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied to a 4D tensor whose dim 1 is the sequence
    and whose last dim (dim 3) is split into two halves that are rotated together.
    The cos/sin tables are cached and only rebuilt when the sequence length changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # Broadcast the (seq, dim/2) tables over dims 0 and 2 of x.
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention using FlexAttention, with QK normalization, rotary
    embeddings, and a learned mix between the layer's own values and an external
    value embedding `vi`.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with (B, n_head, T, head_dim) tensors, hence the transposes.
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer block: a learned mix of the running activation `x` with the
    initial embedding `x0`, followed by pre-norm attention and pre-norm MLP,
    each added residually.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size: int = 50304
    n_layer: int = 12
    n_head: int = 6 # head dim 128 suggested by @Grad62304977
    n_embd: int = 768
    lm_head_softcap: int = 30

class GPT(nn.Module):
    """
    GPT model with a U-net layout: the first `n_layer // 2` blocks act as an
    encoder whose outputs are added, with learned weights, to the inputs of the
    remaining decoder blocks in reverse order. `forward` returns the
    cross-entropy loss directly.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the training loss for one flat token sequence.

        Arguments:
            idx: 1D tensor of input token ids; its length must be a multiple of 128.
            target: Target token ids, flattened before the cross-entropy.
            sliding_window: Tensor holding the attention window size in tokens.

        Returns the mean cross-entropy loss as a float32 scalar tensor.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # The block-level mask below reshapes the sequence into BLOCK_SIZE chunks.
        assert len(idx) % BLOCK_SIZE == 0, "sequence length must be a multiple of BLOCK_SIZE"
        # Running count of token id 50256 gives a per-token document id
        # (presumably 50256 is the GPT-2 end-of-text token - confirm against the tokenizer).
        docs = (idx == 50256).cumsum(0)
        # Document id of the first and last token of each block.
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # Token-level rule: causal, same document, and within the sliding window.
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the rule above: a (q block, kv block) pair is kept
            # if it is causal, the two blocks' document-id ranges overlap, and the
            # block distance is within the window rounded up to whole blocks.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device=idx.device)
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # Stable descending sort puts the kept kv-block indices first, in order.
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # One value-embedding chunk per encoder layer; decoder layers reuse them in reverse.
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256-int32 header into a pinned tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Cycles through the token shards matching a glob pattern (relative to the
    current working directory). Each process reads its own T-token slice of
    every `num_processes * T`-token global batch.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) for this process; targets are the inputs shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin: str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin: str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size: int = 8 # batch size, in sequences, across all devices
    sequence_length: int = 64*1024 # sequence length, in tokens
    num_iterations: int = 1480 # number of iterations to run
    warmup_iters: int = 0
    cooldown_iters: int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay: float = 0
    # evaluation and logging hyperparams
    val_loss_every: int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every: int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)  # per-run directory; only needed for checkpoint files
    logfile = 'logs/%s.txt' % run_id
    # make sure the directory holding the log file exists before opening it
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """
    On the master process, append `s` to the log file and, unless `logonly`
    is set, also print it to stdout. Does nothing on other processes.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop performs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches the next one after each backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 12:45:57 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 109W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 120W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22741ms step_avg:nanms step:2/1480 train_time:22902ms step_avg:nanms step:3/1480 train_time:23040ms step_avg:nanms step:4/1480 train_time:23180ms step_avg:nanms step:5/1480 train_time:23323ms step_avg:nanms step:6/1480 train_time:23465ms step_avg:nanms step:7/1480 train_time:23607ms step_avg:nanms step:8/1480 train_time:23751ms step_avg:nanms step:9/1480 train_time:23897ms step_avg:nanms step:10/1480 train_time:24040ms step_avg:nanms step:11/1480 train_time:140ms step_avg:nanms step:12/1480 train_time:282ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.80ms step:14/1480 train_time:568ms step_avg:141.96ms step:15/1480 train_time:712ms step_avg:142.47ms step:16/1480 train_time:855ms step_avg:142.57ms step:17/1480 train_time:998ms step_avg:142.60ms step:18/1480 train_time:1140ms step_avg:142.48ms step:19/1480 train_time:1282ms step_avg:142.44ms step:20/1480 train_time:1424ms step_avg:142.36ms step:21/1480 train_time:1567ms step_avg:142.49ms step:22/1480 train_time:1712ms step_avg:142.63ms step:23/1480 train_time:1855ms step_avg:142.72ms step:24/1480 train_time:1998ms step_avg:142.69ms step:25/1480 train_time:2139ms step_avg:142.59ms step:26/1480 train_time:2281ms step_avg:142.57ms step:27/1480 train_time:2423ms step_avg:142.53ms step:28/1480 train_time:2567ms step_avg:142.59ms step:29/1480 
train_time:2711ms step_avg:142.71ms step:30/1480 train_time:2855ms step_avg:142.77ms step:31/1480 train_time:2997ms step_avg:142.72ms step:32/1480 train_time:3138ms step_avg:142.65ms step:33/1480 train_time:3280ms step_avg:142.60ms step:34/1480 train_time:3423ms step_avg:142.62ms step:35/1480 train_time:3567ms step_avg:142.67ms step:36/1480 train_time:3712ms step_avg:142.75ms step:37/1480 train_time:3855ms step_avg:142.79ms step:38/1480 train_time:3997ms step_avg:142.76ms step:39/1480 train_time:4138ms step_avg:142.70ms step:40/1480 train_time:4281ms step_avg:142.70ms step:41/1480 train_time:4424ms step_avg:142.71ms step:42/1480 train_time:4568ms step_avg:142.74ms step:43/1480 train_time:4712ms step_avg:142.79ms step:44/1480 train_time:4856ms step_avg:142.81ms step:45/1480 train_time:4999ms step_avg:142.82ms step:46/1480 train_time:5139ms step_avg:142.75ms step:47/1480 train_time:5282ms step_avg:142.74ms step:48/1480 train_time:5423ms step_avg:142.72ms step:49/1480 train_time:5567ms step_avg:142.75ms step:50/1480 train_time:5712ms step_avg:142.79ms step:51/1480 train_time:5855ms step_avg:142.80ms step:52/1480 train_time:5997ms step_avg:142.78ms step:53/1480 train_time:6138ms step_avg:142.74ms step:54/1480 train_time:6280ms step_avg:142.73ms step:55/1480 train_time:6424ms step_avg:142.75ms step:56/1480 train_time:6568ms step_avg:142.78ms step:57/1480 train_time:6712ms step_avg:142.80ms step:58/1480 train_time:6854ms step_avg:142.79ms step:59/1480 train_time:6997ms step_avg:142.80ms step:60/1480 train_time:7138ms step_avg:142.75ms step:61/1480 train_time:7279ms step_avg:142.73ms step:62/1480 train_time:7423ms step_avg:142.75ms step:63/1480 train_time:7567ms step_avg:142.78ms step:64/1480 train_time:7711ms step_avg:142.79ms step:65/1480 train_time:7853ms step_avg:142.79ms step:66/1480 train_time:7996ms step_avg:142.78ms step:67/1480 train_time:8137ms step_avg:142.75ms step:68/1480 train_time:8278ms step_avg:142.73ms step:69/1480 train_time:8422ms step_avg:142.74ms 
step:70/1480 train_time:8565ms step_avg:142.75ms step:71/1480 train_time:8709ms step_avg:142.78ms step:72/1480 train_time:8852ms step_avg:142.78ms step:73/1480 train_time:8995ms step_avg:142.78ms step:74/1480 train_time:9136ms step_avg:142.75ms step:75/1480 train_time:9277ms step_avg:142.73ms step:76/1480 train_time:9421ms step_avg:142.74ms step:77/1480 train_time:9564ms step_avg:142.74ms step:78/1480 train_time:9708ms step_avg:142.76ms step:79/1480 train_time:9852ms step_avg:142.78ms step:80/1480 train_time:9994ms step_avg:142.77ms step:81/1480 train_time:10138ms step_avg:142.79ms step:82/1480 train_time:10276ms step_avg:142.73ms step:83/1480 train_time:10418ms step_avg:142.72ms step:84/1480 train_time:10560ms step_avg:142.70ms step:85/1480 train_time:10704ms step_avg:142.73ms step:86/1480 train_time:10848ms step_avg:142.74ms step:87/1480 train_time:10991ms step_avg:142.74ms step:88/1480 train_time:11134ms step_avg:142.75ms step:89/1480 train_time:11276ms step_avg:142.73ms step:90/1480 train_time:11418ms step_avg:142.73ms step:91/1480 train_time:11561ms step_avg:142.72ms step:92/1480 train_time:11704ms step_avg:142.73ms step:93/1480 train_time:11847ms step_avg:142.74ms step:94/1480 train_time:11990ms step_avg:142.74ms step:95/1480 train_time:12133ms step_avg:142.74ms step:96/1480 train_time:12274ms step_avg:142.72ms step:97/1480 train_time:12416ms step_avg:142.72ms step:98/1480 train_time:12558ms step_avg:142.71ms step:99/1480 train_time:12699ms step_avg:142.69ms step:100/1480 train_time:12841ms step_avg:142.68ms step:101/1480 train_time:12986ms step_avg:142.71ms step:102/1480 train_time:13129ms step_avg:142.71ms step:103/1480 train_time:13272ms step_avg:142.71ms step:104/1480 train_time:13415ms step_avg:142.71ms step:105/1480 train_time:13556ms step_avg:142.70ms step:106/1480 train_time:13698ms step_avg:142.68ms step:107/1480 train_time:13841ms step_avg:142.70ms step:108/1480 train_time:13983ms step_avg:142.69ms step:109/1480 train_time:14127ms step_avg:142.69ms 
step:110/1480 train_time:14269ms step_avg:142.69ms step:111/1480 train_time:14415ms step_avg:142.72ms step:112/1480 train_time:14559ms step_avg:142.74ms step:113/1480 train_time:14708ms step_avg:142.80ms step:114/1480 train_time:14855ms step_avg:142.83ms step:115/1480 train_time:15001ms step_avg:142.87ms step:116/1480 train_time:15149ms step_avg:142.92ms step:117/1480 train_time:15298ms step_avg:142.97ms step:118/1480 train_time:15444ms step_avg:143.00ms step:119/1480 train_time:15591ms step_avg:143.04ms step:120/1480 train_time:15738ms step_avg:143.07ms step:121/1480 train_time:15883ms step_avg:143.09ms step:122/1480 train_time:16031ms step_avg:143.13ms step:123/1480 train_time:16178ms step_avg:143.17ms step:124/1480 train_time:16325ms step_avg:143.20ms step:125/1480 train_time:16472ms step_avg:143.24ms step:125/1480 val_loss:4.4160 train_time:16530ms step_avg:143.74ms step:126/1480 train_time:16625ms step_avg:143.32ms step:127/1480 train_time:16775ms step_avg:143.38ms step:128/1480 train_time:16922ms step_avg:143.40ms step:129/1480 train_time:17067ms step_avg:143.42ms step:130/1480 train_time:17214ms step_avg:143.45ms step:131/1480 train_time:17360ms step_avg:143.47ms step:132/1480 train_time:17505ms step_avg:143.48ms step:133/1480 train_time:17655ms step_avg:143.54ms step:134/1480 train_time:17802ms step_avg:143.57ms step:135/1480 train_time:17951ms step_avg:143.61ms step:136/1480 train_time:18098ms step_avg:143.64ms step:137/1480 train_time:18244ms step_avg:143.66ms step:138/1480 train_time:18392ms step_avg:143.68ms step:139/1480 train_time:18539ms step_avg:143.71ms step:140/1480 train_time:18686ms step_avg:143.74ms step:141/1480 train_time:18834ms step_avg:143.77ms step:142/1480 train_time:18981ms step_avg:143.80ms step:143/1480 train_time:19128ms step_avg:143.82ms step:144/1480 train_time:19276ms step_avg:143.85ms step:145/1480 train_time:19423ms step_avg:143.87ms step:146/1480 train_time:19570ms step_avg:143.90ms step:147/1480 train_time:19718ms 
step_avg:143.93ms step:148/1480 train_time:19864ms step_avg:143.94ms step:149/1480 train_time:20013ms step_avg:143.98ms step:150/1480 train_time:20160ms step_avg:144.00ms step:151/1480 train_time:20308ms step_avg:144.03ms step:152/1480 train_time:20456ms step_avg:144.06ms step:153/1480 train_time:20602ms step_avg:144.07ms step:154/1480 train_time:20748ms step_avg:144.09ms step:155/1480 train_time:20895ms step_avg:144.10ms step:156/1480 train_time:21041ms step_avg:144.12ms step:157/1480 train_time:21188ms step_avg:144.14ms step:158/1480 train_time:21335ms step_avg:144.16ms step:159/1480 train_time:21481ms step_avg:144.17ms step:160/1480 train_time:21627ms step_avg:144.18ms step:161/1480 train_time:21775ms step_avg:144.20ms step:162/1480 train_time:21921ms step_avg:144.22ms step:163/1480 train_time:22068ms step_avg:144.23ms step:164/1480 train_time:22215ms step_avg:144.25ms step:165/1480 train_time:22361ms step_avg:144.26ms step:166/1480 train_time:22507ms step_avg:144.27ms step:167/1480 train_time:22654ms step_avg:144.30ms step:168/1480 train_time:22801ms step_avg:144.31ms step:169/1480 train_time:22948ms step_avg:144.33ms step:170/1480 train_time:23096ms step_avg:144.35ms step:171/1480 train_time:23241ms step_avg:144.36ms step:172/1480 train_time:23388ms step_avg:144.37ms step:173/1480 train_time:23535ms step_avg:144.39ms step:174/1480 train_time:23681ms step_avg:144.40ms step:175/1480 train_time:23828ms step_avg:144.41ms step:176/1480 train_time:23976ms step_avg:144.43ms step:177/1480 train_time:24123ms step_avg:144.45ms step:178/1480 train_time:24268ms step_avg:144.45ms step:179/1480 train_time:24416ms step_avg:144.47ms step:180/1480 train_time:24561ms step_avg:144.48ms step:181/1480 train_time:24708ms step_avg:144.49ms step:182/1480 train_time:24856ms step_avg:144.51ms step:183/1480 train_time:25002ms step_avg:144.52ms step:184/1480 train_time:25149ms step_avg:144.53ms step:185/1480 train_time:25296ms step_avg:144.55ms step:186/1480 train_time:25443ms 
step_avg:144.56ms step:187/1480 train_time:25590ms step_avg:144.58ms step:188/1480 train_time:25738ms step_avg:144.60ms step:189/1480 train_time:25884ms step_avg:144.61ms step:190/1480 train_time:26032ms step_avg:144.62ms step:191/1480 train_time:26180ms step_avg:144.64ms step:192/1480 train_time:26326ms step_avg:144.65ms step:193/1480 train_time:26472ms step_avg:144.66ms step:194/1480 train_time:26620ms step_avg:144.67ms step:195/1480 train_time:26765ms step_avg:144.67ms step:196/1480 train_time:26912ms step_avg:144.69ms step:197/1480 train_time:27059ms step_avg:144.70ms step:198/1480 train_time:27205ms step_avg:144.71ms step:199/1480 train_time:27352ms step_avg:144.72ms step:200/1480 train_time:27500ms step_avg:144.74ms step:201/1480 train_time:27647ms step_avg:144.75ms step:202/1480 train_time:27794ms step_avg:144.76ms step:203/1480 train_time:27941ms step_avg:144.77ms step:204/1480 train_time:28087ms step_avg:144.78ms step:205/1480 train_time:28234ms step_avg:144.79ms step:206/1480 train_time:28380ms step_avg:144.80ms step:207/1480 train_time:28528ms step_avg:144.81ms step:208/1480 train_time:28675ms step_avg:144.82ms step:209/1480 train_time:28821ms step_avg:144.83ms step:210/1480 train_time:28967ms step_avg:144.83ms step:211/1480 train_time:29115ms step_avg:144.85ms step:212/1480 train_time:29261ms step_avg:144.86ms step:213/1480 train_time:29407ms step_avg:144.86ms step:214/1480 train_time:29554ms step_avg:144.87ms step:215/1480 train_time:29700ms step_avg:144.88ms step:216/1480 train_time:29847ms step_avg:144.89ms step:217/1480 train_time:29994ms step_avg:144.90ms step:218/1480 train_time:30141ms step_avg:144.91ms step:219/1480 train_time:30288ms step_avg:144.92ms step:220/1480 train_time:30436ms step_avg:144.93ms step:221/1480 train_time:30584ms step_avg:144.95ms step:222/1480 train_time:30735ms step_avg:144.98ms step:223/1480 train_time:30885ms step_avg:145.00ms step:224/1480 train_time:31037ms step_avg:145.03ms step:225/1480 train_time:31185ms 
step_avg:145.05ms step:226/1480 train_time:31336ms step_avg:145.07ms step:227/1480 train_time:31485ms step_avg:145.09ms step:228/1480 train_time:31635ms step_avg:145.12ms step:229/1480 train_time:31786ms step_avg:145.14ms step:230/1480 train_time:31937ms step_avg:145.17ms step:231/1480 train_time:32087ms step_avg:145.19ms step:232/1480 train_time:32237ms step_avg:145.21ms step:233/1480 train_time:32388ms step_avg:145.24ms step:234/1480 train_time:32538ms step_avg:145.26ms step:235/1480 train_time:32688ms step_avg:145.28ms step:236/1480 train_time:32839ms step_avg:145.30ms step:237/1480 train_time:32988ms step_avg:145.32ms step:238/1480 train_time:33138ms step_avg:145.34ms step:239/1480 train_time:33288ms step_avg:145.36ms step:240/1480 train_time:33438ms step_avg:145.38ms step:241/1480 train_time:33588ms step_avg:145.40ms step:242/1480 train_time:33738ms step_avg:145.42ms step:243/1480 train_time:33888ms step_avg:145.44ms step:244/1480 train_time:34039ms step_avg:145.47ms step:245/1480 train_time:34189ms step_avg:145.49ms step:246/1480 train_time:34340ms step_avg:145.51ms step:247/1480 train_time:34490ms step_avg:145.53ms step:248/1480 train_time:34641ms step_avg:145.55ms step:249/1480 train_time:34792ms step_avg:145.57ms step:250/1480 train_time:34942ms step_avg:145.59ms step:250/1480 val_loss:3.9905 train_time:35002ms step_avg:145.84ms step:251/1480 train_time:35100ms step_avg:145.64ms step:252/1480 train_time:35252ms step_avg:145.67ms step:253/1480 train_time:35403ms step_avg:145.69ms step:254/1480 train_time:35553ms step_avg:145.71ms step:255/1480 train_time:35704ms step_avg:145.73ms step:256/1480 train_time:35854ms step_avg:145.75ms step:257/1480 train_time:36005ms step_avg:145.77ms step:258/1480 train_time:36157ms step_avg:145.79ms step:259/1480 train_time:36308ms step_avg:145.81ms step:260/1480 train_time:36459ms step_avg:145.84ms step:261/1480 train_time:36609ms step_avg:145.85ms step:262/1480 train_time:36759ms step_avg:145.87ms step:263/1480 
train_time:36910ms step_avg:145.89ms step:264/1480 train_time:37060ms step_avg:145.91ms step:265/1480 train_time:37211ms step_avg:145.93ms step:266/1480 train_time:37362ms step_avg:145.94ms step:267/1480 train_time:37512ms step_avg:145.96ms step:268/1480 train_time:37664ms step_avg:145.98ms step:269/1480 train_time:37814ms step_avg:146.00ms step:270/1480 train_time:37965ms step_avg:146.02ms step:271/1480 train_time:38115ms step_avg:146.04ms step:272/1480 train_time:38267ms step_avg:146.06ms step:273/1480 train_time:38416ms step_avg:146.07ms step:274/1480 train_time:38567ms step_avg:146.09ms step:275/1480 train_time:38716ms step_avg:146.10ms step:276/1480 train_time:38867ms step_avg:146.12ms step:277/1480 train_time:39017ms step_avg:146.13ms step:278/1480 train_time:39167ms step_avg:146.15ms step:279/1480 train_time:39318ms step_avg:146.16ms step:280/1480 train_time:39470ms step_avg:146.18ms step:281/1480 train_time:39620ms step_avg:146.20ms step:282/1480 train_time:39771ms step_avg:146.22ms step:283/1480 train_time:39922ms step_avg:146.23ms step:284/1480 train_time:40072ms step_avg:146.25ms step:285/1480 train_time:40223ms step_avg:146.27ms step:286/1480 train_time:40373ms step_avg:146.28ms step:287/1480 train_time:40526ms step_avg:146.30ms step:288/1480 train_time:40676ms step_avg:146.32ms step:289/1480 train_time:40826ms step_avg:146.33ms step:290/1480 train_time:40976ms step_avg:146.34ms step:291/1480 train_time:41127ms step_avg:146.36ms step:292/1480 train_time:41278ms step_avg:146.38ms step:293/1480 train_time:41429ms step_avg:146.39ms step:294/1480 train_time:41579ms step_avg:146.41ms step:295/1480 train_time:41730ms step_avg:146.42ms step:296/1480 train_time:41881ms step_avg:146.44ms step:297/1480 train_time:42033ms step_avg:146.46ms step:298/1480 train_time:42186ms step_avg:146.48ms step:299/1480 train_time:42335ms step_avg:146.49ms step:300/1480 train_time:42487ms step_avg:146.51ms step:301/1480 train_time:42637ms step_avg:146.52ms step:302/1480 
train_time:42787ms step_avg:146.53ms step:303/1480 train_time:42938ms step_avg:146.55ms step:304/1480 train_time:43089ms step_avg:146.56ms step:305/1480 train_time:43239ms step_avg:146.57ms step:306/1480 train_time:43390ms step_avg:146.59ms step:307/1480 train_time:43541ms step_avg:146.60ms step:308/1480 train_time:43691ms step_avg:146.62ms step:309/1480 train_time:43841ms step_avg:146.63ms step:310/1480 train_time:43991ms step_avg:146.64ms step:311/1480 train_time:44142ms step_avg:146.65ms step:312/1480 train_time:44292ms step_avg:146.66ms step:313/1480 train_time:44444ms step_avg:146.68ms step:314/1480 train_time:44593ms step_avg:146.69ms step:315/1480 train_time:44744ms step_avg:146.70ms step:316/1480 train_time:44894ms step_avg:146.71ms step:317/1480 train_time:45045ms step_avg:146.73ms step:318/1480 train_time:45195ms step_avg:146.74ms step:319/1480 train_time:45346ms step_avg:146.75ms step:320/1480 train_time:45496ms step_avg:146.76ms step:321/1480 train_time:45647ms step_avg:146.77ms step:322/1480 train_time:45796ms step_avg:146.78ms step:323/1480 train_time:45947ms step_avg:146.79ms step:324/1480 train_time:46096ms step_avg:146.80ms step:325/1480 train_time:46247ms step_avg:146.82ms step:326/1480 train_time:46398ms step_avg:146.83ms step:327/1480 train_time:46549ms step_avg:146.84ms step:328/1480 train_time:46698ms step_avg:146.85ms step:329/1480 train_time:46849ms step_avg:146.86ms step:330/1480 train_time:47001ms step_avg:146.88ms step:331/1480 train_time:47155ms step_avg:146.90ms step:332/1480 train_time:47308ms step_avg:146.92ms step:333/1480 train_time:47463ms step_avg:146.94ms step:334/1480 train_time:47616ms step_avg:146.96ms step:335/1480 train_time:47770ms step_avg:146.99ms step:336/1480 train_time:47923ms step_avg:147.00ms step:337/1480 train_time:48077ms step_avg:147.02ms step:338/1480 train_time:48231ms step_avg:147.05ms step:339/1480 train_time:48385ms step_avg:147.07ms step:340/1480 train_time:48540ms step_avg:147.09ms step:341/1480 
train_time:48692ms step_avg:147.11ms step:342/1480 train_time:48845ms step_avg:147.12ms step:343/1480 train_time:48999ms step_avg:147.14ms step:344/1480 train_time:49153ms step_avg:147.16ms step:345/1480 train_time:49307ms step_avg:147.19ms step:346/1480 train_time:49463ms step_avg:147.21ms step:347/1480 train_time:49616ms step_avg:147.23ms step:348/1480 train_time:49770ms step_avg:147.25ms step:349/1480 train_time:49924ms step_avg:147.27ms step:350/1480 train_time:50077ms step_avg:147.29ms step:351/1480 train_time:50231ms step_avg:147.31ms step:352/1480 train_time:50386ms step_avg:147.33ms step:353/1480 train_time:50541ms step_avg:147.35ms step:354/1480 train_time:50694ms step_avg:147.37ms step:355/1480 train_time:50848ms step_avg:147.39ms step:356/1480 train_time:51002ms step_avg:147.41ms step:357/1480 train_time:51156ms step_avg:147.42ms step:358/1480 train_time:51309ms step_avg:147.44ms step:359/1480 train_time:51464ms step_avg:147.46ms step:360/1480 train_time:51621ms step_avg:147.49ms step:361/1480 train_time:51774ms step_avg:147.51ms step:362/1480 train_time:51928ms step_avg:147.52ms step:363/1480 train_time:52083ms step_avg:147.54ms step:364/1480 train_time:52236ms step_avg:147.56ms step:365/1480 train_time:52390ms step_avg:147.58ms step:366/1480 train_time:52544ms step_avg:147.60ms step:367/1480 train_time:52697ms step_avg:147.61ms step:368/1480 train_time:52850ms step_avg:147.63ms step:369/1480 train_time:53003ms step_avg:147.64ms step:370/1480 train_time:53157ms step_avg:147.66ms step:371/1480 train_time:53310ms step_avg:147.67ms step:372/1480 train_time:53467ms step_avg:147.70ms step:373/1480 train_time:53618ms step_avg:147.71ms step:374/1480 train_time:53771ms step_avg:147.72ms step:375/1480 train_time:53924ms step_avg:147.74ms step:375/1480 val_loss:3.8071 train_time:53986ms step_avg:147.91ms step:376/1480 train_time:54085ms step_avg:147.77ms step:377/1480 train_time:54240ms step_avg:147.79ms step:378/1480 train_time:54392ms step_avg:147.80ms 
step:379/1480 train_time:54545ms step_avg:147.82ms step:380/1480 train_time:54696ms step_avg:147.83ms step:381/1480 train_time:54848ms step_avg:147.84ms step:382/1480 train_time:55001ms step_avg:147.85ms step:383/1480 train_time:55157ms step_avg:147.87ms step:384/1480 train_time:55310ms step_avg:147.89ms step:385/1480 train_time:55464ms step_avg:147.90ms step:386/1480 train_time:55617ms step_avg:147.92ms step:387/1480 train_time:55770ms step_avg:147.93ms step:388/1480 train_time:55924ms step_avg:147.95ms step:389/1480 train_time:56077ms step_avg:147.96ms step:390/1480 train_time:56231ms step_avg:147.98ms step:391/1480 train_time:56385ms step_avg:147.99ms step:392/1480 train_time:56540ms step_avg:148.01ms step:393/1480 train_time:56692ms step_avg:148.02ms step:394/1480 train_time:56846ms step_avg:148.04ms step:395/1480 train_time:56999ms step_avg:148.05ms step:396/1480 train_time:57153ms step_avg:148.07ms step:397/1480 train_time:57308ms step_avg:148.08ms step:398/1480 train_time:57463ms step_avg:148.10ms step:399/1480 train_time:57617ms step_avg:148.12ms step:400/1480 train_time:57771ms step_avg:148.13ms step:401/1480 train_time:57923ms step_avg:148.14ms step:402/1480 train_time:58076ms step_avg:148.15ms step:403/1480 train_time:58231ms step_avg:148.17ms step:404/1480 train_time:58385ms step_avg:148.18ms step:405/1480 train_time:58540ms step_avg:148.20ms step:406/1480 train_time:58693ms step_avg:148.22ms step:407/1480 train_time:58848ms step_avg:148.23ms step:408/1480 train_time:59002ms step_avg:148.25ms step:409/1480 train_time:59157ms step_avg:148.26ms step:410/1480 train_time:59310ms step_avg:148.28ms step:411/1480 train_time:59464ms step_avg:148.29ms step:412/1480 train_time:59618ms step_avg:148.30ms step:413/1480 train_time:59771ms step_avg:148.31ms step:414/1480 train_time:59925ms step_avg:148.33ms step:415/1480 train_time:60079ms step_avg:148.34ms step:416/1480 train_time:60232ms step_avg:148.36ms step:417/1480 train_time:60385ms step_avg:148.37ms 
step:418/1480 train_time:60540ms step_avg:148.38ms step:419/1480 train_time:60693ms step_avg:148.39ms step:420/1480 train_time:60847ms step_avg:148.41ms step:421/1480 train_time:61000ms step_avg:148.42ms step:422/1480 train_time:61154ms step_avg:148.43ms step:423/1480 train_time:61306ms step_avg:148.44ms step:424/1480 train_time:61461ms step_avg:148.46ms step:425/1480 train_time:61614ms step_avg:148.47ms step:426/1480 train_time:61768ms step_avg:148.48ms step:427/1480 train_time:61923ms step_avg:148.50ms step:428/1480 train_time:62076ms step_avg:148.51ms step:429/1480 train_time:62231ms step_avg:148.52ms step:430/1480 train_time:62384ms step_avg:148.53ms step:431/1480 train_time:62538ms step_avg:148.55ms step:432/1480 train_time:62690ms step_avg:148.56ms step:433/1480 train_time:62844ms step_avg:148.57ms step:434/1480 train_time:62998ms step_avg:148.58ms step:435/1480 train_time:63151ms step_avg:148.59ms step:436/1480 train_time:63305ms step_avg:148.60ms step:437/1480 train_time:63460ms step_avg:148.62ms step:438/1480 train_time:63613ms step_avg:148.63ms step:439/1480 train_time:63767ms step_avg:148.64ms step:440/1480 train_time:63922ms step_avg:148.66ms step:441/1480 train_time:64078ms step_avg:148.67ms step:442/1480 train_time:64235ms step_avg:148.69ms step:443/1480 train_time:64391ms step_avg:148.71ms step:444/1480 train_time:64547ms step_avg:148.73ms step:445/1480 train_time:64703ms step_avg:148.74ms step:446/1480 train_time:64860ms step_avg:148.76ms step:447/1480 train_time:65016ms step_avg:148.78ms step:448/1480 train_time:65171ms step_avg:148.79ms step:449/1480 train_time:65330ms step_avg:148.82ms step:450/1480 train_time:65487ms step_avg:148.83ms step:451/1480 train_time:65647ms step_avg:148.86ms step:452/1480 train_time:65804ms step_avg:148.88ms step:453/1480 train_time:65962ms step_avg:148.90ms step:454/1480 train_time:66118ms step_avg:148.92ms step:455/1480 train_time:66274ms step_avg:148.93ms step:456/1480 train_time:66430ms step_avg:148.95ms 
step:457/1480 train_time:66585ms step_avg:148.96ms step:458/1480 train_time:66742ms step_avg:148.98ms step:459/1480 train_time:66900ms step_avg:149.00ms step:460/1480 train_time:67057ms step_avg:149.02ms step:461/1480 train_time:67214ms step_avg:149.03ms step:462/1480 train_time:67370ms step_avg:149.05ms step:463/1480 train_time:67527ms step_avg:149.07ms step:464/1480 train_time:67684ms step_avg:149.08ms step:465/1480 train_time:67842ms step_avg:149.10ms step:466/1480 train_time:67998ms step_avg:149.12ms step:467/1480 train_time:68155ms step_avg:149.14ms step:468/1480 train_time:68311ms step_avg:149.15ms step:469/1480 train_time:68467ms step_avg:149.17ms step:470/1480 train_time:68625ms step_avg:149.18ms step:471/1480 train_time:68782ms step_avg:149.20ms step:472/1480 train_time:68940ms step_avg:149.22ms step:473/1480 train_time:69095ms step_avg:149.23ms step:474/1480 train_time:69251ms step_avg:149.25ms step:475/1480 train_time:69408ms step_avg:149.27ms step:476/1480 train_time:69565ms step_avg:149.28ms step:477/1480 train_time:69724ms step_avg:149.30ms step:478/1480 train_time:69881ms step_avg:149.32ms step:479/1480 train_time:70038ms step_avg:149.33ms step:480/1480 train_time:70194ms step_avg:149.35ms step:481/1480 train_time:70351ms step_avg:149.37ms step:482/1480 train_time:70507ms step_avg:149.38ms step:483/1480 train_time:70666ms step_avg:149.40ms step:484/1480 train_time:70825ms step_avg:149.42ms step:485/1480 train_time:70982ms step_avg:149.44ms step:486/1480 train_time:71141ms step_avg:149.46ms step:487/1480 train_time:71298ms step_avg:149.47ms step:488/1480 train_time:71453ms step_avg:149.48ms step:489/1480 train_time:71608ms step_avg:149.50ms step:490/1480 train_time:71765ms step_avg:149.51ms step:491/1480 train_time:71921ms step_avg:149.52ms step:492/1480 train_time:72076ms step_avg:149.54ms step:493/1480 train_time:72233ms step_avg:149.55ms step:494/1480 train_time:72389ms step_avg:149.56ms step:495/1480 train_time:72545ms step_avg:149.58ms 
step:496/1480 train_time:72704ms step_avg:149.60ms step:497/1480 train_time:72859ms step_avg:149.61ms step:498/1480 train_time:73018ms step_avg:149.63ms step:499/1480 train_time:73175ms step_avg:149.64ms step:500/1480 train_time:73332ms step_avg:149.66ms step:500/1480 val_loss:3.6855 train_time:73393ms step_avg:149.78ms step:501/1480 train_time:73493ms step_avg:149.68ms step:502/1480 train_time:73651ms step_avg:149.70ms step:503/1480 train_time:73808ms step_avg:149.71ms step:504/1480 train_time:73963ms step_avg:149.72ms step:505/1480 train_time:74118ms step_avg:149.73ms step:506/1480 train_time:74274ms step_avg:149.75ms step:507/1480 train_time:74431ms step_avg:149.76ms step:508/1480 train_time:74590ms step_avg:149.78ms step:509/1480 train_time:74748ms step_avg:149.80ms step:510/1480 train_time:74904ms step_avg:149.81ms step:511/1480 train_time:75061ms step_avg:149.82ms step:512/1480 train_time:75219ms step_avg:149.84ms step:513/1480 train_time:75375ms step_avg:149.85ms step:514/1480 train_time:75531ms step_avg:149.86ms step:515/1480 train_time:75689ms step_avg:149.88ms step:516/1480 train_time:75850ms step_avg:149.90ms step:517/1480 train_time:76008ms step_avg:149.92ms step:518/1480 train_time:76165ms step_avg:149.93ms step:519/1480 train_time:76322ms step_avg:149.95ms step:520/1480 train_time:76480ms step_avg:149.96ms step:521/1480 train_time:76636ms step_avg:149.97ms step:522/1480 train_time:76792ms step_avg:149.98ms step:523/1480 train_time:76949ms step_avg:150.00ms step:524/1480 train_time:77107ms step_avg:150.01ms step:525/1480 train_time:77263ms step_avg:150.03ms step:526/1480 train_time:77422ms step_avg:150.04ms step:527/1480 train_time:77578ms step_avg:150.05ms step:528/1480 train_time:77734ms step_avg:150.07ms step:529/1480 train_time:77890ms step_avg:150.08ms step:530/1480 train_time:78049ms step_avg:150.09ms step:531/1480 train_time:78207ms step_avg:150.11ms step:532/1480 train_time:78364ms step_avg:150.12ms step:533/1480 train_time:78521ms 
step_avg:150.14ms step:534/1480 train_time:78676ms step_avg:150.15ms step:535/1480 train_time:78833ms step_avg:150.16ms step:536/1480 train_time:78990ms step_avg:150.17ms step:537/1480 train_time:79149ms step_avg:150.19ms step:538/1480 train_time:79307ms step_avg:150.20ms step:539/1480 train_time:79465ms step_avg:150.22ms step:540/1480 train_time:79623ms step_avg:150.23ms step:541/1480 train_time:79780ms step_avg:150.25ms step:542/1480 train_time:79936ms step_avg:150.26ms step:543/1480 train_time:80092ms step_avg:150.27ms step:544/1480 train_time:80250ms step_avg:150.28ms step:545/1480 train_time:80408ms step_avg:150.30ms step:546/1480 train_time:80564ms step_avg:150.31ms step:547/1480 train_time:80722ms step_avg:150.32ms step:548/1480 train_time:80878ms step_avg:150.33ms step:549/1480 train_time:81033ms step_avg:150.34ms step:550/1480 train_time:81191ms step_avg:150.35ms step:551/1480 train_time:81350ms step_avg:150.37ms step:552/1480 train_time:81510ms step_avg:150.39ms step:553/1480 train_time:81671ms step_avg:150.41ms step:554/1480 train_time:81832ms step_avg:150.43ms step:555/1480 train_time:81991ms step_avg:150.44ms step:556/1480 train_time:82150ms step_avg:150.46ms step:557/1480 train_time:82311ms step_avg:150.48ms step:558/1480 train_time:82471ms step_avg:150.49ms step:559/1480 train_time:82630ms step_avg:150.51ms step:560/1480 train_time:82790ms step_avg:150.53ms step:561/1480 train_time:82950ms step_avg:150.54ms step:562/1480 train_time:83111ms step_avg:150.56ms step:563/1480 train_time:83269ms step_avg:150.58ms step:564/1480 train_time:83429ms step_avg:150.59ms step:565/1480 train_time:83589ms step_avg:150.61ms step:566/1480 train_time:83749ms step_avg:150.63ms step:567/1480 train_time:83909ms step_avg:150.64ms step:568/1480 train_time:84069ms step_avg:150.66ms step:569/1480 train_time:84229ms step_avg:150.68ms step:570/1480 train_time:84389ms step_avg:150.69ms step:571/1480 train_time:84549ms step_avg:150.71ms step:572/1480 train_time:84709ms 
step_avg:150.73ms step:573/1480 train_time:84870ms step_avg:150.75ms step:574/1480 train_time:85031ms step_avg:150.76ms step:575/1480 train_time:85191ms step_avg:150.78ms step:576/1480 train_time:85353ms step_avg:150.80ms step:577/1480 train_time:85511ms step_avg:150.81ms step:578/1480 train_time:85671ms step_avg:150.83ms step:579/1480 train_time:85830ms step_avg:150.84ms step:580/1480 train_time:85990ms step_avg:150.86ms step:581/1480 train_time:86151ms step_avg:150.88ms step:582/1480 train_time:86312ms step_avg:150.90ms step:583/1480 train_time:86472ms step_avg:150.91ms step:584/1480 train_time:86631ms step_avg:150.93ms step:585/1480 train_time:86790ms step_avg:150.94ms step:586/1480 train_time:86950ms step_avg:150.96ms step:587/1480 train_time:87112ms step_avg:150.97ms step:588/1480 train_time:87271ms step_avg:150.99ms step:589/1480 train_time:87433ms step_avg:151.01ms step:590/1480 train_time:87592ms step_avg:151.02ms step:591/1480 train_time:87751ms step_avg:151.03ms step:592/1480 train_time:87911ms step_avg:151.05ms step:593/1480 train_time:88072ms step_avg:151.07ms step:594/1480 train_time:88232ms step_avg:151.08ms step:595/1480 train_time:88392ms step_avg:151.10ms step:596/1480 train_time:88553ms step_avg:151.11ms step:597/1480 train_time:88712ms step_avg:151.13ms step:598/1480 train_time:88870ms step_avg:151.14ms step:599/1480 train_time:89030ms step_avg:151.15ms step:600/1480 train_time:89189ms step_avg:151.17ms step:601/1480 train_time:89349ms step_avg:151.18ms step:602/1480 train_time:89509ms step_avg:151.20ms step:603/1480 train_time:89669ms step_avg:151.21ms step:604/1480 train_time:89830ms step_avg:151.23ms step:605/1480 train_time:89990ms step_avg:151.24ms step:606/1480 train_time:90154ms step_avg:151.26ms step:607/1480 train_time:90314ms step_avg:151.28ms step:608/1480 train_time:90473ms step_avg:151.29ms step:609/1480 train_time:90632ms step_avg:151.30ms step:610/1480 train_time:90790ms step_avg:151.32ms step:611/1480 train_time:90952ms 
step_avg:151.33ms step:612/1480 train_time:91112ms step_avg:151.35ms step:613/1480 train_time:91273ms step_avg:151.37ms step:614/1480 train_time:91433ms step_avg:151.38ms step:615/1480 train_time:91591ms step_avg:151.39ms step:616/1480 train_time:91750ms step_avg:151.40ms step:617/1480 train_time:91910ms step_avg:151.42ms step:618/1480 train_time:92069ms step_avg:151.43ms step:619/1480 train_time:92230ms step_avg:151.45ms step:620/1480 train_time:92390ms step_avg:151.46ms step:621/1480 train_time:92551ms step_avg:151.48ms step:622/1480 train_time:92712ms step_avg:151.49ms step:623/1480 train_time:92872ms step_avg:151.50ms step:624/1480 train_time:93031ms step_avg:151.52ms step:625/1480 train_time:93190ms step_avg:151.53ms step:625/1480 val_loss:3.6057 train_time:93254ms step_avg:151.63ms step:626/1480 train_time:93354ms step_avg:151.55ms step:627/1480 train_time:93514ms step_avg:151.56ms step:628/1480 train_time:93672ms step_avg:151.57ms step:629/1480 train_time:93831ms step_avg:151.58ms step:630/1480 train_time:93989ms step_avg:151.60ms step:631/1480 train_time:94147ms step_avg:151.60ms step:632/1480 train_time:94306ms step_avg:151.62ms step:633/1480 train_time:94466ms step_avg:151.63ms step:634/1480 train_time:94626ms step_avg:151.64ms step:635/1480 train_time:94786ms step_avg:151.66ms step:636/1480 train_time:94944ms step_avg:151.67ms step:637/1480 train_time:95105ms step_avg:151.68ms step:638/1480 train_time:95263ms step_avg:151.69ms step:639/1480 train_time:95423ms step_avg:151.71ms step:640/1480 train_time:95583ms step_avg:151.72ms step:641/1480 train_time:95742ms step_avg:151.73ms step:642/1480 train_time:95901ms step_avg:151.74ms step:643/1480 train_time:96060ms step_avg:151.75ms step:644/1480 train_time:96220ms step_avg:151.77ms step:645/1480 train_time:96379ms step_avg:151.78ms step:646/1480 train_time:96538ms step_avg:151.79ms step:647/1480 train_time:96698ms step_avg:151.80ms step:648/1480 train_time:96859ms step_avg:151.82ms step:649/1480 
train_time:97021ms step_avg:151.83ms step:650/1480 train_time:97180ms step_avg:151.84ms step:651/1480 train_time:97341ms step_avg:151.86ms step:652/1480 train_time:97501ms step_avg:151.87ms step:653/1480 train_time:97661ms step_avg:151.88ms step:654/1480 train_time:97821ms step_avg:151.90ms step:655/1480 train_time:97981ms step_avg:151.91ms step:656/1480 train_time:98142ms step_avg:151.92ms step:657/1480 train_time:98302ms step_avg:151.94ms step:658/1480 train_time:98461ms step_avg:151.95ms step:659/1480 train_time:98623ms step_avg:151.96ms step:660/1480 train_time:98784ms step_avg:151.98ms step:661/1480 train_time:98946ms step_avg:151.99ms step:662/1480 train_time:99106ms step_avg:152.00ms step:663/1480 train_time:99265ms step_avg:152.01ms step:664/1480 train_time:99427ms step_avg:152.03ms step:665/1480 train_time:99588ms step_avg:152.04ms step:666/1480 train_time:99748ms step_avg:152.05ms step:667/1480 train_time:99910ms step_avg:152.07ms step:668/1480 train_time:100070ms step_avg:152.08ms step:669/1480 train_time:100231ms step_avg:152.10ms step:670/1480 train_time:100389ms step_avg:152.11ms step:671/1480 train_time:100550ms step_avg:152.12ms step:672/1480 train_time:100710ms step_avg:152.13ms step:673/1480 train_time:100870ms step_avg:152.14ms step:674/1480 train_time:101031ms step_avg:152.16ms step:675/1480 train_time:101193ms step_avg:152.17ms step:676/1480 train_time:101356ms step_avg:152.19ms step:677/1480 train_time:101517ms step_avg:152.20ms step:678/1480 train_time:101678ms step_avg:152.21ms step:679/1480 train_time:101840ms step_avg:152.23ms step:680/1480 train_time:102003ms step_avg:152.24ms step:681/1480 train_time:102164ms step_avg:152.26ms step:682/1480 train_time:102327ms step_avg:152.27ms step:683/1480 train_time:102487ms step_avg:152.28ms step:684/1480 train_time:102650ms step_avg:152.30ms step:685/1480 train_time:102812ms step_avg:152.31ms step:686/1480 train_time:102971ms step_avg:152.32ms step:687/1480 train_time:103132ms step_avg:152.34ms 
step:688/1480 train_time:103297ms step_avg:152.36ms step:689/1480 train_time:103461ms step_avg:152.37ms step:690/1480 train_time:103625ms step_avg:152.39ms step:691/1480 train_time:103786ms step_avg:152.40ms step:692/1480 train_time:103946ms step_avg:152.41ms step:693/1480 train_time:104107ms step_avg:152.43ms step:694/1480 train_time:104268ms step_avg:152.44ms step:695/1480 train_time:104427ms step_avg:152.45ms step:696/1480 train_time:104588ms step_avg:152.46ms step:697/1480 train_time:104750ms step_avg:152.47ms step:698/1480 train_time:104911ms step_avg:152.49ms step:699/1480 train_time:105074ms step_avg:152.50ms step:700/1480 train_time:105235ms step_avg:152.52ms step:701/1480 train_time:105395ms step_avg:152.53ms step:702/1480 train_time:105557ms step_avg:152.54ms step:703/1480 train_time:105720ms step_avg:152.55ms step:704/1480 train_time:105882ms step_avg:152.57ms step:705/1480 train_time:106045ms step_avg:152.58ms step:706/1480 train_time:106209ms step_avg:152.60ms step:707/1480 train_time:106370ms step_avg:152.61ms step:708/1480 train_time:106531ms step_avg:152.62ms step:709/1480 train_time:106694ms step_avg:152.64ms step:710/1480 train_time:106854ms step_avg:152.65ms step:711/1480 train_time:107019ms step_avg:152.67ms step:712/1480 train_time:107184ms step_avg:152.68ms step:713/1480 train_time:107347ms step_avg:152.70ms step:714/1480 train_time:107508ms step_avg:152.71ms step:715/1480 train_time:107667ms step_avg:152.72ms step:716/1480 train_time:107827ms step_avg:152.73ms step:717/1480 train_time:107989ms step_avg:152.74ms step:718/1480 train_time:108148ms step_avg:152.75ms step:719/1480 train_time:108307ms step_avg:152.76ms step:720/1480 train_time:108469ms step_avg:152.77ms step:721/1480 train_time:108630ms step_avg:152.78ms step:722/1480 train_time:108790ms step_avg:152.80ms step:723/1480 train_time:108950ms step_avg:152.81ms step:724/1480 train_time:109112ms step_avg:152.82ms step:725/1480 train_time:109274ms step_avg:152.83ms step:726/1480 
train_time:109439ms step_avg:152.85ms step:727/1480 train_time:109603ms step_avg:152.86ms step:728/1480 train_time:109763ms step_avg:152.87ms step:729/1480 train_time:109925ms step_avg:152.89ms step:730/1480 train_time:110087ms step_avg:152.90ms step:731/1480 train_time:110248ms step_avg:152.91ms step:732/1480 train_time:110408ms step_avg:152.92ms step:733/1480 train_time:110568ms step_avg:152.93ms step:734/1480 train_time:110729ms step_avg:152.94ms step:735/1480 train_time:110890ms step_avg:152.95ms step:736/1480 train_time:111051ms step_avg:152.96ms step:737/1480 train_time:111213ms step_avg:152.97ms step:738/1480 train_time:111373ms step_avg:152.99ms step:739/1480 train_time:111533ms step_avg:152.99ms step:740/1480 train_time:111699ms step_avg:153.01ms step:741/1480 train_time:111863ms step_avg:153.03ms step:742/1480 train_time:112025ms step_avg:153.04ms step:743/1480 train_time:112187ms step_avg:153.05ms step:744/1480 train_time:112349ms step_avg:153.06ms step:745/1480 train_time:112514ms step_avg:153.08ms step:746/1480 train_time:112674ms step_avg:153.09ms step:747/1480 train_time:112833ms step_avg:153.10ms step:748/1480 train_time:112998ms step_avg:153.11ms step:749/1480 train_time:113162ms step_avg:153.13ms step:750/1480 train_time:113323ms step_avg:153.14ms step:750/1480 val_loss:3.5489 train_time:113387ms step_avg:153.23ms step:751/1480 train_time:113489ms step_avg:153.16ms step:752/1480 train_time:113650ms step_avg:153.17ms step:753/1480 train_time:113811ms step_avg:153.18ms step:754/1480 train_time:113971ms step_avg:153.19ms step:755/1480 train_time:114132ms step_avg:153.20ms step:756/1480 train_time:114293ms step_avg:153.21ms step:757/1480 train_time:114458ms step_avg:153.22ms step:758/1480 train_time:114619ms step_avg:153.23ms step:759/1480 train_time:114782ms step_avg:153.25ms step:760/1480 train_time:114945ms step_avg:153.26ms step:761/1480 train_time:115108ms step_avg:153.27ms step:762/1480 train_time:115269ms step_avg:153.28ms step:763/1480 
train_time:115430ms step_avg:153.29ms step:764/1480 train_time:115591ms step_avg:153.30ms step:765/1480 train_time:115752ms step_avg:153.31ms step:766/1480 train_time:115913ms step_avg:153.32ms step:767/1480 train_time:116075ms step_avg:153.34ms step:768/1480 train_time:116238ms step_avg:153.35ms step:769/1480 train_time:116401ms step_avg:153.36ms step:770/1480 train_time:116565ms step_avg:153.38ms step:771/1480 train_time:116728ms step_avg:153.39ms step:772/1480 train_time:116890ms step_avg:153.40ms step:773/1480 train_time:117052ms step_avg:153.41ms step:774/1480 train_time:117213ms step_avg:153.42ms step:775/1480 train_time:117376ms step_avg:153.43ms step:776/1480 train_time:117541ms step_avg:153.45ms step:777/1480 train_time:117707ms step_avg:153.46ms step:778/1480 train_time:117871ms step_avg:153.48ms step:779/1480 train_time:118033ms step_avg:153.49ms step:780/1480 train_time:118196ms step_avg:153.50ms step:781/1480 train_time:118361ms step_avg:153.52ms step:782/1480 train_time:118526ms step_avg:153.53ms step:783/1480 train_time:118688ms step_avg:153.54ms step:784/1480 train_time:118851ms step_avg:153.55ms step:785/1480 train_time:119013ms step_avg:153.56ms step:786/1480 train_time:119178ms step_avg:153.58ms step:787/1480 train_time:119342ms step_avg:153.59ms step:788/1480 train_time:119506ms step_avg:153.61ms step:789/1480 train_time:119669ms step_avg:153.62ms step:790/1480 train_time:119833ms step_avg:153.63ms step:791/1480 train_time:119999ms step_avg:153.65ms step:792/1480 train_time:120165ms step_avg:153.66ms step:793/1480 train_time:120328ms step_avg:153.68ms step:794/1480 train_time:120492ms step_avg:153.69ms step:795/1480 train_time:120658ms step_avg:153.70ms step:796/1480 train_time:120826ms step_avg:153.72ms step:797/1480 train_time:120991ms step_avg:153.74ms step:798/1480 train_time:121154ms step_avg:153.75ms step:799/1480 train_time:121320ms step_avg:153.76ms step:800/1480 train_time:121485ms step_avg:153.78ms step:801/1480 train_time:121648ms 
step_avg:153.79ms step:802/1480 train_time:121815ms step_avg:153.81ms step:803/1480 train_time:121977ms step_avg:153.82ms step:804/1480 train_time:122139ms step_avg:153.83ms step:805/1480 train_time:122305ms step_avg:153.84ms step:806/1480 train_time:122468ms step_avg:153.85ms step:807/1480 train_time:122628ms step_avg:153.86ms step:808/1480 train_time:122792ms step_avg:153.88ms step:809/1480 train_time:122954ms step_avg:153.88ms step:810/1480 train_time:123115ms step_avg:153.89ms step:811/1480 train_time:123276ms step_avg:153.90ms step:812/1480 train_time:123437ms step_avg:153.91ms step:813/1480 train_time:123598ms step_avg:153.92ms step:814/1480 train_time:123764ms step_avg:153.94ms step:815/1480 train_time:123926ms step_avg:153.95ms step:816/1480 train_time:124091ms step_avg:153.96ms step:817/1480 train_time:124252ms step_avg:153.97ms step:818/1480 train_time:124413ms step_avg:153.98ms step:819/1480 train_time:124577ms step_avg:153.99ms step:820/1480 train_time:124742ms step_avg:154.00ms step:821/1480 train_time:124904ms step_avg:154.01ms step:822/1480 train_time:125068ms step_avg:154.02ms step:823/1480 train_time:125230ms step_avg:154.03ms step:824/1480 train_time:125391ms step_avg:154.04ms step:825/1480 train_time:125555ms step_avg:154.05ms step:826/1480 train_time:125721ms step_avg:154.07ms step:827/1480 train_time:125887ms step_avg:154.08ms step:828/1480 train_time:126050ms step_avg:154.10ms step:829/1480 train_time:126213ms step_avg:154.11ms step:830/1480 train_time:126377ms step_avg:154.12ms step:831/1480 train_time:126541ms step_avg:154.13ms step:832/1480 train_time:126705ms step_avg:154.14ms step:833/1480 train_time:126871ms step_avg:154.16ms step:834/1480 train_time:127035ms step_avg:154.17ms step:835/1480 train_time:127197ms step_avg:154.18ms step:836/1480 train_time:127364ms step_avg:154.19ms step:837/1480 train_time:127526ms step_avg:154.20ms step:838/1480 train_time:127691ms step_avg:154.22ms step:839/1480 train_time:127852ms step_avg:154.22ms 
step:840/1480 train_time:128013ms step_avg:154.23ms step:841/1480 train_time:128174ms step_avg:154.24ms step:842/1480 train_time:128336ms step_avg:154.25ms step:843/1480 train_time:128498ms step_avg:154.26ms step:844/1480 train_time:128662ms step_avg:154.27ms step:845/1480 train_time:128827ms step_avg:154.28ms step:846/1480 train_time:128991ms step_avg:154.30ms step:847/1480 train_time:129155ms step_avg:154.31ms step:848/1480 train_time:129316ms step_avg:154.32ms step:849/1480 train_time:129481ms step_avg:154.33ms step:850/1480 train_time:129645ms step_avg:154.34ms step:851/1480 train_time:129809ms step_avg:154.35ms step:852/1480 train_time:129971ms step_avg:154.36ms step:853/1480 train_time:130132ms step_avg:154.37ms step:854/1480 train_time:130295ms step_avg:154.38ms step:855/1480 train_time:130458ms step_avg:154.39ms step:856/1480 train_time:130622ms step_avg:154.40ms step:857/1480 train_time:130788ms step_avg:154.41ms step:858/1480 train_time:130952ms step_avg:154.42ms step:859/1480 train_time:131116ms step_avg:154.44ms step:860/1480 train_time:131277ms step_avg:154.44ms step:861/1480 train_time:131442ms step_avg:154.46ms step:862/1480 train_time:131610ms step_avg:154.47ms step:863/1480 train_time:131778ms step_avg:154.49ms step:864/1480 train_time:131942ms step_avg:154.50ms step:865/1480 train_time:132104ms step_avg:154.51ms step:866/1480 train_time:132272ms step_avg:154.52ms step:867/1480 train_time:132435ms step_avg:154.53ms step:868/1480 train_time:132595ms step_avg:154.54ms step:869/1480 train_time:132756ms step_avg:154.55ms step:870/1480 train_time:132921ms step_avg:154.56ms step:871/1480 train_time:133086ms step_avg:154.57ms step:872/1480 train_time:133249ms step_avg:154.58ms step:873/1480 train_time:133412ms step_avg:154.59ms step:874/1480 train_time:133577ms step_avg:154.60ms step:875/1480 train_time:133741ms step_avg:154.61ms step:875/1480 val_loss:3.5038 train_time:133806ms step_avg:154.69ms step:876/1480 train_time:133909ms step_avg:154.63ms 
step:877/1480 train_time:134074ms step_avg:154.64ms step:878/1480 train_time:134236ms step_avg:154.65ms step:879/1480 train_time:134400ms step_avg:154.66ms step:880/1480 train_time:134563ms step_avg:154.67ms step:881/1480 train_time:134727ms step_avg:154.68ms step:882/1480 train_time:134893ms step_avg:154.69ms step:883/1480 train_time:135058ms step_avg:154.71ms step:884/1480 train_time:135223ms step_avg:154.72ms step:885/1480 train_time:135388ms step_avg:154.73ms step:886/1480 train_time:135555ms step_avg:154.74ms step:887/1480 train_time:135721ms step_avg:154.76ms step:888/1480 train_time:135895ms step_avg:154.78ms step:889/1480 train_time:136063ms step_avg:154.79ms step:890/1480 train_time:136227ms step_avg:154.80ms step:891/1480 train_time:136393ms step_avg:154.82ms step:892/1480 train_time:136559ms step_avg:154.83ms step:893/1480 train_time:136720ms step_avg:154.84ms step:894/1480 train_time:136887ms step_avg:154.85ms step:895/1480 train_time:137053ms step_avg:154.86ms step:896/1480 train_time:137218ms step_avg:154.87ms step:897/1480 train_time:137385ms step_avg:154.89ms step:898/1480 train_time:137553ms step_avg:154.90ms step:899/1480 train_time:137717ms step_avg:154.91ms step:900/1480 train_time:137879ms step_avg:154.92ms step:901/1480 train_time:138042ms step_avg:154.93ms step:902/1480 train_time:138206ms step_avg:154.94ms step:903/1480 train_time:138377ms step_avg:154.96ms step:904/1480 train_time:138541ms step_avg:154.97ms step:905/1480 train_time:138704ms step_avg:154.98ms step:906/1480 train_time:138870ms step_avg:154.99ms step:907/1480 train_time:139038ms step_avg:155.00ms step:908/1480 train_time:139200ms step_avg:155.01ms step:909/1480 train_time:139364ms step_avg:155.02ms step:910/1480 train_time:139534ms step_avg:155.04ms step:911/1480 train_time:139699ms step_avg:155.05ms step:912/1480 train_time:139864ms step_avg:155.06ms step:913/1480 train_time:140032ms step_avg:155.07ms step:914/1480 train_time:140199ms step_avg:155.09ms step:915/1480 
train_time:140369ms step_avg:155.10ms step:916/1480 train_time:140534ms step_avg:155.11ms step:917/1480 train_time:140696ms step_avg:155.12ms step:918/1480 train_time:140864ms step_avg:155.14ms step:919/1480 train_time:141034ms step_avg:155.15ms step:920/1480 train_time:141198ms step_avg:155.16ms step:921/1480 train_time:141364ms step_avg:155.17ms step:922/1480 train_time:141533ms step_avg:155.19ms step:923/1480 train_time:141695ms step_avg:155.20ms step:924/1480 train_time:141859ms step_avg:155.21ms step:925/1480 train_time:142025ms step_avg:155.22ms step:926/1480 train_time:142189ms step_avg:155.23ms step:927/1480 train_time:142353ms step_avg:155.24ms step:928/1480 train_time:142518ms step_avg:155.25ms step:929/1480 train_time:142682ms step_avg:155.26ms step:930/1480 train_time:142847ms step_avg:155.27ms step:931/1480 train_time:143011ms step_avg:155.28ms step:932/1480 train_time:143176ms step_avg:155.29ms step:933/1480 train_time:143344ms step_avg:155.30ms step:934/1480 train_time:143512ms step_avg:155.32ms step:935/1480 train_time:143681ms step_avg:155.33ms step:936/1480 train_time:143849ms step_avg:155.34ms step:937/1480 train_time:144018ms step_avg:155.36ms step:938/1480 train_time:144179ms step_avg:155.37ms step:939/1480 train_time:144348ms step_avg:155.38ms step:940/1480 train_time:144515ms step_avg:155.39ms step:941/1480 train_time:144678ms step_avg:155.40ms step:942/1480 train_time:144842ms step_avg:155.41ms step:943/1480 train_time:145013ms step_avg:155.43ms step:944/1480 train_time:145185ms step_avg:155.44ms step:945/1480 train_time:145349ms step_avg:155.45ms step:946/1480 train_time:145518ms step_avg:155.47ms step:947/1480 train_time:145685ms step_avg:155.48ms step:948/1480 train_time:145851ms step_avg:155.49ms step:949/1480 train_time:146016ms step_avg:155.50ms step:950/1480 train_time:146179ms step_avg:155.51ms step:951/1480 train_time:146347ms step_avg:155.52ms step:952/1480 train_time:146513ms step_avg:155.53ms step:953/1480 train_time:146681ms 
step_avg:155.55ms step:954/1480 train_time:146849ms step_avg:155.56ms step:955/1480 train_time:147013ms step_avg:155.57ms step:956/1480 train_time:147177ms step_avg:155.58ms step:957/1480 train_time:147344ms step_avg:155.59ms step:958/1480 train_time:147514ms step_avg:155.61ms step:959/1480 train_time:147679ms step_avg:155.62ms step:960/1480 train_time:147848ms step_avg:155.63ms step:961/1480 train_time:148012ms step_avg:155.64ms step:962/1480 train_time:148176ms step_avg:155.65ms step:963/1480 train_time:148341ms step_avg:155.66ms step:964/1480 train_time:148510ms step_avg:155.67ms step:965/1480 train_time:148674ms step_avg:155.68ms step:966/1480 train_time:148839ms step_avg:155.69ms step:967/1480 train_time:149003ms step_avg:155.70ms step:968/1480 train_time:149167ms step_avg:155.71ms step:969/1480 train_time:149334ms step_avg:155.72ms step:970/1480 train_time:149497ms step_avg:155.73ms step:971/1480 train_time:149660ms step_avg:155.73ms step:972/1480 train_time:149823ms step_avg:155.74ms step:973/1480 train_time:149987ms step_avg:155.75ms step:974/1480 train_time:150157ms step_avg:155.76ms step:975/1480 train_time:150321ms step_avg:155.77ms step:976/1480 train_time:150485ms step_avg:155.78ms step:977/1480 train_time:150650ms step_avg:155.79ms step:978/1480 train_time:150816ms step_avg:155.80ms step:979/1480 train_time:150981ms step_avg:155.81ms step:980/1480 train_time:151145ms step_avg:155.82ms step:981/1480 train_time:151313ms step_avg:155.83ms step:982/1480 train_time:151475ms step_avg:155.84ms step:983/1480 train_time:151640ms step_avg:155.85ms step:984/1480 train_time:151804ms step_avg:155.86ms step:985/1480 train_time:151971ms step_avg:155.87ms step:986/1480 train_time:152136ms step_avg:155.88ms step:987/1480 train_time:152299ms step_avg:155.88ms step:988/1480 train_time:152466ms step_avg:155.90ms step:989/1480 train_time:152634ms step_avg:155.91ms step:990/1480 train_time:152802ms step_avg:155.92ms step:991/1480 train_time:152970ms step_avg:155.93ms 
step:992/1480 train_time:153143ms step_avg:155.95ms step:993/1480 train_time:153321ms step_avg:155.97ms step:994/1480 train_time:153486ms step_avg:155.98ms step:995/1480 train_time:153651ms step_avg:155.99ms step:996/1480 train_time:153815ms step_avg:156.00ms step:997/1480 train_time:153980ms step_avg:156.01ms step:998/1480 train_time:154143ms step_avg:156.02ms step:999/1480 train_time:154311ms step_avg:156.03ms step:1000/1480 train_time:154480ms step_avg:156.04ms step:1000/1480 val_loss:3.4400 train_time:154550ms step_avg:156.11ms step:1001/1480 train_time:154650ms step_avg:156.05ms step:1002/1480 train_time:154816ms step_avg:156.06ms step:1003/1480 train_time:154987ms step_avg:156.08ms step:1004/1480 train_time:155157ms step_avg:156.09ms step:1005/1480 train_time:155324ms step_avg:156.10ms step:1006/1480 train_time:155491ms step_avg:156.12ms step:1007/1480 train_time:155656ms step_avg:156.12ms step:1008/1480 train_time:155824ms step_avg:156.14ms step:1009/1480 train_time:155998ms step_avg:156.15ms step:1010/1480 train_time:156164ms step_avg:156.16ms step:1011/1480 train_time:156329ms step_avg:156.17ms step:1012/1480 train_time:156494ms step_avg:156.18ms step:1013/1480 train_time:156665ms step_avg:156.20ms step:1014/1480 train_time:156831ms step_avg:156.21ms step:1015/1480 train_time:157002ms step_avg:156.22ms step:1016/1480 train_time:157170ms step_avg:156.23ms step:1017/1480 train_time:157340ms step_avg:156.25ms step:1018/1480 train_time:157507ms step_avg:156.26ms step:1019/1480 train_time:157675ms step_avg:156.27ms step:1020/1480 train_time:157846ms step_avg:156.28ms step:1021/1480 train_time:158011ms step_avg:156.29ms step:1022/1480 train_time:158178ms step_avg:156.30ms step:1023/1480 train_time:158344ms step_avg:156.31ms step:1024/1480 train_time:158511ms step_avg:156.32ms step:1025/1480 train_time:158682ms step_avg:156.34ms step:1026/1480 train_time:158847ms step_avg:156.35ms step:1027/1480 train_time:159012ms step_avg:156.35ms step:1028/1480 
train_time:159185ms step_avg:156.37ms step:1029/1480 train_time:159360ms step_avg:156.39ms step:1030/1480 train_time:159528ms step_avg:156.40ms step:1031/1480 train_time:159693ms step_avg:156.41ms step:1032/1480 train_time:159865ms step_avg:156.42ms step:1033/1480 train_time:160032ms step_avg:156.43ms step:1034/1480 train_time:160200ms step_avg:156.45ms step:1035/1480 train_time:160369ms step_avg:156.46ms step:1036/1480 train_time:160534ms step_avg:156.47ms step:1037/1480 train_time:160701ms step_avg:156.48ms step:1038/1480 train_time:160870ms step_avg:156.49ms step:1039/1480 train_time:161041ms step_avg:156.50ms step:1040/1480 train_time:161206ms step_avg:156.51ms step:1041/1480 train_time:161372ms step_avg:156.52ms step:1042/1480 train_time:161536ms step_avg:156.53ms step:1043/1480 train_time:161700ms step_avg:156.53ms step:1044/1480 train_time:161867ms step_avg:156.54ms step:1045/1480 train_time:162034ms step_avg:156.55ms step:1046/1480 train_time:162203ms step_avg:156.57ms step:1047/1480 train_time:162370ms step_avg:156.58ms step:1048/1480 train_time:162535ms step_avg:156.58ms step:1049/1480 train_time:162701ms step_avg:156.59ms step:1050/1480 train_time:162871ms step_avg:156.61ms step:1051/1480 train_time:163041ms step_avg:156.62ms step:1052/1480 train_time:163207ms step_avg:156.63ms step:1053/1480 train_time:163373ms step_avg:156.64ms step:1054/1480 train_time:163542ms step_avg:156.65ms step:1055/1480 train_time:163708ms step_avg:156.66ms step:1056/1480 train_time:163874ms step_avg:156.67ms step:1057/1480 train_time:164042ms step_avg:156.68ms step:1058/1480 train_time:164209ms step_avg:156.69ms step:1059/1480 train_time:164383ms step_avg:156.70ms step:1060/1480 train_time:164551ms step_avg:156.72ms step:1061/1480 train_time:164714ms step_avg:156.72ms step:1062/1480 train_time:164882ms step_avg:156.73ms step:1063/1480 train_time:165048ms step_avg:156.74ms step:1064/1480 train_time:165211ms step_avg:156.75ms step:1065/1480 train_time:165377ms step_avg:156.76ms 
step:1066/1480 train_time:165546ms step_avg:156.77ms step:1067/1480 train_time:165713ms step_avg:156.78ms step:1068/1480 train_time:165879ms step_avg:156.79ms step:1069/1480 train_time:166051ms step_avg:156.80ms step:1070/1480 train_time:166216ms step_avg:156.81ms step:1071/1480 train_time:166388ms step_avg:156.82ms step:1072/1480 train_time:166554ms step_avg:156.83ms step:1073/1480 train_time:166719ms step_avg:156.84ms step:1074/1480 train_time:166886ms step_avg:156.85ms step:1075/1480 train_time:167056ms step_avg:156.86ms step:1076/1480 train_time:167225ms step_avg:156.87ms step:1077/1480 train_time:167392ms step_avg:156.88ms step:1078/1480 train_time:167567ms step_avg:156.90ms step:1079/1480 train_time:167741ms step_avg:156.91ms step:1080/1480 train_time:167909ms step_avg:156.92ms step:1081/1480 train_time:168075ms step_avg:156.93ms step:1082/1480 train_time:168242ms step_avg:156.94ms step:1083/1480 train_time:168407ms step_avg:156.95ms step:1084/1480 train_time:168573ms step_avg:156.96ms step:1085/1480 train_time:168743ms step_avg:156.97ms step:1086/1480 train_time:168910ms step_avg:156.98ms step:1087/1480 train_time:169074ms step_avg:156.99ms step:1088/1480 train_time:169245ms step_avg:157.00ms step:1089/1480 train_time:169415ms step_avg:157.01ms step:1090/1480 train_time:169587ms step_avg:157.03ms step:1091/1480 train_time:169756ms step_avg:157.04ms step:1092/1480 train_time:169924ms step_avg:157.05ms step:1093/1480 train_time:170091ms step_avg:157.06ms step:1094/1480 train_time:170258ms step_avg:157.06ms step:1095/1480 train_time:170423ms step_avg:157.07ms step:1096/1480 train_time:170591ms step_avg:157.08ms step:1097/1480 train_time:170760ms step_avg:157.09ms step:1098/1480 train_time:170929ms step_avg:157.10ms step:1099/1480 train_time:171100ms step_avg:157.12ms step:1100/1480 train_time:171271ms step_avg:157.13ms step:1101/1480 train_time:171441ms step_avg:157.14ms step:1102/1480 train_time:171612ms step_avg:157.15ms step:1103/1480 train_time:171788ms 
step_avg:157.17ms step:1104/1480 train_time:171957ms step_avg:157.18ms step:1105/1480 train_time:172126ms step_avg:157.19ms step:1106/1480 train_time:172293ms step_avg:157.20ms step:1107/1480 train_time:172464ms step_avg:157.21ms step:1108/1480 train_time:172630ms step_avg:157.22ms step:1109/1480 train_time:172795ms step_avg:157.23ms step:1110/1480 train_time:172964ms step_avg:157.24ms step:1111/1480 train_time:173129ms step_avg:157.25ms step:1112/1480 train_time:173298ms step_avg:157.26ms step:1113/1480 train_time:173479ms step_avg:157.28ms step:1114/1480 train_time:173652ms step_avg:157.29ms step:1115/1480 train_time:173825ms step_avg:157.31ms step:1116/1480 train_time:173991ms step_avg:157.32ms step:1117/1480 train_time:174165ms step_avg:157.33ms step:1118/1480 train_time:174340ms step_avg:157.35ms step:1119/1480 train_time:174506ms step_avg:157.35ms step:1120/1480 train_time:174674ms step_avg:157.36ms step:1121/1480 train_time:174846ms step_avg:157.38ms step:1122/1480 train_time:175011ms step_avg:157.38ms step:1123/1480 train_time:175178ms step_avg:157.39ms step:1124/1480 train_time:175347ms step_avg:157.40ms step:1125/1480 train_time:175515ms step_avg:157.41ms step:1125/1480 val_loss:3.3840 train_time:175582ms step_avg:157.47ms step:1126/1480 train_time:175685ms step_avg:157.42ms step:1127/1480 train_time:175857ms step_avg:157.44ms step:1128/1480 train_time:176028ms step_avg:157.45ms step:1129/1480 train_time:176201ms step_avg:157.46ms step:1130/1480 train_time:176372ms step_avg:157.48ms step:1131/1480 train_time:176551ms step_avg:157.49ms step:1132/1480 train_time:176716ms step_avg:157.50ms step:1133/1480 train_time:176888ms step_avg:157.51ms step:1134/1480 train_time:177059ms step_avg:157.53ms step:1135/1480 train_time:177228ms step_avg:157.54ms step:1136/1480 train_time:177396ms step_avg:157.55ms step:1137/1480 train_time:177566ms step_avg:157.56ms step:1138/1480 train_time:177737ms step_avg:157.57ms step:1139/1480 train_time:177905ms step_avg:157.58ms 
step:1140/1480 train_time:178075ms step_avg:157.59ms step:1141/1480 train_time:178248ms step_avg:157.60ms step:1142/1480 train_time:178415ms step_avg:157.61ms step:1143/1480 train_time:178586ms step_avg:157.62ms step:1144/1480 train_time:178755ms step_avg:157.63ms step:1145/1480 train_time:178922ms step_avg:157.64ms step:1146/1480 train_time:179092ms step_avg:157.65ms step:1147/1480 train_time:179260ms step_avg:157.66ms step:1148/1480 train_time:179429ms step_avg:157.67ms step:1149/1480 train_time:179599ms step_avg:157.68ms step:1150/1480 train_time:179767ms step_avg:157.69ms step:1151/1480 train_time:179938ms step_avg:157.70ms step:1152/1480 train_time:180110ms step_avg:157.71ms step:1153/1480 train_time:180284ms step_avg:157.73ms step:1154/1480 train_time:180451ms step_avg:157.74ms step:1155/1480 train_time:180622ms step_avg:157.75ms step:1156/1480 train_time:180801ms step_avg:157.77ms step:1157/1480 train_time:180970ms step_avg:157.78ms step:1158/1480 train_time:181136ms step_avg:157.78ms step:1159/1480 train_time:181303ms step_avg:157.79ms step:1160/1480 train_time:181470ms step_avg:157.80ms step:1161/1480 train_time:181640ms step_avg:157.81ms step:1162/1480 train_time:181810ms step_avg:157.82ms step:1163/1480 train_time:181978ms step_avg:157.83ms step:1164/1480 train_time:182149ms step_avg:157.84ms step:1165/1480 train_time:182314ms step_avg:157.85ms step:1166/1480 train_time:182483ms step_avg:157.86ms step:1167/1480 train_time:182651ms step_avg:157.87ms step:1168/1480 train_time:182817ms step_avg:157.87ms step:1169/1480 train_time:182987ms step_avg:157.88ms step:1170/1480 train_time:183156ms step_avg:157.89ms step:1171/1480 train_time:183322ms step_avg:157.90ms step:1172/1480 train_time:183490ms step_avg:157.91ms step:1173/1480 train_time:183660ms step_avg:157.92ms step:1174/1480 train_time:183841ms step_avg:157.94ms step:1175/1480 train_time:184013ms step_avg:157.95ms step:1176/1480 train_time:184186ms step_avg:157.96ms step:1177/1480 train_time:184363ms 
step_avg:157.98ms step:1178/1480 train_time:184530ms step_avg:157.99ms step:1179/1480 train_time:184696ms step_avg:157.99ms step:1180/1480 train_time:184876ms step_avg:158.01ms step:1181/1480 train_time:185047ms step_avg:158.02ms step:1182/1480 train_time:185215ms step_avg:158.03ms step:1183/1480 train_time:185387ms step_avg:158.05ms step:1184/1480 train_time:185554ms step_avg:158.05ms step:1185/1480 train_time:185727ms step_avg:158.07ms step:1186/1480 train_time:185898ms step_avg:158.08ms step:1187/1480 train_time:186080ms step_avg:158.10ms step:1188/1480 train_time:186247ms step_avg:158.10ms step:1189/1480 train_time:186418ms step_avg:158.11ms step:1190/1480 train_time:186586ms step_avg:158.12ms step:1191/1480 train_time:186757ms step_avg:158.14ms step:1192/1480 train_time:186924ms step_avg:158.14ms step:1193/1480 train_time:187092ms step_avg:158.15ms step:1194/1480 train_time:187259ms step_avg:158.16ms step:1195/1480 train_time:187433ms step_avg:158.17ms step:1196/1480 train_time:187615ms step_avg:158.19ms step:1197/1480 train_time:187787ms step_avg:158.20ms step:1198/1480 train_time:187971ms step_avg:158.22ms step:1199/1480 train_time:188141ms step_avg:158.23ms step:1200/1480 train_time:188310ms step_avg:158.24ms step:1201/1480 train_time:188477ms step_avg:158.25ms step:1202/1480 train_time:188659ms step_avg:158.27ms step:1203/1480 train_time:188835ms step_avg:158.29ms step:1204/1480 train_time:189009ms step_avg:158.30ms step:1205/1480 train_time:189177ms step_avg:158.31ms step:1206/1480 train_time:189346ms step_avg:158.32ms step:1207/1480 train_time:189516ms step_avg:158.33ms step:1208/1480 train_time:189682ms step_avg:158.33ms step:1209/1480 train_time:189854ms step_avg:158.34ms step:1210/1480 train_time:190031ms step_avg:158.36ms step:1211/1480 train_time:190205ms step_avg:158.37ms step:1212/1480 train_time:190377ms step_avg:158.38ms step:1213/1480 train_time:190551ms step_avg:158.40ms step:1214/1480 train_time:190729ms step_avg:158.41ms step:1215/1480 
train_time:190901ms step_avg:158.42ms step:1216/1480 train_time:191072ms step_avg:158.43ms step:1217/1480 train_time:191246ms step_avg:158.45ms step:1218/1480 train_time:191418ms step_avg:158.46ms step:1219/1480 train_time:191597ms step_avg:158.48ms step:1220/1480 train_time:191766ms step_avg:158.48ms step:1221/1480 train_time:191934ms step_avg:158.49ms step:1222/1480 train_time:192100ms step_avg:158.50ms step:1223/1480 train_time:192272ms step_avg:158.51ms step:1224/1480 train_time:192451ms step_avg:158.53ms step:1225/1480 train_time:192623ms step_avg:158.54ms step:1226/1480 train_time:192795ms step_avg:158.55ms step:1227/1480 train_time:192967ms step_avg:158.56ms step:1228/1480 train_time:193137ms step_avg:158.57ms step:1229/1480 train_time:193311ms step_avg:158.58ms step:1230/1480 train_time:193494ms step_avg:158.60ms step:1231/1480 train_time:193671ms step_avg:158.62ms step:1232/1480 train_time:193847ms step_avg:158.63ms step:1233/1480 train_time:194016ms step_avg:158.64ms step:1234/1480 train_time:194187ms step_avg:158.65ms step:1235/1480 train_time:194362ms step_avg:158.66ms step:1236/1480 train_time:194530ms step_avg:158.67ms step:1237/1480 train_time:194701ms step_avg:158.68ms step:1238/1480 train_time:194886ms step_avg:158.70ms step:1239/1480 train_time:195057ms step_avg:158.71ms step:1240/1480 train_time:195227ms step_avg:158.72ms step:1241/1480 train_time:195401ms step_avg:158.73ms step:1242/1480 train_time:195571ms step_avg:158.74ms step:1243/1480 train_time:195743ms step_avg:158.75ms step:1244/1480 train_time:195909ms step_avg:158.76ms step:1245/1480 train_time:196078ms step_avg:158.77ms step:1246/1480 train_time:196249ms step_avg:158.78ms step:1247/1480 train_time:196417ms step_avg:158.79ms step:1248/1480 train_time:196588ms step_avg:158.79ms step:1249/1480 train_time:196756ms step_avg:158.80ms step:1250/1480 train_time:196925ms step_avg:158.81ms step:1250/1480 val_loss:3.3351 train_time:196997ms step_avg:158.87ms step:1251/1480 train_time:197106ms 
step_avg:158.83ms step:1252/1480 train_time:197276ms step_avg:158.84ms step:1253/1480 train_time:197444ms step_avg:158.84ms step:1254/1480 train_time:197617ms step_avg:158.86ms step:1255/1480 train_time:197802ms step_avg:158.88ms step:1256/1480 train_time:197975ms step_avg:158.89ms step:1257/1480 train_time:198144ms step_avg:158.90ms step:1258/1480 train_time:198321ms step_avg:158.91ms step:1259/1480 train_time:198493ms step_avg:158.92ms step:1260/1480 train_time:198660ms step_avg:158.93ms step:1261/1480 train_time:198833ms step_avg:158.94ms step:1262/1480 train_time:199010ms step_avg:158.95ms step:1263/1480 train_time:199182ms step_avg:158.96ms step:1264/1480 train_time:199349ms step_avg:158.97ms step:1265/1480 train_time:199517ms step_avg:158.98ms step:1266/1480 train_time:199689ms step_avg:158.99ms step:1267/1480 train_time:199858ms step_avg:159.00ms step:1268/1480 train_time:200030ms step_avg:159.01ms step:1269/1480 train_time:200206ms step_avg:159.02ms step:1270/1480 train_time:200376ms step_avg:159.03ms step:1271/1480 train_time:200546ms step_avg:159.04ms step:1272/1480 train_time:200711ms step_avg:159.04ms step:1273/1480 train_time:200881ms step_avg:159.05ms step:1274/1480 train_time:201054ms step_avg:159.06ms step:1275/1480 train_time:201222ms step_avg:159.07ms step:1276/1480 train_time:201388ms step_avg:159.07ms step:1277/1480 train_time:201559ms step_avg:159.08ms step:1278/1480 train_time:201727ms step_avg:159.09ms step:1279/1480 train_time:201899ms step_avg:159.10ms step:1280/1480 train_time:202077ms step_avg:159.12ms step:1281/1480 train_time:202246ms step_avg:159.12ms step:1282/1480 train_time:202412ms step_avg:159.13ms step:1283/1480 train_time:202582ms step_avg:159.14ms step:1284/1480 train_time:202752ms step_avg:159.15ms step:1285/1480 train_time:202921ms step_avg:159.15ms step:1286/1480 train_time:203091ms step_avg:159.16ms step:1287/1480 train_time:203262ms step_avg:159.17ms step:1288/1480 train_time:203434ms step_avg:159.18ms step:1289/1480 
train_time:203620ms step_avg:159.20ms step:1290/1480 train_time:203800ms step_avg:159.22ms step:1291/1480 train_time:203973ms step_avg:159.23ms step:1292/1480 train_time:204147ms step_avg:159.24ms step:1293/1480 train_time:204322ms step_avg:159.25ms step:1294/1480 train_time:204494ms step_avg:159.26ms step:1295/1480 train_time:204665ms step_avg:159.27ms step:1296/1480 train_time:204840ms step_avg:159.28ms step:1297/1480 train_time:205011ms step_avg:159.29ms step:1298/1480 train_time:205183ms step_avg:159.30ms step:1299/1480 train_time:205353ms step_avg:159.31ms step:1300/1480 train_time:205520ms step_avg:159.32ms step:1301/1480 train_time:205689ms step_avg:159.33ms step:1302/1480 train_time:205862ms step_avg:159.34ms step:1303/1480 train_time:206041ms step_avg:159.35ms step:1304/1480 train_time:206215ms step_avg:159.36ms step:1305/1480 train_time:206384ms step_avg:159.37ms step:1306/1480 train_time:206560ms step_avg:159.38ms step:1307/1480 train_time:206728ms step_avg:159.39ms step:1308/1480 train_time:206897ms step_avg:159.40ms step:1309/1480 train_time:207068ms step_avg:159.41ms step:1310/1480 train_time:207237ms step_avg:159.41ms step:1311/1480 train_time:207406ms step_avg:159.42ms step:1312/1480 train_time:207579ms step_avg:159.43ms step:1313/1480 train_time:207748ms step_avg:159.44ms step:1314/1480 train_time:207922ms step_avg:159.45ms step:1315/1480 train_time:208093ms step_avg:159.46ms step:1316/1480 train_time:208260ms step_avg:159.46ms step:1317/1480 train_time:208431ms step_avg:159.47ms step:1318/1480 train_time:208612ms step_avg:159.49ms step:1319/1480 train_time:208787ms step_avg:159.50ms step:1320/1480 train_time:208963ms step_avg:159.51ms step:1321/1480 train_time:209136ms step_avg:159.52ms step:1322/1480 train_time:209319ms step_avg:159.54ms step:1323/1480 train_time:209491ms step_avg:159.55ms step:1324/1480 train_time:209665ms step_avg:159.56ms step:1325/1480 train_time:209846ms step_avg:159.58ms step:1326/1480 train_time:210022ms step_avg:159.59ms 
step:1327/1480 train_time:210193ms step_avg:159.60ms step:1328/1480 train_time:210362ms step_avg:159.61ms step:1329/1480 train_time:210559ms step_avg:159.63ms step:1330/1480 train_time:210738ms step_avg:159.65ms step:1331/1480 train_time:210908ms step_avg:159.66ms step:1332/1480 train_time:211082ms step_avg:159.67ms step:1333/1480 train_time:211258ms step_avg:159.68ms step:1334/1480 train_time:211429ms step_avg:159.69ms step:1335/1480 train_time:211599ms step_avg:159.70ms step:1336/1480 train_time:211783ms step_avg:159.72ms step:1337/1480 train_time:211958ms step_avg:159.73ms step:1338/1480 train_time:212128ms step_avg:159.74ms step:1339/1480 train_time:212302ms step_avg:159.75ms step:1340/1480 train_time:212475ms step_avg:159.76ms step:1341/1480 train_time:212643ms step_avg:159.76ms step:1342/1480 train_time:212818ms step_avg:159.77ms step:1343/1480 train_time:212986ms step_avg:159.78ms step:1344/1480 train_time:213159ms step_avg:159.79ms step:1345/1480 train_time:213338ms step_avg:159.80ms step:1346/1480 train_time:213505ms step_avg:159.81ms step:1347/1480 train_time:213676ms step_avg:159.82ms step:1348/1480 train_time:213844ms step_avg:159.82ms step:1349/1480 train_time:214015ms step_avg:159.83ms step:1350/1480 train_time:214190ms step_avg:159.84ms step:1351/1480 train_time:214359ms step_avg:159.85ms step:1352/1480 train_time:214529ms step_avg:159.86ms step:1353/1480 train_time:214706ms step_avg:159.87ms step:1354/1480 train_time:214876ms step_avg:159.88ms step:1355/1480 train_time:215044ms step_avg:159.88ms step:1356/1480 train_time:215217ms step_avg:159.89ms step:1357/1480 train_time:215389ms step_avg:159.90ms step:1358/1480 train_time:215561ms step_avg:159.91ms step:1359/1480 train_time:215734ms step_avg:159.92ms step:1360/1480 train_time:215908ms step_avg:159.93ms step:1361/1480 train_time:216085ms step_avg:159.94ms step:1362/1480 train_time:216259ms step_avg:159.96ms step:1363/1480 train_time:216439ms step_avg:159.97ms step:1364/1480 train_time:216608ms 
step_avg:159.98ms step:1365/1480 train_time:216775ms step_avg:159.98ms step:1366/1480 train_time:216945ms step_avg:159.99ms step:1367/1480 train_time:217117ms step_avg:160.00ms step:1368/1480 train_time:217291ms step_avg:160.01ms step:1369/1480 train_time:217473ms step_avg:160.02ms step:1370/1480 train_time:217651ms step_avg:160.04ms step:1371/1480 train_time:217822ms step_avg:160.05ms step:1372/1480 train_time:218000ms step_avg:160.06ms step:1373/1480 train_time:218169ms step_avg:160.07ms step:1374/1480 train_time:218345ms step_avg:160.08ms step:1375/1480 train_time:218517ms step_avg:160.09ms step:1375/1480 val_loss:3.2965 train_time:218585ms step_avg:160.14ms step:1376/1480 train_time:218694ms step_avg:160.10ms step:1377/1480 train_time:218866ms step_avg:160.11ms step:1378/1480 train_time:219034ms step_avg:160.11ms step:1379/1480 train_time:219208ms step_avg:160.12ms step:1380/1480 train_time:219382ms step_avg:160.13ms step:1381/1480 train_time:219565ms step_avg:160.15ms step:1382/1480 train_time:219736ms step_avg:160.16ms step:1383/1480 train_time:219908ms step_avg:160.17ms step:1384/1480 train_time:220085ms step_avg:160.18ms step:1385/1480 train_time:220251ms step_avg:160.18ms step:1386/1480 train_time:220421ms step_avg:160.19ms step:1387/1480 train_time:220593ms step_avg:160.20ms step:1388/1480 train_time:220761ms step_avg:160.20ms step:1389/1480 train_time:220934ms step_avg:160.21ms step:1390/1480 train_time:221101ms step_avg:160.22ms step:1391/1480 train_time:221272ms step_avg:160.23ms step:1392/1480 train_time:221443ms step_avg:160.23ms step:1393/1480 train_time:221615ms step_avg:160.24ms step:1394/1480 train_time:221786ms step_avg:160.25ms step:1395/1480 train_time:221955ms step_avg:160.26ms step:1396/1480 train_time:222124ms step_avg:160.26ms step:1397/1480 train_time:222292ms step_avg:160.27ms step:1398/1480 train_time:222458ms step_avg:160.27ms step:1399/1480 train_time:222628ms step_avg:160.28ms step:1400/1480 train_time:222805ms step_avg:160.29ms 
step:1401/1480 train_time:222972ms step_avg:160.30ms step:1402/1480 train_time:223143ms step_avg:160.30ms step:1403/1480 train_time:223320ms step_avg:160.32ms step:1404/1480 train_time:223491ms step_avg:160.32ms step:1405/1480 train_time:223666ms step_avg:160.33ms step:1406/1480 train_time:223841ms step_avg:160.34ms step:1407/1480 train_time:224009ms step_avg:160.35ms step:1408/1480 train_time:224177ms step_avg:160.36ms step:1409/1480 train_time:224360ms step_avg:160.37ms step:1410/1480 train_time:224530ms step_avg:160.38ms step:1411/1480 train_time:224697ms step_avg:160.38ms step:1412/1480 train_time:224867ms step_avg:160.39ms step:1413/1480 train_time:225036ms step_avg:160.40ms step:1414/1480 train_time:225208ms step_avg:160.40ms step:1415/1480 train_time:225381ms step_avg:160.41ms step:1416/1480 train_time:225569ms step_avg:160.43ms step:1417/1480 train_time:225741ms step_avg:160.44ms step:1418/1480 train_time:225913ms step_avg:160.45ms step:1419/1480 train_time:226088ms step_avg:160.46ms step:1420/1480 train_time:226262ms step_avg:160.47ms step:1421/1480 train_time:226435ms step_avg:160.48ms step:1422/1480 train_time:226609ms step_avg:160.49ms step:1423/1480 train_time:226778ms step_avg:160.49ms step:1424/1480 train_time:226956ms step_avg:160.51ms step:1425/1480 train_time:227136ms step_avg:160.52ms step:1426/1480 train_time:227308ms step_avg:160.53ms step:1427/1480 train_time:227483ms step_avg:160.54ms step:1428/1480 train_time:227655ms step_avg:160.55ms step:1429/1480 train_time:227824ms step_avg:160.55ms step:1430/1480 train_time:227997ms step_avg:160.56ms step:1431/1480 train_time:228171ms step_avg:160.57ms step:1432/1480 train_time:228347ms step_avg:160.58ms step:1433/1480 train_time:228527ms step_avg:160.60ms step:1434/1480 train_time:228707ms step_avg:160.61ms step:1435/1480 train_time:228882ms step_avg:160.62ms step:1436/1480 train_time:229055ms step_avg:160.63ms step:1437/1480 train_time:229226ms step_avg:160.63ms step:1438/1480 train_time:229394ms 
step_avg:160.64ms step:1439/1480 train_time:229569ms step_avg:160.65ms step:1440/1480 train_time:229739ms step_avg:160.66ms step:1441/1480 train_time:229911ms step_avg:160.66ms step:1442/1480 train_time:230090ms step_avg:160.68ms step:1443/1480 train_time:230280ms step_avg:160.70ms step:1444/1480 train_time:230452ms step_avg:160.71ms step:1445/1480 train_time:230623ms step_avg:160.71ms step:1446/1480 train_time:230798ms step_avg:160.72ms step:1447/1480 train_time:230976ms step_avg:160.73ms step:1448/1480 train_time:231150ms step_avg:160.74ms step:1449/1480 train_time:231325ms step_avg:160.75ms step:1450/1480 train_time:231497ms step_avg:160.76ms step:1451/1480 train_time:231669ms step_avg:160.77ms step:1452/1480 train_time:231841ms step_avg:160.78ms step:1453/1480 train_time:232011ms step_avg:160.78ms step:1454/1480 train_time:232181ms step_avg:160.79ms step:1455/1480 train_time:232359ms step_avg:160.80ms step:1456/1480 train_time:232533ms step_avg:160.81ms step:1457/1480 train_time:232702ms step_avg:160.82ms step:1458/1480 train_time:232873ms step_avg:160.82ms step:1459/1480 train_time:233051ms step_avg:160.84ms step:1460/1480 train_time:233223ms step_avg:160.84ms step:1461/1480 train_time:233397ms step_avg:160.85ms step:1462/1480 train_time:233568ms step_avg:160.86ms step:1463/1480 train_time:233744ms step_avg:160.87ms step:1464/1480 train_time:233919ms step_avg:160.88ms step:1465/1480 train_time:234092ms step_avg:160.89ms step:1466/1480 train_time:234263ms step_avg:160.89ms step:1467/1480 train_time:234436ms step_avg:160.90ms step:1468/1480 train_time:234608ms step_avg:160.91ms step:1469/1480 train_time:234780ms step_avg:160.92ms step:1470/1480 train_time:234959ms step_avg:160.93ms step:1471/1480 train_time:235144ms step_avg:160.95ms step:1472/1480 train_time:235328ms step_avg:160.96ms step:1473/1480 train_time:235499ms step_avg:160.97ms step:1474/1480 train_time:235677ms step_avg:160.98ms step:1475/1480 train_time:235857ms step_avg:160.99ms step:1476/1480 
train_time:236029ms step_avg:161.00ms step:1477/1480 train_time:236212ms step_avg:161.02ms step:1478/1480 train_time:236395ms step_avg:161.03ms step:1479/1480 train_time:236568ms step_avg:161.04ms step:1480/1480 train_time:236740ms step_avg:161.05ms step:1480/1480 val_loss:3.2774 train_time:236813ms step_avg:161.10ms