import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no weight is passed to rms_norm)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free nn.Linear whose weight is cast to the input's dtype on every forward call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding.

    `forward` takes the sequence length from axis 1 of its input and splits axis 3 into two
    halves that are rotated against each other. The cos/sin tables are cached and rebuilt
    only when the sequence length changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of channels: base ** (-2i / dim)
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        # NOTE(review): the cache is keyed on sequence length only, not on device or dtype
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # broadcast the (seq, dim/2) tables over axes 0 and 2 of x
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention computed with FlexAttention.

    Queries and keys are RMS-normalized and rotary-embedded; values are a learned mix of the
    projected values and an externally supplied value tensor `vi`. The masking pattern comes
    entirely from the `block_mask` passed to `forward`.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        # project and split the channel axis into (n_head, head_dim)
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim), hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim, squared ReLU, 4*dim -> dim (output projection zero-initialized)."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc   = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer.

    The input is first mixed with `x0` using two learned scalars (initialized to 1 and 0),
    then attention and the MLP are applied as residual branches on the normalized activations.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    # Model hyperparameters consumed by GPT and Block.
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """
    GPT model with a U-net layout: the first half of the layers ("encoder") store their
    outputs, and the second half ("decoder") add them back in reverse order, scaled by
    learned skip weights. `forward` returns the cross-entropy loss directly.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the training loss for one flat token sequence.

        idx: 1-D tensor of token ids (asserted); its length must be divisible by
            BLOCK_SIZE for the reshapes below to succeed.
        target: token ids compared against the logits with cross-entropy.
        sliding_window: attention window length in tokens, passed as a tensor.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token id 50256, used as a per-token document id
        # (presumably the GPT-2 end-of-text token; confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        # first / last document id inside each BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal, same document, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the mask above, built by hand and handed to BlockMask.from_kv_blocks
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can interact only if their document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window length converted from tokens to blocks, rounded up
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort moves the kept kv-block indices to the front of each row, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add leading singleton axes (two of them) in front of the block axes
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding chunk per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        # soft-cap the logits to (-lm_head_softcap, lm_head_softcap)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it declares."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the header of `file` into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the 256 x int32 header
        # read straight into the tensor's memory through its numpy view
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over token shards for one process of a multi-process job.

    Within a shard, this process reads T+1 tokens starting at `process_rank * T` and then
    moves forward by `T * num_processes` tokens per batch. Shards are visited in sorted
    filename order and wrap around after the last one.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full global batch plus one token
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    # NOTE(review): with makedirs commented out, nothing here creates the 'logs' directory
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, also print it."""
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop below runs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches the next one after each backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# CastedLinear modules are converted back to float32; they cast to the activation dtype in forward
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# embeddings and lm_head use Adam; 2-D weights inside the blocks use Muon; the remaining
# (<2-D) block parameters and the skip weights use Adam
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the multiplier applied to each optimizer's base lr at iteration `it`."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        # NOTE(review): this branch divides by cooldown_iters, so cooldown_iters == 0 would raise here
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size is held in a CUDA tensor and updated in place with copy_ inside the loop
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # (grows linearly from 64 at step 0 to 1792 at the final step, rounded down to a multiple of 64)
    sw_size =  64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average across processes, then across validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE(review): the torch.save call below is commented out, so `log` is built but never written
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # (linear from 0.85 to 0.95 over the first 300 steps)
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 10:36:55 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 130W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 101W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 106W / 700W | 35MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23838ms step_avg:nanms step:2/1480 train_time:24012ms step_avg:nanms step:3/1480 train_time:24151ms step_avg:nanms step:4/1480 train_time:24292ms step_avg:nanms step:5/1480 train_time:24434ms step_avg:nanms step:6/1480 train_time:24575ms step_avg:nanms step:7/1480 train_time:24716ms step_avg:nanms step:8/1480 train_time:24858ms step_avg:nanms step:9/1480 train_time:25005ms step_avg:nanms step:10/1480 train_time:25147ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:427ms step_avg:142.17ms step:14/1480 train_time:569ms step_avg:142.37ms step:15/1480 train_time:712ms step_avg:142.40ms step:16/1480 train_time:854ms step_avg:142.36ms step:17/1480 train_time:997ms step_avg:142.41ms step:18/1480 train_time:1139ms step_avg:142.39ms step:19/1480 train_time:1281ms step_avg:142.37ms step:20/1480 train_time:1424ms step_avg:142.39ms step:21/1480 train_time:1567ms step_avg:142.46ms step:22/1480 train_time:1710ms step_avg:142.50ms step:23/1480 train_time:1854ms step_avg:142.58ms step:24/1480 train_time:1996ms step_avg:142.57ms step:25/1480 train_time:2138ms step_avg:142.50ms step:26/1480 train_time:2280ms step_avg:142.48ms step:27/1480 train_time:2421ms step_avg:142.44ms step:28/1480 train_time:2564ms step_avg:142.44ms step:29/1480 
train_time:2706ms step_avg:142.43ms step:30/1480 train_time:2849ms step_avg:142.45ms step:31/1480 train_time:2992ms step_avg:142.50ms step:32/1480 train_time:3135ms step_avg:142.50ms step:33/1480 train_time:3279ms step_avg:142.56ms step:34/1480 train_time:3421ms step_avg:142.55ms step:35/1480 train_time:3565ms step_avg:142.58ms step:36/1480 train_time:3707ms step_avg:142.59ms step:37/1480 train_time:3851ms step_avg:142.65ms step:38/1480 train_time:3995ms step_avg:142.69ms step:39/1480 train_time:4137ms step_avg:142.66ms step:40/1480 train_time:4281ms step_avg:142.70ms step:41/1480 train_time:4423ms step_avg:142.68ms step:42/1480 train_time:4564ms step_avg:142.63ms step:43/1480 train_time:4706ms step_avg:142.62ms step:44/1480 train_time:4851ms step_avg:142.67ms step:45/1480 train_time:4995ms step_avg:142.72ms step:46/1480 train_time:5136ms step_avg:142.68ms step:47/1480 train_time:5279ms step_avg:142.67ms step:48/1480 train_time:5419ms step_avg:142.62ms step:49/1480 train_time:5562ms step_avg:142.61ms step:50/1480 train_time:5705ms step_avg:142.63ms step:51/1480 train_time:5850ms step_avg:142.67ms step:52/1480 train_time:5993ms step_avg:142.70ms step:53/1480 train_time:6136ms step_avg:142.70ms step:54/1480 train_time:6280ms step_avg:142.73ms step:55/1480 train_time:6422ms step_avg:142.71ms step:56/1480 train_time:6563ms step_avg:142.67ms step:57/1480 train_time:6704ms step_avg:142.64ms step:58/1480 train_time:6850ms step_avg:142.72ms step:59/1480 train_time:6993ms step_avg:142.72ms step:60/1480 train_time:7135ms step_avg:142.70ms step:61/1480 train_time:7279ms step_avg:142.72ms step:62/1480 train_time:7422ms step_avg:142.73ms step:63/1480 train_time:7566ms step_avg:142.75ms step:64/1480 train_time:7710ms step_avg:142.78ms step:65/1480 train_time:7854ms step_avg:142.80ms step:66/1480 train_time:7997ms step_avg:142.80ms step:67/1480 train_time:8138ms step_avg:142.77ms step:68/1480 train_time:8281ms step_avg:142.78ms step:69/1480 train_time:8423ms step_avg:142.76ms 
step:70/1480 train_time:8566ms step_avg:142.77ms step:71/1480 train_time:8709ms step_avg:142.77ms step:72/1480 train_time:8854ms step_avg:142.81ms step:73/1480 train_time:8997ms step_avg:142.82ms step:74/1480 train_time:9140ms step_avg:142.81ms step:75/1480 train_time:9281ms step_avg:142.79ms step:76/1480 train_time:9424ms step_avg:142.78ms step:77/1480 train_time:9566ms step_avg:142.77ms step:78/1480 train_time:9708ms step_avg:142.77ms step:79/1480 train_time:9852ms step_avg:142.78ms step:80/1480 train_time:9994ms step_avg:142.77ms step:81/1480 train_time:10136ms step_avg:142.76ms step:82/1480 train_time:10279ms step_avg:142.76ms step:83/1480 train_time:10420ms step_avg:142.73ms step:84/1480 train_time:10561ms step_avg:142.71ms step:85/1480 train_time:10702ms step_avg:142.69ms step:86/1480 train_time:10845ms step_avg:142.69ms step:87/1480 train_time:10989ms step_avg:142.71ms step:88/1480 train_time:11132ms step_avg:142.72ms step:89/1480 train_time:11276ms step_avg:142.73ms step:90/1480 train_time:11418ms step_avg:142.72ms step:91/1480 train_time:11560ms step_avg:142.72ms step:92/1480 train_time:11701ms step_avg:142.69ms step:93/1480 train_time:11843ms step_avg:142.68ms step:94/1480 train_time:11984ms step_avg:142.67ms step:95/1480 train_time:12128ms step_avg:142.68ms step:96/1480 train_time:12271ms step_avg:142.68ms step:97/1480 train_time:12413ms step_avg:142.68ms step:98/1480 train_time:12554ms step_avg:142.66ms step:99/1480 train_time:12697ms step_avg:142.66ms step:100/1480 train_time:12839ms step_avg:142.66ms step:101/1480 train_time:12980ms step_avg:142.64ms step:102/1480 train_time:13121ms step_avg:142.62ms step:103/1480 train_time:13264ms step_avg:142.62ms step:104/1480 train_time:13406ms step_avg:142.61ms step:105/1480 train_time:13548ms step_avg:142.61ms step:106/1480 train_time:13691ms step_avg:142.62ms step:107/1480 train_time:13833ms step_avg:142.61ms step:108/1480 train_time:13977ms step_avg:142.62ms step:109/1480 train_time:14118ms step_avg:142.61ms 
step:110/1480 train_time:14260ms step_avg:142.60ms step:111/1480 train_time:14404ms step_avg:142.61ms step:112/1480 train_time:14552ms step_avg:142.67ms step:113/1480 train_time:14700ms step_avg:142.71ms step:114/1480 train_time:14845ms step_avg:142.74ms step:115/1480 train_time:14992ms step_avg:142.78ms step:116/1480 train_time:15138ms step_avg:142.81ms step:117/1480 train_time:15286ms step_avg:142.86ms step:118/1480 train_time:15432ms step_avg:142.89ms step:119/1480 train_time:15580ms step_avg:142.94ms step:120/1480 train_time:15727ms step_avg:142.97ms step:121/1480 train_time:15876ms step_avg:143.02ms step:122/1480 train_time:16022ms step_avg:143.05ms step:123/1480 train_time:16170ms step_avg:143.09ms step:124/1480 train_time:16316ms step_avg:143.12ms step:125/1480 train_time:16463ms step_avg:143.15ms step:125/1480 val_loss:4.4247 train_time:16519ms step_avg:143.65ms step:126/1480 train_time:16614ms step_avg:143.23ms step:127/1480 train_time:16764ms step_avg:143.28ms step:128/1480 train_time:16911ms step_avg:143.31ms step:129/1480 train_time:17056ms step_avg:143.32ms step:130/1480 train_time:17202ms step_avg:143.35ms step:131/1480 train_time:17348ms step_avg:143.37ms step:132/1480 train_time:17493ms step_avg:143.38ms step:133/1480 train_time:17639ms step_avg:143.41ms step:134/1480 train_time:17788ms step_avg:143.45ms step:135/1480 train_time:17934ms step_avg:143.47ms step:136/1480 train_time:18080ms step_avg:143.49ms step:137/1480 train_time:18227ms step_avg:143.52ms step:138/1480 train_time:18372ms step_avg:143.53ms step:139/1480 train_time:18519ms step_avg:143.56ms step:140/1480 train_time:18668ms step_avg:143.60ms step:141/1480 train_time:18816ms step_avg:143.63ms step:142/1480 train_time:18964ms step_avg:143.66ms step:143/1480 train_time:19111ms step_avg:143.69ms step:144/1480 train_time:19256ms step_avg:143.70ms step:145/1480 train_time:19402ms step_avg:143.72ms step:146/1480 train_time:19549ms step_avg:143.74ms step:147/1480 train_time:19695ms 
step_avg:143.76ms step:148/1480 train_time:19844ms step_avg:143.80ms step:149/1480 train_time:19992ms step_avg:143.83ms step:150/1480 train_time:20140ms step_avg:143.86ms step:151/1480 train_time:20287ms step_avg:143.88ms step:152/1480 train_time:20433ms step_avg:143.89ms step:153/1480 train_time:20580ms step_avg:143.92ms step:154/1480 train_time:20727ms step_avg:143.94ms step:155/1480 train_time:20873ms step_avg:143.95ms step:156/1480 train_time:21021ms step_avg:143.98ms step:157/1480 train_time:21168ms step_avg:144.00ms step:158/1480 train_time:21314ms step_avg:144.01ms step:159/1480 train_time:21461ms step_avg:144.04ms step:160/1480 train_time:21608ms step_avg:144.05ms step:161/1480 train_time:21753ms step_avg:144.06ms step:162/1480 train_time:21900ms step_avg:144.08ms step:163/1480 train_time:22047ms step_avg:144.10ms step:164/1480 train_time:22194ms step_avg:144.11ms step:165/1480 train_time:22340ms step_avg:144.13ms step:166/1480 train_time:22488ms step_avg:144.15ms step:167/1480 train_time:22634ms step_avg:144.16ms step:168/1480 train_time:22783ms step_avg:144.19ms step:169/1480 train_time:22930ms step_avg:144.21ms step:170/1480 train_time:23077ms step_avg:144.23ms step:171/1480 train_time:23225ms step_avg:144.25ms step:172/1480 train_time:23371ms step_avg:144.27ms step:173/1480 train_time:23517ms step_avg:144.28ms step:174/1480 train_time:23664ms step_avg:144.29ms step:175/1480 train_time:23810ms step_avg:144.30ms step:176/1480 train_time:23956ms step_avg:144.31ms step:177/1480 train_time:24104ms step_avg:144.34ms step:178/1480 train_time:24250ms step_avg:144.35ms step:179/1480 train_time:24396ms step_avg:144.35ms step:180/1480 train_time:24543ms step_avg:144.37ms step:181/1480 train_time:24691ms step_avg:144.39ms step:182/1480 train_time:24837ms step_avg:144.40ms step:183/1480 train_time:24985ms step_avg:144.42ms step:184/1480 train_time:25131ms step_avg:144.43ms step:185/1480 train_time:25279ms step_avg:144.45ms step:186/1480 train_time:25426ms 
step_avg:144.47ms step:187/1480 train_time:25572ms step_avg:144.47ms step:188/1480 train_time:25720ms step_avg:144.49ms step:189/1480 train_time:25867ms step_avg:144.51ms step:190/1480 train_time:26013ms step_avg:144.52ms step:191/1480 train_time:26161ms step_avg:144.54ms step:192/1480 train_time:26308ms step_avg:144.55ms step:193/1480 train_time:26454ms step_avg:144.56ms step:194/1480 train_time:26602ms step_avg:144.57ms step:195/1480 train_time:26749ms step_avg:144.59ms step:196/1480 train_time:26895ms step_avg:144.60ms step:197/1480 train_time:27043ms step_avg:144.61ms step:198/1480 train_time:27190ms step_avg:144.63ms step:199/1480 train_time:27335ms step_avg:144.63ms step:200/1480 train_time:27484ms step_avg:144.65ms step:201/1480 train_time:27631ms step_avg:144.66ms step:202/1480 train_time:27776ms step_avg:144.67ms step:203/1480 train_time:27924ms step_avg:144.69ms step:204/1480 train_time:28071ms step_avg:144.69ms step:205/1480 train_time:28217ms step_avg:144.71ms step:206/1480 train_time:28366ms step_avg:144.72ms step:207/1480 train_time:28513ms step_avg:144.73ms step:208/1480 train_time:28659ms step_avg:144.74ms step:209/1480 train_time:28805ms step_avg:144.75ms step:210/1480 train_time:28951ms step_avg:144.76ms step:211/1480 train_time:29098ms step_avg:144.76ms step:212/1480 train_time:29245ms step_avg:144.78ms step:213/1480 train_time:29393ms step_avg:144.79ms step:214/1480 train_time:29541ms step_avg:144.81ms step:215/1480 train_time:29688ms step_avg:144.82ms step:216/1480 train_time:29834ms step_avg:144.83ms step:217/1480 train_time:29981ms step_avg:144.84ms step:218/1480 train_time:30128ms step_avg:144.85ms step:219/1480 train_time:30274ms step_avg:144.85ms step:220/1480 train_time:30420ms step_avg:144.86ms step:221/1480 train_time:30569ms step_avg:144.88ms step:222/1480 train_time:30721ms step_avg:144.91ms step:223/1480 train_time:30872ms step_avg:144.94ms step:224/1480 train_time:31023ms step_avg:144.97ms step:225/1480 train_time:31173ms 
step_avg:144.99ms step:226/1480 train_time:31325ms step_avg:145.02ms step:227/1480 train_time:31475ms step_avg:145.05ms step:228/1480 train_time:31627ms step_avg:145.08ms step:229/1480 train_time:31777ms step_avg:145.10ms step:230/1480 train_time:31928ms step_avg:145.13ms step:231/1480 train_time:32078ms step_avg:145.15ms step:232/1480 train_time:32229ms step_avg:145.18ms step:233/1480 train_time:32378ms step_avg:145.19ms step:234/1480 train_time:32530ms step_avg:145.22ms step:235/1480 train_time:32681ms step_avg:145.25ms step:236/1480 train_time:32832ms step_avg:145.27ms step:237/1480 train_time:32983ms step_avg:145.30ms step:238/1480 train_time:33133ms step_avg:145.32ms step:239/1480 train_time:33284ms step_avg:145.35ms step:240/1480 train_time:33433ms step_avg:145.36ms step:241/1480 train_time:33585ms step_avg:145.39ms step:242/1480 train_time:33735ms step_avg:145.41ms step:243/1480 train_time:33886ms step_avg:145.43ms step:244/1480 train_time:34036ms step_avg:145.45ms step:245/1480 train_time:34188ms step_avg:145.48ms step:246/1480 train_time:34337ms step_avg:145.50ms step:247/1480 train_time:34490ms step_avg:145.53ms step:248/1480 train_time:34641ms step_avg:145.55ms step:249/1480 train_time:34791ms step_avg:145.57ms step:250/1480 train_time:34940ms step_avg:145.58ms step:250/1480 val_loss:3.9944 train_time:34999ms step_avg:145.83ms step:251/1480 train_time:35096ms step_avg:145.63ms step:252/1480 train_time:35247ms step_avg:145.65ms step:253/1480 train_time:35398ms step_avg:145.67ms step:254/1480 train_time:35547ms step_avg:145.68ms step:255/1480 train_time:35696ms step_avg:145.70ms step:256/1480 train_time:35845ms step_avg:145.71ms step:257/1480 train_time:35996ms step_avg:145.73ms step:258/1480 train_time:36148ms step_avg:145.76ms step:259/1480 train_time:36300ms step_avg:145.78ms step:260/1480 train_time:36450ms step_avg:145.80ms step:261/1480 train_time:36600ms step_avg:145.82ms step:262/1480 train_time:36749ms step_avg:145.83ms step:263/1480 
train_time:36899ms step_avg:145.85ms step:264/1480 train_time:37048ms step_avg:145.86ms step:265/1480 train_time:37202ms step_avg:145.89ms step:266/1480 train_time:37352ms step_avg:145.91ms step:267/1480 train_time:37503ms step_avg:145.92ms step:268/1480 train_time:37651ms step_avg:145.94ms step:269/1480 train_time:37801ms step_avg:145.95ms step:270/1480 train_time:37951ms step_avg:145.96ms step:271/1480 train_time:38102ms step_avg:145.98ms step:272/1480 train_time:38251ms step_avg:146.00ms step:273/1480 train_time:38403ms step_avg:146.02ms step:274/1480 train_time:38553ms step_avg:146.03ms step:275/1480 train_time:38704ms step_avg:146.05ms step:276/1480 train_time:38853ms step_avg:146.06ms step:277/1480 train_time:39004ms step_avg:146.08ms step:278/1480 train_time:39154ms step_avg:146.10ms step:279/1480 train_time:39305ms step_avg:146.12ms step:280/1480 train_time:39456ms step_avg:146.13ms step:281/1480 train_time:39606ms step_avg:146.15ms step:282/1480 train_time:39758ms step_avg:146.17ms step:283/1480 train_time:39908ms step_avg:146.18ms step:284/1480 train_time:40059ms step_avg:146.20ms step:285/1480 train_time:40208ms step_avg:146.21ms step:286/1480 train_time:40359ms step_avg:146.23ms step:287/1480 train_time:40509ms step_avg:146.24ms step:288/1480 train_time:40659ms step_avg:146.26ms step:289/1480 train_time:40808ms step_avg:146.27ms step:290/1480 train_time:40959ms step_avg:146.28ms step:291/1480 train_time:41110ms step_avg:146.30ms step:292/1480 train_time:41261ms step_avg:146.31ms step:293/1480 train_time:41411ms step_avg:146.33ms step:294/1480 train_time:41561ms step_avg:146.34ms step:295/1480 train_time:41710ms step_avg:146.35ms step:296/1480 train_time:41860ms step_avg:146.36ms step:297/1480 train_time:42009ms step_avg:146.37ms step:298/1480 train_time:42160ms step_avg:146.39ms step:299/1480 train_time:42309ms step_avg:146.40ms step:300/1480 train_time:42460ms step_avg:146.41ms step:301/1480 train_time:42610ms step_avg:146.43ms step:302/1480 
train_time:42761ms step_avg:146.44ms step:303/1480 train_time:42911ms step_avg:146.45ms step:304/1480 train_time:43062ms step_avg:146.47ms step:305/1480 train_time:43212ms step_avg:146.48ms step:306/1480 train_time:43362ms step_avg:146.49ms step:307/1480 train_time:43514ms step_avg:146.51ms step:308/1480 train_time:43664ms step_avg:146.52ms step:309/1480 train_time:43815ms step_avg:146.54ms step:310/1480 train_time:43965ms step_avg:146.55ms step:311/1480 train_time:44117ms step_avg:146.57ms step:312/1480 train_time:44266ms step_avg:146.58ms step:313/1480 train_time:44419ms step_avg:146.60ms step:314/1480 train_time:44569ms step_avg:146.61ms step:315/1480 train_time:44719ms step_avg:146.62ms step:316/1480 train_time:44868ms step_avg:146.63ms step:317/1480 train_time:45020ms step_avg:146.64ms step:318/1480 train_time:45168ms step_avg:146.65ms step:319/1480 train_time:45320ms step_avg:146.67ms step:320/1480 train_time:45469ms step_avg:146.67ms step:321/1480 train_time:45621ms step_avg:146.69ms step:322/1480 train_time:45770ms step_avg:146.70ms step:323/1480 train_time:45920ms step_avg:146.71ms step:324/1480 train_time:46070ms step_avg:146.72ms step:325/1480 train_time:46221ms step_avg:146.73ms step:326/1480 train_time:46369ms step_avg:146.74ms step:327/1480 train_time:46521ms step_avg:146.75ms step:328/1480 train_time:46670ms step_avg:146.76ms step:329/1480 train_time:46822ms step_avg:146.78ms step:330/1480 train_time:46973ms step_avg:146.79ms step:331/1480 train_time:47127ms step_avg:146.81ms step:332/1480 train_time:47281ms step_avg:146.83ms step:333/1480 train_time:47436ms step_avg:146.86ms step:334/1480 train_time:47588ms step_avg:146.88ms step:335/1480 train_time:47741ms step_avg:146.90ms step:336/1480 train_time:47895ms step_avg:146.92ms step:337/1480 train_time:48050ms step_avg:146.94ms step:338/1480 train_time:48205ms step_avg:146.97ms step:339/1480 train_time:48357ms step_avg:146.98ms step:340/1480 train_time:48512ms step_avg:147.00ms step:341/1480 
train_time:48665ms step_avg:147.03ms step:342/1480 train_time:48820ms step_avg:147.05ms step:343/1480 train_time:48974ms step_avg:147.07ms step:344/1480 train_time:49128ms step_avg:147.09ms step:345/1480 train_time:49281ms step_avg:147.11ms step:346/1480 train_time:49435ms step_avg:147.13ms step:347/1480 train_time:49589ms step_avg:147.15ms step:348/1480 train_time:49743ms step_avg:147.17ms step:349/1480 train_time:49898ms step_avg:147.19ms step:350/1480 train_time:50052ms step_avg:147.21ms step:351/1480 train_time:50207ms step_avg:147.24ms step:352/1480 train_time:50361ms step_avg:147.25ms step:353/1480 train_time:50516ms step_avg:147.28ms step:354/1480 train_time:50669ms step_avg:147.29ms step:355/1480 train_time:50822ms step_avg:147.31ms step:356/1480 train_time:50976ms step_avg:147.33ms step:357/1480 train_time:51129ms step_avg:147.34ms step:358/1480 train_time:51282ms step_avg:147.36ms step:359/1480 train_time:51436ms step_avg:147.38ms step:360/1480 train_time:51592ms step_avg:147.41ms step:361/1480 train_time:51746ms step_avg:147.43ms step:362/1480 train_time:51900ms step_avg:147.44ms step:363/1480 train_time:52055ms step_avg:147.47ms step:364/1480 train_time:52209ms step_avg:147.48ms step:365/1480 train_time:52362ms step_avg:147.50ms step:366/1480 train_time:52516ms step_avg:147.52ms step:367/1480 train_time:52669ms step_avg:147.53ms step:368/1480 train_time:52824ms step_avg:147.55ms step:369/1480 train_time:52976ms step_avg:147.57ms step:370/1480 train_time:53130ms step_avg:147.58ms step:371/1480 train_time:53284ms step_avg:147.60ms step:372/1480 train_time:53438ms step_avg:147.62ms step:373/1480 train_time:53593ms step_avg:147.64ms step:374/1480 train_time:53746ms step_avg:147.65ms step:375/1480 train_time:53901ms step_avg:147.67ms step:375/1480 val_loss:3.8076 train_time:53961ms step_avg:147.84ms step:376/1480 train_time:54058ms step_avg:147.70ms step:377/1480 train_time:54213ms step_avg:147.72ms step:378/1480 train_time:54366ms step_avg:147.73ms 
step:379/1480 train_time:54519ms step_avg:147.75ms step:380/1480 train_time:54671ms step_avg:147.76ms step:381/1480 train_time:54824ms step_avg:147.77ms step:382/1480 train_time:54979ms step_avg:147.79ms step:383/1480 train_time:55135ms step_avg:147.82ms step:384/1480 train_time:55291ms step_avg:147.84ms step:385/1480 train_time:55444ms step_avg:147.85ms step:386/1480 train_time:55597ms step_avg:147.86ms step:387/1480 train_time:55749ms step_avg:147.88ms step:388/1480 train_time:55903ms step_avg:147.89ms step:389/1480 train_time:56057ms step_avg:147.91ms step:390/1480 train_time:56212ms step_avg:147.93ms step:391/1480 train_time:56367ms step_avg:147.94ms step:392/1480 train_time:56520ms step_avg:147.96ms step:393/1480 train_time:56673ms step_avg:147.97ms step:394/1480 train_time:56826ms step_avg:147.98ms step:395/1480 train_time:56979ms step_avg:148.00ms step:396/1480 train_time:57132ms step_avg:148.01ms step:397/1480 train_time:57287ms step_avg:148.03ms step:398/1480 train_time:57440ms step_avg:148.04ms step:399/1480 train_time:57595ms step_avg:148.06ms step:400/1480 train_time:57749ms step_avg:148.08ms step:401/1480 train_time:57903ms step_avg:148.09ms step:402/1480 train_time:58056ms step_avg:148.10ms step:403/1480 train_time:58210ms step_avg:148.12ms step:404/1480 train_time:58365ms step_avg:148.14ms step:405/1480 train_time:58519ms step_avg:148.15ms step:406/1480 train_time:58674ms step_avg:148.17ms step:407/1480 train_time:58829ms step_avg:148.18ms step:408/1480 train_time:58982ms step_avg:148.20ms step:409/1480 train_time:59137ms step_avg:148.21ms step:410/1480 train_time:59291ms step_avg:148.23ms step:411/1480 train_time:59445ms step_avg:148.24ms step:412/1480 train_time:59599ms step_avg:148.26ms step:413/1480 train_time:59752ms step_avg:148.27ms step:414/1480 train_time:59907ms step_avg:148.28ms step:415/1480 train_time:60059ms step_avg:148.29ms step:416/1480 train_time:60213ms step_avg:148.31ms step:417/1480 train_time:60367ms step_avg:148.32ms 
step:418/1480 train_time:60520ms step_avg:148.33ms step:419/1480 train_time:60673ms step_avg:148.34ms step:420/1480 train_time:60827ms step_avg:148.36ms step:421/1480 train_time:60981ms step_avg:148.37ms step:422/1480 train_time:61135ms step_avg:148.39ms step:423/1480 train_time:61287ms step_avg:148.40ms step:424/1480 train_time:61441ms step_avg:148.41ms step:425/1480 train_time:61596ms step_avg:148.42ms step:426/1480 train_time:61749ms step_avg:148.44ms step:427/1480 train_time:61903ms step_avg:148.45ms step:428/1480 train_time:62056ms step_avg:148.46ms step:429/1480 train_time:62211ms step_avg:148.47ms step:430/1480 train_time:62364ms step_avg:148.49ms step:431/1480 train_time:62518ms step_avg:148.50ms step:432/1480 train_time:62672ms step_avg:148.51ms step:433/1480 train_time:62825ms step_avg:148.52ms step:434/1480 train_time:62978ms step_avg:148.53ms step:435/1480 train_time:63133ms step_avg:148.55ms step:436/1480 train_time:63288ms step_avg:148.56ms step:437/1480 train_time:63441ms step_avg:148.57ms step:438/1480 train_time:63595ms step_avg:148.59ms step:439/1480 train_time:63750ms step_avg:148.60ms step:440/1480 train_time:63905ms step_avg:148.62ms step:441/1480 train_time:64060ms step_avg:148.63ms step:442/1480 train_time:64218ms step_avg:148.65ms step:443/1480 train_time:64375ms step_avg:148.67ms step:444/1480 train_time:64534ms step_avg:148.70ms step:445/1480 train_time:64689ms step_avg:148.71ms step:446/1480 train_time:64845ms step_avg:148.73ms step:447/1480 train_time:65000ms step_avg:148.74ms step:448/1480 train_time:65157ms step_avg:148.76ms step:449/1480 train_time:65314ms step_avg:148.78ms step:450/1480 train_time:65472ms step_avg:148.80ms step:451/1480 train_time:65631ms step_avg:148.82ms step:452/1480 train_time:65788ms step_avg:148.84ms step:453/1480 train_time:65944ms step_avg:148.86ms step:454/1480 train_time:66100ms step_avg:148.87ms step:455/1480 train_time:66257ms step_avg:148.89ms step:456/1480 train_time:66413ms step_avg:148.91ms 
step:457/1480 train_time:66570ms step_avg:148.93ms step:458/1480 train_time:66728ms step_avg:148.95ms step:459/1480 train_time:66883ms step_avg:148.96ms step:460/1480 train_time:67039ms step_avg:148.98ms step:461/1480 train_time:67198ms step_avg:149.00ms step:462/1480 train_time:67355ms step_avg:149.02ms step:463/1480 train_time:67512ms step_avg:149.03ms step:464/1480 train_time:67670ms step_avg:149.05ms step:465/1480 train_time:67826ms step_avg:149.07ms step:466/1480 train_time:67981ms step_avg:149.08ms step:467/1480 train_time:68139ms step_avg:149.10ms step:468/1480 train_time:68295ms step_avg:149.12ms step:469/1480 train_time:68452ms step_avg:149.13ms step:470/1480 train_time:68609ms step_avg:149.15ms step:471/1480 train_time:68766ms step_avg:149.17ms step:472/1480 train_time:68921ms step_avg:149.18ms step:473/1480 train_time:69077ms step_avg:149.19ms step:474/1480 train_time:69235ms step_avg:149.21ms step:475/1480 train_time:69392ms step_avg:149.23ms step:476/1480 train_time:69550ms step_avg:149.25ms step:477/1480 train_time:69709ms step_avg:149.27ms step:478/1480 train_time:69867ms step_avg:149.29ms step:479/1480 train_time:70023ms step_avg:149.30ms step:480/1480 train_time:70180ms step_avg:149.32ms step:481/1480 train_time:70338ms step_avg:149.34ms step:482/1480 train_time:70494ms step_avg:149.35ms step:483/1480 train_time:70651ms step_avg:149.37ms step:484/1480 train_time:70809ms step_avg:149.39ms step:485/1480 train_time:70968ms step_avg:149.41ms step:486/1480 train_time:71125ms step_avg:149.42ms step:487/1480 train_time:71281ms step_avg:149.44ms step:488/1480 train_time:71438ms step_avg:149.45ms step:489/1480 train_time:71595ms step_avg:149.47ms step:490/1480 train_time:71752ms step_avg:149.48ms step:491/1480 train_time:71910ms step_avg:149.50ms step:492/1480 train_time:72067ms step_avg:149.52ms step:493/1480 train_time:72224ms step_avg:149.53ms step:494/1480 train_time:72380ms step_avg:149.54ms step:495/1480 train_time:72537ms step_avg:149.56ms 
step:496/1480 train_time:72695ms step_avg:149.58ms step:497/1480 train_time:72851ms step_avg:149.59ms step:498/1480 train_time:73009ms step_avg:149.61ms step:499/1480 train_time:73166ms step_avg:149.62ms step:500/1480 train_time:73322ms step_avg:149.64ms step:500/1480 val_loss:3.6868 train_time:73383ms step_avg:149.76ms step:501/1480 train_time:73480ms step_avg:149.65ms step:502/1480 train_time:73637ms step_avg:149.67ms step:503/1480 train_time:73794ms step_avg:149.68ms step:504/1480 train_time:73951ms step_avg:149.70ms step:505/1480 train_time:74107ms step_avg:149.71ms step:506/1480 train_time:74263ms step_avg:149.72ms step:507/1480 train_time:74419ms step_avg:149.74ms step:508/1480 train_time:74578ms step_avg:149.75ms step:509/1480 train_time:74734ms step_avg:149.77ms step:510/1480 train_time:74890ms step_avg:149.78ms step:511/1480 train_time:75046ms step_avg:149.79ms step:512/1480 train_time:75204ms step_avg:149.81ms step:513/1480 train_time:75358ms step_avg:149.82ms step:514/1480 train_time:75515ms step_avg:149.83ms step:515/1480 train_time:75672ms step_avg:149.85ms step:516/1480 train_time:75831ms step_avg:149.86ms step:517/1480 train_time:75989ms step_avg:149.88ms step:518/1480 train_time:76145ms step_avg:149.89ms step:519/1480 train_time:76302ms step_avg:149.90ms step:520/1480 train_time:76459ms step_avg:149.92ms step:521/1480 train_time:76615ms step_avg:149.93ms step:522/1480 train_time:76773ms step_avg:149.95ms step:523/1480 train_time:76931ms step_avg:149.96ms step:524/1480 train_time:77087ms step_avg:149.98ms step:525/1480 train_time:77243ms step_avg:149.99ms step:526/1480 train_time:77401ms step_avg:150.00ms step:527/1480 train_time:77558ms step_avg:150.01ms step:528/1480 train_time:77716ms step_avg:150.03ms step:529/1480 train_time:77873ms step_avg:150.04ms step:530/1480 train_time:78030ms step_avg:150.06ms step:531/1480 train_time:78187ms step_avg:150.07ms step:532/1480 train_time:78343ms step_avg:150.08ms step:533/1480 train_time:78501ms 
step_avg:150.10ms step:534/1480 train_time:78657ms step_avg:150.11ms step:535/1480 train_time:78815ms step_avg:150.12ms step:536/1480 train_time:78972ms step_avg:150.14ms step:537/1480 train_time:79130ms step_avg:150.15ms step:538/1480 train_time:79287ms step_avg:150.17ms step:539/1480 train_time:79446ms step_avg:150.18ms step:540/1480 train_time:79602ms step_avg:150.19ms step:541/1480 train_time:79757ms step_avg:150.20ms step:542/1480 train_time:79914ms step_avg:150.21ms step:543/1480 train_time:80071ms step_avg:150.23ms step:544/1480 train_time:80227ms step_avg:150.24ms step:545/1480 train_time:80384ms step_avg:150.25ms step:546/1480 train_time:80540ms step_avg:150.26ms step:547/1480 train_time:80697ms step_avg:150.27ms step:548/1480 train_time:80854ms step_avg:150.29ms step:549/1480 train_time:81012ms step_avg:150.30ms step:550/1480 train_time:81171ms step_avg:150.32ms step:551/1480 train_time:81330ms step_avg:150.33ms step:552/1480 train_time:81490ms step_avg:150.35ms step:553/1480 train_time:81650ms step_avg:150.37ms step:554/1480 train_time:81811ms step_avg:150.39ms step:555/1480 train_time:81972ms step_avg:150.41ms step:556/1480 train_time:82132ms step_avg:150.42ms step:557/1480 train_time:82294ms step_avg:150.45ms step:558/1480 train_time:82454ms step_avg:150.46ms step:559/1480 train_time:82614ms step_avg:150.48ms step:560/1480 train_time:82774ms step_avg:150.50ms step:561/1480 train_time:82933ms step_avg:150.51ms step:562/1480 train_time:83094ms step_avg:150.53ms step:563/1480 train_time:83253ms step_avg:150.55ms step:564/1480 train_time:83414ms step_avg:150.57ms step:565/1480 train_time:83574ms step_avg:150.58ms step:566/1480 train_time:83735ms step_avg:150.60ms step:567/1480 train_time:83894ms step_avg:150.62ms step:568/1480 train_time:84052ms step_avg:150.63ms step:569/1480 train_time:84211ms step_avg:150.65ms step:570/1480 train_time:84370ms step_avg:150.66ms step:571/1480 train_time:84529ms step_avg:150.68ms step:572/1480 train_time:84689ms 
step_avg:150.69ms step:573/1480 train_time:84848ms step_avg:150.71ms step:574/1480 train_time:85010ms step_avg:150.73ms step:575/1480 train_time:85171ms step_avg:150.75ms step:576/1480 train_time:85330ms step_avg:150.76ms step:577/1480 train_time:85490ms step_avg:150.78ms step:578/1480 train_time:85650ms step_avg:150.79ms step:579/1480 train_time:85810ms step_avg:150.81ms step:580/1480 train_time:85969ms step_avg:150.82ms step:581/1480 train_time:86129ms step_avg:150.84ms step:582/1480 train_time:86289ms step_avg:150.85ms step:583/1480 train_time:86449ms step_avg:150.87ms step:584/1480 train_time:86608ms step_avg:150.88ms step:585/1480 train_time:86766ms step_avg:150.90ms step:586/1480 train_time:86924ms step_avg:150.91ms step:587/1480 train_time:87082ms step_avg:150.92ms step:588/1480 train_time:87239ms step_avg:150.93ms step:589/1480 train_time:87401ms step_avg:150.95ms step:590/1480 train_time:87560ms step_avg:150.97ms step:591/1480 train_time:87718ms step_avg:150.98ms step:592/1480 train_time:87877ms step_avg:150.99ms step:593/1480 train_time:88037ms step_avg:151.01ms step:594/1480 train_time:88196ms step_avg:151.02ms step:595/1480 train_time:88356ms step_avg:151.04ms step:596/1480 train_time:88518ms step_avg:151.05ms step:597/1480 train_time:88677ms step_avg:151.07ms step:598/1480 train_time:88834ms step_avg:151.08ms step:599/1480 train_time:88993ms step_avg:151.09ms step:600/1480 train_time:89153ms step_avg:151.11ms step:601/1480 train_time:89313ms step_avg:151.12ms step:602/1480 train_time:89473ms step_avg:151.14ms step:603/1480 train_time:89634ms step_avg:151.15ms step:604/1480 train_time:89794ms step_avg:151.17ms step:605/1480 train_time:89955ms step_avg:151.18ms step:606/1480 train_time:90117ms step_avg:151.20ms step:607/1480 train_time:90279ms step_avg:151.22ms step:608/1480 train_time:90438ms step_avg:151.23ms step:609/1480 train_time:90598ms step_avg:151.25ms step:610/1480 train_time:90756ms step_avg:151.26ms step:611/1480 train_time:90916ms 
step_avg:151.27ms step:612/1480 train_time:91076ms step_avg:151.29ms step:613/1480 train_time:91235ms step_avg:151.30ms step:614/1480 train_time:91396ms step_avg:151.32ms step:615/1480 train_time:91555ms step_avg:151.33ms step:616/1480 train_time:91715ms step_avg:151.34ms step:617/1480 train_time:91876ms step_avg:151.36ms step:618/1480 train_time:92034ms step_avg:151.37ms step:619/1480 train_time:92194ms step_avg:151.39ms step:620/1480 train_time:92354ms step_avg:151.40ms step:621/1480 train_time:92514ms step_avg:151.41ms step:622/1480 train_time:92674ms step_avg:151.43ms step:623/1480 train_time:92835ms step_avg:151.44ms step:624/1480 train_time:92994ms step_avg:151.46ms step:625/1480 train_time:93154ms step_avg:151.47ms step:625/1480 val_loss:3.6080 train_time:93217ms step_avg:151.57ms step:626/1480 train_time:93315ms step_avg:151.49ms step:627/1480 train_time:93475ms step_avg:151.50ms step:628/1480 train_time:93632ms step_avg:151.51ms step:629/1480 train_time:93791ms step_avg:151.52ms step:630/1480 train_time:93949ms step_avg:151.53ms step:631/1480 train_time:94107ms step_avg:151.54ms step:632/1480 train_time:94266ms step_avg:151.55ms step:633/1480 train_time:94426ms step_avg:151.57ms step:634/1480 train_time:94586ms step_avg:151.58ms step:635/1480 train_time:94746ms step_avg:151.59ms step:636/1480 train_time:94906ms step_avg:151.61ms step:637/1480 train_time:95067ms step_avg:151.62ms step:638/1480 train_time:95226ms step_avg:151.63ms step:639/1480 train_time:95386ms step_avg:151.65ms step:640/1480 train_time:95546ms step_avg:151.66ms step:641/1480 train_time:95706ms step_avg:151.67ms step:642/1480 train_time:95866ms step_avg:151.69ms step:643/1480 train_time:96025ms step_avg:151.70ms step:644/1480 train_time:96185ms step_avg:151.71ms step:645/1480 train_time:96344ms step_avg:151.72ms step:646/1480 train_time:96504ms step_avg:151.74ms step:647/1480 train_time:96664ms step_avg:151.75ms step:648/1480 train_time:96826ms step_avg:151.76ms step:649/1480 
train_time:96986ms step_avg:151.78ms step:650/1480 train_time:97146ms step_avg:151.79ms step:651/1480 train_time:97306ms step_avg:151.80ms step:652/1480 train_time:97466ms step_avg:151.82ms step:653/1480 train_time:97624ms step_avg:151.83ms step:654/1480 train_time:97785ms step_avg:151.84ms step:655/1480 train_time:97946ms step_avg:151.85ms step:656/1480 train_time:98106ms step_avg:151.87ms step:657/1480 train_time:98268ms step_avg:151.88ms step:658/1480 train_time:98427ms step_avg:151.89ms step:659/1480 train_time:98589ms step_avg:151.91ms step:660/1480 train_time:98749ms step_avg:151.92ms step:661/1480 train_time:98912ms step_avg:151.94ms step:662/1480 train_time:99072ms step_avg:151.95ms step:663/1480 train_time:99231ms step_avg:151.96ms step:664/1480 train_time:99392ms step_avg:151.98ms step:665/1480 train_time:99555ms step_avg:151.99ms step:666/1480 train_time:99716ms step_avg:152.01ms step:667/1480 train_time:99879ms step_avg:152.02ms step:668/1480 train_time:100041ms step_avg:152.04ms step:669/1480 train_time:100204ms step_avg:152.06ms step:670/1480 train_time:100366ms step_avg:152.07ms step:671/1480 train_time:100527ms step_avg:152.08ms step:672/1480 train_time:100690ms step_avg:152.10ms step:673/1480 train_time:100852ms step_avg:152.12ms step:674/1480 train_time:101014ms step_avg:152.13ms step:675/1480 train_time:101175ms step_avg:152.14ms step:676/1480 train_time:101335ms step_avg:152.16ms step:677/1480 train_time:101495ms step_avg:152.17ms step:678/1480 train_time:101655ms step_avg:152.18ms step:679/1480 train_time:101816ms step_avg:152.19ms step:680/1480 train_time:101980ms step_avg:152.21ms step:681/1480 train_time:102139ms step_avg:152.22ms step:682/1480 train_time:102301ms step_avg:152.23ms step:683/1480 train_time:102464ms step_avg:152.25ms step:684/1480 train_time:102626ms step_avg:152.26ms step:685/1480 train_time:102790ms step_avg:152.28ms step:686/1480 train_time:102952ms step_avg:152.30ms step:687/1480 train_time:103112ms step_avg:152.31ms 
step:688/1480 train_time:103275ms step_avg:152.32ms step:689/1480 train_time:103437ms step_avg:152.34ms step:690/1480 train_time:103601ms step_avg:152.36ms step:691/1480 train_time:103764ms step_avg:152.37ms step:692/1480 train_time:103927ms step_avg:152.39ms step:693/1480 train_time:104089ms step_avg:152.40ms step:694/1480 train_time:104250ms step_avg:152.41ms step:695/1480 train_time:104410ms step_avg:152.42ms step:696/1480 train_time:104569ms step_avg:152.43ms step:697/1480 train_time:104731ms step_avg:152.45ms step:698/1480 train_time:104892ms step_avg:152.46ms step:699/1480 train_time:105055ms step_avg:152.47ms step:700/1480 train_time:105217ms step_avg:152.49ms step:701/1480 train_time:105377ms step_avg:152.50ms step:702/1480 train_time:105537ms step_avg:152.51ms step:703/1480 train_time:105697ms step_avg:152.52ms step:704/1480 train_time:105856ms step_avg:152.53ms step:705/1480 train_time:106021ms step_avg:152.55ms step:706/1480 train_time:106186ms step_avg:152.57ms step:707/1480 train_time:106347ms step_avg:152.58ms step:708/1480 train_time:106508ms step_avg:152.59ms step:709/1480 train_time:106670ms step_avg:152.60ms step:710/1480 train_time:106830ms step_avg:152.61ms step:711/1480 train_time:106992ms step_avg:152.63ms step:712/1480 train_time:107156ms step_avg:152.64ms step:713/1480 train_time:107319ms step_avg:152.66ms step:714/1480 train_time:107481ms step_avg:152.67ms step:715/1480 train_time:107642ms step_avg:152.68ms step:716/1480 train_time:107803ms step_avg:152.69ms step:717/1480 train_time:107967ms step_avg:152.71ms step:718/1480 train_time:108127ms step_avg:152.72ms step:719/1480 train_time:108287ms step_avg:152.73ms step:720/1480 train_time:108449ms step_avg:152.74ms step:721/1480 train_time:108610ms step_avg:152.76ms step:722/1480 train_time:108772ms step_avg:152.77ms step:723/1480 train_time:108932ms step_avg:152.78ms step:724/1480 train_time:109093ms step_avg:152.79ms step:725/1480 train_time:109257ms step_avg:152.81ms step:726/1480 
train_time:109420ms step_avg:152.82ms step:727/1480 train_time:109585ms step_avg:152.84ms step:728/1480 train_time:109747ms step_avg:152.85ms step:729/1480 train_time:109908ms step_avg:152.86ms step:730/1480 train_time:110071ms step_avg:152.88ms step:731/1480 train_time:110231ms step_avg:152.89ms step:732/1480 train_time:110391ms step_avg:152.90ms step:733/1480 train_time:110554ms step_avg:152.91ms step:734/1480 train_time:110716ms step_avg:152.92ms step:735/1480 train_time:110876ms step_avg:152.93ms step:736/1480 train_time:111037ms step_avg:152.94ms step:737/1480 train_time:111197ms step_avg:152.95ms step:738/1480 train_time:111359ms step_avg:152.97ms step:739/1480 train_time:111519ms step_avg:152.98ms step:740/1480 train_time:111686ms step_avg:152.99ms step:741/1480 train_time:111850ms step_avg:153.01ms step:742/1480 train_time:112013ms step_avg:153.02ms step:743/1480 train_time:112174ms step_avg:153.03ms step:744/1480 train_time:112337ms step_avg:153.05ms step:745/1480 train_time:112501ms step_avg:153.06ms step:746/1480 train_time:112662ms step_avg:153.07ms step:747/1480 train_time:112823ms step_avg:153.08ms step:748/1480 train_time:112989ms step_avg:153.10ms step:749/1480 train_time:113152ms step_avg:153.11ms step:750/1480 train_time:113311ms step_avg:153.12ms step:750/1480 val_loss:3.5504 train_time:113375ms step_avg:153.21ms step:751/1480 train_time:113475ms step_avg:153.14ms step:752/1480 train_time:113639ms step_avg:153.15ms step:753/1480 train_time:113801ms step_avg:153.16ms step:754/1480 train_time:113962ms step_avg:153.17ms step:755/1480 train_time:114124ms step_avg:153.19ms step:756/1480 train_time:114285ms step_avg:153.20ms step:757/1480 train_time:114448ms step_avg:153.21ms step:758/1480 train_time:114609ms step_avg:153.22ms step:759/1480 train_time:114770ms step_avg:153.23ms step:760/1480 train_time:114930ms step_avg:153.24ms step:761/1480 train_time:115091ms step_avg:153.25ms step:762/1480 train_time:115252ms step_avg:153.26ms step:763/1480 
train_time:115413ms step_avg:153.27ms step:764/1480 train_time:115575ms step_avg:153.28ms step:765/1480 train_time:115737ms step_avg:153.29ms step:766/1480 train_time:115900ms step_avg:153.31ms step:767/1480 train_time:116063ms step_avg:153.32ms step:768/1480 train_time:116226ms step_avg:153.33ms step:769/1480 train_time:116389ms step_avg:153.35ms step:770/1480 train_time:116552ms step_avg:153.36ms step:771/1480 train_time:116718ms step_avg:153.37ms step:772/1480 train_time:116881ms step_avg:153.39ms step:773/1480 train_time:117044ms step_avg:153.40ms step:774/1480 train_time:117207ms step_avg:153.41ms step:775/1480 train_time:117369ms step_avg:153.42ms step:776/1480 train_time:117533ms step_avg:153.44ms step:777/1480 train_time:117699ms step_avg:153.45ms step:778/1480 train_time:117864ms step_avg:153.47ms step:779/1480 train_time:118027ms step_avg:153.48ms step:780/1480 train_time:118190ms step_avg:153.49ms step:781/1480 train_time:118352ms step_avg:153.50ms step:782/1480 train_time:118518ms step_avg:153.52ms step:783/1480 train_time:118679ms step_avg:153.53ms step:784/1480 train_time:118844ms step_avg:153.54ms step:785/1480 train_time:119007ms step_avg:153.56ms step:786/1480 train_time:119171ms step_avg:153.57ms step:787/1480 train_time:119334ms step_avg:153.58ms step:788/1480 train_time:119498ms step_avg:153.60ms step:789/1480 train_time:119658ms step_avg:153.60ms step:790/1480 train_time:119825ms step_avg:153.62ms step:791/1480 train_time:119991ms step_avg:153.64ms step:792/1480 train_time:120154ms step_avg:153.65ms step:793/1480 train_time:120315ms step_avg:153.66ms step:794/1480 train_time:120479ms step_avg:153.67ms step:795/1480 train_time:120645ms step_avg:153.69ms step:796/1480 train_time:120811ms step_avg:153.70ms step:797/1480 train_time:120976ms step_avg:153.72ms step:798/1480 train_time:121139ms step_avg:153.73ms step:799/1480 train_time:121306ms step_avg:153.75ms step:800/1480 train_time:121469ms step_avg:153.76ms step:801/1480 train_time:121631ms 
step_avg:153.77ms step:802/1480 train_time:121798ms step_avg:153.79ms step:803/1480 train_time:121960ms step_avg:153.80ms step:804/1480 train_time:122122ms step_avg:153.81ms step:805/1480 train_time:122287ms step_avg:153.82ms step:806/1480 train_time:122449ms step_avg:153.83ms step:807/1480 train_time:122611ms step_avg:153.84ms step:808/1480 train_time:122774ms step_avg:153.85ms step:809/1480 train_time:122935ms step_avg:153.86ms step:810/1480 train_time:123098ms step_avg:153.87ms step:811/1480 train_time:123261ms step_avg:153.88ms step:812/1480 train_time:123427ms step_avg:153.90ms step:813/1480 train_time:123587ms step_avg:153.91ms step:814/1480 train_time:123750ms step_avg:153.92ms step:815/1480 train_time:123911ms step_avg:153.93ms step:816/1480 train_time:124075ms step_avg:153.94ms step:817/1480 train_time:124238ms step_avg:153.95ms step:818/1480 train_time:124401ms step_avg:153.96ms step:819/1480 train_time:124564ms step_avg:153.97ms step:820/1480 train_time:124727ms step_avg:153.98ms step:821/1480 train_time:124888ms step_avg:153.99ms step:822/1480 train_time:125051ms step_avg:154.00ms step:823/1480 train_time:125213ms step_avg:154.01ms step:824/1480 train_time:125374ms step_avg:154.02ms step:825/1480 train_time:125539ms step_avg:154.04ms step:826/1480 train_time:125706ms step_avg:154.05ms step:827/1480 train_time:125871ms step_avg:154.06ms step:828/1480 train_time:126034ms step_avg:154.08ms step:829/1480 train_time:126197ms step_avg:154.09ms step:830/1480 train_time:126363ms step_avg:154.10ms step:831/1480 train_time:126527ms step_avg:154.11ms step:832/1480 train_time:126690ms step_avg:154.12ms step:833/1480 train_time:126854ms step_avg:154.14ms step:834/1480 train_time:127020ms step_avg:154.15ms step:835/1480 train_time:127184ms step_avg:154.16ms step:836/1480 train_time:127349ms step_avg:154.18ms step:837/1480 train_time:127511ms step_avg:154.18ms step:838/1480 train_time:127673ms step_avg:154.19ms step:839/1480 train_time:127834ms step_avg:154.20ms 
step:840/1480 train_time:127994ms step_avg:154.21ms step:841/1480 train_time:128154ms step_avg:154.22ms step:842/1480 train_time:128319ms step_avg:154.23ms step:843/1480 train_time:128482ms step_avg:154.24ms step:844/1480 train_time:128646ms step_avg:154.25ms step:845/1480 train_time:128808ms step_avg:154.26ms step:846/1480 train_time:128971ms step_avg:154.27ms step:847/1480 train_time:129135ms step_avg:154.28ms step:848/1480 train_time:129296ms step_avg:154.29ms step:849/1480 train_time:129458ms step_avg:154.30ms step:850/1480 train_time:129621ms step_avg:154.31ms step:851/1480 train_time:129785ms step_avg:154.32ms step:852/1480 train_time:129948ms step_avg:154.33ms step:853/1480 train_time:130110ms step_avg:154.34ms step:854/1480 train_time:130274ms step_avg:154.35ms step:855/1480 train_time:130436ms step_avg:154.36ms step:856/1480 train_time:130598ms step_avg:154.37ms step:857/1480 train_time:130765ms step_avg:154.39ms step:858/1480 train_time:130931ms step_avg:154.40ms step:859/1480 train_time:131094ms step_avg:154.41ms step:860/1480 train_time:131256ms step_avg:154.42ms step:861/1480 train_time:131423ms step_avg:154.43ms step:862/1480 train_time:131591ms step_avg:154.45ms step:863/1480 train_time:131760ms step_avg:154.47ms step:864/1480 train_time:131925ms step_avg:154.48ms step:865/1480 train_time:132086ms step_avg:154.49ms step:866/1480 train_time:132252ms step_avg:154.50ms step:867/1480 train_time:132414ms step_avg:154.51ms step:868/1480 train_time:132573ms step_avg:154.51ms step:869/1480 train_time:132735ms step_avg:154.52ms step:870/1480 train_time:132900ms step_avg:154.53ms step:871/1480 train_time:133062ms step_avg:154.54ms step:872/1480 train_time:133227ms step_avg:154.56ms step:873/1480 train_time:133388ms step_avg:154.56ms step:874/1480 train_time:133554ms step_avg:154.58ms step:875/1480 train_time:133719ms step_avg:154.59ms step:875/1480 val_loss:3.5039 train_time:133783ms step_avg:154.66ms step:876/1480 train_time:133885ms step_avg:154.60ms 
step:877/1480 train_time:134052ms step_avg:154.62ms step:878/1480 train_time:134214ms step_avg:154.62ms step:879/1480 train_time:134378ms step_avg:154.63ms step:880/1480 train_time:134540ms step_avg:154.64ms step:881/1480 train_time:134702ms step_avg:154.65ms step:882/1480 train_time:134867ms step_avg:154.66ms step:883/1480 train_time:135032ms step_avg:154.68ms step:884/1480 train_time:135198ms step_avg:154.69ms step:885/1480 train_time:135364ms step_avg:154.70ms step:886/1480 train_time:135530ms step_avg:154.71ms step:887/1480 train_time:135697ms step_avg:154.73ms step:888/1480 train_time:135869ms step_avg:154.75ms step:889/1480 train_time:136037ms step_avg:154.76ms step:890/1480 train_time:136199ms step_avg:154.77ms step:891/1480 train_time:136365ms step_avg:154.78ms step:892/1480 train_time:136530ms step_avg:154.80ms step:893/1480 train_time:136693ms step_avg:154.81ms step:894/1480 train_time:136858ms step_avg:154.82ms step:895/1480 train_time:137024ms step_avg:154.83ms step:896/1480 train_time:137189ms step_avg:154.84ms step:897/1480 train_time:137355ms step_avg:154.85ms step:898/1480 train_time:137523ms step_avg:154.87ms step:899/1480 train_time:137688ms step_avg:154.88ms step:900/1480 train_time:137852ms step_avg:154.89ms step:901/1480 train_time:138016ms step_avg:154.90ms step:902/1480 train_time:138179ms step_avg:154.91ms step:903/1480 train_time:138351ms step_avg:154.93ms step:904/1480 train_time:138516ms step_avg:154.94ms step:905/1480 train_time:138677ms step_avg:154.95ms step:906/1480 train_time:138845ms step_avg:154.96ms step:907/1480 train_time:139013ms step_avg:154.98ms step:908/1480 train_time:139175ms step_avg:154.98ms step:909/1480 train_time:139340ms step_avg:154.99ms step:910/1480 train_time:139510ms step_avg:155.01ms step:911/1480 train_time:139675ms step_avg:155.02ms step:912/1480 train_time:139841ms step_avg:155.03ms step:913/1480 train_time:140008ms step_avg:155.05ms step:914/1480 train_time:140176ms step_avg:155.06ms step:915/1480 
train_time:140346ms step_avg:155.08ms step:916/1480 train_time:140511ms step_avg:155.09ms step:917/1480 train_time:140675ms step_avg:155.10ms step:918/1480 train_time:140844ms step_avg:155.11ms step:919/1480 train_time:141014ms step_avg:155.13ms step:920/1480 train_time:141177ms step_avg:155.14ms step:921/1480 train_time:141343ms step_avg:155.15ms step:922/1480 train_time:141511ms step_avg:155.17ms step:923/1480 train_time:141673ms step_avg:155.17ms step:924/1480 train_time:141837ms step_avg:155.18ms step:925/1480 train_time:142004ms step_avg:155.20ms step:926/1480 train_time:142168ms step_avg:155.21ms step:927/1480 train_time:142331ms step_avg:155.21ms step:928/1480 train_time:142497ms step_avg:155.23ms step:929/1480 train_time:142664ms step_avg:155.24ms step:930/1480 train_time:142829ms step_avg:155.25ms step:931/1480 train_time:142993ms step_avg:155.26ms step:932/1480 train_time:143159ms step_avg:155.27ms step:933/1480 train_time:143326ms step_avg:155.28ms step:934/1480 train_time:143494ms step_avg:155.30ms step:935/1480 train_time:143664ms step_avg:155.31ms step:936/1480 train_time:143830ms step_avg:155.32ms step:937/1480 train_time:143999ms step_avg:155.34ms step:938/1480 train_time:144163ms step_avg:155.35ms step:939/1480 train_time:144332ms step_avg:155.36ms step:940/1480 train_time:144497ms step_avg:155.37ms step:941/1480 train_time:144661ms step_avg:155.38ms step:942/1480 train_time:144826ms step_avg:155.39ms step:943/1480 train_time:144996ms step_avg:155.41ms step:944/1480 train_time:145170ms step_avg:155.43ms step:945/1480 train_time:145333ms step_avg:155.44ms step:946/1480 train_time:145501ms step_avg:155.45ms step:947/1480 train_time:145668ms step_avg:155.46ms step:948/1480 train_time:145833ms step_avg:155.47ms step:949/1480 train_time:145997ms step_avg:155.48ms step:950/1480 train_time:146161ms step_avg:155.49ms step:951/1480 train_time:146330ms step_avg:155.51ms step:952/1480 train_time:146495ms step_avg:155.51ms step:953/1480 train_time:146663ms 
step_avg:155.53ms step:954/1480 train_time:146833ms step_avg:155.54ms step:955/1480 train_time:146996ms step_avg:155.55ms step:956/1480 train_time:147162ms step_avg:155.56ms step:957/1480 train_time:147330ms step_avg:155.58ms step:958/1480 train_time:147500ms step_avg:155.59ms step:959/1480 train_time:147666ms step_avg:155.60ms step:960/1480 train_time:147832ms step_avg:155.61ms step:961/1480 train_time:147996ms step_avg:155.62ms step:962/1480 train_time:148159ms step_avg:155.63ms step:963/1480 train_time:148325ms step_avg:155.64ms step:964/1480 train_time:148494ms step_avg:155.65ms step:965/1480 train_time:148657ms step_avg:155.66ms step:966/1480 train_time:148823ms step_avg:155.67ms step:967/1480 train_time:148987ms step_avg:155.68ms step:968/1480 train_time:149152ms step_avg:155.69ms step:969/1480 train_time:149317ms step_avg:155.70ms step:970/1480 train_time:149481ms step_avg:155.71ms step:971/1480 train_time:149645ms step_avg:155.72ms step:972/1480 train_time:149811ms step_avg:155.73ms step:973/1480 train_time:149975ms step_avg:155.74ms step:974/1480 train_time:150145ms step_avg:155.75ms step:975/1480 train_time:150310ms step_avg:155.76ms step:976/1480 train_time:150475ms step_avg:155.77ms step:977/1480 train_time:150638ms step_avg:155.78ms step:978/1480 train_time:150804ms step_avg:155.79ms step:979/1480 train_time:150971ms step_avg:155.80ms step:980/1480 train_time:151136ms step_avg:155.81ms step:981/1480 train_time:151304ms step_avg:155.82ms step:982/1480 train_time:151469ms step_avg:155.83ms step:983/1480 train_time:151635ms step_avg:155.84ms step:984/1480 train_time:151798ms step_avg:155.85ms step:985/1480 train_time:151965ms step_avg:155.86ms step:986/1480 train_time:152131ms step_avg:155.87ms step:987/1480 train_time:152295ms step_avg:155.88ms step:988/1480 train_time:152461ms step_avg:155.89ms step:989/1480 train_time:152627ms step_avg:155.90ms step:990/1480 train_time:152797ms step_avg:155.92ms step:991/1480 train_time:152963ms step_avg:155.93ms 
step:992/1480 train_time:153138ms step_avg:155.94ms step:993/1480 train_time:153314ms step_avg:155.96ms step:994/1480 train_time:153478ms step_avg:155.97ms step:995/1480 train_time:153642ms step_avg:155.98ms step:996/1480 train_time:153806ms step_avg:155.99ms step:997/1480 train_time:153971ms step_avg:156.00ms step:998/1480 train_time:154134ms step_avg:156.01ms step:999/1480 train_time:154300ms step_avg:156.02ms step:1000/1480 train_time:154471ms step_avg:156.03ms step:1000/1480 val_loss:3.4418 train_time:154538ms step_avg:156.10ms step:1001/1480 train_time:154639ms step_avg:156.04ms step:1002/1480 train_time:154804ms step_avg:156.05ms step:1003/1480 train_time:154978ms step_avg:156.07ms step:1004/1480 train_time:155146ms step_avg:156.08ms step:1005/1480 train_time:155315ms step_avg:156.10ms step:1006/1480 train_time:155482ms step_avg:156.11ms step:1007/1480 train_time:155647ms step_avg:156.12ms step:1008/1480 train_time:155816ms step_avg:156.13ms step:1009/1480 train_time:155988ms step_avg:156.14ms step:1010/1480 train_time:156155ms step_avg:156.15ms step:1011/1480 train_time:156320ms step_avg:156.16ms step:1012/1480 train_time:156486ms step_avg:156.17ms step:1013/1480 train_time:156655ms step_avg:156.19ms step:1014/1480 train_time:156822ms step_avg:156.20ms step:1015/1480 train_time:156992ms step_avg:156.21ms step:1016/1480 train_time:157161ms step_avg:156.22ms step:1017/1480 train_time:157331ms step_avg:156.24ms step:1018/1480 train_time:157499ms step_avg:156.25ms step:1019/1480 train_time:157667ms step_avg:156.26ms step:1020/1480 train_time:157837ms step_avg:156.27ms step:1021/1480 train_time:158002ms step_avg:156.28ms step:1022/1480 train_time:158170ms step_avg:156.29ms step:1023/1480 train_time:158337ms step_avg:156.31ms step:1024/1480 train_time:158504ms step_avg:156.32ms step:1025/1480 train_time:158677ms step_avg:156.33ms step:1026/1480 train_time:158842ms step_avg:156.34ms step:1027/1480 train_time:159009ms step_avg:156.35ms step:1028/1480 
train_time:159181ms step_avg:156.37ms step:1029/1480 train_time:159354ms step_avg:156.38ms step:1030/1480 train_time:159521ms step_avg:156.39ms step:1031/1480 train_time:159685ms step_avg:156.40ms step:1032/1480 train_time:159857ms step_avg:156.42ms step:1033/1480 train_time:160026ms step_avg:156.43ms step:1034/1480 train_time:160194ms step_avg:156.44ms step:1035/1480 train_time:160360ms step_avg:156.45ms step:1036/1480 train_time:160526ms step_avg:156.46ms step:1037/1480 train_time:160695ms step_avg:156.47ms step:1038/1480 train_time:160861ms step_avg:156.48ms step:1039/1480 train_time:161033ms step_avg:156.49ms step:1040/1480 train_time:161199ms step_avg:156.50ms step:1041/1480 train_time:161365ms step_avg:156.51ms step:1042/1480 train_time:161528ms step_avg:156.52ms step:1043/1480 train_time:161694ms step_avg:156.53ms step:1044/1480 train_time:161859ms step_avg:156.54ms step:1045/1480 train_time:162027ms step_avg:156.55ms step:1046/1480 train_time:162196ms step_avg:156.56ms step:1047/1480 train_time:162361ms step_avg:156.57ms step:1048/1480 train_time:162527ms step_avg:156.58ms step:1049/1480 train_time:162693ms step_avg:156.59ms step:1050/1480 train_time:162862ms step_avg:156.60ms step:1051/1480 train_time:163031ms step_avg:156.61ms step:1052/1480 train_time:163200ms step_avg:156.62ms step:1053/1480 train_time:163366ms step_avg:156.63ms step:1054/1480 train_time:163533ms step_avg:156.64ms step:1055/1480 train_time:163700ms step_avg:156.65ms step:1056/1480 train_time:163865ms step_avg:156.66ms step:1057/1480 train_time:164033ms step_avg:156.67ms step:1058/1480 train_time:164202ms step_avg:156.68ms step:1059/1480 train_time:164374ms step_avg:156.70ms step:1060/1480 train_time:164542ms step_avg:156.71ms step:1061/1480 train_time:164705ms step_avg:156.71ms step:1062/1480 train_time:164872ms step_avg:156.72ms step:1063/1480 train_time:165037ms step_avg:156.73ms step:1064/1480 train_time:165201ms step_avg:156.74ms step:1065/1480 train_time:165368ms step_avg:156.75ms 
step:1066/1480 train_time:165535ms step_avg:156.76ms step:1067/1480 train_time:165705ms step_avg:156.77ms step:1068/1480 train_time:165871ms step_avg:156.78ms step:1069/1480 train_time:166042ms step_avg:156.79ms step:1070/1480 train_time:166207ms step_avg:156.80ms step:1071/1480 train_time:166381ms step_avg:156.82ms step:1072/1480 train_time:166547ms step_avg:156.82ms step:1073/1480 train_time:166710ms step_avg:156.83ms step:1074/1480 train_time:166877ms step_avg:156.84ms step:1075/1480 train_time:167049ms step_avg:156.85ms step:1076/1480 train_time:167218ms step_avg:156.87ms step:1077/1480 train_time:167384ms step_avg:156.87ms step:1078/1480 train_time:167559ms step_avg:156.89ms step:1079/1480 train_time:167731ms step_avg:156.90ms step:1080/1480 train_time:167902ms step_avg:156.92ms step:1081/1480 train_time:168067ms step_avg:156.93ms step:1082/1480 train_time:168233ms step_avg:156.93ms step:1083/1480 train_time:168401ms step_avg:156.94ms step:1084/1480 train_time:168566ms step_avg:156.95ms step:1085/1480 train_time:168736ms step_avg:156.96ms step:1086/1480 train_time:168905ms step_avg:156.97ms step:1087/1480 train_time:169070ms step_avg:156.98ms step:1088/1480 train_time:169241ms step_avg:157.00ms step:1089/1480 train_time:169414ms step_avg:157.01ms step:1090/1480 train_time:169586ms step_avg:157.02ms step:1091/1480 train_time:169752ms step_avg:157.03ms step:1092/1480 train_time:169921ms step_avg:157.04ms step:1093/1480 train_time:170089ms step_avg:157.05ms step:1094/1480 train_time:170255ms step_avg:157.06ms step:1095/1480 train_time:170421ms step_avg:157.07ms step:1096/1480 train_time:170590ms step_avg:157.08ms step:1097/1480 train_time:170758ms step_avg:157.09ms step:1098/1480 train_time:170928ms step_avg:157.10ms step:1099/1480 train_time:171100ms step_avg:157.12ms step:1100/1480 train_time:171272ms step_avg:157.13ms step:1101/1480 train_time:171443ms step_avg:157.14ms step:1102/1480 train_time:171614ms step_avg:157.16ms step:1103/1480 train_time:171790ms 
step_avg:157.17ms step:1104/1480 train_time:171958ms step_avg:157.18ms step:1105/1480 train_time:172128ms step_avg:157.19ms step:1106/1480 train_time:172296ms step_avg:157.20ms step:1107/1480 train_time:172464ms step_avg:157.21ms step:1108/1480 train_time:172628ms step_avg:157.22ms step:1109/1480 train_time:172796ms step_avg:157.23ms step:1110/1480 train_time:172961ms step_avg:157.24ms step:1111/1480 train_time:173127ms step_avg:157.24ms step:1112/1480 train_time:173297ms step_avg:157.26ms step:1113/1480 train_time:173479ms step_avg:157.28ms step:1114/1480 train_time:173652ms step_avg:157.29ms step:1115/1480 train_time:173823ms step_avg:157.31ms step:1116/1480 train_time:173989ms step_avg:157.31ms step:1117/1480 train_time:174162ms step_avg:157.33ms step:1118/1480 train_time:174337ms step_avg:157.34ms step:1119/1480 train_time:174504ms step_avg:157.35ms step:1120/1480 train_time:174673ms step_avg:157.36ms step:1121/1480 train_time:174844ms step_avg:157.38ms step:1122/1480 train_time:175011ms step_avg:157.38ms step:1123/1480 train_time:175178ms step_avg:157.39ms step:1124/1480 train_time:175346ms step_avg:157.40ms step:1125/1480 train_time:175513ms step_avg:157.41ms step:1125/1480 val_loss:3.3856 train_time:175582ms step_avg:157.47ms step:1126/1480 train_time:175685ms step_avg:157.42ms step:1127/1480 train_time:175855ms step_avg:157.44ms step:1128/1480 train_time:176027ms step_avg:157.45ms step:1129/1480 train_time:176200ms step_avg:157.46ms step:1130/1480 train_time:176369ms step_avg:157.47ms step:1131/1480 train_time:176547ms step_avg:157.49ms step:1132/1480 train_time:176712ms step_avg:157.50ms step:1133/1480 train_time:176884ms step_avg:157.51ms step:1134/1480 train_time:177054ms step_avg:157.52ms step:1135/1480 train_time:177224ms step_avg:157.53ms step:1136/1480 train_time:177392ms step_avg:157.54ms step:1137/1480 train_time:177562ms step_avg:157.55ms step:1138/1480 train_time:177732ms step_avg:157.56ms step:1139/1480 train_time:177902ms step_avg:157.57ms 
step:1140/1480 train_time:178070ms step_avg:157.58ms step:1141/1480 train_time:178242ms step_avg:157.60ms step:1142/1480 train_time:178409ms step_avg:157.61ms step:1143/1480 train_time:178580ms step_avg:157.62ms step:1144/1480 train_time:178749ms step_avg:157.63ms step:1145/1480 train_time:178914ms step_avg:157.63ms step:1146/1480 train_time:179085ms step_avg:157.65ms step:1147/1480 train_time:179253ms step_avg:157.65ms step:1148/1480 train_time:179423ms step_avg:157.67ms step:1149/1480 train_time:179592ms step_avg:157.68ms step:1150/1480 train_time:179762ms step_avg:157.69ms step:1151/1480 train_time:179935ms step_avg:157.70ms step:1152/1480 train_time:180108ms step_avg:157.71ms step:1153/1480 train_time:180281ms step_avg:157.73ms step:1154/1480 train_time:180448ms step_avg:157.73ms step:1155/1480 train_time:180619ms step_avg:157.75ms step:1156/1480 train_time:180799ms step_avg:157.77ms step:1157/1480 train_time:180968ms step_avg:157.77ms step:1158/1480 train_time:181134ms step_avg:157.78ms step:1159/1480 train_time:181301ms step_avg:157.79ms step:1160/1480 train_time:181467ms step_avg:157.80ms step:1161/1480 train_time:181637ms step_avg:157.81ms step:1162/1480 train_time:181807ms step_avg:157.82ms step:1163/1480 train_time:181975ms step_avg:157.83ms step:1164/1480 train_time:182145ms step_avg:157.84ms step:1165/1480 train_time:182310ms step_avg:157.84ms step:1166/1480 train_time:182481ms step_avg:157.86ms step:1167/1480 train_time:182650ms step_avg:157.87ms step:1168/1480 train_time:182819ms step_avg:157.87ms step:1169/1480 train_time:182987ms step_avg:157.88ms step:1170/1480 train_time:183155ms step_avg:157.89ms step:1171/1480 train_time:183322ms step_avg:157.90ms step:1172/1480 train_time:183488ms step_avg:157.91ms step:1173/1480 train_time:183660ms step_avg:157.92ms step:1174/1480 train_time:183843ms step_avg:157.94ms step:1175/1480 train_time:184014ms step_avg:157.95ms step:1176/1480 train_time:184188ms step_avg:157.97ms step:1177/1480 train_time:184365ms 
step_avg:157.98ms step:1178/1480 train_time:184533ms step_avg:157.99ms step:1179/1480 train_time:184699ms step_avg:158.00ms step:1180/1480 train_time:184879ms step_avg:158.02ms step:1181/1480 train_time:185049ms step_avg:158.03ms step:1182/1480 train_time:185216ms step_avg:158.03ms step:1183/1480 train_time:185385ms step_avg:158.04ms step:1184/1480 train_time:185553ms step_avg:158.05ms step:1185/1480 train_time:185727ms step_avg:158.07ms step:1186/1480 train_time:185897ms step_avg:158.08ms step:1187/1480 train_time:186082ms step_avg:158.10ms step:1188/1480 train_time:186250ms step_avg:158.11ms step:1189/1480 train_time:186423ms step_avg:158.12ms step:1190/1480 train_time:186589ms step_avg:158.13ms step:1191/1480 train_time:186761ms step_avg:158.14ms step:1192/1480 train_time:186927ms step_avg:158.14ms step:1193/1480 train_time:187093ms step_avg:158.15ms step:1194/1480 train_time:187262ms step_avg:158.16ms step:1195/1480 train_time:187436ms step_avg:158.17ms step:1196/1480 train_time:187618ms step_avg:158.19ms step:1197/1480 train_time:187789ms step_avg:158.20ms step:1198/1480 train_time:187970ms step_avg:158.22ms step:1199/1480 train_time:188140ms step_avg:158.23ms step:1200/1480 train_time:188308ms step_avg:158.24ms step:1201/1480 train_time:188474ms step_avg:158.25ms step:1202/1480 train_time:188656ms step_avg:158.27ms step:1203/1480 train_time:188830ms step_avg:158.28ms step:1204/1480 train_time:189005ms step_avg:158.30ms step:1205/1480 train_time:189173ms step_avg:158.30ms step:1206/1480 train_time:189343ms step_avg:158.31ms step:1207/1480 train_time:189512ms step_avg:158.32ms step:1208/1480 train_time:189680ms step_avg:158.33ms step:1209/1480 train_time:189853ms step_avg:158.34ms step:1210/1480 train_time:190029ms step_avg:158.36ms step:1211/1480 train_time:190203ms step_avg:158.37ms step:1212/1480 train_time:190375ms step_avg:158.38ms step:1213/1480 train_time:190549ms step_avg:158.39ms step:1214/1480 train_time:190725ms step_avg:158.41ms step:1215/1480 
train_time:190901ms step_avg:158.42ms step:1216/1480 train_time:191070ms step_avg:158.43ms step:1217/1480 train_time:191245ms step_avg:158.45ms step:1218/1480 train_time:191414ms step_avg:158.46ms step:1219/1480 train_time:191592ms step_avg:158.47ms step:1220/1480 train_time:191762ms step_avg:158.48ms step:1221/1480 train_time:191933ms step_avg:158.49ms step:1222/1480 train_time:192099ms step_avg:158.50ms step:1223/1480 train_time:192269ms step_avg:158.51ms step:1224/1480 train_time:192448ms step_avg:158.52ms step:1225/1480 train_time:192620ms step_avg:158.53ms step:1226/1480 train_time:192793ms step_avg:158.55ms step:1227/1480 train_time:192964ms step_avg:158.56ms step:1228/1480 train_time:193132ms step_avg:158.57ms step:1229/1480 train_time:193306ms step_avg:158.58ms step:1230/1480 train_time:193485ms step_avg:158.59ms step:1231/1480 train_time:193661ms step_avg:158.61ms step:1232/1480 train_time:193838ms step_avg:158.62ms step:1233/1480 train_time:194007ms step_avg:158.63ms step:1234/1480 train_time:194177ms step_avg:158.64ms step:1235/1480 train_time:194352ms step_avg:158.66ms step:1236/1480 train_time:194521ms step_avg:158.66ms step:1237/1480 train_time:194691ms step_avg:158.67ms step:1238/1480 train_time:194878ms step_avg:158.70ms step:1239/1480 train_time:195049ms step_avg:158.71ms step:1240/1480 train_time:195220ms step_avg:158.72ms step:1241/1480 train_time:195391ms step_avg:158.73ms step:1242/1480 train_time:195560ms step_avg:158.73ms step:1243/1480 train_time:195733ms step_avg:158.75ms step:1244/1480 train_time:195900ms step_avg:158.75ms step:1245/1480 train_time:196069ms step_avg:158.76ms step:1246/1480 train_time:196239ms step_avg:158.77ms step:1247/1480 train_time:196408ms step_avg:158.78ms step:1248/1480 train_time:196578ms step_avg:158.79ms step:1249/1480 train_time:196747ms step_avg:158.79ms step:1250/1480 train_time:196916ms step_avg:158.80ms step:1250/1480 val_loss:3.3354 train_time:196987ms step_avg:158.86ms step:1251/1480 train_time:197094ms 
step_avg:158.82ms step:1252/1480 train_time:197263ms step_avg:158.83ms step:1253/1480 train_time:197431ms step_avg:158.83ms step:1254/1480 train_time:197602ms step_avg:158.84ms step:1255/1480 train_time:197787ms step_avg:158.87ms step:1256/1480 train_time:197962ms step_avg:158.88ms step:1257/1480 train_time:198131ms step_avg:158.89ms step:1258/1480 train_time:198307ms step_avg:158.90ms step:1259/1480 train_time:198478ms step_avg:158.91ms step:1260/1480 train_time:198647ms step_avg:158.92ms step:1261/1480 train_time:198817ms step_avg:158.93ms step:1262/1480 train_time:198992ms step_avg:158.94ms step:1263/1480 train_time:199165ms step_avg:158.95ms step:1264/1480 train_time:199331ms step_avg:158.96ms step:1265/1480 train_time:199499ms step_avg:158.96ms step:1266/1480 train_time:199671ms step_avg:158.97ms step:1267/1480 train_time:199842ms step_avg:158.98ms step:1268/1480 train_time:200013ms step_avg:158.99ms step:1269/1480 train_time:200188ms step_avg:159.01ms step:1270/1480 train_time:200357ms step_avg:159.01ms step:1271/1480 train_time:200528ms step_avg:159.02ms step:1272/1480 train_time:200693ms step_avg:159.03ms step:1273/1480 train_time:200867ms step_avg:159.04ms step:1274/1480 train_time:201037ms step_avg:159.05ms step:1275/1480 train_time:201204ms step_avg:159.05ms step:1276/1480 train_time:201370ms step_avg:159.06ms step:1277/1480 train_time:201542ms step_avg:159.07ms step:1278/1480 train_time:201710ms step_avg:159.08ms step:1279/1480 train_time:201884ms step_avg:159.09ms step:1280/1480 train_time:202064ms step_avg:159.11ms step:1281/1480 train_time:202233ms step_avg:159.11ms step:1282/1480 train_time:202398ms step_avg:159.12ms step:1283/1480 train_time:202570ms step_avg:159.13ms step:1284/1480 train_time:202741ms step_avg:159.14ms step:1285/1480 train_time:202910ms step_avg:159.15ms step:1286/1480 train_time:203081ms step_avg:159.15ms step:1287/1480 train_time:203254ms step_avg:159.16ms step:1288/1480 train_time:203427ms step_avg:159.18ms step:1289/1480 
train_time:203610ms step_avg:159.19ms step:1290/1480 train_time:203793ms step_avg:159.21ms step:1291/1480 train_time:203967ms step_avg:159.22ms step:1292/1480 train_time:204141ms step_avg:159.24ms step:1293/1480 train_time:204314ms step_avg:159.25ms step:1294/1480 train_time:204486ms step_avg:159.26ms step:1295/1480 train_time:204657ms step_avg:159.27ms step:1296/1480 train_time:204830ms step_avg:159.28ms step:1297/1480 train_time:205004ms step_avg:159.29ms step:1298/1480 train_time:205175ms step_avg:159.30ms step:1299/1480 train_time:205345ms step_avg:159.31ms step:1300/1480 train_time:205513ms step_avg:159.31ms step:1301/1480 train_time:205681ms step_avg:159.32ms step:1302/1480 train_time:205854ms step_avg:159.33ms step:1303/1480 train_time:206033ms step_avg:159.35ms step:1304/1480 train_time:206207ms step_avg:159.36ms step:1305/1480 train_time:206375ms step_avg:159.36ms step:1306/1480 train_time:206550ms step_avg:159.37ms step:1307/1480 train_time:206717ms step_avg:159.38ms step:1308/1480 train_time:206887ms step_avg:159.39ms step:1309/1480 train_time:207061ms step_avg:159.40ms step:1310/1480 train_time:207230ms step_avg:159.41ms step:1311/1480 train_time:207398ms step_avg:159.41ms step:1312/1480 train_time:207570ms step_avg:159.42ms step:1313/1480 train_time:207739ms step_avg:159.43ms step:1314/1480 train_time:207910ms step_avg:159.44ms step:1315/1480 train_time:208079ms step_avg:159.45ms step:1316/1480 train_time:208247ms step_avg:159.45ms step:1317/1480 train_time:208417ms step_avg:159.46ms step:1318/1480 train_time:208597ms step_avg:159.48ms step:1319/1480 train_time:208772ms step_avg:159.49ms step:1320/1480 train_time:208948ms step_avg:159.50ms step:1321/1480 train_time:209121ms step_avg:159.51ms step:1322/1480 train_time:209302ms step_avg:159.53ms step:1323/1480 train_time:209474ms step_avg:159.54ms step:1324/1480 train_time:209649ms step_avg:159.55ms step:1325/1480 train_time:209830ms step_avg:159.57ms step:1326/1480 train_time:210006ms step_avg:159.58ms 
step:1327/1480 train_time:210175ms step_avg:159.59ms step:1328/1480 train_time:210347ms step_avg:159.60ms step:1329/1480 train_time:210544ms step_avg:159.62ms step:1330/1480 train_time:210725ms step_avg:159.64ms step:1331/1480 train_time:210895ms step_avg:159.65ms step:1332/1480 train_time:211070ms step_avg:159.66ms step:1333/1480 train_time:211246ms step_avg:159.67ms step:1334/1480 train_time:211416ms step_avg:159.68ms step:1335/1480 train_time:211585ms step_avg:159.69ms step:1336/1480 train_time:211770ms step_avg:159.71ms step:1337/1480 train_time:211946ms step_avg:159.72ms step:1338/1480 train_time:212118ms step_avg:159.73ms step:1339/1480 train_time:212291ms step_avg:159.74ms step:1340/1480 train_time:212464ms step_avg:159.75ms step:1341/1480 train_time:212631ms step_avg:159.75ms step:1342/1480 train_time:212805ms step_avg:159.76ms step:1343/1480 train_time:212974ms step_avg:159.77ms step:1344/1480 train_time:213149ms step_avg:159.78ms step:1345/1480 train_time:213327ms step_avg:159.80ms step:1346/1480 train_time:213495ms step_avg:159.80ms step:1347/1480 train_time:213666ms step_avg:159.81ms step:1348/1480 train_time:213835ms step_avg:159.82ms step:1349/1480 train_time:214005ms step_avg:159.82ms step:1350/1480 train_time:214180ms step_avg:159.84ms step:1351/1480 train_time:214352ms step_avg:159.84ms step:1352/1480 train_time:214522ms step_avg:159.85ms step:1353/1480 train_time:214697ms step_avg:159.86ms step:1354/1480 train_time:214869ms step_avg:159.87ms step:1355/1480 train_time:215038ms step_avg:159.88ms step:1356/1480 train_time:215211ms step_avg:159.89ms step:1357/1480 train_time:215385ms step_avg:159.90ms step:1358/1480 train_time:215557ms step_avg:159.91ms step:1359/1480 train_time:215728ms step_avg:159.92ms step:1360/1480 train_time:215902ms step_avg:159.93ms step:1361/1480 train_time:216078ms step_avg:159.94ms step:1362/1480 train_time:216253ms step_avg:159.95ms step:1363/1480 train_time:216434ms step_avg:159.97ms step:1364/1480 train_time:216603ms 
step_avg:159.97ms step:1365/1480 train_time:216770ms step_avg:159.98ms step:1366/1480 train_time:216944ms step_avg:159.99ms step:1367/1480 train_time:217115ms step_avg:160.00ms step:1368/1480 train_time:217290ms step_avg:160.01ms step:1369/1480 train_time:217472ms step_avg:160.02ms step:1370/1480 train_time:217648ms step_avg:160.04ms step:1371/1480 train_time:217821ms step_avg:160.04ms step:1372/1480 train_time:217997ms step_avg:160.06ms step:1373/1480 train_time:218167ms step_avg:160.06ms step:1374/1480 train_time:218342ms step_avg:160.07ms step:1375/1480 train_time:218513ms step_avg:160.08ms step:1375/1480 val_loss:3.2965 train_time:218581ms step_avg:160.13ms step:1376/1480 train_time:218686ms step_avg:160.09ms step:1377/1480 train_time:218860ms step_avg:160.10ms step:1378/1480 train_time:219028ms step_avg:160.11ms step:1379/1480 train_time:219203ms step_avg:160.12ms step:1380/1480 train_time:219378ms step_avg:160.13ms step:1381/1480 train_time:219560ms step_avg:160.15ms step:1382/1480 train_time:219733ms step_avg:160.15ms step:1383/1480 train_time:219904ms step_avg:160.16ms step:1384/1480 train_time:220083ms step_avg:160.18ms step:1385/1480 train_time:220248ms step_avg:160.18ms step:1386/1480 train_time:220419ms step_avg:160.19ms step:1387/1480 train_time:220588ms step_avg:160.19ms step:1388/1480 train_time:220759ms step_avg:160.20ms step:1389/1480 train_time:220934ms step_avg:160.21ms step:1390/1480 train_time:221102ms step_avg:160.22ms step:1391/1480 train_time:221273ms step_avg:160.23ms step:1392/1480 train_time:221444ms step_avg:160.23ms step:1393/1480 train_time:221616ms step_avg:160.24ms step:1394/1480 train_time:221786ms step_avg:160.25ms step:1395/1480 train_time:221954ms step_avg:160.26ms step:1396/1480 train_time:222122ms step_avg:160.26ms step:1397/1480 train_time:222290ms step_avg:160.27ms step:1398/1480 train_time:222457ms step_avg:160.27ms step:1399/1480 train_time:222624ms step_avg:160.28ms step:1400/1480 train_time:222802ms step_avg:160.29ms 
step:1401/1480 train_time:222967ms step_avg:160.29ms step:1402/1480 train_time:223139ms step_avg:160.30ms step:1403/1480 train_time:223316ms step_avg:160.31ms step:1404/1480 train_time:223487ms step_avg:160.32ms step:1405/1480 train_time:223662ms step_avg:160.33ms step:1406/1480 train_time:223836ms step_avg:160.34ms step:1407/1480 train_time:224004ms step_avg:160.35ms step:1408/1480 train_time:224171ms step_avg:160.35ms step:1409/1480 train_time:224355ms step_avg:160.37ms step:1410/1480 train_time:224523ms step_avg:160.37ms step:1411/1480 train_time:224692ms step_avg:160.38ms step:1412/1480 train_time:224864ms step_avg:160.39ms step:1413/1480 train_time:225034ms step_avg:160.39ms step:1414/1480 train_time:225206ms step_avg:160.40ms step:1415/1480 train_time:225384ms step_avg:160.42ms step:1416/1480 train_time:225568ms step_avg:160.43ms step:1417/1480 train_time:225742ms step_avg:160.44ms step:1418/1480 train_time:225913ms step_avg:160.45ms step:1419/1480 train_time:226087ms step_avg:160.46ms step:1420/1480 train_time:226263ms step_avg:160.47ms step:1421/1480 train_time:226436ms step_avg:160.48ms step:1422/1480 train_time:226607ms step_avg:160.49ms step:1423/1480 train_time:226776ms step_avg:160.49ms step:1424/1480 train_time:226952ms step_avg:160.50ms step:1425/1480 train_time:227130ms step_avg:160.52ms step:1426/1480 train_time:227301ms step_avg:160.52ms step:1427/1480 train_time:227476ms step_avg:160.53ms step:1428/1480 train_time:227646ms step_avg:160.54ms step:1429/1480 train_time:227814ms step_avg:160.55ms step:1430/1480 train_time:227987ms step_avg:160.55ms step:1431/1480 train_time:228162ms step_avg:160.56ms step:1432/1480 train_time:228340ms step_avg:160.58ms step:1433/1480 train_time:228518ms step_avg:160.59ms step:1434/1480 train_time:228701ms step_avg:160.60ms step:1435/1480 train_time:228876ms step_avg:160.61ms step:1436/1480 train_time:229049ms step_avg:160.62ms step:1437/1480 train_time:229219ms step_avg:160.63ms step:1438/1480 train_time:229387ms 
step_avg:160.63ms step:1439/1480 train_time:229562ms step_avg:160.64ms step:1440/1480 train_time:229731ms step_avg:160.65ms step:1441/1480 train_time:229901ms step_avg:160.66ms step:1442/1480 train_time:230078ms step_avg:160.67ms step:1443/1480 train_time:230267ms step_avg:160.69ms step:1444/1480 train_time:230438ms step_avg:160.70ms step:1445/1480 train_time:230609ms step_avg:160.70ms step:1446/1480 train_time:230784ms step_avg:160.71ms step:1447/1480 train_time:230962ms step_avg:160.73ms step:1448/1480 train_time:231134ms step_avg:160.73ms step:1449/1480 train_time:231308ms step_avg:160.74ms step:1450/1480 train_time:231482ms step_avg:160.75ms step:1451/1480 train_time:231653ms step_avg:160.76ms step:1452/1480 train_time:231826ms step_avg:160.77ms step:1453/1480 train_time:231996ms step_avg:160.77ms step:1454/1480 train_time:232170ms step_avg:160.78ms step:1455/1480 train_time:232349ms step_avg:160.80ms step:1456/1480 train_time:232522ms step_avg:160.80ms step:1457/1480 train_time:232692ms step_avg:160.81ms step:1458/1480 train_time:232864ms step_avg:160.82ms step:1459/1480 train_time:233042ms step_avg:160.83ms step:1460/1480 train_time:233214ms step_avg:160.84ms step:1461/1480 train_time:233388ms step_avg:160.85ms step:1462/1480 train_time:233560ms step_avg:160.85ms step:1463/1480 train_time:233737ms step_avg:160.87ms step:1464/1480 train_time:233910ms step_avg:160.87ms step:1465/1480 train_time:234082ms step_avg:160.88ms step:1466/1480 train_time:234252ms step_avg:160.89ms step:1467/1480 train_time:234426ms step_avg:160.90ms step:1468/1480 train_time:234596ms step_avg:160.90ms step:1469/1480 train_time:234769ms step_avg:160.91ms step:1470/1480 train_time:234948ms step_avg:160.92ms step:1471/1480 train_time:235134ms step_avg:160.94ms step:1472/1480 train_time:235313ms step_avg:160.95ms step:1473/1480 train_time:235484ms step_avg:160.96ms step:1474/1480 train_time:235663ms step_avg:160.97ms step:1475/1480 train_time:235844ms step_avg:160.99ms step:1476/1480 
train_time:236017ms step_avg:160.99ms step:1477/1480 train_time:236201ms step_avg:161.01ms step:1478/1480 train_time:236384ms step_avg:161.02ms step:1479/1480 train_time:236557ms step_avg:161.03ms step:1480/1480 train_time:236731ms step_avg:161.04ms step:1480/1480 val_loss:3.2776 train_time:236802ms step_avg:161.09ms