import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable scale is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied to a 4D tensor with the sequence on dim 1.

    cos/sin tables are cached and rebuilt only when the sequence length changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # NOTE(review): as a registered buffer, inv_freq follows module-wide dtype casts
        # (e.g. model.bfloat16()) — confirm that precision is intended for long sequences.
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK-norm, rotary embeddings and a learned value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """Attend over x; `vi` is mixed into the values with the learned `lambdas` weights."""
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of x and x0, then attention and MLP residual branches."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """GPT model with U-net style skip connections, token value embeddings and a softcapped head."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        # NOTE(review): forward() pops one skip connection per decoder layer, so an odd
        # n_layer (decoder one layer larger than encoder) would pop from an empty list.
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Return the cross-entropy loss of predicting `target` from the 1D token sequence `idx`.

        Attention is causal, restricted to tokens of the same document (documents are delimited
        by token id 50256) and to a window of `sliding_window` tokens. len(idx) must be a
        multiple of BLOCK_SIZE for the block-level reshape below to succeed.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running document index per token, and its value at the first/last token of each block
        docs = (idx == 50256).cumsum(0)
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask evaluated by FlexAttention
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level mask: keep every kv block that may contain an unmasked token
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can share a document only if their document-index ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the indices of the kept blocks first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the shard header (magic number, version) and return its claimed token count."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256-int32 header into a pinned tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Streams (input, target) token windows of length T from .bin shards.

    Each process starts at offset process_rank * T within a shard and advances by
    T * num_processes per batch.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return the next (inputs, targets) pair as CUDA tensors (int32 inputs, int64 targets)."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        # NOTE(review): current_position includes the per-rank offset, so this comparison
        # can trip on different steps for different ranks — confirm that is acceptable.
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # the per-run directory above is not created, but the log file still needs `logs/` to exist
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, print it too."""
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 08:00:57 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 78W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 81W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 105W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 118W / 700W | 39MiB / 81559MiB | 2% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 45C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 95W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID 
Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23746ms step_avg:nanms step:2/1480 train_time:23833ms step_avg:nanms step:3/1480 train_time:23971ms step_avg:nanms step:4/1480 train_time:24112ms step_avg:nanms step:5/1480 train_time:24253ms step_avg:nanms step:6/1480 train_time:24394ms step_avg:nanms step:7/1480 train_time:24535ms step_avg:nanms step:8/1480 train_time:24677ms step_avg:nanms step:9/1480 train_time:24821ms step_avg:nanms step:10/1480 train_time:24966ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:285ms step_avg:nanms step:13/1480 train_time:426ms step_avg:142.10ms step:14/1480 train_time:568ms step_avg:142.03ms step:15/1480 train_time:710ms step_avg:142.08ms step:16/1480 train_time:852ms step_avg:141.98ms step:17/1480 train_time:995ms step_avg:142.09ms step:18/1480 train_time:1137ms step_avg:142.19ms step:19/1480 train_time:1280ms step_avg:142.26ms step:20/1480 train_time:1424ms step_avg:142.40ms step:21/1480 train_time:1567ms step_avg:142.47ms step:22/1480 train_time:1710ms step_avg:142.52ms step:23/1480 train_time:1852ms step_avg:142.44ms step:24/1480 train_time:1993ms step_avg:142.38ms step:25/1480 train_time:2137ms step_avg:142.47ms step:26/1480 train_time:2282ms step_avg:142.60ms step:27/1480 train_time:2429ms step_avg:142.87ms step:28/1480 train_time:2569ms step_avg:142.74ms step:29/1480 train_time:2710ms 
step_avg:142.64ms step:30/1480 train_time:2851ms step_avg:142.55ms step:31/1480 train_time:2993ms step_avg:142.50ms step:32/1480 train_time:3135ms step_avg:142.50ms step:33/1480 train_time:3277ms step_avg:142.50ms step:34/1480 train_time:3421ms step_avg:142.53ms step:35/1480 train_time:3565ms step_avg:142.58ms step:36/1480 train_time:3707ms step_avg:142.60ms step:37/1480 train_time:3849ms step_avg:142.54ms step:38/1480 train_time:3990ms step_avg:142.50ms step:39/1480 train_time:4132ms step_avg:142.49ms step:40/1480 train_time:4273ms step_avg:142.45ms step:41/1480 train_time:4419ms step_avg:142.54ms step:42/1480 train_time:4562ms step_avg:142.56ms step:43/1480 train_time:4705ms step_avg:142.58ms step:44/1480 train_time:4848ms step_avg:142.58ms step:45/1480 train_time:4990ms step_avg:142.57ms step:46/1480 train_time:5132ms step_avg:142.56ms step:47/1480 train_time:5274ms step_avg:142.53ms step:48/1480 train_time:5417ms step_avg:142.54ms step:49/1480 train_time:5558ms step_avg:142.51ms step:50/1480 train_time:5703ms step_avg:142.57ms step:51/1480 train_time:5847ms step_avg:142.60ms step:52/1480 train_time:5988ms step_avg:142.56ms step:53/1480 train_time:6130ms step_avg:142.55ms step:54/1480 train_time:6271ms step_avg:142.52ms step:55/1480 train_time:6412ms step_avg:142.48ms step:56/1480 train_time:6553ms step_avg:142.45ms step:57/1480 train_time:6695ms step_avg:142.44ms step:58/1480 train_time:6838ms step_avg:142.46ms step:59/1480 train_time:6982ms step_avg:142.49ms step:60/1480 train_time:7125ms step_avg:142.50ms step:61/1480 train_time:7267ms step_avg:142.50ms step:62/1480 train_time:7410ms step_avg:142.51ms step:63/1480 train_time:7551ms step_avg:142.48ms step:64/1480 train_time:7694ms step_avg:142.48ms step:65/1480 train_time:7838ms step_avg:142.51ms step:66/1480 train_time:7983ms step_avg:142.55ms step:67/1480 train_time:8127ms step_avg:142.57ms step:68/1480 train_time:8269ms step_avg:142.57ms step:69/1480 train_time:8411ms step_avg:142.55ms step:70/1480 
train_time:8552ms step_avg:142.53ms step:71/1480 train_time:8693ms step_avg:142.50ms step:72/1480 train_time:8836ms step_avg:142.51ms step:73/1480 train_time:8979ms step_avg:142.52ms step:74/1480 train_time:9122ms step_avg:142.54ms step:75/1480 train_time:9266ms step_avg:142.55ms step:76/1480 train_time:9409ms step_avg:142.56ms step:77/1480 train_time:9551ms step_avg:142.55ms step:78/1480 train_time:9691ms step_avg:142.51ms step:79/1480 train_time:9832ms step_avg:142.49ms step:80/1480 train_time:9975ms step_avg:142.50ms step:81/1480 train_time:10117ms step_avg:142.50ms step:82/1480 train_time:10262ms step_avg:142.53ms step:83/1480 train_time:10405ms step_avg:142.53ms step:84/1480 train_time:10548ms step_avg:142.54ms step:85/1480 train_time:10689ms step_avg:142.51ms step:86/1480 train_time:10830ms step_avg:142.50ms step:87/1480 train_time:10972ms step_avg:142.49ms step:88/1480 train_time:11113ms step_avg:142.47ms step:89/1480 train_time:11253ms step_avg:142.45ms step:90/1480 train_time:11396ms step_avg:142.46ms step:91/1480 train_time:11540ms step_avg:142.47ms step:92/1480 train_time:11684ms step_avg:142.48ms step:93/1480 train_time:11826ms step_avg:142.49ms step:94/1480 train_time:11968ms step_avg:142.48ms step:95/1480 train_time:12110ms step_avg:142.47ms step:96/1480 train_time:12251ms step_avg:142.45ms step:97/1480 train_time:12392ms step_avg:142.44ms step:98/1480 train_time:12534ms step_avg:142.43ms step:99/1480 train_time:12676ms step_avg:142.43ms step:100/1480 train_time:12820ms step_avg:142.44ms step:101/1480 train_time:12962ms step_avg:142.44ms step:102/1480 train_time:13105ms step_avg:142.45ms step:103/1480 train_time:13248ms step_avg:142.45ms step:104/1480 train_time:13389ms step_avg:142.44ms step:105/1480 train_time:13532ms step_avg:142.44ms step:106/1480 train_time:13673ms step_avg:142.43ms step:107/1480 train_time:13814ms step_avg:142.42ms step:108/1480 train_time:13956ms step_avg:142.41ms step:109/1480 train_time:14099ms step_avg:142.42ms step:110/1480 
train_time:14243ms step_avg:142.43ms step:111/1480 train_time:14388ms step_avg:142.45ms step:112/1480 train_time:14534ms step_avg:142.49ms step:113/1480 train_time:14682ms step_avg:142.55ms step:114/1480 train_time:14830ms step_avg:142.59ms step:115/1480 train_time:14975ms step_avg:142.62ms step:116/1480 train_time:15122ms step_avg:142.66ms step:117/1480 train_time:15269ms step_avg:142.71ms step:118/1480 train_time:15415ms step_avg:142.73ms step:119/1480 train_time:15564ms step_avg:142.78ms step:120/1480 train_time:15711ms step_avg:142.83ms step:121/1480 train_time:15857ms step_avg:142.86ms step:122/1480 train_time:16006ms step_avg:142.91ms step:123/1480 train_time:16152ms step_avg:142.94ms step:124/1480 train_time:16300ms step_avg:142.99ms step:125/1480 train_time:16448ms step_avg:143.03ms step:125/1480 val_loss:4.4184 train_time:16505ms step_avg:143.52ms step:126/1480 train_time:16601ms step_avg:143.11ms step:127/1480 train_time:16749ms step_avg:143.15ms step:128/1480 train_time:16894ms step_avg:143.17ms step:129/1480 train_time:17040ms step_avg:143.19ms step:130/1480 train_time:17186ms step_avg:143.22ms step:131/1480 train_time:17331ms step_avg:143.23ms step:132/1480 train_time:17478ms step_avg:143.27ms step:133/1480 train_time:17627ms step_avg:143.31ms step:134/1480 train_time:17775ms step_avg:143.35ms step:135/1480 train_time:17923ms step_avg:143.38ms step:136/1480 train_time:18069ms step_avg:143.40ms step:137/1480 train_time:18215ms step_avg:143.42ms step:138/1480 train_time:18361ms step_avg:143.45ms step:139/1480 train_time:18507ms step_avg:143.47ms step:140/1480 train_time:18655ms step_avg:143.50ms step:141/1480 train_time:18803ms step_avg:143.54ms step:142/1480 train_time:18949ms step_avg:143.56ms step:143/1480 train_time:19096ms step_avg:143.58ms step:144/1480 train_time:19244ms step_avg:143.61ms step:145/1480 train_time:19390ms step_avg:143.63ms step:146/1480 train_time:19537ms step_avg:143.66ms step:147/1480 train_time:19685ms step_avg:143.68ms 
step:148/1480 train_time:19831ms step_avg:143.70ms step:149/1480 train_time:19979ms step_avg:143.74ms step:150/1480 train_time:20126ms step_avg:143.75ms step:151/1480 train_time:20271ms step_avg:143.77ms step:152/1480 train_time:20419ms step_avg:143.79ms step:153/1480 train_time:20566ms step_avg:143.81ms step:154/1480 train_time:20711ms step_avg:143.83ms step:155/1480 train_time:20859ms step_avg:143.86ms step:156/1480 train_time:21006ms step_avg:143.88ms step:157/1480 train_time:21153ms step_avg:143.90ms step:158/1480 train_time:21300ms step_avg:143.92ms step:159/1480 train_time:21446ms step_avg:143.94ms step:160/1480 train_time:21590ms step_avg:143.94ms step:161/1480 train_time:21739ms step_avg:143.97ms step:162/1480 train_time:21886ms step_avg:143.99ms step:163/1480 train_time:22033ms step_avg:144.00ms step:164/1480 train_time:22181ms step_avg:144.03ms step:165/1480 train_time:22328ms step_avg:144.05ms step:166/1480 train_time:22475ms step_avg:144.07ms step:167/1480 train_time:22621ms step_avg:144.09ms step:168/1480 train_time:22767ms step_avg:144.10ms step:169/1480 train_time:22913ms step_avg:144.11ms step:170/1480 train_time:23062ms step_avg:144.13ms step:171/1480 train_time:23208ms step_avg:144.15ms step:172/1480 train_time:23354ms step_avg:144.16ms step:173/1480 train_time:23502ms step_avg:144.18ms step:174/1480 train_time:23648ms step_avg:144.19ms step:175/1480 train_time:23793ms step_avg:144.20ms step:176/1480 train_time:23942ms step_avg:144.23ms step:177/1480 train_time:24088ms step_avg:144.24ms step:178/1480 train_time:24236ms step_avg:144.26ms step:179/1480 train_time:24383ms step_avg:144.28ms step:180/1480 train_time:24529ms step_avg:144.29ms step:181/1480 train_time:24678ms step_avg:144.31ms step:182/1480 train_time:24824ms step_avg:144.32ms step:183/1480 train_time:24969ms step_avg:144.33ms step:184/1480 train_time:25116ms step_avg:144.34ms step:185/1480 train_time:25264ms step_avg:144.37ms step:186/1480 train_time:25410ms step_avg:144.37ms 
step:187/1480 train_time:25558ms step_avg:144.39ms step:188/1480 train_time:25705ms step_avg:144.41ms step:189/1480 train_time:25851ms step_avg:144.42ms step:190/1480 train_time:25998ms step_avg:144.43ms step:191/1480 train_time:26146ms step_avg:144.45ms step:192/1480 train_time:26292ms step_avg:144.46ms step:193/1480 train_time:26441ms step_avg:144.48ms step:194/1480 train_time:26587ms step_avg:144.49ms step:195/1480 train_time:26734ms step_avg:144.51ms step:196/1480 train_time:26881ms step_avg:144.52ms step:197/1480 train_time:27027ms step_avg:144.53ms step:198/1480 train_time:27173ms step_avg:144.54ms step:199/1480 train_time:27320ms step_avg:144.55ms step:200/1480 train_time:27467ms step_avg:144.56ms step:201/1480 train_time:27613ms step_avg:144.57ms step:202/1480 train_time:27761ms step_avg:144.59ms step:203/1480 train_time:27907ms step_avg:144.60ms step:204/1480 train_time:28055ms step_avg:144.61ms step:205/1480 train_time:28203ms step_avg:144.63ms step:206/1480 train_time:28349ms step_avg:144.64ms step:207/1480 train_time:28495ms step_avg:144.64ms step:208/1480 train_time:28642ms step_avg:144.66ms step:209/1480 train_time:28788ms step_avg:144.66ms step:210/1480 train_time:28936ms step_avg:144.68ms step:211/1480 train_time:29082ms step_avg:144.69ms step:212/1480 train_time:29228ms step_avg:144.69ms step:213/1480 train_time:29376ms step_avg:144.71ms step:214/1480 train_time:29523ms step_avg:144.72ms step:215/1480 train_time:29669ms step_avg:144.73ms step:216/1480 train_time:29816ms step_avg:144.74ms step:217/1480 train_time:29963ms step_avg:144.75ms step:218/1480 train_time:30110ms step_avg:144.76ms step:219/1480 train_time:30256ms step_avg:144.77ms step:220/1480 train_time:30404ms step_avg:144.78ms step:221/1480 train_time:30552ms step_avg:144.80ms step:222/1480 train_time:30703ms step_avg:144.83ms step:223/1480 train_time:30854ms step_avg:144.86ms step:224/1480 train_time:31006ms step_avg:144.89ms step:225/1480 train_time:31157ms step_avg:144.91ms 
step:226/1480 train_time:31307ms step_avg:144.94ms step:227/1480 train_time:31459ms step_avg:144.97ms step:228/1480 train_time:31609ms step_avg:144.99ms step:229/1480 train_time:31760ms step_avg:145.02ms step:230/1480 train_time:31910ms step_avg:145.04ms step:231/1480 train_time:32061ms step_avg:145.07ms step:232/1480 train_time:32211ms step_avg:145.09ms step:233/1480 train_time:32362ms step_avg:145.12ms step:234/1480 train_time:32512ms step_avg:145.14ms step:235/1480 train_time:32663ms step_avg:145.17ms step:236/1480 train_time:32813ms step_avg:145.19ms step:237/1480 train_time:32964ms step_avg:145.22ms step:238/1480 train_time:33114ms step_avg:145.24ms step:239/1480 train_time:33265ms step_avg:145.26ms step:240/1480 train_time:33414ms step_avg:145.28ms step:241/1480 train_time:33565ms step_avg:145.30ms step:242/1480 train_time:33715ms step_avg:145.32ms step:243/1480 train_time:33866ms step_avg:145.35ms step:244/1480 train_time:34016ms step_avg:145.37ms step:245/1480 train_time:34167ms step_avg:145.39ms step:246/1480 train_time:34316ms step_avg:145.41ms step:247/1480 train_time:34467ms step_avg:145.43ms step:248/1480 train_time:34619ms step_avg:145.46ms step:249/1480 train_time:34769ms step_avg:145.48ms step:250/1480 train_time:34921ms step_avg:145.50ms step:250/1480 val_loss:3.9943 train_time:34980ms step_avg:145.75ms step:251/1480 train_time:35077ms step_avg:145.55ms step:252/1480 train_time:35228ms step_avg:145.57ms step:253/1480 train_time:35379ms step_avg:145.59ms step:254/1480 train_time:35527ms step_avg:145.60ms step:255/1480 train_time:35677ms step_avg:145.62ms step:256/1480 train_time:35826ms step_avg:145.63ms step:257/1480 train_time:35977ms step_avg:145.66ms step:258/1480 train_time:36128ms step_avg:145.68ms step:259/1480 train_time:36280ms step_avg:145.70ms step:260/1480 train_time:36431ms step_avg:145.72ms step:261/1480 train_time:36582ms step_avg:145.74ms step:262/1480 train_time:36731ms step_avg:145.76ms step:263/1480 train_time:36881ms 
step_avg:145.78ms step:264/1480 train_time:37032ms step_avg:145.79ms step:265/1480 train_time:37183ms step_avg:145.81ms step:266/1480 train_time:37333ms step_avg:145.83ms step:267/1480 train_time:37486ms step_avg:145.86ms step:268/1480 train_time:37633ms step_avg:145.87ms step:269/1480 train_time:37784ms step_avg:145.88ms step:270/1480 train_time:37934ms step_avg:145.90ms step:271/1480 train_time:38085ms step_avg:145.92ms step:272/1480 train_time:38236ms step_avg:145.94ms step:273/1480 train_time:38386ms step_avg:145.95ms step:274/1480 train_time:38536ms step_avg:145.97ms step:275/1480 train_time:38687ms step_avg:145.99ms step:276/1480 train_time:38838ms step_avg:146.01ms step:277/1480 train_time:38987ms step_avg:146.02ms step:278/1480 train_time:39138ms step_avg:146.04ms step:279/1480 train_time:39289ms step_avg:146.06ms step:280/1480 train_time:39440ms step_avg:146.07ms step:281/1480 train_time:39590ms step_avg:146.09ms step:282/1480 train_time:39741ms step_avg:146.11ms step:283/1480 train_time:39890ms step_avg:146.12ms step:284/1480 train_time:40041ms step_avg:146.13ms step:285/1480 train_time:40191ms step_avg:146.15ms step:286/1480 train_time:40341ms step_avg:146.16ms step:287/1480 train_time:40492ms step_avg:146.18ms step:288/1480 train_time:40643ms step_avg:146.20ms step:289/1480 train_time:40794ms step_avg:146.22ms step:290/1480 train_time:40945ms step_avg:146.23ms step:291/1480 train_time:41096ms step_avg:146.25ms step:292/1480 train_time:41246ms step_avg:146.26ms step:293/1480 train_time:41397ms step_avg:146.28ms step:294/1480 train_time:41547ms step_avg:146.29ms step:295/1480 train_time:41698ms step_avg:146.31ms step:296/1480 train_time:41847ms step_avg:146.32ms step:297/1480 train_time:41998ms step_avg:146.33ms step:298/1480 train_time:42148ms step_avg:146.35ms step:299/1480 train_time:42299ms step_avg:146.37ms step:300/1480 train_time:42449ms step_avg:146.38ms step:301/1480 train_time:42599ms step_avg:146.39ms step:302/1480 train_time:42750ms 
step_avg:146.40ms step:303/1480 train_time:42901ms step_avg:146.42ms step:304/1480 train_time:43050ms step_avg:146.43ms step:305/1480 train_time:43202ms step_avg:146.45ms step:306/1480 train_time:43352ms step_avg:146.46ms step:307/1480 train_time:43503ms step_avg:146.48ms step:308/1480 train_time:43653ms step_avg:146.49ms step:309/1480 train_time:43804ms step_avg:146.50ms step:310/1480 train_time:43953ms step_avg:146.51ms step:311/1480 train_time:44104ms step_avg:146.53ms step:312/1480 train_time:44255ms step_avg:146.54ms step:313/1480 train_time:44405ms step_avg:146.55ms step:314/1480 train_time:44556ms step_avg:146.56ms step:315/1480 train_time:44706ms step_avg:146.58ms step:316/1480 train_time:44857ms step_avg:146.59ms step:317/1480 train_time:45007ms step_avg:146.60ms step:318/1480 train_time:45159ms step_avg:146.62ms step:319/1480 train_time:45310ms step_avg:146.63ms step:320/1480 train_time:45461ms step_avg:146.65ms step:321/1480 train_time:45611ms step_avg:146.66ms step:322/1480 train_time:45763ms step_avg:146.68ms step:323/1480 train_time:45912ms step_avg:146.68ms step:324/1480 train_time:46062ms step_avg:146.69ms step:325/1480 train_time:46211ms step_avg:146.70ms step:326/1480 train_time:46362ms step_avg:146.72ms step:327/1480 train_time:46512ms step_avg:146.72ms step:328/1480 train_time:46662ms step_avg:146.74ms step:329/1480 train_time:46812ms step_avg:146.75ms step:330/1480 train_time:46964ms step_avg:146.76ms step:331/1480 train_time:47118ms step_avg:146.79ms step:332/1480 train_time:47273ms step_avg:146.81ms step:333/1480 train_time:47426ms step_avg:146.83ms step:334/1480 train_time:47580ms step_avg:146.85ms step:335/1480 train_time:47734ms step_avg:146.87ms step:336/1480 train_time:47887ms step_avg:146.89ms step:337/1480 train_time:48041ms step_avg:146.91ms step:338/1480 train_time:48195ms step_avg:146.94ms step:339/1480 train_time:48348ms step_avg:146.96ms step:340/1480 train_time:48503ms step_avg:146.98ms step:341/1480 train_time:48657ms 
step_avg:147.00ms step:342/1480 train_time:48810ms step_avg:147.02ms step:343/1480 train_time:48966ms step_avg:147.04ms step:344/1480 train_time:49119ms step_avg:147.06ms step:345/1480 train_time:49274ms step_avg:147.09ms step:346/1480 train_time:49427ms step_avg:147.10ms step:347/1480 train_time:49581ms step_avg:147.13ms step:348/1480 train_time:49735ms step_avg:147.14ms step:349/1480 train_time:49889ms step_avg:147.16ms step:350/1480 train_time:50042ms step_avg:147.18ms step:351/1480 train_time:50196ms step_avg:147.20ms step:352/1480 train_time:50351ms step_avg:147.22ms step:353/1480 train_time:50505ms step_avg:147.24ms step:354/1480 train_time:50658ms step_avg:147.26ms step:355/1480 train_time:50811ms step_avg:147.28ms step:356/1480 train_time:50966ms step_avg:147.30ms step:357/1480 train_time:51120ms step_avg:147.32ms step:358/1480 train_time:51274ms step_avg:147.34ms step:359/1480 train_time:51428ms step_avg:147.36ms step:360/1480 train_time:51583ms step_avg:147.38ms step:361/1480 train_time:51738ms step_avg:147.40ms step:362/1480 train_time:51893ms step_avg:147.42ms step:363/1480 train_time:52046ms step_avg:147.44ms step:364/1480 train_time:52201ms step_avg:147.46ms step:365/1480 train_time:52357ms step_avg:147.48ms step:366/1480 train_time:52510ms step_avg:147.50ms step:367/1480 train_time:52664ms step_avg:147.52ms step:368/1480 train_time:52817ms step_avg:147.53ms step:369/1480 train_time:52971ms step_avg:147.55ms step:370/1480 train_time:53124ms step_avg:147.57ms step:371/1480 train_time:53279ms step_avg:147.59ms step:372/1480 train_time:53433ms step_avg:147.60ms step:373/1480 train_time:53586ms step_avg:147.62ms step:374/1480 train_time:53739ms step_avg:147.63ms step:375/1480 train_time:53893ms step_avg:147.65ms step:375/1480 val_loss:3.8084 train_time:53954ms step_avg:147.82ms step:376/1480 train_time:54051ms step_avg:147.68ms step:377/1480 train_time:54205ms step_avg:147.70ms step:378/1480 train_time:54359ms step_avg:147.71ms step:379/1480 
train_time:54511ms step_avg:147.73ms step:380/1480 train_time:54663ms step_avg:147.74ms step:381/1480 train_time:54815ms step_avg:147.75ms step:382/1480 train_time:54968ms step_avg:147.76ms step:383/1480 train_time:55123ms step_avg:147.78ms step:384/1480 train_time:55277ms step_avg:147.80ms step:385/1480 train_time:55429ms step_avg:147.81ms step:386/1480 train_time:55583ms step_avg:147.83ms step:387/1480 train_time:55737ms step_avg:147.84ms step:388/1480 train_time:55891ms step_avg:147.86ms step:389/1480 train_time:56045ms step_avg:147.88ms step:390/1480 train_time:56200ms step_avg:147.90ms step:391/1480 train_time:56355ms step_avg:147.91ms step:392/1480 train_time:56508ms step_avg:147.93ms step:393/1480 train_time:56663ms step_avg:147.94ms step:394/1480 train_time:56816ms step_avg:147.96ms step:395/1480 train_time:56970ms step_avg:147.97ms step:396/1480 train_time:57124ms step_avg:147.99ms step:397/1480 train_time:57279ms step_avg:148.01ms step:398/1480 train_time:57433ms step_avg:148.02ms step:399/1480 train_time:57587ms step_avg:148.04ms step:400/1480 train_time:57741ms step_avg:148.05ms step:401/1480 train_time:57896ms step_avg:148.07ms step:402/1480 train_time:58050ms step_avg:148.09ms step:403/1480 train_time:58204ms step_avg:148.10ms step:404/1480 train_time:58359ms step_avg:148.12ms step:405/1480 train_time:58513ms step_avg:148.13ms step:406/1480 train_time:58668ms step_avg:148.15ms step:407/1480 train_time:58822ms step_avg:148.17ms step:408/1480 train_time:58976ms step_avg:148.18ms step:409/1480 train_time:59131ms step_avg:148.20ms step:410/1480 train_time:59284ms step_avg:148.21ms step:411/1480 train_time:59437ms step_avg:148.22ms step:412/1480 train_time:59590ms step_avg:148.23ms step:413/1480 train_time:59743ms step_avg:148.25ms step:414/1480 train_time:59898ms step_avg:148.26ms step:415/1480 train_time:60051ms step_avg:148.27ms step:416/1480 train_time:60205ms step_avg:148.29ms step:417/1480 train_time:60360ms step_avg:148.30ms step:418/1480 
train_time:60513ms step_avg:148.32ms step:419/1480 train_time:60665ms step_avg:148.33ms step:420/1480 train_time:60819ms step_avg:148.34ms step:421/1480 train_time:60974ms step_avg:148.35ms step:422/1480 train_time:61127ms step_avg:148.37ms step:423/1480 train_time:61280ms step_avg:148.38ms step:424/1480 train_time:61434ms step_avg:148.39ms step:425/1480 train_time:61589ms step_avg:148.41ms step:426/1480 train_time:61743ms step_avg:148.42ms step:427/1480 train_time:61898ms step_avg:148.44ms step:428/1480 train_time:62051ms step_avg:148.45ms step:429/1480 train_time:62203ms step_avg:148.46ms step:430/1480 train_time:62358ms step_avg:148.47ms step:431/1480 train_time:62511ms step_avg:148.48ms step:432/1480 train_time:62664ms step_avg:148.49ms step:433/1480 train_time:62818ms step_avg:148.51ms step:434/1480 train_time:62974ms step_avg:148.52ms step:435/1480 train_time:63127ms step_avg:148.53ms step:436/1480 train_time:63282ms step_avg:148.55ms step:437/1480 train_time:63436ms step_avg:148.56ms step:438/1480 train_time:63591ms step_avg:148.58ms step:439/1480 train_time:63745ms step_avg:148.59ms step:440/1480 train_time:63900ms step_avg:148.60ms step:441/1480 train_time:64058ms step_avg:148.63ms step:442/1480 train_time:64216ms step_avg:148.65ms step:443/1480 train_time:64374ms step_avg:148.67ms step:444/1480 train_time:64530ms step_avg:148.69ms step:445/1480 train_time:64686ms step_avg:148.70ms step:446/1480 train_time:64841ms step_avg:148.72ms step:447/1480 train_time:64998ms step_avg:148.74ms step:448/1480 train_time:65155ms step_avg:148.76ms step:449/1480 train_time:65315ms step_avg:148.78ms step:450/1480 train_time:65474ms step_avg:148.80ms step:451/1480 train_time:65632ms step_avg:148.82ms step:452/1480 train_time:65788ms step_avg:148.84ms step:453/1480 train_time:65946ms step_avg:148.86ms step:454/1480 train_time:66102ms step_avg:148.88ms step:455/1480 train_time:66258ms step_avg:148.89ms step:456/1480 train_time:66412ms step_avg:148.91ms step:457/1480 
train_time:66569ms step_avg:148.92ms step:458/1480 train_time:66724ms step_avg:148.94ms step:459/1480 train_time:66881ms step_avg:148.95ms step:460/1480 train_time:67038ms step_avg:148.97ms step:461/1480 train_time:67197ms step_avg:149.00ms step:462/1480 train_time:67353ms step_avg:149.01ms step:463/1480 train_time:67510ms step_avg:149.03ms step:464/1480 train_time:67665ms step_avg:149.04ms step:465/1480 train_time:67821ms step_avg:149.06ms step:466/1480 train_time:67977ms step_avg:149.07ms step:467/1480 train_time:68135ms step_avg:149.09ms step:468/1480 train_time:68293ms step_avg:149.11ms step:469/1480 train_time:68449ms step_avg:149.13ms step:470/1480 train_time:68606ms step_avg:149.14ms step:471/1480 train_time:68762ms step_avg:149.16ms step:472/1480 train_time:68919ms step_avg:149.18ms step:473/1480 train_time:69078ms step_avg:149.20ms step:474/1480 train_time:69233ms step_avg:149.21ms step:475/1480 train_time:69389ms step_avg:149.22ms step:476/1480 train_time:69545ms step_avg:149.24ms step:477/1480 train_time:69702ms step_avg:149.26ms step:478/1480 train_time:69859ms step_avg:149.27ms step:479/1480 train_time:70015ms step_avg:149.29ms step:480/1480 train_time:70173ms step_avg:149.30ms step:481/1480 train_time:70331ms step_avg:149.32ms step:482/1480 train_time:70487ms step_avg:149.34ms step:483/1480 train_time:70643ms step_avg:149.35ms step:484/1480 train_time:70801ms step_avg:149.37ms step:485/1480 train_time:70959ms step_avg:149.39ms step:486/1480 train_time:71117ms step_avg:149.41ms step:487/1480 train_time:71276ms step_avg:149.42ms step:488/1480 train_time:71433ms step_avg:149.44ms step:489/1480 train_time:71591ms step_avg:149.46ms step:490/1480 train_time:71748ms step_avg:149.48ms step:491/1480 train_time:71905ms step_avg:149.49ms step:492/1480 train_time:72062ms step_avg:149.51ms step:493/1480 train_time:72220ms step_avg:149.52ms step:494/1480 train_time:72378ms step_avg:149.54ms step:495/1480 train_time:72536ms step_avg:149.56ms step:496/1480 
train_time:72694ms step_avg:149.58ms step:497/1480 train_time:72852ms step_avg:149.59ms step:498/1480 train_time:73009ms step_avg:149.61ms step:499/1480 train_time:73166ms step_avg:149.62ms step:500/1480 train_time:73323ms step_avg:149.64ms step:500/1480 val_loss:3.6884 train_time:73384ms step_avg:149.76ms step:501/1480 train_time:73482ms step_avg:149.66ms step:502/1480 train_time:73640ms step_avg:149.68ms step:503/1480 train_time:73797ms step_avg:149.69ms step:504/1480 train_time:73954ms step_avg:149.70ms step:505/1480 train_time:74109ms step_avg:149.71ms step:506/1480 train_time:74266ms step_avg:149.73ms step:507/1480 train_time:74421ms step_avg:149.74ms step:508/1480 train_time:74581ms step_avg:149.76ms step:509/1480 train_time:74738ms step_avg:149.77ms step:510/1480 train_time:74895ms step_avg:149.79ms step:511/1480 train_time:75054ms step_avg:149.81ms step:512/1480 train_time:75210ms step_avg:149.82ms step:513/1480 train_time:75366ms step_avg:149.83ms step:514/1480 train_time:75523ms step_avg:149.85ms step:515/1480 train_time:75679ms step_avg:149.86ms step:516/1480 train_time:75838ms step_avg:149.88ms step:517/1480 train_time:75995ms step_avg:149.89ms step:518/1480 train_time:76152ms step_avg:149.91ms step:519/1480 train_time:76310ms step_avg:149.92ms step:520/1480 train_time:76469ms step_avg:149.94ms step:521/1480 train_time:76626ms step_avg:149.95ms step:522/1480 train_time:76783ms step_avg:149.97ms step:523/1480 train_time:76940ms step_avg:149.98ms step:524/1480 train_time:77097ms step_avg:149.99ms step:525/1480 train_time:77256ms step_avg:150.01ms step:526/1480 train_time:77413ms step_avg:150.03ms step:527/1480 train_time:77571ms step_avg:150.04ms step:528/1480 train_time:77728ms step_avg:150.05ms step:529/1480 train_time:77884ms step_avg:150.07ms step:530/1480 train_time:78040ms step_avg:150.08ms step:531/1480 train_time:78197ms step_avg:150.09ms step:532/1480 train_time:78355ms step_avg:150.10ms step:533/1480 train_time:78513ms step_avg:150.12ms 
step:534/1480 train_time:78671ms step_avg:150.14ms step:535/1480 train_time:78829ms step_avg:150.15ms step:536/1480 train_time:78987ms step_avg:150.17ms step:537/1480 train_time:79144ms step_avg:150.18ms step:538/1480 train_time:79301ms step_avg:150.19ms step:539/1480 train_time:79460ms step_avg:150.21ms step:540/1480 train_time:79617ms step_avg:150.22ms step:541/1480 train_time:79774ms step_avg:150.23ms step:542/1480 train_time:79932ms step_avg:150.25ms step:543/1480 train_time:80089ms step_avg:150.26ms step:544/1480 train_time:80246ms step_avg:150.27ms step:545/1480 train_time:80402ms step_avg:150.28ms step:546/1480 train_time:80559ms step_avg:150.30ms step:547/1480 train_time:80715ms step_avg:150.31ms step:548/1480 train_time:80874ms step_avg:150.32ms step:549/1480 train_time:81031ms step_avg:150.34ms step:550/1480 train_time:81188ms step_avg:150.35ms step:551/1480 train_time:81347ms step_avg:150.36ms step:552/1480 train_time:81504ms step_avg:150.38ms step:553/1480 train_time:81662ms step_avg:150.39ms step:554/1480 train_time:81821ms step_avg:150.41ms step:555/1480 train_time:81980ms step_avg:150.42ms step:556/1480 train_time:82137ms step_avg:150.43ms step:557/1480 train_time:82297ms step_avg:150.45ms step:558/1480 train_time:82458ms step_avg:150.47ms step:559/1480 train_time:82617ms step_avg:150.49ms step:560/1480 train_time:82778ms step_avg:150.51ms step:561/1480 train_time:82937ms step_avg:150.52ms step:562/1480 train_time:83097ms step_avg:150.54ms step:563/1480 train_time:83256ms step_avg:150.55ms step:564/1480 train_time:83416ms step_avg:150.57ms step:565/1480 train_time:83577ms step_avg:150.59ms step:566/1480 train_time:83738ms step_avg:150.61ms step:567/1480 train_time:83898ms step_avg:150.62ms step:568/1480 train_time:84056ms step_avg:150.64ms step:569/1480 train_time:84215ms step_avg:150.65ms step:570/1480 train_time:84375ms step_avg:150.67ms step:571/1480 train_time:84536ms step_avg:150.69ms step:572/1480 train_time:84696ms step_avg:150.71ms 
step:573/1480 train_time:84857ms step_avg:150.72ms step:574/1480 train_time:85017ms step_avg:150.74ms step:575/1480 train_time:85178ms step_avg:150.76ms step:576/1480 train_time:85338ms step_avg:150.77ms step:577/1480 train_time:85496ms step_avg:150.79ms step:578/1480 train_time:85657ms step_avg:150.80ms step:579/1480 train_time:85817ms step_avg:150.82ms step:580/1480 train_time:85977ms step_avg:150.84ms step:581/1480 train_time:86138ms step_avg:150.85ms step:582/1480 train_time:86297ms step_avg:150.87ms step:583/1480 train_time:86458ms step_avg:150.89ms step:584/1480 train_time:86618ms step_avg:150.90ms step:585/1480 train_time:86778ms step_avg:150.92ms step:586/1480 train_time:86937ms step_avg:150.93ms step:587/1480 train_time:87098ms step_avg:150.95ms step:588/1480 train_time:87256ms step_avg:150.96ms step:589/1480 train_time:87417ms step_avg:150.98ms step:590/1480 train_time:87577ms step_avg:151.00ms step:591/1480 train_time:87736ms step_avg:151.01ms step:592/1480 train_time:87896ms step_avg:151.02ms step:593/1480 train_time:88057ms step_avg:151.04ms step:594/1480 train_time:88218ms step_avg:151.06ms step:595/1480 train_time:88381ms step_avg:151.08ms step:596/1480 train_time:88542ms step_avg:151.10ms step:597/1480 train_time:88701ms step_avg:151.11ms step:598/1480 train_time:88859ms step_avg:151.12ms step:599/1480 train_time:89017ms step_avg:151.13ms step:600/1480 train_time:89178ms step_avg:151.15ms step:601/1480 train_time:89337ms step_avg:151.16ms step:602/1480 train_time:89497ms step_avg:151.18ms step:603/1480 train_time:89658ms step_avg:151.19ms step:604/1480 train_time:89818ms step_avg:151.21ms step:605/1480 train_time:89978ms step_avg:151.22ms step:606/1480 train_time:90140ms step_avg:151.24ms step:607/1480 train_time:90300ms step_avg:151.26ms step:608/1480 train_time:90459ms step_avg:151.27ms step:609/1480 train_time:90617ms step_avg:151.28ms step:610/1480 train_time:90777ms step_avg:151.29ms step:611/1480 train_time:90938ms step_avg:151.31ms 
step:612/1480 train_time:91097ms step_avg:151.32ms step:613/1480 train_time:91259ms step_avg:151.34ms step:614/1480 train_time:91419ms step_avg:151.36ms step:615/1480 train_time:91577ms step_avg:151.37ms step:616/1480 train_time:91737ms step_avg:151.38ms step:617/1480 train_time:91896ms step_avg:151.39ms step:618/1480 train_time:92056ms step_avg:151.41ms step:619/1480 train_time:92216ms step_avg:151.42ms step:620/1480 train_time:92377ms step_avg:151.44ms step:621/1480 train_time:92536ms step_avg:151.45ms step:622/1480 train_time:92696ms step_avg:151.46ms step:623/1480 train_time:92857ms step_avg:151.48ms step:624/1480 train_time:93017ms step_avg:151.49ms step:625/1480 train_time:93178ms step_avg:151.51ms step:625/1480 val_loss:3.6049 train_time:93241ms step_avg:151.61ms step:626/1480 train_time:93340ms step_avg:151.53ms step:627/1480 train_time:93499ms step_avg:151.54ms step:628/1480 train_time:93656ms step_avg:151.55ms step:629/1480 train_time:93814ms step_avg:151.56ms step:630/1480 train_time:93972ms step_avg:151.57ms step:631/1480 train_time:94130ms step_avg:151.58ms step:632/1480 train_time:94289ms step_avg:151.59ms step:633/1480 train_time:94449ms step_avg:151.60ms step:634/1480 train_time:94609ms step_avg:151.62ms step:635/1480 train_time:94769ms step_avg:151.63ms step:636/1480 train_time:94928ms step_avg:151.64ms step:637/1480 train_time:95088ms step_avg:151.66ms step:638/1480 train_time:95249ms step_avg:151.67ms step:639/1480 train_time:95408ms step_avg:151.68ms step:640/1480 train_time:95568ms step_avg:151.70ms step:641/1480 train_time:95729ms step_avg:151.71ms step:642/1480 train_time:95888ms step_avg:151.72ms step:643/1480 train_time:96050ms step_avg:151.74ms step:644/1480 train_time:96209ms step_avg:151.75ms step:645/1480 train_time:96367ms step_avg:151.76ms step:646/1480 train_time:96526ms step_avg:151.77ms step:647/1480 train_time:96685ms step_avg:151.78ms step:648/1480 train_time:96847ms step_avg:151.80ms step:649/1480 train_time:97008ms 
step_avg:151.81ms step:650/1480 train_time:97168ms step_avg:151.82ms step:651/1480 train_time:97328ms step_avg:151.84ms step:652/1480 train_time:97488ms step_avg:151.85ms step:653/1480 train_time:97648ms step_avg:151.86ms step:654/1480 train_time:97808ms step_avg:151.88ms step:655/1480 train_time:97968ms step_avg:151.89ms step:656/1480 train_time:98128ms step_avg:151.90ms step:657/1480 train_time:98288ms step_avg:151.91ms step:658/1480 train_time:98448ms step_avg:151.93ms step:659/1480 train_time:98609ms step_avg:151.94ms step:660/1480 train_time:98771ms step_avg:151.96ms step:661/1480 train_time:98933ms step_avg:151.97ms step:662/1480 train_time:99092ms step_avg:151.98ms step:663/1480 train_time:99253ms step_avg:151.99ms step:664/1480 train_time:99414ms step_avg:152.01ms step:665/1480 train_time:99575ms step_avg:152.02ms step:666/1480 train_time:99735ms step_avg:152.03ms step:667/1480 train_time:99895ms step_avg:152.05ms step:668/1480 train_time:100057ms step_avg:152.06ms step:669/1480 train_time:100221ms step_avg:152.08ms step:670/1480 train_time:100381ms step_avg:152.09ms step:671/1480 train_time:100542ms step_avg:152.11ms step:672/1480 train_time:100704ms step_avg:152.12ms step:673/1480 train_time:100867ms step_avg:152.14ms step:674/1480 train_time:101031ms step_avg:152.15ms step:675/1480 train_time:101192ms step_avg:152.17ms step:676/1480 train_time:101354ms step_avg:152.18ms step:677/1480 train_time:101514ms step_avg:152.20ms step:678/1480 train_time:101675ms step_avg:152.21ms step:679/1480 train_time:101837ms step_avg:152.22ms step:680/1480 train_time:101998ms step_avg:152.24ms step:681/1480 train_time:102159ms step_avg:152.25ms step:682/1480 train_time:102324ms step_avg:152.27ms step:683/1480 train_time:102487ms step_avg:152.28ms step:684/1480 train_time:102650ms step_avg:152.30ms step:685/1480 train_time:102813ms step_avg:152.32ms step:686/1480 train_time:102974ms step_avg:152.33ms step:687/1480 train_time:103134ms step_avg:152.34ms step:688/1480 
train_time:103297ms step_avg:152.36ms step:689/1480 train_time:103460ms step_avg:152.37ms step:690/1480 train_time:103625ms step_avg:152.39ms step:691/1480 train_time:103787ms step_avg:152.40ms step:692/1480 train_time:103950ms step_avg:152.42ms step:693/1480 train_time:104112ms step_avg:152.43ms step:694/1480 train_time:104273ms step_avg:152.45ms step:695/1480 train_time:104433ms step_avg:152.46ms step:696/1480 train_time:104593ms step_avg:152.47ms step:697/1480 train_time:104755ms step_avg:152.48ms step:698/1480 train_time:104916ms step_avg:152.49ms step:699/1480 train_time:105078ms step_avg:152.51ms step:700/1480 train_time:105238ms step_avg:152.52ms step:701/1480 train_time:105399ms step_avg:152.53ms step:702/1480 train_time:105561ms step_avg:152.54ms step:703/1480 train_time:105722ms step_avg:152.56ms step:704/1480 train_time:105883ms step_avg:152.57ms step:705/1480 train_time:106048ms step_avg:152.59ms step:706/1480 train_time:106210ms step_avg:152.60ms step:707/1480 train_time:106371ms step_avg:152.61ms step:708/1480 train_time:106531ms step_avg:152.62ms step:709/1480 train_time:106693ms step_avg:152.64ms step:710/1480 train_time:106853ms step_avg:152.65ms step:711/1480 train_time:107015ms step_avg:152.66ms step:712/1480 train_time:107180ms step_avg:152.68ms step:713/1480 train_time:107345ms step_avg:152.70ms step:714/1480 train_time:107505ms step_avg:152.71ms step:715/1480 train_time:107668ms step_avg:152.72ms step:716/1480 train_time:107830ms step_avg:152.73ms step:717/1480 train_time:107992ms step_avg:152.75ms step:718/1480 train_time:108151ms step_avg:152.76ms step:719/1480 train_time:108310ms step_avg:152.76ms step:720/1480 train_time:108472ms step_avg:152.78ms step:721/1480 train_time:108633ms step_avg:152.79ms step:722/1480 train_time:108795ms step_avg:152.80ms step:723/1480 train_time:108955ms step_avg:152.81ms step:724/1480 train_time:109117ms step_avg:152.82ms step:725/1480 train_time:109281ms step_avg:152.84ms step:726/1480 train_time:109446ms 
step_avg:152.86ms step:727/1480 train_time:109610ms step_avg:152.87ms step:728/1480 train_time:109771ms step_avg:152.88ms step:729/1480 train_time:109932ms step_avg:152.90ms step:730/1480 train_time:110093ms step_avg:152.91ms step:731/1480 train_time:110254ms step_avg:152.92ms step:732/1480 train_time:110413ms step_avg:152.93ms step:733/1480 train_time:110573ms step_avg:152.94ms step:734/1480 train_time:110737ms step_avg:152.95ms step:735/1480 train_time:110897ms step_avg:152.96ms step:736/1480 train_time:111059ms step_avg:152.97ms step:737/1480 train_time:111219ms step_avg:152.98ms step:738/1480 train_time:111382ms step_avg:153.00ms step:739/1480 train_time:111543ms step_avg:153.01ms step:740/1480 train_time:111709ms step_avg:153.03ms step:741/1480 train_time:111873ms step_avg:153.04ms step:742/1480 train_time:112034ms step_avg:153.05ms step:743/1480 train_time:112194ms step_avg:153.06ms step:744/1480 train_time:112357ms step_avg:153.07ms step:745/1480 train_time:112521ms step_avg:153.09ms step:746/1480 train_time:112682ms step_avg:153.10ms step:747/1480 train_time:112844ms step_avg:153.11ms step:748/1480 train_time:113010ms step_avg:153.13ms step:749/1480 train_time:113173ms step_avg:153.14ms step:750/1480 train_time:113333ms step_avg:153.15ms step:750/1480 val_loss:3.5504 train_time:113398ms step_avg:153.24ms step:751/1480 train_time:113499ms step_avg:153.17ms step:752/1480 train_time:113663ms step_avg:153.19ms step:753/1480 train_time:113826ms step_avg:153.20ms step:754/1480 train_time:113987ms step_avg:153.21ms step:755/1480 train_time:114147ms step_avg:153.22ms step:756/1480 train_time:114310ms step_avg:153.23ms step:757/1480 train_time:114474ms step_avg:153.24ms step:758/1480 train_time:114633ms step_avg:153.25ms step:759/1480 train_time:114796ms step_avg:153.27ms step:760/1480 train_time:114958ms step_avg:153.28ms step:761/1480 train_time:115123ms step_avg:153.29ms step:762/1480 train_time:115286ms step_avg:153.31ms step:763/1480 train_time:115448ms 
step_avg:153.32ms step:764/1480 train_time:115610ms step_avg:153.33ms step:765/1480 train_time:115771ms step_avg:153.34ms step:766/1480 train_time:115935ms step_avg:153.35ms step:767/1480 train_time:116096ms step_avg:153.36ms step:768/1480 train_time:116257ms step_avg:153.37ms step:769/1480 train_time:116422ms step_avg:153.39ms step:770/1480 train_time:116587ms step_avg:153.40ms step:771/1480 train_time:116749ms step_avg:153.42ms step:772/1480 train_time:116911ms step_avg:153.43ms step:773/1480 train_time:117073ms step_avg:153.44ms step:774/1480 train_time:117235ms step_avg:153.45ms step:775/1480 train_time:117396ms step_avg:153.46ms step:776/1480 train_time:117564ms step_avg:153.48ms step:777/1480 train_time:117730ms step_avg:153.49ms step:778/1480 train_time:117893ms step_avg:153.51ms step:779/1480 train_time:118054ms step_avg:153.52ms step:780/1480 train_time:118216ms step_avg:153.53ms step:781/1480 train_time:118380ms step_avg:153.54ms step:782/1480 train_time:118545ms step_avg:153.56ms step:783/1480 train_time:118708ms step_avg:153.57ms step:784/1480 train_time:118871ms step_avg:153.58ms step:785/1480 train_time:119032ms step_avg:153.59ms step:786/1480 train_time:119196ms step_avg:153.60ms step:787/1480 train_time:119360ms step_avg:153.62ms step:788/1480 train_time:119526ms step_avg:153.63ms step:789/1480 train_time:119688ms step_avg:153.64ms step:790/1480 train_time:119852ms step_avg:153.66ms step:791/1480 train_time:120021ms step_avg:153.68ms step:792/1480 train_time:120186ms step_avg:153.69ms step:793/1480 train_time:120348ms step_avg:153.70ms step:794/1480 train_time:120513ms step_avg:153.72ms step:795/1480 train_time:120676ms step_avg:153.73ms step:796/1480 train_time:120843ms step_avg:153.74ms step:797/1480 train_time:121008ms step_avg:153.76ms step:798/1480 train_time:121171ms step_avg:153.77ms step:799/1480 train_time:121338ms step_avg:153.79ms step:800/1480 train_time:121501ms step_avg:153.80ms step:801/1480 train_time:121666ms step_avg:153.81ms 
step:802/1480 train_time:121834ms step_avg:153.83ms step:803/1480 train_time:121995ms step_avg:153.84ms step:804/1480 train_time:122157ms step_avg:153.85ms step:805/1480 train_time:122323ms step_avg:153.87ms step:806/1480 train_time:122486ms step_avg:153.88ms step:807/1480 train_time:122647ms step_avg:153.89ms step:808/1480 train_time:122810ms step_avg:153.90ms step:809/1480 train_time:122972ms step_avg:153.91ms step:810/1480 train_time:123132ms step_avg:153.92ms step:811/1480 train_time:123294ms step_avg:153.93ms step:812/1480 train_time:123458ms step_avg:153.94ms step:813/1480 train_time:123619ms step_avg:153.95ms step:814/1480 train_time:123783ms step_avg:153.96ms step:815/1480 train_time:123946ms step_avg:153.97ms step:816/1480 train_time:124110ms step_avg:153.98ms step:817/1480 train_time:124271ms step_avg:153.99ms step:818/1480 train_time:124433ms step_avg:154.00ms step:819/1480 train_time:124597ms step_avg:154.01ms step:820/1480 train_time:124762ms step_avg:154.03ms step:821/1480 train_time:124925ms step_avg:154.04ms step:822/1480 train_time:125088ms step_avg:154.05ms step:823/1480 train_time:125251ms step_avg:154.06ms step:824/1480 train_time:125413ms step_avg:154.07ms step:825/1480 train_time:125577ms step_avg:154.08ms step:826/1480 train_time:125746ms step_avg:154.10ms step:827/1480 train_time:125911ms step_avg:154.11ms step:828/1480 train_time:126074ms step_avg:154.12ms step:829/1480 train_time:126237ms step_avg:154.14ms step:830/1480 train_time:126401ms step_avg:154.15ms step:831/1480 train_time:126566ms step_avg:154.16ms step:832/1480 train_time:126729ms step_avg:154.17ms step:833/1480 train_time:126893ms step_avg:154.18ms step:834/1480 train_time:127056ms step_avg:154.19ms step:835/1480 train_time:127220ms step_avg:154.21ms step:836/1480 train_time:127386ms step_avg:154.22ms step:837/1480 train_time:127548ms step_avg:154.23ms step:838/1480 train_time:127712ms step_avg:154.24ms step:839/1480 train_time:127875ms step_avg:154.25ms step:840/1480 
train_time:128035ms step_avg:154.26ms step:841/1480 train_time:128196ms step_avg:154.27ms step:842/1480 train_time:128362ms step_avg:154.28ms step:843/1480 train_time:128525ms step_avg:154.29ms step:844/1480 train_time:128687ms step_avg:154.30ms step:845/1480 train_time:128849ms step_avg:154.31ms step:846/1480 train_time:129015ms step_avg:154.32ms step:847/1480 train_time:129180ms step_avg:154.34ms step:848/1480 train_time:129343ms step_avg:154.35ms step:849/1480 train_time:129507ms step_avg:154.36ms step:850/1480 train_time:129671ms step_avg:154.37ms step:851/1480 train_time:129834ms step_avg:154.38ms step:852/1480 train_time:129994ms step_avg:154.39ms step:853/1480 train_time:130156ms step_avg:154.40ms step:854/1480 train_time:130322ms step_avg:154.41ms step:855/1480 train_time:130488ms step_avg:154.42ms step:856/1480 train_time:130649ms step_avg:154.43ms step:857/1480 train_time:130813ms step_avg:154.44ms step:858/1480 train_time:130980ms step_avg:154.46ms step:859/1480 train_time:131146ms step_avg:154.47ms step:860/1480 train_time:131309ms step_avg:154.48ms step:861/1480 train_time:131474ms step_avg:154.49ms step:862/1480 train_time:131644ms step_avg:154.51ms step:863/1480 train_time:131812ms step_avg:154.53ms step:864/1480 train_time:131976ms step_avg:154.54ms step:865/1480 train_time:132137ms step_avg:154.55ms step:866/1480 train_time:132306ms step_avg:154.56ms step:867/1480 train_time:132469ms step_avg:154.57ms step:868/1480 train_time:132630ms step_avg:154.58ms step:869/1480 train_time:132792ms step_avg:154.59ms step:870/1480 train_time:132957ms step_avg:154.60ms step:871/1480 train_time:133119ms step_avg:154.61ms step:872/1480 train_time:133284ms step_avg:154.62ms step:873/1480 train_time:133447ms step_avg:154.63ms step:874/1480 train_time:133613ms step_avg:154.64ms step:875/1480 train_time:133776ms step_avg:154.65ms step:875/1480 val_loss:3.5056 train_time:133840ms step_avg:154.73ms step:876/1480 train_time:133941ms step_avg:154.67ms step:877/1480 
train_time:134104ms step_avg:154.68ms step:878/1480 train_time:134267ms step_avg:154.69ms step:879/1480 train_time:134433ms step_avg:154.70ms step:880/1480 train_time:134597ms step_avg:154.71ms step:881/1480 train_time:134760ms step_avg:154.72ms step:882/1480 train_time:134925ms step_avg:154.73ms step:883/1480 train_time:135091ms step_avg:154.74ms step:884/1480 train_time:135259ms step_avg:154.76ms step:885/1480 train_time:135424ms step_avg:154.77ms step:886/1480 train_time:135591ms step_avg:154.78ms step:887/1480 train_time:135762ms step_avg:154.80ms step:888/1480 train_time:135935ms step_avg:154.82ms step:889/1480 train_time:136102ms step_avg:154.84ms step:890/1480 train_time:136264ms step_avg:154.85ms step:891/1480 train_time:136429ms step_avg:154.86ms step:892/1480 train_time:136594ms step_avg:154.87ms step:893/1480 train_time:136758ms step_avg:154.88ms step:894/1480 train_time:136926ms step_avg:154.89ms step:895/1480 train_time:137094ms step_avg:154.91ms step:896/1480 train_time:137261ms step_avg:154.92ms step:897/1480 train_time:137426ms step_avg:154.93ms step:898/1480 train_time:137592ms step_avg:154.95ms step:899/1480 train_time:137758ms step_avg:154.96ms step:900/1480 train_time:137922ms step_avg:154.97ms step:901/1480 train_time:138086ms step_avg:154.98ms step:902/1480 train_time:138250ms step_avg:154.99ms step:903/1480 train_time:138424ms step_avg:155.01ms step:904/1480 train_time:138587ms step_avg:155.02ms step:905/1480 train_time:138750ms step_avg:155.03ms step:906/1480 train_time:138919ms step_avg:155.04ms step:907/1480 train_time:139086ms step_avg:155.06ms step:908/1480 train_time:139247ms step_avg:155.06ms step:909/1480 train_time:139413ms step_avg:155.08ms step:910/1480 train_time:139583ms step_avg:155.09ms step:911/1480 train_time:139748ms step_avg:155.10ms step:912/1480 train_time:139915ms step_avg:155.12ms step:913/1480 train_time:140084ms step_avg:155.13ms step:914/1480 train_time:140251ms step_avg:155.14ms step:915/1480 train_time:140422ms 
step_avg:155.16ms step:916/1480 train_time:140586ms step_avg:155.17ms step:917/1480 train_time:140752ms step_avg:155.18ms step:918/1480 train_time:140922ms step_avg:155.20ms step:919/1480 train_time:141091ms step_avg:155.22ms step:920/1480 train_time:141258ms step_avg:155.23ms step:921/1480 train_time:141426ms step_avg:155.24ms step:922/1480 train_time:141589ms step_avg:155.25ms step:923/1480 train_time:141751ms step_avg:155.26ms step:924/1480 train_time:141917ms step_avg:155.27ms step:925/1480 train_time:142083ms step_avg:155.28ms step:926/1480 train_time:142245ms step_avg:155.29ms step:927/1480 train_time:142408ms step_avg:155.30ms step:928/1480 train_time:142576ms step_avg:155.31ms step:929/1480 train_time:142742ms step_avg:155.32ms step:930/1480 train_time:142907ms step_avg:155.33ms step:931/1480 train_time:143069ms step_avg:155.34ms step:932/1480 train_time:143237ms step_avg:155.35ms step:933/1480 train_time:143404ms step_avg:155.37ms step:934/1480 train_time:143570ms step_avg:155.38ms step:935/1480 train_time:143742ms step_avg:155.40ms step:936/1480 train_time:143907ms step_avg:155.41ms step:937/1480 train_time:144078ms step_avg:155.42ms step:938/1480 train_time:144241ms step_avg:155.43ms step:939/1480 train_time:144410ms step_avg:155.45ms step:940/1480 train_time:144577ms step_avg:155.46ms step:941/1480 train_time:144741ms step_avg:155.47ms step:942/1480 train_time:144905ms step_avg:155.48ms step:943/1480 train_time:145075ms step_avg:155.49ms step:944/1480 train_time:145248ms step_avg:155.51ms step:945/1480 train_time:145411ms step_avg:155.52ms step:946/1480 train_time:145582ms step_avg:155.54ms step:947/1480 train_time:145749ms step_avg:155.55ms step:948/1480 train_time:145916ms step_avg:155.56ms step:949/1480 train_time:146083ms step_avg:155.57ms step:950/1480 train_time:146247ms step_avg:155.58ms step:951/1480 train_time:146415ms step_avg:155.60ms step:952/1480 train_time:146581ms step_avg:155.61ms step:953/1480 train_time:146747ms step_avg:155.62ms 
step:954/1480 train_time:146918ms step_avg:155.63ms step:955/1480 train_time:147081ms step_avg:155.64ms step:956/1480 train_time:147246ms step_avg:155.65ms step:957/1480 train_time:147415ms step_avg:155.67ms step:958/1480 train_time:147586ms step_avg:155.68ms step:959/1480 train_time:147749ms step_avg:155.69ms step:960/1480 train_time:147918ms step_avg:155.70ms step:961/1480 train_time:148083ms step_avg:155.71ms step:962/1480 train_time:148246ms step_avg:155.72ms step:963/1480 train_time:148412ms step_avg:155.73ms step:964/1480 train_time:148580ms step_avg:155.74ms step:965/1480 train_time:148744ms step_avg:155.75ms step:966/1480 train_time:148908ms step_avg:155.76ms step:967/1480 train_time:149071ms step_avg:155.77ms step:968/1480 train_time:149236ms step_avg:155.78ms step:969/1480 train_time:149402ms step_avg:155.79ms step:970/1480 train_time:149565ms step_avg:155.80ms step:971/1480 train_time:149730ms step_avg:155.81ms step:972/1480 train_time:149896ms step_avg:155.82ms step:973/1480 train_time:150060ms step_avg:155.83ms step:974/1480 train_time:150228ms step_avg:155.84ms step:975/1480 train_time:150393ms step_avg:155.85ms step:976/1480 train_time:150559ms step_avg:155.86ms step:977/1480 train_time:150723ms step_avg:155.87ms step:978/1480 train_time:150888ms step_avg:155.88ms step:979/1480 train_time:151053ms step_avg:155.89ms step:980/1480 train_time:151220ms step_avg:155.90ms step:981/1480 train_time:151386ms step_avg:155.91ms step:982/1480 train_time:151550ms step_avg:155.92ms step:983/1480 train_time:151719ms step_avg:155.93ms step:984/1480 train_time:151883ms step_avg:155.94ms step:985/1480 train_time:152051ms step_avg:155.95ms step:986/1480 train_time:152217ms step_avg:155.96ms step:987/1480 train_time:152382ms step_avg:155.97ms step:988/1480 train_time:152548ms step_avg:155.98ms step:989/1480 train_time:152714ms step_avg:155.99ms step:990/1480 train_time:152884ms step_avg:156.00ms step:991/1480 train_time:153051ms step_avg:156.02ms step:992/1480 
train_time:153226ms step_avg:156.03ms step:993/1480 train_time:153403ms step_avg:156.06ms step:994/1480 train_time:153568ms step_avg:156.07ms step:995/1480 train_time:153733ms step_avg:156.07ms step:996/1480 train_time:153896ms step_avg:156.08ms step:997/1480 train_time:154061ms step_avg:156.09ms step:998/1480 train_time:154224ms step_avg:156.10ms step:999/1480 train_time:154389ms step_avg:156.11ms step:1000/1480 train_time:154561ms step_avg:156.12ms step:1000/1480 val_loss:3.4422 train_time:154627ms step_avg:156.19ms step:1001/1480 train_time:154730ms step_avg:156.14ms step:1002/1480 train_time:154898ms step_avg:156.15ms step:1003/1480 train_time:155067ms step_avg:156.16ms step:1004/1480 train_time:155237ms step_avg:156.17ms step:1005/1480 train_time:155405ms step_avg:156.19ms step:1006/1480 train_time:155575ms step_avg:156.20ms step:1007/1480 train_time:155739ms step_avg:156.21ms step:1008/1480 train_time:155906ms step_avg:156.22ms step:1009/1480 train_time:156080ms step_avg:156.24ms step:1010/1480 train_time:156245ms step_avg:156.24ms step:1011/1480 train_time:156411ms step_avg:156.26ms step:1012/1480 train_time:156578ms step_avg:156.27ms step:1013/1480 train_time:156746ms step_avg:156.28ms step:1014/1480 train_time:156915ms step_avg:156.29ms step:1015/1480 train_time:157085ms step_avg:156.30ms step:1016/1480 train_time:157254ms step_avg:156.32ms step:1017/1480 train_time:157426ms step_avg:156.33ms step:1018/1480 train_time:157595ms step_avg:156.34ms step:1019/1480 train_time:157762ms step_avg:156.36ms step:1020/1480 train_time:157931ms step_avg:156.37ms step:1021/1480 train_time:158097ms step_avg:156.38ms step:1022/1480 train_time:158264ms step_avg:156.39ms step:1023/1480 train_time:158431ms step_avg:156.40ms step:1024/1480 train_time:158599ms step_avg:156.41ms step:1025/1480 train_time:158769ms step_avg:156.42ms step:1026/1480 train_time:158936ms step_avg:156.43ms step:1027/1480 train_time:159102ms step_avg:156.44ms step:1028/1480 train_time:159276ms 
step_avg:156.46ms step:1029/1480 train_time:159452ms step_avg:156.48ms step:1030/1480 train_time:159620ms step_avg:156.49ms step:1031/1480 train_time:159784ms step_avg:156.50ms step:1032/1480 train_time:159958ms step_avg:156.51ms step:1033/1480 train_time:160124ms step_avg:156.52ms step:1034/1480 train_time:160293ms step_avg:156.54ms step:1035/1480 train_time:160461ms step_avg:156.55ms step:1036/1480 train_time:160627ms step_avg:156.56ms step:1037/1480 train_time:160796ms step_avg:156.57ms step:1038/1480 train_time:160965ms step_avg:156.58ms step:1039/1480 train_time:161137ms step_avg:156.60ms step:1040/1480 train_time:161302ms step_avg:156.60ms step:1041/1480 train_time:161470ms step_avg:156.61ms step:1042/1480 train_time:161635ms step_avg:156.62ms step:1043/1480 train_time:161800ms step_avg:156.63ms step:1044/1480 train_time:161964ms step_avg:156.64ms step:1045/1480 train_time:162134ms step_avg:156.65ms step:1046/1480 train_time:162304ms step_avg:156.66ms step:1047/1480 train_time:162471ms step_avg:156.67ms step:1048/1480 train_time:162637ms step_avg:156.68ms step:1049/1480 train_time:162803ms step_avg:156.69ms step:1050/1480 train_time:162972ms step_avg:156.70ms step:1051/1480 train_time:163140ms step_avg:156.72ms step:1052/1480 train_time:163309ms step_avg:156.73ms step:1053/1480 train_time:163477ms step_avg:156.74ms step:1054/1480 train_time:163643ms step_avg:156.75ms step:1055/1480 train_time:163808ms step_avg:156.75ms step:1056/1480 train_time:163974ms step_avg:156.76ms step:1057/1480 train_time:164140ms step_avg:156.77ms step:1058/1480 train_time:164309ms step_avg:156.78ms step:1059/1480 train_time:164483ms step_avg:156.80ms step:1060/1480 train_time:164654ms step_avg:156.81ms step:1061/1480 train_time:164822ms step_avg:156.82ms step:1062/1480 train_time:164984ms step_avg:156.83ms step:1063/1480 train_time:165150ms step_avg:156.84ms step:1064/1480 train_time:165315ms step_avg:156.85ms step:1065/1480 train_time:165482ms step_avg:156.86ms step:1066/1480 
train_time:165652ms step_avg:156.87ms step:1067/1480 train_time:165822ms step_avg:156.88ms step:1068/1480 train_time:165988ms step_avg:156.89ms step:1069/1480 train_time:166159ms step_avg:156.90ms step:1070/1480 train_time:166324ms step_avg:156.91ms step:1071/1480 train_time:166497ms step_avg:156.92ms step:1072/1480 train_time:166662ms step_avg:156.93ms step:1073/1480 train_time:166824ms step_avg:156.94ms step:1074/1480 train_time:166991ms step_avg:156.95ms step:1075/1480 train_time:167163ms step_avg:156.96ms step:1076/1480 train_time:167330ms step_avg:156.97ms step:1077/1480 train_time:167498ms step_avg:156.98ms step:1078/1480 train_time:167677ms step_avg:157.00ms step:1079/1480 train_time:167847ms step_avg:157.01ms step:1080/1480 train_time:168018ms step_avg:157.03ms step:1081/1480 train_time:168184ms step_avg:157.03ms step:1082/1480 train_time:168351ms step_avg:157.04ms step:1083/1480 train_time:168518ms step_avg:157.05ms step:1084/1480 train_time:168684ms step_avg:157.06ms step:1085/1480 train_time:168850ms step_avg:157.07ms step:1086/1480 train_time:169018ms step_avg:157.08ms step:1087/1480 train_time:169184ms step_avg:157.09ms step:1088/1480 train_time:169354ms step_avg:157.10ms step:1089/1480 train_time:169525ms step_avg:157.11ms step:1090/1480 train_time:169699ms step_avg:157.13ms step:1091/1480 train_time:169867ms step_avg:157.14ms step:1092/1480 train_time:170035ms step_avg:157.15ms step:1093/1480 train_time:170202ms step_avg:157.16ms step:1094/1480 train_time:170369ms step_avg:157.17ms step:1095/1480 train_time:170534ms step_avg:157.17ms step:1096/1480 train_time:170703ms step_avg:157.18ms step:1097/1480 train_time:170873ms step_avg:157.20ms step:1098/1480 train_time:171043ms step_avg:157.21ms step:1099/1480 train_time:171214ms step_avg:157.22ms step:1100/1480 train_time:171387ms step_avg:157.24ms step:1101/1480 train_time:171558ms step_avg:157.25ms step:1102/1480 train_time:171729ms step_avg:157.26ms step:1103/1480 train_time:171905ms step_avg:157.28ms 
step:1104/1480 train_time:172073ms step_avg:157.29ms step:1105/1480 train_time:172242ms step_avg:157.30ms step:1106/1480 train_time:172410ms step_avg:157.31ms step:1107/1480 train_time:172581ms step_avg:157.32ms step:1108/1480 train_time:172746ms step_avg:157.33ms step:1109/1480 train_time:172914ms step_avg:157.34ms step:1110/1480 train_time:173080ms step_avg:157.35ms step:1111/1480 train_time:173247ms step_avg:157.35ms step:1112/1480 train_time:173418ms step_avg:157.37ms step:1113/1480 train_time:173599ms step_avg:157.39ms step:1114/1480 train_time:173771ms step_avg:157.40ms step:1115/1480 train_time:173943ms step_avg:157.41ms step:1116/1480 train_time:174110ms step_avg:157.42ms step:1117/1480 train_time:174282ms step_avg:157.44ms step:1118/1480 train_time:174457ms step_avg:157.45ms step:1119/1480 train_time:174622ms step_avg:157.46ms step:1120/1480 train_time:174791ms step_avg:157.47ms step:1121/1480 train_time:174961ms step_avg:157.48ms step:1122/1480 train_time:175127ms step_avg:157.49ms step:1123/1480 train_time:175295ms step_avg:157.50ms step:1124/1480 train_time:175462ms step_avg:157.51ms step:1125/1480 train_time:175628ms step_avg:157.51ms step:1125/1480 val_loss:3.3861 train_time:175696ms step_avg:157.58ms step:1126/1480 train_time:175798ms step_avg:157.52ms step:1127/1480 train_time:175971ms step_avg:157.54ms step:1128/1480 train_time:176140ms step_avg:157.55ms step:1129/1480 train_time:176314ms step_avg:157.56ms step:1130/1480 train_time:176484ms step_avg:157.57ms step:1131/1480 train_time:176661ms step_avg:157.59ms step:1132/1480 train_time:176827ms step_avg:157.60ms step:1133/1480 train_time:176999ms step_avg:157.61ms step:1134/1480 train_time:177170ms step_avg:157.62ms step:1135/1480 train_time:177338ms step_avg:157.63ms step:1136/1480 train_time:177509ms step_avg:157.65ms step:1137/1480 train_time:177678ms step_avg:157.66ms step:1138/1480 train_time:177849ms step_avg:157.67ms step:1139/1480 train_time:178017ms step_avg:157.68ms step:1140/1480 
train_time:178187ms step_avg:157.69ms step:1141/1480 train_time:178358ms step_avg:157.70ms step:1142/1480 train_time:178527ms step_avg:157.71ms step:1143/1480 train_time:178696ms step_avg:157.72ms step:1144/1480 train_time:178864ms step_avg:157.73ms step:1145/1480 train_time:179031ms step_avg:157.74ms step:1146/1480 train_time:179202ms step_avg:157.75ms step:1147/1480 train_time:179371ms step_avg:157.76ms step:1148/1480 train_time:179539ms step_avg:157.77ms step:1149/1480 train_time:179712ms step_avg:157.78ms step:1150/1480 train_time:179880ms step_avg:157.79ms step:1151/1480 train_time:180054ms step_avg:157.80ms step:1152/1480 train_time:180225ms step_avg:157.82ms step:1153/1480 train_time:180398ms step_avg:157.83ms step:1154/1480 train_time:180566ms step_avg:157.84ms step:1155/1480 train_time:180737ms step_avg:157.85ms step:1156/1480 train_time:180915ms step_avg:157.87ms step:1157/1480 train_time:181086ms step_avg:157.88ms step:1158/1480 train_time:181253ms step_avg:157.89ms step:1159/1480 train_time:181420ms step_avg:157.89ms step:1160/1480 train_time:181585ms step_avg:157.90ms step:1161/1480 train_time:181756ms step_avg:157.91ms step:1162/1480 train_time:181927ms step_avg:157.92ms step:1163/1480 train_time:182096ms step_avg:157.93ms step:1164/1480 train_time:182264ms step_avg:157.94ms step:1165/1480 train_time:182430ms step_avg:157.95ms step:1166/1480 train_time:182597ms step_avg:157.96ms step:1167/1480 train_time:182764ms step_avg:157.96ms step:1168/1480 train_time:182933ms step_avg:157.97ms step:1169/1480 train_time:183101ms step_avg:157.98ms step:1170/1480 train_time:183271ms step_avg:157.99ms step:1171/1480 train_time:183437ms step_avg:158.00ms step:1172/1480 train_time:183603ms step_avg:158.01ms step:1173/1480 train_time:183774ms step_avg:158.02ms step:1174/1480 train_time:183955ms step_avg:158.04ms step:1175/1480 train_time:184126ms step_avg:158.05ms step:1176/1480 train_time:184297ms step_avg:158.06ms step:1177/1480 train_time:184473ms step_avg:158.07ms 
step:1178/1480 train_time:184640ms step_avg:158.08ms step:1179/1480 train_time:184807ms step_avg:158.09ms step:1180/1480 train_time:184986ms step_avg:158.11ms step:1181/1480 train_time:185156ms step_avg:158.12ms step:1182/1480 train_time:185324ms step_avg:158.13ms step:1183/1480 train_time:185494ms step_avg:158.14ms step:1184/1480 train_time:185662ms step_avg:158.14ms step:1185/1480 train_time:185835ms step_avg:158.16ms step:1186/1480 train_time:186006ms step_avg:158.17ms step:1187/1480 train_time:186189ms step_avg:158.19ms step:1188/1480 train_time:186355ms step_avg:158.20ms step:1189/1480 train_time:186526ms step_avg:158.21ms step:1190/1480 train_time:186694ms step_avg:158.21ms step:1191/1480 train_time:186865ms step_avg:158.23ms step:1192/1480 train_time:187032ms step_avg:158.23ms step:1193/1480 train_time:187197ms step_avg:158.24ms step:1194/1480 train_time:187367ms step_avg:158.25ms step:1195/1480 train_time:187543ms step_avg:158.26ms step:1196/1480 train_time:187726ms step_avg:158.29ms step:1197/1480 train_time:187897ms step_avg:158.30ms step:1198/1480 train_time:188077ms step_avg:158.31ms step:1199/1480 train_time:188247ms step_avg:158.32ms step:1200/1480 train_time:188416ms step_avg:158.33ms step:1201/1480 train_time:188584ms step_avg:158.34ms step:1202/1480 train_time:188767ms step_avg:158.36ms step:1203/1480 train_time:188943ms step_avg:158.38ms step:1204/1480 train_time:189119ms step_avg:158.39ms step:1205/1480 train_time:189287ms step_avg:158.40ms step:1206/1480 train_time:189454ms step_avg:158.41ms step:1207/1480 train_time:189624ms step_avg:158.42ms step:1208/1480 train_time:189792ms step_avg:158.42ms step:1209/1480 train_time:189965ms step_avg:158.44ms step:1210/1480 train_time:190139ms step_avg:158.45ms step:1211/1480 train_time:190313ms step_avg:158.46ms step:1212/1480 train_time:190483ms step_avg:158.47ms step:1213/1480 train_time:190657ms step_avg:158.48ms step:1214/1480 train_time:190834ms step_avg:158.50ms step:1215/1480 train_time:191006ms 
step_avg:158.51ms step:1216/1480 train_time:191175ms step_avg:158.52ms step:1217/1480 train_time:191349ms step_avg:158.53ms step:1218/1480 train_time:191519ms step_avg:158.54ms step:1219/1480 train_time:191699ms step_avg:158.56ms step:1220/1480 train_time:191869ms step_avg:158.57ms step:1221/1480 train_time:192038ms step_avg:158.58ms step:1222/1480 train_time:192207ms step_avg:158.59ms step:1223/1480 train_time:192375ms step_avg:158.59ms step:1224/1480 train_time:192553ms step_avg:158.61ms step:1225/1480 train_time:192724ms step_avg:158.62ms step:1226/1480 train_time:192897ms step_avg:158.63ms step:1227/1480 train_time:193070ms step_avg:158.64ms step:1228/1480 train_time:193239ms step_avg:158.65ms step:1229/1480 train_time:193414ms step_avg:158.67ms step:1230/1480 train_time:193594ms step_avg:158.68ms step:1231/1480 train_time:193770ms step_avg:158.70ms step:1232/1480 train_time:193943ms step_avg:158.71ms step:1233/1480 train_time:194114ms step_avg:158.72ms step:1234/1480 train_time:194284ms step_avg:158.73ms step:1235/1480 train_time:194458ms step_avg:158.74ms step:1236/1480 train_time:194626ms step_avg:158.75ms step:1237/1480 train_time:194796ms step_avg:158.76ms step:1238/1480 train_time:194980ms step_avg:158.78ms step:1239/1480 train_time:195152ms step_avg:158.79ms step:1240/1480 train_time:195324ms step_avg:158.80ms step:1241/1480 train_time:195497ms step_avg:158.81ms step:1242/1480 train_time:195667ms step_avg:158.82ms step:1243/1480 train_time:195839ms step_avg:158.83ms step:1244/1480 train_time:196007ms step_avg:158.84ms step:1245/1480 train_time:196174ms step_avg:158.85ms step:1246/1480 train_time:196344ms step_avg:158.85ms step:1247/1480 train_time:196513ms step_avg:158.86ms step:1248/1480 train_time:196681ms step_avg:158.87ms step:1249/1480 train_time:196851ms step_avg:158.88ms step:1250/1480 train_time:197019ms step_avg:158.89ms step:1250/1480 val_loss:3.3368 train_time:197091ms step_avg:158.94ms step:1251/1480 train_time:197199ms step_avg:158.90ms 
step:1252/1480 train_time:197368ms step_avg:158.91ms step:1253/1480 train_time:197535ms step_avg:158.92ms step:1254/1480 train_time:197707ms step_avg:158.93ms step:1255/1480 train_time:197893ms step_avg:158.95ms step:1256/1480 train_time:198068ms step_avg:158.96ms step:1257/1480 train_time:198238ms step_avg:158.97ms step:1258/1480 train_time:198413ms step_avg:158.98ms step:1259/1480 train_time:198585ms step_avg:159.00ms step:1260/1480 train_time:198752ms step_avg:159.00ms step:1261/1480 train_time:198923ms step_avg:159.01ms step:1262/1480 train_time:199097ms step_avg:159.02ms step:1263/1480 train_time:199272ms step_avg:159.04ms step:1264/1480 train_time:199436ms step_avg:159.04ms step:1265/1480 train_time:199604ms step_avg:159.05ms step:1266/1480 train_time:199776ms step_avg:159.06ms step:1267/1480 train_time:199948ms step_avg:159.07ms step:1268/1480 train_time:200118ms step_avg:159.08ms step:1269/1480 train_time:200294ms step_avg:159.09ms step:1270/1480 train_time:200464ms step_avg:159.10ms step:1271/1480 train_time:200634ms step_avg:159.11ms step:1272/1480 train_time:200801ms step_avg:159.11ms step:1273/1480 train_time:200972ms step_avg:159.12ms step:1274/1480 train_time:201145ms step_avg:159.13ms step:1275/1480 train_time:201312ms step_avg:159.14ms step:1276/1480 train_time:201477ms step_avg:159.14ms step:1277/1480 train_time:201651ms step_avg:159.16ms step:1278/1480 train_time:201818ms step_avg:159.16ms step:1279/1480 train_time:201990ms step_avg:159.17ms step:1280/1480 train_time:202169ms step_avg:159.19ms step:1281/1480 train_time:202338ms step_avg:159.20ms step:1282/1480 train_time:202505ms step_avg:159.20ms step:1283/1480 train_time:202674ms step_avg:159.21ms step:1284/1480 train_time:202846ms step_avg:159.22ms step:1285/1480 train_time:203015ms step_avg:159.23ms step:1286/1480 train_time:203187ms step_avg:159.24ms step:1287/1480 train_time:203358ms step_avg:159.25ms step:1288/1480 train_time:203530ms step_avg:159.26ms step:1289/1480 train_time:203710ms 
step_avg:159.27ms step:1290/1480 train_time:203889ms step_avg:159.29ms step:1291/1480 train_time:204063ms step_avg:159.30ms step:1292/1480 train_time:204237ms step_avg:159.31ms step:1293/1480 train_time:204413ms step_avg:159.32ms step:1294/1480 train_time:204583ms step_avg:159.33ms step:1295/1480 train_time:204754ms step_avg:159.34ms step:1296/1480 train_time:204928ms step_avg:159.35ms step:1297/1480 train_time:205099ms step_avg:159.36ms step:1298/1480 train_time:205271ms step_avg:159.37ms step:1299/1480 train_time:205441ms step_avg:159.38ms step:1300/1480 train_time:205610ms step_avg:159.39ms step:1301/1480 train_time:205777ms step_avg:159.39ms step:1302/1480 train_time:205954ms step_avg:159.41ms step:1303/1480 train_time:206130ms step_avg:159.42ms step:1304/1480 train_time:206306ms step_avg:159.43ms step:1305/1480 train_time:206475ms step_avg:159.44ms step:1306/1480 train_time:206651ms step_avg:159.45ms step:1307/1480 train_time:206819ms step_avg:159.46ms step:1308/1480 train_time:206988ms step_avg:159.47ms step:1309/1480 train_time:207160ms step_avg:159.48ms step:1310/1480 train_time:207329ms step_avg:159.48ms step:1311/1480 train_time:207497ms step_avg:159.49ms step:1312/1480 train_time:207672ms step_avg:159.50ms step:1313/1480 train_time:207840ms step_avg:159.51ms step:1314/1480 train_time:208013ms step_avg:159.52ms step:1315/1480 train_time:208184ms step_avg:159.53ms step:1316/1480 train_time:208352ms step_avg:159.53ms step:1317/1480 train_time:208524ms step_avg:159.54ms step:1318/1480 train_time:208705ms step_avg:159.56ms step:1319/1480 train_time:208882ms step_avg:159.57ms step:1320/1480 train_time:209059ms step_avg:159.59ms step:1321/1480 train_time:209232ms step_avg:159.60ms step:1322/1480 train_time:209412ms step_avg:159.61ms step:1323/1480 train_time:209587ms step_avg:159.62ms step:1324/1480 train_time:209761ms step_avg:159.64ms step:1325/1480 train_time:209942ms step_avg:159.65ms step:1326/1480 train_time:210116ms step_avg:159.66ms step:1327/1480 
train_time:210286ms step_avg:159.67ms step:1328/1480 train_time:210456ms step_avg:159.68ms step:1329/1480 train_time:210653ms step_avg:159.71ms step:1330/1480 train_time:210832ms step_avg:159.72ms step:1331/1480 train_time:211003ms step_avg:159.73ms step:1332/1480 train_time:211178ms step_avg:159.74ms step:1333/1480 train_time:211354ms step_avg:159.75ms step:1334/1480 train_time:211528ms step_avg:159.76ms step:1335/1480 train_time:211695ms step_avg:159.77ms step:1336/1480 train_time:211881ms step_avg:159.79ms step:1337/1480 train_time:212056ms step_avg:159.80ms step:1338/1480 train_time:212228ms step_avg:159.81ms step:1339/1480 train_time:212402ms step_avg:159.82ms step:1340/1480 train_time:212575ms step_avg:159.83ms step:1341/1480 train_time:212745ms step_avg:159.84ms step:1342/1480 train_time:212916ms step_avg:159.85ms step:1343/1480 train_time:213087ms step_avg:159.85ms step:1344/1480 train_time:213257ms step_avg:159.86ms step:1345/1480 train_time:213438ms step_avg:159.88ms step:1346/1480 train_time:213607ms step_avg:159.89ms step:1347/1480 train_time:213776ms step_avg:159.89ms step:1348/1480 train_time:213947ms step_avg:159.90ms step:1349/1480 train_time:214117ms step_avg:159.91ms step:1350/1480 train_time:214292ms step_avg:159.92ms step:1351/1480 train_time:214465ms step_avg:159.93ms step:1352/1480 train_time:214636ms step_avg:159.94ms step:1353/1480 train_time:214813ms step_avg:159.95ms step:1354/1480 train_time:214984ms step_avg:159.96ms step:1355/1480 train_time:215151ms step_avg:159.96ms step:1356/1480 train_time:215324ms step_avg:159.97ms step:1357/1480 train_time:215497ms step_avg:159.98ms step:1358/1480 train_time:215670ms step_avg:159.99ms step:1359/1480 train_time:215842ms step_avg:160.00ms step:1360/1480 train_time:216015ms step_avg:160.01ms step:1361/1480 train_time:216192ms step_avg:160.02ms step:1362/1480 train_time:216368ms step_avg:160.04ms step:1363/1480 train_time:216548ms step_avg:160.05ms step:1364/1480 train_time:216716ms step_avg:160.06ms 
step:1365/1480 train_time:216885ms step_avg:160.06ms step:1366/1480 train_time:217056ms step_avg:160.07ms step:1367/1480 train_time:217228ms step_avg:160.08ms step:1368/1480 train_time:217401ms step_avg:160.09ms step:1369/1480 train_time:217583ms step_avg:160.11ms step:1370/1480 train_time:217761ms step_avg:160.12ms step:1371/1480 train_time:217933ms step_avg:160.13ms step:1372/1480 train_time:218111ms step_avg:160.14ms step:1373/1480 train_time:218281ms step_avg:160.15ms step:1374/1480 train_time:218459ms step_avg:160.16ms step:1375/1480 train_time:218630ms step_avg:160.17ms step:1375/1480 val_loss:3.2976 train_time:218698ms step_avg:160.22ms step:1376/1480 train_time:218805ms step_avg:160.18ms step:1377/1480 train_time:218978ms step_avg:160.19ms step:1378/1480 train_time:219147ms step_avg:160.20ms step:1379/1480 train_time:219322ms step_avg:160.21ms step:1380/1480 train_time:219495ms step_avg:160.22ms step:1381/1480 train_time:219678ms step_avg:160.23ms step:1382/1480 train_time:219849ms step_avg:160.24ms step:1383/1480 train_time:220023ms step_avg:160.25ms step:1384/1480 train_time:220201ms step_avg:160.26ms step:1385/1480 train_time:220367ms step_avg:160.27ms step:1386/1480 train_time:220538ms step_avg:160.27ms step:1387/1480 train_time:220709ms step_avg:160.28ms step:1388/1480 train_time:220877ms step_avg:160.29ms step:1389/1480 train_time:221050ms step_avg:160.30ms step:1390/1480 train_time:221220ms step_avg:160.30ms step:1391/1480 train_time:221391ms step_avg:160.31ms step:1392/1480 train_time:221565ms step_avg:160.32ms step:1393/1480 train_time:221735ms step_avg:160.33ms step:1394/1480 train_time:221904ms step_avg:160.34ms step:1395/1480 train_time:222072ms step_avg:160.34ms step:1396/1480 train_time:222243ms step_avg:160.35ms step:1397/1480 train_time:222410ms step_avg:160.35ms step:1398/1480 train_time:222577ms step_avg:160.36ms step:1399/1480 train_time:222747ms step_avg:160.37ms step:1400/1480 train_time:222925ms step_avg:160.38ms step:1401/1480 
train_time:223091ms step_avg:160.38ms step:1402/1480 train_time:223263ms step_avg:160.39ms step:1403/1480 train_time:223439ms step_avg:160.40ms step:1404/1480 train_time:223611ms step_avg:160.41ms step:1405/1480 train_time:223786ms step_avg:160.42ms step:1406/1480 train_time:223961ms step_avg:160.43ms step:1407/1480 train_time:224128ms step_avg:160.44ms step:1408/1480 train_time:224296ms step_avg:160.44ms step:1409/1480 train_time:224479ms step_avg:160.46ms step:1410/1480 train_time:224649ms step_avg:160.46ms step:1411/1480 train_time:224818ms step_avg:160.47ms step:1412/1480 train_time:224989ms step_avg:160.48ms step:1413/1480 train_time:225160ms step_avg:160.48ms step:1414/1480 train_time:225331ms step_avg:160.49ms step:1415/1480 train_time:225507ms step_avg:160.50ms step:1416/1480 train_time:225693ms step_avg:160.52ms step:1417/1480 train_time:225867ms step_avg:160.53ms step:1418/1480 train_time:226041ms step_avg:160.54ms step:1419/1480 train_time:226214ms step_avg:160.55ms step:1420/1480 train_time:226389ms step_avg:160.56ms step:1421/1480 train_time:226565ms step_avg:160.57ms step:1422/1480 train_time:226737ms step_avg:160.58ms step:1423/1480 train_time:226906ms step_avg:160.58ms step:1424/1480 train_time:227084ms step_avg:160.60ms step:1425/1480 train_time:227267ms step_avg:160.61ms step:1426/1480 train_time:227440ms step_avg:160.62ms step:1427/1480 train_time:227613ms step_avg:160.63ms step:1428/1480 train_time:227784ms step_avg:160.64ms step:1429/1480 train_time:227950ms step_avg:160.64ms step:1430/1480 train_time:228125ms step_avg:160.65ms step:1431/1480 train_time:228300ms step_avg:160.66ms step:1432/1480 train_time:228476ms step_avg:160.67ms step:1433/1480 train_time:228656ms step_avg:160.69ms step:1434/1480 train_time:228838ms step_avg:160.70ms step:1435/1480 train_time:229012ms step_avg:160.71ms step:1436/1480 train_time:229188ms step_avg:160.72ms step:1437/1480 train_time:229358ms step_avg:160.73ms step:1438/1480 train_time:229526ms step_avg:160.73ms 
step:1439/1480 train_time:229701ms step_avg:160.74ms step:1440/1480 train_time:229871ms step_avg:160.75ms step:1441/1480 train_time:230043ms step_avg:160.76ms step:1442/1480 train_time:230218ms step_avg:160.77ms step:1443/1480 train_time:230409ms step_avg:160.79ms step:1444/1480 train_time:230579ms step_avg:160.79ms step:1445/1480 train_time:230751ms step_avg:160.80ms step:1446/1480 train_time:230927ms step_avg:160.81ms step:1447/1480 train_time:231104ms step_avg:160.82ms step:1448/1480 train_time:231275ms step_avg:160.83ms step:1449/1480 train_time:231449ms step_avg:160.84ms step:1450/1480 train_time:231623ms step_avg:160.85ms step:1451/1480 train_time:231793ms step_avg:160.86ms step:1452/1480 train_time:231969ms step_avg:160.87ms step:1453/1480 train_time:232139ms step_avg:160.87ms step:1454/1480 train_time:232310ms step_avg:160.88ms step:1455/1480 train_time:232490ms step_avg:160.89ms step:1456/1480 train_time:232662ms step_avg:160.90ms step:1457/1480 train_time:232832ms step_avg:160.91ms step:1458/1480 train_time:233004ms step_avg:160.91ms step:1459/1480 train_time:233178ms step_avg:160.92ms step:1460/1480 train_time:233349ms step_avg:160.93ms step:1461/1480 train_time:233524ms step_avg:160.94ms step:1462/1480 train_time:233694ms step_avg:160.95ms step:1463/1480 train_time:233872ms step_avg:160.96ms step:1464/1480 train_time:234048ms step_avg:160.97ms step:1465/1480 train_time:234221ms step_avg:160.98ms step:1466/1480 train_time:234392ms step_avg:160.98ms step:1467/1480 train_time:234567ms step_avg:160.99ms step:1468/1480 train_time:234736ms step_avg:161.00ms step:1469/1480 train_time:234909ms step_avg:161.01ms step:1470/1480 train_time:235091ms step_avg:161.02ms step:1471/1480 train_time:235277ms step_avg:161.04ms step:1472/1480 train_time:235457ms step_avg:161.05ms step:1473/1480 train_time:235629ms step_avg:161.06ms step:1474/1480 train_time:235806ms step_avg:161.07ms step:1475/1480 train_time:235986ms step_avg:161.08ms step:1476/1480 train_time:236158ms 
step_avg:161.09ms step:1477/1480 train_time:236343ms step_avg:161.11ms step:1478/1480 train_time:236526ms step_avg:161.12ms step:1479/1480 train_time:236699ms step_avg:161.13ms step:1480/1480 train_time:236871ms step_avg:161.14ms step:1480/1480 val_loss:3.2786 train_time:236942ms step_avg:161.19ms