import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. """ assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. """ def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, n_head): super().__init__() assert dim % n_head == 0 self.n_head = n_head self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) # value residual lambda self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977 # rotary embeddings self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim # output projection self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor: B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1) k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1) v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1) v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977 q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim: int): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.n_embd, config.n_head) self.mlp = MLP(config.n_embd) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor: x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 n_layer : int = 12 n_head : int = 6 # head dim 128 suggested by @Grad62304977 n_embd : int = 768 lm_head_softcap : int = 30 class GPT(nn.Module): def __init__(self, config: GPTConfig): super().__init__() self.n_layer = config.n_layer self.lm_head_softcap = config.lm_head_softcap # U-net design by @brendanh0gan self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection weights for decoder layers self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers)) self.transformer = nn.ModuleDict(dict( wte = nn.Embedding(config.vocab_size, config.n_embd), # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning # U-net structure on token value embeddings by @leloykun vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers), h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), )) self.lm_head = CastedLinear(config.n_embd, config.vocab_size) self.lm_head.weight.data.zero_() # @Grad62304977 def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor: BLOCK_SIZE = 128 assert idx.ndim == 1 docs = (idx == 50256).cumsum(0) docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous() docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous() def document_sliding_window_causal(b, h, q_idx, kv_idx): causal_mask = q_idx >= kv_idx document_mask = docs[q_idx] == docs[kv_idx] window_mask = q_idx - kv_idx < sliding_window return causal_mask & document_mask & window_mask S = len(idx) def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor): kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda") q_idx = block_idx[:, None] causal_mask = q_idx >= kv_idx document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx]) window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE) dense_mask = causal_mask & document_mask & window_mask dense_mask = dense_mask.to(torch.int32) num_blocks = dense_mask.sum(dim=-1).to(torch.int32) indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32) num_blocks = num_blocks[None, None, :].contiguous() indices = indices[None, None, :].contiguous() return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal) block_mask = create_sliding_window_causal_mask(S, sliding_window) # forward the GPT model itself x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd) x = norm(x) # @Grad62304977 x0 = x vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1) # Store outputs for U-Net skip connections skip_connections = [] # Encoder pass - process only the first half of the blocks for i in range(self.num_encoder_layers): x = self.transformer.h[i](x, vi[i], x0, block_mask) skip_connections.append(x) # Decoder pass - process the remaining blocks with weighted skip connections for i in range(self.num_decoder_layers): x = x + self.skip_weights[i] * skip_connections.pop() # U-net structure on token value embeddings by @leloykun x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask) x = norm(x) logits = self.lm_head(x) logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977 logits = logits.float() loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1)) return loss # ----------------------------------------------------------------------------- # Our own simple Distributed Data Loader def _peek_data_shard(file: Path): # only reads the header, returns header data # header is 256 int32 header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) assert header[0] == 20240520, "magic number mismatch in the data .bin file" assert header[1] == 1, "unsupported version" return int(header[2]) # number of tokens (claimed) def _load_data_shard(file: Path, ntok: int): with file.open("rb") as f: tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True) f.seek(256 * 4) nbytes = f.readinto(tokens.numpy()) assert nbytes == 2 * ntok, "number of tokens read does not match header?" return tokens class DistributedDataLoader: def __init__(self, filename_pattern, T, process_rank, num_processes): self.process_rank = process_rank self.num_processes = num_processes self.T = T # glob files that match the pattern self.files = sorted(Path.cwd().glob(filename_pattern)) assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" # load and validate all data shards, count number of tokens in total self.ntoks = [_peek_data_shard(file) for file in self.files] assert min(self.ntoks) >= num_processes * T + 1 self.ntok_total = sum(self.ntoks) self.reset() def reset(self): self.current_shard = -1 self.advance() def advance(self): # advance to next data shard self.current_shard = (self.current_shard + 1) % len(self.files) self.current_position = self.process_rank * self.T self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard]) def next_batch(self): batch_size = self.T * self.num_processes buf = self.tokens[self.current_position:self.current_position+self.T+1] # host side async is sufficient; # no performance improvement was observed when introducing a separate stream. x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets # advance current position and load next shard if necessary self.current_position += batch_size if self.current_position + batch_size + 1 >= len(self.tokens): self.advance() return x, y # ----------------------------------------------------------------------------- # int main @dataclass class Hyperparameters: # data hyperparams input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on # optimization hyperparams batch_size : int = 8 # batch size, in sequences, across all devices sequence_length : int = 64*1024 # sequence length, in tokens num_iterations : int = 1480 # number of iterations to run warmup_iters : int = 0 cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule weight_decay : float = 0 # evaluation and logging hyperparams val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end args = Hyperparameters() # set up DDP (distributed data parallel). torchrun sets this env variable assert torch.cuda.is_available() dist.init_process_group(backend='nccl') ddp_rank = int(os.environ['RANK']) ddp_local_rank = int(os.environ['LOCAL_RANK']) ddp_world_size = int(os.environ['WORLD_SIZE']) device = f'cuda:{ddp_local_rank}' torch.cuda.set_device(device) print(f"using device: {device}") master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc. # begin logging logfile = None if master_process: run_id = str(uuid.uuid4()) logdir = 'logs/%s/' % run_id # os.makedirs(logdir, exist_ok=True) logfile = 'logs/%s.txt' % run_id # create the log file with open(logfile, "w") as f: # begin the log by printing this file (the Python code) f.write(code) f.write('='*100 + '\n') def print0(s, logonly=False): if master_process: with open(logfile, "a") as f: if not logonly: print(s) f.write(s+'\n') # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # convenience variables T = args.sequence_length # calculate the number of steps to take in the val loop. assert args.val_tokens % (T * ddp_world_size) == 0 val_steps = args.val_tokens // (T * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size assert train_accumulation_steps == 1 # load tokens train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files") print0('='*100, logonly=True) x, y = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 10:04:38 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 98W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 92W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 95W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 41MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 104W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 37C P0 79W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23046ms step_avg:nanms step:2/1480 train_time:23171ms step_avg:nanms step:3/1480 train_time:23310ms step_avg:nanms step:4/1480 train_time:23451ms step_avg:nanms step:5/1480 train_time:23593ms step_avg:nanms step:6/1480 train_time:23734ms step_avg:nanms step:7/1480 train_time:23875ms step_avg:nanms step:8/1480 train_time:24017ms step_avg:nanms step:9/1480 train_time:24164ms step_avg:nanms step:10/1480 train_time:24306ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.63ms step:14/1480 train_time:566ms step_avg:141.52ms step:15/1480 train_time:709ms step_avg:141.79ms step:16/1480 train_time:854ms step_avg:142.33ms step:17/1480 train_time:997ms step_avg:142.46ms step:18/1480 train_time:1140ms step_avg:142.44ms step:19/1480 train_time:1283ms step_avg:142.56ms step:20/1480 train_time:1426ms step_avg:142.65ms step:21/1480 train_time:1570ms step_avg:142.71ms step:22/1480 train_time:1713ms step_avg:142.78ms step:23/1480 train_time:1858ms step_avg:142.95ms step:24/1480 train_time:2001ms step_avg:142.90ms step:25/1480 train_time:2143ms step_avg:142.86ms step:26/1480 train_time:2285ms step_avg:142.83ms step:27/1480 train_time:2429ms step_avg:142.90ms step:28/1480 train_time:2574ms step_avg:143.00ms step:29/1480 train_time:2717ms step_avg:143.02ms step:30/1480 train_time:2860ms step_avg:143.02ms step:31/1480 train_time:3002ms step_avg:142.95ms step:32/1480 train_time:3143ms step_avg:142.88ms step:33/1480 train_time:3287ms step_avg:142.90ms step:34/1480 train_time:3431ms step_avg:142.96ms step:35/1480 train_time:3577ms step_avg:143.07ms step:36/1480 train_time:3719ms step_avg:143.06ms step:37/1480 train_time:3862ms step_avg:143.04ms step:38/1480 train_time:4003ms step_avg:142.97ms step:39/1480 train_time:4147ms step_avg:142.99ms step:40/1480 train_time:4291ms step_avg:143.02ms step:41/1480 train_time:4435ms step_avg:143.07ms step:42/1480 train_time:4578ms step_avg:143.06ms step:43/1480 train_time:4720ms step_avg:143.03ms step:44/1480 train_time:4862ms step_avg:143.00ms step:45/1480 train_time:5003ms step_avg:142.94ms step:46/1480 train_time:5145ms step_avg:142.91ms step:47/1480 train_time:5288ms step_avg:142.93ms step:48/1480 train_time:5433ms step_avg:142.97ms step:49/1480 train_time:5576ms step_avg:142.98ms step:50/1480 train_time:5719ms step_avg:142.96ms step:51/1480 train_time:5861ms step_avg:142.94ms step:52/1480 train_time:6002ms step_avg:142.89ms step:53/1480 train_time:6146ms step_avg:142.94ms step:54/1480 train_time:6290ms step_avg:142.95ms step:55/1480 train_time:6435ms step_avg:142.99ms step:56/1480 train_time:6579ms step_avg:143.02ms step:57/1480 train_time:6722ms step_avg:143.02ms step:58/1480 train_time:6864ms step_avg:143.00ms step:59/1480 train_time:7005ms step_avg:142.96ms step:60/1480 train_time:7149ms step_avg:142.97ms step:61/1480 train_time:7293ms step_avg:143.00ms step:62/1480 train_time:7437ms step_avg:143.01ms step:63/1480 train_time:7580ms step_avg:143.02ms step:64/1480 train_time:7721ms step_avg:142.98ms step:65/1480 train_time:7863ms step_avg:142.97ms step:66/1480 train_time:8005ms step_avg:142.94ms step:67/1480 train_time:8149ms step_avg:142.96ms step:68/1480 train_time:8293ms step_avg:142.99ms step:69/1480 train_time:8436ms step_avg:142.99ms step:70/1480 train_time:8579ms step_avg:142.99ms step:71/1480 train_time:8721ms step_avg:142.96ms step:72/1480 train_time:8862ms step_avg:142.94ms step:73/1480 train_time:9004ms step_avg:142.93ms step:74/1480 train_time:9149ms step_avg:142.95ms step:75/1480 train_time:9294ms step_avg:142.99ms step:76/1480 train_time:9437ms step_avg:142.98ms step:77/1480 train_time:9580ms step_avg:142.99ms step:78/1480 train_time:9721ms step_avg:142.96ms step:79/1480 train_time:9862ms step_avg:142.93ms step:80/1480 train_time:10003ms step_avg:142.90ms step:81/1480 train_time:10146ms step_avg:142.91ms step:82/1480 train_time:10289ms step_avg:142.91ms step:83/1480 train_time:10433ms step_avg:142.92ms step:84/1480 train_time:10576ms step_avg:142.92ms step:85/1480 train_time:10718ms step_avg:142.90ms step:86/1480 train_time:10860ms step_avg:142.90ms step:87/1480 train_time:11001ms step_avg:142.87ms step:88/1480 train_time:11145ms step_avg:142.88ms step:89/1480 train_time:11289ms step_avg:142.90ms step:90/1480 train_time:11433ms step_avg:142.91ms step:91/1480 train_time:11577ms step_avg:142.92ms step:92/1480 train_time:11719ms step_avg:142.92ms step:93/1480 train_time:11861ms step_avg:142.90ms step:94/1480 train_time:12002ms step_avg:142.88ms step:95/1480 train_time:12145ms step_avg:142.88ms step:96/1480 train_time:12289ms step_avg:142.90ms step:97/1480 train_time:12434ms step_avg:142.92ms step:98/1480 train_time:12577ms step_avg:142.93ms step:99/1480 train_time:12719ms step_avg:142.92ms step:100/1480 train_time:12861ms step_avg:142.90ms step:101/1480 train_time:13001ms step_avg:142.86ms step:102/1480 train_time:13142ms step_avg:142.84ms step:103/1480 train_time:13284ms step_avg:142.84ms step:104/1480 train_time:13426ms step_avg:142.83ms step:105/1480 train_time:13570ms step_avg:142.84ms step:106/1480 train_time:13712ms step_avg:142.84ms step:107/1480 train_time:13856ms step_avg:142.84ms step:108/1480 train_time:13998ms step_avg:142.83ms step:109/1480 train_time:14139ms step_avg:142.82ms step:110/1480 train_time:14281ms step_avg:142.81ms step:111/1480 train_time:14426ms step_avg:142.83ms step:112/1480 train_time:14574ms step_avg:142.88ms step:113/1480 train_time:14721ms step_avg:142.92ms step:114/1480 train_time:14867ms step_avg:142.95ms step:115/1480 train_time:15014ms step_avg:142.99ms step:116/1480 train_time:15161ms step_avg:143.03ms step:117/1480 train_time:15306ms step_avg:143.05ms step:118/1480 train_time:15454ms step_avg:143.10ms step:119/1480 train_time:15601ms step_avg:143.13ms step:120/1480 train_time:15746ms step_avg:143.15ms step:121/1480 train_time:15894ms step_avg:143.19ms step:122/1480 train_time:16040ms step_avg:143.22ms step:123/1480 train_time:16186ms step_avg:143.24ms step:124/1480 train_time:16335ms step_avg:143.29ms step:125/1480 train_time:16482ms step_avg:143.33ms step:125/1480 val_loss:4.4007 train_time:16540ms step_avg:143.83ms step:126/1480 train_time:16637ms step_avg:143.42ms step:127/1480 train_time:16788ms step_avg:143.49ms step:128/1480 train_time:16935ms step_avg:143.52ms step:129/1480 train_time:17079ms step_avg:143.52ms step:130/1480 train_time:17224ms step_avg:143.53ms step:131/1480 train_time:17369ms step_avg:143.55ms step:132/1480 train_time:17515ms step_avg:143.57ms step:133/1480 train_time:17663ms step_avg:143.60ms step:134/1480 train_time:17813ms step_avg:143.65ms step:135/1480 train_time:17959ms step_avg:143.67ms step:136/1480 train_time:18106ms step_avg:143.70ms step:137/1480 train_time:18253ms step_avg:143.72ms step:138/1480 train_time:18399ms step_avg:143.74ms step:139/1480 train_time:18546ms step_avg:143.77ms step:140/1480 train_time:18694ms step_avg:143.80ms step:141/1480 train_time:18841ms step_avg:143.83ms step:142/1480 train_time:18989ms step_avg:143.86ms step:143/1480 train_time:19136ms step_avg:143.88ms step:144/1480 train_time:19283ms step_avg:143.90ms step:145/1480 train_time:19430ms step_avg:143.93ms step:146/1480 train_time:19576ms step_avg:143.94ms step:147/1480 train_time:19724ms step_avg:143.97ms step:148/1480 train_time:19871ms step_avg:144.00ms step:149/1480 train_time:20017ms step_avg:144.01ms step:150/1480 train_time:20166ms step_avg:144.04ms step:151/1480 train_time:20312ms step_avg:144.06ms step:152/1480 train_time:20458ms step_avg:144.07ms step:153/1480 train_time:20607ms step_avg:144.10ms step:154/1480 train_time:20754ms step_avg:144.12ms step:155/1480 train_time:20899ms step_avg:144.13ms step:156/1480 train_time:21046ms step_avg:144.15ms step:157/1480 train_time:21194ms step_avg:144.18ms step:158/1480 train_time:21338ms step_avg:144.18ms step:159/1480 train_time:21486ms step_avg:144.20ms step:160/1480 train_time:21632ms step_avg:144.21ms step:161/1480 train_time:21778ms step_avg:144.23ms step:162/1480 train_time:21926ms step_avg:144.25ms step:163/1480 train_time:22073ms step_avg:144.27ms step:164/1480 train_time:22220ms step_avg:144.29ms step:165/1480 train_time:22367ms step_avg:144.30ms step:166/1480 train_time:22514ms step_avg:144.32ms step:167/1480 train_time:22660ms step_avg:144.33ms step:168/1480 train_time:22809ms step_avg:144.36ms step:169/1480 train_time:22955ms step_avg:144.37ms step:170/1480 train_time:23101ms step_avg:144.38ms step:171/1480 train_time:23248ms step_avg:144.40ms step:172/1480 train_time:23396ms step_avg:144.42ms step:173/1480 train_time:23541ms step_avg:144.42ms step:174/1480 train_time:23691ms step_avg:144.46ms step:175/1480 train_time:23837ms step_avg:144.47ms step:176/1480 train_time:23983ms step_avg:144.48ms step:177/1480 train_time:24131ms step_avg:144.50ms step:178/1480 train_time:24277ms step_avg:144.51ms step:179/1480 train_time:24424ms step_avg:144.52ms step:180/1480 train_time:24571ms step_avg:144.53ms step:181/1480 train_time:24716ms step_avg:144.54ms step:182/1480 train_time:24863ms step_avg:144.55ms step:183/1480 train_time:25011ms step_avg:144.57ms step:184/1480 train_time:25156ms step_avg:144.58ms step:185/1480 train_time:25303ms step_avg:144.59ms step:186/1480 train_time:25451ms step_avg:144.61ms step:187/1480 train_time:25597ms step_avg:144.62ms step:188/1480 train_time:25745ms step_avg:144.63ms step:189/1480 train_time:25891ms step_avg:144.64ms step:190/1480 train_time:26037ms step_avg:144.65ms step:191/1480 train_time:26184ms step_avg:144.66ms step:192/1480 train_time:26331ms step_avg:144.68ms step:193/1480 train_time:26477ms step_avg:144.68ms step:194/1480 train_time:26624ms step_avg:144.70ms step:195/1480 train_time:26770ms step_avg:144.70ms step:196/1480 train_time:26916ms step_avg:144.71ms step:197/1480 train_time:27064ms step_avg:144.73ms step:198/1480 train_time:27211ms step_avg:144.74ms step:199/1480 train_time:27357ms step_avg:144.75ms step:200/1480 train_time:27506ms step_avg:144.77ms step:201/1480 train_time:27653ms step_avg:144.78ms step:202/1480 train_time:27799ms step_avg:144.79ms step:203/1480 train_time:27946ms step_avg:144.80ms step:204/1480 train_time:28094ms step_avg:144.81ms step:205/1480 train_time:28239ms step_avg:144.82ms step:206/1480 train_time:28385ms step_avg:144.82ms step:207/1480 train_time:28534ms step_avg:144.84ms step:208/1480 train_time:28681ms step_avg:144.85ms step:209/1480 train_time:28829ms step_avg:144.87ms step:210/1480 train_time:28976ms step_avg:144.88ms step:211/1480 train_time:29123ms step_avg:144.89ms step:212/1480 train_time:29270ms step_avg:144.90ms step:213/1480 train_time:29416ms step_avg:144.91ms step:214/1480 train_time:29563ms step_avg:144.92ms step:215/1480 train_time:29710ms step_avg:144.93ms step:216/1480 train_time:29856ms step_avg:144.93ms step:217/1480 train_time:30004ms step_avg:144.95ms step:218/1480 train_time:30151ms step_avg:144.96ms step:219/1480 train_time:30299ms step_avg:144.97ms step:220/1480 train_time:30446ms step_avg:144.98ms step:221/1480 train_time:30595ms step_avg:145.00ms step:222/1480 train_time:30745ms step_avg:145.03ms step:223/1480 train_time:30896ms step_avg:145.05ms step:224/1480 train_time:31047ms step_avg:145.08ms step:225/1480 train_time:31197ms step_avg:145.10ms step:226/1480 train_time:31348ms step_avg:145.13ms step:227/1480 train_time:31498ms step_avg:145.15ms step:228/1480 train_time:31647ms step_avg:145.17ms step:229/1480 train_time:31799ms step_avg:145.20ms step:230/1480 train_time:31950ms step_avg:145.23ms step:231/1480 train_time:32101ms step_avg:145.25ms step:232/1480 train_time:32252ms step_avg:145.28ms step:233/1480 train_time:32401ms step_avg:145.30ms step:234/1480 train_time:32552ms step_avg:145.32ms step:235/1480 train_time:32703ms step_avg:145.35ms step:236/1480 train_time:32853ms step_avg:145.37ms step:237/1480 train_time:33004ms step_avg:145.39ms step:238/1480 train_time:33155ms step_avg:145.41ms step:239/1480 train_time:33305ms step_avg:145.44ms step:240/1480 train_time:33455ms step_avg:145.46ms step:241/1480 train_time:33606ms step_avg:145.48ms step:242/1480 train_time:33757ms step_avg:145.50ms step:243/1480 train_time:33906ms step_avg:145.52ms step:244/1480 train_time:34056ms step_avg:145.54ms step:245/1480 train_time:34206ms step_avg:145.56ms step:246/1480 train_time:34356ms step_avg:145.58ms step:247/1480 train_time:34506ms step_avg:145.60ms step:248/1480 train_time:34657ms step_avg:145.62ms step:249/1480 train_time:34808ms step_avg:145.64ms step:250/1480 train_time:34957ms step_avg:145.66ms step:250/1480 val_loss:3.9862 train_time:35015ms step_avg:145.90ms step:251/1480 train_time:35111ms step_avg:145.69ms step:252/1480 train_time:35263ms step_avg:145.72ms step:253/1480 train_time:35413ms step_avg:145.73ms step:254/1480 train_time:35563ms step_avg:145.75ms step:255/1480 train_time:35712ms step_avg:145.76ms step:256/1480 train_time:35862ms step_avg:145.78ms step:257/1480 train_time:36011ms step_avg:145.79ms step:258/1480 train_time:36163ms step_avg:145.82ms step:259/1480 train_time:36314ms step_avg:145.84ms step:260/1480 train_time:36466ms step_avg:145.86ms step:261/1480 train_time:36617ms step_avg:145.88ms step:262/1480 train_time:36767ms step_avg:145.90ms step:263/1480 train_time:36915ms step_avg:145.91ms step:264/1480 train_time:37066ms step_avg:145.93ms step:265/1480 train_time:37217ms step_avg:145.95ms step:266/1480 train_time:37368ms step_avg:145.97ms step:267/1480 train_time:37519ms step_avg:145.99ms step:268/1480 train_time:37669ms step_avg:146.00ms step:269/1480 train_time:37819ms step_avg:146.02ms step:270/1480 train_time:37969ms step_avg:146.04ms step:271/1480 train_time:38122ms step_avg:146.06ms step:272/1480 train_time:38272ms step_avg:146.07ms step:273/1480 train_time:38423ms step_avg:146.09ms step:274/1480 train_time:38573ms step_avg:146.11ms step:275/1480 train_time:38725ms step_avg:146.13ms step:276/1480 train_time:38875ms step_avg:146.14ms step:277/1480 train_time:39025ms step_avg:146.16ms step:278/1480 train_time:39175ms step_avg:146.18ms step:279/1480 train_time:39327ms step_avg:146.20ms step:280/1480 train_time:39478ms step_avg:146.21ms step:281/1480 train_time:39628ms step_avg:146.23ms step:282/1480 train_time:39779ms step_avg:146.25ms step:283/1480 train_time:39930ms step_avg:146.26ms step:284/1480 train_time:40079ms step_avg:146.27ms step:285/1480 train_time:40230ms step_avg:146.29ms step:286/1480 train_time:40379ms step_avg:146.30ms step:287/1480 train_time:40531ms step_avg:146.32ms step:288/1480 train_time:40683ms step_avg:146.34ms step:289/1480 train_time:40835ms step_avg:146.36ms step:290/1480 train_time:40985ms step_avg:146.38ms step:291/1480 train_time:41136ms step_avg:146.39ms step:292/1480 train_time:41286ms step_avg:146.40ms step:293/1480 train_time:41436ms step_avg:146.42ms step:294/1480 train_time:41587ms step_avg:146.43ms step:295/1480 train_time:41738ms step_avg:146.45ms step:296/1480 train_time:41888ms step_avg:146.46ms step:297/1480 train_time:42039ms step_avg:146.48ms step:298/1480 train_time:42190ms step_avg:146.49ms step:299/1480 train_time:42340ms step_avg:146.51ms step:300/1480 train_time:42492ms step_avg:146.52ms step:301/1480 train_time:42643ms step_avg:146.54ms step:302/1480 train_time:42792ms step_avg:146.55ms step:303/1480 train_time:42945ms step_avg:146.57ms step:304/1480 train_time:43096ms step_avg:146.59ms step:305/1480 train_time:43247ms step_avg:146.60ms step:306/1480 train_time:43397ms step_avg:146.61ms step:307/1480 train_time:43548ms step_avg:146.63ms step:308/1480 train_time:43698ms step_avg:146.64ms step:309/1480 train_time:43849ms step_avg:146.65ms step:310/1480 train_time:43999ms step_avg:146.66ms step:311/1480 train_time:44150ms step_avg:146.68ms step:312/1480 train_time:44302ms step_avg:146.70ms step:313/1480 train_time:44453ms step_avg:146.71ms step:314/1480 train_time:44604ms step_avg:146.72ms step:315/1480 train_time:44753ms step_avg:146.73ms step:316/1480 train_time:44903ms step_avg:146.74ms step:317/1480 train_time:45053ms step_avg:146.75ms step:318/1480 train_time:45205ms step_avg:146.77ms step:319/1480 train_time:45356ms step_avg:146.78ms step:320/1480 train_time:45507ms step_avg:146.80ms step:321/1480 train_time:45656ms step_avg:146.81ms step:322/1480 train_time:45807ms step_avg:146.82ms step:323/1480 train_time:45957ms step_avg:146.83ms step:324/1480 train_time:46108ms step_avg:146.84ms step:325/1480 train_time:46257ms step_avg:146.85ms step:326/1480 train_time:46409ms step_avg:146.86ms step:327/1480 train_time:46561ms step_avg:146.88ms step:328/1480 train_time:46711ms step_avg:146.89ms step:329/1480 train_time:46861ms step_avg:146.90ms step:330/1480 train_time:47013ms step_avg:146.92ms step:331/1480 train_time:47166ms step_avg:146.93ms step:332/1480 train_time:47321ms step_avg:146.96ms step:333/1480 train_time:47473ms step_avg:146.98ms step:334/1480 train_time:47626ms step_avg:146.99ms step:335/1480 train_time:47779ms step_avg:147.01ms step:336/1480 train_time:47932ms step_avg:147.03ms step:337/1480 train_time:48086ms step_avg:147.05ms step:338/1480 train_time:48240ms step_avg:147.07ms step:339/1480 train_time:48393ms step_avg:147.09ms step:340/1480 train_time:48547ms step_avg:147.11ms step:341/1480 train_time:48701ms step_avg:147.13ms step:342/1480 train_time:48856ms step_avg:147.16ms step:343/1480 train_time:49009ms step_avg:147.18ms step:344/1480 train_time:49164ms step_avg:147.20ms step:345/1480 train_time:49320ms step_avg:147.22ms step:346/1480 train_time:49475ms step_avg:147.25ms step:347/1480 train_time:49629ms step_avg:147.27ms step:348/1480 train_time:49782ms step_avg:147.28ms step:349/1480 train_time:49937ms step_avg:147.31ms step:350/1480 train_time:50091ms step_avg:147.33ms step:351/1480 train_time:50245ms step_avg:147.35ms step:352/1480 train_time:50399ms step_avg:147.36ms step:353/1480 train_time:50552ms step_avg:147.38ms step:354/1480 train_time:50706ms step_avg:147.40ms step:355/1480 train_time:50861ms step_avg:147.42ms step:356/1480 train_time:51015ms step_avg:147.44ms step:357/1480 train_time:51169ms step_avg:147.46ms step:358/1480 train_time:51323ms step_avg:147.48ms step:359/1480 train_time:51476ms step_avg:147.50ms step:360/1480 train_time:51631ms step_avg:147.52ms step:361/1480 train_time:51786ms step_avg:147.54ms step:362/1480 train_time:51941ms step_avg:147.56ms step:363/1480 train_time:52094ms step_avg:147.58ms step:364/1480 train_time:52249ms step_avg:147.60ms step:365/1480 train_time:52402ms step_avg:147.61ms step:366/1480 train_time:52556ms step_avg:147.63ms step:367/1480 train_time:52711ms step_avg:147.65ms step:368/1480 train_time:52863ms step_avg:147.66ms step:369/1480 train_time:53017ms step_avg:147.68ms step:370/1480 train_time:53171ms step_avg:147.70ms step:371/1480 train_time:53327ms step_avg:147.72ms step:372/1480 train_time:53481ms step_avg:147.74ms step:373/1480 train_time:53634ms step_avg:147.75ms step:374/1480 train_time:53788ms step_avg:147.77ms step:375/1480 train_time:53943ms step_avg:147.79ms step:375/1480 val_loss:3.8027 train_time:54004ms step_avg:147.95ms step:376/1480 train_time:54102ms step_avg:147.82ms step:377/1480 train_time:54258ms step_avg:147.84ms step:378/1480 train_time:54411ms step_avg:147.86ms step:379/1480 train_time:54562ms step_avg:147.87ms step:380/1480 train_time:54716ms step_avg:147.88ms step:381/1480 train_time:54868ms step_avg:147.89ms step:382/1480 train_time:55022ms step_avg:147.91ms step:383/1480 train_time:55179ms step_avg:147.93ms step:384/1480 train_time:55334ms step_avg:147.95ms step:385/1480 train_time:55487ms step_avg:147.97ms step:386/1480 train_time:55642ms step_avg:147.98ms step:387/1480 train_time:55794ms step_avg:147.99ms step:388/1480 train_time:55947ms step_avg:148.01ms step:389/1480 train_time:56101ms step_avg:148.02ms step:390/1480 train_time:56256ms step_avg:148.04ms step:391/1480 train_time:56410ms step_avg:148.06ms step:392/1480 train_time:56564ms step_avg:148.07ms step:393/1480 train_time:56717ms step_avg:148.09ms step:394/1480 train_time:56871ms step_avg:148.10ms step:395/1480 train_time:57024ms step_avg:148.12ms step:396/1480 train_time:57178ms step_avg:148.13ms step:397/1480 train_time:57331ms step_avg:148.14ms step:398/1480 train_time:57486ms step_avg:148.16ms step:399/1480 train_time:57640ms step_avg:148.17ms step:400/1480 train_time:57794ms step_avg:148.19ms step:401/1480 train_time:57947ms step_avg:148.20ms step:402/1480 train_time:58101ms step_avg:148.22ms step:403/1480 train_time:58255ms step_avg:148.23ms step:404/1480 train_time:58409ms step_avg:148.25ms step:405/1480 train_time:58563ms step_avg:148.26ms step:406/1480 train_time:58716ms step_avg:148.27ms step:407/1480 train_time:58870ms step_avg:148.29ms step:408/1480 train_time:59024ms step_avg:148.30ms step:409/1480 train_time:59178ms step_avg:148.32ms step:410/1480 train_time:59331ms step_avg:148.33ms step:411/1480 train_time:59484ms step_avg:148.34ms step:412/1480 train_time:59639ms step_avg:148.36ms step:413/1480 train_time:59796ms step_avg:148.38ms step:414/1480 train_time:59947ms step_avg:148.38ms step:415/1480 train_time:60100ms step_avg:148.40ms step:416/1480 train_time:60253ms step_avg:148.41ms step:417/1480 train_time:60408ms step_avg:148.42ms step:418/1480 train_time:60564ms step_avg:148.44ms step:419/1480 train_time:60719ms step_avg:148.46ms step:420/1480 train_time:60872ms step_avg:148.47ms step:421/1480 train_time:61026ms step_avg:148.48ms step:422/1480 train_time:61179ms step_avg:148.49ms step:423/1480 train_time:61333ms step_avg:148.51ms step:424/1480 train_time:61485ms step_avg:148.52ms step:425/1480 train_time:61640ms step_avg:148.53ms step:426/1480 train_time:61794ms step_avg:148.54ms step:427/1480 train_time:61948ms step_avg:148.56ms step:428/1480 train_time:62100ms step_avg:148.56ms step:429/1480 train_time:62253ms step_avg:148.58ms step:430/1480 train_time:62407ms step_avg:148.59ms step:431/1480 train_time:62560ms step_avg:148.60ms step:432/1480 train_time:62715ms step_avg:148.61ms step:433/1480 train_time:62868ms step_avg:148.62ms step:434/1480 train_time:63022ms step_avg:148.64ms step:435/1480 train_time:63177ms step_avg:148.65ms step:436/1480 train_time:63333ms step_avg:148.67ms step:437/1480 train_time:63486ms step_avg:148.68ms step:438/1480 train_time:63640ms step_avg:148.69ms step:439/1480 train_time:63796ms step_avg:148.71ms step:440/1480 train_time:63951ms step_avg:148.72ms step:441/1480 train_time:64108ms step_avg:148.74ms step:442/1480 train_time:64264ms step_avg:148.76ms step:443/1480 train_time:64420ms step_avg:148.78ms step:444/1480 train_time:64576ms step_avg:148.79ms step:445/1480 train_time:64732ms step_avg:148.81ms step:446/1480 train_time:64887ms step_avg:148.82ms step:447/1480 train_time:65044ms step_avg:148.84ms step:448/1480 train_time:65202ms step_avg:148.86ms step:449/1480 train_time:65361ms step_avg:148.89ms step:450/1480 train_time:65519ms step_avg:148.91ms step:451/1480 train_time:65675ms step_avg:148.92ms step:452/1480 train_time:65831ms step_avg:148.94ms step:453/1480 train_time:65987ms step_avg:148.96ms step:454/1480 train_time:66144ms step_avg:148.97ms step:455/1480 train_time:66301ms step_avg:148.99ms step:456/1480 train_time:66460ms step_avg:149.01ms step:457/1480 train_time:66618ms step_avg:149.03ms step:458/1480 train_time:66774ms step_avg:149.05ms step:459/1480 train_time:66930ms step_avg:149.06ms step:460/1480 train_time:67086ms step_avg:149.08ms step:461/1480 train_time:67245ms step_avg:149.10ms step:462/1480 train_time:67403ms step_avg:149.12ms step:463/1480 train_time:67562ms step_avg:149.14ms step:464/1480 train_time:67721ms step_avg:149.17ms step:465/1480 train_time:67879ms step_avg:149.18ms step:466/1480 train_time:68036ms step_avg:149.20ms step:467/1480 train_time:68192ms step_avg:149.22ms step:468/1480 train_time:68347ms step_avg:149.23ms step:469/1480 train_time:68504ms step_avg:149.25ms step:470/1480 train_time:68661ms step_avg:149.26ms step:471/1480 train_time:68816ms step_avg:149.28ms step:472/1480 train_time:68973ms step_avg:149.29ms step:473/1480 train_time:69129ms step_avg:149.31ms step:474/1480 train_time:69285ms step_avg:149.32ms step:475/1480 train_time:69443ms step_avg:149.34ms step:476/1480 train_time:69600ms step_avg:149.36ms step:477/1480 train_time:69757ms step_avg:149.37ms step:478/1480 train_time:69913ms step_avg:149.39ms step:479/1480 train_time:70070ms step_avg:149.40ms step:480/1480 train_time:70226ms step_avg:149.42ms step:481/1480 train_time:70383ms step_avg:149.43ms step:482/1480 train_time:70540ms step_avg:149.45ms step:483/1480 train_time:70696ms step_avg:149.46ms step:484/1480 train_time:70853ms step_avg:149.48ms step:485/1480 train_time:71010ms step_avg:149.50ms step:486/1480 train_time:71167ms step_avg:149.51ms step:487/1480 train_time:71324ms step_avg:149.53ms step:488/1480 train_time:71480ms step_avg:149.54ms step:489/1480 train_time:71637ms step_avg:149.56ms step:490/1480 train_time:71794ms step_avg:149.57ms step:491/1480 train_time:71949ms step_avg:149.58ms step:492/1480 train_time:72106ms step_avg:149.60ms step:493/1480 train_time:72262ms step_avg:149.61ms step:494/1480 train_time:72421ms step_avg:149.63ms step:495/1480 train_time:72578ms step_avg:149.64ms step:496/1480 train_time:72736ms step_avg:149.66ms step:497/1480 train_time:72892ms step_avg:149.68ms step:498/1480 train_time:73049ms step_avg:149.69ms step:499/1480 train_time:73208ms step_avg:149.71ms step:500/1480 train_time:73365ms step_avg:149.72ms step:500/1480 val_loss:3.6827 train_time:73427ms step_avg:149.85ms step:501/1480 train_time:73525ms step_avg:149.75ms step:502/1480 train_time:73684ms step_avg:149.76ms step:503/1480 train_time:73840ms step_avg:149.78ms step:504/1480 train_time:73996ms step_avg:149.79ms step:505/1480 train_time:74151ms step_avg:149.80ms step:506/1480 train_time:74307ms step_avg:149.81ms step:507/1480 train_time:74464ms step_avg:149.83ms step:508/1480 train_time:74625ms step_avg:149.85ms step:509/1480 train_time:74783ms step_avg:149.87ms step:510/1480 train_time:74939ms step_avg:149.88ms step:511/1480 train_time:75096ms step_avg:149.89ms step:512/1480 train_time:75253ms step_avg:149.91ms step:513/1480 train_time:75411ms step_avg:149.92ms step:514/1480 train_time:75567ms step_avg:149.93ms step:515/1480 train_time:75726ms step_avg:149.95ms step:516/1480 train_time:75885ms step_avg:149.97ms step:517/1480 train_time:76043ms step_avg:149.99ms step:518/1480 train_time:76199ms step_avg:150.00ms step:519/1480 train_time:76356ms step_avg:150.01ms step:520/1480 train_time:76512ms step_avg:150.02ms step:521/1480 train_time:76669ms step_avg:150.04ms step:522/1480 train_time:76826ms step_avg:150.05ms step:523/1480 train_time:76985ms step_avg:150.07ms step:524/1480 train_time:77143ms step_avg:150.08ms step:525/1480 train_time:77299ms step_avg:150.10ms step:526/1480 train_time:77455ms step_avg:150.11ms step:527/1480 train_time:77610ms step_avg:150.12ms step:528/1480 train_time:77766ms step_avg:150.13ms step:529/1480 train_time:77925ms step_avg:150.14ms step:530/1480 train_time:78083ms step_avg:150.16ms step:531/1480 train_time:78240ms step_avg:150.17ms step:532/1480 train_time:78397ms step_avg:150.19ms step:533/1480 train_time:78553ms step_avg:150.20ms step:534/1480 train_time:78709ms step_avg:150.21ms step:535/1480 train_time:78866ms step_avg:150.22ms step:536/1480 train_time:79025ms step_avg:150.24ms step:537/1480 train_time:79183ms step_avg:150.25ms step:538/1480 train_time:79340ms step_avg:150.27ms step:539/1480 train_time:79499ms step_avg:150.28ms step:540/1480 train_time:79656ms step_avg:150.29ms step:541/1480 train_time:79811ms step_avg:150.30ms step:542/1480 train_time:79968ms step_avg:150.31ms step:543/1480 train_time:80126ms step_avg:150.33ms step:544/1480 train_time:80283ms step_avg:150.34ms step:545/1480 train_time:80440ms step_avg:150.36ms step:546/1480 train_time:80597ms step_avg:150.37ms step:547/1480 train_time:80753ms step_avg:150.38ms step:548/1480 train_time:80911ms step_avg:150.39ms step:549/1480 train_time:81067ms step_avg:150.40ms step:550/1480 train_time:81226ms step_avg:150.42ms step:551/1480 train_time:81385ms step_avg:150.43ms step:552/1480 train_time:81546ms step_avg:150.45ms step:553/1480 train_time:81707ms step_avg:150.47ms step:554/1480 train_time:81868ms step_avg:150.49ms step:555/1480 train_time:82029ms step_avg:150.51ms step:556/1480 train_time:82187ms step_avg:150.53ms step:557/1480 train_time:82348ms step_avg:150.54ms step:558/1480 train_time:82508ms step_avg:150.56ms step:559/1480 train_time:82667ms step_avg:150.58ms step:560/1480 train_time:82828ms step_avg:150.60ms step:561/1480 train_time:82988ms step_avg:150.61ms step:562/1480 train_time:83147ms step_avg:150.63ms step:563/1480 train_time:83306ms step_avg:150.64ms step:564/1480 train_time:83466ms step_avg:150.66ms step:565/1480 train_time:83626ms step_avg:150.68ms step:566/1480 train_time:83788ms step_avg:150.70ms step:567/1480 train_time:83947ms step_avg:150.71ms step:568/1480 train_time:84107ms step_avg:150.73ms step:569/1480 train_time:84267ms step_avg:150.75ms step:570/1480 train_time:84426ms step_avg:150.76ms step:571/1480 train_time:84586ms step_avg:150.78ms step:572/1480 train_time:84746ms step_avg:150.79ms step:573/1480 train_time:84907ms step_avg:150.81ms step:574/1480 train_time:85068ms step_avg:150.83ms step:575/1480 train_time:85230ms step_avg:150.85ms step:576/1480 train_time:85389ms step_avg:150.86ms step:577/1480 train_time:85547ms step_avg:150.88ms step:578/1480 train_time:85705ms step_avg:150.89ms step:579/1480 train_time:85865ms step_avg:150.91ms step:580/1480 train_time:86026ms step_avg:150.92ms step:581/1480 train_time:86186ms step_avg:150.94ms step:582/1480 train_time:86347ms step_avg:150.96ms step:583/1480 train_time:86507ms step_avg:150.97ms step:584/1480 train_time:86667ms step_avg:150.99ms step:585/1480 train_time:86827ms step_avg:151.00ms step:586/1480 train_time:86987ms step_avg:151.02ms step:587/1480 train_time:87147ms step_avg:151.03ms step:588/1480 train_time:87306ms step_avg:151.05ms step:589/1480 train_time:87466ms step_avg:151.06ms step:590/1480 train_time:87627ms step_avg:151.08ms step:591/1480 train_time:87787ms step_avg:151.10ms step:592/1480 train_time:87947ms step_avg:151.11ms step:593/1480 train_time:88108ms step_avg:151.13ms step:594/1480 train_time:88268ms step_avg:151.14ms step:595/1480 train_time:88429ms step_avg:151.16ms step:596/1480 train_time:88590ms step_avg:151.18ms step:597/1480 train_time:88750ms step_avg:151.19ms step:598/1480 train_time:88909ms step_avg:151.21ms step:599/1480 train_time:89068ms step_avg:151.22ms step:600/1480 train_time:89227ms step_avg:151.23ms step:601/1480 train_time:89387ms step_avg:151.25ms step:602/1480 train_time:89546ms step_avg:151.26ms step:603/1480 train_time:89708ms step_avg:151.28ms step:604/1480 train_time:89867ms step_avg:151.29ms step:605/1480 train_time:90027ms step_avg:151.31ms step:606/1480 train_time:90189ms step_avg:151.32ms step:607/1480 train_time:90350ms step_avg:151.34ms step:608/1480 train_time:90509ms step_avg:151.35ms step:609/1480 train_time:90667ms step_avg:151.36ms step:610/1480 train_time:90826ms step_avg:151.38ms step:611/1480 train_time:90988ms step_avg:151.39ms step:612/1480 train_time:91149ms step_avg:151.41ms step:613/1480 train_time:91309ms step_avg:151.42ms step:614/1480 train_time:91468ms step_avg:151.44ms step:615/1480 train_time:91628ms step_avg:151.45ms step:616/1480 train_time:91786ms step_avg:151.46ms step:617/1480 train_time:91946ms step_avg:151.48ms step:618/1480 train_time:92105ms step_avg:151.49ms step:619/1480 train_time:92266ms step_avg:151.50ms step:620/1480 train_time:92427ms step_avg:151.52ms step:621/1480 train_time:92587ms step_avg:151.53ms step:622/1480 train_time:92747ms step_avg:151.55ms step:623/1480 train_time:92908ms step_avg:151.56ms step:624/1480 train_time:93068ms step_avg:151.58ms step:625/1480 train_time:93227ms step_avg:151.59ms step:625/1480 val_loss:3.6023 train_time:93289ms step_avg:151.69ms step:626/1480 train_time:93388ms step_avg:151.60ms step:627/1480 train_time:93549ms step_avg:151.62ms step:628/1480 train_time:93706ms step_avg:151.63ms step:629/1480 train_time:93864ms step_avg:151.64ms step:630/1480 train_time:94023ms step_avg:151.65ms step:631/1480 train_time:94181ms step_avg:151.66ms step:632/1480 train_time:94339ms step_avg:151.67ms step:633/1480 train_time:94500ms step_avg:151.68ms step:634/1480 train_time:94659ms step_avg:151.70ms step:635/1480 train_time:94819ms step_avg:151.71ms step:636/1480 train_time:94980ms step_avg:151.72ms step:637/1480 train_time:95139ms step_avg:151.74ms step:638/1480 train_time:95298ms step_avg:151.75ms step:639/1480 train_time:95458ms step_avg:151.76ms step:640/1480 train_time:95618ms step_avg:151.77ms step:641/1480 train_time:95778ms step_avg:151.79ms step:642/1480 train_time:95938ms step_avg:151.80ms step:643/1480 train_time:96099ms step_avg:151.81ms step:644/1480 train_time:96258ms step_avg:151.83ms step:645/1480 train_time:96417ms step_avg:151.84ms step:646/1480 train_time:96577ms step_avg:151.85ms step:647/1480 train_time:96736ms step_avg:151.86ms step:648/1480 train_time:96897ms step_avg:151.88ms step:649/1480 train_time:97058ms step_avg:151.89ms step:650/1480 train_time:97217ms step_avg:151.90ms step:651/1480 train_time:97379ms step_avg:151.92ms step:652/1480 train_time:97539ms step_avg:151.93ms step:653/1480 train_time:97699ms step_avg:151.94ms step:654/1480 train_time:97859ms step_avg:151.95ms step:655/1480 train_time:98017ms step_avg:151.96ms step:656/1480 train_time:98178ms step_avg:151.98ms step:657/1480 train_time:98338ms step_avg:151.99ms step:658/1480 train_time:98499ms step_avg:152.00ms step:659/1480 train_time:98661ms step_avg:152.02ms step:660/1480 train_time:98824ms step_avg:152.04ms step:661/1480 train_time:98986ms step_avg:152.05ms step:662/1480 train_time:99146ms step_avg:152.06ms step:663/1480 train_time:99305ms step_avg:152.07ms step:664/1480 train_time:99467ms step_avg:152.09ms step:665/1480 train_time:99628ms step_avg:152.10ms step:666/1480 train_time:99787ms step_avg:152.11ms step:667/1480 train_time:99949ms step_avg:152.13ms step:668/1480 train_time:100113ms step_avg:152.15ms step:669/1480 train_time:100277ms step_avg:152.17ms step:670/1480 train_time:100438ms step_avg:152.18ms step:671/1480 train_time:100600ms step_avg:152.19ms step:672/1480 train_time:100761ms step_avg:152.21ms step:673/1480 train_time:100923ms step_avg:152.22ms step:674/1480 train_time:101085ms step_avg:152.24ms step:675/1480 train_time:101246ms step_avg:152.25ms step:676/1480 train_time:101407ms step_avg:152.26ms step:677/1480 train_time:101568ms step_avg:152.28ms step:678/1480 train_time:101729ms step_avg:152.29ms step:679/1480 train_time:101890ms step_avg:152.30ms step:680/1480 train_time:102054ms step_avg:152.32ms step:681/1480 train_time:102216ms step_avg:152.33ms step:682/1480 train_time:102379ms step_avg:152.35ms step:683/1480 train_time:102541ms step_avg:152.36ms step:684/1480 train_time:102702ms step_avg:152.38ms step:685/1480 train_time:102865ms step_avg:152.39ms step:686/1480 train_time:103025ms step_avg:152.40ms step:687/1480 train_time:103185ms step_avg:152.41ms step:688/1480 train_time:103347ms step_avg:152.43ms step:689/1480 train_time:103509ms step_avg:152.44ms step:690/1480 train_time:103675ms step_avg:152.46ms step:691/1480 train_time:103837ms step_avg:152.48ms step:692/1480 train_time:104000ms step_avg:152.49ms step:693/1480 train_time:104162ms step_avg:152.51ms step:694/1480 train_time:104323ms step_avg:152.52ms step:695/1480 train_time:104483ms step_avg:152.53ms step:696/1480 train_time:104643ms step_avg:152.54ms step:697/1480 train_time:104807ms step_avg:152.56ms step:698/1480 train_time:104967ms step_avg:152.57ms step:699/1480 train_time:105128ms step_avg:152.58ms step:700/1480 train_time:105290ms step_avg:152.59ms step:701/1480 train_time:105451ms step_avg:152.61ms step:702/1480 train_time:105613ms step_avg:152.62ms step:703/1480 train_time:105775ms step_avg:152.63ms step:704/1480 train_time:105936ms step_avg:152.64ms step:705/1480 train_time:106101ms step_avg:152.66ms step:706/1480 train_time:106265ms step_avg:152.68ms step:707/1480 train_time:106426ms step_avg:152.69ms step:708/1480 train_time:106588ms step_avg:152.70ms step:709/1480 train_time:106749ms step_avg:152.72ms step:710/1480 train_time:106908ms step_avg:152.73ms step:711/1480 train_time:107071ms step_avg:152.74ms step:712/1480 train_time:107238ms step_avg:152.76ms step:713/1480 train_time:107402ms step_avg:152.78ms step:714/1480 train_time:107562ms step_avg:152.79ms step:715/1480 train_time:107722ms step_avg:152.80ms step:716/1480 train_time:107882ms step_avg:152.81ms step:717/1480 train_time:108043ms step_avg:152.82ms step:718/1480 train_time:108202ms step_avg:152.83ms step:719/1480 train_time:108361ms step_avg:152.84ms step:720/1480 train_time:108523ms step_avg:152.85ms step:721/1480 train_time:108685ms step_avg:152.86ms step:722/1480 train_time:108846ms step_avg:152.87ms step:723/1480 train_time:109007ms step_avg:152.88ms step:724/1480 train_time:109168ms step_avg:152.90ms step:725/1480 train_time:109331ms step_avg:152.91ms step:726/1480 train_time:109495ms step_avg:152.93ms step:727/1480 train_time:109659ms step_avg:152.94ms step:728/1480 train_time:109820ms step_avg:152.95ms step:729/1480 train_time:109981ms step_avg:152.96ms step:730/1480 train_time:110143ms step_avg:152.98ms step:731/1480 train_time:110304ms step_avg:152.99ms step:732/1480 train_time:110464ms step_avg:153.00ms step:733/1480 train_time:110624ms step_avg:153.01ms step:734/1480 train_time:110784ms step_avg:153.02ms step:735/1480 train_time:110945ms step_avg:153.03ms step:736/1480 train_time:111107ms step_avg:153.04ms step:737/1480 train_time:111268ms step_avg:153.05ms step:738/1480 train_time:111429ms step_avg:153.06ms step:739/1480 train_time:111589ms step_avg:153.07ms step:740/1480 train_time:111754ms step_avg:153.09ms step:741/1480 train_time:111918ms step_avg:153.10ms step:742/1480 train_time:112081ms step_avg:153.12ms step:743/1480 train_time:112241ms step_avg:153.13ms step:744/1480 train_time:112405ms step_avg:153.14ms step:745/1480 train_time:112568ms step_avg:153.15ms step:746/1480 train_time:112727ms step_avg:153.16ms step:747/1480 train_time:112889ms step_avg:153.17ms step:748/1480 train_time:113055ms step_avg:153.19ms step:749/1480 train_time:113220ms step_avg:153.21ms step:750/1480 train_time:113380ms step_avg:153.22ms step:750/1480 val_loss:3.5473 train_time:113444ms step_avg:153.30ms step:751/1480 train_time:113545ms step_avg:153.23ms step:752/1480 train_time:113706ms step_avg:153.24ms step:753/1480 train_time:113868ms step_avg:153.25ms step:754/1480 train_time:114029ms step_avg:153.26ms step:755/1480 train_time:114191ms step_avg:153.28ms step:756/1480 train_time:114352ms step_avg:153.29ms step:757/1480 train_time:114516ms step_avg:153.30ms step:758/1480 train_time:114677ms step_avg:153.31ms step:759/1480 train_time:114839ms step_avg:153.32ms step:760/1480 train_time:114999ms step_avg:153.33ms step:761/1480 train_time:115160ms step_avg:153.34ms step:762/1480 train_time:115322ms step_avg:153.35ms step:763/1480 train_time:115484ms step_avg:153.36ms step:764/1480 train_time:115645ms step_avg:153.37ms step:765/1480 train_time:115806ms step_avg:153.39ms step:766/1480 train_time:115968ms step_avg:153.40ms step:767/1480 train_time:116131ms step_avg:153.41ms step:768/1480 train_time:116293ms step_avg:153.42ms step:769/1480 train_time:116457ms step_avg:153.44ms step:770/1480 train_time:116620ms step_avg:153.45ms step:771/1480 train_time:116783ms step_avg:153.46ms step:772/1480 train_time:116944ms step_avg:153.47ms step:773/1480 train_time:117105ms step_avg:153.48ms step:774/1480 train_time:117268ms step_avg:153.49ms step:775/1480 train_time:117430ms step_avg:153.50ms step:776/1480 train_time:117595ms step_avg:153.52ms step:777/1480 train_time:117761ms step_avg:153.53ms step:778/1480 train_time:117924ms step_avg:153.55ms step:779/1480 train_time:118086ms step_avg:153.56ms step:780/1480 train_time:118249ms step_avg:153.57ms step:781/1480 train_time:118413ms step_avg:153.58ms step:782/1480 train_time:118578ms step_avg:153.60ms step:783/1480 train_time:118739ms step_avg:153.61ms step:784/1480 train_time:118901ms step_avg:153.62ms step:785/1480 train_time:119063ms step_avg:153.63ms step:786/1480 train_time:119228ms step_avg:153.64ms step:787/1480 train_time:119391ms step_avg:153.66ms step:788/1480 train_time:119557ms step_avg:153.67ms step:789/1480 train_time:119718ms step_avg:153.68ms step:790/1480 train_time:119883ms step_avg:153.70ms step:791/1480 train_time:120051ms step_avg:153.71ms step:792/1480 train_time:120217ms step_avg:153.73ms step:793/1480 train_time:120379ms step_avg:153.74ms step:794/1480 train_time:120542ms step_avg:153.75ms step:795/1480 train_time:120708ms step_avg:153.77ms step:796/1480 train_time:120873ms step_avg:153.78ms step:797/1480 train_time:121038ms step_avg:153.80ms step:798/1480 train_time:121201ms step_avg:153.81ms step:799/1480 train_time:121368ms step_avg:153.82ms step:800/1480 train_time:121531ms step_avg:153.84ms step:801/1480 train_time:121696ms step_avg:153.85ms step:802/1480 train_time:121863ms step_avg:153.87ms step:803/1480 train_time:122024ms step_avg:153.88ms step:804/1480 train_time:122186ms step_avg:153.89ms step:805/1480 train_time:122352ms step_avg:153.90ms step:806/1480 train_time:122514ms step_avg:153.91ms step:807/1480 train_time:122676ms step_avg:153.92ms step:808/1480 train_time:122840ms step_avg:153.94ms step:809/1480 train_time:123002ms step_avg:153.95ms step:810/1480 train_time:123164ms step_avg:153.95ms step:811/1480 train_time:123328ms step_avg:153.97ms step:812/1480 train_time:123492ms step_avg:153.98ms step:813/1480 train_time:123654ms step_avg:153.99ms step:814/1480 train_time:123818ms step_avg:154.00ms step:815/1480 train_time:123980ms step_avg:154.01ms step:816/1480 train_time:124146ms step_avg:154.03ms step:817/1480 train_time:124308ms step_avg:154.04ms step:818/1480 train_time:124469ms step_avg:154.05ms step:819/1480 train_time:124634ms step_avg:154.06ms step:820/1480 train_time:124796ms step_avg:154.07ms step:821/1480 train_time:124959ms step_avg:154.08ms step:822/1480 train_time:125121ms step_avg:154.09ms step:823/1480 train_time:125283ms step_avg:154.10ms step:824/1480 train_time:125444ms step_avg:154.11ms step:825/1480 train_time:125609ms step_avg:154.12ms step:826/1480 train_time:125776ms step_avg:154.14ms step:827/1480 train_time:125940ms step_avg:154.15ms step:828/1480 train_time:126103ms step_avg:154.16ms step:829/1480 train_time:126266ms step_avg:154.17ms step:830/1480 train_time:126431ms step_avg:154.18ms step:831/1480 train_time:126596ms step_avg:154.20ms step:832/1480 train_time:126760ms step_avg:154.21ms step:833/1480 train_time:126925ms step_avg:154.22ms step:834/1480 train_time:127090ms step_avg:154.24ms step:835/1480 train_time:127254ms step_avg:154.25ms step:836/1480 train_time:127419ms step_avg:154.26ms step:837/1480 train_time:127581ms step_avg:154.27ms step:838/1480 train_time:127743ms step_avg:154.28ms step:839/1480 train_time:127905ms step_avg:154.29ms step:840/1480 train_time:128065ms step_avg:154.30ms step:841/1480 train_time:128226ms step_avg:154.30ms step:842/1480 train_time:128392ms step_avg:154.32ms step:843/1480 train_time:128555ms step_avg:154.33ms step:844/1480 train_time:128718ms step_avg:154.34ms step:845/1480 train_time:128881ms step_avg:154.35ms step:846/1480 train_time:129046ms step_avg:154.36ms step:847/1480 train_time:129210ms step_avg:154.37ms step:848/1480 train_time:129372ms step_avg:154.38ms step:849/1480 train_time:129536ms step_avg:154.39ms step:850/1480 train_time:129699ms step_avg:154.40ms step:851/1480 train_time:129864ms step_avg:154.42ms step:852/1480 train_time:130026ms step_avg:154.43ms step:853/1480 train_time:130188ms step_avg:154.43ms step:854/1480 train_time:130355ms step_avg:154.45ms step:855/1480 train_time:130519ms step_avg:154.46ms step:856/1480 train_time:130679ms step_avg:154.47ms step:857/1480 train_time:130844ms step_avg:154.48ms step:858/1480 train_time:131010ms step_avg:154.49ms step:859/1480 train_time:131175ms step_avg:154.50ms step:860/1480 train_time:131337ms step_avg:154.51ms step:861/1480 train_time:131502ms step_avg:154.53ms step:862/1480 train_time:131672ms step_avg:154.54ms step:863/1480 train_time:131840ms step_avg:154.56ms step:864/1480 train_time:132003ms step_avg:154.57ms step:865/1480 train_time:132163ms step_avg:154.58ms step:866/1480 train_time:132331ms step_avg:154.59ms step:867/1480 train_time:132495ms step_avg:154.60ms step:868/1480 train_time:132657ms step_avg:154.61ms step:869/1480 train_time:132819ms step_avg:154.62ms step:870/1480 train_time:132984ms step_avg:154.63ms step:871/1480 train_time:133146ms step_avg:154.64ms step:872/1480 train_time:133311ms step_avg:154.65ms step:873/1480 train_time:133474ms step_avg:154.66ms step:874/1480 train_time:133639ms step_avg:154.67ms step:875/1480 train_time:133803ms step_avg:154.69ms step:875/1480 val_loss:3.5026 train_time:133867ms step_avg:154.76ms step:876/1480 train_time:133967ms step_avg:154.70ms step:877/1480 train_time:134132ms step_avg:154.71ms step:878/1480 train_time:134294ms step_avg:154.72ms step:879/1480 train_time:134459ms step_avg:154.73ms step:880/1480 train_time:134623ms step_avg:154.74ms step:881/1480 train_time:134786ms step_avg:154.75ms step:882/1480 train_time:134952ms step_avg:154.76ms step:883/1480 train_time:135118ms step_avg:154.77ms step:884/1480 train_time:135286ms step_avg:154.79ms step:885/1480 train_time:135451ms step_avg:154.80ms step:886/1480 train_time:135617ms step_avg:154.81ms step:887/1480 train_time:135785ms step_avg:154.83ms step:888/1480 train_time:135958ms step_avg:154.85ms step:889/1480 train_time:136126ms step_avg:154.86ms step:890/1480 train_time:136289ms step_avg:154.87ms step:891/1480 train_time:136454ms step_avg:154.89ms step:892/1480 train_time:136618ms step_avg:154.90ms step:893/1480 train_time:136781ms step_avg:154.90ms step:894/1480 train_time:136948ms step_avg:154.92ms step:895/1480 train_time:137113ms step_avg:154.93ms step:896/1480 train_time:137280ms step_avg:154.94ms step:897/1480 train_time:137446ms step_avg:154.96ms step:898/1480 train_time:137613ms step_avg:154.97ms step:899/1480 train_time:137777ms step_avg:154.98ms step:900/1480 train_time:137941ms step_avg:154.99ms step:901/1480 train_time:138106ms step_avg:155.00ms step:902/1480 train_time:138269ms step_avg:155.01ms step:903/1480 train_time:138440ms step_avg:155.03ms step:904/1480 train_time:138606ms step_avg:155.04ms step:905/1480 train_time:138768ms step_avg:155.05ms step:906/1480 train_time:138934ms step_avg:155.06ms step:907/1480 train_time:139104ms step_avg:155.08ms step:908/1480 train_time:139267ms step_avg:155.09ms step:909/1480 train_time:139432ms step_avg:155.10ms step:910/1480 train_time:139603ms step_avg:155.11ms step:911/1480 train_time:139768ms step_avg:155.13ms step:912/1480 train_time:139932ms step_avg:155.14ms step:913/1480 train_time:140099ms step_avg:155.15ms step:914/1480 train_time:140267ms step_avg:155.16ms step:915/1480 train_time:140436ms step_avg:155.18ms step:916/1480 train_time:140601ms step_avg:155.19ms step:917/1480 train_time:140765ms step_avg:155.20ms step:918/1480 train_time:140933ms step_avg:155.21ms step:919/1480 train_time:141102ms step_avg:155.23ms step:920/1480 train_time:141268ms step_avg:155.24ms step:921/1480 train_time:141433ms step_avg:155.25ms step:922/1480 train_time:141602ms step_avg:155.27ms step:923/1480 train_time:141765ms step_avg:155.27ms step:924/1480 train_time:141929ms step_avg:155.28ms step:925/1480 train_time:142094ms step_avg:155.29ms step:926/1480 train_time:142257ms step_avg:155.30ms step:927/1480 train_time:142422ms step_avg:155.31ms step:928/1480 train_time:142589ms step_avg:155.33ms step:929/1480 train_time:142754ms step_avg:155.34ms step:930/1480 train_time:142920ms step_avg:155.35ms step:931/1480 train_time:143085ms step_avg:155.36ms step:932/1480 train_time:143250ms step_avg:155.37ms step:933/1480 train_time:143417ms step_avg:155.38ms step:934/1480 train_time:143585ms step_avg:155.40ms step:935/1480 train_time:143755ms step_avg:155.41ms step:936/1480 train_time:143923ms step_avg:155.42ms step:937/1480 train_time:144092ms step_avg:155.44ms step:938/1480 train_time:144254ms step_avg:155.45ms step:939/1480 train_time:144423ms step_avg:155.46ms step:940/1480 train_time:144589ms step_avg:155.47ms step:941/1480 train_time:144753ms step_avg:155.48ms step:942/1480 train_time:144917ms step_avg:155.49ms step:943/1480 train_time:145088ms step_avg:155.51ms step:944/1480 train_time:145261ms step_avg:155.53ms step:945/1480 train_time:145426ms step_avg:155.54ms step:946/1480 train_time:145596ms step_avg:155.55ms step:947/1480 train_time:145765ms step_avg:155.57ms step:948/1480 train_time:145931ms step_avg:155.58ms step:949/1480 train_time:146095ms step_avg:155.59ms step:950/1480 train_time:146260ms step_avg:155.60ms step:951/1480 train_time:146428ms step_avg:155.61ms step:952/1480 train_time:146593ms step_avg:155.62ms step:953/1480 train_time:146760ms step_avg:155.63ms step:954/1480 train_time:146929ms step_avg:155.65ms step:955/1480 train_time:147093ms step_avg:155.65ms step:956/1480 train_time:147259ms step_avg:155.66ms step:957/1480 train_time:147427ms step_avg:155.68ms step:958/1480 train_time:147596ms step_avg:155.69ms step:959/1480 train_time:147761ms step_avg:155.70ms step:960/1480 train_time:147928ms step_avg:155.71ms step:961/1480 train_time:148092ms step_avg:155.72ms step:962/1480 train_time:148255ms step_avg:155.73ms step:963/1480 train_time:148422ms step_avg:155.74ms step:964/1480 train_time:148590ms step_avg:155.75ms step:965/1480 train_time:148754ms step_avg:155.76ms step:966/1480 train_time:148921ms step_avg:155.77ms step:967/1480 train_time:149085ms step_avg:155.78ms step:968/1480 train_time:149249ms step_avg:155.79ms step:969/1480 train_time:149416ms step_avg:155.80ms step:970/1480 train_time:149579ms step_avg:155.81ms step:971/1480 train_time:149744ms step_avg:155.82ms step:972/1480 train_time:149909ms step_avg:155.83ms step:973/1480 train_time:150072ms step_avg:155.84ms step:974/1480 train_time:150240ms step_avg:155.85ms step:975/1480 train_time:150406ms step_avg:155.86ms step:976/1480 train_time:150571ms step_avg:155.87ms step:977/1480 train_time:150735ms step_avg:155.88ms step:978/1480 train_time:150902ms step_avg:155.89ms step:979/1480 train_time:151069ms step_avg:155.90ms step:980/1480 train_time:151233ms step_avg:155.91ms step:981/1480 train_time:151404ms step_avg:155.93ms step:982/1480 train_time:151568ms step_avg:155.93ms step:983/1480 train_time:151732ms step_avg:155.94ms step:984/1480 train_time:151896ms step_avg:155.95ms step:985/1480 train_time:152064ms step_avg:155.96ms step:986/1480 train_time:152229ms step_avg:155.97ms step:987/1480 train_time:152392ms step_avg:155.98ms step:988/1480 train_time:152561ms step_avg:155.99ms step:989/1480 train_time:152726ms step_avg:156.00ms step:990/1480 train_time:152895ms step_avg:156.02ms step:991/1480 train_time:153063ms step_avg:156.03ms step:992/1480 train_time:153236ms step_avg:156.04ms step:993/1480 train_time:153412ms step_avg:156.07ms step:994/1480 train_time:153576ms step_avg:156.07ms step:995/1480 train_time:153740ms step_avg:156.08ms step:996/1480 train_time:153905ms step_avg:156.09ms step:997/1480 train_time:154069ms step_avg:156.10ms step:998/1480 train_time:154232ms step_avg:156.11ms step:999/1480 train_time:154400ms step_avg:156.12ms step:1000/1480 train_time:154568ms step_avg:156.13ms step:1000/1480 val_loss:3.4402 train_time:154635ms step_avg:156.20ms step:1001/1480 train_time:154738ms step_avg:156.14ms step:1002/1480 train_time:154904ms step_avg:156.15ms step:1003/1480 train_time:155074ms step_avg:156.17ms step:1004/1480 train_time:155244ms step_avg:156.18ms step:1005/1480 train_time:155412ms step_avg:156.19ms step:1006/1480 train_time:155581ms step_avg:156.21ms step:1007/1480 train_time:155747ms step_avg:156.22ms step:1008/1480 train_time:155913ms step_avg:156.23ms step:1009/1480 train_time:156086ms step_avg:156.24ms step:1010/1480 train_time:156252ms step_avg:156.25ms step:1011/1480 train_time:156417ms step_avg:156.26ms step:1012/1480 train_time:156583ms step_avg:156.27ms step:1013/1480 train_time:156754ms step_avg:156.29ms step:1014/1480 train_time:156922ms step_avg:156.30ms step:1015/1480 train_time:157091ms step_avg:156.31ms step:1016/1480 train_time:157260ms step_avg:156.32ms step:1017/1480 train_time:157432ms step_avg:156.34ms step:1018/1480 train_time:157599ms step_avg:156.35ms step:1019/1480 train_time:157767ms step_avg:156.36ms step:1020/1480 train_time:157936ms step_avg:156.37ms step:1021/1480 train_time:158102ms step_avg:156.38ms step:1022/1480 train_time:158269ms step_avg:156.39ms step:1023/1480 train_time:158437ms step_avg:156.40ms step:1024/1480 train_time:158604ms step_avg:156.41ms step:1025/1480 train_time:158775ms step_avg:156.43ms step:1026/1480 train_time:158941ms step_avg:156.44ms step:1027/1480 train_time:159108ms step_avg:156.45ms step:1028/1480 train_time:159279ms step_avg:156.46ms step:1029/1480 train_time:159452ms step_avg:156.48ms step:1030/1480 train_time:159620ms step_avg:156.49ms step:1031/1480 train_time:159785ms step_avg:156.50ms step:1032/1480 train_time:159955ms step_avg:156.51ms step:1033/1480 train_time:160122ms step_avg:156.52ms step:1034/1480 train_time:160290ms step_avg:156.53ms step:1035/1480 train_time:160457ms step_avg:156.54ms step:1036/1480 train_time:160624ms step_avg:156.55ms step:1037/1480 train_time:160790ms step_avg:156.56ms step:1038/1480 train_time:160958ms step_avg:156.57ms step:1039/1480 train_time:161129ms step_avg:156.59ms step:1040/1480 train_time:161295ms step_avg:156.60ms step:1041/1480 train_time:161464ms step_avg:156.61ms step:1042/1480 train_time:161629ms step_avg:156.62ms step:1043/1480 train_time:161794ms step_avg:156.62ms step:1044/1480 train_time:161958ms step_avg:156.63ms step:1045/1480 train_time:162129ms step_avg:156.65ms step:1046/1480 train_time:162296ms step_avg:156.66ms step:1047/1480 train_time:162464ms step_avg:156.67ms step:1048/1480 train_time:162631ms step_avg:156.68ms step:1049/1480 train_time:162796ms step_avg:156.68ms step:1050/1480 train_time:162965ms step_avg:156.70ms step:1051/1480 train_time:163135ms step_avg:156.71ms step:1052/1480 train_time:163303ms step_avg:156.72ms step:1053/1480 train_time:163469ms step_avg:156.73ms step:1054/1480 train_time:163638ms step_avg:156.74ms step:1055/1480 train_time:163805ms step_avg:156.75ms step:1056/1480 train_time:163969ms step_avg:156.76ms step:1057/1480 train_time:164135ms step_avg:156.77ms step:1058/1480 train_time:164306ms step_avg:156.78ms step:1059/1480 train_time:164479ms step_avg:156.80ms step:1060/1480 train_time:164647ms step_avg:156.81ms step:1061/1480 train_time:164810ms step_avg:156.81ms step:1062/1480 train_time:164974ms step_avg:156.82ms step:1063/1480 train_time:165140ms step_avg:156.83ms step:1064/1480 train_time:165304ms step_avg:156.84ms step:1065/1480 train_time:165472ms step_avg:156.85ms step:1066/1480 train_time:165640ms step_avg:156.86ms step:1067/1480 train_time:165808ms step_avg:156.87ms step:1068/1480 train_time:165974ms step_avg:156.88ms step:1069/1480 train_time:166146ms step_avg:156.89ms step:1070/1480 train_time:166311ms step_avg:156.90ms step:1071/1480 train_time:166484ms step_avg:156.91ms step:1072/1480 train_time:166649ms step_avg:156.92ms step:1073/1480 train_time:166813ms step_avg:156.93ms step:1074/1480 train_time:166980ms step_avg:156.94ms step:1075/1480 train_time:167150ms step_avg:156.95ms step:1076/1480 train_time:167318ms step_avg:156.96ms step:1077/1480 train_time:167485ms step_avg:156.97ms step:1078/1480 train_time:167659ms step_avg:156.98ms step:1079/1480 train_time:167830ms step_avg:157.00ms step:1080/1480 train_time:168000ms step_avg:157.01ms step:1081/1480 train_time:168166ms step_avg:157.02ms step:1082/1480 train_time:168333ms step_avg:157.03ms step:1083/1480 train_time:168499ms step_avg:157.04ms step:1084/1480 train_time:168668ms step_avg:157.05ms step:1085/1480 train_time:168836ms step_avg:157.06ms step:1086/1480 train_time:169006ms step_avg:157.07ms step:1087/1480 train_time:169173ms step_avg:157.08ms step:1088/1480 train_time:169345ms step_avg:157.09ms step:1089/1480 train_time:169516ms step_avg:157.10ms step:1090/1480 train_time:169688ms step_avg:157.12ms step:1091/1480 train_time:169855ms step_avg:157.13ms step:1092/1480 train_time:170024ms step_avg:157.14ms step:1093/1480 train_time:170191ms step_avg:157.15ms step:1094/1480 train_time:170357ms step_avg:157.16ms step:1095/1480 train_time:170522ms step_avg:157.16ms step:1096/1480 train_time:170689ms step_avg:157.17ms step:1097/1480 train_time:170857ms step_avg:157.18ms step:1098/1480 train_time:171028ms step_avg:157.20ms step:1099/1480 train_time:171200ms step_avg:157.21ms step:1100/1480 train_time:171372ms step_avg:157.22ms step:1101/1480 train_time:171544ms step_avg:157.24ms step:1102/1480 train_time:171715ms step_avg:157.25ms step:1103/1480 train_time:171892ms step_avg:157.27ms step:1104/1480 train_time:172060ms step_avg:157.28ms step:1105/1480 train_time:172230ms step_avg:157.29ms step:1106/1480 train_time:172398ms step_avg:157.30ms step:1107/1480 train_time:172567ms step_avg:157.31ms step:1108/1480 train_time:172732ms step_avg:157.31ms step:1109/1480 train_time:172898ms step_avg:157.32ms step:1110/1480 train_time:173065ms step_avg:157.33ms step:1111/1480 train_time:173231ms step_avg:157.34ms step:1112/1480 train_time:173402ms step_avg:157.35ms step:1113/1480 train_time:173581ms step_avg:157.37ms step:1114/1480 train_time:173754ms step_avg:157.39ms step:1115/1480 train_time:173928ms step_avg:157.40ms step:1116/1480 train_time:174094ms step_avg:157.41ms step:1117/1480 train_time:174268ms step_avg:157.42ms step:1118/1480 train_time:174444ms step_avg:157.44ms step:1119/1480 train_time:174610ms step_avg:157.45ms step:1120/1480 train_time:174778ms step_avg:157.46ms step:1121/1480 train_time:174950ms step_avg:157.47ms step:1122/1480 train_time:175115ms step_avg:157.48ms step:1123/1480 train_time:175283ms step_avg:157.49ms step:1124/1480 train_time:175451ms step_avg:157.50ms step:1125/1480 train_time:175618ms step_avg:157.50ms step:1125/1480 val_loss:3.3849 train_time:175686ms step_avg:157.57ms step:1126/1480 train_time:175789ms step_avg:157.52ms step:1127/1480 train_time:175962ms step_avg:157.53ms step:1128/1480 train_time:176131ms step_avg:157.54ms step:1129/1480 train_time:176304ms step_avg:157.55ms step:1130/1480 train_time:176473ms step_avg:157.57ms step:1131/1480 train_time:176651ms step_avg:157.58ms step:1132/1480 train_time:176818ms step_avg:157.59ms step:1133/1480 train_time:176989ms step_avg:157.60ms step:1134/1480 train_time:177159ms step_avg:157.61ms step:1135/1480 train_time:177326ms step_avg:157.62ms step:1136/1480 train_time:177499ms step_avg:157.64ms step:1137/1480 train_time:177668ms step_avg:157.65ms step:1138/1480 train_time:177841ms step_avg:157.66ms step:1139/1480 train_time:178008ms step_avg:157.67ms step:1140/1480 train_time:178176ms step_avg:157.68ms step:1141/1480 train_time:178346ms step_avg:157.69ms step:1142/1480 train_time:178513ms step_avg:157.70ms step:1143/1480 train_time:178683ms step_avg:157.71ms step:1144/1480 train_time:178853ms step_avg:157.72ms step:1145/1480 train_time:179019ms step_avg:157.73ms step:1146/1480 train_time:179189ms step_avg:157.74ms step:1147/1480 train_time:179360ms step_avg:157.75ms step:1148/1480 train_time:179529ms step_avg:157.76ms step:1149/1480 train_time:179700ms step_avg:157.77ms step:1150/1480 train_time:179868ms step_avg:157.78ms step:1151/1480 train_time:180041ms step_avg:157.79ms step:1152/1480 train_time:180212ms step_avg:157.80ms step:1153/1480 train_time:180385ms step_avg:157.82ms step:1154/1480 train_time:180552ms step_avg:157.83ms step:1155/1480 train_time:180724ms step_avg:157.84ms step:1156/1480 train_time:180903ms step_avg:157.86ms step:1157/1480 train_time:181074ms step_avg:157.87ms step:1158/1480 train_time:181241ms step_avg:157.88ms step:1159/1480 train_time:181408ms step_avg:157.88ms step:1160/1480 train_time:181575ms step_avg:157.89ms step:1161/1480 train_time:181745ms step_avg:157.90ms step:1162/1480 train_time:181916ms step_avg:157.91ms step:1163/1480 train_time:182085ms step_avg:157.92ms step:1164/1480 train_time:182254ms step_avg:157.93ms step:1165/1480 train_time:182419ms step_avg:157.94ms step:1166/1480 train_time:182589ms step_avg:157.95ms step:1167/1480 train_time:182758ms step_avg:157.96ms step:1168/1480 train_time:182926ms step_avg:157.97ms step:1169/1480 train_time:183097ms step_avg:157.98ms step:1170/1480 train_time:183266ms step_avg:157.99ms step:1171/1480 train_time:183434ms step_avg:158.00ms step:1172/1480 train_time:183601ms step_avg:158.00ms step:1173/1480 train_time:183774ms step_avg:158.02ms step:1174/1480 train_time:183956ms step_avg:158.04ms step:1175/1480 train_time:184125ms step_avg:158.05ms step:1176/1480 train_time:184298ms step_avg:158.06ms step:1177/1480 train_time:184477ms step_avg:158.08ms step:1178/1480 train_time:184643ms step_avg:158.08ms step:1179/1480 train_time:184808ms step_avg:158.09ms step:1180/1480 train_time:184988ms step_avg:158.11ms step:1181/1480 train_time:185159ms step_avg:158.12ms step:1182/1480 train_time:185326ms step_avg:158.13ms step:1183/1480 train_time:185496ms step_avg:158.14ms step:1184/1480 train_time:185663ms step_avg:158.15ms step:1185/1480 train_time:185837ms step_avg:158.16ms step:1186/1480 train_time:186008ms step_avg:158.17ms step:1187/1480 train_time:186191ms step_avg:158.19ms step:1188/1480 train_time:186358ms step_avg:158.20ms step:1189/1480 train_time:186530ms step_avg:158.21ms step:1190/1480 train_time:186698ms step_avg:158.22ms step:1191/1480 train_time:186867ms step_avg:158.23ms step:1192/1480 train_time:187033ms step_avg:158.23ms step:1193/1480 train_time:187199ms step_avg:158.24ms step:1194/1480 train_time:187370ms step_avg:158.25ms step:1195/1480 train_time:187544ms step_avg:158.26ms step:1196/1480 train_time:187727ms step_avg:158.29ms step:1197/1480 train_time:187899ms step_avg:158.30ms step:1198/1480 train_time:188081ms step_avg:158.32ms step:1199/1480 train_time:188250ms step_avg:158.33ms step:1200/1480 train_time:188420ms step_avg:158.34ms step:1201/1480 train_time:188588ms step_avg:158.34ms step:1202/1480 train_time:188768ms step_avg:158.36ms step:1203/1480 train_time:188943ms step_avg:158.38ms step:1204/1480 train_time:189118ms step_avg:158.39ms step:1205/1480 train_time:189285ms step_avg:158.40ms step:1206/1480 train_time:189454ms step_avg:158.41ms step:1207/1480 train_time:189623ms step_avg:158.42ms step:1208/1480 train_time:189791ms step_avg:158.42ms step:1209/1480 train_time:189965ms step_avg:158.44ms step:1210/1480 train_time:190141ms step_avg:158.45ms step:1211/1480 train_time:190316ms step_avg:158.46ms step:1212/1480 train_time:190487ms step_avg:158.48ms step:1213/1480 train_time:190660ms step_avg:158.49ms step:1214/1480 train_time:190838ms step_avg:158.50ms step:1215/1480 train_time:191014ms step_avg:158.52ms step:1216/1480 train_time:191184ms step_avg:158.53ms step:1217/1480 train_time:191358ms step_avg:158.54ms step:1218/1480 train_time:191527ms step_avg:158.55ms step:1219/1480 train_time:191706ms step_avg:158.57ms step:1220/1480 train_time:191877ms step_avg:158.58ms step:1221/1480 train_time:192045ms step_avg:158.58ms step:1222/1480 train_time:192212ms step_avg:158.59ms step:1223/1480 train_time:192382ms step_avg:158.60ms step:1224/1480 train_time:192560ms step_avg:158.62ms step:1225/1480 train_time:192733ms step_avg:158.63ms step:1226/1480 train_time:192906ms step_avg:158.64ms step:1227/1480 train_time:193080ms step_avg:158.65ms step:1228/1480 train_time:193250ms step_avg:158.66ms step:1229/1480 train_time:193423ms step_avg:158.67ms step:1230/1480 train_time:193603ms step_avg:158.69ms step:1231/1480 train_time:193779ms step_avg:158.71ms step:1232/1480 train_time:193954ms step_avg:158.72ms step:1233/1480 train_time:194124ms step_avg:158.73ms step:1234/1480 train_time:194295ms step_avg:158.74ms step:1235/1480 train_time:194469ms step_avg:158.75ms step:1236/1480 train_time:194638ms step_avg:158.76ms step:1237/1480 train_time:194808ms step_avg:158.77ms step:1238/1480 train_time:194995ms step_avg:158.79ms step:1239/1480 train_time:195166ms step_avg:158.80ms step:1240/1480 train_time:195336ms step_avg:158.81ms step:1241/1480 train_time:195511ms step_avg:158.82ms step:1242/1480 train_time:195681ms step_avg:158.83ms step:1243/1480 train_time:195856ms step_avg:158.84ms step:1244/1480 train_time:196021ms step_avg:158.85ms step:1245/1480 train_time:196189ms step_avg:158.86ms step:1246/1480 train_time:196360ms step_avg:158.87ms step:1247/1480 train_time:196529ms step_avg:158.88ms step:1248/1480 train_time:196699ms step_avg:158.88ms step:1249/1480 train_time:196866ms step_avg:158.89ms step:1250/1480 train_time:197035ms step_avg:158.90ms step:1250/1480 val_loss:3.3352 train_time:197108ms step_avg:158.96ms step:1251/1480 train_time:197217ms step_avg:158.92ms step:1252/1480 train_time:197386ms step_avg:158.93ms step:1253/1480 train_time:197554ms step_avg:158.93ms step:1254/1480 train_time:197726ms step_avg:158.94ms step:1255/1480 train_time:197911ms step_avg:158.96ms step:1256/1480 train_time:198086ms step_avg:158.98ms step:1257/1480 train_time:198256ms step_avg:158.99ms step:1258/1480 train_time:198429ms step_avg:159.00ms step:1259/1480 train_time:198600ms step_avg:159.01ms step:1260/1480 train_time:198766ms step_avg:159.01ms step:1261/1480 train_time:198939ms step_avg:159.02ms step:1262/1480 train_time:199115ms step_avg:159.04ms step:1263/1480 train_time:199289ms step_avg:159.05ms step:1264/1480 train_time:199455ms step_avg:159.06ms step:1265/1480 train_time:199622ms step_avg:159.06ms step:1266/1480 train_time:199795ms step_avg:159.07ms step:1267/1480 train_time:199965ms step_avg:159.08ms step:1268/1480 train_time:200137ms step_avg:159.09ms step:1269/1480 train_time:200313ms step_avg:159.10ms step:1270/1480 train_time:200483ms step_avg:159.11ms step:1271/1480 train_time:200654ms step_avg:159.12ms step:1272/1480 train_time:200820ms step_avg:159.13ms step:1273/1480 train_time:200990ms step_avg:159.14ms step:1274/1480 train_time:201162ms step_avg:159.15ms step:1275/1480 train_time:201330ms step_avg:159.15ms step:1276/1480 train_time:201498ms step_avg:159.16ms step:1277/1480 train_time:201669ms step_avg:159.17ms step:1278/1480 train_time:201837ms step_avg:159.18ms step:1279/1480 train_time:202009ms step_avg:159.19ms step:1280/1480 train_time:202187ms step_avg:159.20ms step:1281/1480 train_time:202358ms step_avg:159.21ms step:1282/1480 train_time:202524ms step_avg:159.22ms step:1283/1480 train_time:202696ms step_avg:159.23ms step:1284/1480 train_time:202866ms step_avg:159.24ms step:1285/1480 train_time:203034ms step_avg:159.24ms step:1286/1480 train_time:203204ms step_avg:159.25ms step:1287/1480 train_time:203375ms step_avg:159.26ms step:1288/1480 train_time:203545ms step_avg:159.27ms step:1289/1480 train_time:203730ms step_avg:159.29ms step:1290/1480 train_time:203910ms step_avg:159.30ms step:1291/1480 train_time:204083ms step_avg:159.32ms step:1292/1480 train_time:204258ms step_avg:159.33ms step:1293/1480 train_time:204432ms step_avg:159.34ms step:1294/1480 train_time:204604ms step_avg:159.35ms step:1295/1480 train_time:204774ms step_avg:159.36ms step:1296/1480 train_time:204947ms step_avg:159.37ms step:1297/1480 train_time:205119ms step_avg:159.38ms step:1298/1480 train_time:205288ms step_avg:159.38ms step:1299/1480 train_time:205458ms step_avg:159.39ms step:1300/1480 train_time:205626ms step_avg:159.40ms step:1301/1480 train_time:205797ms step_avg:159.41ms step:1302/1480 train_time:205970ms step_avg:159.42ms step:1303/1480 train_time:206147ms step_avg:159.43ms step:1304/1480 train_time:206322ms step_avg:159.44ms step:1305/1480 train_time:206490ms step_avg:159.45ms step:1306/1480 train_time:206665ms step_avg:159.46ms step:1307/1480 train_time:206833ms step_avg:159.47ms step:1308/1480 train_time:207003ms step_avg:159.48ms step:1309/1480 train_time:207175ms step_avg:159.49ms step:1310/1480 train_time:207344ms step_avg:159.50ms step:1311/1480 train_time:207511ms step_avg:159.50ms step:1312/1480 train_time:207685ms step_avg:159.51ms step:1313/1480 train_time:207855ms step_avg:159.52ms step:1314/1480 train_time:208027ms step_avg:159.53ms step:1315/1480 train_time:208199ms step_avg:159.54ms step:1316/1480 train_time:208365ms step_avg:159.54ms step:1317/1480 train_time:208536ms step_avg:159.55ms step:1318/1480 train_time:208715ms step_avg:159.57ms step:1319/1480 train_time:208891ms step_avg:159.58ms step:1320/1480 train_time:209067ms step_avg:159.59ms step:1321/1480 train_time:209240ms step_avg:159.60ms step:1322/1480 train_time:209420ms step_avg:159.62ms step:1323/1480 train_time:209590ms step_avg:159.63ms step:1324/1480 train_time:209766ms step_avg:159.64ms step:1325/1480 train_time:209948ms step_avg:159.66ms step:1326/1480 train_time:210124ms step_avg:159.67ms step:1327/1480 train_time:210294ms step_avg:159.68ms step:1328/1480 train_time:210464ms step_avg:159.68ms step:1329/1480 train_time:210660ms step_avg:159.71ms step:1330/1480 train_time:210840ms step_avg:159.73ms step:1331/1480 train_time:211010ms step_avg:159.73ms step:1332/1480 train_time:211185ms step_avg:159.75ms step:1333/1480 train_time:211360ms step_avg:159.76ms step:1334/1480 train_time:211531ms step_avg:159.77ms step:1335/1480 train_time:211701ms step_avg:159.77ms step:1336/1480 train_time:211884ms step_avg:159.79ms step:1337/1480 train_time:212059ms step_avg:159.80ms step:1338/1480 train_time:212230ms step_avg:159.81ms step:1339/1480 train_time:212405ms step_avg:159.82ms step:1340/1480 train_time:212576ms step_avg:159.83ms step:1341/1480 train_time:212744ms step_avg:159.84ms step:1342/1480 train_time:212920ms step_avg:159.85ms step:1343/1480 train_time:213089ms step_avg:159.86ms step:1344/1480 train_time:213261ms step_avg:159.87ms step:1345/1480 train_time:213439ms step_avg:159.88ms step:1346/1480 train_time:213609ms step_avg:159.89ms step:1347/1480 train_time:213780ms step_avg:159.90ms step:1348/1480 train_time:213949ms step_avg:159.90ms step:1349/1480 train_time:214119ms step_avg:159.91ms step:1350/1480 train_time:214292ms step_avg:159.92ms step:1351/1480 train_time:214463ms step_avg:159.93ms step:1352/1480 train_time:214633ms step_avg:159.94ms step:1353/1480 train_time:214807ms step_avg:159.95ms step:1354/1480 train_time:214979ms step_avg:159.95ms step:1355/1480 train_time:215147ms step_avg:159.96ms step:1356/1480 train_time:215320ms step_avg:159.97ms step:1357/1480 train_time:215493ms step_avg:159.98ms step:1358/1480 train_time:215664ms step_avg:159.99ms step:1359/1480 train_time:215836ms step_avg:160.00ms step:1360/1480 train_time:216011ms step_avg:160.01ms step:1361/1480 train_time:216189ms step_avg:160.02ms step:1362/1480 train_time:216363ms step_avg:160.03ms step:1363/1480 train_time:216543ms step_avg:160.05ms step:1364/1480 train_time:216711ms step_avg:160.05ms step:1365/1480 train_time:216880ms step_avg:160.06ms step:1366/1480 train_time:217051ms step_avg:160.07ms step:1367/1480 train_time:217222ms step_avg:160.08ms step:1368/1480 train_time:217396ms step_avg:160.09ms step:1369/1480 train_time:217577ms step_avg:160.10ms step:1370/1480 train_time:217755ms step_avg:160.11ms step:1371/1480 train_time:217927ms step_avg:160.12ms step:1372/1480 train_time:218105ms step_avg:160.14ms step:1373/1480 train_time:218275ms step_avg:160.14ms step:1374/1480 train_time:218450ms step_avg:160.15ms step:1375/1480 train_time:218621ms step_avg:160.16ms step:1375/1480 val_loss:3.2968 train_time:218688ms step_avg:160.21ms step:1376/1480 train_time:218793ms step_avg:160.17ms step:1377/1480 train_time:218966ms step_avg:160.18ms step:1378/1480 train_time:219135ms step_avg:160.19ms step:1379/1480 train_time:219313ms step_avg:160.20ms step:1380/1480 train_time:219487ms step_avg:160.21ms step:1381/1480 train_time:219668ms step_avg:160.22ms step:1382/1480 train_time:219839ms step_avg:160.23ms step:1383/1480 train_time:220012ms step_avg:160.24ms step:1384/1480 train_time:220191ms step_avg:160.26ms step:1385/1480 train_time:220356ms step_avg:160.26ms step:1386/1480 train_time:220526ms step_avg:160.27ms step:1387/1480 train_time:220698ms step_avg:160.27ms step:1388/1480 train_time:220868ms step_avg:160.28ms step:1389/1480 train_time:221040ms step_avg:160.29ms step:1390/1480 train_time:221210ms step_avg:160.30ms step:1391/1480 train_time:221379ms step_avg:160.30ms step:1392/1480 train_time:221552ms step_avg:160.31ms step:1393/1480 train_time:221722ms step_avg:160.32ms step:1394/1480 train_time:221894ms step_avg:160.33ms step:1395/1480 train_time:222061ms step_avg:160.33ms step:1396/1480 train_time:222230ms step_avg:160.34ms step:1397/1480 train_time:222397ms step_avg:160.34ms step:1398/1480 train_time:222564ms step_avg:160.35ms step:1399/1480 train_time:222735ms step_avg:160.36ms step:1400/1480 train_time:222913ms step_avg:160.37ms step:1401/1480 train_time:223078ms step_avg:160.37ms step:1402/1480 train_time:223250ms step_avg:160.38ms step:1403/1480 train_time:223426ms step_avg:160.39ms step:1404/1480 train_time:223596ms step_avg:160.40ms step:1405/1480 train_time:223773ms step_avg:160.41ms step:1406/1480 train_time:223948ms step_avg:160.42ms step:1407/1480 train_time:224116ms step_avg:160.43ms step:1408/1480 train_time:224284ms step_avg:160.43ms step:1409/1480 train_time:224468ms step_avg:160.45ms step:1410/1480 train_time:224637ms step_avg:160.45ms step:1411/1480 train_time:224806ms step_avg:160.46ms step:1412/1480 train_time:224976ms step_avg:160.47ms step:1413/1480 train_time:225147ms step_avg:160.48ms step:1414/1480 train_time:225318ms step_avg:160.48ms step:1415/1480 train_time:225494ms step_avg:160.49ms step:1416/1480 train_time:225681ms step_avg:160.51ms step:1417/1480 train_time:225856ms step_avg:160.52ms step:1418/1480 train_time:226028ms step_avg:160.53ms step:1419/1480 train_time:226201ms step_avg:160.54ms step:1420/1480 train_time:226376ms step_avg:160.55ms step:1421/1480 train_time:226550ms step_avg:160.56ms step:1422/1480 train_time:226720ms step_avg:160.57ms step:1423/1480 train_time:226890ms step_avg:160.57ms step:1424/1480 train_time:227067ms step_avg:160.58ms step:1425/1480 train_time:227246ms step_avg:160.60ms step:1426/1480 train_time:227418ms step_avg:160.61ms step:1427/1480 train_time:227595ms step_avg:160.62ms step:1428/1480 train_time:227766ms step_avg:160.63ms step:1429/1480 train_time:227935ms step_avg:160.63ms step:1430/1480 train_time:228110ms step_avg:160.64ms step:1431/1480 train_time:228284ms step_avg:160.65ms step:1432/1480 train_time:228461ms step_avg:160.66ms step:1433/1480 train_time:228641ms step_avg:160.68ms step:1434/1480 train_time:228821ms step_avg:160.69ms step:1435/1480 train_time:228995ms step_avg:160.70ms step:1436/1480 train_time:229169ms step_avg:160.71ms step:1437/1480 train_time:229339ms step_avg:160.71ms step:1438/1480 train_time:229508ms step_avg:160.72ms step:1439/1480 train_time:229679ms step_avg:160.73ms step:1440/1480 train_time:229851ms step_avg:160.73ms step:1441/1480 train_time:230022ms step_avg:160.74ms step:1442/1480 train_time:230200ms step_avg:160.75ms step:1443/1480 train_time:230390ms step_avg:160.77ms step:1444/1480 train_time:230561ms step_avg:160.78ms step:1445/1480 train_time:230733ms step_avg:160.79ms step:1446/1480 train_time:230910ms step_avg:160.80ms step:1447/1480 train_time:231089ms step_avg:160.81ms step:1448/1480 train_time:231259ms step_avg:160.82ms step:1449/1480 train_time:231432ms step_avg:160.83ms step:1450/1480 train_time:231604ms step_avg:160.84ms step:1451/1480 train_time:231776ms step_avg:160.84ms step:1452/1480 train_time:231950ms step_avg:160.85ms step:1453/1480 train_time:232119ms step_avg:160.86ms step:1454/1480 train_time:232292ms step_avg:160.87ms step:1455/1480 train_time:232472ms step_avg:160.88ms step:1456/1480 train_time:232644ms step_avg:160.89ms step:1457/1480 train_time:232817ms step_avg:160.90ms step:1458/1480 train_time:232989ms step_avg:160.90ms step:1459/1480 train_time:233165ms step_avg:160.91ms step:1460/1480 train_time:233336ms step_avg:160.92ms step:1461/1480 train_time:233512ms step_avg:160.93ms step:1462/1480 train_time:233682ms step_avg:160.94ms step:1463/1480 train_time:233859ms step_avg:160.95ms step:1464/1480 train_time:234034ms step_avg:160.96ms step:1465/1480 train_time:234205ms step_avg:160.97ms step:1466/1480 train_time:234375ms step_avg:160.97ms step:1467/1480 train_time:234549ms step_avg:160.98ms step:1468/1480 train_time:234717ms step_avg:160.99ms step:1469/1480 train_time:234891ms step_avg:160.99ms step:1470/1480 train_time:235072ms step_avg:161.01ms step:1471/1480 train_time:235257ms step_avg:161.02ms step:1472/1480 train_time:235437ms step_avg:161.04ms step:1473/1480 train_time:235610ms step_avg:161.05ms step:1474/1480 train_time:235789ms step_avg:161.06ms step:1475/1480 train_time:235969ms step_avg:161.07ms step:1476/1480 train_time:236141ms step_avg:161.08ms step:1477/1480 train_time:236324ms step_avg:161.09ms step:1478/1480 train_time:236509ms step_avg:161.11ms step:1479/1480 train_time:236680ms step_avg:161.12ms step:1480/1480 train_time:236851ms step_avg:161.12ms step:1480/1480 val_loss:3.2777 train_time:236922ms step_avg:161.17ms