import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension; no weight is passed to F.rms_norm."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free nn.Linear whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        # One inverse frequency per pair of channels: base ** -(2i / dim).
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        """
        Rotate x by position. x is indexed as 4-D here (sequence on dim 1, channels on
        dim 3); CausalSelfAttention passes (B, T, n_head, head_dim).
        """
        seq_len = x.shape[1]
        # The cache is keyed on sequence length only; the tables are rebuilt on x.device
        # whenever the length changes.
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention via FlexAttention, with QK-norm, rotary, and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Attend over x and return a tensor shaped like x.

        `vi` is mixed into the values (it is reshaped with view_as(v), so it must hold
        as many elements as v); `block_mask` is forwarded to flex_attention unchanged.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with heads on dim 1, hence the transposes.
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc   = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: learned mix with x0, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # Initialised to [1, 0], so at init the layer input is x alone.
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) via tanh in GPT.forward

class GPT(nn.Module):
    """
    Decoder-only transformer arranged as a U-net: the first half of the layers push their
    outputs onto a stack, the second half pop them back in through learned skip weights.
    forward() returns the cross-entropy loss rather than logits.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the training loss for one flat token sequence.

        Arguments:
            idx: 1-D tensor of token ids (asserted). Its length must be a multiple of
                BLOCK_SIZE, since it is reshaped to (-1, BLOCK_SIZE) below.
            target: Token ids compared against the logits with cross-entropy.
            sliding_window: Tensor holding the attention window length in tokens.

        Returns the scalar cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running count of token id 50256, i.e. a document index per position
        # (presumably 50256 is the end-of-text separator - confirm against the tokenizer).
        docs = (idx == 50256).cumsum(0)
        # Document index at the first / last token of every BLOCK_SIZE-token block.
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # Token-level rule: causal, same document, and within the window.
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the rule above: a (q block, kv block) pair is kept
            # when it is causal, the two blocks' document ranges overlap, and the block
            # distance is below the window rounded up to whole blocks.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # Stable descending sort of the 0/1 mask lists the kept kv blocks first, in
            # their original order; num_blocks says how many of them are valid per row.
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # One value-embedding chunk per encoder layer; decoder layers reuse them below.
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() pairs decoder layer i with the most recent unused encoder output.
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate a shard's header and return the token count stored in it."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the 256 x int32 header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Streams (input, target) token windows from sorted .bin shards, one shard in memory
    at a time. Each process reads its own T-token window out of every
    `T * num_processes`-token stride.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # Every shard must hold at least one full stride plus the extra target token.
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Rewind to the start of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # Wraps around to the first shard after the last one.
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (x, y) CUDA tensors of T tokens each, with y shifted one token ahead of x."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    # Per-run directory name; kept for the (currently disabled) checkpoint path.
    # It is deliberately not created here - only the flat log file below is written.
    logdir = 'logs/%s/' % run_id
    # The log file lives directly under logs/, so make sure that directory exists;
    # otherwise open() raises FileNotFoundError on a fresh checkout.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Log `s` from the master process only; every other rank is a no-op.

    The line is always appended to the run's log file, and also printed to stdout
    unless `logonly` is true.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# Gradient accumulation is not implemented by the training loop, so the global batch
# size must equal the number of processes.
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# Prefetch the first training batch; the training loop fetches each following one
# right after the backward pass.
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# The whole model is cast to bfloat16 above; CastedLinear modules are then put back in
# float32 (their forward casts the weight to the input dtype on the fly).
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# Adam for both embedding tables, Adam for the output head, Muon for the 2-D weights
# inside the transformer blocks, Adam for everything below 2-D plus the skip weights.
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the learning-rate multiplier for iteration `it` (used as a LambdaLR lambda).

    Ramps linearly during the first `warmup_iters` iterations, holds at 1.0, then decays
    linearly over the last `cooldown_iters` iterations, reaching 0 at `num_iterations`.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        # NOTE(review): with cooldown_iters == 0 this branch is still reached at
        # it == num_iterations and divides by zero.
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# The window length lives in a CUDA tensor that is updated in place, so the same tensor
# object is handed to the model on every step.
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    # Divisor for the step_avg figures below; NaN makes the early averages print as "nan".
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # Grows linearly from 64 to 1792 tokens over the run, rounded down to a multiple of 64.
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # Average across processes first, then across the validation steps.
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE(review): the torch.save call is commented out, so this dict is built and
        # then discarded - no checkpoint is written.
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # Linear ramp from 0.85 to 0.95 over the first 300 steps.
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # No CUDA synchronize here, so this reading is approximate (hence the name).
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 09:16:12 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 44C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 75W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 99W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 97W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 109W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 45C P0 77W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 83W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type 
Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22737ms step_avg:nanms step:2/1480 train_time:22838ms step_avg:nanms step:3/1480 train_time:22978ms step_avg:nanms step:4/1480 train_time:23120ms step_avg:nanms step:5/1480 train_time:23262ms step_avg:nanms step:6/1480 train_time:23404ms step_avg:nanms step:7/1480 train_time:23546ms step_avg:nanms step:8/1480 train_time:23688ms step_avg:nanms step:9/1480 train_time:23833ms step_avg:nanms step:10/1480 train_time:23975ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:426ms step_avg:142.03ms step:14/1480 train_time:569ms step_avg:142.20ms step:15/1480 train_time:711ms step_avg:142.24ms step:16/1480 train_time:854ms step_avg:142.35ms step:17/1480 train_time:997ms step_avg:142.37ms step:18/1480 train_time:1139ms step_avg:142.33ms step:19/1480 train_time:1280ms step_avg:142.24ms step:20/1480 train_time:1424ms step_avg:142.42ms step:21/1480 train_time:1569ms step_avg:142.61ms step:22/1480 train_time:1712ms step_avg:142.64ms step:23/1480 train_time:1856ms step_avg:142.73ms step:24/1480 train_time:1996ms step_avg:142.58ms step:25/1480 train_time:2138ms step_avg:142.53ms step:26/1480 train_time:2279ms step_avg:142.46ms step:27/1480 train_time:2423ms step_avg:142.55ms step:28/1480 train_time:2568ms step_avg:142.69ms step:29/1480 train_time:2711ms 
step_avg:142.68ms step:30/1480 train_time:2853ms step_avg:142.64ms step:31/1480 train_time:2995ms step_avg:142.61ms step:32/1480 train_time:3137ms step_avg:142.59ms step:33/1480 train_time:3279ms step_avg:142.55ms step:34/1480 train_time:3421ms step_avg:142.55ms step:35/1480 train_time:3566ms step_avg:142.65ms step:36/1480 train_time:3709ms step_avg:142.66ms step:37/1480 train_time:3853ms step_avg:142.71ms step:38/1480 train_time:3994ms step_avg:142.64ms step:39/1480 train_time:4136ms step_avg:142.61ms step:40/1480 train_time:4277ms step_avg:142.56ms step:41/1480 train_time:4418ms step_avg:142.53ms step:42/1480 train_time:4561ms step_avg:142.54ms step:43/1480 train_time:4705ms step_avg:142.57ms step:44/1480 train_time:4849ms step_avg:142.63ms step:45/1480 train_time:4993ms step_avg:142.65ms step:46/1480 train_time:5135ms step_avg:142.64ms step:47/1480 train_time:5276ms step_avg:142.59ms step:48/1480 train_time:5418ms step_avg:142.58ms step:49/1480 train_time:5563ms step_avg:142.64ms step:50/1480 train_time:5708ms step_avg:142.70ms step:51/1480 train_time:5852ms step_avg:142.73ms step:52/1480 train_time:5994ms step_avg:142.70ms step:53/1480 train_time:6136ms step_avg:142.70ms step:54/1480 train_time:6278ms step_avg:142.67ms step:55/1480 train_time:6421ms step_avg:142.69ms step:56/1480 train_time:6565ms step_avg:142.72ms step:57/1480 train_time:6709ms step_avg:142.75ms step:58/1480 train_time:6853ms step_avg:142.76ms step:59/1480 train_time:6995ms step_avg:142.75ms step:60/1480 train_time:7136ms step_avg:142.72ms step:61/1480 train_time:7277ms step_avg:142.69ms step:62/1480 train_time:7419ms step_avg:142.68ms step:63/1480 train_time:7563ms step_avg:142.70ms step:64/1480 train_time:7708ms step_avg:142.74ms step:65/1480 train_time:7852ms step_avg:142.76ms step:66/1480 train_time:7994ms step_avg:142.75ms step:67/1480 train_time:8135ms step_avg:142.72ms step:68/1480 train_time:8276ms step_avg:142.70ms step:69/1480 train_time:8418ms step_avg:142.68ms step:70/1480 
train_time:8561ms step_avg:142.68ms step:71/1480 train_time:8706ms step_avg:142.72ms step:72/1480 train_time:8850ms step_avg:142.74ms step:73/1480 train_time:8993ms step_avg:142.75ms step:74/1480 train_time:9136ms step_avg:142.74ms step:75/1480 train_time:9277ms step_avg:142.72ms step:76/1480 train_time:9419ms step_avg:142.71ms step:77/1480 train_time:9563ms step_avg:142.73ms step:78/1480 train_time:9708ms step_avg:142.77ms step:79/1480 train_time:9852ms step_avg:142.78ms step:80/1480 train_time:9994ms step_avg:142.77ms step:81/1480 train_time:10136ms step_avg:142.76ms step:82/1480 train_time:10277ms step_avg:142.74ms step:83/1480 train_time:10418ms step_avg:142.71ms step:84/1480 train_time:10560ms step_avg:142.70ms step:85/1480 train_time:10704ms step_avg:142.72ms step:86/1480 train_time:10849ms step_avg:142.75ms step:87/1480 train_time:10992ms step_avg:142.76ms step:88/1480 train_time:11134ms step_avg:142.75ms step:89/1480 train_time:11275ms step_avg:142.72ms step:90/1480 train_time:11416ms step_avg:142.70ms step:91/1480 train_time:11558ms step_avg:142.69ms step:92/1480 train_time:11701ms step_avg:142.69ms step:93/1480 train_time:11844ms step_avg:142.70ms step:94/1480 train_time:11988ms step_avg:142.71ms step:95/1480 train_time:12131ms step_avg:142.72ms step:96/1480 train_time:12272ms step_avg:142.70ms step:97/1480 train_time:12413ms step_avg:142.68ms step:98/1480 train_time:12555ms step_avg:142.67ms step:99/1480 train_time:12696ms step_avg:142.65ms step:100/1480 train_time:12841ms step_avg:142.68ms step:101/1480 train_time:12984ms step_avg:142.68ms step:102/1480 train_time:13128ms step_avg:142.70ms step:103/1480 train_time:13272ms step_avg:142.71ms step:104/1480 train_time:13412ms step_avg:142.69ms step:105/1480 train_time:13555ms step_avg:142.68ms step:106/1480 train_time:13696ms step_avg:142.67ms step:107/1480 train_time:13838ms step_avg:142.66ms step:108/1480 train_time:13980ms step_avg:142.66ms step:109/1480 train_time:14124ms step_avg:142.67ms step:110/1480 
train_time:14268ms step_avg:142.68ms step:111/1480 train_time:14413ms step_avg:142.70ms step:112/1480 train_time:14560ms step_avg:142.74ms step:113/1480 train_time:14707ms step_avg:142.78ms step:114/1480 train_time:14854ms step_avg:142.82ms step:115/1480 train_time:15000ms step_avg:142.86ms step:116/1480 train_time:15148ms step_avg:142.91ms step:117/1480 train_time:15296ms step_avg:142.95ms step:118/1480 train_time:15443ms step_avg:142.99ms step:119/1480 train_time:15590ms step_avg:143.02ms step:120/1480 train_time:15737ms step_avg:143.06ms step:121/1480 train_time:15883ms step_avg:143.09ms step:122/1480 train_time:16032ms step_avg:143.15ms step:123/1480 train_time:16179ms step_avg:143.17ms step:124/1480 train_time:16326ms step_avg:143.21ms step:125/1480 train_time:16473ms step_avg:143.24ms step:125/1480 val_loss:4.4230 train_time:16530ms step_avg:143.74ms step:126/1480 train_time:16627ms step_avg:143.33ms step:127/1480 train_time:16777ms step_avg:143.40ms step:128/1480 train_time:16924ms step_avg:143.43ms step:129/1480 train_time:17070ms step_avg:143.45ms step:130/1480 train_time:17215ms step_avg:143.46ms step:131/1480 train_time:17361ms step_avg:143.48ms step:132/1480 train_time:17507ms step_avg:143.50ms step:133/1480 train_time:17653ms step_avg:143.52ms step:134/1480 train_time:17802ms step_avg:143.57ms step:135/1480 train_time:17949ms step_avg:143.59ms step:136/1480 train_time:18097ms step_avg:143.62ms step:137/1480 train_time:18242ms step_avg:143.64ms step:138/1480 train_time:18388ms step_avg:143.66ms step:139/1480 train_time:18533ms step_avg:143.67ms step:140/1480 train_time:18682ms step_avg:143.71ms step:141/1480 train_time:18828ms step_avg:143.73ms step:142/1480 train_time:18975ms step_avg:143.75ms step:143/1480 train_time:19122ms step_avg:143.77ms step:144/1480 train_time:19269ms step_avg:143.80ms step:145/1480 train_time:19415ms step_avg:143.81ms step:146/1480 train_time:19563ms step_avg:143.84ms step:147/1480 train_time:19709ms step_avg:143.86ms 
step:148/1480 train_time:19857ms step_avg:143.89ms step:149/1480 train_time:20005ms step_avg:143.92ms step:150/1480 train_time:20151ms step_avg:143.93ms step:151/1480 train_time:20300ms step_avg:143.97ms step:152/1480 train_time:20446ms step_avg:143.99ms step:153/1480 train_time:20593ms step_avg:144.00ms step:154/1480 train_time:20741ms step_avg:144.03ms step:155/1480 train_time:20888ms step_avg:144.05ms step:156/1480 train_time:21034ms step_avg:144.07ms step:157/1480 train_time:21182ms step_avg:144.09ms step:158/1480 train_time:21327ms step_avg:144.10ms step:159/1480 train_time:21474ms step_avg:144.12ms step:160/1480 train_time:21620ms step_avg:144.14ms step:161/1480 train_time:21767ms step_avg:144.15ms step:162/1480 train_time:21912ms step_avg:144.16ms step:163/1480 train_time:22060ms step_avg:144.19ms step:164/1480 train_time:22207ms step_avg:144.20ms step:165/1480 train_time:22353ms step_avg:144.21ms step:166/1480 train_time:22501ms step_avg:144.24ms step:167/1480 train_time:22647ms step_avg:144.25ms step:168/1480 train_time:22794ms step_avg:144.27ms step:169/1480 train_time:22942ms step_avg:144.29ms step:170/1480 train_time:23089ms step_avg:144.31ms step:171/1480 train_time:23236ms step_avg:144.33ms step:172/1480 train_time:23385ms step_avg:144.35ms step:173/1480 train_time:23530ms step_avg:144.35ms step:174/1480 train_time:23676ms step_avg:144.36ms step:175/1480 train_time:23823ms step_avg:144.38ms step:176/1480 train_time:23969ms step_avg:144.39ms step:177/1480 train_time:24115ms step_avg:144.40ms step:178/1480 train_time:24264ms step_avg:144.43ms step:179/1480 train_time:24410ms step_avg:144.44ms step:180/1480 train_time:24557ms step_avg:144.45ms step:181/1480 train_time:24705ms step_avg:144.47ms step:182/1480 train_time:24851ms step_avg:144.48ms step:183/1480 train_time:25000ms step_avg:144.51ms step:184/1480 train_time:25147ms step_avg:144.52ms step:185/1480 train_time:25295ms step_avg:144.54ms step:186/1480 train_time:25441ms step_avg:144.55ms 
step:187/1480 train_time:25588ms step_avg:144.56ms step:188/1480 train_time:25734ms step_avg:144.57ms step:189/1480 train_time:25883ms step_avg:144.60ms step:190/1480 train_time:26028ms step_avg:144.60ms step:191/1480 train_time:26174ms step_avg:144.61ms step:192/1480 train_time:26321ms step_avg:144.62ms step:193/1480 train_time:26468ms step_avg:144.63ms step:194/1480 train_time:26614ms step_avg:144.64ms step:195/1480 train_time:26762ms step_avg:144.66ms step:196/1480 train_time:26908ms step_avg:144.67ms step:197/1480 train_time:27053ms step_avg:144.67ms step:198/1480 train_time:27200ms step_avg:144.68ms step:199/1480 train_time:27347ms step_avg:144.69ms step:200/1480 train_time:27495ms step_avg:144.71ms step:201/1480 train_time:27642ms step_avg:144.72ms step:202/1480 train_time:27788ms step_avg:144.73ms step:203/1480 train_time:27935ms step_avg:144.74ms step:204/1480 train_time:28083ms step_avg:144.76ms step:205/1480 train_time:28229ms step_avg:144.76ms step:206/1480 train_time:28376ms step_avg:144.78ms step:207/1480 train_time:28524ms step_avg:144.79ms step:208/1480 train_time:28671ms step_avg:144.80ms step:209/1480 train_time:28818ms step_avg:144.81ms step:210/1480 train_time:28966ms step_avg:144.83ms step:211/1480 train_time:29110ms step_avg:144.83ms step:212/1480 train_time:29258ms step_avg:144.84ms step:213/1480 train_time:29405ms step_avg:144.85ms step:214/1480 train_time:29550ms step_avg:144.85ms step:215/1480 train_time:29699ms step_avg:144.87ms step:216/1480 train_time:29845ms step_avg:144.88ms step:217/1480 train_time:29992ms step_avg:144.89ms step:218/1480 train_time:30139ms step_avg:144.90ms step:219/1480 train_time:30286ms step_avg:144.91ms step:220/1480 train_time:30431ms step_avg:144.91ms step:221/1480 train_time:30580ms step_avg:144.93ms step:222/1480 train_time:30730ms step_avg:144.95ms step:223/1480 train_time:30881ms step_avg:144.98ms step:224/1480 train_time:31031ms step_avg:145.00ms step:225/1480 train_time:31182ms step_avg:145.03ms 
step:226/1480 train_time:31331ms step_avg:145.05ms step:227/1480 train_time:31484ms step_avg:145.09ms step:228/1480 train_time:31633ms step_avg:145.11ms step:229/1480 train_time:31784ms step_avg:145.13ms step:230/1480 train_time:31934ms step_avg:145.16ms step:231/1480 train_time:32085ms step_avg:145.18ms step:232/1480 train_time:32234ms step_avg:145.20ms step:233/1480 train_time:32383ms step_avg:145.22ms step:234/1480 train_time:32534ms step_avg:145.24ms step:235/1480 train_time:32685ms step_avg:145.27ms step:236/1480 train_time:32835ms step_avg:145.29ms step:237/1480 train_time:32986ms step_avg:145.31ms step:238/1480 train_time:33135ms step_avg:145.33ms step:239/1480 train_time:33287ms step_avg:145.36ms step:240/1480 train_time:33435ms step_avg:145.37ms step:241/1480 train_time:33586ms step_avg:145.40ms step:242/1480 train_time:33737ms step_avg:145.42ms step:243/1480 train_time:33888ms step_avg:145.44ms step:244/1480 train_time:34039ms step_avg:145.47ms step:245/1480 train_time:34190ms step_avg:145.49ms step:246/1480 train_time:34342ms step_avg:145.52ms step:247/1480 train_time:34492ms step_avg:145.53ms step:248/1480 train_time:34643ms step_avg:145.56ms step:249/1480 train_time:34792ms step_avg:145.57ms step:250/1480 train_time:34944ms step_avg:145.60ms step:250/1480 val_loss:3.9959 train_time:35003ms step_avg:145.85ms step:251/1480 train_time:35099ms step_avg:145.64ms step:252/1480 train_time:35251ms step_avg:145.66ms step:253/1480 train_time:35400ms step_avg:145.68ms step:254/1480 train_time:35550ms step_avg:145.70ms step:255/1480 train_time:35699ms step_avg:145.71ms step:256/1480 train_time:35848ms step_avg:145.73ms step:257/1480 train_time:35999ms step_avg:145.75ms step:258/1480 train_time:36152ms step_avg:145.77ms step:259/1480 train_time:36304ms step_avg:145.80ms step:260/1480 train_time:36454ms step_avg:145.82ms step:261/1480 train_time:36603ms step_avg:145.83ms step:262/1480 train_time:36753ms step_avg:145.85ms step:263/1480 train_time:36902ms 
step_avg:145.86ms step:264/1480 train_time:37053ms step_avg:145.88ms step:265/1480 train_time:37203ms step_avg:145.90ms step:266/1480 train_time:37355ms step_avg:145.92ms step:267/1480 train_time:37505ms step_avg:145.94ms step:268/1480 train_time:37656ms step_avg:145.95ms step:269/1480 train_time:37805ms step_avg:145.97ms step:270/1480 train_time:37955ms step_avg:145.98ms step:271/1480 train_time:38106ms step_avg:146.00ms step:272/1480 train_time:38256ms step_avg:146.02ms step:273/1480 train_time:38407ms step_avg:146.04ms step:274/1480 train_time:38559ms step_avg:146.06ms step:275/1480 train_time:38710ms step_avg:146.08ms step:276/1480 train_time:38860ms step_avg:146.09ms step:277/1480 train_time:39010ms step_avg:146.11ms step:278/1480 train_time:39161ms step_avg:146.12ms step:279/1480 train_time:39312ms step_avg:146.14ms step:280/1480 train_time:39463ms step_avg:146.16ms step:281/1480 train_time:39613ms step_avg:146.17ms step:282/1480 train_time:39764ms step_avg:146.19ms step:283/1480 train_time:39915ms step_avg:146.21ms step:284/1480 train_time:40064ms step_avg:146.22ms step:285/1480 train_time:40215ms step_avg:146.24ms step:286/1480 train_time:40367ms step_avg:146.26ms step:287/1480 train_time:40519ms step_avg:146.28ms step:288/1480 train_time:40669ms step_avg:146.29ms step:289/1480 train_time:40819ms step_avg:146.31ms step:290/1480 train_time:40969ms step_avg:146.32ms step:291/1480 train_time:41119ms step_avg:146.33ms step:292/1480 train_time:41270ms step_avg:146.35ms step:293/1480 train_time:41420ms step_avg:146.36ms step:294/1480 train_time:41572ms step_avg:146.38ms step:295/1480 train_time:41722ms step_avg:146.39ms step:296/1480 train_time:41874ms step_avg:146.41ms step:297/1480 train_time:42024ms step_avg:146.42ms step:298/1480 train_time:42174ms step_avg:146.44ms step:299/1480 train_time:42324ms step_avg:146.45ms step:300/1480 train_time:42476ms step_avg:146.47ms step:301/1480 train_time:42625ms step_avg:146.48ms step:302/1480 train_time:42777ms 
step_avg:146.50ms step:303/1480 train_time:42927ms step_avg:146.51ms step:304/1480 train_time:43077ms step_avg:146.52ms step:305/1480 train_time:43226ms step_avg:146.53ms step:306/1480 train_time:43378ms step_avg:146.55ms step:307/1480 train_time:43527ms step_avg:146.56ms step:308/1480 train_time:43680ms step_avg:146.58ms step:309/1480 train_time:43830ms step_avg:146.59ms step:310/1480 train_time:43980ms step_avg:146.60ms step:311/1480 train_time:44130ms step_avg:146.61ms step:312/1480 train_time:44280ms step_avg:146.62ms step:313/1480 train_time:44430ms step_avg:146.63ms step:314/1480 train_time:44581ms step_avg:146.65ms step:315/1480 train_time:44731ms step_avg:146.66ms step:316/1480 train_time:44882ms step_avg:146.67ms step:317/1480 train_time:45033ms step_avg:146.69ms step:318/1480 train_time:45184ms step_avg:146.70ms step:319/1480 train_time:45335ms step_avg:146.72ms step:320/1480 train_time:45486ms step_avg:146.73ms step:321/1480 train_time:45637ms step_avg:146.74ms step:322/1480 train_time:45787ms step_avg:146.75ms step:323/1480 train_time:45938ms step_avg:146.77ms step:324/1480 train_time:46088ms step_avg:146.78ms step:325/1480 train_time:46238ms step_avg:146.79ms step:326/1480 train_time:46389ms step_avg:146.80ms step:327/1480 train_time:46540ms step_avg:146.81ms step:328/1480 train_time:46689ms step_avg:146.82ms step:329/1480 train_time:46840ms step_avg:146.83ms step:330/1480 train_time:46993ms step_avg:146.85ms step:331/1480 train_time:47148ms step_avg:146.88ms step:332/1480 train_time:47303ms step_avg:146.90ms step:333/1480 train_time:47457ms step_avg:146.92ms step:334/1480 train_time:47611ms step_avg:146.95ms step:335/1480 train_time:47763ms step_avg:146.96ms step:336/1480 train_time:47916ms step_avg:146.98ms step:337/1480 train_time:48071ms step_avg:147.01ms step:338/1480 train_time:48224ms step_avg:147.03ms step:339/1480 train_time:48379ms step_avg:147.05ms step:340/1480 train_time:48534ms step_avg:147.07ms step:341/1480 train_time:48688ms 
step_avg:147.09ms step:342/1480 train_time:48840ms step_avg:147.11ms step:343/1480 train_time:48994ms step_avg:147.13ms step:344/1480 train_time:49149ms step_avg:147.15ms step:345/1480 train_time:49304ms step_avg:147.18ms step:346/1480 train_time:49458ms step_avg:147.20ms step:347/1480 train_time:49613ms step_avg:147.22ms step:348/1480 train_time:49767ms step_avg:147.24ms step:349/1480 train_time:49921ms step_avg:147.26ms step:350/1480 train_time:50074ms step_avg:147.28ms step:351/1480 train_time:50231ms step_avg:147.30ms step:352/1480 train_time:50385ms step_avg:147.32ms step:353/1480 train_time:50539ms step_avg:147.34ms step:354/1480 train_time:50694ms step_avg:147.37ms step:355/1480 train_time:50849ms step_avg:147.39ms step:356/1480 train_time:51002ms step_avg:147.41ms step:357/1480 train_time:51156ms step_avg:147.42ms step:358/1480 train_time:51310ms step_avg:147.44ms step:359/1480 train_time:51463ms step_avg:147.46ms step:360/1480 train_time:51618ms step_avg:147.48ms step:361/1480 train_time:51773ms step_avg:147.50ms step:362/1480 train_time:51927ms step_avg:147.52ms step:363/1480 train_time:52081ms step_avg:147.54ms step:364/1480 train_time:52235ms step_avg:147.56ms step:365/1480 train_time:52390ms step_avg:147.58ms step:366/1480 train_time:52543ms step_avg:147.59ms step:367/1480 train_time:52697ms step_avg:147.61ms step:368/1480 train_time:52851ms step_avg:147.63ms step:369/1480 train_time:53005ms step_avg:147.65ms step:370/1480 train_time:53158ms step_avg:147.66ms step:371/1480 train_time:53312ms step_avg:147.68ms step:372/1480 train_time:53468ms step_avg:147.70ms step:373/1480 train_time:53621ms step_avg:147.72ms step:374/1480 train_time:53775ms step_avg:147.73ms step:375/1480 train_time:53928ms step_avg:147.75ms step:375/1480 val_loss:3.8134 train_time:53987ms step_avg:147.91ms step:376/1480 train_time:54087ms step_avg:147.78ms step:377/1480 train_time:54241ms step_avg:147.80ms step:378/1480 train_time:54395ms step_avg:147.81ms step:379/1480 
train_time:54547ms step_avg:147.83ms step:380/1480 train_time:54700ms step_avg:147.84ms step:381/1480 train_time:54852ms step_avg:147.85ms step:382/1480 train_time:55006ms step_avg:147.87ms step:383/1480 train_time:55160ms step_avg:147.88ms step:384/1480 train_time:55314ms step_avg:147.90ms step:385/1480 train_time:55468ms step_avg:147.91ms step:386/1480 train_time:55620ms step_avg:147.93ms step:387/1480 train_time:55775ms step_avg:147.94ms step:388/1480 train_time:55927ms step_avg:147.95ms step:389/1480 train_time:56081ms step_avg:147.97ms step:390/1480 train_time:56235ms step_avg:147.99ms step:391/1480 train_time:56390ms step_avg:148.00ms step:392/1480 train_time:56543ms step_avg:148.02ms step:393/1480 train_time:56696ms step_avg:148.03ms step:394/1480 train_time:56849ms step_avg:148.05ms step:395/1480 train_time:57003ms step_avg:148.06ms step:396/1480 train_time:57156ms step_avg:148.07ms step:397/1480 train_time:57311ms step_avg:148.09ms step:398/1480 train_time:57465ms step_avg:148.11ms step:399/1480 train_time:57619ms step_avg:148.12ms step:400/1480 train_time:57773ms step_avg:148.14ms step:401/1480 train_time:57927ms step_avg:148.15ms step:402/1480 train_time:58080ms step_avg:148.16ms step:403/1480 train_time:58235ms step_avg:148.18ms step:404/1480 train_time:58390ms step_avg:148.20ms step:405/1480 train_time:58544ms step_avg:148.21ms step:406/1480 train_time:58697ms step_avg:148.22ms step:407/1480 train_time:58852ms step_avg:148.24ms step:408/1480 train_time:59007ms step_avg:148.26ms step:409/1480 train_time:59160ms step_avg:148.27ms step:410/1480 train_time:59314ms step_avg:148.28ms step:411/1480 train_time:59469ms step_avg:148.30ms step:412/1480 train_time:59622ms step_avg:148.31ms step:413/1480 train_time:59775ms step_avg:148.33ms step:414/1480 train_time:59930ms step_avg:148.34ms step:415/1480 train_time:60084ms step_avg:148.36ms step:416/1480 train_time:60237ms step_avg:148.37ms step:417/1480 train_time:60391ms step_avg:148.38ms step:418/1480 
train_time:60546ms step_avg:148.40ms step:419/1480 train_time:60700ms step_avg:148.41ms step:420/1480 train_time:60853ms step_avg:148.42ms step:421/1480 train_time:61010ms step_avg:148.44ms step:422/1480 train_time:61165ms step_avg:148.46ms step:423/1480 train_time:61318ms step_avg:148.47ms step:424/1480 train_time:61472ms step_avg:148.48ms step:425/1480 train_time:61626ms step_avg:148.50ms step:426/1480 train_time:61781ms step_avg:148.51ms step:427/1480 train_time:61934ms step_avg:148.52ms step:428/1480 train_time:62088ms step_avg:148.54ms step:429/1480 train_time:62241ms step_avg:148.55ms step:430/1480 train_time:62395ms step_avg:148.56ms step:431/1480 train_time:62548ms step_avg:148.57ms step:432/1480 train_time:62701ms step_avg:148.58ms step:433/1480 train_time:62854ms step_avg:148.59ms step:434/1480 train_time:63009ms step_avg:148.61ms step:435/1480 train_time:63163ms step_avg:148.62ms step:436/1480 train_time:63317ms step_avg:148.63ms step:437/1480 train_time:63471ms step_avg:148.64ms step:438/1480 train_time:63625ms step_avg:148.66ms step:439/1480 train_time:63779ms step_avg:148.67ms step:440/1480 train_time:63934ms step_avg:148.68ms step:441/1480 train_time:64091ms step_avg:148.70ms step:442/1480 train_time:64250ms step_avg:148.73ms step:443/1480 train_time:64408ms step_avg:148.75ms step:444/1480 train_time:64564ms step_avg:148.76ms step:445/1480 train_time:64720ms step_avg:148.78ms step:446/1480 train_time:64876ms step_avg:148.80ms step:447/1480 train_time:65032ms step_avg:148.81ms step:448/1480 train_time:65188ms step_avg:148.83ms step:449/1480 train_time:65346ms step_avg:148.85ms step:450/1480 train_time:65504ms step_avg:148.87ms step:451/1480 train_time:65661ms step_avg:148.89ms step:452/1480 train_time:65817ms step_avg:148.91ms step:453/1480 train_time:65974ms step_avg:148.93ms step:454/1480 train_time:66130ms step_avg:148.94ms step:455/1480 train_time:66288ms step_avg:148.96ms step:456/1480 train_time:66445ms step_avg:148.98ms step:457/1480 
train_time:66601ms step_avg:149.00ms step:458/1480 train_time:66756ms step_avg:149.01ms step:459/1480 train_time:66914ms step_avg:149.03ms step:460/1480 train_time:67071ms step_avg:149.05ms step:461/1480 train_time:67230ms step_avg:149.07ms step:462/1480 train_time:67389ms step_avg:149.09ms step:463/1480 train_time:67548ms step_avg:149.11ms step:464/1480 train_time:67706ms step_avg:149.13ms step:465/1480 train_time:67863ms step_avg:149.15ms step:466/1480 train_time:68021ms step_avg:149.17ms step:467/1480 train_time:68179ms step_avg:149.19ms step:468/1480 train_time:68334ms step_avg:149.20ms step:469/1480 train_time:68491ms step_avg:149.22ms step:470/1480 train_time:68648ms step_avg:149.23ms step:471/1480 train_time:68805ms step_avg:149.25ms step:472/1480 train_time:68961ms step_avg:149.27ms step:473/1480 train_time:69117ms step_avg:149.28ms step:474/1480 train_time:69274ms step_avg:149.30ms step:475/1480 train_time:69430ms step_avg:149.31ms step:476/1480 train_time:69588ms step_avg:149.33ms step:477/1480 train_time:69744ms step_avg:149.35ms step:478/1480 train_time:69900ms step_avg:149.36ms step:479/1480 train_time:70055ms step_avg:149.37ms step:480/1480 train_time:70214ms step_avg:149.39ms step:481/1480 train_time:70372ms step_avg:149.41ms step:482/1480 train_time:70529ms step_avg:149.43ms step:483/1480 train_time:70686ms step_avg:149.44ms step:484/1480 train_time:70843ms step_avg:149.46ms step:485/1480 train_time:70999ms step_avg:149.47ms step:486/1480 train_time:71155ms step_avg:149.49ms step:487/1480 train_time:71313ms step_avg:149.50ms step:488/1480 train_time:71470ms step_avg:149.52ms step:489/1480 train_time:71627ms step_avg:149.53ms step:490/1480 train_time:71784ms step_avg:149.55ms step:491/1480 train_time:71941ms step_avg:149.56ms step:492/1480 train_time:72098ms step_avg:149.58ms step:493/1480 train_time:72254ms step_avg:149.60ms step:494/1480 train_time:72412ms step_avg:149.61ms step:495/1480 train_time:72570ms step_avg:149.63ms step:496/1480 
train_time:72726ms step_avg:149.64ms step:497/1480 train_time:72884ms step_avg:149.66ms step:498/1480 train_time:73040ms step_avg:149.67ms step:499/1480 train_time:73197ms step_avg:149.69ms step:500/1480 train_time:73354ms step_avg:149.70ms step:500/1480 val_loss:3.6917 train_time:73416ms step_avg:149.83ms step:501/1480 train_time:73514ms step_avg:149.72ms step:502/1480 train_time:73673ms step_avg:149.74ms step:503/1480 train_time:73830ms step_avg:149.76ms step:504/1480 train_time:73986ms step_avg:149.77ms step:505/1480 train_time:74141ms step_avg:149.78ms step:506/1480 train_time:74297ms step_avg:149.79ms step:507/1480 train_time:74454ms step_avg:149.81ms step:508/1480 train_time:74613ms step_avg:149.82ms step:509/1480 train_time:74771ms step_avg:149.84ms step:510/1480 train_time:74927ms step_avg:149.85ms step:511/1480 train_time:75084ms step_avg:149.87ms step:512/1480 train_time:75242ms step_avg:149.88ms step:513/1480 train_time:75397ms step_avg:149.89ms step:514/1480 train_time:75554ms step_avg:149.91ms step:515/1480 train_time:75712ms step_avg:149.92ms step:516/1480 train_time:75872ms step_avg:149.95ms step:517/1480 train_time:76031ms step_avg:149.96ms step:518/1480 train_time:76191ms step_avg:149.98ms step:519/1480 train_time:76350ms step_avg:150.00ms step:520/1480 train_time:76508ms step_avg:150.02ms step:521/1480 train_time:76665ms step_avg:150.03ms step:522/1480 train_time:76821ms step_avg:150.04ms step:523/1480 train_time:76977ms step_avg:150.05ms step:524/1480 train_time:77134ms step_avg:150.07ms step:525/1480 train_time:77293ms step_avg:150.08ms step:526/1480 train_time:77451ms step_avg:150.10ms step:527/1480 train_time:77608ms step_avg:150.11ms step:528/1480 train_time:77763ms step_avg:150.12ms step:529/1480 train_time:77919ms step_avg:150.13ms step:530/1480 train_time:78076ms step_avg:150.15ms step:531/1480 train_time:78233ms step_avg:150.16ms step:532/1480 train_time:78392ms step_avg:150.18ms step:533/1480 train_time:78550ms step_avg:150.19ms 
step:534/1480 train_time:78706ms step_avg:150.20ms step:535/1480 train_time:78865ms step_avg:150.22ms step:536/1480 train_time:79022ms step_avg:150.23ms step:537/1480 train_time:79178ms step_avg:150.24ms step:538/1480 train_time:79334ms step_avg:150.25ms step:539/1480 train_time:79494ms step_avg:150.27ms step:540/1480 train_time:79651ms step_avg:150.29ms step:541/1480 train_time:79808ms step_avg:150.30ms step:542/1480 train_time:79964ms step_avg:150.31ms step:543/1480 train_time:80120ms step_avg:150.32ms step:544/1480 train_time:80277ms step_avg:150.33ms step:545/1480 train_time:80433ms step_avg:150.34ms step:546/1480 train_time:80591ms step_avg:150.36ms step:547/1480 train_time:80748ms step_avg:150.37ms step:548/1480 train_time:80907ms step_avg:150.38ms step:549/1480 train_time:81062ms step_avg:150.39ms step:550/1480 train_time:81220ms step_avg:150.41ms step:551/1480 train_time:81378ms step_avg:150.42ms step:552/1480 train_time:81536ms step_avg:150.44ms step:553/1480 train_time:81696ms step_avg:150.45ms step:554/1480 train_time:81856ms step_avg:150.47ms step:555/1480 train_time:82016ms step_avg:150.49ms step:556/1480 train_time:82174ms step_avg:150.50ms step:557/1480 train_time:82333ms step_avg:150.52ms step:558/1480 train_time:82494ms step_avg:150.54ms step:559/1480 train_time:82654ms step_avg:150.55ms step:560/1480 train_time:82813ms step_avg:150.57ms step:561/1480 train_time:82972ms step_avg:150.59ms step:562/1480 train_time:83132ms step_avg:150.60ms step:563/1480 train_time:83291ms step_avg:150.62ms step:564/1480 train_time:83451ms step_avg:150.63ms step:565/1480 train_time:83609ms step_avg:150.65ms step:566/1480 train_time:83769ms step_avg:150.66ms step:567/1480 train_time:83929ms step_avg:150.68ms step:568/1480 train_time:84087ms step_avg:150.69ms step:569/1480 train_time:84245ms step_avg:150.71ms step:570/1480 train_time:84403ms step_avg:150.72ms step:571/1480 train_time:84563ms step_avg:150.74ms step:572/1480 train_time:84720ms step_avg:150.75ms 
step:573/1480 train_time:84878ms step_avg:150.76ms step:574/1480 train_time:85038ms step_avg:150.78ms step:575/1480 train_time:85199ms step_avg:150.79ms step:576/1480 train_time:85360ms step_avg:150.81ms step:577/1480 train_time:85518ms step_avg:150.83ms step:578/1480 train_time:85677ms step_avg:150.84ms step:579/1480 train_time:85836ms step_avg:150.85ms step:580/1480 train_time:85995ms step_avg:150.87ms step:581/1480 train_time:86155ms step_avg:150.88ms step:582/1480 train_time:86315ms step_avg:150.90ms step:583/1480 train_time:86475ms step_avg:150.92ms step:584/1480 train_time:86635ms step_avg:150.93ms step:585/1480 train_time:86794ms step_avg:150.95ms step:586/1480 train_time:86953ms step_avg:150.96ms step:587/1480 train_time:87112ms step_avg:150.97ms step:588/1480 train_time:87271ms step_avg:150.99ms step:589/1480 train_time:87432ms step_avg:151.01ms step:590/1480 train_time:87594ms step_avg:151.02ms step:591/1480 train_time:87753ms step_avg:151.04ms step:592/1480 train_time:87914ms step_avg:151.05ms step:593/1480 train_time:88074ms step_avg:151.07ms step:594/1480 train_time:88234ms step_avg:151.09ms step:595/1480 train_time:88396ms step_avg:151.10ms step:596/1480 train_time:88559ms step_avg:151.12ms step:597/1480 train_time:88717ms step_avg:151.14ms step:598/1480 train_time:88876ms step_avg:151.15ms step:599/1480 train_time:89035ms step_avg:151.16ms step:600/1480 train_time:89194ms step_avg:151.18ms step:601/1480 train_time:89354ms step_avg:151.19ms step:602/1480 train_time:89515ms step_avg:151.21ms step:603/1480 train_time:89675ms step_avg:151.22ms step:604/1480 train_time:89834ms step_avg:151.24ms step:605/1480 train_time:89994ms step_avg:151.25ms step:606/1480 train_time:90155ms step_avg:151.27ms step:607/1480 train_time:90316ms step_avg:151.28ms step:608/1480 train_time:90476ms step_avg:151.30ms step:609/1480 train_time:90635ms step_avg:151.31ms step:610/1480 train_time:90794ms step_avg:151.32ms step:611/1480 train_time:90955ms step_avg:151.34ms 
step:612/1480 train_time:91114ms step_avg:151.35ms step:613/1480 train_time:91274ms step_avg:151.37ms step:614/1480 train_time:91434ms step_avg:151.38ms step:615/1480 train_time:91594ms step_avg:151.39ms step:616/1480 train_time:91752ms step_avg:151.41ms step:617/1480 train_time:91913ms step_avg:151.42ms step:618/1480 train_time:92072ms step_avg:151.43ms step:619/1480 train_time:92232ms step_avg:151.45ms step:620/1480 train_time:92391ms step_avg:151.46ms step:621/1480 train_time:92552ms step_avg:151.48ms step:622/1480 train_time:92712ms step_avg:151.49ms step:623/1480 train_time:92875ms step_avg:151.51ms step:624/1480 train_time:93034ms step_avg:151.52ms step:625/1480 train_time:93193ms step_avg:151.53ms step:625/1480 val_loss:3.6128 train_time:93256ms step_avg:151.64ms step:626/1480 train_time:93357ms step_avg:151.55ms step:627/1480 train_time:93518ms step_avg:151.57ms step:628/1480 train_time:93677ms step_avg:151.58ms step:629/1480 train_time:93835ms step_avg:151.59ms step:630/1480 train_time:93993ms step_avg:151.60ms step:631/1480 train_time:94152ms step_avg:151.61ms step:632/1480 train_time:94311ms step_avg:151.62ms step:633/1480 train_time:94471ms step_avg:151.64ms step:634/1480 train_time:94631ms step_avg:151.65ms step:635/1480 train_time:94793ms step_avg:151.67ms step:636/1480 train_time:94952ms step_avg:151.68ms step:637/1480 train_time:95111ms step_avg:151.69ms step:638/1480 train_time:95268ms step_avg:151.70ms step:639/1480 train_time:95427ms step_avg:151.71ms step:640/1480 train_time:95586ms step_avg:151.72ms step:641/1480 train_time:95746ms step_avg:151.74ms step:642/1480 train_time:95904ms step_avg:151.75ms step:643/1480 train_time:96062ms step_avg:151.76ms step:644/1480 train_time:96221ms step_avg:151.77ms step:645/1480 train_time:96380ms step_avg:151.78ms step:646/1480 train_time:96541ms step_avg:151.79ms step:647/1480 train_time:96700ms step_avg:151.81ms step:648/1480 train_time:96861ms step_avg:151.82ms step:649/1480 train_time:97021ms 
step_avg:151.83ms step:650/1480 train_time:97180ms step_avg:151.84ms step:651/1480 train_time:97340ms step_avg:151.86ms step:652/1480 train_time:97500ms step_avg:151.87ms step:653/1480 train_time:97660ms step_avg:151.88ms step:654/1480 train_time:97820ms step_avg:151.89ms step:655/1480 train_time:97980ms step_avg:151.91ms step:656/1480 train_time:98139ms step_avg:151.92ms step:657/1480 train_time:98299ms step_avg:151.93ms step:658/1480 train_time:98459ms step_avg:151.94ms step:659/1480 train_time:98620ms step_avg:151.96ms step:660/1480 train_time:98782ms step_avg:151.97ms step:661/1480 train_time:98944ms step_avg:151.99ms step:662/1480 train_time:99104ms step_avg:152.00ms step:663/1480 train_time:99264ms step_avg:152.01ms step:664/1480 train_time:99426ms step_avg:152.03ms step:665/1480 train_time:99588ms step_avg:152.04ms step:666/1480 train_time:99748ms step_avg:152.05ms step:667/1480 train_time:99909ms step_avg:152.07ms step:668/1480 train_time:100071ms step_avg:152.08ms step:669/1480 train_time:100236ms step_avg:152.10ms step:670/1480 train_time:100396ms step_avg:152.11ms step:671/1480 train_time:100557ms step_avg:152.13ms step:672/1480 train_time:100720ms step_avg:152.14ms step:673/1480 train_time:100883ms step_avg:152.16ms step:674/1480 train_time:101045ms step_avg:152.18ms step:675/1480 train_time:101206ms step_avg:152.19ms step:676/1480 train_time:101368ms step_avg:152.20ms step:677/1480 train_time:101529ms step_avg:152.22ms step:678/1480 train_time:101689ms step_avg:152.23ms step:679/1480 train_time:101850ms step_avg:152.24ms step:680/1480 train_time:102010ms step_avg:152.25ms step:681/1480 train_time:102172ms step_avg:152.27ms step:682/1480 train_time:102337ms step_avg:152.29ms step:683/1480 train_time:102501ms step_avg:152.30ms step:684/1480 train_time:102662ms step_avg:152.32ms step:685/1480 train_time:102824ms step_avg:152.33ms step:686/1480 train_time:102985ms step_avg:152.34ms step:687/1480 train_time:103144ms step_avg:152.36ms step:688/1480 
train_time:103307ms step_avg:152.37ms step:689/1480 train_time:103469ms step_avg:152.38ms step:690/1480 train_time:103633ms step_avg:152.40ms step:691/1480 train_time:103795ms step_avg:152.42ms step:692/1480 train_time:103958ms step_avg:152.43ms step:693/1480 train_time:104121ms step_avg:152.45ms step:694/1480 train_time:104282ms step_avg:152.46ms step:695/1480 train_time:104443ms step_avg:152.47ms step:696/1480 train_time:104603ms step_avg:152.48ms step:697/1480 train_time:104766ms step_avg:152.50ms step:698/1480 train_time:104926ms step_avg:152.51ms step:699/1480 train_time:105088ms step_avg:152.52ms step:700/1480 train_time:105251ms step_avg:152.54ms step:701/1480 train_time:105411ms step_avg:152.55ms step:702/1480 train_time:105571ms step_avg:152.56ms step:703/1480 train_time:105731ms step_avg:152.57ms step:704/1480 train_time:105893ms step_avg:152.58ms step:705/1480 train_time:106057ms step_avg:152.60ms step:706/1480 train_time:106222ms step_avg:152.62ms step:707/1480 train_time:106386ms step_avg:152.63ms step:708/1480 train_time:106548ms step_avg:152.65ms step:709/1480 train_time:106708ms step_avg:152.66ms step:710/1480 train_time:106867ms step_avg:152.67ms step:711/1480 train_time:107029ms step_avg:152.68ms step:712/1480 train_time:107193ms step_avg:152.70ms step:713/1480 train_time:107358ms step_avg:152.71ms step:714/1480 train_time:107520ms step_avg:152.73ms step:715/1480 train_time:107681ms step_avg:152.74ms step:716/1480 train_time:107841ms step_avg:152.75ms step:717/1480 train_time:108003ms step_avg:152.76ms step:718/1480 train_time:108163ms step_avg:152.77ms step:719/1480 train_time:108324ms step_avg:152.78ms step:720/1480 train_time:108486ms step_avg:152.80ms step:721/1480 train_time:108647ms step_avg:152.81ms step:722/1480 train_time:108808ms step_avg:152.82ms step:723/1480 train_time:108968ms step_avg:152.83ms step:724/1480 train_time:109129ms step_avg:152.84ms step:725/1480 train_time:109293ms step_avg:152.86ms step:726/1480 train_time:109458ms 
step_avg:152.87ms step:727/1480 train_time:109622ms step_avg:152.89ms step:728/1480 train_time:109782ms step_avg:152.90ms step:729/1480 train_time:109943ms step_avg:152.91ms step:730/1480 train_time:110104ms step_avg:152.92ms step:731/1480 train_time:110264ms step_avg:152.93ms step:732/1480 train_time:110424ms step_avg:152.94ms step:733/1480 train_time:110586ms step_avg:152.95ms step:734/1480 train_time:110748ms step_avg:152.97ms step:735/1480 train_time:110907ms step_avg:152.98ms step:736/1480 train_time:111070ms step_avg:152.99ms step:737/1480 train_time:111234ms step_avg:153.00ms step:738/1480 train_time:111394ms step_avg:153.01ms step:739/1480 train_time:111556ms step_avg:153.03ms step:740/1480 train_time:111723ms step_avg:153.05ms step:741/1480 train_time:111886ms step_avg:153.06ms step:742/1480 train_time:112047ms step_avg:153.07ms step:743/1480 train_time:112208ms step_avg:153.08ms step:744/1480 train_time:112370ms step_avg:153.09ms step:745/1480 train_time:112536ms step_avg:153.11ms step:746/1480 train_time:112698ms step_avg:153.12ms step:747/1480 train_time:112860ms step_avg:153.13ms step:748/1480 train_time:113025ms step_avg:153.15ms step:749/1480 train_time:113187ms step_avg:153.16ms step:750/1480 train_time:113347ms step_avg:153.17ms step:750/1480 val_loss:3.5554 train_time:113411ms step_avg:153.26ms step:751/1480 train_time:113511ms step_avg:153.19ms step:752/1480 train_time:113671ms step_avg:153.20ms step:753/1480 train_time:113833ms step_avg:153.21ms step:754/1480 train_time:113993ms step_avg:153.22ms step:755/1480 train_time:114154ms step_avg:153.23ms step:756/1480 train_time:114314ms step_avg:153.24ms step:757/1480 train_time:114479ms step_avg:153.25ms step:758/1480 train_time:114640ms step_avg:153.26ms step:759/1480 train_time:114803ms step_avg:153.28ms step:760/1480 train_time:114965ms step_avg:153.29ms step:761/1480 train_time:115128ms step_avg:153.30ms step:762/1480 train_time:115289ms step_avg:153.31ms step:763/1480 train_time:115450ms 
step_avg:153.32ms step:764/1480 train_time:115612ms step_avg:153.33ms step:765/1480 train_time:115773ms step_avg:153.34ms step:766/1480 train_time:115935ms step_avg:153.35ms step:767/1480 train_time:116098ms step_avg:153.37ms step:768/1480 train_time:116260ms step_avg:153.38ms step:769/1480 train_time:116425ms step_avg:153.39ms step:770/1480 train_time:116587ms step_avg:153.40ms step:771/1480 train_time:116750ms step_avg:153.42ms step:772/1480 train_time:116911ms step_avg:153.43ms step:773/1480 train_time:117073ms step_avg:153.44ms step:774/1480 train_time:117235ms step_avg:153.45ms step:775/1480 train_time:117400ms step_avg:153.46ms step:776/1480 train_time:117565ms step_avg:153.48ms step:777/1480 train_time:117732ms step_avg:153.50ms step:778/1480 train_time:117894ms step_avg:153.51ms step:779/1480 train_time:118056ms step_avg:153.52ms step:780/1480 train_time:118222ms step_avg:153.54ms step:781/1480 train_time:118386ms step_avg:153.55ms step:782/1480 train_time:118549ms step_avg:153.56ms step:783/1480 train_time:118710ms step_avg:153.57ms step:784/1480 train_time:118873ms step_avg:153.58ms step:785/1480 train_time:119034ms step_avg:153.59ms step:786/1480 train_time:119201ms step_avg:153.61ms step:787/1480 train_time:119365ms step_avg:153.62ms step:788/1480 train_time:119529ms step_avg:153.64ms step:789/1480 train_time:119690ms step_avg:153.65ms step:790/1480 train_time:119855ms step_avg:153.66ms step:791/1480 train_time:120022ms step_avg:153.68ms step:792/1480 train_time:120187ms step_avg:153.69ms step:793/1480 train_time:120349ms step_avg:153.70ms step:794/1480 train_time:120511ms step_avg:153.71ms step:795/1480 train_time:120675ms step_avg:153.73ms step:796/1480 train_time:120842ms step_avg:153.74ms step:797/1480 train_time:121006ms step_avg:153.76ms step:798/1480 train_time:121169ms step_avg:153.77ms step:799/1480 train_time:121335ms step_avg:153.78ms step:800/1480 train_time:121498ms step_avg:153.79ms step:801/1480 train_time:121662ms step_avg:153.81ms 
step:802/1480 train_time:121831ms step_avg:153.83ms step:803/1480 train_time:121992ms step_avg:153.84ms step:804/1480 train_time:122154ms step_avg:153.85ms step:805/1480 train_time:122319ms step_avg:153.86ms step:806/1480 train_time:122482ms step_avg:153.87ms step:807/1480 train_time:122644ms step_avg:153.88ms step:808/1480 train_time:122808ms step_avg:153.89ms step:809/1480 train_time:122969ms step_avg:153.90ms step:810/1480 train_time:123131ms step_avg:153.91ms step:811/1480 train_time:123293ms step_avg:153.92ms step:812/1480 train_time:123456ms step_avg:153.93ms step:813/1480 train_time:123617ms step_avg:153.94ms step:814/1480 train_time:123781ms step_avg:153.96ms step:815/1480 train_time:123943ms step_avg:153.97ms step:816/1480 train_time:124108ms step_avg:153.98ms step:817/1480 train_time:124269ms step_avg:153.99ms step:818/1480 train_time:124432ms step_avg:154.00ms step:819/1480 train_time:124597ms step_avg:154.01ms step:820/1480 train_time:124762ms step_avg:154.03ms step:821/1480 train_time:124925ms step_avg:154.04ms step:822/1480 train_time:125089ms step_avg:154.05ms step:823/1480 train_time:125251ms step_avg:154.06ms step:824/1480 train_time:125413ms step_avg:154.07ms step:825/1480 train_time:125578ms step_avg:154.08ms step:826/1480 train_time:125746ms step_avg:154.10ms step:827/1480 train_time:125910ms step_avg:154.11ms step:828/1480 train_time:126074ms step_avg:154.12ms step:829/1480 train_time:126236ms step_avg:154.13ms step:830/1480 train_time:126401ms step_avg:154.15ms step:831/1480 train_time:126566ms step_avg:154.16ms step:832/1480 train_time:126729ms step_avg:154.17ms step:833/1480 train_time:126895ms step_avg:154.19ms step:834/1480 train_time:127059ms step_avg:154.20ms step:835/1480 train_time:127223ms step_avg:154.21ms step:836/1480 train_time:127387ms step_avg:154.22ms step:837/1480 train_time:127550ms step_avg:154.23ms step:838/1480 train_time:127714ms step_avg:154.24ms step:839/1480 train_time:127876ms step_avg:154.25ms step:840/1480 
train_time:128037ms step_avg:154.26ms step:841/1480 train_time:128198ms step_avg:154.27ms step:842/1480 train_time:128363ms step_avg:154.28ms step:843/1480 train_time:128527ms step_avg:154.29ms step:844/1480 train_time:128689ms step_avg:154.30ms step:845/1480 train_time:128852ms step_avg:154.31ms step:846/1480 train_time:129018ms step_avg:154.33ms step:847/1480 train_time:129183ms step_avg:154.34ms step:848/1480 train_time:129346ms step_avg:154.35ms step:849/1480 train_time:129509ms step_avg:154.36ms step:850/1480 train_time:129672ms step_avg:154.37ms step:851/1480 train_time:129837ms step_avg:154.38ms step:852/1480 train_time:130001ms step_avg:154.40ms step:853/1480 train_time:130164ms step_avg:154.41ms step:854/1480 train_time:130328ms step_avg:154.42ms step:855/1480 train_time:130491ms step_avg:154.43ms step:856/1480 train_time:130653ms step_avg:154.44ms step:857/1480 train_time:130817ms step_avg:154.45ms step:858/1480 train_time:130983ms step_avg:154.46ms step:859/1480 train_time:131147ms step_avg:154.47ms step:860/1480 train_time:131308ms step_avg:154.48ms step:861/1480 train_time:131474ms step_avg:154.49ms step:862/1480 train_time:131645ms step_avg:154.51ms step:863/1480 train_time:131813ms step_avg:154.53ms step:864/1480 train_time:131977ms step_avg:154.54ms step:865/1480 train_time:132139ms step_avg:154.55ms step:866/1480 train_time:132307ms step_avg:154.56ms step:867/1480 train_time:132470ms step_avg:154.57ms step:868/1480 train_time:132631ms step_avg:154.58ms step:869/1480 train_time:132793ms step_avg:154.59ms step:870/1480 train_time:132958ms step_avg:154.60ms step:871/1480 train_time:133122ms step_avg:154.61ms step:872/1480 train_time:133287ms step_avg:154.63ms step:873/1480 train_time:133449ms step_avg:154.63ms step:874/1480 train_time:133616ms step_avg:154.65ms step:875/1480 train_time:133781ms step_avg:154.66ms step:875/1480 val_loss:3.5096 train_time:133846ms step_avg:154.73ms step:876/1480 train_time:133946ms step_avg:154.67ms step:877/1480 
train_time:134112ms step_avg:154.68ms step:878/1480 train_time:134275ms step_avg:154.69ms step:879/1480 train_time:134439ms step_avg:154.70ms step:880/1480 train_time:134600ms step_avg:154.71ms step:881/1480 train_time:134763ms step_avg:154.72ms step:882/1480 train_time:134928ms step_avg:154.73ms step:883/1480 train_time:135095ms step_avg:154.75ms step:884/1480 train_time:135262ms step_avg:154.76ms step:885/1480 train_time:135428ms step_avg:154.77ms step:886/1480 train_time:135593ms step_avg:154.79ms step:887/1480 train_time:135760ms step_avg:154.80ms step:888/1480 train_time:135935ms step_avg:154.82ms step:889/1480 train_time:136104ms step_avg:154.84ms step:890/1480 train_time:136266ms step_avg:154.85ms step:891/1480 train_time:136433ms step_avg:154.86ms step:892/1480 train_time:136599ms step_avg:154.87ms step:893/1480 train_time:136761ms step_avg:154.88ms step:894/1480 train_time:136928ms step_avg:154.90ms step:895/1480 train_time:137094ms step_avg:154.91ms step:896/1480 train_time:137257ms step_avg:154.92ms step:897/1480 train_time:137424ms step_avg:154.93ms step:898/1480 train_time:137591ms step_avg:154.94ms step:899/1480 train_time:137755ms step_avg:154.95ms step:900/1480 train_time:137918ms step_avg:154.96ms step:901/1480 train_time:138081ms step_avg:154.97ms step:902/1480 train_time:138247ms step_avg:154.99ms step:903/1480 train_time:138417ms step_avg:155.00ms step:904/1480 train_time:138584ms step_avg:155.02ms step:905/1480 train_time:138746ms step_avg:155.02ms step:906/1480 train_time:138913ms step_avg:155.04ms step:907/1480 train_time:139080ms step_avg:155.05ms step:908/1480 train_time:139244ms step_avg:155.06ms step:909/1480 train_time:139408ms step_avg:155.07ms step:910/1480 train_time:139578ms step_avg:155.09ms step:911/1480 train_time:139745ms step_avg:155.10ms step:912/1480 train_time:139911ms step_avg:155.11ms step:913/1480 train_time:140078ms step_avg:155.13ms step:914/1480 train_time:140245ms step_avg:155.14ms step:915/1480 train_time:140413ms 
step_avg:155.15ms step:916/1480 train_time:140579ms step_avg:155.16ms step:917/1480 train_time:140744ms step_avg:155.17ms step:918/1480 train_time:140913ms step_avg:155.19ms step:919/1480 train_time:141082ms step_avg:155.21ms step:920/1480 train_time:141249ms step_avg:155.22ms step:921/1480 train_time:141414ms step_avg:155.23ms step:922/1480 train_time:141581ms step_avg:155.24ms step:923/1480 train_time:141745ms step_avg:155.25ms step:924/1480 train_time:141910ms step_avg:155.26ms step:925/1480 train_time:142074ms step_avg:155.27ms step:926/1480 train_time:142238ms step_avg:155.28ms step:927/1480 train_time:142403ms step_avg:155.29ms step:928/1480 train_time:142571ms step_avg:155.31ms step:929/1480 train_time:142735ms step_avg:155.32ms step:930/1480 train_time:142900ms step_avg:155.33ms step:931/1480 train_time:143062ms step_avg:155.33ms step:932/1480 train_time:143229ms step_avg:155.35ms step:933/1480 train_time:143397ms step_avg:155.36ms step:934/1480 train_time:143566ms step_avg:155.37ms step:935/1480 train_time:143736ms step_avg:155.39ms step:936/1480 train_time:143903ms step_avg:155.40ms step:937/1480 train_time:144073ms step_avg:155.42ms step:938/1480 train_time:144235ms step_avg:155.43ms step:939/1480 train_time:144404ms step_avg:155.44ms step:940/1480 train_time:144572ms step_avg:155.45ms step:941/1480 train_time:144735ms step_avg:155.46ms step:942/1480 train_time:144901ms step_avg:155.47ms step:943/1480 train_time:145075ms step_avg:155.49ms step:944/1480 train_time:145245ms step_avg:155.51ms step:945/1480 train_time:145409ms step_avg:155.52ms step:946/1480 train_time:145577ms step_avg:155.53ms step:947/1480 train_time:145744ms step_avg:155.54ms step:948/1480 train_time:145910ms step_avg:155.55ms step:949/1480 train_time:146076ms step_avg:155.57ms step:950/1480 train_time:146239ms step_avg:155.57ms step:951/1480 train_time:146407ms step_avg:155.59ms step:952/1480 train_time:146573ms step_avg:155.60ms step:953/1480 train_time:146740ms step_avg:155.61ms 
step:954/1480 train_time:146910ms step_avg:155.62ms step:955/1480 train_time:147073ms step_avg:155.63ms step:956/1480 train_time:147237ms step_avg:155.64ms step:957/1480 train_time:147405ms step_avg:155.66ms step:958/1480 train_time:147575ms step_avg:155.67ms step:959/1480 train_time:147738ms step_avg:155.68ms step:960/1480 train_time:147906ms step_avg:155.69ms step:961/1480 train_time:148072ms step_avg:155.70ms step:962/1480 train_time:148235ms step_avg:155.71ms step:963/1480 train_time:148401ms step_avg:155.72ms step:964/1480 train_time:148570ms step_avg:155.73ms step:965/1480 train_time:148735ms step_avg:155.74ms step:966/1480 train_time:148898ms step_avg:155.75ms step:967/1480 train_time:149062ms step_avg:155.76ms step:968/1480 train_time:149227ms step_avg:155.77ms step:969/1480 train_time:149394ms step_avg:155.78ms step:970/1480 train_time:149557ms step_avg:155.79ms step:971/1480 train_time:149722ms step_avg:155.80ms step:972/1480 train_time:149889ms step_avg:155.81ms step:973/1480 train_time:150053ms step_avg:155.82ms step:974/1480 train_time:150220ms step_avg:155.83ms step:975/1480 train_time:150386ms step_avg:155.84ms step:976/1480 train_time:150552ms step_avg:155.85ms step:977/1480 train_time:150716ms step_avg:155.86ms step:978/1480 train_time:150880ms step_avg:155.87ms step:979/1480 train_time:151046ms step_avg:155.88ms step:980/1480 train_time:151213ms step_avg:155.89ms step:981/1480 train_time:151382ms step_avg:155.90ms step:982/1480 train_time:151546ms step_avg:155.91ms step:983/1480 train_time:151711ms step_avg:155.92ms step:984/1480 train_time:151876ms step_avg:155.93ms step:985/1480 train_time:152044ms step_avg:155.94ms step:986/1480 train_time:152211ms step_avg:155.95ms step:987/1480 train_time:152375ms step_avg:155.96ms step:988/1480 train_time:152541ms step_avg:155.97ms step:989/1480 train_time:152707ms step_avg:155.98ms step:990/1480 train_time:152877ms step_avg:156.00ms step:991/1480 train_time:153044ms step_avg:156.01ms step:992/1480 
train_time:153217ms step_avg:156.03ms step:993/1480 train_time:153395ms step_avg:156.05ms step:994/1480 train_time:153560ms step_avg:156.06ms step:995/1480 train_time:153724ms step_avg:156.06ms step:996/1480 train_time:153887ms step_avg:156.07ms step:997/1480 train_time:154052ms step_avg:156.08ms step:998/1480 train_time:154214ms step_avg:156.09ms step:999/1480 train_time:154381ms step_avg:156.10ms step:1000/1480 train_time:154551ms step_avg:156.11ms step:1000/1480 val_loss:3.4453 train_time:154618ms step_avg:156.18ms step:1001/1480 train_time:154720ms step_avg:156.13ms step:1002/1480 train_time:154887ms step_avg:156.14ms step:1003/1480 train_time:155061ms step_avg:156.15ms step:1004/1480 train_time:155229ms step_avg:156.17ms step:1005/1480 train_time:155397ms step_avg:156.18ms step:1006/1480 train_time:155564ms step_avg:156.19ms step:1007/1480 train_time:155730ms step_avg:156.20ms step:1008/1480 train_time:155897ms step_avg:156.21ms step:1009/1480 train_time:156070ms step_avg:156.23ms step:1010/1480 train_time:156235ms step_avg:156.24ms step:1011/1480 train_time:156401ms step_avg:156.24ms step:1012/1480 train_time:156566ms step_avg:156.25ms step:1013/1480 train_time:156737ms step_avg:156.27ms step:1014/1480 train_time:156905ms step_avg:156.28ms step:1015/1480 train_time:157075ms step_avg:156.29ms step:1016/1480 train_time:157245ms step_avg:156.31ms step:1017/1480 train_time:157416ms step_avg:156.32ms step:1018/1480 train_time:157585ms step_avg:156.33ms step:1019/1480 train_time:157755ms step_avg:156.35ms step:1020/1480 train_time:157925ms step_avg:156.36ms step:1021/1480 train_time:158090ms step_avg:156.37ms step:1022/1480 train_time:158257ms step_avg:156.38ms step:1023/1480 train_time:158425ms step_avg:156.39ms step:1024/1480 train_time:158591ms step_avg:156.40ms step:1025/1480 train_time:158763ms step_avg:156.42ms step:1026/1480 train_time:158928ms step_avg:156.43ms step:1027/1480 train_time:159096ms step_avg:156.44ms step:1028/1480 train_time:159269ms 
step_avg:156.45ms step:1029/1480 train_time:159443ms step_avg:156.47ms step:1030/1480 train_time:159609ms step_avg:156.48ms step:1031/1480 train_time:159775ms step_avg:156.49ms step:1032/1480 train_time:159946ms step_avg:156.50ms step:1033/1480 train_time:160111ms step_avg:156.51ms step:1034/1480 train_time:160280ms step_avg:156.52ms step:1035/1480 train_time:160446ms step_avg:156.53ms step:1036/1480 train_time:160613ms step_avg:156.54ms step:1037/1480 train_time:160782ms step_avg:156.56ms step:1038/1480 train_time:160949ms step_avg:156.57ms step:1039/1480 train_time:161122ms step_avg:156.58ms step:1040/1480 train_time:161287ms step_avg:156.59ms step:1041/1480 train_time:161453ms step_avg:156.60ms step:1042/1480 train_time:161618ms step_avg:156.61ms step:1043/1480 train_time:161785ms step_avg:156.62ms step:1044/1480 train_time:161949ms step_avg:156.62ms step:1045/1480 train_time:162120ms step_avg:156.64ms step:1046/1480 train_time:162288ms step_avg:156.65ms step:1047/1480 train_time:162454ms step_avg:156.66ms step:1048/1480 train_time:162619ms step_avg:156.67ms step:1049/1480 train_time:162785ms step_avg:156.67ms step:1050/1480 train_time:162954ms step_avg:156.69ms step:1051/1480 train_time:163125ms step_avg:156.70ms step:1052/1480 train_time:163294ms step_avg:156.71ms step:1053/1480 train_time:163461ms step_avg:156.72ms step:1054/1480 train_time:163628ms step_avg:156.73ms step:1055/1480 train_time:163794ms step_avg:156.74ms step:1056/1480 train_time:163958ms step_avg:156.75ms step:1057/1480 train_time:164125ms step_avg:156.76ms step:1058/1480 train_time:164293ms step_avg:156.77ms step:1059/1480 train_time:164466ms step_avg:156.78ms step:1060/1480 train_time:164636ms step_avg:156.80ms step:1061/1480 train_time:164799ms step_avg:156.80ms step:1062/1480 train_time:164965ms step_avg:156.81ms step:1063/1480 train_time:165129ms step_avg:156.82ms step:1064/1480 train_time:165293ms step_avg:156.82ms step:1065/1480 train_time:165462ms step_avg:156.84ms step:1066/1480 
train_time:165628ms step_avg:156.84ms step:1067/1480 train_time:165798ms step_avg:156.86ms step:1068/1480 train_time:165965ms step_avg:156.87ms step:1069/1480 train_time:166137ms step_avg:156.88ms step:1070/1480 train_time:166304ms step_avg:156.89ms step:1071/1480 train_time:166477ms step_avg:156.91ms step:1072/1480 train_time:166644ms step_avg:156.91ms step:1073/1480 train_time:166808ms step_avg:156.92ms step:1074/1480 train_time:166974ms step_avg:156.93ms step:1075/1480 train_time:167144ms step_avg:156.94ms step:1076/1480 train_time:167311ms step_avg:156.95ms step:1077/1480 train_time:167478ms step_avg:156.96ms step:1078/1480 train_time:167651ms step_avg:156.98ms step:1079/1480 train_time:167823ms step_avg:156.99ms step:1080/1480 train_time:167992ms step_avg:157.00ms step:1081/1480 train_time:168159ms step_avg:157.01ms step:1082/1480 train_time:168326ms step_avg:157.02ms step:1083/1480 train_time:168492ms step_avg:157.03ms step:1084/1480 train_time:168660ms step_avg:157.04ms step:1085/1480 train_time:168829ms step_avg:157.05ms step:1086/1480 train_time:168997ms step_avg:157.06ms step:1087/1480 train_time:169164ms step_avg:157.07ms step:1088/1480 train_time:169334ms step_avg:157.08ms step:1089/1480 train_time:169507ms step_avg:157.10ms step:1090/1480 train_time:169681ms step_avg:157.11ms step:1091/1480 train_time:169848ms step_avg:157.12ms step:1092/1480 train_time:170017ms step_avg:157.13ms step:1093/1480 train_time:170185ms step_avg:157.14ms step:1094/1480 train_time:170350ms step_avg:157.15ms step:1095/1480 train_time:170515ms step_avg:157.16ms step:1096/1480 train_time:170684ms step_avg:157.17ms step:1097/1480 train_time:170852ms step_avg:157.18ms step:1098/1480 train_time:171025ms step_avg:157.19ms step:1099/1480 train_time:171196ms step_avg:157.20ms step:1100/1480 train_time:171367ms step_avg:157.22ms step:1101/1480 train_time:171538ms step_avg:157.23ms step:1102/1480 train_time:171710ms step_avg:157.24ms step:1103/1480 train_time:171886ms step_avg:157.26ms 
step:1104/1480 train_time:172054ms step_avg:157.27ms step:1105/1480 train_time:172225ms step_avg:157.28ms step:1106/1480 train_time:172392ms step_avg:157.29ms step:1107/1480 train_time:172563ms step_avg:157.30ms step:1108/1480 train_time:172728ms step_avg:157.31ms step:1109/1480 train_time:172895ms step_avg:157.32ms step:1110/1480 train_time:173063ms step_avg:157.33ms step:1111/1480 train_time:173230ms step_avg:157.34ms step:1112/1480 train_time:173401ms step_avg:157.35ms step:1113/1480 train_time:173582ms step_avg:157.37ms step:1114/1480 train_time:173753ms step_avg:157.39ms step:1115/1480 train_time:173926ms step_avg:157.40ms step:1116/1480 train_time:174094ms step_avg:157.41ms step:1117/1480 train_time:174267ms step_avg:157.42ms step:1118/1480 train_time:174440ms step_avg:157.44ms step:1119/1480 train_time:174605ms step_avg:157.44ms step:1120/1480 train_time:174775ms step_avg:157.45ms step:1121/1480 train_time:174945ms step_avg:157.47ms step:1122/1480 train_time:175111ms step_avg:157.47ms step:1123/1480 train_time:175278ms step_avg:157.48ms step:1124/1480 train_time:175445ms step_avg:157.49ms step:1125/1480 train_time:175612ms step_avg:157.50ms step:1125/1480 val_loss:3.3904 train_time:175680ms step_avg:157.56ms step:1126/1480 train_time:175783ms step_avg:157.51ms step:1127/1480 train_time:175953ms step_avg:157.52ms step:1128/1480 train_time:176125ms step_avg:157.54ms step:1129/1480 train_time:176297ms step_avg:157.55ms step:1130/1480 train_time:176468ms step_avg:157.56ms step:1131/1480 train_time:176646ms step_avg:157.58ms step:1132/1480 train_time:176811ms step_avg:157.59ms step:1133/1480 train_time:176984ms step_avg:157.60ms step:1134/1480 train_time:177155ms step_avg:157.61ms step:1135/1480 train_time:177323ms step_avg:157.62ms step:1136/1480 train_time:177492ms step_avg:157.63ms step:1137/1480 train_time:177662ms step_avg:157.64ms step:1138/1480 train_time:177833ms step_avg:157.65ms step:1139/1480 train_time:178003ms step_avg:157.66ms step:1140/1480 
train_time:178170ms step_avg:157.67ms step:1141/1480 train_time:178344ms step_avg:157.69ms step:1142/1480 train_time:178511ms step_avg:157.70ms step:1143/1480 train_time:178683ms step_avg:157.71ms step:1144/1480 train_time:178853ms step_avg:157.72ms step:1145/1480 train_time:179017ms step_avg:157.72ms step:1146/1480 train_time:179189ms step_avg:157.74ms step:1147/1480 train_time:179358ms step_avg:157.75ms step:1148/1480 train_time:179528ms step_avg:157.76ms step:1149/1480 train_time:179698ms step_avg:157.77ms step:1150/1480 train_time:179867ms step_avg:157.78ms step:1151/1480 train_time:180037ms step_avg:157.79ms step:1152/1480 train_time:180211ms step_avg:157.80ms step:1153/1480 train_time:180385ms step_avg:157.82ms step:1154/1480 train_time:180552ms step_avg:157.83ms step:1155/1480 train_time:180724ms step_avg:157.84ms step:1156/1480 train_time:180904ms step_avg:157.86ms step:1157/1480 train_time:181072ms step_avg:157.87ms step:1158/1480 train_time:181239ms step_avg:157.87ms step:1159/1480 train_time:181407ms step_avg:157.88ms step:1160/1480 train_time:181572ms step_avg:157.89ms step:1161/1480 train_time:181744ms step_avg:157.90ms step:1162/1480 train_time:181914ms step_avg:157.91ms step:1163/1480 train_time:182086ms step_avg:157.92ms step:1164/1480 train_time:182254ms step_avg:157.93ms step:1165/1480 train_time:182418ms step_avg:157.94ms step:1166/1480 train_time:182588ms step_avg:157.95ms step:1167/1480 train_time:182756ms step_avg:157.96ms step:1168/1480 train_time:182926ms step_avg:157.97ms step:1169/1480 train_time:183094ms step_avg:157.98ms step:1170/1480 train_time:183261ms step_avg:157.98ms step:1171/1480 train_time:183428ms step_avg:157.99ms step:1172/1480 train_time:183593ms step_avg:158.00ms step:1173/1480 train_time:183764ms step_avg:158.01ms step:1174/1480 train_time:183947ms step_avg:158.03ms step:1175/1480 train_time:184118ms step_avg:158.04ms step:1176/1480 train_time:184291ms step_avg:158.05ms step:1177/1480 train_time:184467ms step_avg:158.07ms 
step:1178/1480 train_time:184634ms step_avg:158.08ms step:1179/1480 train_time:184800ms step_avg:158.08ms step:1180/1480 train_time:184977ms step_avg:158.10ms step:1181/1480 train_time:185148ms step_avg:158.11ms step:1182/1480 train_time:185315ms step_avg:158.12ms step:1183/1480 train_time:185486ms step_avg:158.13ms step:1184/1480 train_time:185653ms step_avg:158.14ms step:1185/1480 train_time:185828ms step_avg:158.15ms step:1186/1480 train_time:185998ms step_avg:158.16ms step:1187/1480 train_time:186182ms step_avg:158.18ms step:1188/1480 train_time:186348ms step_avg:158.19ms step:1189/1480 train_time:186518ms step_avg:158.20ms step:1190/1480 train_time:186687ms step_avg:158.21ms step:1191/1480 train_time:186858ms step_avg:158.22ms step:1192/1480 train_time:187025ms step_avg:158.23ms step:1193/1480 train_time:187190ms step_avg:158.23ms step:1194/1480 train_time:187359ms step_avg:158.24ms step:1195/1480 train_time:187534ms step_avg:158.26ms step:1196/1480 train_time:187716ms step_avg:158.28ms step:1197/1480 train_time:187890ms step_avg:158.29ms step:1198/1480 train_time:188070ms step_avg:158.31ms step:1199/1480 train_time:188241ms step_avg:158.32ms step:1200/1480 train_time:188410ms step_avg:158.33ms step:1201/1480 train_time:188579ms step_avg:158.34ms step:1202/1480 train_time:188759ms step_avg:158.36ms step:1203/1480 train_time:188935ms step_avg:158.37ms step:1204/1480 train_time:189110ms step_avg:158.38ms step:1205/1480 train_time:189278ms step_avg:158.39ms step:1206/1480 train_time:189445ms step_avg:158.40ms step:1207/1480 train_time:189614ms step_avg:158.41ms step:1208/1480 train_time:189782ms step_avg:158.42ms step:1209/1480 train_time:189955ms step_avg:158.43ms step:1210/1480 train_time:190130ms step_avg:158.44ms step:1211/1480 train_time:190304ms step_avg:158.45ms step:1212/1480 train_time:190474ms step_avg:158.46ms step:1213/1480 train_time:190647ms step_avg:158.48ms step:1214/1480 train_time:190825ms step_avg:158.49ms step:1215/1480 train_time:190996ms 
step_avg:158.50ms step:1216/1480 train_time:191166ms step_avg:158.51ms step:1217/1480 train_time:191342ms step_avg:158.53ms step:1218/1480 train_time:191514ms step_avg:158.54ms step:1219/1480 train_time:191691ms step_avg:158.55ms step:1220/1480 train_time:191861ms step_avg:158.56ms step:1221/1480 train_time:192030ms step_avg:158.57ms step:1222/1480 train_time:192197ms step_avg:158.58ms step:1223/1480 train_time:192368ms step_avg:158.59ms step:1224/1480 train_time:192546ms step_avg:158.60ms step:1225/1480 train_time:192717ms step_avg:158.61ms step:1226/1480 train_time:192890ms step_avg:158.63ms step:1227/1480 train_time:193063ms step_avg:158.64ms step:1228/1480 train_time:193232ms step_avg:158.65ms step:1229/1480 train_time:193404ms step_avg:158.66ms step:1230/1480 train_time:193587ms step_avg:158.68ms step:1231/1480 train_time:193763ms step_avg:158.69ms step:1232/1480 train_time:193937ms step_avg:158.70ms step:1233/1480 train_time:194108ms step_avg:158.71ms step:1234/1480 train_time:194278ms step_avg:158.72ms step:1235/1480 train_time:194451ms step_avg:158.74ms step:1236/1480 train_time:194621ms step_avg:158.74ms step:1237/1480 train_time:194792ms step_avg:158.75ms step:1238/1480 train_time:194976ms step_avg:158.77ms step:1239/1480 train_time:195148ms step_avg:158.79ms step:1240/1480 train_time:195318ms step_avg:158.80ms step:1241/1480 train_time:195491ms step_avg:158.81ms step:1242/1480 train_time:195660ms step_avg:158.81ms step:1243/1480 train_time:195833ms step_avg:158.83ms step:1244/1480 train_time:196001ms step_avg:158.83ms step:1245/1480 train_time:196168ms step_avg:158.84ms step:1246/1480 train_time:196339ms step_avg:158.85ms step:1247/1480 train_time:196509ms step_avg:158.86ms step:1248/1480 train_time:196678ms step_avg:158.87ms step:1249/1480 train_time:196846ms step_avg:158.87ms step:1250/1480 train_time:197015ms step_avg:158.88ms step:1250/1480 val_loss:3.3397 train_time:197089ms step_avg:158.94ms step:1251/1480 train_time:197196ms step_avg:158.90ms 
step:1252/1480 train_time:197366ms step_avg:158.91ms step:1253/1480 train_time:197534ms step_avg:158.92ms step:1254/1480 train_time:197704ms step_avg:158.93ms step:1255/1480 train_time:197890ms step_avg:158.95ms step:1256/1480 train_time:198063ms step_avg:158.96ms step:1257/1480 train_time:198235ms step_avg:158.97ms step:1258/1480 train_time:198411ms step_avg:158.98ms step:1259/1480 train_time:198583ms step_avg:158.99ms step:1260/1480 train_time:198751ms step_avg:159.00ms step:1261/1480 train_time:198923ms step_avg:159.01ms step:1262/1480 train_time:199097ms step_avg:159.02ms step:1263/1480 train_time:199271ms step_avg:159.03ms step:1264/1480 train_time:199437ms step_avg:159.04ms step:1265/1480 train_time:199605ms step_avg:159.05ms step:1266/1480 train_time:199777ms step_avg:159.06ms step:1267/1480 train_time:199949ms step_avg:159.07ms step:1268/1480 train_time:200120ms step_avg:159.08ms step:1269/1480 train_time:200295ms step_avg:159.09ms step:1270/1480 train_time:200464ms step_avg:159.10ms step:1271/1480 train_time:200635ms step_avg:159.11ms step:1272/1480 train_time:200801ms step_avg:159.11ms step:1273/1480 train_time:200973ms step_avg:159.12ms step:1274/1480 train_time:201147ms step_avg:159.14ms step:1275/1480 train_time:201314ms step_avg:159.14ms step:1276/1480 train_time:201479ms step_avg:159.15ms step:1277/1480 train_time:201653ms step_avg:159.16ms step:1278/1480 train_time:201822ms step_avg:159.17ms step:1279/1480 train_time:201994ms step_avg:159.18ms step:1280/1480 train_time:202174ms step_avg:159.19ms step:1281/1480 train_time:202345ms step_avg:159.20ms step:1282/1480 train_time:202512ms step_avg:159.21ms step:1283/1480 train_time:202683ms step_avg:159.22ms step:1284/1480 train_time:202852ms step_avg:159.22ms step:1285/1480 train_time:203021ms step_avg:159.23ms step:1286/1480 train_time:203190ms step_avg:159.24ms step:1287/1480 train_time:203362ms step_avg:159.25ms step:1288/1480 train_time:203533ms step_avg:159.26ms step:1289/1480 train_time:203715ms 
step_avg:159.28ms step:1290/1480 train_time:203894ms step_avg:159.29ms step:1291/1480 train_time:204070ms step_avg:159.30ms step:1292/1480 train_time:204244ms step_avg:159.32ms step:1293/1480 train_time:204418ms step_avg:159.33ms step:1294/1480 train_time:204590ms step_avg:159.34ms step:1295/1480 train_time:204761ms step_avg:159.35ms step:1296/1480 train_time:204935ms step_avg:159.36ms step:1297/1480 train_time:205108ms step_avg:159.37ms step:1298/1480 train_time:205280ms step_avg:159.38ms step:1299/1480 train_time:205451ms step_avg:159.39ms step:1300/1480 train_time:205618ms step_avg:159.39ms step:1301/1480 train_time:205785ms step_avg:159.40ms step:1302/1480 train_time:205959ms step_avg:159.41ms step:1303/1480 train_time:206136ms step_avg:159.42ms step:1304/1480 train_time:206311ms step_avg:159.44ms step:1305/1480 train_time:206480ms step_avg:159.44ms step:1306/1480 train_time:206654ms step_avg:159.46ms step:1307/1480 train_time:206821ms step_avg:159.46ms step:1308/1480 train_time:206989ms step_avg:159.47ms step:1309/1480 train_time:207159ms step_avg:159.48ms step:1310/1480 train_time:207328ms step_avg:159.48ms step:1311/1480 train_time:207497ms step_avg:159.49ms step:1312/1480 train_time:207670ms step_avg:159.50ms step:1313/1480 train_time:207838ms step_avg:159.51ms step:1314/1480 train_time:208011ms step_avg:159.52ms step:1315/1480 train_time:208182ms step_avg:159.53ms step:1316/1480 train_time:208350ms step_avg:159.53ms step:1317/1480 train_time:208520ms step_avg:159.54ms step:1318/1480 train_time:208700ms step_avg:159.56ms step:1319/1480 train_time:208876ms step_avg:159.57ms step:1320/1480 train_time:209053ms step_avg:159.58ms step:1321/1480 train_time:209226ms step_avg:159.59ms step:1322/1480 train_time:209407ms step_avg:159.61ms step:1323/1480 train_time:209578ms step_avg:159.62ms step:1324/1480 train_time:209753ms step_avg:159.63ms step:1325/1480 train_time:209934ms step_avg:159.65ms step:1326/1480 train_time:210110ms step_avg:159.66ms step:1327/1480 
train_time:210281ms step_avg:159.67ms step:1328/1480 train_time:210453ms step_avg:159.68ms step:1329/1480 train_time:210648ms step_avg:159.70ms step:1330/1480 train_time:210827ms step_avg:159.72ms step:1331/1480 train_time:210998ms step_avg:159.73ms step:1332/1480 train_time:211174ms step_avg:159.74ms step:1333/1480 train_time:211350ms step_avg:159.75ms step:1334/1480 train_time:211521ms step_avg:159.76ms step:1335/1480 train_time:211690ms step_avg:159.77ms step:1336/1480 train_time:211873ms step_avg:159.78ms step:1337/1480 train_time:212049ms step_avg:159.80ms step:1338/1480 train_time:212219ms step_avg:159.80ms step:1339/1480 train_time:212393ms step_avg:159.81ms step:1340/1480 train_time:212566ms step_avg:159.82ms step:1341/1480 train_time:212733ms step_avg:159.83ms step:1342/1480 train_time:212908ms step_avg:159.84ms step:1343/1480 train_time:213077ms step_avg:159.85ms step:1344/1480 train_time:213251ms step_avg:159.86ms step:1345/1480 train_time:213428ms step_avg:159.87ms step:1346/1480 train_time:213596ms step_avg:159.88ms step:1347/1480 train_time:213767ms step_avg:159.89ms step:1348/1480 train_time:213936ms step_avg:159.89ms step:1349/1480 train_time:214106ms step_avg:159.90ms step:1350/1480 train_time:214280ms step_avg:159.91ms step:1351/1480 train_time:214451ms step_avg:159.92ms step:1352/1480 train_time:214621ms step_avg:159.93ms step:1353/1480 train_time:214795ms step_avg:159.94ms step:1354/1480 train_time:214967ms step_avg:159.95ms step:1355/1480 train_time:215136ms step_avg:159.95ms step:1356/1480 train_time:215309ms step_avg:159.96ms step:1357/1480 train_time:215483ms step_avg:159.97ms step:1358/1480 train_time:215655ms step_avg:159.98ms step:1359/1480 train_time:215827ms step_avg:159.99ms step:1360/1480 train_time:216002ms step_avg:160.00ms step:1361/1480 train_time:216179ms step_avg:160.01ms step:1362/1480 train_time:216354ms step_avg:160.03ms step:1363/1480 train_time:216535ms step_avg:160.04ms step:1364/1480 train_time:216705ms step_avg:160.05ms 
step:1365/1480 train_time:216871ms step_avg:160.05ms step:1366/1480 train_time:217044ms step_avg:160.06ms step:1367/1480 train_time:217213ms step_avg:160.07ms step:1368/1480 train_time:217385ms step_avg:160.08ms step:1369/1480 train_time:217565ms step_avg:160.09ms step:1370/1480 train_time:217745ms step_avg:160.11ms step:1371/1480 train_time:217915ms step_avg:160.11ms step:1372/1480 train_time:218092ms step_avg:160.13ms step:1373/1480 train_time:218262ms step_avg:160.13ms step:1374/1480 train_time:218438ms step_avg:160.15ms step:1375/1480 train_time:218610ms step_avg:160.15ms step:1375/1480 val_loss:3.3013 train_time:218677ms step_avg:160.20ms step:1376/1480 train_time:218785ms step_avg:160.16ms step:1377/1480 train_time:218955ms step_avg:160.17ms step:1378/1480 train_time:219123ms step_avg:160.18ms step:1379/1480 train_time:219299ms step_avg:160.19ms step:1380/1480 train_time:219472ms step_avg:160.20ms step:1381/1480 train_time:219654ms step_avg:160.21ms step:1382/1480 train_time:219826ms step_avg:160.22ms step:1383/1480 train_time:219999ms step_avg:160.23ms step:1384/1480 train_time:220176ms step_avg:160.24ms step:1385/1480 train_time:220342ms step_avg:160.25ms step:1386/1480 train_time:220513ms step_avg:160.26ms step:1387/1480 train_time:220684ms step_avg:160.26ms step:1388/1480 train_time:220854ms step_avg:160.27ms step:1389/1480 train_time:221026ms step_avg:160.28ms step:1390/1480 train_time:221195ms step_avg:160.29ms step:1391/1480 train_time:221365ms step_avg:160.29ms step:1392/1480 train_time:221538ms step_avg:160.30ms step:1393/1480 train_time:221707ms step_avg:160.31ms step:1394/1480 train_time:221879ms step_avg:160.32ms step:1395/1480 train_time:222048ms step_avg:160.32ms step:1396/1480 train_time:222217ms step_avg:160.33ms step:1397/1480 train_time:222385ms step_avg:160.34ms step:1398/1480 train_time:222553ms step_avg:160.34ms step:1399/1480 train_time:222723ms step_avg:160.35ms step:1400/1480 train_time:222899ms step_avg:160.36ms step:1401/1480 
train_time:223065ms step_avg:160.36ms step:1402/1480 train_time:223238ms step_avg:160.37ms step:1403/1480 train_time:223414ms step_avg:160.38ms step:1404/1480 train_time:223585ms step_avg:160.39ms step:1405/1480 train_time:223759ms step_avg:160.40ms step:1406/1480 train_time:223934ms step_avg:160.41ms step:1407/1480 train_time:224100ms step_avg:160.42ms step:1408/1480 train_time:224268ms step_avg:160.42ms step:1409/1480 train_time:224453ms step_avg:160.44ms step:1410/1480 train_time:224621ms step_avg:160.44ms step:1411/1480 train_time:224790ms step_avg:160.45ms step:1412/1480 train_time:224962ms step_avg:160.46ms step:1413/1480 train_time:225132ms step_avg:160.46ms step:1414/1480 train_time:225302ms step_avg:160.47ms step:1415/1480 train_time:225479ms step_avg:160.48ms step:1416/1480 train_time:225665ms step_avg:160.50ms step:1417/1480 train_time:225839ms step_avg:160.51ms step:1418/1480 train_time:226009ms step_avg:160.52ms step:1419/1480 train_time:226184ms step_avg:160.53ms step:1420/1480 train_time:226360ms step_avg:160.54ms step:1421/1480 train_time:226532ms step_avg:160.55ms step:1422/1480 train_time:226704ms step_avg:160.55ms step:1423/1480 train_time:226872ms step_avg:160.56ms step:1424/1480 train_time:227049ms step_avg:160.57ms step:1425/1480 train_time:227228ms step_avg:160.59ms step:1426/1480 train_time:227400ms step_avg:160.59ms step:1427/1480 train_time:227576ms step_avg:160.60ms step:1428/1480 train_time:227748ms step_avg:160.61ms step:1429/1480 train_time:227916ms step_avg:160.62ms step:1430/1480 train_time:228089ms step_avg:160.63ms step:1431/1480 train_time:228264ms step_avg:160.64ms step:1432/1480 train_time:228441ms step_avg:160.65ms step:1433/1480 train_time:228620ms step_avg:160.66ms step:1434/1480 train_time:228801ms step_avg:160.67ms step:1435/1480 train_time:228976ms step_avg:160.68ms step:1436/1480 train_time:229150ms step_avg:160.69ms step:1437/1480 train_time:229321ms step_avg:160.70ms step:1438/1480 train_time:229490ms step_avg:160.71ms 
step:1439/1480 train_time:229663ms step_avg:160.72ms step:1440/1480 train_time:229831ms step_avg:160.72ms step:1441/1480 train_time:230000ms step_avg:160.73ms step:1442/1480 train_time:230180ms step_avg:160.74ms step:1443/1480 train_time:230368ms step_avg:160.76ms step:1444/1480 train_time:230539ms step_avg:160.77ms step:1445/1480 train_time:230709ms step_avg:160.77ms step:1446/1480 train_time:230884ms step_avg:160.78ms step:1447/1480 train_time:231063ms step_avg:160.80ms step:1448/1480 train_time:231233ms step_avg:160.80ms step:1449/1480 train_time:231406ms step_avg:160.81ms step:1450/1480 train_time:231579ms step_avg:160.82ms step:1451/1480 train_time:231750ms step_avg:160.83ms step:1452/1480 train_time:231925ms step_avg:160.84ms step:1453/1480 train_time:232095ms step_avg:160.84ms step:1454/1480 train_time:232267ms step_avg:160.85ms step:1455/1480 train_time:232446ms step_avg:160.86ms step:1456/1480 train_time:232619ms step_avg:160.87ms step:1457/1480 train_time:232790ms step_avg:160.88ms step:1458/1480 train_time:232962ms step_avg:160.89ms step:1459/1480 train_time:233139ms step_avg:160.90ms step:1460/1480 train_time:233311ms step_avg:160.90ms step:1461/1480 train_time:233486ms step_avg:160.91ms step:1462/1480 train_time:233657ms step_avg:160.92ms step:1463/1480 train_time:233832ms step_avg:160.93ms step:1464/1480 train_time:234007ms step_avg:160.94ms step:1465/1480 train_time:234179ms step_avg:160.95ms step:1466/1480 train_time:234349ms step_avg:160.95ms step:1467/1480 train_time:234523ms step_avg:160.96ms step:1468/1480 train_time:234693ms step_avg:160.97ms step:1469/1480 train_time:234868ms step_avg:160.98ms step:1470/1480 train_time:235047ms step_avg:160.99ms step:1471/1480 train_time:235232ms step_avg:161.01ms step:1472/1480 train_time:235410ms step_avg:161.02ms step:1473/1480 train_time:235582ms step_avg:161.03ms step:1474/1480 train_time:235760ms step_avg:161.04ms step:1475/1480 train_time:235940ms step_avg:161.05ms step:1476/1480 train_time:236113ms 
step_avg:161.06ms step:1477/1480 train_time:236298ms step_avg:161.08ms step:1478/1480 train_time:236482ms step_avg:161.09ms step:1479/1480 train_time:236658ms step_avg:161.10ms step:1480/1480 train_time:236830ms step_avg:161.11ms step:1480/1480 val_loss:3.2822 train_time:236902ms step_avg:161.16ms