import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension; no learnable weight is passed."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding.

    The cos/sin tables are cached and rebuilt only when the sequence length (dimension 1 of the
    input) differs from the cached one.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of channels: base ** (-2i / dim)
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        """Rotate `x`; it is indexed as 4-D here, with sequence on dim 1 and channels on dim 3."""
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # insert singleton dims 0 and 2 so the tables broadcast against x
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Attend over `x` under `block_mask`.

        `vi` is mixed into the value projection with the learned `lambdas`; it is viewed to the
        per-head shape of `v`, so it must hold the same number of elements. The result has the
        shape of `x`.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim), hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: expand to 4*dim, squared ReLU, project back with a zero-initialized layer."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: a learned mix with the initial embedding, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # weights of (x, x0) in the input mix; initialized so that x passes through unchanged
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed into (-softcap, softcap) by a scaled tanh

class GPT(nn.Module):
    """GPT model with U-net style skip connections, token value embeddings and a soft-capped head."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss of `target` given the 1-D token sequence `idx`.

        Attention is restricted to be causal, to stay within one document, and to look back fewer
        than `sliding_window` tokens. `len(idx)` must be a multiple of BLOCK_SIZE, since the
        document ids are reshaped into rows of that length.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token 50256, used as a per-token document id
        # (presumably 50256 is the GPT-2 end-of-text token — TODO confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        # document id of the first / last token of every BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: key not after query, same document id, inside the window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the rule above, evaluated for every (query block, key block) pair
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can share a document only if their document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window length converted to blocks, rounding up
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort of a 0/1 mask lists the kept key blocks first, in ascending order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton dimensions in front of the per-query-block data
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one n_embd-wide value embedding per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() yields the encoder outputs in reverse order (last encoder layer first)
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a data shard and return the token count it declares (header[2])."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the header of `file` into a pinned-memory tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header: 256 int32 values
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (input, target) token windows from a set of shards, cycling through them in sorted order.

    Within a shard each process starts at `process_rank * T` and moves forward by
    `T * num_processes` tokens per batch.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one batch for every process, plus one token for the last target
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Go back to the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return `(x, y)` on the GPU: int32 inputs and int64 targets, the targets shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
# make this process's GPU the current device, so later bare "cuda" allocations land on it
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    # per-run directory; only needed by the checkpoint save, which is currently disabled
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # the directory holding the log file must exist before open(), otherwise a fresh checkout
    # fails with FileNotFoundError
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append `s` to the log file and, unless `logonly` is set, also print it to the console.

    Does nothing on every process other than the master process.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop runs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch ahead of the loop
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# the bfloat16 cast above covered every module; convert the CastedLinear layers back to float32
# (their forward casts the weight to the input dtype)
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# optimizer1: both embedding tables; optimizer2: the output head;
# optimizer3 (Muon): the 2-D parameters of the transformer blocks;
# optimizer4: their lower-dimensional parameters plus the skip weights
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the value handed to LambdaLR for iteration `it`: warmup ramp, then 1.0, then linear decay to 0."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size is kept in a GPU tensor that is updated in place (copy_) and passed to the model
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # (linear ramp from 64 to 1792 over the run, rounded down to a multiple of 64)
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average over processes, then over validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE: the torch.save call below is commented out, so `log` is built but never written
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # (linear from 0.85 to 0.95 over the first 300 steps)
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # approx_time is read without a torch.cuda.synchronize(), hence "approximate"
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
====================================================================================================
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
nvidia-smi:
Sun Dec 8 09:21:33 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M.
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 104W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 108W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 114W / 700W | 37MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 87W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 45C P0 92W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 95W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI 
PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23455ms step_avg:nanms step:2/1480 train_time:23542ms step_avg:nanms step:3/1480 train_time:23681ms step_avg:nanms step:4/1480 train_time:23823ms step_avg:nanms step:5/1480 train_time:23964ms step_avg:nanms step:6/1480 train_time:24105ms step_avg:nanms step:7/1480 train_time:24246ms step_avg:nanms step:8/1480 train_time:24387ms step_avg:nanms step:9/1480 train_time:24531ms step_avg:nanms step:10/1480 train_time:24673ms step_avg:nanms step:11/1480 train_time:144ms step_avg:nanms step:12/1480 train_time:287ms step_avg:nanms step:13/1480 train_time:429ms step_avg:143.10ms step:14/1480 train_time:571ms step_avg:142.81ms step:15/1480 train_time:713ms step_avg:142.66ms step:16/1480 train_time:856ms step_avg:142.69ms step:17/1480 train_time:997ms step_avg:142.45ms step:18/1480 train_time:1141ms step_avg:142.64ms step:19/1480 train_time:1285ms step_avg:142.78ms step:20/1480 train_time:1429ms step_avg:142.90ms step:21/1480 train_time:1571ms step_avg:142.86ms step:22/1480 train_time:1713ms step_avg:142.78ms step:23/1480 train_time:1855ms step_avg:142.71ms step:24/1480 train_time:1997ms step_avg:142.63ms step:25/1480 train_time:2138ms step_avg:142.56ms step:26/1480 train_time:2280ms step_avg:142.50ms step:27/1480 train_time:2423ms step_avg:142.54ms step:28/1480 train_time:2566ms step_avg:142.58ms step:29/1480 train_time:2709ms 
step_avg:142.59ms step:30/1480 train_time:2852ms step_avg:142.61ms step:31/1480 train_time:2995ms step_avg:142.61ms step:32/1480 train_time:3137ms step_avg:142.61ms step:33/1480 train_time:3282ms step_avg:142.70ms step:34/1480 train_time:3426ms step_avg:142.77ms step:35/1480 train_time:3571ms step_avg:142.85ms step:36/1480 train_time:3714ms step_avg:142.83ms step:37/1480 train_time:3856ms step_avg:142.80ms step:38/1480 train_time:3997ms step_avg:142.76ms step:39/1480 train_time:4139ms step_avg:142.71ms step:40/1480 train_time:4282ms step_avg:142.73ms step:41/1480 train_time:4428ms step_avg:142.83ms step:42/1480 train_time:4571ms step_avg:142.85ms step:43/1480 train_time:4714ms step_avg:142.86ms step:44/1480 train_time:4855ms step_avg:142.81ms step:45/1480 train_time:4996ms step_avg:142.75ms step:46/1480 train_time:5139ms step_avg:142.74ms step:47/1480 train_time:5282ms step_avg:142.76ms step:48/1480 train_time:5427ms step_avg:142.81ms step:49/1480 train_time:5571ms step_avg:142.86ms step:50/1480 train_time:5715ms step_avg:142.87ms step:51/1480 train_time:5857ms step_avg:142.85ms step:52/1480 train_time:5998ms step_avg:142.81ms step:53/1480 train_time:6140ms step_avg:142.80ms step:54/1480 train_time:6286ms step_avg:142.87ms step:55/1480 train_time:6429ms step_avg:142.86ms step:56/1480 train_time:6572ms step_avg:142.88ms step:57/1480 train_time:6715ms step_avg:142.88ms step:58/1480 train_time:6856ms step_avg:142.84ms step:59/1480 train_time:6998ms step_avg:142.82ms step:60/1480 train_time:7141ms step_avg:142.81ms step:61/1480 train_time:7285ms step_avg:142.84ms step:62/1480 train_time:7429ms step_avg:142.86ms step:63/1480 train_time:7573ms step_avg:142.88ms step:64/1480 train_time:7715ms step_avg:142.87ms step:65/1480 train_time:7857ms step_avg:142.85ms step:66/1480 train_time:7997ms step_avg:142.81ms step:67/1480 train_time:8141ms step_avg:142.82ms step:68/1480 train_time:8286ms step_avg:142.86ms step:69/1480 train_time:8430ms step_avg:142.87ms step:70/1480 
train_time:8573ms step_avg:142.89ms step:71/1480 train_time:8714ms step_avg:142.85ms step:72/1480 train_time:8855ms step_avg:142.82ms step:73/1480 train_time:8996ms step_avg:142.79ms step:74/1480 train_time:9138ms step_avg:142.78ms step:75/1480 train_time:9281ms step_avg:142.79ms step:76/1480 train_time:9426ms step_avg:142.81ms step:77/1480 train_time:9570ms step_avg:142.84ms step:78/1480 train_time:9714ms step_avg:142.85ms step:79/1480 train_time:9855ms step_avg:142.83ms step:80/1480 train_time:9996ms step_avg:142.80ms step:81/1480 train_time:10138ms step_avg:142.79ms step:82/1480 train_time:10281ms step_avg:142.79ms step:83/1480 train_time:10425ms step_avg:142.81ms step:84/1480 train_time:10570ms step_avg:142.83ms step:85/1480 train_time:10712ms step_avg:142.83ms step:86/1480 train_time:10854ms step_avg:142.81ms step:87/1480 train_time:10995ms step_avg:142.79ms step:88/1480 train_time:11136ms step_avg:142.77ms step:89/1480 train_time:11280ms step_avg:142.79ms step:90/1480 train_time:11421ms step_avg:142.77ms step:91/1480 train_time:11566ms step_avg:142.78ms step:92/1480 train_time:11709ms step_avg:142.80ms step:93/1480 train_time:11853ms step_avg:142.81ms step:94/1480 train_time:11994ms step_avg:142.79ms step:95/1480 train_time:12135ms step_avg:142.76ms step:96/1480 train_time:12277ms step_avg:142.75ms step:97/1480 train_time:12418ms step_avg:142.74ms step:98/1480 train_time:12560ms step_avg:142.73ms step:99/1480 train_time:12702ms step_avg:142.72ms step:100/1480 train_time:12845ms step_avg:142.72ms step:101/1480 train_time:12988ms step_avg:142.73ms step:102/1480 train_time:13130ms step_avg:142.72ms step:103/1480 train_time:13272ms step_avg:142.71ms step:104/1480 train_time:13414ms step_avg:142.70ms step:105/1480 train_time:13555ms step_avg:142.68ms step:106/1480 train_time:13696ms step_avg:142.67ms step:107/1480 train_time:13837ms step_avg:142.65ms step:108/1480 train_time:13979ms step_avg:142.64ms step:109/1480 train_time:14120ms step_avg:142.63ms step:110/1480 
train_time:14261ms step_avg:142.61ms step:111/1480 train_time:14405ms step_avg:142.63ms step:112/1480 train_time:14554ms step_avg:142.68ms step:113/1480 train_time:14699ms step_avg:142.71ms step:114/1480 train_time:14848ms step_avg:142.77ms step:115/1480 train_time:14995ms step_avg:142.81ms step:116/1480 train_time:15141ms step_avg:142.84ms step:117/1480 train_time:15288ms step_avg:142.88ms step:118/1480 train_time:15434ms step_avg:142.91ms step:119/1480 train_time:15580ms step_avg:142.94ms step:120/1480 train_time:15727ms step_avg:142.97ms step:121/1480 train_time:15876ms step_avg:143.02ms step:122/1480 train_time:16022ms step_avg:143.05ms step:123/1480 train_time:16170ms step_avg:143.10ms step:124/1480 train_time:16316ms step_avg:143.12ms step:125/1480 train_time:16463ms step_avg:143.15ms step:125/1480 val_loss:4.4183 train_time:16519ms step_avg:143.64ms step:126/1480 train_time:16615ms step_avg:143.23ms step:127/1480 train_time:16764ms step_avg:143.29ms step:128/1480 train_time:16911ms step_avg:143.32ms step:129/1480 train_time:17057ms step_avg:143.33ms step:130/1480 train_time:17203ms step_avg:143.36ms step:131/1480 train_time:17350ms step_avg:143.38ms step:132/1480 train_time:17495ms step_avg:143.40ms step:133/1480 train_time:17642ms step_avg:143.43ms step:134/1480 train_time:17790ms step_avg:143.47ms step:135/1480 train_time:17937ms step_avg:143.49ms step:136/1480 train_time:18084ms step_avg:143.52ms step:137/1480 train_time:18230ms step_avg:143.55ms step:138/1480 train_time:18376ms step_avg:143.56ms step:139/1480 train_time:18524ms step_avg:143.60ms step:140/1480 train_time:18671ms step_avg:143.62ms step:141/1480 train_time:18818ms step_avg:143.65ms step:142/1480 train_time:18965ms step_avg:143.68ms step:143/1480 train_time:19112ms step_avg:143.70ms step:144/1480 train_time:19258ms step_avg:143.72ms step:145/1480 train_time:19405ms step_avg:143.74ms step:146/1480 train_time:19551ms step_avg:143.76ms step:147/1480 train_time:19699ms step_avg:143.79ms 
step:148/1480 train_time:19847ms step_avg:143.82ms step:149/1480 train_time:19992ms step_avg:143.83ms step:150/1480 train_time:20139ms step_avg:143.85ms step:151/1480 train_time:20286ms step_avg:143.87ms step:152/1480 train_time:20432ms step_avg:143.89ms step:153/1480 train_time:20578ms step_avg:143.90ms step:154/1480 train_time:20726ms step_avg:143.93ms step:155/1480 train_time:20874ms step_avg:143.96ms step:156/1480 train_time:21022ms step_avg:143.99ms step:157/1480 train_time:21169ms step_avg:144.01ms step:158/1480 train_time:21314ms step_avg:144.01ms step:159/1480 train_time:21461ms step_avg:144.03ms step:160/1480 train_time:21608ms step_avg:144.05ms step:161/1480 train_time:21753ms step_avg:144.06ms step:162/1480 train_time:21900ms step_avg:144.08ms step:163/1480 train_time:22047ms step_avg:144.10ms step:164/1480 train_time:22193ms step_avg:144.11ms step:165/1480 train_time:22340ms step_avg:144.13ms step:166/1480 train_time:22487ms step_avg:144.15ms step:167/1480 train_time:22632ms step_avg:144.15ms step:168/1480 train_time:22780ms step_avg:144.18ms step:169/1480 train_time:22928ms step_avg:144.20ms step:170/1480 train_time:23074ms step_avg:144.22ms step:171/1480 train_time:23223ms step_avg:144.24ms step:172/1480 train_time:23369ms step_avg:144.25ms step:173/1480 train_time:23515ms step_avg:144.26ms step:174/1480 train_time:23661ms step_avg:144.28ms step:175/1480 train_time:23809ms step_avg:144.29ms step:176/1480 train_time:23953ms step_avg:144.30ms step:177/1480 train_time:24103ms step_avg:144.33ms step:178/1480 train_time:24250ms step_avg:144.35ms step:179/1480 train_time:24396ms step_avg:144.36ms step:180/1480 train_time:24544ms step_avg:144.37ms step:181/1480 train_time:24691ms step_avg:144.39ms step:182/1480 train_time:24836ms step_avg:144.40ms step:183/1480 train_time:24984ms step_avg:144.42ms step:184/1480 train_time:25131ms step_avg:144.43ms step:185/1480 train_time:25277ms step_avg:144.44ms step:186/1480 train_time:25425ms step_avg:144.46ms 
step:187/1480 train_time:25571ms step_avg:144.47ms step:188/1480 train_time:25719ms step_avg:144.49ms step:189/1480 train_time:25866ms step_avg:144.50ms step:190/1480 train_time:26012ms step_avg:144.51ms step:191/1480 train_time:26157ms step_avg:144.52ms step:192/1480 train_time:26306ms step_avg:144.54ms step:193/1480 train_time:26452ms step_avg:144.55ms step:194/1480 train_time:26599ms step_avg:144.56ms step:195/1480 train_time:26746ms step_avg:144.57ms step:196/1480 train_time:26892ms step_avg:144.58ms step:197/1480 train_time:27038ms step_avg:144.59ms step:198/1480 train_time:27186ms step_avg:144.60ms step:199/1480 train_time:27332ms step_avg:144.61ms step:200/1480 train_time:27481ms step_avg:144.64ms step:201/1480 train_time:27628ms step_avg:144.65ms step:202/1480 train_time:27775ms step_avg:144.66ms step:203/1480 train_time:27924ms step_avg:144.68ms step:204/1480 train_time:28071ms step_avg:144.70ms step:205/1480 train_time:28218ms step_avg:144.71ms step:206/1480 train_time:28366ms step_avg:144.73ms step:207/1480 train_time:28513ms step_avg:144.74ms step:208/1480 train_time:28659ms step_avg:144.74ms step:209/1480 train_time:28808ms step_avg:144.76ms step:210/1480 train_time:28952ms step_avg:144.76ms step:211/1480 train_time:29099ms step_avg:144.77ms step:212/1480 train_time:29246ms step_avg:144.78ms step:213/1480 train_time:29392ms step_avg:144.79ms step:214/1480 train_time:29537ms step_avg:144.79ms step:215/1480 train_time:29686ms step_avg:144.81ms step:216/1480 train_time:29832ms step_avg:144.82ms step:217/1480 train_time:29979ms step_avg:144.82ms step:218/1480 train_time:30128ms step_avg:144.85ms step:219/1480 train_time:30274ms step_avg:144.85ms step:220/1480 train_time:30420ms step_avg:144.86ms step:221/1480 train_time:30569ms step_avg:144.88ms step:222/1480 train_time:30719ms step_avg:144.90ms step:223/1480 train_time:30869ms step_avg:144.93ms step:224/1480 train_time:31019ms step_avg:144.95ms step:225/1480 train_time:31170ms step_avg:144.98ms 
step:226/1480 train_time:31321ms step_avg:145.00ms step:227/1480 train_time:31472ms step_avg:145.03ms step:228/1480 train_time:31623ms step_avg:145.06ms step:229/1480 train_time:31775ms step_avg:145.09ms step:230/1480 train_time:31926ms step_avg:145.12ms step:231/1480 train_time:32077ms step_avg:145.14ms step:232/1480 train_time:32227ms step_avg:145.17ms step:233/1480 train_time:32377ms step_avg:145.19ms step:234/1480 train_time:32528ms step_avg:145.21ms step:235/1480 train_time:32678ms step_avg:145.24ms step:236/1480 train_time:32831ms step_avg:145.27ms step:237/1480 train_time:32978ms step_avg:145.28ms step:238/1480 train_time:33129ms step_avg:145.30ms step:239/1480 train_time:33280ms step_avg:145.33ms step:240/1480 train_time:33431ms step_avg:145.35ms step:241/1480 train_time:33581ms step_avg:145.37ms step:242/1480 train_time:33732ms step_avg:145.40ms step:243/1480 train_time:33882ms step_avg:145.42ms step:244/1480 train_time:34033ms step_avg:145.44ms step:245/1480 train_time:34183ms step_avg:145.46ms step:246/1480 train_time:34334ms step_avg:145.48ms step:247/1480 train_time:34485ms step_avg:145.50ms step:248/1480 train_time:34634ms step_avg:145.52ms step:249/1480 train_time:34786ms step_avg:145.55ms step:250/1480 train_time:34936ms step_avg:145.57ms step:250/1480 val_loss:3.9950 train_time:34995ms step_avg:145.81ms step:251/1480 train_time:35091ms step_avg:145.60ms step:252/1480 train_time:35244ms step_avg:145.64ms step:253/1480 train_time:35394ms step_avg:145.65ms step:254/1480 train_time:35544ms step_avg:145.67ms step:255/1480 train_time:35692ms step_avg:145.68ms step:256/1480 train_time:35843ms step_avg:145.70ms step:257/1480 train_time:35991ms step_avg:145.71ms step:258/1480 train_time:36146ms step_avg:145.75ms step:259/1480 train_time:36297ms step_avg:145.77ms step:260/1480 train_time:36449ms step_avg:145.79ms step:261/1480 train_time:36599ms step_avg:145.81ms step:262/1480 train_time:36748ms step_avg:145.83ms step:263/1480 train_time:36898ms 
step_avg:145.84ms step:264/1480 train_time:37048ms step_avg:145.86ms step:265/1480 train_time:37199ms step_avg:145.88ms step:266/1480 train_time:37350ms step_avg:145.90ms step:267/1480 train_time:37501ms step_avg:145.92ms step:268/1480 train_time:37652ms step_avg:145.94ms step:269/1480 train_time:37803ms step_avg:145.96ms step:270/1480 train_time:37952ms step_avg:145.97ms step:271/1480 train_time:38103ms step_avg:145.99ms step:272/1480 train_time:38253ms step_avg:146.00ms step:273/1480 train_time:38405ms step_avg:146.03ms step:274/1480 train_time:38556ms step_avg:146.05ms step:275/1480 train_time:38708ms step_avg:146.07ms step:276/1480 train_time:38858ms step_avg:146.08ms step:277/1480 train_time:39008ms step_avg:146.10ms step:278/1480 train_time:39158ms step_avg:146.11ms step:279/1480 train_time:39308ms step_avg:146.13ms step:280/1480 train_time:39459ms step_avg:146.14ms step:281/1480 train_time:39610ms step_avg:146.16ms step:282/1480 train_time:39761ms step_avg:146.18ms step:283/1480 train_time:39911ms step_avg:146.19ms step:284/1480 train_time:40061ms step_avg:146.21ms step:285/1480 train_time:40212ms step_avg:146.22ms step:286/1480 train_time:40363ms step_avg:146.24ms step:287/1480 train_time:40513ms step_avg:146.25ms step:288/1480 train_time:40663ms step_avg:146.27ms step:289/1480 train_time:40813ms step_avg:146.28ms step:290/1480 train_time:40963ms step_avg:146.30ms step:291/1480 train_time:41114ms step_avg:146.31ms step:292/1480 train_time:41264ms step_avg:146.33ms step:293/1480 train_time:41414ms step_avg:146.34ms step:294/1480 train_time:41565ms step_avg:146.35ms step:295/1480 train_time:41715ms step_avg:146.37ms step:296/1480 train_time:41866ms step_avg:146.38ms step:297/1480 train_time:42017ms step_avg:146.40ms step:298/1480 train_time:42167ms step_avg:146.41ms step:299/1480 train_time:42318ms step_avg:146.43ms step:300/1480 train_time:42468ms step_avg:146.44ms step:301/1480 train_time:42619ms step_avg:146.46ms step:302/1480 train_time:42769ms 
step_avg:146.47ms step:303/1480 train_time:42920ms step_avg:146.48ms step:304/1480 train_time:43070ms step_avg:146.50ms step:305/1480 train_time:43222ms step_avg:146.52ms step:306/1480 train_time:43372ms step_avg:146.53ms step:307/1480 train_time:43523ms step_avg:146.54ms step:308/1480 train_time:43674ms step_avg:146.56ms step:309/1480 train_time:43826ms step_avg:146.58ms step:310/1480 train_time:43976ms step_avg:146.59ms step:311/1480 train_time:44127ms step_avg:146.60ms step:312/1480 train_time:44277ms step_avg:146.61ms step:313/1480 train_time:44430ms step_avg:146.63ms step:314/1480 train_time:44578ms step_avg:146.64ms step:315/1480 train_time:44728ms step_avg:146.65ms step:316/1480 train_time:44878ms step_avg:146.66ms step:317/1480 train_time:45029ms step_avg:146.67ms step:318/1480 train_time:45180ms step_avg:146.69ms step:319/1480 train_time:45331ms step_avg:146.70ms step:320/1480 train_time:45481ms step_avg:146.71ms step:321/1480 train_time:45632ms step_avg:146.73ms step:322/1480 train_time:45782ms step_avg:146.74ms step:323/1480 train_time:45933ms step_avg:146.75ms step:324/1480 train_time:46084ms step_avg:146.76ms step:325/1480 train_time:46234ms step_avg:146.77ms step:326/1480 train_time:46385ms step_avg:146.79ms step:327/1480 train_time:46535ms step_avg:146.80ms step:328/1480 train_time:46686ms step_avg:146.81ms step:329/1480 train_time:46836ms step_avg:146.82ms step:330/1480 train_time:46989ms step_avg:146.84ms step:331/1480 train_time:47144ms step_avg:146.87ms step:332/1480 train_time:47298ms step_avg:146.89ms step:333/1480 train_time:47452ms step_avg:146.91ms step:334/1480 train_time:47605ms step_avg:146.93ms step:335/1480 train_time:47759ms step_avg:146.95ms step:336/1480 train_time:47914ms step_avg:146.97ms step:337/1480 train_time:48068ms step_avg:147.00ms step:338/1480 train_time:48223ms step_avg:147.02ms step:339/1480 train_time:48378ms step_avg:147.05ms step:340/1480 train_time:48532ms step_avg:147.07ms step:341/1480 train_time:48685ms 
step_avg:147.08ms step:342/1480 train_time:48839ms step_avg:147.11ms step:343/1480 train_time:48994ms step_avg:147.13ms step:344/1480 train_time:49148ms step_avg:147.15ms step:345/1480 train_time:49303ms step_avg:147.17ms step:346/1480 train_time:49458ms step_avg:147.20ms step:347/1480 train_time:49613ms step_avg:147.22ms step:348/1480 train_time:49766ms step_avg:147.24ms step:349/1480 train_time:49918ms step_avg:147.25ms step:350/1480 train_time:50072ms step_avg:147.27ms step:351/1480 train_time:50225ms step_avg:147.29ms step:352/1480 train_time:50381ms step_avg:147.31ms step:353/1480 train_time:50535ms step_avg:147.33ms step:354/1480 train_time:50688ms step_avg:147.35ms step:355/1480 train_time:50842ms step_avg:147.37ms step:356/1480 train_time:50996ms step_avg:147.39ms step:357/1480 train_time:51150ms step_avg:147.41ms step:358/1480 train_time:51303ms step_avg:147.42ms step:359/1480 train_time:51458ms step_avg:147.45ms step:360/1480 train_time:51613ms step_avg:147.47ms step:361/1480 train_time:51768ms step_avg:147.49ms step:362/1480 train_time:51924ms step_avg:147.51ms step:363/1480 train_time:52078ms step_avg:147.53ms step:364/1480 train_time:52232ms step_avg:147.55ms step:365/1480 train_time:52387ms step_avg:147.57ms step:366/1480 train_time:52541ms step_avg:147.59ms step:367/1480 train_time:52694ms step_avg:147.60ms step:368/1480 train_time:52847ms step_avg:147.62ms step:369/1480 train_time:53001ms step_avg:147.64ms step:370/1480 train_time:53156ms step_avg:147.65ms step:371/1480 train_time:53310ms step_avg:147.67ms step:372/1480 train_time:53464ms step_avg:147.69ms step:373/1480 train_time:53619ms step_avg:147.71ms step:374/1480 train_time:53772ms step_avg:147.73ms step:375/1480 train_time:53925ms step_avg:147.74ms step:375/1480 val_loss:3.8050 train_time:53986ms step_avg:147.91ms step:376/1480 train_time:54086ms step_avg:147.78ms step:377/1480 train_time:54242ms step_avg:147.80ms step:378/1480 train_time:54394ms step_avg:147.81ms step:379/1480 
train_time:54547ms step_avg:147.82ms step:380/1480 train_time:54699ms step_avg:147.83ms step:381/1480 train_time:54850ms step_avg:147.84ms step:382/1480 train_time:55004ms step_avg:147.86ms step:383/1480 train_time:55160ms step_avg:147.88ms step:384/1480 train_time:55314ms step_avg:147.90ms step:385/1480 train_time:55468ms step_avg:147.91ms step:386/1480 train_time:55622ms step_avg:147.93ms step:387/1480 train_time:55776ms step_avg:147.95ms step:388/1480 train_time:55928ms step_avg:147.96ms step:389/1480 train_time:56081ms step_avg:147.97ms step:390/1480 train_time:56236ms step_avg:147.99ms step:391/1480 train_time:56390ms step_avg:148.00ms step:392/1480 train_time:56545ms step_avg:148.02ms step:393/1480 train_time:56700ms step_avg:148.04ms step:394/1480 train_time:56854ms step_avg:148.06ms step:395/1480 train_time:57007ms step_avg:148.07ms step:396/1480 train_time:57160ms step_avg:148.08ms step:397/1480 train_time:57314ms step_avg:148.10ms step:398/1480 train_time:57468ms step_avg:148.11ms step:399/1480 train_time:57622ms step_avg:148.13ms step:400/1480 train_time:57777ms step_avg:148.15ms step:401/1480 train_time:57930ms step_avg:148.16ms step:402/1480 train_time:58084ms step_avg:148.17ms step:403/1480 train_time:58239ms step_avg:148.19ms step:404/1480 train_time:58391ms step_avg:148.20ms step:405/1480 train_time:58547ms step_avg:148.22ms step:406/1480 train_time:58701ms step_avg:148.24ms step:407/1480 train_time:58854ms step_avg:148.25ms step:408/1480 train_time:59009ms step_avg:148.26ms step:409/1480 train_time:59164ms step_avg:148.28ms step:410/1480 train_time:59318ms step_avg:148.30ms step:411/1480 train_time:59471ms step_avg:148.31ms step:412/1480 train_time:59625ms step_avg:148.32ms step:413/1480 train_time:59779ms step_avg:148.34ms step:414/1480 train_time:59934ms step_avg:148.35ms step:415/1480 train_time:60087ms step_avg:148.36ms step:416/1480 train_time:60242ms step_avg:148.38ms step:417/1480 train_time:60394ms step_avg:148.39ms step:418/1480 
train_time:60548ms step_avg:148.40ms step:419/1480 train_time:60702ms step_avg:148.42ms step:420/1480 train_time:60855ms step_avg:148.43ms step:421/1480 train_time:61009ms step_avg:148.44ms step:422/1480 train_time:61162ms step_avg:148.45ms step:423/1480 train_time:61316ms step_avg:148.46ms step:424/1480 train_time:61469ms step_avg:148.48ms step:425/1480 train_time:61623ms step_avg:148.49ms step:426/1480 train_time:61776ms step_avg:148.50ms step:427/1480 train_time:61929ms step_avg:148.51ms step:428/1480 train_time:62083ms step_avg:148.52ms step:429/1480 train_time:62238ms step_avg:148.54ms step:430/1480 train_time:62391ms step_avg:148.55ms step:431/1480 train_time:62545ms step_avg:148.56ms step:432/1480 train_time:62700ms step_avg:148.58ms step:433/1480 train_time:62853ms step_avg:148.59ms step:434/1480 train_time:63006ms step_avg:148.60ms step:435/1480 train_time:63160ms step_avg:148.61ms step:436/1480 train_time:63314ms step_avg:148.62ms step:437/1480 train_time:63468ms step_avg:148.64ms step:438/1480 train_time:63623ms step_avg:148.65ms step:439/1480 train_time:63777ms step_avg:148.67ms step:440/1480 train_time:63932ms step_avg:148.68ms step:441/1480 train_time:64088ms step_avg:148.70ms step:442/1480 train_time:64246ms step_avg:148.72ms step:443/1480 train_time:64403ms step_avg:148.74ms step:444/1480 train_time:64560ms step_avg:148.76ms step:445/1480 train_time:64717ms step_avg:148.77ms step:446/1480 train_time:64873ms step_avg:148.79ms step:447/1480 train_time:65030ms step_avg:148.81ms step:448/1480 train_time:65186ms step_avg:148.83ms step:449/1480 train_time:65346ms step_avg:148.85ms step:450/1480 train_time:65505ms step_avg:148.87ms step:451/1480 train_time:65664ms step_avg:148.90ms step:452/1480 train_time:65820ms step_avg:148.91ms step:453/1480 train_time:65975ms step_avg:148.93ms step:454/1480 train_time:66131ms step_avg:148.94ms step:455/1480 train_time:66287ms step_avg:148.96ms step:456/1480 train_time:66446ms step_avg:148.98ms step:457/1480 
train_time:66604ms step_avg:149.00ms step:458/1480 train_time:66760ms step_avg:149.02ms step:459/1480 train_time:66917ms step_avg:149.04ms step:460/1480 train_time:67074ms step_avg:149.05ms step:461/1480 train_time:67232ms step_avg:149.07ms step:462/1480 train_time:67389ms step_avg:149.09ms step:463/1480 train_time:67545ms step_avg:149.11ms step:464/1480 train_time:67703ms step_avg:149.13ms step:465/1480 train_time:67859ms step_avg:149.14ms step:466/1480 train_time:68016ms step_avg:149.16ms step:467/1480 train_time:68173ms step_avg:149.17ms step:468/1480 train_time:68329ms step_avg:149.19ms step:469/1480 train_time:68484ms step_avg:149.20ms step:470/1480 train_time:68643ms step_avg:149.22ms step:471/1480 train_time:68800ms step_avg:149.24ms step:472/1480 train_time:68956ms step_avg:149.26ms step:473/1480 train_time:69112ms step_avg:149.27ms step:474/1480 train_time:69269ms step_avg:149.29ms step:475/1480 train_time:69425ms step_avg:149.30ms step:476/1480 train_time:69582ms step_avg:149.32ms step:477/1480 train_time:69740ms step_avg:149.34ms step:478/1480 train_time:69898ms step_avg:149.35ms step:479/1480 train_time:70053ms step_avg:149.37ms step:480/1480 train_time:70211ms step_avg:149.39ms step:481/1480 train_time:70369ms step_avg:149.40ms step:482/1480 train_time:70525ms step_avg:149.42ms step:483/1480 train_time:70681ms step_avg:149.43ms step:484/1480 train_time:70837ms step_avg:149.45ms step:485/1480 train_time:70995ms step_avg:149.46ms step:486/1480 train_time:71152ms step_avg:149.48ms step:487/1480 train_time:71309ms step_avg:149.50ms step:488/1480 train_time:71467ms step_avg:149.51ms step:489/1480 train_time:71626ms step_avg:149.53ms step:490/1480 train_time:71783ms step_avg:149.55ms step:491/1480 train_time:71940ms step_avg:149.56ms step:492/1480 train_time:72095ms step_avg:149.57ms step:493/1480 train_time:72252ms step_avg:149.59ms step:494/1480 train_time:72409ms step_avg:149.61ms step:495/1480 train_time:72568ms step_avg:149.62ms step:496/1480 
train_time:72727ms step_avg:149.64ms step:497/1480 train_time:72884ms step_avg:149.66ms step:498/1480 train_time:73043ms step_avg:149.68ms step:499/1480 train_time:73201ms step_avg:149.69ms step:500/1480 train_time:73358ms step_avg:149.71ms step:500/1480 val_loss:3.6877 train_time:73420ms step_avg:149.84ms step:501/1480 train_time:73520ms step_avg:149.74ms step:502/1480 train_time:73679ms step_avg:149.75ms step:503/1480 train_time:73834ms step_avg:149.76ms step:504/1480 train_time:73989ms step_avg:149.78ms step:505/1480 train_time:74144ms step_avg:149.79ms step:506/1480 train_time:74302ms step_avg:149.80ms step:507/1480 train_time:74458ms step_avg:149.82ms step:508/1480 train_time:74616ms step_avg:149.83ms step:509/1480 train_time:74772ms step_avg:149.84ms step:510/1480 train_time:74930ms step_avg:149.86ms step:511/1480 train_time:75086ms step_avg:149.87ms step:512/1480 train_time:75244ms step_avg:149.89ms step:513/1480 train_time:75399ms step_avg:149.90ms step:514/1480 train_time:75555ms step_avg:149.91ms step:515/1480 train_time:75711ms step_avg:149.92ms step:516/1480 train_time:75869ms step_avg:149.94ms step:517/1480 train_time:76027ms step_avg:149.95ms step:518/1480 train_time:76185ms step_avg:149.97ms step:519/1480 train_time:76343ms step_avg:149.99ms step:520/1480 train_time:76501ms step_avg:150.00ms step:521/1480 train_time:76658ms step_avg:150.01ms step:522/1480 train_time:76815ms step_avg:150.03ms step:523/1480 train_time:76971ms step_avg:150.04ms step:524/1480 train_time:77127ms step_avg:150.05ms step:525/1480 train_time:77284ms step_avg:150.07ms step:526/1480 train_time:77443ms step_avg:150.08ms step:527/1480 train_time:77600ms step_avg:150.10ms step:528/1480 train_time:77757ms step_avg:150.11ms step:529/1480 train_time:77914ms step_avg:150.12ms step:530/1480 train_time:78070ms step_avg:150.14ms step:531/1480 train_time:78227ms step_avg:150.15ms step:532/1480 train_time:78384ms step_avg:150.16ms step:533/1480 train_time:78541ms step_avg:150.17ms 
step:534/1480 train_time:78697ms step_avg:150.18ms step:535/1480 train_time:78853ms step_avg:150.20ms step:536/1480 train_time:79011ms step_avg:150.21ms step:537/1480 train_time:79168ms step_avg:150.22ms step:538/1480 train_time:79326ms step_avg:150.24ms step:539/1480 train_time:79484ms step_avg:150.25ms step:540/1480 train_time:79641ms step_avg:150.27ms step:541/1480 train_time:79798ms step_avg:150.28ms step:542/1480 train_time:79955ms step_avg:150.29ms step:543/1480 train_time:80112ms step_avg:150.30ms step:544/1480 train_time:80268ms step_avg:150.32ms step:545/1480 train_time:80426ms step_avg:150.33ms step:546/1480 train_time:80584ms step_avg:150.34ms step:547/1480 train_time:80741ms step_avg:150.36ms step:548/1480 train_time:80901ms step_avg:150.37ms step:549/1480 train_time:81057ms step_avg:150.38ms step:550/1480 train_time:81215ms step_avg:150.40ms step:551/1480 train_time:81373ms step_avg:150.41ms step:552/1480 train_time:81531ms step_avg:150.43ms step:553/1480 train_time:81690ms step_avg:150.44ms step:554/1480 train_time:81849ms step_avg:150.46ms step:555/1480 train_time:82009ms step_avg:150.48ms step:556/1480 train_time:82168ms step_avg:150.49ms step:557/1480 train_time:82329ms step_avg:150.51ms step:558/1480 train_time:82489ms step_avg:150.53ms step:559/1480 train_time:82648ms step_avg:150.54ms step:560/1480 train_time:82808ms step_avg:150.56ms step:561/1480 train_time:82967ms step_avg:150.58ms step:562/1480 train_time:83127ms step_avg:150.59ms step:563/1480 train_time:83286ms step_avg:150.61ms step:564/1480 train_time:83445ms step_avg:150.62ms step:565/1480 train_time:83604ms step_avg:150.64ms step:566/1480 train_time:83765ms step_avg:150.66ms step:567/1480 train_time:83925ms step_avg:150.67ms step:568/1480 train_time:84083ms step_avg:150.69ms step:569/1480 train_time:84243ms step_avg:150.70ms step:570/1480 train_time:84402ms step_avg:150.72ms step:571/1480 train_time:84561ms step_avg:150.73ms step:572/1480 train_time:84720ms step_avg:150.75ms 
step:573/1480 train_time:84883ms step_avg:150.77ms step:574/1480 train_time:85045ms step_avg:150.79ms step:575/1480 train_time:85207ms step_avg:150.81ms step:576/1480 train_time:85366ms step_avg:150.82ms step:577/1480 train_time:85527ms step_avg:150.84ms step:578/1480 train_time:85686ms step_avg:150.86ms step:579/1480 train_time:85845ms step_avg:150.87ms step:580/1480 train_time:86006ms step_avg:150.89ms step:581/1480 train_time:86167ms step_avg:150.91ms step:582/1480 train_time:86328ms step_avg:150.92ms step:583/1480 train_time:86488ms step_avg:150.94ms step:584/1480 train_time:86647ms step_avg:150.95ms step:585/1480 train_time:86806ms step_avg:150.97ms step:586/1480 train_time:86967ms step_avg:150.98ms step:587/1480 train_time:87128ms step_avg:151.00ms step:588/1480 train_time:87288ms step_avg:151.02ms step:589/1480 train_time:87448ms step_avg:151.03ms step:590/1480 train_time:87608ms step_avg:151.05ms step:591/1480 train_time:87767ms step_avg:151.06ms step:592/1480 train_time:87927ms step_avg:151.08ms step:593/1480 train_time:88088ms step_avg:151.09ms step:594/1480 train_time:88249ms step_avg:151.11ms step:595/1480 train_time:88410ms step_avg:151.13ms step:596/1480 train_time:88570ms step_avg:151.14ms step:597/1480 train_time:88730ms step_avg:151.16ms step:598/1480 train_time:88887ms step_avg:151.17ms step:599/1480 train_time:89046ms step_avg:151.18ms step:600/1480 train_time:89208ms step_avg:151.20ms step:601/1480 train_time:89367ms step_avg:151.21ms step:602/1480 train_time:89528ms step_avg:151.23ms step:603/1480 train_time:89689ms step_avg:151.25ms step:604/1480 train_time:89847ms step_avg:151.26ms step:605/1480 train_time:90006ms step_avg:151.27ms step:606/1480 train_time:90168ms step_avg:151.29ms step:607/1480 train_time:90330ms step_avg:151.31ms step:608/1480 train_time:90489ms step_avg:151.32ms step:609/1480 train_time:90649ms step_avg:151.33ms step:610/1480 train_time:90808ms step_avg:151.35ms step:611/1480 train_time:90968ms step_avg:151.36ms 
step:612/1480 train_time:91129ms step_avg:151.38ms step:613/1480 train_time:91290ms step_avg:151.39ms step:614/1480 train_time:91449ms step_avg:151.41ms step:615/1480 train_time:91608ms step_avg:151.42ms step:616/1480 train_time:91765ms step_avg:151.43ms step:617/1480 train_time:91926ms step_avg:151.44ms step:618/1480 train_time:92086ms step_avg:151.46ms step:619/1480 train_time:92246ms step_avg:151.47ms step:620/1480 train_time:92407ms step_avg:151.49ms step:621/1480 train_time:92567ms step_avg:151.50ms step:622/1480 train_time:92727ms step_avg:151.51ms step:623/1480 train_time:92888ms step_avg:151.53ms step:624/1480 train_time:93048ms step_avg:151.54ms step:625/1480 train_time:93208ms step_avg:151.56ms step:625/1480 val_loss:3.6063 train_time:93271ms step_avg:151.66ms step:626/1480 train_time:93369ms step_avg:151.57ms step:627/1480 train_time:93528ms step_avg:151.58ms step:628/1480 train_time:93686ms step_avg:151.60ms step:629/1480 train_time:93843ms step_avg:151.60ms step:630/1480 train_time:94002ms step_avg:151.62ms step:631/1480 train_time:94161ms step_avg:151.63ms step:632/1480 train_time:94321ms step_avg:151.64ms step:633/1480 train_time:94481ms step_avg:151.66ms step:634/1480 train_time:94641ms step_avg:151.67ms step:635/1480 train_time:94801ms step_avg:151.68ms step:636/1480 train_time:94960ms step_avg:151.69ms step:637/1480 train_time:95120ms step_avg:151.71ms step:638/1480 train_time:95280ms step_avg:151.72ms step:639/1480 train_time:95440ms step_avg:151.73ms step:640/1480 train_time:95601ms step_avg:151.75ms step:641/1480 train_time:95762ms step_avg:151.76ms step:642/1480 train_time:95921ms step_avg:151.77ms step:643/1480 train_time:96081ms step_avg:151.79ms step:644/1480 train_time:96240ms step_avg:151.80ms step:645/1480 train_time:96400ms step_avg:151.81ms step:646/1480 train_time:96561ms step_avg:151.82ms step:647/1480 train_time:96721ms step_avg:151.84ms step:648/1480 train_time:96882ms step_avg:151.85ms step:649/1480 train_time:97041ms 
step_avg:151.86ms step:650/1480 train_time:97201ms step_avg:151.88ms step:651/1480 train_time:97361ms step_avg:151.89ms step:652/1480 train_time:97521ms step_avg:151.90ms step:653/1480 train_time:97681ms step_avg:151.92ms step:654/1480 train_time:97840ms step_avg:151.93ms step:655/1480 train_time:98001ms step_avg:151.94ms step:656/1480 train_time:98160ms step_avg:151.95ms step:657/1480 train_time:98319ms step_avg:151.96ms step:658/1480 train_time:98479ms step_avg:151.97ms step:659/1480 train_time:98641ms step_avg:151.99ms step:660/1480 train_time:98803ms step_avg:152.01ms step:661/1480 train_time:98965ms step_avg:152.02ms step:662/1480 train_time:99125ms step_avg:152.03ms step:663/1480 train_time:99285ms step_avg:152.04ms step:664/1480 train_time:99448ms step_avg:152.06ms step:665/1480 train_time:99609ms step_avg:152.07ms step:666/1480 train_time:99769ms step_avg:152.09ms step:667/1480 train_time:99930ms step_avg:152.10ms step:668/1480 train_time:100092ms step_avg:152.12ms step:669/1480 train_time:100255ms step_avg:152.13ms step:670/1480 train_time:100414ms step_avg:152.14ms step:671/1480 train_time:100576ms step_avg:152.16ms step:672/1480 train_time:100737ms step_avg:152.17ms step:673/1480 train_time:100902ms step_avg:152.19ms step:674/1480 train_time:101064ms step_avg:152.21ms step:675/1480 train_time:101225ms step_avg:152.22ms step:676/1480 train_time:101387ms step_avg:152.23ms step:677/1480 train_time:101547ms step_avg:152.24ms step:678/1480 train_time:101708ms step_avg:152.26ms step:679/1480 train_time:101870ms step_avg:152.27ms step:680/1480 train_time:102030ms step_avg:152.28ms step:681/1480 train_time:102192ms step_avg:152.30ms step:682/1480 train_time:102354ms step_avg:152.31ms step:683/1480 train_time:102516ms step_avg:152.33ms step:684/1480 train_time:102678ms step_avg:152.34ms step:685/1480 train_time:102842ms step_avg:152.36ms step:686/1480 train_time:103004ms step_avg:152.37ms step:687/1480 train_time:103164ms step_avg:152.38ms step:688/1480 
train_time:103327ms step_avg:152.40ms step:689/1480 train_time:103490ms step_avg:152.41ms step:690/1480 train_time:103653ms step_avg:152.43ms step:691/1480 train_time:103815ms step_avg:152.44ms step:692/1480 train_time:103976ms step_avg:152.46ms step:693/1480 train_time:104139ms step_avg:152.47ms step:694/1480 train_time:104302ms step_avg:152.49ms step:695/1480 train_time:104463ms step_avg:152.50ms step:696/1480 train_time:104623ms step_avg:152.51ms step:697/1480 train_time:104785ms step_avg:152.53ms step:698/1480 train_time:104945ms step_avg:152.54ms step:699/1480 train_time:105108ms step_avg:152.55ms step:700/1480 train_time:105270ms step_avg:152.56ms step:701/1480 train_time:105428ms step_avg:152.57ms step:702/1480 train_time:105589ms step_avg:152.59ms step:703/1480 train_time:105749ms step_avg:152.60ms step:704/1480 train_time:105909ms step_avg:152.61ms step:705/1480 train_time:106071ms step_avg:152.62ms step:706/1480 train_time:106235ms step_avg:152.64ms step:707/1480 train_time:106396ms step_avg:152.65ms step:708/1480 train_time:106558ms step_avg:152.66ms step:709/1480 train_time:106722ms step_avg:152.68ms step:710/1480 train_time:106882ms step_avg:152.69ms step:711/1480 train_time:107043ms step_avg:152.70ms step:712/1480 train_time:107207ms step_avg:152.72ms step:713/1480 train_time:107370ms step_avg:152.73ms step:714/1480 train_time:107530ms step_avg:152.74ms step:715/1480 train_time:107689ms step_avg:152.75ms step:716/1480 train_time:107850ms step_avg:152.76ms step:717/1480 train_time:108013ms step_avg:152.78ms step:718/1480 train_time:108173ms step_avg:152.79ms step:719/1480 train_time:108333ms step_avg:152.80ms step:720/1480 train_time:108498ms step_avg:152.81ms step:721/1480 train_time:108662ms step_avg:152.83ms step:722/1480 train_time:108824ms step_avg:152.84ms step:723/1480 train_time:108984ms step_avg:152.85ms step:724/1480 train_time:109146ms step_avg:152.87ms step:725/1480 train_time:109308ms step_avg:152.88ms step:726/1480 train_time:109471ms 
step_avg:152.89ms step:727/1480 train_time:109635ms step_avg:152.91ms step:728/1480 train_time:109797ms step_avg:152.92ms step:729/1480 train_time:109959ms step_avg:152.93ms step:730/1480 train_time:110123ms step_avg:152.95ms step:731/1480 train_time:110284ms step_avg:152.96ms step:732/1480 train_time:110443ms step_avg:152.97ms step:733/1480 train_time:110605ms step_avg:152.98ms step:734/1480 train_time:110767ms step_avg:152.99ms step:735/1480 train_time:110927ms step_avg:153.00ms step:736/1480 train_time:111089ms step_avg:153.02ms step:737/1480 train_time:111249ms step_avg:153.02ms step:738/1480 train_time:111409ms step_avg:153.03ms step:739/1480 train_time:111569ms step_avg:153.04ms step:740/1480 train_time:111734ms step_avg:153.06ms step:741/1480 train_time:111898ms step_avg:153.07ms step:742/1480 train_time:112061ms step_avg:153.09ms step:743/1480 train_time:112222ms step_avg:153.10ms step:744/1480 train_time:112386ms step_avg:153.11ms step:745/1480 train_time:112549ms step_avg:153.13ms step:746/1480 train_time:112708ms step_avg:153.14ms step:747/1480 train_time:112871ms step_avg:153.15ms step:748/1480 train_time:113036ms step_avg:153.17ms step:749/1480 train_time:113202ms step_avg:153.18ms step:750/1480 train_time:113362ms step_avg:153.19ms step:750/1480 val_loss:3.5492 train_time:113427ms step_avg:153.28ms step:751/1480 train_time:113527ms step_avg:153.21ms step:752/1480 train_time:113687ms step_avg:153.22ms step:753/1480 train_time:113848ms step_avg:153.23ms step:754/1480 train_time:114008ms step_avg:153.24ms step:755/1480 train_time:114168ms step_avg:153.25ms step:756/1480 train_time:114329ms step_avg:153.26ms step:757/1480 train_time:114494ms step_avg:153.27ms step:758/1480 train_time:114657ms step_avg:153.29ms step:759/1480 train_time:114819ms step_avg:153.30ms step:760/1480 train_time:114981ms step_avg:153.31ms step:761/1480 train_time:115142ms step_avg:153.32ms step:762/1480 train_time:115303ms step_avg:153.33ms step:763/1480 train_time:115465ms 
step_avg:153.34ms step:764/1480 train_time:115626ms step_avg:153.35ms step:765/1480 train_time:115787ms step_avg:153.36ms step:766/1480 train_time:115951ms step_avg:153.37ms step:767/1480 train_time:116114ms step_avg:153.39ms step:768/1480 train_time:116277ms step_avg:153.40ms step:769/1480 train_time:116441ms step_avg:153.41ms step:770/1480 train_time:116603ms step_avg:153.42ms step:771/1480 train_time:116768ms step_avg:153.44ms step:772/1480 train_time:116928ms step_avg:153.45ms step:773/1480 train_time:117091ms step_avg:153.46ms step:774/1480 train_time:117256ms step_avg:153.48ms step:775/1480 train_time:117419ms step_avg:153.49ms step:776/1480 train_time:117583ms step_avg:153.50ms step:777/1480 train_time:117747ms step_avg:153.52ms step:778/1480 train_time:117911ms step_avg:153.53ms step:779/1480 train_time:118073ms step_avg:153.54ms step:780/1480 train_time:118238ms step_avg:153.56ms step:781/1480 train_time:118401ms step_avg:153.57ms step:782/1480 train_time:118566ms step_avg:153.58ms step:783/1480 train_time:118726ms step_avg:153.59ms step:784/1480 train_time:118890ms step_avg:153.60ms step:785/1480 train_time:119053ms step_avg:153.62ms step:786/1480 train_time:119220ms step_avg:153.63ms step:787/1480 train_time:119383ms step_avg:153.65ms step:788/1480 train_time:119546ms step_avg:153.66ms step:789/1480 train_time:119708ms step_avg:153.67ms step:790/1480 train_time:119873ms step_avg:153.68ms step:791/1480 train_time:120040ms step_avg:153.70ms step:792/1480 train_time:120204ms step_avg:153.71ms step:793/1480 train_time:120365ms step_avg:153.72ms step:794/1480 train_time:120530ms step_avg:153.74ms step:795/1480 train_time:120697ms step_avg:153.75ms step:796/1480 train_time:120863ms step_avg:153.77ms step:797/1480 train_time:121026ms step_avg:153.78ms step:798/1480 train_time:121189ms step_avg:153.79ms step:799/1480 train_time:121357ms step_avg:153.81ms step:800/1480 train_time:121521ms step_avg:153.82ms step:801/1480 train_time:121683ms step_avg:153.83ms 
step:802/1480 train_time:121851ms step_avg:153.85ms step:803/1480 train_time:122013ms step_avg:153.86ms step:804/1480 train_time:122175ms step_avg:153.87ms step:805/1480 train_time:122341ms step_avg:153.89ms step:806/1480 train_time:122502ms step_avg:153.90ms step:807/1480 train_time:122664ms step_avg:153.91ms step:808/1480 train_time:122828ms step_avg:153.92ms step:809/1480 train_time:122989ms step_avg:153.93ms step:810/1480 train_time:123151ms step_avg:153.94ms step:811/1480 train_time:123315ms step_avg:153.95ms step:812/1480 train_time:123480ms step_avg:153.97ms step:813/1480 train_time:123641ms step_avg:153.97ms step:814/1480 train_time:123803ms step_avg:153.98ms step:815/1480 train_time:123966ms step_avg:153.99ms step:816/1480 train_time:124132ms step_avg:154.01ms step:817/1480 train_time:124295ms step_avg:154.02ms step:818/1480 train_time:124458ms step_avg:154.03ms step:819/1480 train_time:124621ms step_avg:154.04ms step:820/1480 train_time:124784ms step_avg:154.05ms step:821/1480 train_time:124946ms step_avg:154.06ms step:822/1480 train_time:125109ms step_avg:154.08ms step:823/1480 train_time:125270ms step_avg:154.08ms step:824/1480 train_time:125432ms step_avg:154.09ms step:825/1480 train_time:125598ms step_avg:154.11ms step:826/1480 train_time:125764ms step_avg:154.12ms step:827/1480 train_time:125928ms step_avg:154.13ms step:828/1480 train_time:126091ms step_avg:154.14ms step:829/1480 train_time:126254ms step_avg:154.16ms step:830/1480 train_time:126420ms step_avg:154.17ms step:831/1480 train_time:126584ms step_avg:154.18ms step:832/1480 train_time:126747ms step_avg:154.19ms step:833/1480 train_time:126910ms step_avg:154.20ms step:834/1480 train_time:127077ms step_avg:154.22ms step:835/1480 train_time:127240ms step_avg:154.23ms step:836/1480 train_time:127405ms step_avg:154.24ms step:837/1480 train_time:127567ms step_avg:154.25ms step:838/1480 train_time:127731ms step_avg:154.27ms step:839/1480 train_time:127895ms step_avg:154.28ms step:840/1480 
train_time:128057ms step_avg:154.29ms step:841/1480 train_time:128220ms step_avg:154.30ms step:842/1480 train_time:128384ms step_avg:154.31ms step:843/1480 train_time:128546ms step_avg:154.32ms step:844/1480 train_time:128708ms step_avg:154.33ms step:845/1480 train_time:128872ms step_avg:154.34ms step:846/1480 train_time:129038ms step_avg:154.35ms step:847/1480 train_time:129202ms step_avg:154.36ms step:848/1480 train_time:129364ms step_avg:154.37ms step:849/1480 train_time:129526ms step_avg:154.38ms step:850/1480 train_time:129689ms step_avg:154.39ms step:851/1480 train_time:129856ms step_avg:154.41ms step:852/1480 train_time:130019ms step_avg:154.42ms step:853/1480 train_time:130182ms step_avg:154.43ms step:854/1480 train_time:130346ms step_avg:154.44ms step:855/1480 train_time:130508ms step_avg:154.45ms step:856/1480 train_time:130669ms step_avg:154.46ms step:857/1480 train_time:130836ms step_avg:154.47ms step:858/1480 train_time:131001ms step_avg:154.48ms step:859/1480 train_time:131165ms step_avg:154.49ms step:860/1480 train_time:131327ms step_avg:154.50ms step:861/1480 train_time:131492ms step_avg:154.52ms step:862/1480 train_time:131662ms step_avg:154.53ms step:863/1480 train_time:131830ms step_avg:154.55ms step:864/1480 train_time:131993ms step_avg:154.56ms step:865/1480 train_time:132155ms step_avg:154.57ms step:866/1480 train_time:132323ms step_avg:154.58ms step:867/1480 train_time:132486ms step_avg:154.59ms step:868/1480 train_time:132646ms step_avg:154.60ms step:869/1480 train_time:132808ms step_avg:154.61ms step:870/1480 train_time:132973ms step_avg:154.62ms step:871/1480 train_time:133139ms step_avg:154.63ms step:872/1480 train_time:133301ms step_avg:154.64ms step:873/1480 train_time:133464ms step_avg:154.65ms step:874/1480 train_time:133630ms step_avg:154.66ms step:875/1480 train_time:133795ms step_avg:154.68ms step:875/1480 val_loss:3.5044 train_time:133860ms step_avg:154.75ms step:876/1480 train_time:133960ms step_avg:154.69ms step:877/1480 
train_time:134126ms step_avg:154.70ms step:878/1480 train_time:134290ms step_avg:154.71ms step:879/1480 train_time:134454ms step_avg:154.72ms step:880/1480 train_time:134617ms step_avg:154.73ms step:881/1480 train_time:134779ms step_avg:154.74ms step:882/1480 train_time:134943ms step_avg:154.75ms step:883/1480 train_time:135110ms step_avg:154.77ms step:884/1480 train_time:135276ms step_avg:154.78ms step:885/1480 train_time:135440ms step_avg:154.79ms step:886/1480 train_time:135608ms step_avg:154.80ms step:887/1480 train_time:135776ms step_avg:154.82ms step:888/1480 train_time:135949ms step_avg:154.84ms step:889/1480 train_time:136116ms step_avg:154.85ms step:890/1480 train_time:136278ms step_avg:154.86ms step:891/1480 train_time:136444ms step_avg:154.87ms step:892/1480 train_time:136611ms step_avg:154.89ms step:893/1480 train_time:136773ms step_avg:154.90ms step:894/1480 train_time:136940ms step_avg:154.91ms step:895/1480 train_time:137108ms step_avg:154.92ms step:896/1480 train_time:137274ms step_avg:154.94ms step:897/1480 train_time:137443ms step_avg:154.95ms step:898/1480 train_time:137610ms step_avg:154.97ms step:899/1480 train_time:137774ms step_avg:154.98ms step:900/1480 train_time:137937ms step_avg:154.99ms step:901/1480 train_time:138102ms step_avg:155.00ms step:902/1480 train_time:138267ms step_avg:155.01ms step:903/1480 train_time:138441ms step_avg:155.03ms step:904/1480 train_time:138608ms step_avg:155.04ms step:905/1480 train_time:138771ms step_avg:155.05ms step:906/1480 train_time:138938ms step_avg:155.06ms step:907/1480 train_time:139107ms step_avg:155.08ms step:908/1480 train_time:139270ms step_avg:155.09ms step:909/1480 train_time:139434ms step_avg:155.10ms step:910/1480 train_time:139605ms step_avg:155.12ms step:911/1480 train_time:139770ms step_avg:155.13ms step:912/1480 train_time:139935ms step_avg:155.14ms step:913/1480 train_time:140104ms step_avg:155.15ms step:914/1480 train_time:140273ms step_avg:155.17ms step:915/1480 train_time:140444ms 
step_avg:155.19ms step:916/1480 train_time:140608ms step_avg:155.20ms step:917/1480 train_time:140772ms step_avg:155.21ms step:918/1480 train_time:140940ms step_avg:155.22ms step:919/1480 train_time:141112ms step_avg:155.24ms step:920/1480 train_time:141277ms step_avg:155.25ms step:921/1480 train_time:141442ms step_avg:155.26ms step:922/1480 train_time:141610ms step_avg:155.27ms step:923/1480 train_time:141773ms step_avg:155.28ms step:924/1480 train_time:141938ms step_avg:155.29ms step:925/1480 train_time:142103ms step_avg:155.30ms step:926/1480 train_time:142267ms step_avg:155.31ms step:927/1480 train_time:142431ms step_avg:155.32ms step:928/1480 train_time:142596ms step_avg:155.33ms step:929/1480 train_time:142761ms step_avg:155.34ms step:930/1480 train_time:142928ms step_avg:155.36ms step:931/1480 train_time:143092ms step_avg:155.37ms step:932/1480 train_time:143256ms step_avg:155.38ms step:933/1480 train_time:143423ms step_avg:155.39ms step:934/1480 train_time:143590ms step_avg:155.40ms step:935/1480 train_time:143758ms step_avg:155.41ms step:936/1480 train_time:143926ms step_avg:155.43ms step:937/1480 train_time:144095ms step_avg:155.44ms step:938/1480 train_time:144257ms step_avg:155.45ms step:939/1480 train_time:144428ms step_avg:155.47ms step:940/1480 train_time:144595ms step_avg:155.48ms step:941/1480 train_time:144757ms step_avg:155.49ms step:942/1480 train_time:144923ms step_avg:155.50ms step:943/1480 train_time:145094ms step_avg:155.51ms step:944/1480 train_time:145266ms step_avg:155.53ms step:945/1480 train_time:145430ms step_avg:155.54ms step:946/1480 train_time:145598ms step_avg:155.55ms step:947/1480 train_time:145768ms step_avg:155.57ms step:948/1480 train_time:145934ms step_avg:155.58ms step:949/1480 train_time:146099ms step_avg:155.59ms step:950/1480 train_time:146262ms step_avg:155.60ms step:951/1480 train_time:146430ms step_avg:155.61ms step:952/1480 train_time:146594ms step_avg:155.62ms step:953/1480 train_time:146762ms step_avg:155.63ms 
step:954/1480 train_time:146930ms step_avg:155.65ms step:955/1480 train_time:147092ms step_avg:155.65ms step:956/1480 train_time:147256ms step_avg:155.66ms step:957/1480 train_time:147424ms step_avg:155.68ms step:958/1480 train_time:147593ms step_avg:155.69ms step:959/1480 train_time:147756ms step_avg:155.70ms step:960/1480 train_time:147924ms step_avg:155.71ms step:961/1480 train_time:148090ms step_avg:155.72ms step:962/1480 train_time:148253ms step_avg:155.73ms step:963/1480 train_time:148419ms step_avg:155.74ms step:964/1480 train_time:148586ms step_avg:155.75ms step:965/1480 train_time:148750ms step_avg:155.76ms step:966/1480 train_time:148915ms step_avg:155.77ms step:967/1480 train_time:149078ms step_avg:155.78ms step:968/1480 train_time:149243ms step_avg:155.79ms step:969/1480 train_time:149411ms step_avg:155.80ms step:970/1480 train_time:149574ms step_avg:155.81ms step:971/1480 train_time:149737ms step_avg:155.81ms step:972/1480 train_time:149901ms step_avg:155.82ms step:973/1480 train_time:150068ms step_avg:155.83ms step:974/1480 train_time:150236ms step_avg:155.85ms step:975/1480 train_time:150402ms step_avg:155.86ms step:976/1480 train_time:150569ms step_avg:155.87ms step:977/1480 train_time:150734ms step_avg:155.88ms step:978/1480 train_time:150898ms step_avg:155.89ms step:979/1480 train_time:151064ms step_avg:155.90ms step:980/1480 train_time:151230ms step_avg:155.91ms step:981/1480 train_time:151398ms step_avg:155.92ms step:982/1480 train_time:151561ms step_avg:155.93ms step:983/1480 train_time:151726ms step_avg:155.94ms step:984/1480 train_time:151890ms step_avg:155.94ms step:985/1480 train_time:152056ms step_avg:155.95ms step:986/1480 train_time:152221ms step_avg:155.96ms step:987/1480 train_time:152387ms step_avg:155.97ms step:988/1480 train_time:152553ms step_avg:155.98ms step:989/1480 train_time:152717ms step_avg:155.99ms step:990/1480 train_time:152888ms step_avg:156.01ms step:991/1480 train_time:153055ms step_avg:156.02ms step:992/1480 
train_time:153230ms step_avg:156.04ms step:993/1480 train_time:153409ms step_avg:156.06ms step:994/1480 train_time:153575ms step_avg:156.07ms step:995/1480 train_time:153738ms step_avg:156.08ms step:996/1480 train_time:153900ms step_avg:156.09ms step:997/1480 train_time:154065ms step_avg:156.09ms step:998/1480 train_time:154230ms step_avg:156.10ms step:999/1480 train_time:154395ms step_avg:156.11ms step:1000/1480 train_time:154564ms step_avg:156.13ms step:1000/1480 val_loss:3.4399 train_time:154633ms step_avg:156.19ms step:1001/1480 train_time:154733ms step_avg:156.14ms step:1002/1480 train_time:154900ms step_avg:156.15ms step:1003/1480 train_time:155072ms step_avg:156.16ms step:1004/1480 train_time:155241ms step_avg:156.18ms step:1005/1480 train_time:155410ms step_avg:156.19ms step:1006/1480 train_time:155578ms step_avg:156.20ms step:1007/1480 train_time:155745ms step_avg:156.21ms step:1008/1480 train_time:155912ms step_avg:156.22ms step:1009/1480 train_time:156086ms step_avg:156.24ms step:1010/1480 train_time:156252ms step_avg:156.25ms step:1011/1480 train_time:156418ms step_avg:156.26ms step:1012/1480 train_time:156584ms step_avg:156.27ms step:1013/1480 train_time:156754ms step_avg:156.29ms step:1014/1480 train_time:156921ms step_avg:156.30ms step:1015/1480 train_time:157090ms step_avg:156.31ms step:1016/1480 train_time:157260ms step_avg:156.32ms step:1017/1480 train_time:157433ms step_avg:156.34ms step:1018/1480 train_time:157601ms step_avg:156.35ms step:1019/1480 train_time:157769ms step_avg:156.36ms step:1020/1480 train_time:157938ms step_avg:156.37ms step:1021/1480 train_time:158104ms step_avg:156.38ms step:1022/1480 train_time:158271ms step_avg:156.39ms step:1023/1480 train_time:158438ms step_avg:156.40ms step:1024/1480 train_time:158605ms step_avg:156.42ms step:1025/1480 train_time:158777ms step_avg:156.43ms step:1026/1480 train_time:158943ms step_avg:156.44ms step:1027/1480 train_time:159109ms step_avg:156.45ms step:1028/1480 train_time:159280ms 
step_avg:156.46ms step:1029/1480 train_time:159454ms step_avg:156.48ms step:1030/1480 train_time:159622ms step_avg:156.49ms step:1031/1480 train_time:159786ms step_avg:156.50ms step:1032/1480 train_time:159956ms step_avg:156.51ms step:1033/1480 train_time:160122ms step_avg:156.52ms step:1034/1480 train_time:160291ms step_avg:156.53ms step:1035/1480 train_time:160458ms step_avg:156.54ms step:1036/1480 train_time:160624ms step_avg:156.55ms step:1037/1480 train_time:160790ms step_avg:156.56ms step:1038/1480 train_time:160957ms step_avg:156.57ms step:1039/1480 train_time:161128ms step_avg:156.59ms step:1040/1480 train_time:161293ms step_avg:156.60ms step:1041/1480 train_time:161460ms step_avg:156.61ms step:1042/1480 train_time:161625ms step_avg:156.61ms step:1043/1480 train_time:161790ms step_avg:156.62ms step:1044/1480 train_time:161954ms step_avg:156.63ms step:1045/1480 train_time:162125ms step_avg:156.64ms step:1046/1480 train_time:162292ms step_avg:156.65ms step:1047/1480 train_time:162458ms step_avg:156.66ms step:1048/1480 train_time:162626ms step_avg:156.67ms step:1049/1480 train_time:162792ms step_avg:156.68ms step:1050/1480 train_time:162962ms step_avg:156.69ms step:1051/1480 train_time:163132ms step_avg:156.71ms step:1052/1480 train_time:163300ms step_avg:156.72ms step:1053/1480 train_time:163466ms step_avg:156.73ms step:1054/1480 train_time:163634ms step_avg:156.74ms step:1055/1480 train_time:163800ms step_avg:156.75ms step:1056/1480 train_time:163967ms step_avg:156.76ms step:1057/1480 train_time:164133ms step_avg:156.77ms step:1058/1480 train_time:164304ms step_avg:156.78ms step:1059/1480 train_time:164476ms step_avg:156.79ms step:1060/1480 train_time:164646ms step_avg:156.81ms step:1061/1480 train_time:164809ms step_avg:156.81ms step:1062/1480 train_time:164974ms step_avg:156.82ms step:1063/1480 train_time:165139ms step_avg:156.83ms step:1064/1480 train_time:165305ms step_avg:156.84ms step:1065/1480 train_time:165473ms step_avg:156.85ms step:1066/1480 
train_time:165642ms step_avg:156.86ms step:1067/1480 train_time:165811ms step_avg:156.87ms step:1068/1480 train_time:165976ms step_avg:156.88ms step:1069/1480 train_time:166149ms step_avg:156.89ms step:1070/1480 train_time:166315ms step_avg:156.90ms step:1071/1480 train_time:166489ms step_avg:156.92ms step:1072/1480 train_time:166655ms step_avg:156.93ms step:1073/1480 train_time:166819ms step_avg:156.93ms step:1074/1480 train_time:166988ms step_avg:156.94ms step:1075/1480 train_time:167157ms step_avg:156.95ms step:1076/1480 train_time:167324ms step_avg:156.96ms step:1077/1480 train_time:167490ms step_avg:156.97ms step:1078/1480 train_time:167665ms step_avg:156.99ms step:1079/1480 train_time:167838ms step_avg:157.00ms step:1080/1480 train_time:168009ms step_avg:157.02ms step:1081/1480 train_time:168175ms step_avg:157.03ms step:1082/1480 train_time:168341ms step_avg:157.03ms step:1083/1480 train_time:168508ms step_avg:157.04ms step:1084/1480 train_time:168674ms step_avg:157.05ms step:1085/1480 train_time:168845ms step_avg:157.07ms step:1086/1480 train_time:169013ms step_avg:157.08ms step:1087/1480 train_time:169178ms step_avg:157.08ms step:1088/1480 train_time:169349ms step_avg:157.10ms step:1089/1480 train_time:169521ms step_avg:157.11ms step:1090/1480 train_time:169692ms step_avg:157.12ms step:1091/1480 train_time:169861ms step_avg:157.13ms step:1092/1480 train_time:170029ms step_avg:157.14ms step:1093/1480 train_time:170197ms step_avg:157.15ms step:1094/1480 train_time:170364ms step_avg:157.16ms step:1095/1480 train_time:170528ms step_avg:157.17ms step:1096/1480 train_time:170696ms step_avg:157.18ms step:1097/1480 train_time:170865ms step_avg:157.19ms step:1098/1480 train_time:171035ms step_avg:157.20ms step:1099/1480 train_time:171205ms step_avg:157.21ms step:1100/1480 train_time:171376ms step_avg:157.23ms step:1101/1480 train_time:171547ms step_avg:157.24ms step:1102/1480 train_time:171718ms step_avg:157.25ms step:1103/1480 train_time:171893ms step_avg:157.27ms 
step:1104/1480 train_time:172060ms step_avg:157.28ms step:1105/1480 train_time:172230ms step_avg:157.29ms step:1106/1480 train_time:172399ms step_avg:157.30ms step:1107/1480 train_time:172570ms step_avg:157.31ms step:1108/1480 train_time:172735ms step_avg:157.32ms step:1109/1480 train_time:172902ms step_avg:157.33ms step:1110/1480 train_time:173068ms step_avg:157.33ms step:1111/1480 train_time:173235ms step_avg:157.34ms step:1112/1480 train_time:173406ms step_avg:157.36ms step:1113/1480 train_time:173585ms step_avg:157.38ms step:1114/1480 train_time:173757ms step_avg:157.39ms step:1115/1480 train_time:173930ms step_avg:157.40ms step:1116/1480 train_time:174097ms step_avg:157.41ms step:1117/1480 train_time:174269ms step_avg:157.42ms step:1118/1480 train_time:174444ms step_avg:157.44ms step:1119/1480 train_time:174610ms step_avg:157.45ms step:1120/1480 train_time:174777ms step_avg:157.46ms step:1121/1480 train_time:174949ms step_avg:157.47ms step:1122/1480 train_time:175114ms step_avg:157.48ms step:1123/1480 train_time:175282ms step_avg:157.49ms step:1124/1480 train_time:175449ms step_avg:157.49ms step:1125/1480 train_time:175616ms step_avg:157.50ms step:1125/1480 val_loss:3.3848 train_time:175684ms step_avg:157.56ms step:1126/1480 train_time:175786ms step_avg:157.51ms step:1127/1480 train_time:175957ms step_avg:157.53ms step:1128/1480 train_time:176127ms step_avg:157.54ms step:1129/1480 train_time:176302ms step_avg:157.55ms step:1130/1480 train_time:176471ms step_avg:157.56ms step:1131/1480 train_time:176647ms step_avg:157.58ms step:1132/1480 train_time:176813ms step_avg:157.59ms step:1133/1480 train_time:176986ms step_avg:157.60ms step:1134/1480 train_time:177157ms step_avg:157.61ms step:1135/1480 train_time:177323ms step_avg:157.62ms step:1136/1480 train_time:177495ms step_avg:157.63ms step:1137/1480 train_time:177665ms step_avg:157.64ms step:1138/1480 train_time:177838ms step_avg:157.66ms step:1139/1480 train_time:178005ms step_avg:157.67ms step:1140/1480 
train_time:178174ms step_avg:157.68ms step:1141/1480 train_time:178345ms step_avg:157.69ms step:1142/1480 train_time:178512ms step_avg:157.70ms step:1143/1480 train_time:178680ms step_avg:157.71ms step:1144/1480 train_time:178850ms step_avg:157.72ms step:1145/1480 train_time:179015ms step_avg:157.72ms step:1146/1480 train_time:179184ms step_avg:157.73ms step:1147/1480 train_time:179354ms step_avg:157.74ms step:1148/1480 train_time:179521ms step_avg:157.75ms step:1149/1480 train_time:179693ms step_avg:157.76ms step:1150/1480 train_time:179861ms step_avg:157.77ms step:1151/1480 train_time:180034ms step_avg:157.79ms step:1152/1480 train_time:180205ms step_avg:157.80ms step:1153/1480 train_time:180379ms step_avg:157.81ms step:1154/1480 train_time:180546ms step_avg:157.82ms step:1155/1480 train_time:180719ms step_avg:157.83ms step:1156/1480 train_time:180899ms step_avg:157.85ms step:1157/1480 train_time:181068ms step_avg:157.86ms step:1158/1480 train_time:181235ms step_avg:157.87ms step:1159/1480 train_time:181401ms step_avg:157.88ms step:1160/1480 train_time:181567ms step_avg:157.88ms step:1161/1480 train_time:181739ms step_avg:157.90ms step:1162/1480 train_time:181908ms step_avg:157.91ms step:1163/1480 train_time:182079ms step_avg:157.92ms step:1164/1480 train_time:182248ms step_avg:157.93ms step:1165/1480 train_time:182413ms step_avg:157.93ms step:1166/1480 train_time:182582ms step_avg:157.94ms step:1167/1480 train_time:182749ms step_avg:157.95ms step:1168/1480 train_time:182917ms step_avg:157.96ms step:1169/1480 train_time:183085ms step_avg:157.97ms step:1170/1480 train_time:183254ms step_avg:157.98ms step:1171/1480 train_time:183420ms step_avg:157.98ms step:1172/1480 train_time:183587ms step_avg:157.99ms step:1173/1480 train_time:183760ms step_avg:158.01ms step:1174/1480 train_time:183940ms step_avg:158.02ms step:1175/1480 train_time:184111ms step_avg:158.03ms step:1176/1480 train_time:184282ms step_avg:158.05ms step:1177/1480 train_time:184459ms step_avg:158.06ms 
step:1178/1480 train_time:184625ms step_avg:158.07ms step:1179/1480 train_time:184790ms step_avg:158.08ms step:1180/1480 train_time:184970ms step_avg:158.09ms step:1181/1480 train_time:185140ms step_avg:158.10ms step:1182/1480 train_time:185307ms step_avg:158.11ms step:1183/1480 train_time:185479ms step_avg:158.12ms step:1184/1480 train_time:185645ms step_avg:158.13ms step:1185/1480 train_time:185819ms step_avg:158.14ms step:1186/1480 train_time:185990ms step_avg:158.15ms step:1187/1480 train_time:186175ms step_avg:158.18ms step:1188/1480 train_time:186341ms step_avg:158.18ms step:1189/1480 train_time:186514ms step_avg:158.20ms step:1190/1480 train_time:186681ms step_avg:158.20ms step:1191/1480 train_time:186853ms step_avg:158.22ms step:1192/1480 train_time:187019ms step_avg:158.22ms step:1193/1480 train_time:187186ms step_avg:158.23ms step:1194/1480 train_time:187354ms step_avg:158.24ms step:1195/1480 train_time:187528ms step_avg:158.25ms step:1196/1480 train_time:187711ms step_avg:158.27ms step:1197/1480 train_time:187883ms step_avg:158.28ms step:1198/1480 train_time:188062ms step_avg:158.30ms step:1199/1480 train_time:188232ms step_avg:158.31ms step:1200/1480 train_time:188400ms step_avg:158.32ms step:1201/1480 train_time:188569ms step_avg:158.33ms step:1202/1480 train_time:188750ms step_avg:158.35ms step:1203/1480 train_time:188925ms step_avg:158.36ms step:1204/1480 train_time:189099ms step_avg:158.37ms step:1205/1480 train_time:189267ms step_avg:158.38ms step:1206/1480 train_time:189435ms step_avg:158.39ms step:1207/1480 train_time:189604ms step_avg:158.40ms step:1208/1480 train_time:189772ms step_avg:158.41ms step:1209/1480 train_time:189946ms step_avg:158.42ms step:1210/1480 train_time:190121ms step_avg:158.43ms step:1211/1480 train_time:190295ms step_avg:158.45ms step:1212/1480 train_time:190467ms step_avg:158.46ms step:1213/1480 train_time:190640ms step_avg:158.47ms step:1214/1480 train_time:190818ms step_avg:158.49ms step:1215/1480 train_time:190991ms 
step_avg:158.50ms step:1216/1480 train_time:191160ms step_avg:158.51ms step:1217/1480 train_time:191333ms step_avg:158.52ms step:1218/1480 train_time:191503ms step_avg:158.53ms step:1219/1480 train_time:191682ms step_avg:158.55ms step:1220/1480 train_time:191851ms step_avg:158.55ms step:1221/1480 train_time:192019ms step_avg:158.56ms step:1222/1480 train_time:192187ms step_avg:158.57ms step:1223/1480 train_time:192358ms step_avg:158.58ms step:1224/1480 train_time:192536ms step_avg:158.60ms step:1225/1480 train_time:192708ms step_avg:158.61ms step:1226/1480 train_time:192882ms step_avg:158.62ms step:1227/1480 train_time:193055ms step_avg:158.63ms step:1228/1480 train_time:193225ms step_avg:158.64ms step:1229/1480 train_time:193398ms step_avg:158.65ms step:1230/1480 train_time:193579ms step_avg:158.67ms step:1231/1480 train_time:193754ms step_avg:158.68ms step:1232/1480 train_time:193928ms step_avg:158.70ms step:1233/1480 train_time:194098ms step_avg:158.71ms step:1234/1480 train_time:194267ms step_avg:158.71ms step:1235/1480 train_time:194441ms step_avg:158.73ms step:1236/1480 train_time:194608ms step_avg:158.73ms step:1237/1480 train_time:194780ms step_avg:158.74ms step:1238/1480 train_time:194963ms step_avg:158.76ms step:1239/1480 train_time:195133ms step_avg:158.77ms step:1240/1480 train_time:195303ms step_avg:158.78ms step:1241/1480 train_time:195478ms step_avg:158.80ms step:1242/1480 train_time:195647ms step_avg:158.80ms step:1243/1480 train_time:195822ms step_avg:158.82ms step:1244/1480 train_time:195989ms step_avg:158.82ms step:1245/1480 train_time:196160ms step_avg:158.83ms step:1246/1480 train_time:196328ms step_avg:158.84ms step:1247/1480 train_time:196498ms step_avg:158.85ms step:1248/1480 train_time:196667ms step_avg:158.86ms step:1249/1480 train_time:196835ms step_avg:158.87ms step:1250/1480 train_time:197004ms step_avg:158.87ms step:1250/1480 val_loss:3.3356 train_time:197075ms step_avg:158.93ms step:1251/1480 train_time:197184ms step_avg:158.89ms 
step:1252/1480 train_time:197354ms step_avg:158.90ms step:1253/1480 train_time:197522ms step_avg:158.91ms step:1254/1480 train_time:197694ms step_avg:158.92ms step:1255/1480 train_time:197879ms step_avg:158.94ms step:1256/1480 train_time:198053ms step_avg:158.95ms step:1257/1480 train_time:198223ms step_avg:158.96ms step:1258/1480 train_time:198400ms step_avg:158.97ms step:1259/1480 train_time:198573ms step_avg:158.99ms step:1260/1480 train_time:198739ms step_avg:158.99ms step:1261/1480 train_time:198911ms step_avg:159.00ms step:1262/1480 train_time:199086ms step_avg:159.01ms step:1263/1480 train_time:199259ms step_avg:159.03ms step:1264/1480 train_time:199426ms step_avg:159.03ms step:1265/1480 train_time:199594ms step_avg:159.04ms step:1266/1480 train_time:199764ms step_avg:159.05ms step:1267/1480 train_time:199935ms step_avg:159.06ms step:1268/1480 train_time:200104ms step_avg:159.07ms step:1269/1480 train_time:200281ms step_avg:159.08ms step:1270/1480 train_time:200451ms step_avg:159.09ms step:1271/1480 train_time:200620ms step_avg:159.10ms step:1272/1480 train_time:200787ms step_avg:159.10ms step:1273/1480 train_time:200958ms step_avg:159.11ms step:1274/1480 train_time:201129ms step_avg:159.12ms step:1275/1480 train_time:201299ms step_avg:159.13ms step:1276/1480 train_time:201463ms step_avg:159.13ms step:1277/1480 train_time:201635ms step_avg:159.14ms step:1278/1480 train_time:201803ms step_avg:159.15ms step:1279/1480 train_time:201976ms step_avg:159.16ms step:1280/1480 train_time:202154ms step_avg:159.18ms step:1281/1480 train_time:202322ms step_avg:159.18ms step:1282/1480 train_time:202487ms step_avg:159.19ms step:1283/1480 train_time:202658ms step_avg:159.20ms step:1284/1480 train_time:202826ms step_avg:159.20ms step:1285/1480 train_time:202996ms step_avg:159.21ms step:1286/1480 train_time:203164ms step_avg:159.22ms step:1287/1480 train_time:203335ms step_avg:159.23ms step:1288/1480 train_time:203506ms step_avg:159.24ms step:1289/1480 train_time:203690ms 
step_avg:159.26ms step:1290/1480 train_time:203868ms step_avg:159.27ms step:1291/1480 train_time:204040ms step_avg:159.28ms step:1292/1480 train_time:204215ms step_avg:159.29ms step:1293/1480 train_time:204389ms step_avg:159.31ms step:1294/1480 train_time:204561ms step_avg:159.32ms step:1295/1480 train_time:204734ms step_avg:159.33ms step:1296/1480 train_time:204906ms step_avg:159.34ms step:1297/1480 train_time:205079ms step_avg:159.35ms step:1298/1480 train_time:205250ms step_avg:159.36ms step:1299/1480 train_time:205423ms step_avg:159.37ms step:1300/1480 train_time:205591ms step_avg:159.37ms step:1301/1480 train_time:205760ms step_avg:159.38ms step:1302/1480 train_time:205936ms step_avg:159.39ms step:1303/1480 train_time:206112ms step_avg:159.41ms step:1304/1480 train_time:206286ms step_avg:159.42ms step:1305/1480 train_time:206456ms step_avg:159.43ms step:1306/1480 train_time:206629ms step_avg:159.44ms step:1307/1480 train_time:206797ms step_avg:159.44ms step:1308/1480 train_time:206966ms step_avg:159.45ms step:1309/1480 train_time:207138ms step_avg:159.46ms step:1310/1480 train_time:207307ms step_avg:159.47ms step:1311/1480 train_time:207475ms step_avg:159.47ms step:1312/1480 train_time:207645ms step_avg:159.48ms step:1313/1480 train_time:207815ms step_avg:159.49ms step:1314/1480 train_time:207986ms step_avg:159.50ms step:1315/1480 train_time:208158ms step_avg:159.51ms step:1316/1480 train_time:208323ms step_avg:159.51ms step:1317/1480 train_time:208496ms step_avg:159.52ms step:1318/1480 train_time:208676ms step_avg:159.54ms step:1319/1480 train_time:208853ms step_avg:159.55ms step:1320/1480 train_time:209028ms step_avg:159.56ms step:1321/1480 train_time:209201ms step_avg:159.57ms step:1322/1480 train_time:209380ms step_avg:159.59ms step:1323/1480 train_time:209554ms step_avg:159.60ms step:1324/1480 train_time:209730ms step_avg:159.61ms step:1325/1480 train_time:209910ms step_avg:159.63ms step:1326/1480 train_time:210085ms step_avg:159.64ms step:1327/1480 
train_time:210255ms step_avg:159.65ms step:1328/1480 train_time:210424ms step_avg:159.65ms step:1329/1480 train_time:210621ms step_avg:159.68ms step:1330/1480 train_time:210801ms step_avg:159.70ms step:1331/1480 train_time:210972ms step_avg:159.71ms step:1332/1480 train_time:211146ms step_avg:159.72ms step:1333/1480 train_time:211321ms step_avg:159.73ms step:1334/1480 train_time:211493ms step_avg:159.74ms step:1335/1480 train_time:211662ms step_avg:159.74ms step:1336/1480 train_time:211845ms step_avg:159.76ms step:1337/1480 train_time:212022ms step_avg:159.78ms step:1338/1480 train_time:212195ms step_avg:159.79ms step:1339/1480 train_time:212368ms step_avg:159.80ms step:1340/1480 train_time:212540ms step_avg:159.80ms step:1341/1480 train_time:212708ms step_avg:159.81ms step:1342/1480 train_time:212881ms step_avg:159.82ms step:1343/1480 train_time:213051ms step_avg:159.83ms step:1344/1480 train_time:213223ms step_avg:159.84ms step:1345/1480 train_time:213402ms step_avg:159.85ms step:1346/1480 train_time:213571ms step_avg:159.86ms step:1347/1480 train_time:213740ms step_avg:159.87ms step:1348/1480 train_time:213910ms step_avg:159.87ms step:1349/1480 train_time:214080ms step_avg:159.88ms step:1350/1480 train_time:214256ms step_avg:159.89ms step:1351/1480 train_time:214428ms step_avg:159.90ms step:1352/1480 train_time:214600ms step_avg:159.91ms step:1353/1480 train_time:214777ms step_avg:159.92ms step:1354/1480 train_time:214947ms step_avg:159.93ms step:1355/1480 train_time:215114ms step_avg:159.94ms step:1356/1480 train_time:215287ms step_avg:159.95ms step:1357/1480 train_time:215460ms step_avg:159.96ms step:1358/1480 train_time:215634ms step_avg:159.97ms step:1359/1480 train_time:215805ms step_avg:159.97ms step:1360/1480 train_time:215979ms step_avg:159.98ms step:1361/1480 train_time:216155ms step_avg:160.00ms step:1362/1480 train_time:216330ms step_avg:160.01ms step:1363/1480 train_time:216510ms step_avg:160.02ms step:1364/1480 train_time:216679ms step_avg:160.03ms 
step:1365/1480 train_time:216845ms step_avg:160.03ms step:1366/1480 train_time:217016ms step_avg:160.04ms step:1367/1480 train_time:217187ms step_avg:160.05ms step:1368/1480 train_time:217360ms step_avg:160.06ms step:1369/1480 train_time:217542ms step_avg:160.08ms step:1370/1480 train_time:217719ms step_avg:160.09ms step:1371/1480 train_time:217891ms step_avg:160.10ms step:1372/1480 train_time:218067ms step_avg:160.11ms step:1373/1480 train_time:218237ms step_avg:160.11ms step:1374/1480 train_time:218412ms step_avg:160.13ms step:1375/1480 train_time:218583ms step_avg:160.13ms step:1375/1480 val_loss:3.2967 train_time:218650ms step_avg:160.18ms step:1376/1480 train_time:218756ms step_avg:160.14ms step:1377/1480 train_time:218928ms step_avg:160.15ms step:1378/1480 train_time:219098ms step_avg:160.16ms step:1379/1480 train_time:219274ms step_avg:160.17ms step:1380/1480 train_time:219448ms step_avg:160.18ms step:1381/1480 train_time:219631ms step_avg:160.20ms step:1382/1480 train_time:219802ms step_avg:160.21ms step:1383/1480 train_time:219975ms step_avg:160.22ms step:1384/1480 train_time:220153ms step_avg:160.23ms step:1385/1480 train_time:220318ms step_avg:160.23ms step:1386/1480 train_time:220489ms step_avg:160.24ms step:1387/1480 train_time:220660ms step_avg:160.25ms step:1388/1480 train_time:220828ms step_avg:160.25ms step:1389/1480 train_time:221001ms step_avg:160.26ms step:1390/1480 train_time:221171ms step_avg:160.27ms step:1391/1480 train_time:221342ms step_avg:160.28ms step:1392/1480 train_time:221515ms step_avg:160.29ms step:1393/1480 train_time:221683ms step_avg:160.29ms step:1394/1480 train_time:221853ms step_avg:160.30ms step:1395/1480 train_time:222022ms step_avg:160.30ms step:1396/1480 train_time:222191ms step_avg:160.31ms step:1397/1480 train_time:222359ms step_avg:160.32ms step:1398/1480 train_time:222527ms step_avg:160.32ms step:1399/1480 train_time:222696ms step_avg:160.33ms step:1400/1480 train_time:222874ms step_avg:160.34ms step:1401/1480 
train_time:223039ms step_avg:160.34ms step:1402/1480 train_time:223211ms step_avg:160.35ms step:1403/1480 train_time:223386ms step_avg:160.36ms step:1404/1480 train_time:223555ms step_avg:160.37ms step:1405/1480 train_time:223730ms step_avg:160.38ms step:1406/1480 train_time:223907ms step_avg:160.39ms step:1407/1480 train_time:224075ms step_avg:160.40ms step:1408/1480 train_time:224243ms step_avg:160.40ms step:1409/1480 train_time:224426ms step_avg:160.42ms step:1410/1480 train_time:224596ms step_avg:160.43ms step:1411/1480 train_time:224763ms step_avg:160.43ms step:1412/1480 train_time:224933ms step_avg:160.44ms step:1413/1480 train_time:225104ms step_avg:160.44ms step:1414/1480 train_time:225275ms step_avg:160.45ms step:1415/1480 train_time:225449ms step_avg:160.46ms step:1416/1480 train_time:225635ms step_avg:160.48ms step:1417/1480 train_time:225810ms step_avg:160.49ms step:1418/1480 train_time:225981ms step_avg:160.50ms step:1419/1480 train_time:226155ms step_avg:160.51ms step:1420/1480 train_time:226328ms step_avg:160.52ms step:1421/1480 train_time:226501ms step_avg:160.53ms step:1422/1480 train_time:226675ms step_avg:160.53ms step:1423/1480 train_time:226843ms step_avg:160.54ms step:1424/1480 train_time:227020ms step_avg:160.55ms step:1425/1480 train_time:227201ms step_avg:160.57ms step:1426/1480 train_time:227373ms step_avg:160.57ms step:1427/1480 train_time:227550ms step_avg:160.59ms step:1428/1480 train_time:227720ms step_avg:160.59ms step:1429/1480 train_time:227889ms step_avg:160.60ms step:1430/1480 train_time:228062ms step_avg:160.61ms step:1431/1480 train_time:228239ms step_avg:160.62ms step:1432/1480 train_time:228416ms step_avg:160.63ms step:1433/1480 train_time:228595ms step_avg:160.64ms step:1434/1480 train_time:228775ms step_avg:160.66ms step:1435/1480 train_time:228950ms step_avg:160.67ms step:1436/1480 train_time:229124ms step_avg:160.68ms step:1437/1480 train_time:229295ms step_avg:160.68ms step:1438/1480 train_time:229463ms step_avg:160.69ms 
step:1439/1480 train_time:229637ms step_avg:160.70ms step:1440/1480 train_time:229806ms step_avg:160.70ms step:1441/1480 train_time:229976ms step_avg:160.71ms step:1442/1480 train_time:230154ms step_avg:160.72ms step:1443/1480 train_time:230343ms step_avg:160.74ms step:1444/1480 train_time:230514ms step_avg:160.75ms step:1445/1480 train_time:230684ms step_avg:160.76ms step:1446/1480 train_time:230859ms step_avg:160.77ms step:1447/1480 train_time:231037ms step_avg:160.78ms step:1448/1480 train_time:231209ms step_avg:160.78ms step:1449/1480 train_time:231382ms step_avg:160.79ms step:1450/1480 train_time:231556ms step_avg:160.80ms step:1451/1480 train_time:231728ms step_avg:160.81ms step:1452/1480 train_time:231900ms step_avg:160.82ms step:1453/1480 train_time:232070ms step_avg:160.82ms step:1454/1480 train_time:232244ms step_avg:160.83ms step:1455/1480 train_time:232424ms step_avg:160.85ms step:1456/1480 train_time:232597ms step_avg:160.86ms step:1457/1480 train_time:232767ms step_avg:160.86ms step:1458/1480 train_time:232937ms step_avg:160.87ms step:1459/1480 train_time:233115ms step_avg:160.88ms step:1460/1480 train_time:233285ms step_avg:160.89ms step:1461/1480 train_time:233459ms step_avg:160.90ms step:1462/1480 train_time:233631ms step_avg:160.90ms step:1463/1480 train_time:233807ms step_avg:160.91ms step:1464/1480 train_time:233981ms step_avg:160.92ms step:1465/1480 train_time:234154ms step_avg:160.93ms step:1466/1480 train_time:234323ms step_avg:160.94ms step:1467/1480 train_time:234498ms step_avg:160.95ms step:1468/1480 train_time:234670ms step_avg:160.95ms step:1469/1480 train_time:234842ms step_avg:160.96ms step:1470/1480 train_time:235022ms step_avg:160.97ms step:1471/1480 train_time:235208ms step_avg:160.99ms step:1472/1480 train_time:235387ms step_avg:161.00ms step:1473/1480 train_time:235558ms step_avg:161.01ms step:1474/1480 train_time:235736ms step_avg:161.02ms step:1475/1480 train_time:235918ms step_avg:161.04ms step:1476/1480 train_time:236090ms 
step_avg:161.04ms step:1477/1480 train_time:236274ms step_avg:161.06ms step:1478/1480 train_time:236457ms step_avg:161.07ms step:1479/1480 train_time:236632ms step_avg:161.08ms step:1480/1480 train_time:236807ms step_avg:161.09ms step:1480/1480 val_loss:3.2778 train_time:236879ms step_avg:161.14ms