import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any
      {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5):
        self.num_process = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ["RANK"])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        params: "list[torch.Tensor]" = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                "params": [p for p in params if p.numel() == size],
                "update_buffer": [
                    torch.empty(size, device="cuda", dtype=torch.bfloat16)
                    for _ in range(self.num_process)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        for group in self.param_groups:
            lr: float = group["lr"]
            momentum: float = group["momentum"]
            nesterov: bool = group["nesterov"]
            zeropower_backend = zeropower_backends[group["backend"]]
            backend_steps: int = group["backend_steps"]
            update_buffers: "list[torch.Tensor]" = group["update_buffer"]
            # generate weight updates in distributed fashion
            params: "list[torch.Tensor]" = group["params"]
            assert len(params) % self.num_process == 0
            handle = None
            params_world = None
            def update_prev():
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.num_process]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if "momentum_buffer" not in state:
                    state["momentum_buffer"] = torch.zeros_like(g)
                buf: torch.Tensor = state["momentum_buffer"]
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_backend(g, steps=backend_steps).flatten()
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.num_process]
            update_prev()
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)
    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x
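# Illustrative sketch (hypothetical helper, not part of the training run): Rotary rotates each
# (x1, x2) channel pair by a position-dependent angle, so it changes directions but preserves
# per-position vector norms (e.g. the unit RMS produced by the QK norm above is kept intact).
def _rotary_preserves_norms(T: int = 16, n_head: int = 2, head_dim: int = 8) -> bool:
    rotary = Rotary(head_dim)
    x = torch.randn(1, T, n_head, head_dim)
    y = rotary(x)
    # rotation should leave the last-dim norms numerically unchanged
    return torch.allclose(x.norm(dim=-1), y.norm(dim=-1), atol=1e-5)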
# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        docs = (idx == 50256).cumsum(0)
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)

        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)

        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss
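# Illustrative sketch (hypothetical helper, toy scale, not part of the training run): the BlockMask
# built in GPT.forward combines three conditions per (query, key) pair: causality, same-document
# membership (documents are delimited by the GPT-2 end-of-text token 50256), and a sliding window.
# A dense token-level version of the same rule looks like this:
def _toy_dense_attention_mask(idx: torch.Tensor, sliding_window: int) -> torch.Tensor:
    docs = (idx == 50256).cumsum(0)
    q_idx = torch.arange(len(idx))[:, None]
    kv_idx = torch.arange(len(idx))[None, :]
    causal = q_idx >= kv_idx
    same_doc = docs[q_idx] == docs[kv_idx]
    window = q_idx - kv_idx < sliding_window
    return causal & same_doc & window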
# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y
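# Illustrative sketch (hypothetical helper, not part of the training run): the loader above expects
# .bin shards laid out as a 256-entry int32 header (magic 20240520, version 1, claimed token count)
# followed by uint16 token ids. A minimal writer for a compatible shard, e.g. for smoke tests:
def _write_data_shard(path: Path, tokens: torch.Tensor) -> None:
    assert tokens.dtype == torch.uint16
    header = torch.zeros(256, dtype=torch.int32)
    header[0] = 20240520    # magic number checked by _peek_data_shard
    header[1] = 1           # version checked by _peek_data_shard
    header[2] = len(tokens) # claimed token count
    with path.open("wb") as f:
        f.write(header.numpy().tobytes())
        f.write(tokens.numpy().tobytes())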
# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
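# (Illustrative note, not in the original script: rounding 50257 up to the next multiple of 128
#  gives ((50257 + 127) // 128) * 128 = 393 * 128 = 50304, the padded vocab size used below.)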
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]

# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
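# Illustrative sketch (hypothetical helper, not part of the training run): per 2D parameter, a
# single-rank Muon step (optimizer3 above) reduces to SGD-momentum, Newton-Schulz orthogonalization
# of the update (whose singular values land roughly in [0.5, 1.5], per the zeropower_via_newtonschulz5
# docstring), and a learning rate scaled by the parameter's aspect ratio.
def _muon_single_rank_step(p: torch.Tensor, buf: torch.Tensor, lr: float = 0.05,
                           momentum: float = 0.95, nesterov: bool = True) -> None:
    g = p.grad
    buf.lerp_(g, 1 - momentum)                      # buf = momentum * buf + (1 - momentum) * g
    g = g.lerp(buf, momentum) if nesterov else buf  # Nesterov-style lookahead
    g = zeropower_via_newtonschulz5(g, steps=5)     # replace the update with a near-orthogonal matrix
    p.data.add_(g, alpha=-lr * max(1, p.size(0) / p.size(1)) ** 0.5)  # tall matrices get a larger step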
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.

    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 13:02:03 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M.
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 129W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 99W / 700W | 29MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23092ms step_avg:nanms step:2/1480 train_time:23206ms step_avg:nanms step:3/1480 train_time:23345ms step_avg:nanms step:4/1480 train_time:23487ms step_avg:nanms step:5/1480 train_time:23627ms step_avg:nanms step:6/1480 train_time:23768ms step_avg:nanms step:7/1480 train_time:23908ms step_avg:nanms step:8/1480 train_time:24051ms step_avg:nanms step:9/1480 train_time:24199ms step_avg:nanms step:10/1480 train_time:24343ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.73ms step:14/1480 train_time:566ms step_avg:141.43ms step:15/1480 train_time:709ms step_avg:141.86ms step:16/1480 train_time:854ms step_avg:142.34ms step:17/1480 train_time:999ms step_avg:142.72ms step:18/1480 train_time:1142ms step_avg:142.75ms step:19/1480 train_time:1285ms step_avg:142.78ms step:20/1480 
train_time:1427ms step_avg:142.67ms step:21/1480 train_time:1567ms step_avg:142.45ms step:22/1480 train_time:1709ms step_avg:142.43ms step:23/1480 train_time:1853ms step_avg:142.53ms step:24/1480 train_time:1997ms step_avg:142.63ms step:25/1480 train_time:2142ms step_avg:142.77ms step:26/1480 train_time:2284ms step_avg:142.75ms step:27/1480 train_time:2425ms step_avg:142.66ms step:28/1480 train_time:2566ms step_avg:142.56ms step:29/1480 train_time:2708ms step_avg:142.52ms step:30/1480 train_time:2851ms step_avg:142.56ms step:31/1480 train_time:2994ms step_avg:142.59ms step:32/1480 train_time:3139ms step_avg:142.69ms step:33/1480 train_time:3282ms step_avg:142.71ms step:34/1480 train_time:3424ms step_avg:142.66ms step:35/1480 train_time:3565ms step_avg:142.59ms step:36/1480 train_time:3706ms step_avg:142.54ms step:37/1480 train_time:3849ms step_avg:142.55ms step:38/1480 train_time:3992ms step_avg:142.59ms step:39/1480 train_time:4136ms step_avg:142.63ms step:40/1480 train_time:4279ms step_avg:142.65ms step:41/1480 train_time:4423ms step_avg:142.66ms step:42/1480 train_time:4564ms step_avg:142.64ms step:43/1480 train_time:4705ms step_avg:142.59ms step:44/1480 train_time:4848ms step_avg:142.60ms step:45/1480 train_time:4993ms step_avg:142.64ms step:46/1480 train_time:5137ms step_avg:142.70ms step:47/1480 train_time:5281ms step_avg:142.73ms step:48/1480 train_time:5424ms step_avg:142.73ms step:49/1480 train_time:5565ms step_avg:142.70ms step:50/1480 train_time:5707ms step_avg:142.66ms step:51/1480 train_time:5848ms step_avg:142.64ms step:52/1480 train_time:5990ms step_avg:142.62ms step:53/1480 train_time:6135ms step_avg:142.68ms step:54/1480 train_time:6278ms step_avg:142.68ms step:55/1480 train_time:6421ms step_avg:142.68ms step:56/1480 train_time:6564ms step_avg:142.69ms step:57/1480 train_time:6705ms step_avg:142.65ms step:58/1480 train_time:6846ms step_avg:142.63ms step:59/1480 train_time:6988ms step_avg:142.61ms step:60/1480 train_time:7130ms step_avg:142.61ms step:61/1480 train_time:7274ms step_avg:142.62ms step:62/1480 train_time:7418ms step_avg:142.65ms step:63/1480 train_time:7561ms step_avg:142.67ms step:64/1480 train_time:7702ms step_avg:142.64ms step:65/1480 train_time:7845ms step_avg:142.63ms step:66/1480 train_time:7985ms step_avg:142.60ms step:67/1480 train_time:8127ms step_avg:142.59ms step:68/1480 train_time:8269ms step_avg:142.57ms step:69/1480 train_time:8413ms step_avg:142.60ms step:70/1480 train_time:8557ms step_avg:142.62ms step:71/1480 train_time:8701ms step_avg:142.63ms step:72/1480 train_time:8844ms step_avg:142.65ms step:73/1480 train_time:8985ms step_avg:142.63ms step:74/1480 train_time:9128ms step_avg:142.63ms step:75/1480 train_time:9269ms step_avg:142.61ms step:76/1480 train_time:9413ms step_avg:142.63ms step:77/1480 train_time:9557ms step_avg:142.65ms step:78/1480 train_time:9700ms step_avg:142.65ms step:79/1480 train_time:9844ms step_avg:142.66ms step:80/1480 train_time:9985ms step_avg:142.64ms step:81/1480 train_time:10128ms step_avg:142.64ms step:82/1480 train_time:10269ms step_avg:142.62ms step:83/1480 train_time:10411ms step_avg:142.61ms step:84/1480 train_time:10555ms step_avg:142.64ms step:85/1480 train_time:10699ms step_avg:142.65ms step:86/1480 train_time:10842ms step_avg:142.66ms step:87/1480 train_time:10984ms step_avg:142.65ms step:88/1480 train_time:11126ms step_avg:142.65ms step:89/1480 train_time:11267ms step_avg:142.62ms step:90/1480 train_time:11409ms step_avg:142.61ms step:91/1480 train_time:11550ms step_avg:142.60ms step:92/1480 
train_time:11693ms step_avg:142.59ms step:93/1480 train_time:11836ms step_avg:142.60ms step:94/1480 train_time:11978ms step_avg:142.59ms step:95/1480 train_time:12121ms step_avg:142.60ms step:96/1480 train_time:12263ms step_avg:142.60ms step:97/1480 train_time:12405ms step_avg:142.59ms step:98/1480 train_time:12547ms step_avg:142.58ms step:99/1480 train_time:12690ms step_avg:142.59ms step:100/1480 train_time:12834ms step_avg:142.60ms step:101/1480 train_time:12977ms step_avg:142.61ms step:102/1480 train_time:13122ms step_avg:142.63ms step:103/1480 train_time:13264ms step_avg:142.62ms step:104/1480 train_time:13406ms step_avg:142.62ms step:105/1480 train_time:13547ms step_avg:142.60ms step:106/1480 train_time:13689ms step_avg:142.60ms step:107/1480 train_time:13833ms step_avg:142.61ms step:108/1480 train_time:13974ms step_avg:142.59ms step:109/1480 train_time:14118ms step_avg:142.60ms step:110/1480 train_time:14261ms step_avg:142.61ms step:111/1480 train_time:14406ms step_avg:142.63ms step:112/1480 train_time:14552ms step_avg:142.66ms step:113/1480 train_time:14698ms step_avg:142.70ms step:114/1480 train_time:14846ms step_avg:142.75ms step:115/1480 train_time:14992ms step_avg:142.78ms step:116/1480 train_time:15140ms step_avg:142.83ms step:117/1480 train_time:15287ms step_avg:142.87ms step:118/1480 train_time:15434ms step_avg:142.91ms step:119/1480 train_time:15582ms step_avg:142.95ms step:120/1480 train_time:15729ms step_avg:142.99ms step:121/1480 train_time:15875ms step_avg:143.01ms step:122/1480 train_time:16023ms step_avg:143.06ms step:123/1480 train_time:16169ms step_avg:143.08ms step:124/1480 train_time:16316ms step_avg:143.12ms step:125/1480 train_time:16463ms step_avg:143.15ms step:125/1480 val_loss:4.4014 train_time:16519ms step_avg:143.64ms step:126/1480 train_time:16614ms step_avg:143.23ms step:127/1480 train_time:16764ms step_avg:143.28ms step:128/1480 train_time:16911ms step_avg:143.31ms step:129/1480 train_time:17056ms step_avg:143.33ms step:130/1480 train_time:17201ms step_avg:143.35ms step:131/1480 train_time:17349ms step_avg:143.38ms step:132/1480 train_time:17496ms step_avg:143.41ms step:133/1480 train_time:17647ms step_avg:143.47ms step:134/1480 train_time:17794ms step_avg:143.50ms step:135/1480 train_time:17941ms step_avg:143.53ms step:136/1480 train_time:18089ms step_avg:143.56ms step:137/1480 train_time:18234ms step_avg:143.57ms step:138/1480 train_time:18380ms step_avg:143.59ms step:139/1480 train_time:18528ms step_avg:143.63ms step:140/1480 train_time:18676ms step_avg:143.66ms step:141/1480 train_time:18824ms step_avg:143.69ms step:142/1480 train_time:18971ms step_avg:143.72ms step:143/1480 train_time:19117ms step_avg:143.73ms step:144/1480 train_time:19264ms step_avg:143.76ms step:145/1480 train_time:19411ms step_avg:143.78ms step:146/1480 train_time:19557ms step_avg:143.80ms step:147/1480 train_time:19704ms step_avg:143.83ms step:148/1480 train_time:19852ms step_avg:143.85ms step:149/1480 train_time:19997ms step_avg:143.87ms step:150/1480 train_time:20145ms step_avg:143.89ms step:151/1480 train_time:20291ms step_avg:143.91ms step:152/1480 train_time:20437ms step_avg:143.93ms step:153/1480 train_time:20586ms step_avg:143.96ms step:154/1480 train_time:20732ms step_avg:143.97ms step:155/1480 train_time:20879ms step_avg:144.00ms step:156/1480 train_time:21027ms step_avg:144.02ms step:157/1480 train_time:21174ms step_avg:144.04ms step:158/1480 train_time:21320ms step_avg:144.05ms step:159/1480 train_time:21467ms step_avg:144.07ms step:160/1480 train_time:21613ms 
step_avg:144.08ms step:161/1480 train_time:21758ms step_avg:144.09ms step:162/1480 train_time:21904ms step_avg:144.11ms step:163/1480 train_time:22052ms step_avg:144.13ms step:164/1480 train_time:22198ms step_avg:144.14ms step:165/1480 train_time:22347ms step_avg:144.17ms step:166/1480 train_time:22493ms step_avg:144.19ms step:167/1480 train_time:22639ms step_avg:144.20ms step:168/1480 train_time:22787ms step_avg:144.22ms step:169/1480 train_time:22933ms step_avg:144.24ms step:170/1480 train_time:23082ms step_avg:144.26ms step:171/1480 train_time:23229ms step_avg:144.28ms step:172/1480 train_time:23375ms step_avg:144.29ms step:173/1480 train_time:23522ms step_avg:144.31ms step:174/1480 train_time:23669ms step_avg:144.32ms step:175/1480 train_time:23814ms step_avg:144.33ms step:176/1480 train_time:23962ms step_avg:144.35ms step:177/1480 train_time:24108ms step_avg:144.36ms step:178/1480 train_time:24255ms step_avg:144.37ms step:179/1480 train_time:24400ms step_avg:144.38ms step:180/1480 train_time:24549ms step_avg:144.40ms step:181/1480 train_time:24695ms step_avg:144.41ms step:182/1480 train_time:24841ms step_avg:144.43ms step:183/1480 train_time:24989ms step_avg:144.44ms step:184/1480 train_time:25135ms step_avg:144.45ms step:185/1480 train_time:25283ms step_avg:144.47ms step:186/1480 train_time:25430ms step_avg:144.49ms step:187/1480 train_time:25576ms step_avg:144.50ms step:188/1480 train_time:25725ms step_avg:144.52ms step:189/1480 train_time:25871ms step_avg:144.53ms step:190/1480 train_time:26015ms step_avg:144.53ms step:191/1480 train_time:26161ms step_avg:144.54ms step:192/1480 train_time:26308ms step_avg:144.55ms step:193/1480 train_time:26454ms step_avg:144.56ms step:194/1480 train_time:26600ms step_avg:144.57ms step:195/1480 train_time:26747ms step_avg:144.58ms step:196/1480 train_time:26893ms step_avg:144.59ms step:197/1480 train_time:27041ms step_avg:144.60ms step:198/1480 train_time:27188ms step_avg:144.62ms step:199/1480 train_time:27334ms step_avg:144.62ms step:200/1480 train_time:27482ms step_avg:144.64ms step:201/1480 train_time:27628ms step_avg:144.65ms step:202/1480 train_time:27773ms step_avg:144.65ms step:203/1480 train_time:27920ms step_avg:144.66ms step:204/1480 train_time:28067ms step_avg:144.68ms step:205/1480 train_time:28213ms step_avg:144.68ms step:206/1480 train_time:28361ms step_avg:144.70ms step:207/1480 train_time:28509ms step_avg:144.71ms step:208/1480 train_time:28655ms step_avg:144.72ms step:209/1480 train_time:28802ms step_avg:144.73ms step:210/1480 train_time:28949ms step_avg:144.74ms step:211/1480 train_time:29095ms step_avg:144.75ms step:212/1480 train_time:29241ms step_avg:144.76ms step:213/1480 train_time:29388ms step_avg:144.77ms step:214/1480 train_time:29534ms step_avg:144.77ms step:215/1480 train_time:29681ms step_avg:144.78ms step:216/1480 train_time:29828ms step_avg:144.80ms step:217/1480 train_time:29973ms step_avg:144.80ms step:218/1480 train_time:30120ms step_avg:144.81ms step:219/1480 train_time:30267ms step_avg:144.82ms step:220/1480 train_time:30414ms step_avg:144.83ms step:221/1480 train_time:30562ms step_avg:144.84ms step:222/1480 train_time:30713ms step_avg:144.87ms step:223/1480 train_time:30862ms step_avg:144.89ms step:224/1480 train_time:31012ms step_avg:144.92ms step:225/1480 train_time:31163ms step_avg:144.94ms step:226/1480 train_time:31313ms step_avg:144.97ms step:227/1480 train_time:31464ms step_avg:144.99ms step:228/1480 train_time:31613ms step_avg:145.02ms step:229/1480 train_time:31765ms step_avg:145.05ms step:230/1480 
train_time:31915ms step_avg:145.07ms step:231/1480 train_time:32067ms step_avg:145.10ms step:232/1480 train_time:32217ms step_avg:145.12ms step:233/1480 train_time:32368ms step_avg:145.15ms step:234/1480 train_time:32517ms step_avg:145.17ms step:235/1480 train_time:32669ms step_avg:145.19ms step:236/1480 train_time:32820ms step_avg:145.22ms step:237/1480 train_time:32970ms step_avg:145.24ms step:238/1480 train_time:33120ms step_avg:145.26ms step:239/1480 train_time:33270ms step_avg:145.28ms step:240/1480 train_time:33420ms step_avg:145.30ms step:241/1480 train_time:33571ms step_avg:145.33ms step:242/1480 train_time:33721ms step_avg:145.35ms step:243/1480 train_time:33872ms step_avg:145.37ms step:244/1480 train_time:34022ms step_avg:145.39ms step:245/1480 train_time:34172ms step_avg:145.41ms step:246/1480 train_time:34321ms step_avg:145.43ms step:247/1480 train_time:34471ms step_avg:145.45ms step:248/1480 train_time:34621ms step_avg:145.47ms step:249/1480 train_time:34771ms step_avg:145.49ms step:250/1480 train_time:34920ms step_avg:145.50ms step:250/1480 val_loss:3.9965 train_time:34979ms step_avg:145.75ms step:251/1480 train_time:35075ms step_avg:145.54ms step:252/1480 train_time:35227ms step_avg:145.57ms step:253/1480 train_time:35378ms step_avg:145.59ms step:254/1480 train_time:35528ms step_avg:145.60ms step:255/1480 train_time:35677ms step_avg:145.62ms step:256/1480 train_time:35826ms step_avg:145.64ms step:257/1480 train_time:35976ms step_avg:145.65ms step:258/1480 train_time:36128ms step_avg:145.68ms step:259/1480 train_time:36280ms step_avg:145.70ms step:260/1480 train_time:36430ms step_avg:145.72ms step:261/1480 train_time:36581ms step_avg:145.74ms step:262/1480 train_time:36730ms step_avg:145.76ms step:263/1480 train_time:36880ms step_avg:145.77ms step:264/1480 train_time:37031ms step_avg:145.79ms step:265/1480 train_time:37182ms step_avg:145.81ms step:266/1480 train_time:37333ms step_avg:145.83ms step:267/1480 train_time:37484ms step_avg:145.85ms step:268/1480 train_time:37633ms step_avg:145.86ms step:269/1480 train_time:37784ms step_avg:145.88ms step:270/1480 train_time:37932ms step_avg:145.89ms step:271/1480 train_time:38083ms step_avg:145.91ms step:272/1480 train_time:38232ms step_avg:145.92ms step:273/1480 train_time:38383ms step_avg:145.94ms step:274/1480 train_time:38533ms step_avg:145.96ms step:275/1480 train_time:38684ms step_avg:145.98ms step:276/1480 train_time:38833ms step_avg:145.99ms step:277/1480 train_time:38984ms step_avg:146.01ms step:278/1480 train_time:39133ms step_avg:146.02ms step:279/1480 train_time:39284ms step_avg:146.04ms step:280/1480 train_time:39435ms step_avg:146.05ms step:281/1480 train_time:39585ms step_avg:146.07ms step:282/1480 train_time:39736ms step_avg:146.09ms step:283/1480 train_time:39886ms step_avg:146.10ms step:284/1480 train_time:40036ms step_avg:146.12ms step:285/1480 train_time:40186ms step_avg:146.13ms step:286/1480 train_time:40337ms step_avg:146.15ms step:287/1480 train_time:40487ms step_avg:146.16ms step:288/1480 train_time:40638ms step_avg:146.18ms step:289/1480 train_time:40789ms step_avg:146.20ms step:290/1480 train_time:40940ms step_avg:146.21ms step:291/1480 train_time:41091ms step_avg:146.23ms step:292/1480 train_time:41241ms step_avg:146.24ms step:293/1480 train_time:41391ms step_avg:146.26ms step:294/1480 train_time:41541ms step_avg:146.27ms step:295/1480 train_time:41691ms step_avg:146.29ms step:296/1480 train_time:41843ms step_avg:146.30ms step:297/1480 train_time:41994ms step_avg:146.32ms step:298/1480 train_time:42145ms 
step_avg:146.34ms step:299/1480 train_time:42295ms step_avg:146.35ms step:300/1480 train_time:42447ms step_avg:146.37ms step:301/1480 train_time:42597ms step_avg:146.38ms step:302/1480 train_time:42747ms step_avg:146.39ms step:303/1480 train_time:42898ms step_avg:146.41ms step:304/1480 train_time:43048ms step_avg:146.42ms step:305/1480 train_time:43200ms step_avg:146.44ms step:306/1480 train_time:43350ms step_avg:146.45ms step:307/1480 train_time:43501ms step_avg:146.47ms step:308/1480 train_time:43652ms step_avg:146.48ms step:309/1480 train_time:43803ms step_avg:146.50ms step:310/1480 train_time:43954ms step_avg:146.51ms step:311/1480 train_time:44105ms step_avg:146.53ms step:312/1480 train_time:44256ms step_avg:146.54ms step:313/1480 train_time:44406ms step_avg:146.55ms step:314/1480 train_time:44556ms step_avg:146.57ms step:315/1480 train_time:44707ms step_avg:146.58ms step:316/1480 train_time:44857ms step_avg:146.59ms step:317/1480 train_time:45008ms step_avg:146.61ms step:318/1480 train_time:45157ms step_avg:146.61ms step:319/1480 train_time:45307ms step_avg:146.63ms step:320/1480 train_time:45459ms step_avg:146.64ms step:321/1480 train_time:45609ms step_avg:146.65ms step:322/1480 train_time:45760ms step_avg:146.67ms step:323/1480 train_time:45909ms step_avg:146.67ms step:324/1480 train_time:46061ms step_avg:146.69ms step:325/1480 train_time:46212ms step_avg:146.70ms step:326/1480 train_time:46363ms step_avg:146.72ms step:327/1480 train_time:46512ms step_avg:146.73ms step:328/1480 train_time:46664ms step_avg:146.74ms step:329/1480 train_time:46814ms step_avg:146.75ms step:330/1480 train_time:46966ms step_avg:146.77ms step:331/1480 train_time:47120ms step_avg:146.79ms step:332/1480 train_time:47275ms step_avg:146.82ms step:333/1480 train_time:47429ms step_avg:146.84ms step:334/1480 train_time:47582ms step_avg:146.86ms step:335/1480 train_time:47736ms step_avg:146.88ms step:336/1480 train_time:47890ms step_avg:146.90ms step:337/1480 train_time:48043ms step_avg:146.92ms step:338/1480 train_time:48198ms step_avg:146.94ms step:339/1480 train_time:48351ms step_avg:146.96ms step:340/1480 train_time:48506ms step_avg:146.99ms step:341/1480 train_time:48660ms step_avg:147.01ms step:342/1480 train_time:48811ms step_avg:147.02ms step:343/1480 train_time:48966ms step_avg:147.05ms step:344/1480 train_time:49122ms step_avg:147.07ms step:345/1480 train_time:49280ms step_avg:147.10ms step:346/1480 train_time:49434ms step_avg:147.12ms step:347/1480 train_time:49588ms step_avg:147.14ms step:348/1480 train_time:49741ms step_avg:147.16ms step:349/1480 train_time:49896ms step_avg:147.18ms step:350/1480 train_time:50050ms step_avg:147.20ms step:351/1480 train_time:50204ms step_avg:147.23ms step:352/1480 train_time:50359ms step_avg:147.25ms step:353/1480 train_time:50512ms step_avg:147.27ms step:354/1480 train_time:50665ms step_avg:147.28ms step:355/1480 train_time:50820ms step_avg:147.30ms step:356/1480 train_time:50974ms step_avg:147.32ms step:357/1480 train_time:51128ms step_avg:147.34ms step:358/1480 train_time:51282ms step_avg:147.36ms step:359/1480 train_time:51436ms step_avg:147.38ms step:360/1480 train_time:51589ms step_avg:147.40ms step:361/1480 train_time:51743ms step_avg:147.42ms step:362/1480 train_time:51899ms step_avg:147.44ms step:363/1480 train_time:52051ms step_avg:147.45ms step:364/1480 train_time:52205ms step_avg:147.47ms step:365/1480 train_time:52359ms step_avg:147.49ms step:366/1480 train_time:52513ms step_avg:147.51ms step:367/1480 train_time:52667ms step_avg:147.53ms step:368/1480 
train_time:52820ms step_avg:147.54ms step:369/1480 train_time:52975ms step_avg:147.56ms step:370/1480 train_time:53128ms step_avg:147.58ms step:371/1480 train_time:53281ms step_avg:147.59ms step:372/1480 train_time:53435ms step_avg:147.61ms step:373/1480 train_time:53593ms step_avg:147.64ms step:374/1480 train_time:53741ms step_avg:147.64ms step:375/1480 train_time:53897ms step_avg:147.66ms step:375/1480 val_loss:3.8148 train_time:53958ms step_avg:147.83ms step:376/1480 train_time:54057ms step_avg:147.70ms step:377/1480 train_time:54211ms step_avg:147.71ms step:378/1480 train_time:54364ms step_avg:147.73ms step:379/1480 train_time:54517ms step_avg:147.74ms step:380/1480 train_time:54668ms step_avg:147.75ms step:381/1480 train_time:54820ms step_avg:147.76ms step:382/1480 train_time:54976ms step_avg:147.79ms step:383/1480 train_time:55132ms step_avg:147.81ms step:384/1480 train_time:55287ms step_avg:147.82ms step:385/1480 train_time:55441ms step_avg:147.84ms step:386/1480 train_time:55594ms step_avg:147.86ms step:387/1480 train_time:55747ms step_avg:147.87ms step:388/1480 train_time:55901ms step_avg:147.89ms step:389/1480 train_time:56055ms step_avg:147.90ms step:390/1480 train_time:56208ms step_avg:147.92ms step:391/1480 train_time:56364ms step_avg:147.94ms step:392/1480 train_time:56517ms step_avg:147.95ms step:393/1480 train_time:56672ms step_avg:147.97ms step:394/1480 train_time:56824ms step_avg:147.98ms step:395/1480 train_time:56977ms step_avg:147.99ms step:396/1480 train_time:57131ms step_avg:148.01ms step:397/1480 train_time:57284ms step_avg:148.02ms step:398/1480 train_time:57438ms step_avg:148.04ms step:399/1480 train_time:57593ms step_avg:148.05ms step:400/1480 train_time:57748ms step_avg:148.07ms step:401/1480 train_time:57901ms step_avg:148.08ms step:402/1480 train_time:58055ms step_avg:148.10ms step:403/1480 train_time:58209ms step_avg:148.11ms step:404/1480 train_time:58363ms step_avg:148.13ms step:405/1480 train_time:58517ms step_avg:148.14ms step:406/1480 train_time:58672ms step_avg:148.16ms step:407/1480 train_time:58824ms step_avg:148.17ms step:408/1480 train_time:58977ms step_avg:148.18ms step:409/1480 train_time:59131ms step_avg:148.20ms step:410/1480 train_time:59283ms step_avg:148.21ms step:411/1480 train_time:59437ms step_avg:148.22ms step:412/1480 train_time:59592ms step_avg:148.24ms step:413/1480 train_time:59745ms step_avg:148.25ms step:414/1480 train_time:59899ms step_avg:148.27ms step:415/1480 train_time:60054ms step_avg:148.28ms step:416/1480 train_time:60207ms step_avg:148.29ms step:417/1480 train_time:60362ms step_avg:148.31ms step:418/1480 train_time:60516ms step_avg:148.32ms step:419/1480 train_time:60670ms step_avg:148.34ms step:420/1480 train_time:60823ms step_avg:148.35ms step:421/1480 train_time:60977ms step_avg:148.36ms step:422/1480 train_time:61131ms step_avg:148.38ms step:423/1480 train_time:61283ms step_avg:148.39ms step:424/1480 train_time:61437ms step_avg:148.40ms step:425/1480 train_time:61593ms step_avg:148.42ms step:426/1480 train_time:61747ms step_avg:148.43ms step:427/1480 train_time:61900ms step_avg:148.44ms step:428/1480 train_time:62055ms step_avg:148.46ms step:429/1480 train_time:62207ms step_avg:148.47ms step:430/1480 train_time:62361ms step_avg:148.48ms step:431/1480 train_time:62516ms step_avg:148.49ms step:432/1480 train_time:62670ms step_avg:148.51ms step:433/1480 train_time:62822ms step_avg:148.52ms step:434/1480 train_time:62976ms step_avg:148.53ms step:435/1480 train_time:63130ms step_avg:148.54ms step:436/1480 train_time:63283ms 
step_avg:148.55ms step:437/1480 train_time:63436ms step_avg:148.56ms step:438/1480 train_time:63591ms step_avg:148.58ms step:439/1480 train_time:63744ms step_avg:148.59ms step:440/1480 train_time:63899ms step_avg:148.60ms step:441/1480 train_time:64057ms step_avg:148.62ms step:442/1480 train_time:64214ms step_avg:148.64ms step:443/1480 train_time:64370ms step_avg:148.66ms step:444/1480 train_time:64525ms step_avg:148.68ms step:445/1480 train_time:64680ms step_avg:148.69ms step:446/1480 train_time:64835ms step_avg:148.70ms step:447/1480 train_time:64991ms step_avg:148.72ms step:448/1480 train_time:65146ms step_avg:148.74ms step:449/1480 train_time:65303ms step_avg:148.75ms step:450/1480 train_time:65460ms step_avg:148.77ms step:451/1480 train_time:65619ms step_avg:148.80ms step:452/1480 train_time:65775ms step_avg:148.81ms step:453/1480 train_time:65931ms step_avg:148.83ms step:454/1480 train_time:66086ms step_avg:148.84ms step:455/1480 train_time:66242ms step_avg:148.86ms step:456/1480 train_time:66401ms step_avg:148.88ms step:457/1480 train_time:66557ms step_avg:148.90ms step:458/1480 train_time:66714ms step_avg:148.91ms step:459/1480 train_time:66872ms step_avg:148.94ms step:460/1480 train_time:67029ms step_avg:148.95ms step:461/1480 train_time:67186ms step_avg:148.97ms step:462/1480 train_time:67342ms step_avg:148.99ms step:463/1480 train_time:67500ms step_avg:149.01ms step:464/1480 train_time:67658ms step_avg:149.03ms step:465/1480 train_time:67815ms step_avg:149.04ms step:466/1480 train_time:67972ms step_avg:149.06ms step:467/1480 train_time:68130ms step_avg:149.08ms step:468/1480 train_time:68285ms step_avg:149.09ms step:469/1480 train_time:68440ms step_avg:149.11ms step:470/1480 train_time:68598ms step_avg:149.13ms step:471/1480 train_time:68755ms step_avg:149.14ms step:472/1480 train_time:68912ms step_avg:149.16ms step:473/1480 train_time:69068ms step_avg:149.17ms step:474/1480 train_time:69224ms step_avg:149.19ms step:475/1480 train_time:69380ms step_avg:149.20ms step:476/1480 train_time:69537ms step_avg:149.22ms step:477/1480 train_time:69696ms step_avg:149.24ms step:478/1480 train_time:69853ms step_avg:149.26ms step:479/1480 train_time:70009ms step_avg:149.27ms step:480/1480 train_time:70166ms step_avg:149.29ms step:481/1480 train_time:70321ms step_avg:149.30ms step:482/1480 train_time:70479ms step_avg:149.32ms step:483/1480 train_time:70636ms step_avg:149.34ms step:484/1480 train_time:70793ms step_avg:149.35ms step:485/1480 train_time:70952ms step_avg:149.37ms step:486/1480 train_time:71109ms step_avg:149.39ms step:487/1480 train_time:71266ms step_avg:149.40ms step:488/1480 train_time:71422ms step_avg:149.42ms step:489/1480 train_time:71578ms step_avg:149.43ms step:490/1480 train_time:71734ms step_avg:149.45ms step:491/1480 train_time:71891ms step_avg:149.46ms step:492/1480 train_time:72047ms step_avg:149.48ms step:493/1480 train_time:72204ms step_avg:149.49ms step:494/1480 train_time:72361ms step_avg:149.51ms step:495/1480 train_time:72518ms step_avg:149.52ms step:496/1480 train_time:72677ms step_avg:149.54ms step:497/1480 train_time:72834ms step_avg:149.56ms step:498/1480 train_time:72991ms step_avg:149.57ms step:499/1480 train_time:73148ms step_avg:149.59ms step:500/1480 train_time:73304ms step_avg:149.60ms step:500/1480 val_loss:3.6941 train_time:73366ms step_avg:149.73ms step:501/1480 train_time:73463ms step_avg:149.62ms step:502/1480 train_time:73618ms step_avg:149.63ms step:503/1480 train_time:73774ms step_avg:149.64ms step:504/1480 train_time:73930ms step_avg:149.66ms 
step:505/1480 train_time:74083ms step_avg:149.66ms step:506/1480 train_time:74240ms step_avg:149.68ms step:507/1480 train_time:74397ms step_avg:149.69ms step:508/1480 train_time:74556ms step_avg:149.71ms step:509/1480 train_time:74713ms step_avg:149.73ms step:510/1480 train_time:74869ms step_avg:149.74ms step:511/1480 train_time:75025ms step_avg:149.75ms step:512/1480 train_time:75182ms step_avg:149.77ms step:513/1480 train_time:75340ms step_avg:149.78ms step:514/1480 train_time:75497ms step_avg:149.80ms step:515/1480 train_time:75656ms step_avg:149.81ms step:516/1480 train_time:75815ms step_avg:149.83ms step:517/1480 train_time:75973ms step_avg:149.85ms step:518/1480 train_time:76130ms step_avg:149.86ms step:519/1480 train_time:76286ms step_avg:149.87ms step:520/1480 train_time:76442ms step_avg:149.89ms step:521/1480 train_time:76599ms step_avg:149.90ms step:522/1480 train_time:76756ms step_avg:149.91ms step:523/1480 train_time:76914ms step_avg:149.93ms step:524/1480 train_time:77071ms step_avg:149.94ms step:525/1480 train_time:77226ms step_avg:149.95ms step:526/1480 train_time:77382ms step_avg:149.96ms step:527/1480 train_time:77539ms step_avg:149.98ms step:528/1480 train_time:77696ms step_avg:149.99ms step:529/1480 train_time:77854ms step_avg:150.01ms step:530/1480 train_time:78013ms step_avg:150.03ms step:531/1480 train_time:78171ms step_avg:150.04ms step:532/1480 train_time:78327ms step_avg:150.05ms step:533/1480 train_time:78483ms step_avg:150.06ms step:534/1480 train_time:78638ms step_avg:150.07ms step:535/1480 train_time:78796ms step_avg:150.09ms step:536/1480 train_time:78954ms step_avg:150.10ms step:537/1480 train_time:79113ms step_avg:150.12ms step:538/1480 train_time:79271ms step_avg:150.14ms step:539/1480 train_time:79430ms step_avg:150.15ms step:540/1480 train_time:79586ms step_avg:150.16ms step:541/1480 train_time:79741ms step_avg:150.17ms step:542/1480 train_time:79898ms step_avg:150.18ms step:543/1480 train_time:80054ms step_avg:150.20ms step:544/1480 train_time:80212ms step_avg:150.21ms step:545/1480 train_time:80369ms step_avg:150.22ms step:546/1480 train_time:80525ms step_avg:150.23ms step:547/1480 train_time:80681ms step_avg:150.24ms step:548/1480 train_time:80839ms step_avg:150.26ms step:549/1480 train_time:80996ms step_avg:150.27ms step:550/1480 train_time:81154ms step_avg:150.29ms step:551/1480 train_time:81313ms step_avg:150.30ms step:552/1480 train_time:81472ms step_avg:150.32ms step:553/1480 train_time:81634ms step_avg:150.34ms step:554/1480 train_time:81792ms step_avg:150.35ms step:555/1480 train_time:81953ms step_avg:150.37ms step:556/1480 train_time:82111ms step_avg:150.39ms step:557/1480 train_time:82274ms step_avg:150.41ms step:558/1480 train_time:82435ms step_avg:150.43ms step:559/1480 train_time:82595ms step_avg:150.45ms step:560/1480 train_time:82756ms step_avg:150.47ms step:561/1480 train_time:82916ms step_avg:150.48ms step:562/1480 train_time:83076ms step_avg:150.50ms step:563/1480 train_time:83235ms step_avg:150.52ms step:564/1480 train_time:83395ms step_avg:150.53ms step:565/1480 train_time:83555ms step_avg:150.55ms step:566/1480 train_time:83716ms step_avg:150.57ms step:567/1480 train_time:83876ms step_avg:150.59ms step:568/1480 train_time:84036ms step_avg:150.60ms step:569/1480 train_time:84195ms step_avg:150.62ms step:570/1480 train_time:84355ms step_avg:150.63ms step:571/1480 train_time:84516ms step_avg:150.65ms step:572/1480 train_time:84676ms step_avg:150.67ms step:573/1480 train_time:84836ms step_avg:150.69ms step:574/1480 train_time:84997ms 
step_avg:150.70ms step:575/1480 train_time:85158ms step_avg:150.72ms step:576/1480 train_time:85317ms step_avg:150.74ms step:577/1480 train_time:85476ms step_avg:150.75ms step:578/1480 train_time:85635ms step_avg:150.77ms step:579/1480 train_time:85796ms step_avg:150.78ms step:580/1480 train_time:85957ms step_avg:150.80ms step:581/1480 train_time:86117ms step_avg:150.82ms step:582/1480 train_time:86278ms step_avg:150.84ms step:583/1480 train_time:86437ms step_avg:150.85ms step:584/1480 train_time:86597ms step_avg:150.87ms step:585/1480 train_time:86756ms step_avg:150.88ms step:586/1480 train_time:86916ms step_avg:150.90ms step:587/1480 train_time:87077ms step_avg:150.91ms step:588/1480 train_time:87236ms step_avg:150.93ms step:589/1480 train_time:87397ms step_avg:150.94ms step:590/1480 train_time:87557ms step_avg:150.96ms step:591/1480 train_time:87716ms step_avg:150.97ms step:592/1480 train_time:87877ms step_avg:150.99ms step:593/1480 train_time:88038ms step_avg:151.01ms step:594/1480 train_time:88198ms step_avg:151.02ms step:595/1480 train_time:88359ms step_avg:151.04ms step:596/1480 train_time:88519ms step_avg:151.06ms step:597/1480 train_time:88677ms step_avg:151.07ms step:598/1480 train_time:88836ms step_avg:151.08ms step:599/1480 train_time:88994ms step_avg:151.09ms step:600/1480 train_time:89154ms step_avg:151.11ms step:601/1480 train_time:89314ms step_avg:151.12ms step:602/1480 train_time:89474ms step_avg:151.14ms step:603/1480 train_time:89635ms step_avg:151.16ms step:604/1480 train_time:89795ms step_avg:151.17ms step:605/1480 train_time:89956ms step_avg:151.19ms step:606/1480 train_time:90117ms step_avg:151.20ms step:607/1480 train_time:90280ms step_avg:151.22ms step:608/1480 train_time:90439ms step_avg:151.24ms step:609/1480 train_time:90598ms step_avg:151.25ms step:610/1480 train_time:90756ms step_avg:151.26ms step:611/1480 train_time:90916ms step_avg:151.27ms step:612/1480 train_time:91076ms step_avg:151.29ms step:613/1480 train_time:91237ms step_avg:151.30ms step:614/1480 train_time:91397ms step_avg:151.32ms step:615/1480 train_time:91555ms step_avg:151.33ms step:616/1480 train_time:91714ms step_avg:151.34ms step:617/1480 train_time:91875ms step_avg:151.36ms step:618/1480 train_time:92034ms step_avg:151.37ms step:619/1480 train_time:92194ms step_avg:151.39ms step:620/1480 train_time:92356ms step_avg:151.40ms step:621/1480 train_time:92515ms step_avg:151.42ms step:622/1480 train_time:92677ms step_avg:151.43ms step:623/1480 train_time:92837ms step_avg:151.45ms step:624/1480 train_time:92996ms step_avg:151.46ms step:625/1480 train_time:93155ms step_avg:151.47ms step:625/1480 val_loss:3.6127 train_time:93219ms step_avg:151.58ms step:626/1480 train_time:93317ms step_avg:151.49ms step:627/1480 train_time:93476ms step_avg:151.50ms step:628/1480 train_time:93633ms step_avg:151.51ms step:629/1480 train_time:93792ms step_avg:151.52ms step:630/1480 train_time:93950ms step_avg:151.53ms step:631/1480 train_time:94108ms step_avg:151.54ms step:632/1480 train_time:94267ms step_avg:151.56ms step:633/1480 train_time:94428ms step_avg:151.57ms step:634/1480 train_time:94588ms step_avg:151.58ms step:635/1480 train_time:94748ms step_avg:151.60ms step:636/1480 train_time:94907ms step_avg:151.61ms step:637/1480 train_time:95067ms step_avg:151.62ms step:638/1480 train_time:95227ms step_avg:151.64ms step:639/1480 train_time:95387ms step_avg:151.65ms step:640/1480 train_time:95547ms step_avg:151.66ms step:641/1480 train_time:95707ms step_avg:151.68ms step:642/1480 train_time:95867ms step_avg:151.69ms 
step:643/1480 train_time:96027ms step_avg:151.70ms step:644/1480 train_time:96186ms step_avg:151.71ms step:645/1480 train_time:96346ms step_avg:151.73ms step:646/1480 train_time:96507ms step_avg:151.74ms step:647/1480 train_time:96666ms step_avg:151.75ms step:648/1480 train_time:96828ms step_avg:151.77ms step:649/1480 train_time:96987ms step_avg:151.78ms step:650/1480 train_time:97148ms step_avg:151.79ms step:651/1480 train_time:97308ms step_avg:151.81ms step:652/1480 train_time:97467ms step_avg:151.82ms step:653/1480 train_time:97627ms step_avg:151.83ms step:654/1480 train_time:97786ms step_avg:151.84ms step:655/1480 train_time:97947ms step_avg:151.86ms step:656/1480 train_time:98107ms step_avg:151.87ms step:657/1480 train_time:98268ms step_avg:151.88ms step:658/1480 train_time:98428ms step_avg:151.89ms step:659/1480 train_time:98589ms step_avg:151.91ms step:660/1480 train_time:98752ms step_avg:151.93ms step:661/1480 train_time:98914ms step_avg:151.94ms step:662/1480 train_time:99073ms step_avg:151.95ms step:663/1480 train_time:99232ms step_avg:151.96ms step:664/1480 train_time:99393ms step_avg:151.98ms step:665/1480 train_time:99554ms step_avg:151.99ms step:666/1480 train_time:99713ms step_avg:152.00ms step:667/1480 train_time:99874ms step_avg:152.02ms step:668/1480 train_time:100035ms step_avg:152.03ms step:669/1480 train_time:100196ms step_avg:152.04ms step:670/1480 train_time:100355ms step_avg:152.05ms step:671/1480 train_time:100515ms step_avg:152.07ms step:672/1480 train_time:100676ms step_avg:152.08ms step:673/1480 train_time:100839ms step_avg:152.09ms step:674/1480 train_time:101001ms step_avg:152.11ms step:675/1480 train_time:101164ms step_avg:152.13ms step:676/1480 train_time:101327ms step_avg:152.14ms step:677/1480 train_time:101488ms step_avg:152.16ms step:678/1480 train_time:101649ms step_avg:152.17ms step:679/1480 train_time:101810ms step_avg:152.18ms step:680/1480 train_time:101972ms step_avg:152.20ms step:681/1480 train_time:102133ms step_avg:152.21ms step:682/1480 train_time:102296ms step_avg:152.23ms step:683/1480 train_time:102457ms step_avg:152.24ms step:684/1480 train_time:102619ms step_avg:152.25ms step:685/1480 train_time:102781ms step_avg:152.27ms step:686/1480 train_time:102943ms step_avg:152.28ms step:687/1480 train_time:103104ms step_avg:152.29ms step:688/1480 train_time:103268ms step_avg:152.31ms step:689/1480 train_time:103432ms step_avg:152.33ms step:690/1480 train_time:103594ms step_avg:152.34ms step:691/1480 train_time:103754ms step_avg:152.35ms step:692/1480 train_time:103914ms step_avg:152.37ms step:693/1480 train_time:104075ms step_avg:152.38ms step:694/1480 train_time:104235ms step_avg:152.39ms step:695/1480 train_time:104398ms step_avg:152.41ms step:696/1480 train_time:104556ms step_avg:152.41ms step:697/1480 train_time:104720ms step_avg:152.43ms step:698/1480 train_time:104881ms step_avg:152.44ms step:699/1480 train_time:105046ms step_avg:152.46ms step:700/1480 train_time:105209ms step_avg:152.48ms step:701/1480 train_time:105370ms step_avg:152.49ms step:702/1480 train_time:105530ms step_avg:152.50ms step:703/1480 train_time:105690ms step_avg:152.51ms step:704/1480 train_time:105850ms step_avg:152.52ms step:705/1480 train_time:106015ms step_avg:152.54ms step:706/1480 train_time:106180ms step_avg:152.56ms step:707/1480 train_time:106340ms step_avg:152.57ms step:708/1480 train_time:106500ms step_avg:152.58ms step:709/1480 train_time:106662ms step_avg:152.59ms step:710/1480 train_time:106825ms step_avg:152.61ms step:711/1480 train_time:106988ms 
step_avg:152.62ms step:712/1480 train_time:107153ms step_avg:152.64ms step:713/1480 train_time:107317ms step_avg:152.66ms step:714/1480 train_time:107476ms step_avg:152.67ms step:715/1480 train_time:107634ms step_avg:152.67ms step:716/1480 train_time:107792ms step_avg:152.68ms step:717/1480 train_time:107956ms step_avg:152.70ms step:718/1480 train_time:108115ms step_avg:152.71ms step:719/1480 train_time:108275ms step_avg:152.72ms step:720/1480 train_time:108438ms step_avg:152.73ms step:721/1480 train_time:108599ms step_avg:152.74ms step:722/1480 train_time:108760ms step_avg:152.75ms step:723/1480 train_time:108920ms step_avg:152.76ms step:724/1480 train_time:109082ms step_avg:152.78ms step:725/1480 train_time:109248ms step_avg:152.79ms step:726/1480 train_time:109411ms step_avg:152.81ms step:727/1480 train_time:109574ms step_avg:152.82ms step:728/1480 train_time:109734ms step_avg:152.83ms step:729/1480 train_time:109894ms step_avg:152.84ms step:730/1480 train_time:110056ms step_avg:152.86ms step:731/1480 train_time:110216ms step_avg:152.86ms step:732/1480 train_time:110375ms step_avg:152.87ms step:733/1480 train_time:110538ms step_avg:152.89ms step:734/1480 train_time:110699ms step_avg:152.90ms step:735/1480 train_time:110860ms step_avg:152.91ms step:736/1480 train_time:111023ms step_avg:152.92ms step:737/1480 train_time:111184ms step_avg:152.94ms step:738/1480 train_time:111347ms step_avg:152.95ms step:739/1480 train_time:111509ms step_avg:152.96ms step:740/1480 train_time:111673ms step_avg:152.98ms step:741/1480 train_time:111835ms step_avg:152.99ms step:742/1480 train_time:111996ms step_avg:153.00ms step:743/1480 train_time:112155ms step_avg:153.01ms step:744/1480 train_time:112318ms step_avg:153.02ms step:745/1480 train_time:112482ms step_avg:153.04ms step:746/1480 train_time:112643ms step_avg:153.05ms step:747/1480 train_time:112805ms step_avg:153.06ms step:748/1480 train_time:112970ms step_avg:153.08ms step:749/1480 train_time:113133ms step_avg:153.09ms step:750/1480 train_time:113292ms step_avg:153.10ms step:750/1480 val_loss:3.5568 train_time:113358ms step_avg:153.19ms step:751/1480 train_time:113458ms step_avg:153.12ms step:752/1480 train_time:113623ms step_avg:153.13ms step:753/1480 train_time:113786ms step_avg:153.14ms step:754/1480 train_time:113945ms step_avg:153.15ms step:755/1480 train_time:114107ms step_avg:153.16ms step:756/1480 train_time:114268ms step_avg:153.17ms step:757/1480 train_time:114432ms step_avg:153.19ms step:758/1480 train_time:114592ms step_avg:153.20ms step:759/1480 train_time:114754ms step_avg:153.21ms step:760/1480 train_time:114913ms step_avg:153.22ms step:761/1480 train_time:115075ms step_avg:153.23ms step:762/1480 train_time:115236ms step_avg:153.24ms step:763/1480 train_time:115398ms step_avg:153.25ms step:764/1480 train_time:115562ms step_avg:153.27ms step:765/1480 train_time:115725ms step_avg:153.28ms step:766/1480 train_time:115889ms step_avg:153.29ms step:767/1480 train_time:116049ms step_avg:153.30ms step:768/1480 train_time:116210ms step_avg:153.31ms step:769/1480 train_time:116374ms step_avg:153.33ms step:770/1480 train_time:116537ms step_avg:153.34ms step:771/1480 train_time:116703ms step_avg:153.36ms step:772/1480 train_time:116866ms step_avg:153.37ms step:773/1480 train_time:117030ms step_avg:153.38ms step:774/1480 train_time:117191ms step_avg:153.39ms step:775/1480 train_time:117353ms step_avg:153.40ms step:776/1480 train_time:117516ms step_avg:153.41ms step:777/1480 train_time:117682ms step_avg:153.43ms step:778/1480 train_time:117845ms 
step_avg:153.44ms step:779/1480 train_time:118008ms step_avg:153.46ms step:780/1480 train_time:118172ms step_avg:153.47ms step:781/1480 train_time:118333ms step_avg:153.48ms step:782/1480 train_time:118496ms step_avg:153.49ms step:783/1480 train_time:118657ms step_avg:153.50ms step:784/1480 train_time:118820ms step_avg:153.51ms step:785/1480 train_time:118985ms step_avg:153.53ms step:786/1480 train_time:119148ms step_avg:153.54ms step:787/1480 train_time:119311ms step_avg:153.55ms step:788/1480 train_time:119476ms step_avg:153.57ms step:789/1480 train_time:119637ms step_avg:153.58ms step:790/1480 train_time:119804ms step_avg:153.59ms step:791/1480 train_time:119969ms step_avg:153.61ms step:792/1480 train_time:120133ms step_avg:153.62ms step:793/1480 train_time:120293ms step_avg:153.63ms step:794/1480 train_time:120459ms step_avg:153.65ms step:795/1480 train_time:120624ms step_avg:153.66ms step:796/1480 train_time:120790ms step_avg:153.68ms step:797/1480 train_time:120953ms step_avg:153.69ms step:798/1480 train_time:121115ms step_avg:153.70ms step:799/1480 train_time:121283ms step_avg:153.72ms step:800/1480 train_time:121446ms step_avg:153.73ms step:801/1480 train_time:121611ms step_avg:153.74ms step:802/1480 train_time:121778ms step_avg:153.76ms step:803/1480 train_time:121941ms step_avg:153.77ms step:804/1480 train_time:122105ms step_avg:153.78ms step:805/1480 train_time:122270ms step_avg:153.80ms step:806/1480 train_time:122431ms step_avg:153.81ms step:807/1480 train_time:122592ms step_avg:153.82ms step:808/1480 train_time:122758ms step_avg:153.83ms step:809/1480 train_time:122922ms step_avg:153.85ms step:810/1480 train_time:123084ms step_avg:153.86ms step:811/1480 train_time:123246ms step_avg:153.87ms step:812/1480 train_time:123410ms step_avg:153.88ms step:813/1480 train_time:123571ms step_avg:153.89ms step:814/1480 train_time:123734ms step_avg:153.90ms step:815/1480 train_time:123896ms step_avg:153.91ms step:816/1480 train_time:124063ms step_avg:153.92ms step:817/1480 train_time:124226ms step_avg:153.94ms step:818/1480 train_time:124387ms step_avg:153.94ms step:819/1480 train_time:124550ms step_avg:153.96ms step:820/1480 train_time:124713ms step_avg:153.97ms step:821/1480 train_time:124875ms step_avg:153.98ms step:822/1480 train_time:125037ms step_avg:153.99ms step:823/1480 train_time:125203ms step_avg:154.00ms step:824/1480 train_time:125366ms step_avg:154.01ms step:825/1480 train_time:125533ms step_avg:154.03ms step:826/1480 train_time:125700ms step_avg:154.04ms step:827/1480 train_time:125864ms step_avg:154.06ms step:828/1480 train_time:126026ms step_avg:154.07ms step:829/1480 train_time:126191ms step_avg:154.08ms step:830/1480 train_time:126354ms step_avg:154.09ms step:831/1480 train_time:126519ms step_avg:154.10ms step:832/1480 train_time:126683ms step_avg:154.12ms step:833/1480 train_time:126847ms step_avg:154.13ms step:834/1480 train_time:127012ms step_avg:154.14ms step:835/1480 train_time:127178ms step_avg:154.15ms step:836/1480 train_time:127343ms step_avg:154.17ms step:837/1480 train_time:127505ms step_avg:154.18ms step:838/1480 train_time:127669ms step_avg:154.19ms step:839/1480 train_time:127832ms step_avg:154.20ms step:840/1480 train_time:127993ms step_avg:154.21ms step:841/1480 train_time:128152ms step_avg:154.21ms step:842/1480 train_time:128315ms step_avg:154.22ms step:843/1480 train_time:128478ms step_avg:154.24ms step:844/1480 train_time:128642ms step_avg:154.25ms step:845/1480 train_time:128807ms step_avg:154.26ms step:846/1480 train_time:128973ms step_avg:154.27ms 
step:847/1480 train_time:129136ms step_avg:154.28ms step:848/1480 train_time:129299ms step_avg:154.29ms step:849/1480 train_time:129463ms step_avg:154.31ms step:850/1480 train_time:129626ms step_avg:154.32ms step:851/1480 train_time:129792ms step_avg:154.33ms step:852/1480 train_time:129953ms step_avg:154.34ms step:853/1480 train_time:130115ms step_avg:154.35ms step:854/1480 train_time:130280ms step_avg:154.36ms step:855/1480 train_time:130444ms step_avg:154.37ms step:856/1480 train_time:130606ms step_avg:154.38ms step:857/1480 train_time:130771ms step_avg:154.39ms step:858/1480 train_time:130936ms step_avg:154.41ms step:859/1480 train_time:131099ms step_avg:154.42ms step:860/1480 train_time:131262ms step_avg:154.43ms step:861/1480 train_time:131427ms step_avg:154.44ms step:862/1480 train_time:131596ms step_avg:154.46ms step:863/1480 train_time:131765ms step_avg:154.47ms step:864/1480 train_time:131929ms step_avg:154.48ms step:865/1480 train_time:132089ms step_avg:154.49ms step:866/1480 train_time:132255ms step_avg:154.50ms step:867/1480 train_time:132419ms step_avg:154.51ms step:868/1480 train_time:132580ms step_avg:154.52ms step:869/1480 train_time:132743ms step_avg:154.53ms step:870/1480 train_time:132908ms step_avg:154.54ms step:871/1480 train_time:133070ms step_avg:154.55ms step:872/1480 train_time:133235ms step_avg:154.57ms step:873/1480 train_time:133397ms step_avg:154.57ms step:874/1480 train_time:133565ms step_avg:154.59ms step:875/1480 train_time:133730ms step_avg:154.60ms step:875/1480 val_loss:3.5116 train_time:133795ms step_avg:154.68ms step:876/1480 train_time:133895ms step_avg:154.61ms step:877/1480 train_time:134059ms step_avg:154.62ms step:878/1480 train_time:134222ms step_avg:154.63ms step:879/1480 train_time:134387ms step_avg:154.65ms step:880/1480 train_time:134550ms step_avg:154.66ms step:881/1480 train_time:134712ms step_avg:154.66ms step:882/1480 train_time:134876ms step_avg:154.67ms step:883/1480 train_time:135044ms step_avg:154.69ms step:884/1480 train_time:135211ms step_avg:154.70ms step:885/1480 train_time:135376ms step_avg:154.72ms step:886/1480 train_time:135543ms step_avg:154.73ms step:887/1480 train_time:135713ms step_avg:154.75ms step:888/1480 train_time:135886ms step_avg:154.77ms step:889/1480 train_time:136054ms step_avg:154.78ms step:890/1480 train_time:136217ms step_avg:154.79ms step:891/1480 train_time:136382ms step_avg:154.80ms step:892/1480 train_time:136548ms step_avg:154.82ms step:893/1480 train_time:136711ms step_avg:154.83ms step:894/1480 train_time:136876ms step_avg:154.84ms step:895/1480 train_time:137042ms step_avg:154.85ms step:896/1480 train_time:137208ms step_avg:154.86ms step:897/1480 train_time:137374ms step_avg:154.87ms step:898/1480 train_time:137542ms step_avg:154.89ms step:899/1480 train_time:137707ms step_avg:154.90ms step:900/1480 train_time:137871ms step_avg:154.91ms step:901/1480 train_time:138035ms step_avg:154.92ms step:902/1480 train_time:138198ms step_avg:154.93ms step:903/1480 train_time:138371ms step_avg:154.95ms step:904/1480 train_time:138535ms step_avg:154.96ms step:905/1480 train_time:138696ms step_avg:154.97ms step:906/1480 train_time:138863ms step_avg:154.98ms step:907/1480 train_time:139031ms step_avg:155.00ms step:908/1480 train_time:139192ms step_avg:155.00ms step:909/1480 train_time:139356ms step_avg:155.01ms step:910/1480 train_time:139528ms step_avg:155.03ms step:911/1480 train_time:139692ms step_avg:155.04ms step:912/1480 train_time:139857ms step_avg:155.05ms step:913/1480 train_time:140027ms step_avg:155.07ms 
step:914/1480 train_time:140194ms step_avg:155.08ms step:915/1480 train_time:140364ms step_avg:155.10ms step:916/1480 train_time:140528ms step_avg:155.11ms step:917/1480 train_time:140690ms step_avg:155.12ms step:918/1480 train_time:140858ms step_avg:155.13ms step:919/1480 train_time:141029ms step_avg:155.15ms step:920/1480 train_time:141194ms step_avg:155.16ms step:921/1480 train_time:141360ms step_avg:155.17ms step:922/1480 train_time:141530ms step_avg:155.19ms step:923/1480 train_time:141693ms step_avg:155.20ms step:924/1480 train_time:141858ms step_avg:155.21ms step:925/1480 train_time:142024ms step_avg:155.22ms step:926/1480 train_time:142187ms step_avg:155.23ms step:927/1480 train_time:142351ms step_avg:155.24ms step:928/1480 train_time:142516ms step_avg:155.25ms step:929/1480 train_time:142681ms step_avg:155.26ms step:930/1480 train_time:142847ms step_avg:155.27ms step:931/1480 train_time:143010ms step_avg:155.28ms step:932/1480 train_time:143176ms step_avg:155.29ms step:933/1480 train_time:143345ms step_avg:155.30ms step:934/1480 train_time:143511ms step_avg:155.31ms step:935/1480 train_time:143682ms step_avg:155.33ms step:936/1480 train_time:143850ms step_avg:155.35ms step:937/1480 train_time:144018ms step_avg:155.36ms step:938/1480 train_time:144179ms step_avg:155.37ms step:939/1480 train_time:144351ms step_avg:155.38ms step:940/1480 train_time:144518ms step_avg:155.40ms step:941/1480 train_time:144681ms step_avg:155.40ms step:942/1480 train_time:144848ms step_avg:155.42ms step:943/1480 train_time:145016ms step_avg:155.43ms step:944/1480 train_time:145190ms step_avg:155.45ms step:945/1480 train_time:145354ms step_avg:155.46ms step:946/1480 train_time:145523ms step_avg:155.47ms step:947/1480 train_time:145691ms step_avg:155.49ms step:948/1480 train_time:145857ms step_avg:155.50ms step:949/1480 train_time:146023ms step_avg:155.51ms step:950/1480 train_time:146187ms step_avg:155.52ms step:951/1480 train_time:146354ms step_avg:155.53ms step:952/1480 train_time:146518ms step_avg:155.54ms step:953/1480 train_time:146686ms step_avg:155.55ms step:954/1480 train_time:146854ms step_avg:155.57ms step:955/1480 train_time:147018ms step_avg:155.57ms step:956/1480 train_time:147184ms step_avg:155.59ms step:957/1480 train_time:147352ms step_avg:155.60ms step:958/1480 train_time:147523ms step_avg:155.61ms step:959/1480 train_time:147688ms step_avg:155.62ms step:960/1480 train_time:147855ms step_avg:155.64ms step:961/1480 train_time:148019ms step_avg:155.65ms step:962/1480 train_time:148184ms step_avg:155.66ms step:963/1480 train_time:148352ms step_avg:155.67ms step:964/1480 train_time:148519ms step_avg:155.68ms step:965/1480 train_time:148683ms step_avg:155.69ms step:966/1480 train_time:148848ms step_avg:155.70ms step:967/1480 train_time:149012ms step_avg:155.71ms step:968/1480 train_time:149177ms step_avg:155.72ms step:969/1480 train_time:149344ms step_avg:155.73ms step:970/1480 train_time:149508ms step_avg:155.74ms step:971/1480 train_time:149673ms step_avg:155.75ms step:972/1480 train_time:149836ms step_avg:155.75ms step:973/1480 train_time:150000ms step_avg:155.76ms step:974/1480 train_time:150169ms step_avg:155.78ms step:975/1480 train_time:150334ms step_avg:155.79ms step:976/1480 train_time:150497ms step_avg:155.79ms step:977/1480 train_time:150663ms step_avg:155.80ms step:978/1480 train_time:150830ms step_avg:155.82ms step:979/1480 train_time:150997ms step_avg:155.83ms step:980/1480 train_time:151163ms step_avg:155.84ms step:981/1480 train_time:151330ms step_avg:155.85ms step:982/1480 
train_time:151493ms step_avg:155.86ms step:983/1480 train_time:151659ms step_avg:155.87ms step:984/1480 train_time:151823ms step_avg:155.88ms step:985/1480 train_time:151989ms step_avg:155.89ms step:986/1480 train_time:152154ms step_avg:155.90ms step:987/1480 train_time:152317ms step_avg:155.90ms step:988/1480 train_time:152484ms step_avg:155.91ms step:989/1480 train_time:152651ms step_avg:155.93ms step:990/1480 train_time:152820ms step_avg:155.94ms step:991/1480 train_time:152989ms step_avg:155.95ms step:992/1480 train_time:153162ms step_avg:155.97ms step:993/1480 train_time:153341ms step_avg:155.99ms step:994/1480 train_time:153508ms step_avg:156.00ms step:995/1480 train_time:153672ms step_avg:156.01ms step:996/1480 train_time:153835ms step_avg:156.02ms step:997/1480 train_time:154000ms step_avg:156.03ms step:998/1480 train_time:154164ms step_avg:156.04ms step:999/1480 train_time:154329ms step_avg:156.05ms step:1000/1480 train_time:154497ms step_avg:156.06ms step:1000/1480 val_loss:3.4475 train_time:154566ms step_avg:156.13ms step:1001/1480 train_time:154667ms step_avg:156.07ms step:1002/1480 train_time:154832ms step_avg:156.08ms step:1003/1480 train_time:155004ms step_avg:156.10ms step:1004/1480 train_time:155172ms step_avg:156.11ms step:1005/1480 train_time:155340ms step_avg:156.12ms step:1006/1480 train_time:155506ms step_avg:156.13ms step:1007/1480 train_time:155672ms step_avg:156.14ms step:1008/1480 train_time:155840ms step_avg:156.15ms step:1009/1480 train_time:156014ms step_avg:156.17ms step:1010/1480 train_time:156181ms step_avg:156.18ms step:1011/1480 train_time:156345ms step_avg:156.19ms step:1012/1480 train_time:156510ms step_avg:156.20ms step:1013/1480 train_time:156680ms step_avg:156.21ms step:1014/1480 train_time:156847ms step_avg:156.22ms step:1015/1480 train_time:157018ms step_avg:156.24ms step:1016/1480 train_time:157186ms step_avg:156.25ms step:1017/1480 train_time:157356ms step_avg:156.26ms step:1018/1480 train_time:157524ms step_avg:156.27ms step:1019/1480 train_time:157691ms step_avg:156.28ms step:1020/1480 train_time:157862ms step_avg:156.30ms step:1021/1480 train_time:158027ms step_avg:156.31ms step:1022/1480 train_time:158194ms step_avg:156.32ms step:1023/1480 train_time:158362ms step_avg:156.33ms step:1024/1480 train_time:158528ms step_avg:156.34ms step:1025/1480 train_time:158700ms step_avg:156.35ms step:1026/1480 train_time:158866ms step_avg:156.36ms step:1027/1480 train_time:159031ms step_avg:156.37ms step:1028/1480 train_time:159205ms step_avg:156.39ms step:1029/1480 train_time:159381ms step_avg:156.41ms step:1030/1480 train_time:159548ms step_avg:156.42ms step:1031/1480 train_time:159712ms step_avg:156.43ms step:1032/1480 train_time:159883ms step_avg:156.44ms step:1033/1480 train_time:160048ms step_avg:156.45ms step:1034/1480 train_time:160217ms step_avg:156.46ms step:1035/1480 train_time:160385ms step_avg:156.47ms step:1036/1480 train_time:160549ms step_avg:156.48ms step:1037/1480 train_time:160717ms step_avg:156.49ms step:1038/1480 train_time:160885ms step_avg:156.50ms step:1039/1480 train_time:161056ms step_avg:156.52ms step:1040/1480 train_time:161223ms step_avg:156.53ms step:1041/1480 train_time:161389ms step_avg:156.54ms step:1042/1480 train_time:161552ms step_avg:156.54ms step:1043/1480 train_time:161718ms step_avg:156.55ms step:1044/1480 train_time:161883ms step_avg:156.56ms step:1045/1480 train_time:162050ms step_avg:156.57ms step:1046/1480 train_time:162219ms step_avg:156.58ms step:1047/1480 train_time:162385ms step_avg:156.59ms step:1048/1480 
train_time:162550ms step_avg:156.60ms step:1049/1480 train_time:162717ms step_avg:156.61ms step:1050/1480 train_time:162886ms step_avg:156.62ms step:1051/1480 train_time:163054ms step_avg:156.63ms step:1052/1480 train_time:163222ms step_avg:156.64ms step:1053/1480 train_time:163388ms step_avg:156.65ms step:1054/1480 train_time:163557ms step_avg:156.66ms step:1055/1480 train_time:163723ms step_avg:156.67ms step:1056/1480 train_time:163888ms step_avg:156.68ms step:1057/1480 train_time:164056ms step_avg:156.69ms step:1058/1480 train_time:164225ms step_avg:156.70ms step:1059/1480 train_time:164397ms step_avg:156.72ms step:1060/1480 train_time:164565ms step_avg:156.73ms step:1061/1480 train_time:164728ms step_avg:156.73ms step:1062/1480 train_time:164894ms step_avg:156.74ms step:1063/1480 train_time:165060ms step_avg:156.75ms step:1064/1480 train_time:165223ms step_avg:156.76ms step:1065/1480 train_time:165390ms step_avg:156.77ms step:1066/1480 train_time:165559ms step_avg:156.78ms step:1067/1480 train_time:165726ms step_avg:156.79ms step:1068/1480 train_time:165891ms step_avg:156.80ms step:1069/1480 train_time:166063ms step_avg:156.81ms step:1070/1480 train_time:166229ms step_avg:156.82ms step:1071/1480 train_time:166401ms step_avg:156.83ms step:1072/1480 train_time:166569ms step_avg:156.84ms step:1073/1480 train_time:166732ms step_avg:156.85ms step:1074/1480 train_time:166899ms step_avg:156.86ms step:1075/1480 train_time:167069ms step_avg:156.87ms step:1076/1480 train_time:167237ms step_avg:156.88ms step:1077/1480 train_time:167403ms step_avg:156.89ms step:1078/1480 train_time:167578ms step_avg:156.91ms step:1079/1480 train_time:167750ms step_avg:156.92ms step:1080/1480 train_time:167920ms step_avg:156.93ms step:1081/1480 train_time:168087ms step_avg:156.94ms step:1082/1480 train_time:168252ms step_avg:156.95ms step:1083/1480 train_time:168418ms step_avg:156.96ms step:1084/1480 train_time:168585ms step_avg:156.97ms step:1085/1480 train_time:168752ms step_avg:156.98ms step:1086/1480 train_time:168920ms step_avg:156.99ms step:1087/1480 train_time:169088ms step_avg:157.00ms step:1088/1480 train_time:169257ms step_avg:157.01ms step:1089/1480 train_time:169428ms step_avg:157.02ms step:1090/1480 train_time:169601ms step_avg:157.04ms step:1091/1480 train_time:169769ms step_avg:157.05ms step:1092/1480 train_time:169938ms step_avg:157.06ms step:1093/1480 train_time:170105ms step_avg:157.07ms step:1094/1480 train_time:170270ms step_avg:157.08ms step:1095/1480 train_time:170435ms step_avg:157.08ms step:1096/1480 train_time:170603ms step_avg:157.09ms step:1097/1480 train_time:170772ms step_avg:157.10ms step:1098/1480 train_time:170944ms step_avg:157.12ms step:1099/1480 train_time:171114ms step_avg:157.13ms step:1100/1480 train_time:171286ms step_avg:157.14ms step:1101/1480 train_time:171457ms step_avg:157.16ms step:1102/1480 train_time:171628ms step_avg:157.17ms step:1103/1480 train_time:171805ms step_avg:157.19ms step:1104/1480 train_time:171974ms step_avg:157.20ms step:1105/1480 train_time:172144ms step_avg:157.21ms step:1106/1480 train_time:172313ms step_avg:157.22ms step:1107/1480 train_time:172482ms step_avg:157.23ms step:1108/1480 train_time:172647ms step_avg:157.24ms step:1109/1480 train_time:172814ms step_avg:157.25ms step:1110/1480 train_time:172981ms step_avg:157.26ms step:1111/1480 train_time:173147ms step_avg:157.26ms step:1112/1480 train_time:173317ms step_avg:157.27ms step:1113/1480 train_time:173496ms step_avg:157.30ms step:1114/1480 train_time:173669ms step_avg:157.31ms step:1115/1480 
train_time:173842ms step_avg:157.32ms step:1116/1480 train_time:174011ms step_avg:157.33ms step:1117/1480 train_time:174184ms step_avg:157.35ms step:1118/1480 train_time:174358ms step_avg:157.36ms step:1119/1480 train_time:174524ms step_avg:157.37ms step:1120/1480 train_time:174693ms step_avg:157.38ms step:1121/1480 train_time:174864ms step_avg:157.39ms step:1122/1480 train_time:175029ms step_avg:157.40ms step:1123/1480 train_time:175196ms step_avg:157.41ms step:1124/1480 train_time:175365ms step_avg:157.42ms step:1125/1480 train_time:175533ms step_avg:157.43ms step:1125/1480 val_loss:3.3908 train_time:175601ms step_avg:157.49ms step:1126/1480 train_time:175705ms step_avg:157.44ms step:1127/1480 train_time:175876ms step_avg:157.45ms step:1128/1480 train_time:176047ms step_avg:157.47ms step:1129/1480 train_time:176220ms step_avg:157.48ms step:1130/1480 train_time:176389ms step_avg:157.49ms step:1131/1480 train_time:176565ms step_avg:157.51ms step:1132/1480 train_time:176731ms step_avg:157.51ms step:1133/1480 train_time:176902ms step_avg:157.53ms step:1134/1480 train_time:177073ms step_avg:157.54ms step:1135/1480 train_time:177241ms step_avg:157.55ms step:1136/1480 train_time:177411ms step_avg:157.56ms step:1137/1480 train_time:177580ms step_avg:157.57ms step:1138/1480 train_time:177751ms step_avg:157.58ms step:1139/1480 train_time:177918ms step_avg:157.59ms step:1140/1480 train_time:178086ms step_avg:157.60ms step:1141/1480 train_time:178256ms step_avg:157.61ms step:1142/1480 train_time:178422ms step_avg:157.62ms step:1143/1480 train_time:178594ms step_avg:157.63ms step:1144/1480 train_time:178762ms step_avg:157.64ms step:1145/1480 train_time:178928ms step_avg:157.65ms step:1146/1480 train_time:179099ms step_avg:157.66ms step:1147/1480 train_time:179270ms step_avg:157.67ms step:1148/1480 train_time:179438ms step_avg:157.68ms step:1149/1480 train_time:179609ms step_avg:157.69ms step:1150/1480 train_time:179777ms step_avg:157.70ms step:1151/1480 train_time:179949ms step_avg:157.71ms step:1152/1480 train_time:180120ms step_avg:157.72ms step:1153/1480 train_time:180293ms step_avg:157.74ms step:1154/1480 train_time:180459ms step_avg:157.74ms step:1155/1480 train_time:180631ms step_avg:157.76ms step:1156/1480 train_time:180813ms step_avg:157.78ms step:1157/1480 train_time:180983ms step_avg:157.79ms step:1158/1480 train_time:181151ms step_avg:157.80ms step:1159/1480 train_time:181317ms step_avg:157.80ms step:1160/1480 train_time:181483ms step_avg:157.81ms step:1161/1480 train_time:181654ms step_avg:157.82ms step:1162/1480 train_time:181823ms step_avg:157.83ms step:1163/1480 train_time:181993ms step_avg:157.84ms step:1164/1480 train_time:182162ms step_avg:157.85ms step:1165/1480 train_time:182328ms step_avg:157.86ms step:1166/1480 train_time:182496ms step_avg:157.87ms step:1167/1480 train_time:182664ms step_avg:157.88ms step:1168/1480 train_time:182832ms step_avg:157.89ms step:1169/1480 train_time:183001ms step_avg:157.90ms step:1170/1480 train_time:183169ms step_avg:157.90ms step:1171/1480 train_time:183336ms step_avg:157.91ms step:1172/1480 train_time:183503ms step_avg:157.92ms step:1173/1480 train_time:183674ms step_avg:157.93ms step:1174/1480 train_time:183857ms step_avg:157.95ms step:1175/1480 train_time:184029ms step_avg:157.96ms step:1176/1480 train_time:184202ms step_avg:157.98ms step:1177/1480 train_time:184379ms step_avg:157.99ms step:1178/1480 train_time:184546ms step_avg:158.00ms step:1179/1480 train_time:184713ms step_avg:158.01ms step:1180/1480 train_time:184893ms step_avg:158.03ms 
step:1181/1480 train_time:185062ms step_avg:158.04ms step:1182/1480 train_time:185231ms step_avg:158.05ms step:1183/1480 train_time:185402ms step_avg:158.06ms step:1184/1480 train_time:185570ms step_avg:158.07ms step:1185/1480 train_time:185742ms step_avg:158.08ms step:1186/1480 train_time:185914ms step_avg:158.09ms step:1187/1480 train_time:186097ms step_avg:158.11ms step:1188/1480 train_time:186264ms step_avg:158.12ms step:1189/1480 train_time:186436ms step_avg:158.13ms step:1190/1480 train_time:186604ms step_avg:158.14ms step:1191/1480 train_time:186774ms step_avg:158.15ms step:1192/1480 train_time:186939ms step_avg:158.16ms step:1193/1480 train_time:187108ms step_avg:158.16ms step:1194/1480 train_time:187277ms step_avg:158.17ms step:1195/1480 train_time:187450ms step_avg:158.19ms step:1196/1480 train_time:187634ms step_avg:158.21ms step:1197/1480 train_time:187805ms step_avg:158.22ms step:1198/1480 train_time:187985ms step_avg:158.24ms step:1199/1480 train_time:188156ms step_avg:158.25ms step:1200/1480 train_time:188326ms step_avg:158.26ms step:1201/1480 train_time:188493ms step_avg:158.26ms step:1202/1480 train_time:188675ms step_avg:158.28ms step:1203/1480 train_time:188852ms step_avg:158.30ms step:1204/1480 train_time:189026ms step_avg:158.31ms step:1205/1480 train_time:189195ms step_avg:158.32ms step:1206/1480 train_time:189363ms step_avg:158.33ms step:1207/1480 train_time:189534ms step_avg:158.34ms step:1208/1480 train_time:189701ms step_avg:158.35ms step:1209/1480 train_time:189874ms step_avg:158.36ms step:1210/1480 train_time:190049ms step_avg:158.37ms step:1211/1480 train_time:190224ms step_avg:158.39ms step:1212/1480 train_time:190396ms step_avg:158.40ms step:1213/1480 train_time:190569ms step_avg:158.41ms step:1214/1480 train_time:190746ms step_avg:158.43ms step:1215/1480 train_time:190919ms step_avg:158.44ms step:1216/1480 train_time:191090ms step_avg:158.45ms step:1217/1480 train_time:191265ms step_avg:158.46ms step:1218/1480 train_time:191435ms step_avg:158.47ms step:1219/1480 train_time:191614ms step_avg:158.49ms step:1220/1480 train_time:191783ms step_avg:158.50ms step:1221/1480 train_time:191952ms step_avg:158.51ms step:1222/1480 train_time:192118ms step_avg:158.51ms step:1223/1480 train_time:192287ms step_avg:158.52ms step:1224/1480 train_time:192462ms step_avg:158.54ms step:1225/1480 train_time:192632ms step_avg:158.55ms step:1226/1480 train_time:192807ms step_avg:158.56ms step:1227/1480 train_time:192980ms step_avg:158.57ms step:1228/1480 train_time:193150ms step_avg:158.58ms step:1229/1480 train_time:193322ms step_avg:158.59ms step:1230/1480 train_time:193502ms step_avg:158.61ms step:1231/1480 train_time:193677ms step_avg:158.62ms step:1232/1480 train_time:193852ms step_avg:158.64ms step:1233/1480 train_time:194021ms step_avg:158.64ms step:1234/1480 train_time:194193ms step_avg:158.65ms step:1235/1480 train_time:194369ms step_avg:158.67ms step:1236/1480 train_time:194538ms step_avg:158.68ms step:1237/1480 train_time:194709ms step_avg:158.69ms step:1238/1480 train_time:194893ms step_avg:158.71ms step:1239/1480 train_time:195064ms step_avg:158.72ms step:1240/1480 train_time:195234ms step_avg:158.73ms step:1241/1480 train_time:195407ms step_avg:158.74ms step:1242/1480 train_time:195577ms step_avg:158.75ms step:1243/1480 train_time:195751ms step_avg:158.76ms step:1244/1480 train_time:195917ms step_avg:158.77ms step:1245/1480 train_time:196085ms step_avg:158.77ms step:1246/1480 train_time:196254ms step_avg:158.78ms step:1247/1480 train_time:196423ms step_avg:158.79ms 
step:1248/1480 train_time:196593ms step_avg:158.80ms step:1249/1480 train_time:196762ms step_avg:158.81ms step:1250/1480 train_time:196931ms step_avg:158.82ms step:1250/1480 val_loss:3.3412 train_time:197003ms step_avg:158.87ms step:1251/1480 train_time:197112ms step_avg:158.83ms step:1252/1480 train_time:197281ms step_avg:158.84ms step:1253/1480 train_time:197450ms step_avg:158.85ms step:1254/1480 train_time:197621ms step_avg:158.86ms step:1255/1480 train_time:197810ms step_avg:158.88ms step:1256/1480 train_time:197985ms step_avg:158.90ms step:1257/1480 train_time:198156ms step_avg:158.91ms step:1258/1480 train_time:198330ms step_avg:158.92ms step:1259/1480 train_time:198502ms step_avg:158.93ms step:1260/1480 train_time:198670ms step_avg:158.94ms step:1261/1480 train_time:198841ms step_avg:158.95ms step:1262/1480 train_time:199018ms step_avg:158.96ms step:1263/1480 train_time:199193ms step_avg:158.97ms step:1264/1480 train_time:199358ms step_avg:158.98ms step:1265/1480 train_time:199525ms step_avg:158.98ms step:1266/1480 train_time:199696ms step_avg:158.99ms step:1267/1480 train_time:199867ms step_avg:159.00ms step:1268/1480 train_time:200039ms step_avg:159.01ms step:1269/1480 train_time:200214ms step_avg:159.03ms step:1270/1480 train_time:200384ms step_avg:159.04ms step:1271/1480 train_time:200555ms step_avg:159.04ms step:1272/1480 train_time:200720ms step_avg:159.05ms step:1273/1480 train_time:200891ms step_avg:159.06ms step:1274/1480 train_time:201065ms step_avg:159.07ms step:1275/1480 train_time:201234ms step_avg:159.08ms step:1276/1480 train_time:201400ms step_avg:159.08ms step:1277/1480 train_time:201572ms step_avg:159.09ms step:1278/1480 train_time:201739ms step_avg:159.10ms step:1279/1480 train_time:201913ms step_avg:159.11ms step:1280/1480 train_time:202091ms step_avg:159.13ms step:1281/1480 train_time:202259ms step_avg:159.13ms step:1282/1480 train_time:202425ms step_avg:159.14ms step:1283/1480 train_time:202596ms step_avg:159.15ms step:1284/1480 train_time:202766ms step_avg:159.16ms step:1285/1480 train_time:202935ms step_avg:159.16ms step:1286/1480 train_time:203104ms step_avg:159.17ms step:1287/1480 train_time:203273ms step_avg:159.18ms step:1288/1480 train_time:203445ms step_avg:159.19ms step:1289/1480 train_time:203630ms step_avg:159.21ms step:1290/1480 train_time:203811ms step_avg:159.23ms step:1291/1480 train_time:203984ms step_avg:159.24ms step:1292/1480 train_time:204160ms step_avg:159.25ms step:1293/1480 train_time:204334ms step_avg:159.26ms step:1294/1480 train_time:204505ms step_avg:159.27ms step:1295/1480 train_time:204675ms step_avg:159.28ms step:1296/1480 train_time:204850ms step_avg:159.29ms step:1297/1480 train_time:205021ms step_avg:159.30ms step:1298/1480 train_time:205193ms step_avg:159.31ms step:1299/1480 train_time:205364ms step_avg:159.32ms step:1300/1480 train_time:205532ms step_avg:159.33ms step:1301/1480 train_time:205700ms step_avg:159.33ms step:1302/1480 train_time:205874ms step_avg:159.35ms step:1303/1480 train_time:206052ms step_avg:159.36ms step:1304/1480 train_time:206226ms step_avg:159.37ms step:1305/1480 train_time:206395ms step_avg:159.38ms step:1306/1480 train_time:206571ms step_avg:159.39ms step:1307/1480 train_time:206740ms step_avg:159.40ms step:1308/1480 train_time:206909ms step_avg:159.41ms step:1309/1480 train_time:207081ms step_avg:159.42ms step:1310/1480 train_time:207250ms step_avg:159.42ms step:1311/1480 train_time:207419ms step_avg:159.43ms step:1312/1480 train_time:207592ms step_avg:159.44ms step:1313/1480 train_time:207761ms 
step_avg:159.45ms step:1314/1480 train_time:207934ms step_avg:159.46ms step:1315/1480 train_time:208105ms step_avg:159.47ms step:1316/1480 train_time:208272ms step_avg:159.47ms step:1317/1480 train_time:208443ms step_avg:159.48ms step:1318/1480 train_time:208623ms step_avg:159.50ms step:1319/1480 train_time:208798ms step_avg:159.51ms step:1320/1480 train_time:208975ms step_avg:159.52ms step:1321/1480 train_time:209145ms step_avg:159.53ms step:1322/1480 train_time:209328ms step_avg:159.55ms step:1323/1480 train_time:209497ms step_avg:159.56ms step:1324/1480 train_time:209672ms step_avg:159.57ms step:1325/1480 train_time:209852ms step_avg:159.58ms step:1326/1480 train_time:210028ms step_avg:159.60ms step:1327/1480 train_time:210198ms step_avg:159.60ms step:1328/1480 train_time:210370ms step_avg:159.61ms step:1329/1480 train_time:210565ms step_avg:159.64ms step:1330/1480 train_time:210744ms step_avg:159.65ms step:1331/1480 train_time:210913ms step_avg:159.66ms step:1332/1480 train_time:211087ms step_avg:159.67ms step:1333/1480 train_time:211263ms step_avg:159.68ms step:1334/1480 train_time:211435ms step_avg:159.69ms step:1335/1480 train_time:211604ms step_avg:159.70ms step:1336/1480 train_time:211788ms step_avg:159.72ms step:1337/1480 train_time:211963ms step_avg:159.73ms step:1338/1480 train_time:212134ms step_avg:159.74ms step:1339/1480 train_time:212309ms step_avg:159.75ms step:1340/1480 train_time:212480ms step_avg:159.76ms step:1341/1480 train_time:212649ms step_avg:159.77ms step:1342/1480 train_time:212822ms step_avg:159.78ms step:1343/1480 train_time:212993ms step_avg:159.78ms step:1344/1480 train_time:213164ms step_avg:159.79ms step:1345/1480 train_time:213342ms step_avg:159.81ms step:1346/1480 train_time:213513ms step_avg:159.81ms step:1347/1480 train_time:213681ms step_avg:159.82ms step:1348/1480 train_time:213851ms step_avg:159.83ms step:1349/1480 train_time:214021ms step_avg:159.84ms step:1350/1480 train_time:214196ms step_avg:159.85ms step:1351/1480 train_time:214367ms step_avg:159.86ms step:1352/1480 train_time:214539ms step_avg:159.86ms step:1353/1480 train_time:214716ms step_avg:159.88ms step:1354/1480 train_time:214889ms step_avg:159.89ms step:1355/1480 train_time:215056ms step_avg:159.89ms step:1356/1480 train_time:215230ms step_avg:159.90ms step:1357/1480 train_time:215404ms step_avg:159.91ms step:1358/1480 train_time:215577ms step_avg:159.92ms step:1359/1480 train_time:215750ms step_avg:159.93ms step:1360/1480 train_time:215924ms step_avg:159.94ms step:1361/1480 train_time:216101ms step_avg:159.96ms step:1362/1480 train_time:216276ms step_avg:159.97ms step:1363/1480 train_time:216457ms step_avg:159.98ms step:1364/1480 train_time:216628ms step_avg:159.99ms step:1365/1480 train_time:216794ms step_avg:160.00ms step:1366/1480 train_time:216967ms step_avg:160.01ms step:1367/1480 train_time:217138ms step_avg:160.01ms step:1368/1480 train_time:217312ms step_avg:160.02ms step:1369/1480 train_time:217495ms step_avg:160.04ms step:1370/1480 train_time:217673ms step_avg:160.05ms step:1371/1480 train_time:217844ms step_avg:160.06ms step:1372/1480 train_time:218024ms step_avg:160.08ms step:1373/1480 train_time:218194ms step_avg:160.08ms step:1374/1480 train_time:218370ms step_avg:160.10ms step:1375/1480 train_time:218540ms step_avg:160.10ms step:1375/1480 val_loss:3.3029 train_time:218608ms step_avg:160.15ms step:1376/1480 train_time:218711ms step_avg:160.11ms step:1377/1480 train_time:218884ms step_avg:160.12ms step:1378/1480 train_time:219051ms step_avg:160.13ms step:1379/1480 
train_time:219226ms step_avg:160.14ms step:1380/1480 train_time:219401ms step_avg:160.15ms step:1381/1480 train_time:219583ms step_avg:160.16ms step:1382/1480 train_time:219754ms step_avg:160.17ms step:1383/1480 train_time:219926ms step_avg:160.18ms step:1384/1480 train_time:220103ms step_avg:160.19ms step:1385/1480 train_time:220269ms step_avg:160.20ms step:1386/1480 train_time:220439ms step_avg:160.20ms step:1387/1480 train_time:220610ms step_avg:160.21ms step:1388/1480 train_time:220778ms step_avg:160.22ms step:1389/1480 train_time:220950ms step_avg:160.22ms step:1390/1480 train_time:221118ms step_avg:160.23ms step:1391/1480 train_time:221287ms step_avg:160.24ms step:1392/1480 train_time:221459ms step_avg:160.25ms step:1393/1480 train_time:221630ms step_avg:160.25ms step:1394/1480 train_time:221801ms step_avg:160.26ms step:1395/1480 train_time:221970ms step_avg:160.27ms step:1396/1480 train_time:222139ms step_avg:160.27ms step:1397/1480 train_time:222307ms step_avg:160.28ms step:1398/1480 train_time:222472ms step_avg:160.28ms step:1399/1480 train_time:222643ms step_avg:160.29ms step:1400/1480 train_time:222819ms step_avg:160.30ms step:1401/1480 train_time:222985ms step_avg:160.31ms step:1402/1480 train_time:223156ms step_avg:160.31ms step:1403/1480 train_time:223332ms step_avg:160.32ms step:1404/1480 train_time:223503ms step_avg:160.33ms step:1405/1480 train_time:223676ms step_avg:160.34ms step:1406/1480 train_time:223851ms step_avg:160.35ms step:1407/1480 train_time:224020ms step_avg:160.36ms step:1408/1480 train_time:224189ms step_avg:160.36ms step:1409/1480 train_time:224371ms step_avg:160.38ms step:1410/1480 train_time:224541ms step_avg:160.39ms step:1411/1480 train_time:224709ms step_avg:160.39ms step:1412/1480 train_time:224879ms step_avg:160.40ms step:1413/1480 train_time:225050ms step_avg:160.41ms step:1414/1480 train_time:225222ms step_avg:160.41ms step:1415/1480 train_time:225396ms step_avg:160.42ms step:1416/1480 train_time:225584ms step_avg:160.44ms step:1417/1480 train_time:225758ms step_avg:160.45ms step:1418/1480 train_time:225928ms step_avg:160.46ms step:1419/1480 train_time:226103ms step_avg:160.47ms step:1420/1480 train_time:226278ms step_avg:160.48ms step:1421/1480 train_time:226452ms step_avg:160.49ms step:1422/1480 train_time:226625ms step_avg:160.50ms step:1423/1480 train_time:226795ms step_avg:160.51ms step:1424/1480 train_time:226971ms step_avg:160.52ms step:1425/1480 train_time:227151ms step_avg:160.53ms step:1426/1480 train_time:227322ms step_avg:160.54ms step:1427/1480 train_time:227498ms step_avg:160.55ms step:1428/1480 train_time:227670ms step_avg:160.56ms step:1429/1480 train_time:227840ms step_avg:160.56ms step:1430/1480 train_time:228012ms step_avg:160.57ms step:1431/1480 train_time:228186ms step_avg:160.58ms step:1432/1480 train_time:228363ms step_avg:160.59ms step:1433/1480 train_time:228543ms step_avg:160.61ms step:1434/1480 train_time:228722ms step_avg:160.62ms step:1435/1480 train_time:228897ms step_avg:160.63ms step:1436/1480 train_time:229070ms step_avg:160.64ms step:1437/1480 train_time:229242ms step_avg:160.65ms step:1438/1480 train_time:229411ms step_avg:160.65ms step:1439/1480 train_time:229586ms step_avg:160.66ms step:1440/1480 train_time:229756ms step_avg:160.67ms step:1441/1480 train_time:229928ms step_avg:160.68ms step:1442/1480 train_time:230106ms step_avg:160.69ms step:1443/1480 train_time:230294ms step_avg:160.71ms step:1444/1480 train_time:230465ms step_avg:160.71ms step:1445/1480 train_time:230635ms step_avg:160.72ms step:1446/1480 
train_time:230811ms step_avg:160.73ms step:1447/1480 train_time:230989ms step_avg:160.74ms step:1448/1480 train_time:231160ms step_avg:160.75ms step:1449/1480 train_time:231333ms step_avg:160.76ms step:1450/1480 train_time:231507ms step_avg:160.77ms step:1451/1480 train_time:231679ms step_avg:160.78ms step:1452/1480 train_time:231852ms step_avg:160.79ms step:1453/1480 train_time:232022ms step_avg:160.79ms step:1454/1480 train_time:232194ms step_avg:160.80ms step:1455/1480 train_time:232371ms step_avg:160.81ms step:1456/1480 train_time:232545ms step_avg:160.82ms step:1457/1480 train_time:232717ms step_avg:160.83ms step:1458/1480 train_time:232888ms step_avg:160.83ms step:1459/1480 train_time:233065ms step_avg:160.85ms step:1460/1480 train_time:233236ms step_avg:160.85ms step:1461/1480 train_time:233412ms step_avg:160.86ms step:1462/1480 train_time:233583ms step_avg:160.87ms step:1463/1480 train_time:233760ms step_avg:160.88ms step:1464/1480 train_time:233935ms step_avg:160.89ms step:1465/1480 train_time:234108ms step_avg:160.90ms step:1466/1480 train_time:234279ms step_avg:160.91ms step:1467/1480 train_time:234453ms step_avg:160.91ms step:1468/1480 train_time:234623ms step_avg:160.92ms step:1469/1480 train_time:234796ms step_avg:160.93ms step:1470/1480 train_time:234976ms step_avg:160.94ms step:1471/1480 train_time:235164ms step_avg:160.96ms step:1472/1480 train_time:235346ms step_avg:160.98ms step:1473/1480 train_time:235517ms step_avg:160.98ms step:1474/1480 train_time:235694ms step_avg:160.99ms step:1475/1480 train_time:235872ms step_avg:161.00ms step:1476/1480 train_time:236045ms step_avg:161.01ms step:1477/1480 train_time:236227ms step_avg:161.03ms step:1478/1480 train_time:236410ms step_avg:161.04ms step:1479/1480 train_time:236585ms step_avg:161.05ms step:1480/1480 train_time:236758ms step_avg:161.06ms step:1480/1480 val_loss:3.2839 train_time:236829ms step_avg:161.11ms
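# -----------------------------------------------------------------------------
# A minimal sketch, assuming only the log format printed above
# ("step:N/1480 train_time:Tms step_avg:Ams" and, at eval steps,
# "step:N/1480 val_loss:L train_time:Tms step_avg:Ams"), for pulling the
# validation-loss milestones out of a saved log. The regex and the example
# file path are illustrative assumptions, not part of the training script.

import re

def parse_val_losses(log_text: str):
    """Return (step, val_loss, train_time_ms) tuples found in the log text."""
    pattern = re.compile(r"step:(\d+)/\d+ val_loss:([\d.]+) train_time:(\d+)ms")
    return [(int(s), float(l), int(t)) for s, l, t in pattern.findall(log_text)]

# Example usage (hypothetical path):
# with open("logs/run.txt") as f:
#     for step, loss, ms in parse_val_losses(f.read()):
#         print(f"step {step}: val_loss={loss} at {ms/1000:.1f}s")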