import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        # the stored weight keeps its own dtype; only the copy used for this matmul is cast
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        # dim/2 inverse frequencies: base ** (-i / dim) for i = 0, 2, 4, ...
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        """Rotate `x`; it is indexed as 4-D with the sequence on dim 1 and features on dim 3."""
        seq_len = x.shape[1]
        # rebuild the tables only when the sequence length changes
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # insert singleton dims at positions 0 and 2 so the tables broadcast against x
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # the two halves of the last dim are treated as the two coordinates of each rotated pair
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Arguments:
            x: Input whose first two dims are (batch, sequence); batch must be 1 (asserted).
            vi: Token value embedding; it is viewed as `v`, so it must hold as many elements as `v`.
            block_mask: FlexAttention block mask passed straight to `flex_attention`.

        Returns the attention output, viewed with the same shape as `x` and projected by `c_proj`.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # learned mix of this layer's values with the token value embedding
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head dim moved ahead of the sequence dim
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: expand to 4*dim, squared ReLU, project back to dim."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: learned mix with the initial embedding, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to [1, 0]: the layer input starts as x alone, with no contribution from x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    # Model shape; the values below are only the defaults.
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to +/- this value with tanh in GPT.forward

class GPT(nn.Module):
    """GPT with U-net style skip connections between the first and second half of its layers."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap
        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            # (one n_embd-wide slice per encoder layer; forward() splits it with chunk())
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss of the model on one token sequence.

        Arguments:
            idx: 1-D tensor of token ids (asserted); it is reshaped into rows of BLOCK_SIZE,
                so its length must be a multiple of 128.
            target: Target token ids; flattened and passed to `F.cross_entropy`.
            sliding_window: Tensor compared against `q_idx - kv_idx`, i.e. the attention
                window width in tokens.

        Returns the scalar loss.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token id 50256 seen so far, used as a per-token document id
        # NOTE(review): 50256 is presumably the GPT-2 end-of-text token — confirm against the tokenizer
        docs = (idx == 50256).cumsum(0)
        # document id of the first / last token of every BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: key is not after the query, has the same document id,
            # and lies within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the rule above: decide, per (query block, kv block) pair,
            # whether the pair is kept; the token-level mask_mod refines the kept blocks.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # the two blocks' document-id ranges [low, high] overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window width converted to blocks, rounded up
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            # per query block: how many kv blocks are kept ...
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # ... and their indices first (stable descending sort keeps them in ascending order)
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton dims before handing the tensors to BlockMask
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding slice per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)
        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() pairs decoder layer i with the most recent unused encoder output
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)
        x = norm(x)
        logits = self.lm_head(x)
        # soft-cap the logits to (-lm_head_softcap, lm_head_softcap)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a data shard and return the token count it declares."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the header into a pinned-memory tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the 256 x int32 header
        # read the file contents directly into the tensor's memory
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (input, target) token windows from a sorted list of shard files.

    Within a shard, rank r starts at offset r*T and every call to next_batch() moves it
    forward by T*num_processes tokens.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T
        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"
        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full global batch plus one token
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)
        self.reset()

    def reset(self):
        """Restart from the first shard."""
        # -1 so that advance() lands on shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # wraps around to the first shard after the last one
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return `(x, y)`: T int32 input tokens and the T int64 targets shifted one token ahead, on "cuda"."""
        batch_size = self.T * self.num_processes
        # T+1 tokens, so that inputs and targets can overlap with an offset of one
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
# each process drives the GPU selected by its local rank
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # The per-run checkpoint directory above is intentionally not created, but the log file
    # itself lives directly under logs/, so that directory must exist before it is opened.
    # Without this, a fresh checkout fails on rank 0 only, while the other ranks carry on.
    os.makedirs('logs', exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append `s` to the log file and, unless `logonly` is set, also print it to the console.

    Does nothing on processes other than the master process.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop below runs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches each following one after its backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 10:53:02 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 127W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 123W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 103W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 118W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 116W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23667ms step_avg:nanms step:2/1480 train_time:23794ms step_avg:nanms step:3/1480 train_time:23933ms step_avg:nanms step:4/1480 train_time:24073ms step_avg:nanms step:5/1480 train_time:24213ms step_avg:nanms step:6/1480 train_time:24354ms step_avg:nanms step:7/1480 train_time:24494ms step_avg:nanms step:8/1480 train_time:24638ms step_avg:nanms step:9/1480 train_time:24784ms step_avg:nanms step:10/1480 train_time:24929ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.57ms step:14/1480 train_time:565ms step_avg:141.32ms step:15/1480 train_time:707ms step_avg:141.46ms step:16/1480 train_time:851ms step_avg:141.90ms step:17/1480 train_time:995ms step_avg:142.19ms step:18/1480 train_time:1139ms step_avg:142.42ms step:19/1480 train_time:1283ms step_avg:142.50ms step:20/1480 train_time:1423ms step_avg:142.33ms step:21/1480 train_time:1565ms step_avg:142.26ms step:22/1480 train_time:1706ms step_avg:142.20ms step:23/1480 train_time:1848ms step_avg:142.18ms step:24/1480 train_time:1992ms step_avg:142.26ms step:25/1480 train_time:2135ms step_avg:142.35ms step:26/1480 train_time:2279ms step_avg:142.46ms step:27/1480 train_time:2421ms step_avg:142.42ms step:28/1480 train_time:2564ms step_avg:142.45ms step:29/1480 
train_time:2706ms step_avg:142.40ms step:30/1480 train_time:2850ms step_avg:142.49ms step:31/1480 train_time:2993ms step_avg:142.51ms step:32/1480 train_time:3137ms step_avg:142.61ms step:33/1480 train_time:3281ms step_avg:142.66ms step:34/1480 train_time:3423ms step_avg:142.62ms step:35/1480 train_time:3564ms step_avg:142.57ms step:36/1480 train_time:3706ms step_avg:142.54ms step:37/1480 train_time:3848ms step_avg:142.51ms step:38/1480 train_time:3990ms step_avg:142.49ms step:39/1480 train_time:4133ms step_avg:142.53ms step:40/1480 train_time:4279ms step_avg:142.62ms step:41/1480 train_time:4422ms step_avg:142.64ms step:42/1480 train_time:4564ms step_avg:142.63ms step:43/1480 train_time:4706ms step_avg:142.61ms step:44/1480 train_time:4847ms step_avg:142.57ms step:45/1480 train_time:4988ms step_avg:142.52ms step:46/1480 train_time:5131ms step_avg:142.52ms step:47/1480 train_time:5275ms step_avg:142.57ms step:48/1480 train_time:5418ms step_avg:142.58ms step:49/1480 train_time:5561ms step_avg:142.59ms step:50/1480 train_time:5702ms step_avg:142.56ms step:51/1480 train_time:5844ms step_avg:142.55ms step:52/1480 train_time:5986ms step_avg:142.53ms step:53/1480 train_time:6129ms step_avg:142.53ms step:54/1480 train_time:6273ms step_avg:142.57ms step:55/1480 train_time:6416ms step_avg:142.58ms step:56/1480 train_time:6558ms step_avg:142.58ms step:57/1480 train_time:6701ms step_avg:142.58ms step:58/1480 train_time:6844ms step_avg:142.58ms step:59/1480 train_time:6985ms step_avg:142.55ms step:60/1480 train_time:7126ms step_avg:142.52ms step:61/1480 train_time:7269ms step_avg:142.54ms step:62/1480 train_time:7414ms step_avg:142.57ms step:63/1480 train_time:7559ms step_avg:142.62ms step:64/1480 train_time:7702ms step_avg:142.63ms step:65/1480 train_time:7845ms step_avg:142.63ms step:66/1480 train_time:7987ms step_avg:142.62ms step:67/1480 train_time:8129ms step_avg:142.61ms step:68/1480 train_time:8271ms step_avg:142.60ms step:69/1480 train_time:8413ms step_avg:142.60ms 
step:70/1480 train_time:8557ms step_avg:142.61ms step:71/1480 train_time:8700ms step_avg:142.62ms step:72/1480 train_time:8843ms step_avg:142.62ms step:73/1480 train_time:8983ms step_avg:142.59ms step:74/1480 train_time:9124ms step_avg:142.56ms step:75/1480 train_time:9266ms step_avg:142.56ms step:76/1480 train_time:9408ms step_avg:142.54ms step:77/1480 train_time:9550ms step_avg:142.54ms step:78/1480 train_time:9691ms step_avg:142.51ms step:79/1480 train_time:9834ms step_avg:142.52ms step:80/1480 train_time:9978ms step_avg:142.54ms step:81/1480 train_time:10120ms step_avg:142.54ms step:82/1480 train_time:10263ms step_avg:142.54ms step:83/1480 train_time:10405ms step_avg:142.53ms step:84/1480 train_time:10547ms step_avg:142.53ms step:85/1480 train_time:10689ms step_avg:142.51ms step:86/1480 train_time:10830ms step_avg:142.50ms step:87/1480 train_time:10973ms step_avg:142.51ms step:88/1480 train_time:11117ms step_avg:142.52ms step:89/1480 train_time:11260ms step_avg:142.54ms step:90/1480 train_time:11403ms step_avg:142.54ms step:91/1480 train_time:11546ms step_avg:142.54ms step:92/1480 train_time:11687ms step_avg:142.52ms step:93/1480 train_time:11829ms step_avg:142.52ms step:94/1480 train_time:11974ms step_avg:142.54ms step:95/1480 train_time:12116ms step_avg:142.54ms step:96/1480 train_time:12260ms step_avg:142.55ms step:97/1480 train_time:12402ms step_avg:142.55ms step:98/1480 train_time:12544ms step_avg:142.55ms step:99/1480 train_time:12686ms step_avg:142.54ms step:100/1480 train_time:12828ms step_avg:142.53ms step:101/1480 train_time:12970ms step_avg:142.53ms step:102/1480 train_time:13112ms step_avg:142.53ms step:103/1480 train_time:13257ms step_avg:142.55ms step:104/1480 train_time:13401ms step_avg:142.56ms step:105/1480 train_time:13545ms step_avg:142.58ms step:106/1480 train_time:13686ms step_avg:142.56ms step:107/1480 train_time:13827ms step_avg:142.55ms step:108/1480 train_time:13969ms step_avg:142.54ms step:109/1480 train_time:14111ms step_avg:142.54ms 
step:110/1480 train_time:14255ms step_avg:142.55ms step:111/1480 train_time:14401ms step_avg:142.59ms step:112/1480 train_time:14549ms step_avg:142.63ms step:113/1480 train_time:14697ms step_avg:142.69ms step:114/1480 train_time:14844ms step_avg:142.73ms step:115/1480 train_time:14990ms step_avg:142.76ms step:116/1480 train_time:15138ms step_avg:142.81ms step:117/1480 train_time:15286ms step_avg:142.86ms step:118/1480 train_time:15433ms step_avg:142.90ms step:119/1480 train_time:15581ms step_avg:142.94ms step:120/1480 train_time:15728ms step_avg:142.98ms step:121/1480 train_time:15873ms step_avg:143.00ms step:122/1480 train_time:16020ms step_avg:143.04ms step:123/1480 train_time:16167ms step_avg:143.07ms step:124/1480 train_time:16313ms step_avg:143.10ms step:125/1480 train_time:16461ms step_avg:143.14ms step:125/1480 val_loss:4.4141 train_time:16518ms step_avg:143.64ms step:126/1480 train_time:16615ms step_avg:143.23ms step:127/1480 train_time:16761ms step_avg:143.26ms step:128/1480 train_time:16910ms step_avg:143.31ms step:129/1480 train_time:17056ms step_avg:143.33ms step:130/1480 train_time:17201ms step_avg:143.34ms step:131/1480 train_time:17347ms step_avg:143.37ms step:132/1480 train_time:17495ms step_avg:143.40ms step:133/1480 train_time:17642ms step_avg:143.43ms step:134/1480 train_time:17791ms step_avg:143.48ms step:135/1480 train_time:17938ms step_avg:143.51ms step:136/1480 train_time:18083ms step_avg:143.52ms step:137/1480 train_time:18229ms step_avg:143.54ms step:138/1480 train_time:18377ms step_avg:143.57ms step:139/1480 train_time:18523ms step_avg:143.59ms step:140/1480 train_time:18671ms step_avg:143.62ms step:141/1480 train_time:18818ms step_avg:143.65ms step:142/1480 train_time:18966ms step_avg:143.68ms step:143/1480 train_time:19114ms step_avg:143.71ms step:144/1480 train_time:19260ms step_avg:143.73ms step:145/1480 train_time:19406ms step_avg:143.75ms step:146/1480 train_time:19552ms step_avg:143.77ms step:147/1480 train_time:19699ms 
step_avg:143.79ms step:148/1480 train_time:19845ms step_avg:143.80ms step:149/1480 train_time:19995ms step_avg:143.85ms step:150/1480 train_time:20141ms step_avg:143.86ms step:151/1480 train_time:20287ms step_avg:143.88ms step:152/1480 train_time:20434ms step_avg:143.90ms step:153/1480 train_time:20580ms step_avg:143.92ms step:154/1480 train_time:20725ms step_avg:143.93ms step:155/1480 train_time:20874ms step_avg:143.96ms step:156/1480 train_time:21020ms step_avg:143.97ms step:157/1480 train_time:21167ms step_avg:144.00ms step:158/1480 train_time:21315ms step_avg:144.02ms step:159/1480 train_time:21461ms step_avg:144.03ms step:160/1480 train_time:21608ms step_avg:144.05ms step:161/1480 train_time:21755ms step_avg:144.07ms step:162/1480 train_time:21901ms step_avg:144.08ms step:163/1480 train_time:22046ms step_avg:144.09ms step:164/1480 train_time:22195ms step_avg:144.12ms step:165/1480 train_time:22341ms step_avg:144.13ms step:166/1480 train_time:22487ms step_avg:144.15ms step:167/1480 train_time:22634ms step_avg:144.17ms step:168/1480 train_time:22780ms step_avg:144.18ms step:169/1480 train_time:22927ms step_avg:144.20ms step:170/1480 train_time:23076ms step_avg:144.22ms step:171/1480 train_time:23221ms step_avg:144.23ms step:172/1480 train_time:23370ms step_avg:144.26ms step:173/1480 train_time:23518ms step_avg:144.28ms step:174/1480 train_time:23665ms step_avg:144.30ms step:175/1480 train_time:23814ms step_avg:144.32ms step:176/1480 train_time:23959ms step_avg:144.33ms step:177/1480 train_time:24106ms step_avg:144.34ms step:178/1480 train_time:24252ms step_avg:144.35ms step:179/1480 train_time:24399ms step_avg:144.37ms step:180/1480 train_time:24547ms step_avg:144.39ms step:181/1480 train_time:24695ms step_avg:144.42ms step:182/1480 train_time:24841ms step_avg:144.42ms step:183/1480 train_time:24987ms step_avg:144.44ms step:184/1480 train_time:25135ms step_avg:144.46ms step:185/1480 train_time:25281ms step_avg:144.46ms step:186/1480 train_time:25427ms 
step_avg:144.47ms step:187/1480 train_time:25577ms step_avg:144.50ms step:188/1480 train_time:25723ms step_avg:144.51ms step:189/1480 train_time:25873ms step_avg:144.54ms step:190/1480 train_time:26020ms step_avg:144.56ms step:191/1480 train_time:26166ms step_avg:144.56ms step:192/1480 train_time:26313ms step_avg:144.58ms step:193/1480 train_time:26459ms step_avg:144.58ms step:194/1480 train_time:26607ms step_avg:144.61ms step:195/1480 train_time:26754ms step_avg:144.62ms step:196/1480 train_time:26901ms step_avg:144.63ms step:197/1480 train_time:27049ms step_avg:144.64ms step:198/1480 train_time:27197ms step_avg:144.66ms step:199/1480 train_time:27343ms step_avg:144.67ms step:200/1480 train_time:27490ms step_avg:144.68ms step:201/1480 train_time:27638ms step_avg:144.70ms step:202/1480 train_time:27784ms step_avg:144.71ms step:203/1480 train_time:27931ms step_avg:144.72ms step:204/1480 train_time:28079ms step_avg:144.74ms step:205/1480 train_time:28226ms step_avg:144.75ms step:206/1480 train_time:28374ms step_avg:144.76ms step:207/1480 train_time:28520ms step_avg:144.77ms step:208/1480 train_time:28667ms step_avg:144.78ms step:209/1480 train_time:28816ms step_avg:144.80ms step:210/1480 train_time:28962ms step_avg:144.81ms step:211/1480 train_time:29109ms step_avg:144.82ms step:212/1480 train_time:29257ms step_avg:144.84ms step:213/1480 train_time:29405ms step_avg:144.85ms step:214/1480 train_time:29552ms step_avg:144.86ms step:215/1480 train_time:29699ms step_avg:144.87ms step:216/1480 train_time:29845ms step_avg:144.88ms step:217/1480 train_time:29993ms step_avg:144.89ms step:218/1480 train_time:30140ms step_avg:144.90ms step:219/1480 train_time:30287ms step_avg:144.91ms step:220/1480 train_time:30435ms step_avg:144.93ms step:221/1480 train_time:30583ms step_avg:144.94ms step:222/1480 train_time:30735ms step_avg:144.97ms step:223/1480 train_time:30885ms step_avg:145.00ms step:224/1480 train_time:31037ms step_avg:145.03ms step:225/1480 train_time:31186ms 
step_avg:145.05ms step:226/1480 train_time:31338ms step_avg:145.09ms step:227/1480 train_time:31489ms step_avg:145.11ms step:228/1480 train_time:31639ms step_avg:145.14ms step:229/1480 train_time:31792ms step_avg:145.17ms step:230/1480 train_time:31942ms step_avg:145.19ms step:231/1480 train_time:32093ms step_avg:145.22ms step:232/1480 train_time:32243ms step_avg:145.24ms step:233/1480 train_time:32394ms step_avg:145.27ms step:234/1480 train_time:32545ms step_avg:145.29ms step:235/1480 train_time:32697ms step_avg:145.32ms step:236/1480 train_time:32847ms step_avg:145.34ms step:237/1480 train_time:32998ms step_avg:145.37ms step:238/1480 train_time:33147ms step_avg:145.38ms step:239/1480 train_time:33299ms step_avg:145.41ms step:240/1480 train_time:33448ms step_avg:145.43ms step:241/1480 train_time:33598ms step_avg:145.45ms step:242/1480 train_time:33749ms step_avg:145.47ms step:243/1480 train_time:33900ms step_avg:145.49ms step:244/1480 train_time:34049ms step_avg:145.51ms step:245/1480 train_time:34201ms step_avg:145.53ms step:246/1480 train_time:34352ms step_avg:145.56ms step:247/1480 train_time:34502ms step_avg:145.58ms step:248/1480 train_time:34652ms step_avg:145.60ms step:249/1480 train_time:34804ms step_avg:145.62ms step:250/1480 train_time:34955ms step_avg:145.64ms step:250/1480 val_loss:3.9942 train_time:35013ms step_avg:145.89ms step:251/1480 train_time:35111ms step_avg:145.69ms step:252/1480 train_time:35262ms step_avg:145.71ms step:253/1480 train_time:35413ms step_avg:145.73ms step:254/1480 train_time:35562ms step_avg:145.74ms step:255/1480 train_time:35712ms step_avg:145.76ms step:256/1480 train_time:35860ms step_avg:145.77ms step:257/1480 train_time:36010ms step_avg:145.79ms step:258/1480 train_time:36164ms step_avg:145.82ms step:259/1480 train_time:36316ms step_avg:145.85ms step:260/1480 train_time:36466ms step_avg:145.86ms step:261/1480 train_time:36616ms step_avg:145.88ms step:262/1480 train_time:36765ms step_avg:145.89ms step:263/1480 
train_time:36915ms step_avg:145.91ms step:264/1480 train_time:37066ms step_avg:145.93ms step:265/1480 train_time:37218ms step_avg:145.95ms step:266/1480 train_time:37369ms step_avg:145.97ms step:267/1480 train_time:37519ms step_avg:145.99ms step:268/1480 train_time:37671ms step_avg:146.01ms step:269/1480 train_time:37820ms step_avg:146.02ms step:270/1480 train_time:37970ms step_avg:146.04ms step:271/1480 train_time:38120ms step_avg:146.06ms step:272/1480 train_time:38272ms step_avg:146.08ms step:273/1480 train_time:38421ms step_avg:146.09ms step:274/1480 train_time:38572ms step_avg:146.11ms step:275/1480 train_time:38724ms step_avg:146.13ms step:276/1480 train_time:38874ms step_avg:146.14ms step:277/1480 train_time:39027ms step_avg:146.17ms step:278/1480 train_time:39176ms step_avg:146.18ms step:279/1480 train_time:39327ms step_avg:146.20ms step:280/1480 train_time:39478ms step_avg:146.21ms step:281/1480 train_time:39630ms step_avg:146.24ms step:282/1480 train_time:39782ms step_avg:146.26ms step:283/1480 train_time:39932ms step_avg:146.27ms step:284/1480 train_time:40081ms step_avg:146.28ms step:285/1480 train_time:40232ms step_avg:146.30ms step:286/1480 train_time:40382ms step_avg:146.31ms step:287/1480 train_time:40533ms step_avg:146.33ms step:288/1480 train_time:40682ms step_avg:146.34ms step:289/1480 train_time:40833ms step_avg:146.36ms step:290/1480 train_time:40982ms step_avg:146.36ms step:291/1480 train_time:41133ms step_avg:146.38ms step:292/1480 train_time:41283ms step_avg:146.39ms step:293/1480 train_time:41432ms step_avg:146.40ms step:294/1480 train_time:41582ms step_avg:146.42ms step:295/1480 train_time:41733ms step_avg:146.43ms step:296/1480 train_time:41884ms step_avg:146.45ms step:297/1480 train_time:42035ms step_avg:146.46ms step:298/1480 train_time:42185ms step_avg:146.47ms step:299/1480 train_time:42335ms step_avg:146.49ms step:300/1480 train_time:42485ms step_avg:146.50ms step:301/1480 train_time:42635ms step_avg:146.51ms step:302/1480 
train_time:42786ms step_avg:146.53ms step:303/1480 train_time:42936ms step_avg:146.54ms step:304/1480 train_time:43088ms step_avg:146.56ms step:305/1480 train_time:43238ms step_avg:146.57ms step:306/1480 train_time:43389ms step_avg:146.58ms step:307/1480 train_time:43538ms step_avg:146.59ms step:308/1480 train_time:43690ms step_avg:146.61ms step:309/1480 train_time:43841ms step_avg:146.62ms step:310/1480 train_time:43991ms step_avg:146.64ms step:311/1480 train_time:44142ms step_avg:146.65ms step:312/1480 train_time:44293ms step_avg:146.66ms step:313/1480 train_time:44441ms step_avg:146.67ms step:314/1480 train_time:44592ms step_avg:146.68ms step:315/1480 train_time:44741ms step_avg:146.69ms step:316/1480 train_time:44891ms step_avg:146.70ms step:317/1480 train_time:45040ms step_avg:146.71ms step:318/1480 train_time:45192ms step_avg:146.73ms step:319/1480 train_time:45342ms step_avg:146.74ms step:320/1480 train_time:45494ms step_avg:146.75ms step:321/1480 train_time:45643ms step_avg:146.76ms step:322/1480 train_time:45794ms step_avg:146.78ms step:323/1480 train_time:45944ms step_avg:146.79ms step:324/1480 train_time:46095ms step_avg:146.80ms step:325/1480 train_time:46245ms step_avg:146.81ms step:326/1480 train_time:46395ms step_avg:146.82ms step:327/1480 train_time:46545ms step_avg:146.83ms step:328/1480 train_time:46694ms step_avg:146.84ms step:329/1480 train_time:46845ms step_avg:146.85ms step:330/1480 train_time:46997ms step_avg:146.87ms step:331/1480 train_time:47150ms step_avg:146.89ms step:332/1480 train_time:47306ms step_avg:146.91ms step:333/1480 train_time:47458ms step_avg:146.93ms step:334/1480 train_time:47614ms step_avg:146.96ms step:335/1480 train_time:47767ms step_avg:146.98ms step:336/1480 train_time:47921ms step_avg:147.00ms step:337/1480 train_time:48075ms step_avg:147.02ms step:338/1480 train_time:48230ms step_avg:147.04ms step:339/1480 train_time:48384ms step_avg:147.06ms step:340/1480 train_time:48537ms step_avg:147.08ms step:341/1480 
train_time:48691ms step_avg:147.10ms step:342/1480 train_time:48844ms step_avg:147.12ms step:343/1480 train_time:48998ms step_avg:147.14ms step:344/1480 train_time:49153ms step_avg:147.17ms step:345/1480 train_time:49308ms step_avg:147.19ms step:346/1480 train_time:49464ms step_avg:147.21ms step:347/1480 train_time:49617ms step_avg:147.23ms step:348/1480 train_time:49771ms step_avg:147.25ms step:349/1480 train_time:49925ms step_avg:147.27ms step:350/1480 train_time:50078ms step_avg:147.29ms step:351/1480 train_time:50233ms step_avg:147.31ms step:352/1480 train_time:50387ms step_avg:147.33ms step:353/1480 train_time:50541ms step_avg:147.35ms step:354/1480 train_time:50694ms step_avg:147.37ms step:355/1480 train_time:50848ms step_avg:147.39ms step:356/1480 train_time:51003ms step_avg:147.41ms step:357/1480 train_time:51156ms step_avg:147.42ms step:358/1480 train_time:51310ms step_avg:147.44ms step:359/1480 train_time:51465ms step_avg:147.46ms step:360/1480 train_time:51619ms step_avg:147.48ms step:361/1480 train_time:51772ms step_avg:147.50ms step:362/1480 train_time:51928ms step_avg:147.52ms step:363/1480 train_time:52083ms step_avg:147.54ms step:364/1480 train_time:52236ms step_avg:147.56ms step:365/1480 train_time:52389ms step_avg:147.57ms step:366/1480 train_time:52543ms step_avg:147.59ms step:367/1480 train_time:52697ms step_avg:147.61ms step:368/1480 train_time:52850ms step_avg:147.63ms step:369/1480 train_time:53003ms step_avg:147.64ms step:370/1480 train_time:53156ms step_avg:147.66ms step:371/1480 train_time:53311ms step_avg:147.68ms step:372/1480 train_time:53465ms step_avg:147.69ms step:373/1480 train_time:53619ms step_avg:147.71ms step:374/1480 train_time:53773ms step_avg:147.73ms step:375/1480 train_time:53929ms step_avg:147.75ms step:375/1480 val_loss:3.8104 train_time:53990ms step_avg:147.92ms step:376/1480 train_time:54088ms step_avg:147.78ms step:377/1480 train_time:54244ms step_avg:147.80ms step:378/1480 train_time:54397ms step_avg:147.82ms 
step:379/1480 train_time:54550ms step_avg:147.83ms step:380/1480 train_time:54702ms step_avg:147.84ms step:381/1480 train_time:54854ms step_avg:147.85ms step:382/1480 train_time:55010ms step_avg:147.88ms step:383/1480 train_time:55166ms step_avg:147.90ms step:384/1480 train_time:55320ms step_avg:147.92ms step:385/1480 train_time:55473ms step_avg:147.93ms step:386/1480 train_time:55626ms step_avg:147.94ms step:387/1480 train_time:55779ms step_avg:147.96ms step:388/1480 train_time:55933ms step_avg:147.97ms step:389/1480 train_time:56088ms step_avg:147.99ms step:390/1480 train_time:56243ms step_avg:148.01ms step:391/1480 train_time:56397ms step_avg:148.02ms step:392/1480 train_time:56550ms step_avg:148.04ms step:393/1480 train_time:56705ms step_avg:148.05ms step:394/1480 train_time:56858ms step_avg:148.07ms step:395/1480 train_time:57012ms step_avg:148.08ms step:396/1480 train_time:57165ms step_avg:148.10ms step:397/1480 train_time:57319ms step_avg:148.11ms step:398/1480 train_time:57473ms step_avg:148.13ms step:399/1480 train_time:57627ms step_avg:148.14ms step:400/1480 train_time:57781ms step_avg:148.16ms step:401/1480 train_time:57934ms step_avg:148.17ms step:402/1480 train_time:58089ms step_avg:148.19ms step:403/1480 train_time:58244ms step_avg:148.20ms step:404/1480 train_time:58398ms step_avg:148.22ms step:405/1480 train_time:58551ms step_avg:148.23ms step:406/1480 train_time:58706ms step_avg:148.25ms step:407/1480 train_time:58859ms step_avg:148.26ms step:408/1480 train_time:59013ms step_avg:148.27ms step:409/1480 train_time:59167ms step_avg:148.29ms step:410/1480 train_time:59321ms step_avg:148.30ms step:411/1480 train_time:59474ms step_avg:148.31ms step:412/1480 train_time:59628ms step_avg:148.33ms step:413/1480 train_time:59782ms step_avg:148.34ms step:414/1480 train_time:59935ms step_avg:148.35ms step:415/1480 train_time:60090ms step_avg:148.37ms step:416/1480 train_time:60242ms step_avg:148.38ms step:417/1480 train_time:60395ms step_avg:148.39ms 
step:418/1480 train_time:60549ms step_avg:148.40ms step:419/1480 train_time:60702ms step_avg:148.42ms step:420/1480 train_time:60855ms step_avg:148.43ms step:421/1480 train_time:61008ms step_avg:148.44ms step:422/1480 train_time:61161ms step_avg:148.45ms step:423/1480 train_time:61314ms step_avg:148.46ms step:424/1480 train_time:61468ms step_avg:148.47ms step:425/1480 train_time:61621ms step_avg:148.48ms step:426/1480 train_time:61775ms step_avg:148.50ms step:427/1480 train_time:61929ms step_avg:148.51ms step:428/1480 train_time:62082ms step_avg:148.52ms step:429/1480 train_time:62235ms step_avg:148.53ms step:430/1480 train_time:62390ms step_avg:148.55ms step:431/1480 train_time:62545ms step_avg:148.56ms step:432/1480 train_time:62697ms step_avg:148.57ms step:433/1480 train_time:62850ms step_avg:148.58ms step:434/1480 train_time:63004ms step_avg:148.59ms step:435/1480 train_time:63158ms step_avg:148.61ms step:436/1480 train_time:63312ms step_avg:148.62ms step:437/1480 train_time:63465ms step_avg:148.63ms step:438/1480 train_time:63617ms step_avg:148.64ms step:439/1480 train_time:63771ms step_avg:148.65ms step:440/1480 train_time:63927ms step_avg:148.67ms step:441/1480 train_time:64083ms step_avg:148.68ms step:442/1480 train_time:64240ms step_avg:148.70ms step:443/1480 train_time:64395ms step_avg:148.72ms step:444/1480 train_time:64552ms step_avg:148.74ms step:445/1480 train_time:64708ms step_avg:148.75ms step:446/1480 train_time:64864ms step_avg:148.77ms step:447/1480 train_time:65019ms step_avg:148.78ms step:448/1480 train_time:65175ms step_avg:148.80ms step:449/1480 train_time:65333ms step_avg:148.82ms step:450/1480 train_time:65491ms step_avg:148.84ms step:451/1480 train_time:65649ms step_avg:148.86ms step:452/1480 train_time:65806ms step_avg:148.88ms step:453/1480 train_time:65962ms step_avg:148.90ms step:454/1480 train_time:66117ms step_avg:148.91ms step:455/1480 train_time:66273ms step_avg:148.93ms step:456/1480 train_time:66430ms step_avg:148.95ms 
step:457/1480 train_time:66587ms step_avg:148.96ms step:458/1480 train_time:66745ms step_avg:148.98ms step:459/1480 train_time:66902ms step_avg:149.00ms step:460/1480 train_time:67056ms step_avg:149.01ms step:461/1480 train_time:67214ms step_avg:149.03ms step:462/1480 train_time:67371ms step_avg:149.05ms step:463/1480 train_time:67529ms step_avg:149.07ms step:464/1480 train_time:67686ms step_avg:149.09ms step:465/1480 train_time:67843ms step_avg:149.11ms step:466/1480 train_time:68000ms step_avg:149.12ms step:467/1480 train_time:68157ms step_avg:149.14ms step:468/1480 train_time:68313ms step_avg:149.15ms step:469/1480 train_time:68469ms step_avg:149.17ms step:470/1480 train_time:68628ms step_avg:149.19ms step:471/1480 train_time:68785ms step_avg:149.21ms step:472/1480 train_time:68942ms step_avg:149.23ms step:473/1480 train_time:69098ms step_avg:149.24ms step:474/1480 train_time:69255ms step_avg:149.26ms step:475/1480 train_time:69411ms step_avg:149.27ms step:476/1480 train_time:69568ms step_avg:149.29ms step:477/1480 train_time:69727ms step_avg:149.31ms step:478/1480 train_time:69885ms step_avg:149.33ms step:479/1480 train_time:70041ms step_avg:149.34ms step:480/1480 train_time:70200ms step_avg:149.36ms step:481/1480 train_time:70355ms step_avg:149.37ms step:482/1480 train_time:70512ms step_avg:149.39ms step:483/1480 train_time:70668ms step_avg:149.40ms step:484/1480 train_time:70827ms step_avg:149.42ms step:485/1480 train_time:70985ms step_avg:149.44ms step:486/1480 train_time:71142ms step_avg:149.46ms step:487/1480 train_time:71298ms step_avg:149.47ms step:488/1480 train_time:71453ms step_avg:149.48ms step:489/1480 train_time:71610ms step_avg:149.50ms step:490/1480 train_time:71767ms step_avg:149.51ms step:491/1480 train_time:71925ms step_avg:149.53ms step:492/1480 train_time:72083ms step_avg:149.55ms step:493/1480 train_time:72241ms step_avg:149.57ms step:494/1480 train_time:72397ms step_avg:149.58ms step:495/1480 train_time:72553ms step_avg:149.59ms 
step:496/1480 train_time:72709ms step_avg:149.61ms step:497/1480 train_time:72866ms step_avg:149.62ms step:498/1480 train_time:73024ms step_avg:149.64ms step:499/1480 train_time:73182ms step_avg:149.66ms step:500/1480 train_time:73339ms step_avg:149.67ms step:500/1480 val_loss:3.6865 train_time:73400ms step_avg:149.80ms step:501/1480 train_time:73498ms step_avg:149.69ms step:502/1480 train_time:73657ms step_avg:149.71ms step:503/1480 train_time:73813ms step_avg:149.72ms step:504/1480 train_time:73969ms step_avg:149.74ms step:505/1480 train_time:74124ms step_avg:149.75ms step:506/1480 train_time:74280ms step_avg:149.76ms step:507/1480 train_time:74437ms step_avg:149.77ms step:508/1480 train_time:74595ms step_avg:149.79ms step:509/1480 train_time:74752ms step_avg:149.80ms step:510/1480 train_time:74910ms step_avg:149.82ms step:511/1480 train_time:75067ms step_avg:149.83ms step:512/1480 train_time:75223ms step_avg:149.85ms step:513/1480 train_time:75379ms step_avg:149.86ms step:514/1480 train_time:75537ms step_avg:149.88ms step:515/1480 train_time:75694ms step_avg:149.89ms step:516/1480 train_time:75852ms step_avg:149.91ms step:517/1480 train_time:76010ms step_avg:149.92ms step:518/1480 train_time:76166ms step_avg:149.93ms step:519/1480 train_time:76319ms step_avg:149.94ms step:520/1480 train_time:76476ms step_avg:149.95ms step:521/1480 train_time:76633ms step_avg:149.97ms step:522/1480 train_time:76792ms step_avg:149.98ms step:523/1480 train_time:76950ms step_avg:150.00ms step:524/1480 train_time:77108ms step_avg:150.02ms step:525/1480 train_time:77263ms step_avg:150.03ms step:526/1480 train_time:77419ms step_avg:150.04ms step:527/1480 train_time:77575ms step_avg:150.05ms step:528/1480 train_time:77731ms step_avg:150.06ms step:529/1480 train_time:77890ms step_avg:150.08ms step:530/1480 train_time:78048ms step_avg:150.09ms step:531/1480 train_time:78205ms step_avg:150.11ms step:532/1480 train_time:78361ms step_avg:150.12ms step:533/1480 train_time:78518ms 
step_avg:150.13ms step:534/1480 train_time:78674ms step_avg:150.14ms step:535/1480 train_time:78830ms step_avg:150.15ms step:536/1480 train_time:78988ms step_avg:150.17ms step:537/1480 train_time:79144ms step_avg:150.18ms step:538/1480 train_time:79301ms step_avg:150.19ms step:539/1480 train_time:79460ms step_avg:150.21ms step:540/1480 train_time:79617ms step_avg:150.22ms step:541/1480 train_time:79774ms step_avg:150.23ms step:542/1480 train_time:79931ms step_avg:150.25ms step:543/1480 train_time:80088ms step_avg:150.26ms step:544/1480 train_time:80245ms step_avg:150.27ms step:545/1480 train_time:80400ms step_avg:150.28ms step:546/1480 train_time:80557ms step_avg:150.29ms step:547/1480 train_time:80714ms step_avg:150.31ms step:548/1480 train_time:80873ms step_avg:150.32ms step:549/1480 train_time:81031ms step_avg:150.34ms step:550/1480 train_time:81189ms step_avg:150.35ms step:551/1480 train_time:81349ms step_avg:150.37ms step:552/1480 train_time:81509ms step_avg:150.38ms step:553/1480 train_time:81669ms step_avg:150.40ms step:554/1480 train_time:81828ms step_avg:150.42ms step:555/1480 train_time:81988ms step_avg:150.44ms step:556/1480 train_time:82147ms step_avg:150.45ms step:557/1480 train_time:82307ms step_avg:150.47ms step:558/1480 train_time:82467ms step_avg:150.49ms step:559/1480 train_time:82624ms step_avg:150.50ms step:560/1480 train_time:82784ms step_avg:150.52ms step:561/1480 train_time:82941ms step_avg:150.53ms step:562/1480 train_time:83100ms step_avg:150.54ms step:563/1480 train_time:83256ms step_avg:150.55ms step:564/1480 train_time:83417ms step_avg:150.57ms step:565/1480 train_time:83576ms step_avg:150.59ms step:566/1480 train_time:83736ms step_avg:150.60ms step:567/1480 train_time:83894ms step_avg:150.62ms step:568/1480 train_time:84054ms step_avg:150.63ms step:569/1480 train_time:84213ms step_avg:150.65ms step:570/1480 train_time:84374ms step_avg:150.67ms step:571/1480 train_time:84533ms step_avg:150.68ms step:572/1480 train_time:84693ms 
step_avg:150.70ms step:573/1480 train_time:84853ms step_avg:150.72ms step:574/1480 train_time:85015ms step_avg:150.74ms step:575/1480 train_time:85175ms step_avg:150.75ms step:576/1480 train_time:85333ms step_avg:150.77ms step:577/1480 train_time:85494ms step_avg:150.78ms step:578/1480 train_time:85653ms step_avg:150.80ms step:579/1480 train_time:85815ms step_avg:150.82ms step:580/1480 train_time:85975ms step_avg:150.83ms step:581/1480 train_time:86134ms step_avg:150.85ms step:582/1480 train_time:86294ms step_avg:150.86ms step:583/1480 train_time:86453ms step_avg:150.88ms step:584/1480 train_time:86614ms step_avg:150.89ms step:585/1480 train_time:86773ms step_avg:150.91ms step:586/1480 train_time:86932ms step_avg:150.92ms step:587/1480 train_time:87093ms step_avg:150.94ms step:588/1480 train_time:87252ms step_avg:150.95ms step:589/1480 train_time:87413ms step_avg:150.97ms step:590/1480 train_time:87574ms step_avg:150.99ms step:591/1480 train_time:87732ms step_avg:151.00ms step:592/1480 train_time:87893ms step_avg:151.02ms step:593/1480 train_time:88054ms step_avg:151.04ms step:594/1480 train_time:88215ms step_avg:151.05ms step:595/1480 train_time:88376ms step_avg:151.07ms step:596/1480 train_time:88537ms step_avg:151.09ms step:597/1480 train_time:88696ms step_avg:151.10ms step:598/1480 train_time:88854ms step_avg:151.11ms step:599/1480 train_time:89013ms step_avg:151.13ms step:600/1480 train_time:89174ms step_avg:151.14ms step:601/1480 train_time:89334ms step_avg:151.16ms step:602/1480 train_time:89494ms step_avg:151.17ms step:603/1480 train_time:89653ms step_avg:151.19ms step:604/1480 train_time:89813ms step_avg:151.20ms step:605/1480 train_time:89972ms step_avg:151.21ms step:606/1480 train_time:90134ms step_avg:151.23ms step:607/1480 train_time:90297ms step_avg:151.25ms step:608/1480 train_time:90456ms step_avg:151.26ms step:609/1480 train_time:90616ms step_avg:151.28ms step:610/1480 train_time:90774ms step_avg:151.29ms step:611/1480 train_time:90934ms 
step_avg:151.30ms step:612/1480 train_time:91094ms step_avg:151.32ms step:613/1480 train_time:91254ms step_avg:151.33ms step:614/1480 train_time:91415ms step_avg:151.35ms step:615/1480 train_time:91574ms step_avg:151.36ms step:616/1480 train_time:91732ms step_avg:151.37ms step:617/1480 train_time:91892ms step_avg:151.39ms step:618/1480 train_time:92051ms step_avg:151.40ms step:619/1480 train_time:92211ms step_avg:151.41ms step:620/1480 train_time:92371ms step_avg:151.43ms step:621/1480 train_time:92531ms step_avg:151.44ms step:622/1480 train_time:92692ms step_avg:151.46ms step:623/1480 train_time:92852ms step_avg:151.47ms step:624/1480 train_time:93012ms step_avg:151.49ms step:625/1480 train_time:93172ms step_avg:151.50ms step:625/1480 val_loss:3.6050 train_time:93236ms step_avg:151.60ms step:626/1480 train_time:93335ms step_avg:151.52ms step:627/1480 train_time:93495ms step_avg:151.53ms step:628/1480 train_time:93654ms step_avg:151.54ms step:629/1480 train_time:93813ms step_avg:151.56ms step:630/1480 train_time:93971ms step_avg:151.57ms step:631/1480 train_time:94128ms step_avg:151.58ms step:632/1480 train_time:94289ms step_avg:151.59ms step:633/1480 train_time:94449ms step_avg:151.60ms step:634/1480 train_time:94610ms step_avg:151.62ms step:635/1480 train_time:94770ms step_avg:151.63ms step:636/1480 train_time:94929ms step_avg:151.64ms step:637/1480 train_time:95090ms step_avg:151.66ms step:638/1480 train_time:95249ms step_avg:151.67ms step:639/1480 train_time:95408ms step_avg:151.68ms step:640/1480 train_time:95569ms step_avg:151.70ms step:641/1480 train_time:95729ms step_avg:151.71ms step:642/1480 train_time:95890ms step_avg:151.72ms step:643/1480 train_time:96050ms step_avg:151.74ms step:644/1480 train_time:96208ms step_avg:151.75ms step:645/1480 train_time:96369ms step_avg:151.76ms step:646/1480 train_time:96528ms step_avg:151.77ms step:647/1480 train_time:96688ms step_avg:151.79ms step:648/1480 train_time:96849ms step_avg:151.80ms step:649/1480 
train_time:97008ms step_avg:151.81ms step:650/1480 train_time:97168ms step_avg:151.83ms step:651/1480 train_time:97328ms step_avg:151.84ms step:652/1480 train_time:97488ms step_avg:151.85ms step:653/1480 train_time:97648ms step_avg:151.86ms step:654/1480 train_time:97808ms step_avg:151.88ms step:655/1480 train_time:97968ms step_avg:151.89ms step:656/1480 train_time:98128ms step_avg:151.90ms step:657/1480 train_time:98289ms step_avg:151.92ms step:658/1480 train_time:98449ms step_avg:151.93ms step:659/1480 train_time:98612ms step_avg:151.94ms step:660/1480 train_time:98774ms step_avg:151.96ms step:661/1480 train_time:98934ms step_avg:151.97ms step:662/1480 train_time:99094ms step_avg:151.98ms step:663/1480 train_time:99253ms step_avg:152.00ms step:664/1480 train_time:99415ms step_avg:152.01ms step:665/1480 train_time:99578ms step_avg:152.03ms step:666/1480 train_time:99738ms step_avg:152.04ms step:667/1480 train_time:99899ms step_avg:152.05ms step:668/1480 train_time:100062ms step_avg:152.07ms step:669/1480 train_time:100224ms step_avg:152.09ms step:670/1480 train_time:100385ms step_avg:152.10ms step:671/1480 train_time:100546ms step_avg:152.11ms step:672/1480 train_time:100708ms step_avg:152.13ms step:673/1480 train_time:100871ms step_avg:152.14ms step:674/1480 train_time:101032ms step_avg:152.16ms step:675/1480 train_time:101194ms step_avg:152.17ms step:676/1480 train_time:101355ms step_avg:152.18ms step:677/1480 train_time:101516ms step_avg:152.20ms step:678/1480 train_time:101675ms step_avg:152.21ms step:679/1480 train_time:101836ms step_avg:152.22ms step:680/1480 train_time:101999ms step_avg:152.24ms step:681/1480 train_time:102160ms step_avg:152.25ms step:682/1480 train_time:102321ms step_avg:152.26ms step:683/1480 train_time:102482ms step_avg:152.28ms step:684/1480 train_time:102644ms step_avg:152.29ms step:685/1480 train_time:102807ms step_avg:152.31ms step:686/1480 train_time:102971ms step_avg:152.32ms step:687/1480 train_time:103131ms step_avg:152.34ms 
step:688/1480 train_time:103294ms step_avg:152.35ms step:689/1480 train_time:103457ms step_avg:152.37ms step:690/1480 train_time:103618ms step_avg:152.38ms step:691/1480 train_time:103778ms step_avg:152.39ms step:692/1480 train_time:103937ms step_avg:152.40ms step:693/1480 train_time:104097ms step_avg:152.41ms step:694/1480 train_time:104257ms step_avg:152.42ms step:695/1480 train_time:104417ms step_avg:152.43ms step:696/1480 train_time:104577ms step_avg:152.44ms step:697/1480 train_time:104740ms step_avg:152.46ms step:698/1480 train_time:104902ms step_avg:152.47ms step:699/1480 train_time:105067ms step_avg:152.49ms step:700/1480 train_time:105230ms step_avg:152.51ms step:701/1480 train_time:105390ms step_avg:152.52ms step:702/1480 train_time:105552ms step_avg:152.53ms step:703/1480 train_time:105712ms step_avg:152.54ms step:704/1480 train_time:105873ms step_avg:152.55ms step:705/1480 train_time:106037ms step_avg:152.57ms step:706/1480 train_time:106199ms step_avg:152.58ms step:707/1480 train_time:106360ms step_avg:152.60ms step:708/1480 train_time:106520ms step_avg:152.61ms step:709/1480 train_time:106684ms step_avg:152.62ms step:710/1480 train_time:106844ms step_avg:152.63ms step:711/1480 train_time:107008ms step_avg:152.65ms step:712/1480 train_time:107173ms step_avg:152.67ms step:713/1480 train_time:107336ms step_avg:152.68ms step:714/1480 train_time:107496ms step_avg:152.69ms step:715/1480 train_time:107655ms step_avg:152.70ms step:716/1480 train_time:107814ms step_avg:152.71ms step:717/1480 train_time:107977ms step_avg:152.73ms step:718/1480 train_time:108137ms step_avg:152.74ms step:719/1480 train_time:108297ms step_avg:152.75ms step:720/1480 train_time:108462ms step_avg:152.76ms step:721/1480 train_time:108624ms step_avg:152.78ms step:722/1480 train_time:108788ms step_avg:152.79ms step:723/1480 train_time:108950ms step_avg:152.80ms step:724/1480 train_time:109112ms step_avg:152.82ms step:725/1480 train_time:109275ms step_avg:152.83ms step:726/1480 
train_time:109437ms step_avg:152.84ms step:727/1480 train_time:109599ms step_avg:152.86ms step:728/1480 train_time:109759ms step_avg:152.87ms step:729/1480 train_time:109920ms step_avg:152.88ms step:730/1480 train_time:110084ms step_avg:152.89ms step:731/1480 train_time:110245ms step_avg:152.91ms step:732/1480 train_time:110405ms step_avg:152.92ms step:733/1480 train_time:110567ms step_avg:152.93ms step:734/1480 train_time:110730ms step_avg:152.94ms step:735/1480 train_time:110892ms step_avg:152.95ms step:736/1480 train_time:111054ms step_avg:152.97ms step:737/1480 train_time:111214ms step_avg:152.98ms step:738/1480 train_time:111373ms step_avg:152.99ms step:739/1480 train_time:111532ms step_avg:152.99ms step:740/1480 train_time:111697ms step_avg:153.01ms step:741/1480 train_time:111860ms step_avg:153.02ms step:742/1480 train_time:112022ms step_avg:153.04ms step:743/1480 train_time:112185ms step_avg:153.05ms step:744/1480 train_time:112349ms step_avg:153.06ms step:745/1480 train_time:112513ms step_avg:153.08ms step:746/1480 train_time:112674ms step_avg:153.09ms step:747/1480 train_time:112836ms step_avg:153.10ms step:748/1480 train_time:112999ms step_avg:153.12ms step:749/1480 train_time:113163ms step_avg:153.13ms step:750/1480 train_time:113323ms step_avg:153.14ms step:750/1480 val_loss:3.5505 train_time:113388ms step_avg:153.23ms step:751/1480 train_time:113488ms step_avg:153.16ms step:752/1480 train_time:113650ms step_avg:153.17ms step:753/1480 train_time:113812ms step_avg:153.18ms step:754/1480 train_time:113972ms step_avg:153.19ms step:755/1480 train_time:114134ms step_avg:153.20ms step:756/1480 train_time:114295ms step_avg:153.21ms step:757/1480 train_time:114459ms step_avg:153.22ms step:758/1480 train_time:114620ms step_avg:153.24ms step:759/1480 train_time:114782ms step_avg:153.25ms step:760/1480 train_time:114945ms step_avg:153.26ms step:761/1480 train_time:115109ms step_avg:153.27ms step:762/1480 train_time:115270ms step_avg:153.28ms step:763/1480 
train_time:115431ms step_avg:153.30ms step:764/1480 train_time:115592ms step_avg:153.31ms step:765/1480 train_time:115753ms step_avg:153.32ms step:766/1480 train_time:115916ms step_avg:153.33ms step:767/1480 train_time:116077ms step_avg:153.34ms step:768/1480 train_time:116241ms step_avg:153.35ms step:769/1480 train_time:116405ms step_avg:153.37ms step:770/1480 train_time:116568ms step_avg:153.38ms step:771/1480 train_time:116732ms step_avg:153.39ms step:772/1480 train_time:116894ms step_avg:153.40ms step:773/1480 train_time:117054ms step_avg:153.41ms step:774/1480 train_time:117217ms step_avg:153.43ms step:775/1480 train_time:117380ms step_avg:153.44ms step:776/1480 train_time:117547ms step_avg:153.46ms step:777/1480 train_time:117713ms step_avg:153.47ms step:778/1480 train_time:117874ms step_avg:153.48ms step:779/1480 train_time:118036ms step_avg:153.49ms step:780/1480 train_time:118199ms step_avg:153.51ms step:781/1480 train_time:118362ms step_avg:153.52ms step:782/1480 train_time:118528ms step_avg:153.53ms step:783/1480 train_time:118689ms step_avg:153.54ms step:784/1480 train_time:118854ms step_avg:153.56ms step:785/1480 train_time:119016ms step_avg:153.57ms step:786/1480 train_time:119183ms step_avg:153.59ms step:787/1480 train_time:119347ms step_avg:153.60ms step:788/1480 train_time:119510ms step_avg:153.61ms step:789/1480 train_time:119672ms step_avg:153.62ms step:790/1480 train_time:119837ms step_avg:153.64ms step:791/1480 train_time:120004ms step_avg:153.65ms step:792/1480 train_time:120169ms step_avg:153.67ms step:793/1480 train_time:120330ms step_avg:153.68ms step:794/1480 train_time:120495ms step_avg:153.69ms step:795/1480 train_time:120661ms step_avg:153.71ms step:796/1480 train_time:120828ms step_avg:153.72ms step:797/1480 train_time:120991ms step_avg:153.74ms step:798/1480 train_time:121155ms step_avg:153.75ms step:799/1480 train_time:121323ms step_avg:153.77ms step:800/1480 train_time:121487ms step_avg:153.78ms step:801/1480 train_time:121651ms 
step_avg:153.79ms step:802/1480 train_time:121817ms step_avg:153.81ms step:803/1480 train_time:121980ms step_avg:153.82ms step:804/1480 train_time:122142ms step_avg:153.83ms step:805/1480 train_time:122308ms step_avg:153.85ms step:806/1480 train_time:122470ms step_avg:153.86ms step:807/1480 train_time:122633ms step_avg:153.87ms step:808/1480 train_time:122796ms step_avg:153.88ms step:809/1480 train_time:122959ms step_avg:153.89ms step:810/1480 train_time:123122ms step_avg:153.90ms step:811/1480 train_time:123284ms step_avg:153.91ms step:812/1480 train_time:123449ms step_avg:153.93ms step:813/1480 train_time:123609ms step_avg:153.93ms step:814/1480 train_time:123772ms step_avg:153.95ms step:815/1480 train_time:123934ms step_avg:153.95ms step:816/1480 train_time:124098ms step_avg:153.97ms step:817/1480 train_time:124261ms step_avg:153.98ms step:818/1480 train_time:124424ms step_avg:153.99ms step:819/1480 train_time:124588ms step_avg:154.00ms step:820/1480 train_time:124751ms step_avg:154.01ms step:821/1480 train_time:124913ms step_avg:154.02ms step:822/1480 train_time:125076ms step_avg:154.03ms step:823/1480 train_time:125238ms step_avg:154.04ms step:824/1480 train_time:125400ms step_avg:154.05ms step:825/1480 train_time:125565ms step_avg:154.07ms step:826/1480 train_time:125731ms step_avg:154.08ms step:827/1480 train_time:125895ms step_avg:154.09ms step:828/1480 train_time:126056ms step_avg:154.10ms step:829/1480 train_time:126220ms step_avg:154.11ms step:830/1480 train_time:126384ms step_avg:154.13ms step:831/1480 train_time:126549ms step_avg:154.14ms step:832/1480 train_time:126713ms step_avg:154.15ms step:833/1480 train_time:126877ms step_avg:154.16ms step:834/1480 train_time:127042ms step_avg:154.18ms step:835/1480 train_time:127206ms step_avg:154.19ms step:836/1480 train_time:127370ms step_avg:154.20ms step:837/1480 train_time:127534ms step_avg:154.21ms step:838/1480 train_time:127695ms step_avg:154.22ms step:839/1480 train_time:127856ms step_avg:154.23ms 
step:840/1480 train_time:128016ms step_avg:154.24ms step:841/1480 train_time:128175ms step_avg:154.24ms step:842/1480 train_time:128343ms step_avg:154.26ms step:843/1480 train_time:128506ms step_avg:154.27ms step:844/1480 train_time:128668ms step_avg:154.28ms step:845/1480 train_time:128831ms step_avg:154.29ms step:846/1480 train_time:128995ms step_avg:154.30ms step:847/1480 train_time:129158ms step_avg:154.31ms step:848/1480 train_time:129319ms step_avg:154.32ms step:849/1480 train_time:129481ms step_avg:154.33ms step:850/1480 train_time:129645ms step_avg:154.34ms step:851/1480 train_time:129810ms step_avg:154.35ms step:852/1480 train_time:129972ms step_avg:154.36ms step:853/1480 train_time:130134ms step_avg:154.37ms step:854/1480 train_time:130298ms step_avg:154.38ms step:855/1480 train_time:130461ms step_avg:154.39ms step:856/1480 train_time:130622ms step_avg:154.40ms step:857/1480 train_time:130787ms step_avg:154.41ms step:858/1480 train_time:130953ms step_avg:154.43ms step:859/1480 train_time:131116ms step_avg:154.44ms step:860/1480 train_time:131277ms step_avg:154.44ms step:861/1480 train_time:131443ms step_avg:154.46ms step:862/1480 train_time:131611ms step_avg:154.47ms step:863/1480 train_time:131779ms step_avg:154.49ms step:864/1480 train_time:131943ms step_avg:154.50ms step:865/1480 train_time:132107ms step_avg:154.51ms step:866/1480 train_time:132274ms step_avg:154.53ms step:867/1480 train_time:132436ms step_avg:154.53ms step:868/1480 train_time:132597ms step_avg:154.54ms step:869/1480 train_time:132758ms step_avg:154.55ms step:870/1480 train_time:132922ms step_avg:154.56ms step:871/1480 train_time:133086ms step_avg:154.57ms step:872/1480 train_time:133251ms step_avg:154.58ms step:873/1480 train_time:133415ms step_avg:154.59ms step:874/1480 train_time:133581ms step_avg:154.61ms step:875/1480 train_time:133747ms step_avg:154.62ms step:875/1480 val_loss:3.5038 train_time:133812ms step_avg:154.70ms step:876/1480 train_time:133913ms step_avg:154.63ms 
step:877/1480 train_time:134080ms step_avg:154.65ms step:878/1480 train_time:134243ms step_avg:154.66ms step:879/1480 train_time:134407ms step_avg:154.67ms step:880/1480 train_time:134569ms step_avg:154.68ms step:881/1480 train_time:134731ms step_avg:154.69ms step:882/1480 train_time:134897ms step_avg:154.70ms step:883/1480 train_time:135064ms step_avg:154.71ms step:884/1480 train_time:135229ms step_avg:154.72ms step:885/1480 train_time:135396ms step_avg:154.74ms step:886/1480 train_time:135563ms step_avg:154.75ms step:887/1480 train_time:135728ms step_avg:154.76ms step:888/1480 train_time:135901ms step_avg:154.78ms step:889/1480 train_time:136069ms step_avg:154.80ms step:890/1480 train_time:136232ms step_avg:154.81ms step:891/1480 train_time:136399ms step_avg:154.82ms step:892/1480 train_time:136564ms step_avg:154.83ms step:893/1480 train_time:136726ms step_avg:154.84ms step:894/1480 train_time:136892ms step_avg:154.85ms step:895/1480 train_time:137058ms step_avg:154.87ms step:896/1480 train_time:137225ms step_avg:154.88ms step:897/1480 train_time:137391ms step_avg:154.89ms step:898/1480 train_time:137559ms step_avg:154.91ms step:899/1480 train_time:137723ms step_avg:154.92ms step:900/1480 train_time:137886ms step_avg:154.93ms step:901/1480 train_time:138051ms step_avg:154.94ms step:902/1480 train_time:138214ms step_avg:154.95ms step:903/1480 train_time:138388ms step_avg:154.97ms step:904/1480 train_time:138553ms step_avg:154.98ms step:905/1480 train_time:138715ms step_avg:154.99ms step:906/1480 train_time:138883ms step_avg:155.00ms step:907/1480 train_time:139050ms step_avg:155.02ms step:908/1480 train_time:139213ms step_avg:155.03ms step:909/1480 train_time:139380ms step_avg:155.04ms step:910/1480 train_time:139551ms step_avg:155.06ms step:911/1480 train_time:139715ms step_avg:155.07ms step:912/1480 train_time:139881ms step_avg:155.08ms step:913/1480 train_time:140048ms step_avg:155.09ms step:914/1480 train_time:140215ms step_avg:155.10ms step:915/1480 
train_time:140384ms step_avg:155.12ms step:916/1480 train_time:140548ms step_avg:155.13ms step:917/1480 train_time:140712ms step_avg:155.14ms step:918/1480 train_time:140882ms step_avg:155.16ms step:919/1480 train_time:141051ms step_avg:155.17ms step:920/1480 train_time:141216ms step_avg:155.18ms step:921/1480 train_time:141382ms step_avg:155.19ms step:922/1480 train_time:141547ms step_avg:155.21ms step:923/1480 train_time:141710ms step_avg:155.21ms step:924/1480 train_time:141875ms step_avg:155.22ms step:925/1480 train_time:142039ms step_avg:155.23ms step:926/1480 train_time:142203ms step_avg:155.24ms step:927/1480 train_time:142367ms step_avg:155.25ms step:928/1480 train_time:142533ms step_avg:155.26ms step:929/1480 train_time:142699ms step_avg:155.28ms step:930/1480 train_time:142865ms step_avg:155.29ms step:931/1480 train_time:143027ms step_avg:155.30ms step:932/1480 train_time:143193ms step_avg:155.31ms step:933/1480 train_time:143359ms step_avg:155.32ms step:934/1480 train_time:143525ms step_avg:155.33ms step:935/1480 train_time:143697ms step_avg:155.35ms step:936/1480 train_time:143865ms step_avg:155.36ms step:937/1480 train_time:144036ms step_avg:155.38ms step:938/1480 train_time:144200ms step_avg:155.39ms step:939/1480 train_time:144370ms step_avg:155.40ms step:940/1480 train_time:144538ms step_avg:155.42ms step:941/1480 train_time:144702ms step_avg:155.43ms step:942/1480 train_time:144867ms step_avg:155.44ms step:943/1480 train_time:145038ms step_avg:155.45ms step:944/1480 train_time:145209ms step_avg:155.47ms step:945/1480 train_time:145375ms step_avg:155.48ms step:946/1480 train_time:145542ms step_avg:155.49ms step:947/1480 train_time:145709ms step_avg:155.51ms step:948/1480 train_time:145876ms step_avg:155.52ms step:949/1480 train_time:146041ms step_avg:155.53ms step:950/1480 train_time:146204ms step_avg:155.54ms step:951/1480 train_time:146371ms step_avg:155.55ms step:952/1480 train_time:146537ms step_avg:155.56ms step:953/1480 train_time:146706ms 
step_avg:155.57ms step:954/1480 train_time:146875ms step_avg:155.59ms step:955/1480 train_time:147039ms step_avg:155.60ms step:956/1480 train_time:147204ms step_avg:155.61ms step:957/1480 train_time:147371ms step_avg:155.62ms step:958/1480 train_time:147541ms step_avg:155.63ms step:959/1480 train_time:147705ms step_avg:155.64ms step:960/1480 train_time:147872ms step_avg:155.65ms step:961/1480 train_time:148038ms step_avg:155.67ms step:962/1480 train_time:148203ms step_avg:155.68ms step:963/1480 train_time:148368ms step_avg:155.69ms step:964/1480 train_time:148535ms step_avg:155.70ms step:965/1480 train_time:148701ms step_avg:155.71ms step:966/1480 train_time:148864ms step_avg:155.72ms step:967/1480 train_time:149026ms step_avg:155.72ms step:968/1480 train_time:149192ms step_avg:155.73ms step:969/1480 train_time:149359ms step_avg:155.74ms step:970/1480 train_time:149523ms step_avg:155.75ms step:971/1480 train_time:149687ms step_avg:155.76ms step:972/1480 train_time:149849ms step_avg:155.77ms step:973/1480 train_time:150012ms step_avg:155.78ms step:974/1480 train_time:150183ms step_avg:155.79ms step:975/1480 train_time:150348ms step_avg:155.80ms step:976/1480 train_time:150512ms step_avg:155.81ms step:977/1480 train_time:150678ms step_avg:155.82ms step:978/1480 train_time:150843ms step_avg:155.83ms step:979/1480 train_time:151008ms step_avg:155.84ms step:980/1480 train_time:151174ms step_avg:155.85ms step:981/1480 train_time:151341ms step_avg:155.86ms step:982/1480 train_time:151504ms step_avg:155.87ms step:983/1480 train_time:151669ms step_avg:155.88ms step:984/1480 train_time:151834ms step_avg:155.89ms step:985/1480 train_time:152001ms step_avg:155.90ms step:986/1480 train_time:152167ms step_avg:155.91ms step:987/1480 train_time:152330ms step_avg:155.92ms step:988/1480 train_time:152500ms step_avg:155.93ms step:989/1480 train_time:152664ms step_avg:155.94ms step:990/1480 train_time:152832ms step_avg:155.95ms step:991/1480 train_time:153001ms step_avg:155.96ms 
step:992/1480 train_time:153176ms step_avg:155.98ms step:993/1480 train_time:153353ms step_avg:156.01ms step:994/1480 train_time:153519ms step_avg:156.02ms step:995/1480 train_time:153683ms step_avg:156.02ms step:996/1480 train_time:153845ms step_avg:156.03ms step:997/1480 train_time:154009ms step_avg:156.04ms step:998/1480 train_time:154173ms step_avg:156.05ms step:999/1480 train_time:154339ms step_avg:156.06ms step:1000/1480 train_time:154508ms step_avg:156.07ms step:1000/1480 val_loss:3.4412 train_time:154577ms step_avg:156.14ms step:1001/1480 train_time:154678ms step_avg:156.08ms step:1002/1480 train_time:154844ms step_avg:156.09ms step:1003/1480 train_time:155017ms step_avg:156.11ms step:1004/1480 train_time:155186ms step_avg:156.12ms step:1005/1480 train_time:155355ms step_avg:156.14ms step:1006/1480 train_time:155522ms step_avg:156.15ms step:1007/1480 train_time:155688ms step_avg:156.16ms step:1008/1480 train_time:155856ms step_avg:156.17ms step:1009/1480 train_time:156031ms step_avg:156.19ms step:1010/1480 train_time:156197ms step_avg:156.20ms step:1011/1480 train_time:156361ms step_avg:156.20ms step:1012/1480 train_time:156528ms step_avg:156.22ms step:1013/1480 train_time:156699ms step_avg:156.23ms step:1014/1480 train_time:156866ms step_avg:156.24ms step:1015/1480 train_time:157035ms step_avg:156.25ms step:1016/1480 train_time:157201ms step_avg:156.26ms step:1017/1480 train_time:157374ms step_avg:156.28ms step:1018/1480 train_time:157543ms step_avg:156.29ms step:1019/1480 train_time:157713ms step_avg:156.31ms step:1020/1480 train_time:157881ms step_avg:156.32ms step:1021/1480 train_time:158046ms step_avg:156.33ms step:1022/1480 train_time:158215ms step_avg:156.34ms step:1023/1480 train_time:158381ms step_avg:156.35ms step:1024/1480 train_time:158547ms step_avg:156.36ms step:1025/1480 train_time:158719ms step_avg:156.37ms step:1026/1480 train_time:158883ms step_avg:156.38ms step:1027/1480 train_time:159050ms step_avg:156.39ms step:1028/1480 
train_time:159222ms step_avg:156.41ms step:1029/1480 train_time:159396ms step_avg:156.42ms step:1030/1480 train_time:159563ms step_avg:156.43ms step:1031/1480 train_time:159728ms step_avg:156.44ms step:1032/1480 train_time:159899ms step_avg:156.46ms step:1033/1480 train_time:160064ms step_avg:156.47ms step:1034/1480 train_time:160234ms step_avg:156.48ms step:1035/1480 train_time:160400ms step_avg:156.49ms step:1036/1480 train_time:160565ms step_avg:156.50ms step:1037/1480 train_time:160734ms step_avg:156.51ms step:1038/1480 train_time:160902ms step_avg:156.52ms step:1039/1480 train_time:161072ms step_avg:156.53ms step:1040/1480 train_time:161239ms step_avg:156.54ms step:1041/1480 train_time:161406ms step_avg:156.55ms step:1042/1480 train_time:161570ms step_avg:156.56ms step:1043/1480 train_time:161736ms step_avg:156.57ms step:1044/1480 train_time:161902ms step_avg:156.58ms step:1045/1480 train_time:162074ms step_avg:156.59ms step:1046/1480 train_time:162242ms step_avg:156.60ms step:1047/1480 train_time:162409ms step_avg:156.61ms step:1048/1480 train_time:162575ms step_avg:156.62ms step:1049/1480 train_time:162740ms step_avg:156.63ms step:1050/1480 train_time:162910ms step_avg:156.64ms step:1051/1480 train_time:163078ms step_avg:156.66ms step:1052/1480 train_time:163246ms step_avg:156.67ms step:1053/1480 train_time:163414ms step_avg:156.68ms step:1054/1480 train_time:163581ms step_avg:156.69ms step:1055/1480 train_time:163746ms step_avg:156.69ms step:1056/1480 train_time:163913ms step_avg:156.70ms step:1057/1480 train_time:164079ms step_avg:156.71ms step:1058/1480 train_time:164249ms step_avg:156.73ms step:1059/1480 train_time:164421ms step_avg:156.74ms step:1060/1480 train_time:164589ms step_avg:156.75ms step:1061/1480 train_time:164753ms step_avg:156.76ms step:1062/1480 train_time:164919ms step_avg:156.77ms step:1063/1480 train_time:165083ms step_avg:156.77ms step:1064/1480 train_time:165247ms step_avg:156.78ms step:1065/1480 train_time:165416ms step_avg:156.79ms 
step:1066/1480 train_time:165584ms step_avg:156.80ms step:1067/1480 train_time:165755ms step_avg:156.82ms step:1068/1480 train_time:165921ms step_avg:156.82ms step:1069/1480 train_time:166093ms step_avg:156.84ms step:1070/1480 train_time:166259ms step_avg:156.85ms step:1071/1480 train_time:166434ms step_avg:156.87ms step:1072/1480 train_time:166600ms step_avg:156.87ms step:1073/1480 train_time:166763ms step_avg:156.88ms step:1074/1480 train_time:166932ms step_avg:156.89ms step:1075/1480 train_time:167101ms step_avg:156.90ms step:1076/1480 train_time:167268ms step_avg:156.91ms step:1077/1480 train_time:167435ms step_avg:156.92ms step:1078/1480 train_time:167608ms step_avg:156.94ms step:1079/1480 train_time:167780ms step_avg:156.95ms step:1080/1480 train_time:167950ms step_avg:156.96ms step:1081/1480 train_time:168117ms step_avg:156.97ms step:1082/1480 train_time:168282ms step_avg:156.98ms step:1083/1480 train_time:168447ms step_avg:156.99ms step:1084/1480 train_time:168615ms step_avg:157.00ms step:1085/1480 train_time:168781ms step_avg:157.01ms step:1086/1480 train_time:168950ms step_avg:157.02ms step:1087/1480 train_time:169117ms step_avg:157.03ms step:1088/1480 train_time:169285ms step_avg:157.04ms step:1089/1480 train_time:169456ms step_avg:157.05ms step:1090/1480 train_time:169629ms step_avg:157.06ms step:1091/1480 train_time:169798ms step_avg:157.08ms step:1092/1480 train_time:169967ms step_avg:157.09ms step:1093/1480 train_time:170136ms step_avg:157.10ms step:1094/1480 train_time:170302ms step_avg:157.11ms step:1095/1480 train_time:170467ms step_avg:157.11ms step:1096/1480 train_time:170636ms step_avg:157.12ms step:1097/1480 train_time:170804ms step_avg:157.13ms step:1098/1480 train_time:170975ms step_avg:157.15ms step:1099/1480 train_time:171146ms step_avg:157.16ms step:1100/1480 train_time:171319ms step_avg:157.17ms step:1101/1480 train_time:171490ms step_avg:157.19ms step:1102/1480 train_time:171663ms step_avg:157.20ms step:1103/1480 train_time:171838ms 
step_avg:157.22ms step:1104/1480 train_time:172005ms step_avg:157.23ms step:1105/1480 train_time:172176ms step_avg:157.24ms step:1106/1480 train_time:172343ms step_avg:157.25ms step:1107/1480 train_time:172514ms step_avg:157.26ms step:1108/1480 train_time:172679ms step_avg:157.27ms step:1109/1480 train_time:172845ms step_avg:157.28ms step:1110/1480 train_time:173013ms step_avg:157.28ms step:1111/1480 train_time:173179ms step_avg:157.29ms step:1112/1480 train_time:173349ms step_avg:157.30ms step:1113/1480 train_time:173531ms step_avg:157.33ms step:1114/1480 train_time:173703ms step_avg:157.34ms step:1115/1480 train_time:173876ms step_avg:157.35ms step:1116/1480 train_time:174043ms step_avg:157.36ms step:1117/1480 train_time:174217ms step_avg:157.38ms step:1118/1480 train_time:174389ms step_avg:157.39ms step:1119/1480 train_time:174556ms step_avg:157.40ms step:1120/1480 train_time:174725ms step_avg:157.41ms step:1121/1480 train_time:174896ms step_avg:157.42ms step:1122/1480 train_time:175061ms step_avg:157.43ms step:1123/1480 train_time:175228ms step_avg:157.44ms step:1124/1480 train_time:175397ms step_avg:157.45ms step:1125/1480 train_time:175564ms step_avg:157.46ms step:1125/1480 val_loss:3.3854 train_time:175632ms step_avg:157.52ms step:1126/1480 train_time:175736ms step_avg:157.47ms step:1127/1480 train_time:175905ms step_avg:157.48ms step:1128/1480 train_time:176075ms step_avg:157.49ms step:1129/1480 train_time:176250ms step_avg:157.51ms step:1130/1480 train_time:176418ms step_avg:157.52ms step:1131/1480 train_time:176596ms step_avg:157.53ms step:1132/1480 train_time:176763ms step_avg:157.54ms step:1133/1480 train_time:176934ms step_avg:157.55ms step:1134/1480 train_time:177105ms step_avg:157.57ms step:1135/1480 train_time:177271ms step_avg:157.57ms step:1136/1480 train_time:177444ms step_avg:157.59ms step:1137/1480 train_time:177612ms step_avg:157.60ms step:1138/1480 train_time:177785ms step_avg:157.61ms step:1139/1480 train_time:177952ms step_avg:157.62ms 
step:1140/1480 train_time:178119ms step_avg:157.63ms step:1141/1480 train_time:178291ms step_avg:157.64ms step:1142/1480 train_time:178459ms step_avg:157.65ms step:1143/1480 train_time:178629ms step_avg:157.66ms step:1144/1480 train_time:178798ms step_avg:157.67ms step:1145/1480 train_time:178964ms step_avg:157.68ms step:1146/1480 train_time:179135ms step_avg:157.69ms step:1147/1480 train_time:179304ms step_avg:157.70ms step:1148/1480 train_time:179472ms step_avg:157.71ms step:1149/1480 train_time:179645ms step_avg:157.72ms step:1150/1480 train_time:179813ms step_avg:157.73ms step:1151/1480 train_time:179986ms step_avg:157.74ms step:1152/1480 train_time:180157ms step_avg:157.76ms step:1153/1480 train_time:180331ms step_avg:157.77ms step:1154/1480 train_time:180499ms step_avg:157.78ms step:1155/1480 train_time:180671ms step_avg:157.79ms step:1156/1480 train_time:180849ms step_avg:157.81ms step:1157/1480 train_time:181019ms step_avg:157.82ms step:1158/1480 train_time:181186ms step_avg:157.83ms step:1159/1480 train_time:181354ms step_avg:157.84ms step:1160/1480 train_time:181520ms step_avg:157.84ms step:1161/1480 train_time:181693ms step_avg:157.86ms step:1162/1480 train_time:181864ms step_avg:157.87ms step:1163/1480 train_time:182033ms step_avg:157.88ms step:1164/1480 train_time:182203ms step_avg:157.89ms step:1165/1480 train_time:182368ms step_avg:157.89ms step:1166/1480 train_time:182538ms step_avg:157.90ms step:1167/1480 train_time:182705ms step_avg:157.91ms step:1168/1480 train_time:182874ms step_avg:157.92ms step:1169/1480 train_time:183044ms step_avg:157.93ms step:1170/1480 train_time:183211ms step_avg:157.94ms step:1171/1480 train_time:183378ms step_avg:157.95ms step:1172/1480 train_time:183545ms step_avg:157.96ms step:1173/1480 train_time:183716ms step_avg:157.97ms step:1174/1480 train_time:183898ms step_avg:157.99ms step:1175/1480 train_time:184070ms step_avg:158.00ms step:1176/1480 train_time:184243ms step_avg:158.01ms step:1177/1480 train_time:184419ms 
step_avg:158.03ms step:1178/1480 train_time:184587ms step_avg:158.04ms step:1179/1480 train_time:184754ms step_avg:158.04ms step:1180/1480 train_time:184934ms step_avg:158.06ms step:1181/1480 train_time:185103ms step_avg:158.07ms step:1182/1480 train_time:185270ms step_avg:158.08ms step:1183/1480 train_time:185441ms step_avg:158.09ms step:1184/1480 train_time:185608ms step_avg:158.10ms step:1185/1480 train_time:185783ms step_avg:158.11ms step:1186/1480 train_time:185954ms step_avg:158.12ms step:1187/1480 train_time:186136ms step_avg:158.14ms step:1188/1480 train_time:186302ms step_avg:158.15ms step:1189/1480 train_time:186471ms step_avg:158.16ms step:1190/1480 train_time:186639ms step_avg:158.17ms step:1191/1480 train_time:186809ms step_avg:158.18ms step:1192/1480 train_time:186976ms step_avg:158.19ms step:1193/1480 train_time:187143ms step_avg:158.19ms step:1194/1480 train_time:187311ms step_avg:158.20ms step:1195/1480 train_time:187485ms step_avg:158.22ms step:1196/1480 train_time:187669ms step_avg:158.24ms step:1197/1480 train_time:187841ms step_avg:158.25ms step:1198/1480 train_time:188023ms step_avg:158.27ms step:1199/1480 train_time:188193ms step_avg:158.28ms step:1200/1480 train_time:188363ms step_avg:158.29ms step:1201/1480 train_time:188531ms step_avg:158.30ms step:1202/1480 train_time:188713ms step_avg:158.32ms step:1203/1480 train_time:188888ms step_avg:158.33ms step:1204/1480 train_time:189064ms step_avg:158.35ms step:1205/1480 train_time:189232ms step_avg:158.35ms step:1206/1480 train_time:189399ms step_avg:158.36ms step:1207/1480 train_time:189568ms step_avg:158.37ms step:1208/1480 train_time:189735ms step_avg:158.38ms step:1209/1480 train_time:189907ms step_avg:158.39ms step:1210/1480 train_time:190084ms step_avg:158.40ms step:1211/1480 train_time:190259ms step_avg:158.42ms step:1212/1480 train_time:190432ms step_avg:158.43ms step:1213/1480 train_time:190604ms step_avg:158.44ms step:1214/1480 train_time:190782ms step_avg:158.46ms step:1215/1480 
train_time:190957ms step_avg:158.47ms step:1216/1480 train_time:191127ms step_avg:158.48ms step:1217/1480 train_time:191301ms step_avg:158.49ms step:1218/1480 train_time:191471ms step_avg:158.50ms step:1219/1480 train_time:191650ms step_avg:158.52ms step:1220/1480 train_time:191819ms step_avg:158.53ms step:1221/1480 train_time:191989ms step_avg:158.54ms step:1222/1480 train_time:192156ms step_avg:158.54ms step:1223/1480 train_time:192326ms step_avg:158.55ms step:1224/1480 train_time:192504ms step_avg:158.57ms step:1225/1480 train_time:192674ms step_avg:158.58ms step:1226/1480 train_time:192848ms step_avg:158.59ms step:1227/1480 train_time:193020ms step_avg:158.60ms step:1228/1480 train_time:193189ms step_avg:158.61ms step:1229/1480 train_time:193363ms step_avg:158.62ms step:1230/1480 train_time:193542ms step_avg:158.64ms step:1231/1480 train_time:193717ms step_avg:158.65ms step:1232/1480 train_time:193892ms step_avg:158.67ms step:1233/1480 train_time:194064ms step_avg:158.68ms step:1234/1480 train_time:194231ms step_avg:158.69ms step:1235/1480 train_time:194406ms step_avg:158.70ms step:1236/1480 train_time:194574ms step_avg:158.71ms step:1237/1480 train_time:194747ms step_avg:158.72ms step:1238/1480 train_time:194932ms step_avg:158.74ms step:1239/1480 train_time:195104ms step_avg:158.75ms step:1240/1480 train_time:195275ms step_avg:158.76ms step:1241/1480 train_time:195449ms step_avg:158.77ms step:1242/1480 train_time:195617ms step_avg:158.78ms step:1243/1480 train_time:195790ms step_avg:158.79ms step:1244/1480 train_time:195958ms step_avg:158.80ms step:1245/1480 train_time:196127ms step_avg:158.81ms step:1246/1480 train_time:196298ms step_avg:158.82ms step:1247/1480 train_time:196466ms step_avg:158.82ms step:1248/1480 train_time:196634ms step_avg:158.83ms step:1249/1480 train_time:196803ms step_avg:158.84ms step:1250/1480 train_time:196971ms step_avg:158.85ms step:1250/1480 val_loss:3.3354 train_time:197043ms step_avg:158.91ms step:1251/1480 train_time:197154ms 
step_avg:158.87ms step:1252/1480 train_time:197324ms step_avg:158.88ms step:1253/1480 train_time:197493ms step_avg:158.88ms step:1254/1480 train_time:197663ms step_avg:158.89ms step:1255/1480 train_time:197849ms step_avg:158.92ms step:1256/1480 train_time:198024ms step_avg:158.93ms step:1257/1480 train_time:198194ms step_avg:158.94ms step:1258/1480 train_time:198371ms step_avg:158.95ms step:1259/1480 train_time:198543ms step_avg:158.96ms step:1260/1480 train_time:198710ms step_avg:158.97ms step:1261/1480 train_time:198882ms step_avg:158.98ms step:1262/1480 train_time:199056ms step_avg:158.99ms step:1263/1480 train_time:199229ms step_avg:159.00ms step:1264/1480 train_time:199395ms step_avg:159.01ms step:1265/1480 train_time:199562ms step_avg:159.01ms step:1266/1480 train_time:199732ms step_avg:159.02ms step:1267/1480 train_time:199903ms step_avg:159.03ms step:1268/1480 train_time:200073ms step_avg:159.04ms step:1269/1480 train_time:200249ms step_avg:159.05ms step:1270/1480 train_time:200418ms step_avg:159.06ms step:1271/1480 train_time:200589ms step_avg:159.07ms step:1272/1480 train_time:200756ms step_avg:159.08ms step:1273/1480 train_time:200928ms step_avg:159.09ms step:1274/1480 train_time:201101ms step_avg:159.10ms step:1275/1480 train_time:201269ms step_avg:159.11ms step:1276/1480 train_time:201433ms step_avg:159.11ms step:1277/1480 train_time:201607ms step_avg:159.12ms step:1278/1480 train_time:201776ms step_avg:159.13ms step:1279/1480 train_time:201948ms step_avg:159.14ms step:1280/1480 train_time:202126ms step_avg:159.15ms step:1281/1480 train_time:202295ms step_avg:159.16ms step:1282/1480 train_time:202462ms step_avg:159.17ms step:1283/1480 train_time:202633ms step_avg:159.18ms step:1284/1480 train_time:202804ms step_avg:159.19ms step:1285/1480 train_time:202973ms step_avg:159.19ms step:1286/1480 train_time:203143ms step_avg:159.20ms step:1287/1480 train_time:203316ms step_avg:159.21ms step:1288/1480 train_time:203490ms step_avg:159.23ms step:1289/1480 
train_time:203671ms step_avg:159.24ms step:1290/1480 train_time:203851ms step_avg:159.26ms step:1291/1480 train_time:204025ms step_avg:159.27ms step:1292/1480 train_time:204198ms step_avg:159.28ms step:1293/1480 train_time:204374ms step_avg:159.29ms step:1294/1480 train_time:204544ms step_avg:159.30ms step:1295/1480 train_time:204715ms step_avg:159.31ms step:1296/1480 train_time:204890ms step_avg:159.32ms step:1297/1480 train_time:205062ms step_avg:159.33ms step:1298/1480 train_time:205232ms step_avg:159.34ms step:1299/1480 train_time:205403ms step_avg:159.35ms step:1300/1480 train_time:205571ms step_avg:159.36ms step:1301/1480 train_time:205740ms step_avg:159.36ms step:1302/1480 train_time:205914ms step_avg:159.38ms step:1303/1480 train_time:206093ms step_avg:159.39ms step:1304/1480 train_time:206268ms step_avg:159.40ms step:1305/1480 train_time:206437ms step_avg:159.41ms step:1306/1480 train_time:206611ms step_avg:159.42ms step:1307/1480 train_time:206778ms step_avg:159.43ms step:1308/1480 train_time:206947ms step_avg:159.44ms step:1309/1480 train_time:207119ms step_avg:159.44ms step:1310/1480 train_time:207288ms step_avg:159.45ms step:1311/1480 train_time:207456ms step_avg:159.46ms step:1312/1480 train_time:207629ms step_avg:159.47ms step:1313/1480 train_time:207797ms step_avg:159.48ms step:1314/1480 train_time:207971ms step_avg:159.49ms step:1315/1480 train_time:208142ms step_avg:159.50ms step:1316/1480 train_time:208310ms step_avg:159.50ms step:1317/1480 train_time:208481ms step_avg:159.51ms step:1318/1480 train_time:208661ms step_avg:159.53ms step:1319/1480 train_time:208836ms step_avg:159.54ms step:1320/1480 train_time:209012ms step_avg:159.55ms step:1321/1480 train_time:209186ms step_avg:159.56ms step:1322/1480 train_time:209365ms step_avg:159.58ms step:1323/1480 train_time:209538ms step_avg:159.59ms step:1324/1480 train_time:209713ms step_avg:159.60ms step:1325/1480 train_time:209893ms step_avg:159.61ms step:1326/1480 train_time:210069ms step_avg:159.63ms 
step:1327/1480 train_time:210238ms step_avg:159.63ms step:1328/1480 train_time:210409ms step_avg:159.64ms step:1329/1480 train_time:210605ms step_avg:159.67ms step:1330/1480 train_time:210785ms step_avg:159.69ms step:1331/1480 train_time:210955ms step_avg:159.69ms step:1332/1480 train_time:211131ms step_avg:159.71ms step:1333/1480 train_time:211306ms step_avg:159.72ms step:1334/1480 train_time:211476ms step_avg:159.73ms step:1335/1480 train_time:211645ms step_avg:159.73ms step:1336/1480 train_time:211827ms step_avg:159.75ms step:1337/1480 train_time:212002ms step_avg:159.76ms step:1338/1480 train_time:212173ms step_avg:159.77ms step:1339/1480 train_time:212348ms step_avg:159.78ms step:1340/1480 train_time:212520ms step_avg:159.79ms step:1341/1480 train_time:212688ms step_avg:159.80ms step:1342/1480 train_time:212862ms step_avg:159.81ms step:1343/1480 train_time:213031ms step_avg:159.81ms step:1344/1480 train_time:213204ms step_avg:159.82ms step:1345/1480 train_time:213385ms step_avg:159.84ms step:1346/1480 train_time:213553ms step_avg:159.85ms step:1347/1480 train_time:213723ms step_avg:159.85ms step:1348/1480 train_time:213893ms step_avg:159.86ms step:1349/1480 train_time:214063ms step_avg:159.87ms step:1350/1480 train_time:214237ms step_avg:159.88ms step:1351/1480 train_time:214408ms step_avg:159.89ms step:1352/1480 train_time:214580ms step_avg:159.90ms step:1353/1480 train_time:214755ms step_avg:159.91ms step:1354/1480 train_time:214926ms step_avg:159.92ms step:1355/1480 train_time:215094ms step_avg:159.92ms step:1356/1480 train_time:215268ms step_avg:159.93ms step:1357/1480 train_time:215439ms step_avg:159.94ms step:1358/1480 train_time:215611ms step_avg:159.95ms step:1359/1480 train_time:215784ms step_avg:159.96ms step:1360/1480 train_time:215959ms step_avg:159.97ms step:1361/1480 train_time:216135ms step_avg:159.98ms step:1362/1480 train_time:216311ms step_avg:159.99ms step:1363/1480 train_time:216492ms step_avg:160.01ms step:1364/1480 train_time:216662ms 
step_avg:160.02ms step:1365/1480 train_time:216828ms step_avg:160.02ms step:1366/1480 train_time:216999ms step_avg:160.03ms step:1367/1480 train_time:217170ms step_avg:160.04ms step:1368/1480 train_time:217343ms step_avg:160.05ms step:1369/1480 train_time:217525ms step_avg:160.06ms step:1370/1480 train_time:217703ms step_avg:160.08ms step:1371/1480 train_time:217874ms step_avg:160.08ms step:1372/1480 train_time:218051ms step_avg:160.10ms step:1373/1480 train_time:218221ms step_avg:160.10ms step:1374/1480 train_time:218396ms step_avg:160.11ms step:1375/1480 train_time:218568ms step_avg:160.12ms step:1375/1480 val_loss:3.2973 train_time:218636ms step_avg:160.17ms step:1376/1480 train_time:218741ms step_avg:160.13ms step:1377/1480 train_time:218913ms step_avg:160.14ms step:1378/1480 train_time:219081ms step_avg:160.15ms step:1379/1480 train_time:219255ms step_avg:160.16ms step:1380/1480 train_time:219428ms step_avg:160.17ms step:1381/1480 train_time:219608ms step_avg:160.18ms step:1382/1480 train_time:219780ms step_avg:160.19ms step:1383/1480 train_time:219953ms step_avg:160.20ms step:1384/1480 train_time:220130ms step_avg:160.21ms step:1385/1480 train_time:220296ms step_avg:160.21ms step:1386/1480 train_time:220465ms step_avg:160.22ms step:1387/1480 train_time:220637ms step_avg:160.23ms step:1388/1480 train_time:220805ms step_avg:160.24ms step:1389/1480 train_time:220979ms step_avg:160.25ms step:1390/1480 train_time:221147ms step_avg:160.25ms step:1391/1480 train_time:221318ms step_avg:160.26ms step:1392/1480 train_time:221492ms step_avg:160.27ms step:1393/1480 train_time:221662ms step_avg:160.28ms step:1394/1480 train_time:221832ms step_avg:160.28ms step:1395/1480 train_time:222001ms step_avg:160.29ms step:1396/1480 train_time:222170ms step_avg:160.30ms step:1397/1480 train_time:222338ms step_avg:160.30ms step:1398/1480 train_time:222505ms step_avg:160.31ms step:1399/1480 train_time:222674ms step_avg:160.31ms step:1400/1480 train_time:222851ms step_avg:160.32ms 
step:1401/1480 train_time:223017ms step_avg:160.33ms step:1402/1480 train_time:223189ms step_avg:160.34ms step:1403/1480 train_time:223364ms step_avg:160.35ms step:1404/1480 train_time:223535ms step_avg:160.35ms step:1405/1480 train_time:223707ms step_avg:160.36ms step:1406/1480 train_time:223881ms step_avg:160.37ms step:1407/1480 train_time:224050ms step_avg:160.38ms step:1408/1480 train_time:224219ms step_avg:160.39ms step:1409/1480 train_time:224402ms step_avg:160.40ms step:1410/1480 train_time:224572ms step_avg:160.41ms step:1411/1480 train_time:224740ms step_avg:160.41ms step:1412/1480 train_time:224909ms step_avg:160.42ms step:1413/1480 train_time:225079ms step_avg:160.43ms step:1414/1480 train_time:225251ms step_avg:160.44ms step:1415/1480 train_time:225425ms step_avg:160.44ms step:1416/1480 train_time:225611ms step_avg:160.46ms step:1417/1480 train_time:225785ms step_avg:160.47ms step:1418/1480 train_time:225957ms step_avg:160.48ms step:1419/1480 train_time:226130ms step_avg:160.49ms step:1420/1480 train_time:226306ms step_avg:160.50ms step:1421/1480 train_time:226479ms step_avg:160.51ms step:1422/1480 train_time:226651ms step_avg:160.52ms step:1423/1480 train_time:226820ms step_avg:160.52ms step:1424/1480 train_time:226998ms step_avg:160.54ms step:1425/1480 train_time:227180ms step_avg:160.55ms step:1426/1480 train_time:227353ms step_avg:160.56ms step:1427/1480 train_time:227527ms step_avg:160.57ms step:1428/1480 train_time:227700ms step_avg:160.58ms step:1429/1480 train_time:227869ms step_avg:160.58ms step:1430/1480 train_time:228043ms step_avg:160.59ms step:1431/1480 train_time:228218ms step_avg:160.60ms step:1432/1480 train_time:228395ms step_avg:160.62ms step:1433/1480 train_time:228574ms step_avg:160.63ms step:1434/1480 train_time:228754ms step_avg:160.64ms step:1435/1480 train_time:228929ms step_avg:160.65ms step:1436/1480 train_time:229102ms step_avg:160.66ms step:1437/1480 train_time:229273ms step_avg:160.67ms step:1438/1480 train_time:229441ms 
step_avg:160.67ms step:1439/1480 train_time:229615ms step_avg:160.68ms step:1440/1480 train_time:229783ms step_avg:160.69ms step:1441/1480 train_time:229955ms step_avg:160.70ms step:1442/1480 train_time:230132ms step_avg:160.71ms step:1443/1480 train_time:230322ms step_avg:160.73ms step:1444/1480 train_time:230494ms step_avg:160.73ms step:1445/1480 train_time:230664ms step_avg:160.74ms step:1446/1480 train_time:230840ms step_avg:160.75ms step:1447/1480 train_time:231017ms step_avg:160.76ms step:1448/1480 train_time:231188ms step_avg:160.77ms step:1449/1480 train_time:231361ms step_avg:160.78ms step:1450/1480 train_time:231536ms step_avg:160.79ms step:1451/1480 train_time:231705ms step_avg:160.79ms step:1452/1480 train_time:231879ms step_avg:160.80ms step:1453/1480 train_time:232049ms step_avg:160.81ms step:1454/1480 train_time:232223ms step_avg:160.82ms step:1455/1480 train_time:232404ms step_avg:160.83ms step:1456/1480 train_time:232576ms step_avg:160.84ms step:1457/1480 train_time:232747ms step_avg:160.85ms step:1458/1480 train_time:232918ms step_avg:160.85ms step:1459/1480 train_time:233096ms step_avg:160.87ms step:1460/1480 train_time:233267ms step_avg:160.87ms step:1461/1480 train_time:233442ms step_avg:160.88ms step:1462/1480 train_time:233615ms step_avg:160.89ms step:1463/1480 train_time:233791ms step_avg:160.90ms step:1464/1480 train_time:233965ms step_avg:160.91ms step:1465/1480 train_time:234139ms step_avg:160.92ms step:1466/1480 train_time:234309ms step_avg:160.93ms step:1467/1480 train_time:234484ms step_avg:160.94ms step:1468/1480 train_time:234654ms step_avg:160.94ms step:1469/1480 train_time:234827ms step_avg:160.95ms step:1470/1480 train_time:235007ms step_avg:160.96ms step:1471/1480 train_time:235194ms step_avg:160.98ms step:1472/1480 train_time:235375ms step_avg:160.99ms step:1473/1480 train_time:235547ms step_avg:161.00ms step:1474/1480 train_time:235725ms step_avg:161.01ms step:1475/1480 train_time:235905ms step_avg:161.03ms step:1476/1480 
train_time:236078ms step_avg:161.04ms step:1477/1480 train_time:236260ms step_avg:161.05ms step:1478/1480 train_time:236443ms step_avg:161.06ms step:1479/1480 train_time:236617ms step_avg:161.07ms step:1480/1480 train_time:236791ms step_avg:161.08ms step:1480/1480 val_loss:3.2782 train_time:236862ms step_avg:161.13ms