import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, n_head): super().__init__() assert dim % n_head == 0 self.n_head = n_head self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) # value residual lambda self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977 # rotary embeddings self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim # output projection self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor: B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1) k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1) v: torch.Tensor = 
self.c_v(x).view(B, T, self.n_head, -1) v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977 q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim: int): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.n_embd, config.n_head) self.mlp = MLP(config.n_embd) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor: x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 n_layer : int = 12 n_head : int = 6 # head dim 128 suggested by @Grad62304977 n_embd : int = 768 lm_head_softcap : int = 30 class GPT(nn.Module): def __init__(self, config: GPTConfig): super().__init__() self.n_layer = config.n_layer self.lm_head_softcap = config.lm_head_softcap # U-net design by @brendanh0gan self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection 
class GPT(nn.Module):
    """Decoder-only transformer with U-net style skip connections and per-layer value embeddings.

    ``forward`` takes one flat token sequence, builds a document-aware sliding-window causal
    block mask for FlexAttention, and returns the cross-entropy loss against ``target``.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # NOTE: with an odd n_layer there is one more decoder layer than encoder layers, so
        # forward() would pop from an empty skip list on its last decoder iteration.
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            # (one n_embd-wide slice per encoder layer, split apart again in forward())
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """Compute the training loss for one flat token sequence.

        Args:
            idx: 1D tensor of token ids (asserted); it is reshaped into rows of
                BLOCK_SIZE, so its length must be a multiple of 128.
            target: token ids the logits are scored against (flattened before the loss).
            sliding_window: attention window length in tokens, compared against
                ``q_idx - kv_idx``.

        Returns:
            Scalar cross-entropy loss computed on float32 logits.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running count of tokens equal to 50256: a per-position document id.
        # (50256 is presumably the GPT-2 end-of-text token -- confirm against the tokenizer.)
        docs = (idx == 50256).cumsum(0)
        # document id of the first / last token of every BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # Token-level rule: the key is not in the future, is in the same document,
            # and is fewer than `sliding_window` positions behind the query.
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the rule above: decides which (query block, kv block)
            # pairs are evaluated at all; the token-level mask_mod is applied inside them.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can share a document only if their document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window expressed in blocks: ceil(sliding_window / BLOCK_SIZE)
            # NOTE(review): a kv block exactly ceil(sliding_window / BLOCK_SIZE) blocks behind
            # is excluded here although tokens next to the block boundary are still within
            # `sliding_window` of each other (e.g. window 64 keeps only the diagonal block).
            # Confirm whether this truncation is intended.
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            # per query block: how many kv blocks are kept ...
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # ... and their indices first (stable descending sort puts the 1-entries at the front,
            # in ascending index order)
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton dims
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x # kept so every Block can mix the normalized embedding back in
        # split the wide value embedding into one chunk per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() is last-in-first-out: decoder layer i receives the output of
            # encoder layer (num_encoder_layers - 1 - i)
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            # (the value-embedding chunk is mirrored the same way as the skip connection)
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        # soft cap: squashes logits smoothly into (-lm_head_softcap, lm_head_softcap)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float() # compute the loss in float32
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss
# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Read only the header of a .bin shard and return the token count it claims.

    The header is 256 int32 values: [0] magic number, [1] format version, [2] token count.
    Raises AssertionError when the magic number or the version is not the expected one.
    """
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    magic, version, claimed_tokens = header[0], header[1], header[2]
    assert magic == 20240520, "magic number mismatch in the data .bin file"
    assert version == 1, "unsupported version"
    return int(claimed_tokens)


def _load_data_shard(file: Path, ntok: int):
    """Load the ``ntok`` uint16 tokens that follow the header into a pinned host tensor.

    Raises AssertionError when fewer bytes than ``2 * ntok`` could be read.
    """
    header_bytes = 256 * 4
    tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
    with file.open("rb") as f:
        f.seek(header_bytes)
        # fill the tensor's memory directly through its numpy view
        nbytes = f.readinto(tokens.numpy())
    assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens
class DistributedDataLoader:
    """Hands out (input, target) token windows from sharded .bin files, one window per rank.

    Each rank starts at offset ``process_rank * T`` of the current shard and moves forward by
    ``T * num_processes`` tokens per batch.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # resolve the glob pattern relative to the current working directory
        shard_paths = sorted(Path.cwd().glob(filename_pattern))
        self.files = shard_paths
        assert len(shard_paths) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # read every shard header (this also validates it) and total up the token counts
        self.ntoks = [_peek_data_shard(shard) for shard in shard_paths]
        smallest_usable = num_processes * T + 1
        assert min(self.ntoks) >= smallest_usable
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # -1 so that advance() lands on shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        shard = (self.current_shard + 1) % len(self.files)
        self.current_shard = shard
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[shard], self.ntoks[shard])

    def next_batch(self):
        stride = self.T * self.num_processes
        start = self.current_position
        # T + 1 tokens: inputs and targets are the same window shifted by one
        window = self.tokens[start:start + self.T + 1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = window[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True)
        targets = window[1:].to(device="cuda", dtype=torch.int64, non_blocking=True)
        # advance current position and load next shard if necessary
        self.current_position = start + stride
        if self.current_position + stride + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets
# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # --- data ---
    input_bin: str = 'data/fineweb10B/fineweb_train_*.bin'    # input .bin to train on
    input_val_bin: str = 'data/fineweb10B/fineweb_val_*.bin'  # input .bin to eval validation loss on
    # --- optimization ---
    batch_size: int = 8                # batch size, in sequences, across all devices
    sequence_length: int = 64 * 1024   # sequence length, in tokens
    num_iterations: int = 1480         # number of iterations to run
    warmup_iters: int = 0              # length of the linear warmup, in iterations
    cooldown_iters: int = 600          # length of the linear cooldown, in iterations
    weight_decay: float = 0
    # --- evaluation and logging ---
    val_loss_every: int = 125          # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens: int = 10485760         # validation tokens; keep fixed for consistent comparisons
    save_every: int = 0                # every how many steps to save the checkpoint? 0 for only at the end


args = Hyperparameters()

# set up DDP (distributed data parallel); torchrun provides RANK, LOCAL_RANK and WORLD_SIZE
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank, ddp_local_rank, ddp_world_size = (
    int(os.environ[name]) for name in ('RANK', 'LOCAL_RANK', 'WORLD_SIZE')
)
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
# rank 0 is the one process that does logging, checkpointing etc.
master_process = ddp_rank == 0
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    # The per-run directory above is not created, but the log file itself is
    # written directly under logs/. Make sure that directory exists, otherwise
    # the open() below raises FileNotFoundError on a fresh checkout.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """Append ``s`` plus a newline to the run's log file, on the master process only.

    Args:
        s: text to record.
        logonly: when False (default) the text is also printed to stdout.

    Does nothing on non-master processes.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not implemented: exactly one sequence per rank and step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# prefetch the first training batch; the loop fetches the next one right after each backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# the whole model was just cast to bfloat16; put the CastedLinear modules back to float32
# (they cast their weight to the input dtype on the fly in forward)
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
# `config` here is torch._inductor.config
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# optimizer1: both embedding tables; optimizer2: the output head;
# optimizer3 (Muon): every 2D parameter inside the transformer blocks;
# optimizer4: the remaining <2D block parameters plus the U-net skip weights
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    # Returns the multiplier handed to LambdaLR for iteration `it`:
    # ramps up during warmup, stays at 1.0, then falls linearly to 0 at num_iterations.
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# The window size lives in a device tensor that is updated in place and passed to the model
# on every call; sw_size_prev tracks the value currently stored in it.
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    # NaN while step <= 11, so the step_avg fields in the log print "nan" until then
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # (grows linearly from 64 to 1792 tokens over the run, rounded down to a multiple of 64)
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset() # always evaluate on the same tokens
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average over ranks, then over the validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE: the checkpoint dict is assembled, but the torch.save call below is commented
        # out, so nothing is actually written to disk.
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon (0.85 -> 0.95 over the first 300 steps)
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # (no synchronize here, so this is only an approximate wall-clock reading)
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 75W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 38C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 38C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 92W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 107W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 107W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type 
Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23064ms step_avg:nanms step:2/1480 train_time:23150ms step_avg:nanms step:3/1480 train_time:23288ms step_avg:nanms step:4/1480 train_time:23429ms step_avg:nanms step:5/1480 train_time:23569ms step_avg:nanms step:6/1480 train_time:23710ms step_avg:nanms step:7/1480 train_time:23851ms step_avg:nanms step:8/1480 train_time:23995ms step_avg:nanms step:9/1480 train_time:24140ms step_avg:nanms step:10/1480 train_time:24283ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.72ms step:14/1480 train_time:567ms step_avg:141.67ms step:15/1480 train_time:709ms step_avg:141.89ms step:16/1480 train_time:853ms step_avg:142.11ms step:17/1480 train_time:997ms step_avg:142.40ms step:18/1480 train_time:1141ms step_avg:142.60ms step:19/1480 train_time:1284ms step_avg:142.65ms step:20/1480 train_time:1425ms step_avg:142.47ms step:21/1480 train_time:1566ms step_avg:142.39ms step:22/1480 train_time:1709ms step_avg:142.40ms step:23/1480 train_time:1851ms step_avg:142.37ms step:24/1480 train_time:1993ms step_avg:142.34ms step:25/1480 train_time:2138ms step_avg:142.51ms step:26/1480 train_time:2281ms step_avg:142.57ms step:27/1480 train_time:2423ms step_avg:142.55ms step:28/1480 train_time:2566ms step_avg:142.55ms step:29/1480 train_time:2707ms 
step_avg:142.50ms step:30/1480 train_time:2850ms step_avg:142.48ms step:31/1480 train_time:2992ms step_avg:142.47ms step:32/1480 train_time:3136ms step_avg:142.54ms step:33/1480 train_time:3280ms step_avg:142.62ms step:34/1480 train_time:3425ms step_avg:142.70ms step:35/1480 train_time:3567ms step_avg:142.69ms step:36/1480 train_time:3709ms step_avg:142.64ms step:37/1480 train_time:3850ms step_avg:142.60ms step:38/1480 train_time:3992ms step_avg:142.56ms step:39/1480 train_time:4134ms step_avg:142.54ms step:40/1480 train_time:4278ms step_avg:142.60ms step:41/1480 train_time:4422ms step_avg:142.63ms step:42/1480 train_time:4565ms step_avg:142.67ms step:43/1480 train_time:4707ms step_avg:142.63ms step:44/1480 train_time:4848ms step_avg:142.60ms step:45/1480 train_time:4988ms step_avg:142.53ms step:46/1480 train_time:5132ms step_avg:142.57ms step:47/1480 train_time:5278ms step_avg:142.65ms step:48/1480 train_time:5422ms step_avg:142.69ms step:49/1480 train_time:5566ms step_avg:142.71ms step:50/1480 train_time:5707ms step_avg:142.67ms step:51/1480 train_time:5850ms step_avg:142.69ms step:52/1480 train_time:5991ms step_avg:142.64ms step:53/1480 train_time:6133ms step_avg:142.62ms step:54/1480 train_time:6277ms step_avg:142.66ms step:55/1480 train_time:6421ms step_avg:142.68ms step:56/1480 train_time:6564ms step_avg:142.71ms step:57/1480 train_time:6706ms step_avg:142.69ms step:58/1480 train_time:6847ms step_avg:142.65ms step:59/1480 train_time:6989ms step_avg:142.63ms step:60/1480 train_time:7130ms step_avg:142.61ms step:61/1480 train_time:7275ms step_avg:142.64ms step:62/1480 train_time:7419ms step_avg:142.67ms step:63/1480 train_time:7562ms step_avg:142.68ms step:64/1480 train_time:7705ms step_avg:142.68ms step:65/1480 train_time:7847ms step_avg:142.68ms step:66/1480 train_time:7988ms step_avg:142.65ms step:67/1480 train_time:8130ms step_avg:142.63ms step:68/1480 train_time:8273ms step_avg:142.63ms step:69/1480 train_time:8418ms step_avg:142.67ms step:70/1480 
train_time:8562ms step_avg:142.69ms step:71/1480 train_time:8704ms step_avg:142.69ms step:72/1480 train_time:8847ms step_avg:142.70ms step:73/1480 train_time:8988ms step_avg:142.67ms step:74/1480 train_time:9130ms step_avg:142.65ms step:75/1480 train_time:9274ms step_avg:142.68ms step:76/1480 train_time:9418ms step_avg:142.69ms step:77/1480 train_time:9562ms step_avg:142.72ms step:78/1480 train_time:9706ms step_avg:142.74ms step:79/1480 train_time:9848ms step_avg:142.72ms step:80/1480 train_time:9988ms step_avg:142.69ms step:81/1480 train_time:10130ms step_avg:142.67ms step:82/1480 train_time:10273ms step_avg:142.68ms step:83/1480 train_time:10418ms step_avg:142.71ms step:84/1480 train_time:10563ms step_avg:142.74ms step:85/1480 train_time:10706ms step_avg:142.75ms step:86/1480 train_time:10849ms step_avg:142.75ms step:87/1480 train_time:10989ms step_avg:142.71ms step:88/1480 train_time:11129ms step_avg:142.69ms step:89/1480 train_time:11272ms step_avg:142.68ms step:90/1480 train_time:11416ms step_avg:142.70ms step:91/1480 train_time:11559ms step_avg:142.71ms step:92/1480 train_time:11702ms step_avg:142.71ms step:93/1480 train_time:11845ms step_avg:142.71ms step:94/1480 train_time:11986ms step_avg:142.69ms step:95/1480 train_time:12128ms step_avg:142.68ms step:96/1480 train_time:12269ms step_avg:142.67ms step:97/1480 train_time:12411ms step_avg:142.65ms step:98/1480 train_time:12553ms step_avg:142.64ms step:99/1480 train_time:12694ms step_avg:142.63ms step:100/1480 train_time:12837ms step_avg:142.64ms step:101/1480 train_time:12980ms step_avg:142.64ms step:102/1480 train_time:13121ms step_avg:142.62ms step:103/1480 train_time:13264ms step_avg:142.62ms step:104/1480 train_time:13406ms step_avg:142.61ms step:105/1480 train_time:13548ms step_avg:142.61ms step:106/1480 train_time:13689ms step_avg:142.59ms step:107/1480 train_time:13834ms step_avg:142.61ms step:108/1480 train_time:13978ms step_avg:142.63ms step:109/1480 train_time:14122ms step_avg:142.65ms step:110/1480 
train_time:14264ms step_avg:142.64ms step:111/1480 train_time:14408ms step_avg:142.65ms step:112/1480 train_time:14554ms step_avg:142.69ms step:113/1480 train_time:14701ms step_avg:142.73ms step:114/1480 train_time:14848ms step_avg:142.77ms step:115/1480 train_time:14993ms step_avg:142.79ms step:116/1480 train_time:15142ms step_avg:142.85ms step:117/1480 train_time:15289ms step_avg:142.89ms step:118/1480 train_time:15436ms step_avg:142.93ms step:119/1480 train_time:15583ms step_avg:142.96ms step:120/1480 train_time:15730ms step_avg:143.00ms step:121/1480 train_time:15876ms step_avg:143.03ms step:122/1480 train_time:16023ms step_avg:143.06ms step:123/1480 train_time:16171ms step_avg:143.10ms step:124/1480 train_time:16318ms step_avg:143.14ms step:125/1480 train_time:16465ms step_avg:143.17ms step:125/1480 val_loss:4.4133 train_time:16522ms step_avg:143.67ms step:126/1480 train_time:16621ms step_avg:143.29ms step:127/1480 train_time:16770ms step_avg:143.34ms step:128/1480 train_time:16919ms step_avg:143.38ms step:129/1480 train_time:17064ms step_avg:143.39ms step:130/1480 train_time:17208ms step_avg:143.40ms step:131/1480 train_time:17354ms step_avg:143.42ms step:132/1480 train_time:17500ms step_avg:143.45ms step:133/1480 train_time:17646ms step_avg:143.47ms step:134/1480 train_time:17796ms step_avg:143.51ms step:135/1480 train_time:17946ms step_avg:143.57ms step:136/1480 train_time:18089ms step_avg:143.57ms step:137/1480 train_time:18236ms step_avg:143.59ms step:138/1480 train_time:18384ms step_avg:143.63ms step:139/1480 train_time:18529ms step_avg:143.64ms step:140/1480 train_time:18677ms step_avg:143.67ms step:141/1480 train_time:18823ms step_avg:143.69ms step:142/1480 train_time:18970ms step_avg:143.72ms step:143/1480 train_time:19118ms step_avg:143.75ms step:144/1480 train_time:19265ms step_avg:143.77ms step:145/1480 train_time:19412ms step_avg:143.79ms step:146/1480 train_time:19558ms step_avg:143.81ms step:147/1480 train_time:19706ms step_avg:143.84ms 
step:148/1480 train_time:19853ms step_avg:143.86ms step:149/1480 train_time:20000ms step_avg:143.89ms step:150/1480 train_time:20146ms step_avg:143.90ms step:151/1480 train_time:20295ms step_avg:143.94ms step:152/1480 train_time:20442ms step_avg:143.95ms step:153/1480 train_time:20587ms step_avg:143.97ms step:154/1480 train_time:20735ms step_avg:143.99ms step:155/1480 train_time:20882ms step_avg:144.01ms step:156/1480 train_time:21027ms step_avg:144.02ms step:157/1480 train_time:21175ms step_avg:144.05ms step:158/1480 train_time:21322ms step_avg:144.06ms step:159/1480 train_time:21467ms step_avg:144.07ms step:160/1480 train_time:21615ms step_avg:144.10ms step:161/1480 train_time:21762ms step_avg:144.12ms step:162/1480 train_time:21909ms step_avg:144.14ms step:163/1480 train_time:22056ms step_avg:144.16ms step:164/1480 train_time:22203ms step_avg:144.18ms step:165/1480 train_time:22351ms step_avg:144.20ms step:166/1480 train_time:22498ms step_avg:144.22ms step:167/1480 train_time:22644ms step_avg:144.23ms step:168/1480 train_time:22790ms step_avg:144.24ms step:169/1480 train_time:22938ms step_avg:144.27ms step:170/1480 train_time:23085ms step_avg:144.28ms step:171/1480 train_time:23232ms step_avg:144.30ms step:172/1480 train_time:23379ms step_avg:144.31ms step:173/1480 train_time:23525ms step_avg:144.32ms step:174/1480 train_time:23671ms step_avg:144.33ms step:175/1480 train_time:23819ms step_avg:144.36ms step:176/1480 train_time:23965ms step_avg:144.36ms step:177/1480 train_time:24112ms step_avg:144.38ms step:178/1480 train_time:24258ms step_avg:144.40ms step:179/1480 train_time:24406ms step_avg:144.41ms step:180/1480 train_time:24554ms step_avg:144.44ms step:181/1480 train_time:24701ms step_avg:144.45ms step:182/1480 train_time:24848ms step_avg:144.46ms step:183/1480 train_time:24995ms step_avg:144.48ms step:184/1480 train_time:25142ms step_avg:144.49ms step:185/1480 train_time:25288ms step_avg:144.50ms step:186/1480 train_time:25435ms step_avg:144.52ms 
step:187/1480 train_time:25581ms step_avg:144.53ms step:188/1480 train_time:25727ms step_avg:144.53ms step:189/1480 train_time:25874ms step_avg:144.55ms step:190/1480 train_time:26021ms step_avg:144.56ms step:191/1480 train_time:26167ms step_avg:144.57ms step:192/1480 train_time:26316ms step_avg:144.60ms step:193/1480 train_time:26462ms step_avg:144.60ms step:194/1480 train_time:26609ms step_avg:144.61ms step:195/1480 train_time:26756ms step_avg:144.63ms step:196/1480 train_time:26904ms step_avg:144.64ms step:197/1480 train_time:27051ms step_avg:144.66ms step:198/1480 train_time:27199ms step_avg:144.67ms step:199/1480 train_time:27346ms step_avg:144.69ms step:200/1480 train_time:27494ms step_avg:144.71ms step:201/1480 train_time:27641ms step_avg:144.72ms step:202/1480 train_time:27785ms step_avg:144.72ms step:203/1480 train_time:27932ms step_avg:144.72ms step:204/1480 train_time:28079ms step_avg:144.73ms step:205/1480 train_time:28225ms step_avg:144.74ms step:206/1480 train_time:28372ms step_avg:144.76ms step:207/1480 train_time:28520ms step_avg:144.77ms step:208/1480 train_time:28666ms step_avg:144.78ms step:209/1480 train_time:28814ms step_avg:144.79ms step:210/1480 train_time:28960ms step_avg:144.80ms step:211/1480 train_time:29105ms step_avg:144.80ms step:212/1480 train_time:29254ms step_avg:144.82ms step:213/1480 train_time:29401ms step_avg:144.83ms step:214/1480 train_time:29546ms step_avg:144.83ms step:215/1480 train_time:29693ms step_avg:144.84ms step:216/1480 train_time:29840ms step_avg:144.85ms step:217/1480 train_time:29986ms step_avg:144.86ms step:218/1480 train_time:30133ms step_avg:144.87ms step:219/1480 train_time:30280ms step_avg:144.88ms step:220/1480 train_time:30426ms step_avg:144.88ms step:221/1480 train_time:30575ms step_avg:144.90ms step:222/1480 train_time:30727ms step_avg:144.94ms step:223/1480 train_time:30878ms step_avg:144.97ms step:224/1480 train_time:31028ms step_avg:144.99ms step:225/1480 train_time:31178ms step_avg:145.01ms 
step:226/1480 train_time:31328ms step_avg:145.04ms step:227/1480 train_time:31478ms step_avg:145.06ms step:228/1480 train_time:31628ms step_avg:145.08ms step:229/1480 train_time:31780ms step_avg:145.11ms step:230/1480 train_time:31931ms step_avg:145.14ms step:231/1480 train_time:32082ms step_avg:145.17ms step:232/1480 train_time:32231ms step_avg:145.19ms step:233/1480 train_time:32382ms step_avg:145.21ms step:234/1480 train_time:32533ms step_avg:145.23ms step:235/1480 train_time:32684ms step_avg:145.26ms step:236/1480 train_time:32833ms step_avg:145.28ms step:237/1480 train_time:32984ms step_avg:145.31ms step:238/1480 train_time:33135ms step_avg:145.33ms step:239/1480 train_time:33285ms step_avg:145.35ms step:240/1480 train_time:33436ms step_avg:145.37ms step:241/1480 train_time:33586ms step_avg:145.39ms step:242/1480 train_time:33737ms step_avg:145.42ms step:243/1480 train_time:33887ms step_avg:145.44ms step:244/1480 train_time:34037ms step_avg:145.46ms step:245/1480 train_time:34187ms step_avg:145.48ms step:246/1480 train_time:34337ms step_avg:145.50ms step:247/1480 train_time:34487ms step_avg:145.52ms step:248/1480 train_time:34637ms step_avg:145.54ms step:249/1480 train_time:34787ms step_avg:145.55ms step:250/1480 train_time:34937ms step_avg:145.57ms step:250/1480 val_loss:3.9896 train_time:34996ms step_avg:145.82ms step:251/1480 train_time:35093ms step_avg:145.61ms step:252/1480 train_time:35244ms step_avg:145.63ms step:253/1480 train_time:35394ms step_avg:145.65ms step:254/1480 train_time:35542ms step_avg:145.66ms step:255/1480 train_time:35691ms step_avg:145.68ms step:256/1480 train_time:35840ms step_avg:145.69ms step:257/1480 train_time:35990ms step_avg:145.71ms step:258/1480 train_time:36142ms step_avg:145.74ms step:259/1480 train_time:36294ms step_avg:145.76ms step:260/1480 train_time:36444ms step_avg:145.78ms step:261/1480 train_time:36593ms step_avg:145.79ms step:262/1480 train_time:36742ms step_avg:145.80ms step:263/1480 train_time:36892ms 
step_avg:145.82ms step:264/1480 train_time:37042ms step_avg:145.83ms step:265/1480 train_time:37194ms step_avg:145.86ms step:266/1480 train_time:37345ms step_avg:145.88ms step:267/1480 train_time:37495ms step_avg:145.89ms step:268/1480 train_time:37644ms step_avg:145.91ms step:269/1480 train_time:37794ms step_avg:145.92ms step:270/1480 train_time:37944ms step_avg:145.94ms step:271/1480 train_time:38094ms step_avg:145.96ms step:272/1480 train_time:38246ms step_avg:145.98ms step:273/1480 train_time:38395ms step_avg:145.99ms step:274/1480 train_time:38544ms step_avg:146.00ms step:275/1480 train_time:38695ms step_avg:146.02ms step:276/1480 train_time:38846ms step_avg:146.04ms step:277/1480 train_time:38995ms step_avg:146.05ms step:278/1480 train_time:39146ms step_avg:146.07ms step:279/1480 train_time:39296ms step_avg:146.08ms step:280/1480 train_time:39447ms step_avg:146.10ms step:281/1480 train_time:39597ms step_avg:146.12ms step:282/1480 train_time:39748ms step_avg:146.13ms step:283/1480 train_time:39898ms step_avg:146.15ms step:284/1480 train_time:40048ms step_avg:146.16ms step:285/1480 train_time:40200ms step_avg:146.18ms step:286/1480 train_time:40350ms step_avg:146.20ms step:287/1480 train_time:40502ms step_avg:146.22ms step:288/1480 train_time:40652ms step_avg:146.23ms step:289/1480 train_time:40803ms step_avg:146.25ms step:290/1480 train_time:40954ms step_avg:146.26ms step:291/1480 train_time:41105ms step_avg:146.28ms step:292/1480 train_time:41255ms step_avg:146.29ms step:293/1480 train_time:41405ms step_avg:146.31ms step:294/1480 train_time:41555ms step_avg:146.32ms step:295/1480 train_time:41706ms step_avg:146.34ms step:296/1480 train_time:41856ms step_avg:146.35ms step:297/1480 train_time:42007ms step_avg:146.37ms step:298/1480 train_time:42157ms step_avg:146.38ms step:299/1480 train_time:42308ms step_avg:146.39ms step:300/1480 train_time:42458ms step_avg:146.41ms step:301/1480 train_time:42607ms step_avg:146.42ms step:302/1480 train_time:42757ms 
step_avg:146.43ms step:303/1480 train_time:42908ms step_avg:146.44ms step:304/1480 train_time:43057ms step_avg:146.45ms step:305/1480 train_time:43208ms step_avg:146.47ms step:306/1480 train_time:43358ms step_avg:146.48ms step:307/1480 train_time:43510ms step_avg:146.50ms step:308/1480 train_time:43659ms step_avg:146.51ms step:309/1480 train_time:43809ms step_avg:146.52ms step:310/1480 train_time:43959ms step_avg:146.53ms step:311/1480 train_time:44110ms step_avg:146.54ms step:312/1480 train_time:44260ms step_avg:146.56ms step:313/1480 train_time:44411ms step_avg:146.57ms step:314/1480 train_time:44559ms step_avg:146.58ms step:315/1480 train_time:44710ms step_avg:146.59ms step:316/1480 train_time:44859ms step_avg:146.60ms step:317/1480 train_time:45010ms step_avg:146.61ms step:318/1480 train_time:45161ms step_avg:146.63ms step:319/1480 train_time:45312ms step_avg:146.64ms step:320/1480 train_time:45462ms step_avg:146.65ms step:321/1480 train_time:45612ms step_avg:146.66ms step:322/1480 train_time:45761ms step_avg:146.67ms step:323/1480 train_time:45911ms step_avg:146.68ms step:324/1480 train_time:46062ms step_avg:146.70ms step:325/1480 train_time:46214ms step_avg:146.71ms step:326/1480 train_time:46365ms step_avg:146.72ms step:327/1480 train_time:46514ms step_avg:146.73ms step:328/1480 train_time:46666ms step_avg:146.75ms step:329/1480 train_time:46817ms step_avg:146.76ms step:330/1480 train_time:46970ms step_avg:146.78ms step:331/1480 train_time:47125ms step_avg:146.81ms step:332/1480 train_time:47280ms step_avg:146.83ms step:333/1480 train_time:47433ms step_avg:146.85ms step:334/1480 train_time:47586ms step_avg:146.87ms step:335/1480 train_time:47740ms step_avg:146.89ms step:336/1480 train_time:47894ms step_avg:146.91ms step:337/1480 train_time:48048ms step_avg:146.94ms step:338/1480 train_time:48203ms step_avg:146.96ms step:339/1480 train_time:48356ms step_avg:146.98ms step:340/1480 train_time:48510ms step_avg:147.00ms step:341/1480 train_time:48665ms 
step_avg:147.02ms step:342/1480 train_time:48819ms step_avg:147.05ms step:343/1480 train_time:48972ms step_avg:147.06ms step:344/1480 train_time:49127ms step_avg:147.09ms step:345/1480 train_time:49282ms step_avg:147.11ms step:346/1480 train_time:49435ms step_avg:147.13ms step:347/1480 train_time:49590ms step_avg:147.15ms step:348/1480 train_time:49744ms step_avg:147.17ms step:349/1480 train_time:49898ms step_avg:147.19ms step:350/1480 train_time:50052ms step_avg:147.21ms step:351/1480 train_time:50205ms step_avg:147.23ms step:352/1480 train_time:50359ms step_avg:147.25ms step:353/1480 train_time:50512ms step_avg:147.27ms step:354/1480 train_time:50666ms step_avg:147.28ms step:355/1480 train_time:50821ms step_avg:147.31ms step:356/1480 train_time:50973ms step_avg:147.32ms step:357/1480 train_time:51128ms step_avg:147.34ms step:358/1480 train_time:51281ms step_avg:147.36ms step:359/1480 train_time:51436ms step_avg:147.38ms step:360/1480 train_time:51591ms step_avg:147.40ms step:361/1480 train_time:51745ms step_avg:147.42ms step:362/1480 train_time:51899ms step_avg:147.44ms step:363/1480 train_time:52052ms step_avg:147.46ms step:364/1480 train_time:52206ms step_avg:147.47ms step:365/1480 train_time:52361ms step_avg:147.50ms step:366/1480 train_time:52514ms step_avg:147.51ms step:367/1480 train_time:52667ms step_avg:147.53ms step:368/1480 train_time:52820ms step_avg:147.54ms step:369/1480 train_time:52973ms step_avg:147.56ms step:370/1480 train_time:53126ms step_avg:147.57ms step:371/1480 train_time:53280ms step_avg:147.59ms step:372/1480 train_time:53434ms step_avg:147.61ms step:373/1480 train_time:53589ms step_avg:147.63ms step:374/1480 train_time:53744ms step_avg:147.65ms step:375/1480 train_time:53897ms step_avg:147.66ms step:375/1480 val_loss:3.8065 train_time:53958ms step_avg:147.83ms step:376/1480 train_time:54054ms step_avg:147.69ms step:377/1480 train_time:54208ms step_avg:147.71ms step:378/1480 train_time:54362ms step_avg:147.72ms step:379/1480 
train_time:54514ms step_avg:147.73ms step:380/1480 train_time:54667ms step_avg:147.75ms step:381/1480 train_time:54820ms step_avg:147.76ms step:382/1480 train_time:54974ms step_avg:147.78ms step:383/1480 train_time:55129ms step_avg:147.80ms step:384/1480 train_time:55283ms step_avg:147.82ms step:385/1480 train_time:55436ms step_avg:147.83ms step:386/1480 train_time:55589ms step_avg:147.84ms step:387/1480 train_time:55742ms step_avg:147.86ms step:388/1480 train_time:55896ms step_avg:147.87ms step:389/1480 train_time:56049ms step_avg:147.89ms step:390/1480 train_time:56204ms step_avg:147.91ms step:391/1480 train_time:56358ms step_avg:147.92ms step:392/1480 train_time:56510ms step_avg:147.93ms step:393/1480 train_time:56663ms step_avg:147.95ms step:394/1480 train_time:56817ms step_avg:147.96ms step:395/1480 train_time:56970ms step_avg:147.98ms step:396/1480 train_time:57125ms step_avg:147.99ms step:397/1480 train_time:57280ms step_avg:148.01ms step:398/1480 train_time:57434ms step_avg:148.03ms step:399/1480 train_time:57588ms step_avg:148.04ms step:400/1480 train_time:57743ms step_avg:148.06ms step:401/1480 train_time:57897ms step_avg:148.07ms step:402/1480 train_time:58050ms step_avg:148.09ms step:403/1480 train_time:58206ms step_avg:148.11ms step:404/1480 train_time:58361ms step_avg:148.13ms step:405/1480 train_time:58516ms step_avg:148.14ms step:406/1480 train_time:58670ms step_avg:148.16ms step:407/1480 train_time:58824ms step_avg:148.17ms step:408/1480 train_time:58978ms step_avg:148.19ms step:409/1480 train_time:59131ms step_avg:148.20ms step:410/1480 train_time:59285ms step_avg:148.21ms step:411/1480 train_time:59439ms step_avg:148.23ms step:412/1480 train_time:59594ms step_avg:148.24ms step:413/1480 train_time:59747ms step_avg:148.26ms step:414/1480 train_time:59902ms step_avg:148.27ms step:415/1480 train_time:60055ms step_avg:148.28ms step:416/1480 train_time:60208ms step_avg:148.30ms step:417/1480 train_time:60362ms step_avg:148.31ms step:418/1480 
train_time:60517ms step_avg:148.32ms step:419/1480 train_time:60670ms step_avg:148.34ms step:420/1480 train_time:60824ms step_avg:148.35ms step:421/1480 train_time:60979ms step_avg:148.37ms step:422/1480 train_time:61132ms step_avg:148.38ms step:423/1480 train_time:61287ms step_avg:148.39ms step:424/1480 train_time:61440ms step_avg:148.41ms step:425/1480 train_time:61594ms step_avg:148.42ms step:426/1480 train_time:61748ms step_avg:148.43ms step:427/1480 train_time:61902ms step_avg:148.45ms step:428/1480 train_time:62056ms step_avg:148.46ms step:429/1480 train_time:62210ms step_avg:148.47ms step:430/1480 train_time:62363ms step_avg:148.48ms step:431/1480 train_time:62517ms step_avg:148.50ms step:432/1480 train_time:62670ms step_avg:148.51ms step:433/1480 train_time:62824ms step_avg:148.52ms step:434/1480 train_time:62978ms step_avg:148.53ms step:435/1480 train_time:63132ms step_avg:148.54ms step:436/1480 train_time:63287ms step_avg:148.56ms step:437/1480 train_time:63440ms step_avg:148.57ms step:438/1480 train_time:63593ms step_avg:148.58ms step:439/1480 train_time:63747ms step_avg:148.59ms step:440/1480 train_time:63902ms step_avg:148.61ms step:441/1480 train_time:64058ms step_avg:148.63ms step:442/1480 train_time:64216ms step_avg:148.65ms step:443/1480 train_time:64371ms step_avg:148.66ms step:444/1480 train_time:64527ms step_avg:148.68ms step:445/1480 train_time:64684ms step_avg:148.70ms step:446/1480 train_time:64840ms step_avg:148.72ms step:447/1480 train_time:64996ms step_avg:148.73ms step:448/1480 train_time:65152ms step_avg:148.75ms step:449/1480 train_time:65310ms step_avg:148.77ms step:450/1480 train_time:65467ms step_avg:148.79ms step:451/1480 train_time:65626ms step_avg:148.81ms step:452/1480 train_time:65784ms step_avg:148.83ms step:453/1480 train_time:65941ms step_avg:148.85ms step:454/1480 train_time:66099ms step_avg:148.87ms step:455/1480 train_time:66254ms step_avg:148.89ms step:456/1480 train_time:66411ms step_avg:148.90ms step:457/1480 
train_time:66568ms step_avg:148.92ms step:458/1480 train_time:66725ms step_avg:148.94ms step:459/1480 train_time:66884ms step_avg:148.96ms step:460/1480 train_time:67042ms step_avg:148.98ms step:461/1480 train_time:67203ms step_avg:149.01ms step:462/1480 train_time:67361ms step_avg:149.03ms step:463/1480 train_time:67519ms step_avg:149.05ms step:464/1480 train_time:67675ms step_avg:149.06ms step:465/1480 train_time:67830ms step_avg:149.08ms step:466/1480 train_time:67985ms step_avg:149.09ms step:467/1480 train_time:68145ms step_avg:149.11ms step:468/1480 train_time:68303ms step_avg:149.13ms step:469/1480 train_time:68460ms step_avg:149.15ms step:470/1480 train_time:68618ms step_avg:149.17ms step:471/1480 train_time:68775ms step_avg:149.19ms step:472/1480 train_time:68932ms step_avg:149.20ms step:473/1480 train_time:69088ms step_avg:149.22ms step:474/1480 train_time:69243ms step_avg:149.23ms step:475/1480 train_time:69401ms step_avg:149.25ms step:476/1480 train_time:69558ms step_avg:149.27ms step:477/1480 train_time:69715ms step_avg:149.28ms step:478/1480 train_time:69871ms step_avg:149.30ms step:479/1480 train_time:70027ms step_avg:149.31ms step:480/1480 train_time:70185ms step_avg:149.33ms step:481/1480 train_time:70343ms step_avg:149.35ms step:482/1480 train_time:70500ms step_avg:149.36ms step:483/1480 train_time:70655ms step_avg:149.38ms step:484/1480 train_time:70812ms step_avg:149.39ms step:485/1480 train_time:70968ms step_avg:149.41ms step:486/1480 train_time:71125ms step_avg:149.42ms step:487/1480 train_time:71283ms step_avg:149.44ms step:488/1480 train_time:71441ms step_avg:149.46ms step:489/1480 train_time:71597ms step_avg:149.47ms step:490/1480 train_time:71752ms step_avg:149.48ms step:491/1480 train_time:71909ms step_avg:149.50ms step:492/1480 train_time:72066ms step_avg:149.51ms step:493/1480 train_time:72225ms step_avg:149.53ms step:494/1480 train_time:72384ms step_avg:149.55ms step:495/1480 train_time:72542ms step_avg:149.57ms step:496/1480 
train_time:72699ms step_avg:149.59ms step:497/1480 train_time:72855ms step_avg:149.60ms step:498/1480 train_time:73012ms step_avg:149.61ms step:499/1480 train_time:73168ms step_avg:149.63ms step:500/1480 train_time:73325ms step_avg:149.64ms step:500/1480 val_loss:3.6837 train_time:73387ms step_avg:149.77ms step:501/1480 train_time:73486ms step_avg:149.67ms step:502/1480 train_time:73642ms step_avg:149.68ms step:503/1480 train_time:73798ms step_avg:149.69ms step:504/1480 train_time:73954ms step_avg:149.70ms step:505/1480 train_time:74110ms step_avg:149.72ms step:506/1480 train_time:74266ms step_avg:149.73ms step:507/1480 train_time:74420ms step_avg:149.74ms step:508/1480 train_time:74580ms step_avg:149.76ms step:509/1480 train_time:74737ms step_avg:149.77ms step:510/1480 train_time:74894ms step_avg:149.79ms step:511/1480 train_time:75051ms step_avg:149.80ms step:512/1480 train_time:75209ms step_avg:149.82ms step:513/1480 train_time:75365ms step_avg:149.83ms step:514/1480 train_time:75521ms step_avg:149.84ms step:515/1480 train_time:75677ms step_avg:149.86ms step:516/1480 train_time:75835ms step_avg:149.87ms step:517/1480 train_time:75992ms step_avg:149.89ms step:518/1480 train_time:76149ms step_avg:149.90ms step:519/1480 train_time:76305ms step_avg:149.91ms step:520/1480 train_time:76462ms step_avg:149.93ms step:521/1480 train_time:76620ms step_avg:149.94ms step:522/1480 train_time:76778ms step_avg:149.96ms step:523/1480 train_time:76936ms step_avg:149.97ms step:524/1480 train_time:77094ms step_avg:149.99ms step:525/1480 train_time:77251ms step_avg:150.00ms step:526/1480 train_time:77411ms step_avg:150.02ms step:527/1480 train_time:77566ms step_avg:150.03ms step:528/1480 train_time:77722ms step_avg:150.04ms step:529/1480 train_time:77879ms step_avg:150.06ms step:530/1480 train_time:78035ms step_avg:150.07ms step:531/1480 train_time:78191ms step_avg:150.08ms step:532/1480 train_time:78346ms step_avg:150.09ms step:533/1480 train_time:78502ms step_avg:150.10ms 
step:534/1480 train_time:78658ms step_avg:150.11ms step:535/1480 train_time:78816ms step_avg:150.13ms step:536/1480 train_time:78974ms step_avg:150.14ms step:537/1480 train_time:79130ms step_avg:150.15ms step:538/1480 train_time:79287ms step_avg:150.17ms step:539/1480 train_time:79445ms step_avg:150.18ms step:540/1480 train_time:79602ms step_avg:150.19ms step:541/1480 train_time:79758ms step_avg:150.20ms step:542/1480 train_time:79917ms step_avg:150.22ms step:543/1480 train_time:80074ms step_avg:150.23ms step:544/1480 train_time:80233ms step_avg:150.25ms step:545/1480 train_time:80389ms step_avg:150.26ms step:546/1480 train_time:80545ms step_avg:150.27ms step:547/1480 train_time:80700ms step_avg:150.28ms step:548/1480 train_time:80857ms step_avg:150.29ms step:549/1480 train_time:81016ms step_avg:150.31ms step:550/1480 train_time:81175ms step_avg:150.32ms step:551/1480 train_time:81333ms step_avg:150.34ms step:552/1480 train_time:81492ms step_avg:150.35ms step:553/1480 train_time:81653ms step_avg:150.37ms step:554/1480 train_time:81812ms step_avg:150.39ms step:555/1480 train_time:81972ms step_avg:150.41ms step:556/1480 train_time:82131ms step_avg:150.42ms step:557/1480 train_time:82292ms step_avg:150.44ms step:558/1480 train_time:82453ms step_avg:150.46ms step:559/1480 train_time:82614ms step_avg:150.48ms step:560/1480 train_time:82774ms step_avg:150.50ms step:561/1480 train_time:82934ms step_avg:150.52ms step:562/1480 train_time:83094ms step_avg:150.53ms step:563/1480 train_time:83254ms step_avg:150.55ms step:564/1480 train_time:83414ms step_avg:150.57ms step:565/1480 train_time:83574ms step_avg:150.58ms step:566/1480 train_time:83735ms step_avg:150.60ms step:567/1480 train_time:83895ms step_avg:150.62ms step:568/1480 train_time:84054ms step_avg:150.63ms step:569/1480 train_time:84214ms step_avg:150.65ms step:570/1480 train_time:84374ms step_avg:150.67ms step:571/1480 train_time:84534ms step_avg:150.68ms step:572/1480 train_time:84695ms step_avg:150.70ms 
step:573/1480 train_time:84856ms step_avg:150.72ms step:574/1480 train_time:85017ms step_avg:150.74ms step:575/1480 train_time:85177ms step_avg:150.76ms step:576/1480 train_time:85337ms step_avg:150.77ms step:577/1480 train_time:85497ms step_avg:150.79ms step:578/1480 train_time:85656ms step_avg:150.80ms step:579/1480 train_time:85818ms step_avg:150.82ms step:580/1480 train_time:85977ms step_avg:150.84ms step:581/1480 train_time:86137ms step_avg:150.85ms step:582/1480 train_time:86296ms step_avg:150.87ms step:583/1480 train_time:86454ms step_avg:150.88ms step:584/1480 train_time:86615ms step_avg:150.90ms step:585/1480 train_time:86774ms step_avg:150.91ms step:586/1480 train_time:86934ms step_avg:150.93ms step:587/1480 train_time:87095ms step_avg:150.94ms step:588/1480 train_time:87254ms step_avg:150.96ms step:589/1480 train_time:87415ms step_avg:150.98ms step:590/1480 train_time:87576ms step_avg:150.99ms step:591/1480 train_time:87735ms step_avg:151.01ms step:592/1480 train_time:87895ms step_avg:151.02ms step:593/1480 train_time:88056ms step_avg:151.04ms step:594/1480 train_time:88218ms step_avg:151.06ms step:595/1480 train_time:88379ms step_avg:151.08ms step:596/1480 train_time:88540ms step_avg:151.09ms step:597/1480 train_time:88699ms step_avg:151.11ms step:598/1480 train_time:88856ms step_avg:151.12ms step:599/1480 train_time:89015ms step_avg:151.13ms step:600/1480 train_time:89175ms step_avg:151.14ms step:601/1480 train_time:89335ms step_avg:151.16ms step:602/1480 train_time:89494ms step_avg:151.17ms step:603/1480 train_time:89654ms step_avg:151.19ms step:604/1480 train_time:89815ms step_avg:151.20ms step:605/1480 train_time:89975ms step_avg:151.22ms step:606/1480 train_time:90137ms step_avg:151.24ms step:607/1480 train_time:90299ms step_avg:151.26ms step:608/1480 train_time:90458ms step_avg:151.27ms step:609/1480 train_time:90618ms step_avg:151.28ms step:610/1480 train_time:90776ms step_avg:151.29ms step:611/1480 train_time:90938ms step_avg:151.31ms 
step:612/1480 train_time:91097ms step_avg:151.32ms step:613/1480 train_time:91258ms step_avg:151.34ms step:614/1480 train_time:91418ms step_avg:151.35ms step:615/1480 train_time:91576ms step_avg:151.37ms step:616/1480 train_time:91735ms step_avg:151.38ms step:617/1480 train_time:91896ms step_avg:151.39ms step:618/1480 train_time:92055ms step_avg:151.41ms step:619/1480 train_time:92217ms step_avg:151.42ms step:620/1480 train_time:92377ms step_avg:151.44ms step:621/1480 train_time:92537ms step_avg:151.45ms step:622/1480 train_time:92697ms step_avg:151.47ms step:623/1480 train_time:92857ms step_avg:151.48ms step:624/1480 train_time:93016ms step_avg:151.49ms step:625/1480 train_time:93175ms step_avg:151.50ms step:625/1480 val_loss:3.6029 train_time:93239ms step_avg:151.61ms step:626/1480 train_time:93338ms step_avg:151.52ms step:627/1480 train_time:93498ms step_avg:151.54ms step:628/1480 train_time:93656ms step_avg:151.55ms step:629/1480 train_time:93813ms step_avg:151.56ms step:630/1480 train_time:93970ms step_avg:151.57ms step:631/1480 train_time:94128ms step_avg:151.57ms step:632/1480 train_time:94288ms step_avg:151.59ms step:633/1480 train_time:94448ms step_avg:151.60ms step:634/1480 train_time:94608ms step_avg:151.62ms step:635/1480 train_time:94769ms step_avg:151.63ms step:636/1480 train_time:94927ms step_avg:151.64ms step:637/1480 train_time:95087ms step_avg:151.65ms step:638/1480 train_time:95246ms step_avg:151.67ms step:639/1480 train_time:95406ms step_avg:151.68ms step:640/1480 train_time:95567ms step_avg:151.69ms step:641/1480 train_time:95726ms step_avg:151.71ms step:642/1480 train_time:95886ms step_avg:151.72ms step:643/1480 train_time:96047ms step_avg:151.73ms step:644/1480 train_time:96206ms step_avg:151.74ms step:645/1480 train_time:96365ms step_avg:151.76ms step:646/1480 train_time:96525ms step_avg:151.77ms step:647/1480 train_time:96686ms step_avg:151.78ms step:648/1480 train_time:96848ms step_avg:151.80ms step:649/1480 train_time:97007ms 
step_avg:151.81ms step:650/1480 train_time:97167ms step_avg:151.82ms step:651/1480 train_time:97327ms step_avg:151.84ms step:652/1480 train_time:97488ms step_avg:151.85ms step:653/1480 train_time:97646ms step_avg:151.86ms step:654/1480 train_time:97806ms step_avg:151.87ms step:655/1480 train_time:97966ms step_avg:151.89ms step:656/1480 train_time:98126ms step_avg:151.90ms step:657/1480 train_time:98287ms step_avg:151.91ms step:658/1480 train_time:98447ms step_avg:151.92ms step:659/1480 train_time:98610ms step_avg:151.94ms step:660/1480 train_time:98771ms step_avg:151.96ms step:661/1480 train_time:98934ms step_avg:151.97ms step:662/1480 train_time:99093ms step_avg:151.98ms step:663/1480 train_time:99253ms step_avg:152.00ms step:664/1480 train_time:99415ms step_avg:152.01ms step:665/1480 train_time:99578ms step_avg:152.03ms step:666/1480 train_time:99738ms step_avg:152.04ms step:667/1480 train_time:99899ms step_avg:152.05ms step:668/1480 train_time:100062ms step_avg:152.07ms step:669/1480 train_time:100223ms step_avg:152.08ms step:670/1480 train_time:100386ms step_avg:152.10ms step:671/1480 train_time:100547ms step_avg:152.11ms step:672/1480 train_time:100709ms step_avg:152.13ms step:673/1480 train_time:100870ms step_avg:152.14ms step:674/1480 train_time:101031ms step_avg:152.16ms step:675/1480 train_time:101192ms step_avg:152.17ms step:676/1480 train_time:101354ms step_avg:152.18ms step:677/1480 train_time:101514ms step_avg:152.20ms step:678/1480 train_time:101677ms step_avg:152.21ms step:679/1480 train_time:101838ms step_avg:152.22ms step:680/1480 train_time:101999ms step_avg:152.24ms step:681/1480 train_time:102160ms step_avg:152.25ms step:682/1480 train_time:102323ms step_avg:152.27ms step:683/1480 train_time:102488ms step_avg:152.28ms step:684/1480 train_time:102649ms step_avg:152.30ms step:685/1480 train_time:102812ms step_avg:152.31ms step:686/1480 train_time:102973ms step_avg:152.33ms step:687/1480 train_time:103132ms step_avg:152.34ms step:688/1480 
train_time:103294ms step_avg:152.35ms step:689/1480 train_time:103456ms step_avg:152.36ms step:690/1480 train_time:103621ms step_avg:152.38ms step:691/1480 train_time:103784ms step_avg:152.40ms step:692/1480 train_time:103946ms step_avg:152.41ms step:693/1480 train_time:104108ms step_avg:152.43ms step:694/1480 train_time:104270ms step_avg:152.44ms step:695/1480 train_time:104430ms step_avg:152.45ms step:696/1480 train_time:104590ms step_avg:152.46ms step:697/1480 train_time:104753ms step_avg:152.48ms step:698/1480 train_time:104913ms step_avg:152.49ms step:699/1480 train_time:105075ms step_avg:152.50ms step:700/1480 train_time:105237ms step_avg:152.52ms step:701/1480 train_time:105398ms step_avg:152.53ms step:702/1480 train_time:105560ms step_avg:152.54ms step:703/1480 train_time:105721ms step_avg:152.56ms step:704/1480 train_time:105882ms step_avg:152.57ms step:705/1480 train_time:106045ms step_avg:152.58ms step:706/1480 train_time:106208ms step_avg:152.60ms step:707/1480 train_time:106370ms step_avg:152.61ms step:708/1480 train_time:106531ms step_avg:152.62ms step:709/1480 train_time:106692ms step_avg:152.64ms step:710/1480 train_time:106851ms step_avg:152.64ms step:711/1480 train_time:107014ms step_avg:152.66ms step:712/1480 train_time:107179ms step_avg:152.68ms step:713/1480 train_time:107343ms step_avg:152.69ms step:714/1480 train_time:107505ms step_avg:152.71ms step:715/1480 train_time:107665ms step_avg:152.72ms step:716/1480 train_time:107826ms step_avg:152.73ms step:717/1480 train_time:107989ms step_avg:152.74ms step:718/1480 train_time:108148ms step_avg:152.75ms step:719/1480 train_time:108310ms step_avg:152.76ms step:720/1480 train_time:108473ms step_avg:152.78ms step:721/1480 train_time:108634ms step_avg:152.79ms step:722/1480 train_time:108795ms step_avg:152.80ms step:723/1480 train_time:108955ms step_avg:152.81ms step:724/1480 train_time:109115ms step_avg:152.82ms step:725/1480 train_time:109280ms step_avg:152.84ms step:726/1480 train_time:109446ms 
step_avg:152.86ms step:727/1480 train_time:109609ms step_avg:152.87ms step:728/1480 train_time:109769ms step_avg:152.88ms step:729/1480 train_time:109929ms step_avg:152.89ms step:730/1480 train_time:110092ms step_avg:152.91ms step:731/1480 train_time:110253ms step_avg:152.92ms step:732/1480 train_time:110414ms step_avg:152.93ms step:733/1480 train_time:110576ms step_avg:152.94ms step:734/1480 train_time:110737ms step_avg:152.95ms step:735/1480 train_time:110897ms step_avg:152.96ms step:736/1480 train_time:111059ms step_avg:152.97ms step:737/1480 train_time:111221ms step_avg:152.99ms step:738/1480 train_time:111386ms step_avg:153.00ms step:739/1480 train_time:111549ms step_avg:153.02ms step:740/1480 train_time:111713ms step_avg:153.03ms step:741/1480 train_time:111874ms step_avg:153.04ms step:742/1480 train_time:112035ms step_avg:153.05ms step:743/1480 train_time:112195ms step_avg:153.06ms step:744/1480 train_time:112358ms step_avg:153.08ms step:745/1480 train_time:112524ms step_avg:153.09ms step:746/1480 train_time:112687ms step_avg:153.11ms step:747/1480 train_time:112847ms step_avg:153.12ms step:748/1480 train_time:113013ms step_avg:153.13ms step:749/1480 train_time:113176ms step_avg:153.15ms step:750/1480 train_time:113336ms step_avg:153.16ms step:750/1480 val_loss:3.5480 train_time:113400ms step_avg:153.24ms step:751/1480 train_time:113500ms step_avg:153.17ms step:752/1480 train_time:113662ms step_avg:153.18ms step:753/1480 train_time:113823ms step_avg:153.19ms step:754/1480 train_time:113983ms step_avg:153.20ms step:755/1480 train_time:114144ms step_avg:153.21ms step:756/1480 train_time:114305ms step_avg:153.22ms step:757/1480 train_time:114469ms step_avg:153.24ms step:758/1480 train_time:114632ms step_avg:153.25ms step:759/1480 train_time:114794ms step_avg:153.26ms step:760/1480 train_time:114957ms step_avg:153.28ms step:761/1480 train_time:115120ms step_avg:153.29ms step:762/1480 train_time:115282ms step_avg:153.30ms step:763/1480 train_time:115444ms 
step_avg:153.31ms step:764/1480 train_time:115605ms step_avg:153.32ms step:765/1480 train_time:115765ms step_avg:153.33ms step:766/1480 train_time:115927ms step_avg:153.34ms step:767/1480 train_time:116089ms step_avg:153.35ms step:768/1480 train_time:116253ms step_avg:153.37ms step:769/1480 train_time:116416ms step_avg:153.38ms step:770/1480 train_time:116579ms step_avg:153.39ms step:771/1480 train_time:116742ms step_avg:153.41ms step:772/1480 train_time:116904ms step_avg:153.42ms step:773/1480 train_time:117066ms step_avg:153.43ms step:774/1480 train_time:117228ms step_avg:153.44ms step:775/1480 train_time:117390ms step_avg:153.45ms step:776/1480 train_time:117555ms step_avg:153.47ms step:777/1480 train_time:117722ms step_avg:153.48ms step:778/1480 train_time:117886ms step_avg:153.50ms step:779/1480 train_time:118048ms step_avg:153.51ms step:780/1480 train_time:118211ms step_avg:153.52ms step:781/1480 train_time:118375ms step_avg:153.53ms step:782/1480 train_time:118539ms step_avg:153.55ms step:783/1480 train_time:118699ms step_avg:153.56ms step:784/1480 train_time:118862ms step_avg:153.57ms step:785/1480 train_time:119024ms step_avg:153.58ms step:786/1480 train_time:119188ms step_avg:153.59ms step:787/1480 train_time:119352ms step_avg:153.61ms step:788/1480 train_time:119516ms step_avg:153.62ms step:789/1480 train_time:119679ms step_avg:153.63ms step:790/1480 train_time:119845ms step_avg:153.65ms step:791/1480 train_time:120015ms step_avg:153.67ms step:792/1480 train_time:120180ms step_avg:153.68ms step:793/1480 train_time:120341ms step_avg:153.69ms step:794/1480 train_time:120504ms step_avg:153.70ms step:795/1480 train_time:120670ms step_avg:153.72ms step:796/1480 train_time:120837ms step_avg:153.74ms step:797/1480 train_time:121000ms step_avg:153.75ms step:798/1480 train_time:121164ms step_avg:153.76ms step:799/1480 train_time:121333ms step_avg:153.78ms step:800/1480 train_time:121497ms step_avg:153.79ms step:801/1480 train_time:121661ms step_avg:153.81ms 
step:802/1480 train_time:121828ms step_avg:153.82ms step:803/1480 train_time:121992ms step_avg:153.84ms step:804/1480 train_time:122155ms step_avg:153.85ms step:805/1480 train_time:122320ms step_avg:153.86ms step:806/1480 train_time:122483ms step_avg:153.87ms step:807/1480 train_time:122644ms step_avg:153.88ms step:808/1480 train_time:122808ms step_avg:153.90ms step:809/1480 train_time:122971ms step_avg:153.91ms step:810/1480 train_time:123133ms step_avg:153.92ms step:811/1480 train_time:123296ms step_avg:153.93ms step:812/1480 train_time:123460ms step_avg:153.94ms step:813/1480 train_time:123620ms step_avg:153.95ms step:814/1480 train_time:123783ms step_avg:153.96ms step:815/1480 train_time:123946ms step_avg:153.97ms step:816/1480 train_time:124112ms step_avg:153.99ms step:817/1480 train_time:124275ms step_avg:154.00ms step:818/1480 train_time:124438ms step_avg:154.01ms step:819/1480 train_time:124600ms step_avg:154.02ms step:820/1480 train_time:124763ms step_avg:154.03ms step:821/1480 train_time:124926ms step_avg:154.04ms step:822/1480 train_time:125091ms step_avg:154.05ms step:823/1480 train_time:125255ms step_avg:154.06ms step:824/1480 train_time:125418ms step_avg:154.08ms step:825/1480 train_time:125583ms step_avg:154.09ms step:826/1480 train_time:125749ms step_avg:154.10ms step:827/1480 train_time:125913ms step_avg:154.12ms step:828/1480 train_time:126076ms step_avg:154.13ms step:829/1480 train_time:126241ms step_avg:154.14ms step:830/1480 train_time:126405ms step_avg:154.15ms step:831/1480 train_time:126569ms step_avg:154.16ms step:832/1480 train_time:126734ms step_avg:154.18ms step:833/1480 train_time:126899ms step_avg:154.19ms step:834/1480 train_time:127064ms step_avg:154.20ms step:835/1480 train_time:127227ms step_avg:154.21ms step:836/1480 train_time:127392ms step_avg:154.23ms step:837/1480 train_time:127554ms step_avg:154.24ms step:838/1480 train_time:127719ms step_avg:154.25ms step:839/1480 train_time:127882ms step_avg:154.26ms step:840/1480 
train_time:128043ms step_avg:154.27ms step:841/1480 train_time:128204ms step_avg:154.28ms step:842/1480 train_time:128368ms step_avg:154.29ms step:843/1480 train_time:128530ms step_avg:154.30ms step:844/1480 train_time:128693ms step_avg:154.31ms step:845/1480 train_time:128860ms step_avg:154.32ms step:846/1480 train_time:129026ms step_avg:154.34ms step:847/1480 train_time:129190ms step_avg:154.35ms step:848/1480 train_time:129352ms step_avg:154.36ms step:849/1480 train_time:129516ms step_avg:154.37ms step:850/1480 train_time:129680ms step_avg:154.38ms step:851/1480 train_time:129845ms step_avg:154.39ms step:852/1480 train_time:130007ms step_avg:154.40ms step:853/1480 train_time:130169ms step_avg:154.41ms step:854/1480 train_time:130332ms step_avg:154.42ms step:855/1480 train_time:130495ms step_avg:154.43ms step:856/1480 train_time:130659ms step_avg:154.44ms step:857/1480 train_time:130824ms step_avg:154.46ms step:858/1480 train_time:130989ms step_avg:154.47ms step:859/1480 train_time:131154ms step_avg:154.48ms step:860/1480 train_time:131316ms step_avg:154.49ms step:861/1480 train_time:131482ms step_avg:154.50ms step:862/1480 train_time:131652ms step_avg:154.52ms step:863/1480 train_time:131819ms step_avg:154.54ms step:864/1480 train_time:131982ms step_avg:154.55ms step:865/1480 train_time:132143ms step_avg:154.55ms step:866/1480 train_time:132309ms step_avg:154.57ms step:867/1480 train_time:132472ms step_avg:154.58ms step:868/1480 train_time:132634ms step_avg:154.59ms step:869/1480 train_time:132797ms step_avg:154.59ms step:870/1480 train_time:132960ms step_avg:154.61ms step:871/1480 train_time:133123ms step_avg:154.61ms step:872/1480 train_time:133287ms step_avg:154.63ms step:873/1480 train_time:133449ms step_avg:154.63ms step:874/1480 train_time:133617ms step_avg:154.65ms step:875/1480 train_time:133782ms step_avg:154.66ms step:875/1480 val_loss:3.5023 train_time:133847ms step_avg:154.74ms step:876/1480 train_time:133948ms step_avg:154.67ms step:877/1480 
train_time:134113ms step_avg:154.69ms step:878/1480 train_time:134276ms step_avg:154.70ms step:879/1480 train_time:134440ms step_avg:154.71ms step:880/1480 train_time:134604ms step_avg:154.72ms step:881/1480 train_time:134767ms step_avg:154.73ms step:882/1480 train_time:134931ms step_avg:154.74ms step:883/1480 train_time:135097ms step_avg:154.75ms step:884/1480 train_time:135265ms step_avg:154.77ms step:885/1480 train_time:135430ms step_avg:154.78ms step:886/1480 train_time:135595ms step_avg:154.79ms step:887/1480 train_time:135762ms step_avg:154.80ms step:888/1480 train_time:135935ms step_avg:154.82ms step:889/1480 train_time:136103ms step_avg:154.84ms step:890/1480 train_time:136266ms step_avg:154.85ms step:891/1480 train_time:136431ms step_avg:154.86ms step:892/1480 train_time:136595ms step_avg:154.87ms step:893/1480 train_time:136758ms step_avg:154.88ms step:894/1480 train_time:136925ms step_avg:154.89ms step:895/1480 train_time:137091ms step_avg:154.91ms step:896/1480 train_time:137256ms step_avg:154.92ms step:897/1480 train_time:137422ms step_avg:154.93ms step:898/1480 train_time:137590ms step_avg:154.94ms step:899/1480 train_time:137753ms step_avg:154.95ms step:900/1480 train_time:137915ms step_avg:154.96ms step:901/1480 train_time:138080ms step_avg:154.97ms step:902/1480 train_time:138245ms step_avg:154.98ms step:903/1480 train_time:138417ms step_avg:155.00ms step:904/1480 train_time:138585ms step_avg:155.02ms step:905/1480 train_time:138747ms step_avg:155.02ms step:906/1480 train_time:138913ms step_avg:155.04ms step:907/1480 train_time:139081ms step_avg:155.05ms step:908/1480 train_time:139244ms step_avg:155.06ms step:909/1480 train_time:139410ms step_avg:155.07ms step:910/1480 train_time:139580ms step_avg:155.09ms step:911/1480 train_time:139746ms step_avg:155.10ms step:912/1480 train_time:139911ms step_avg:155.11ms step:913/1480 train_time:140078ms step_avg:155.12ms step:914/1480 train_time:140245ms step_avg:155.14ms step:915/1480 train_time:140413ms 
step_avg:155.15ms step:916/1480 train_time:140579ms step_avg:155.16ms step:917/1480 train_time:140743ms step_avg:155.17ms step:918/1480 train_time:140911ms step_avg:155.19ms step:919/1480 train_time:141080ms step_avg:155.20ms step:920/1480 train_time:141246ms step_avg:155.22ms step:921/1480 train_time:141411ms step_avg:155.23ms step:922/1480 train_time:141579ms step_avg:155.24ms step:923/1480 train_time:141742ms step_avg:155.25ms step:924/1480 train_time:141908ms step_avg:155.26ms step:925/1480 train_time:142073ms step_avg:155.27ms step:926/1480 train_time:142235ms step_avg:155.28ms step:927/1480 train_time:142399ms step_avg:155.29ms step:928/1480 train_time:142565ms step_avg:155.30ms step:929/1480 train_time:142729ms step_avg:155.31ms step:930/1480 train_time:142893ms step_avg:155.32ms step:931/1480 train_time:143056ms step_avg:155.33ms step:932/1480 train_time:143222ms step_avg:155.34ms step:933/1480 train_time:143390ms step_avg:155.35ms step:934/1480 train_time:143556ms step_avg:155.36ms step:935/1480 train_time:143725ms step_avg:155.38ms step:936/1480 train_time:143893ms step_avg:155.39ms step:937/1480 train_time:144064ms step_avg:155.41ms step:938/1480 train_time:144227ms step_avg:155.42ms step:939/1480 train_time:144396ms step_avg:155.43ms step:940/1480 train_time:144563ms step_avg:155.44ms step:941/1480 train_time:144727ms step_avg:155.45ms step:942/1480 train_time:144891ms step_avg:155.46ms step:943/1480 train_time:145062ms step_avg:155.48ms step:944/1480 train_time:145235ms step_avg:155.50ms step:945/1480 train_time:145400ms step_avg:155.51ms step:946/1480 train_time:145570ms step_avg:155.52ms step:947/1480 train_time:145737ms step_avg:155.54ms step:948/1480 train_time:145904ms step_avg:155.55ms step:949/1480 train_time:146069ms step_avg:155.56ms step:950/1480 train_time:146232ms step_avg:155.57ms step:951/1480 train_time:146401ms step_avg:155.58ms step:952/1480 train_time:146568ms step_avg:155.59ms step:953/1480 train_time:146738ms step_avg:155.61ms 
step:954/1480 train_time:146909ms step_avg:155.62ms step:955/1480 train_time:147068ms step_avg:155.63ms step:956/1480 train_time:147233ms step_avg:155.64ms step:957/1480 train_time:147402ms step_avg:155.65ms step:958/1480 train_time:147570ms step_avg:155.66ms step:959/1480 train_time:147734ms step_avg:155.67ms step:960/1480 train_time:147902ms step_avg:155.69ms step:961/1480 train_time:148068ms step_avg:155.70ms step:962/1480 train_time:148232ms step_avg:155.71ms step:963/1480 train_time:148399ms step_avg:155.72ms step:964/1480 train_time:148568ms step_avg:155.73ms step:965/1480 train_time:148733ms step_avg:155.74ms step:966/1480 train_time:148899ms step_avg:155.75ms step:967/1480 train_time:149063ms step_avg:155.76ms step:968/1480 train_time:149228ms step_avg:155.77ms step:969/1480 train_time:149396ms step_avg:155.78ms step:970/1480 train_time:149559ms step_avg:155.79ms step:971/1480 train_time:149724ms step_avg:155.80ms step:972/1480 train_time:149889ms step_avg:155.81ms step:973/1480 train_time:150052ms step_avg:155.82ms step:974/1480 train_time:150220ms step_avg:155.83ms step:975/1480 train_time:150387ms step_avg:155.84ms step:976/1480 train_time:150551ms step_avg:155.85ms step:977/1480 train_time:150714ms step_avg:155.86ms step:978/1480 train_time:150883ms step_avg:155.87ms step:979/1480 train_time:151049ms step_avg:155.88ms step:980/1480 train_time:151215ms step_avg:155.89ms step:981/1480 train_time:151385ms step_avg:155.91ms step:982/1480 train_time:151547ms step_avg:155.91ms step:983/1480 train_time:151712ms step_avg:155.92ms step:984/1480 train_time:151877ms step_avg:155.93ms step:985/1480 train_time:152045ms step_avg:155.94ms step:986/1480 train_time:152209ms step_avg:155.95ms step:987/1480 train_time:152372ms step_avg:155.96ms step:988/1480 train_time:152538ms step_avg:155.97ms step:989/1480 train_time:152705ms step_avg:155.98ms step:990/1480 train_time:152873ms step_avg:155.99ms step:991/1480 train_time:153041ms step_avg:156.01ms step:992/1480 
train_time:153215ms step_avg:156.02ms step:993/1480 train_time:153391ms step_avg:156.04ms step:994/1480 train_time:153556ms step_avg:156.05ms step:995/1480 train_time:153721ms step_avg:156.06ms step:996/1480 train_time:153884ms step_avg:156.07ms step:997/1480 train_time:154047ms step_avg:156.08ms step:998/1480 train_time:154210ms step_avg:156.08ms step:999/1480 train_time:154378ms step_avg:156.10ms step:1000/1480 train_time:154546ms step_avg:156.11ms step:1000/1480 val_loss:3.4406 train_time:154614ms step_avg:156.18ms step:1001/1480 train_time:154716ms step_avg:156.12ms step:1002/1480 train_time:154881ms step_avg:156.13ms step:1003/1480 train_time:155054ms step_avg:156.15ms step:1004/1480 train_time:155223ms step_avg:156.16ms step:1005/1480 train_time:155390ms step_avg:156.17ms step:1006/1480 train_time:155556ms step_avg:156.18ms step:1007/1480 train_time:155721ms step_avg:156.19ms step:1008/1480 train_time:155890ms step_avg:156.20ms step:1009/1480 train_time:156063ms step_avg:156.22ms step:1010/1480 train_time:156228ms step_avg:156.23ms step:1011/1480 train_time:156394ms step_avg:156.24ms step:1012/1480 train_time:156559ms step_avg:156.25ms step:1013/1480 train_time:156730ms step_avg:156.26ms step:1014/1480 train_time:156897ms step_avg:156.27ms step:1015/1480 train_time:157069ms step_avg:156.29ms step:1016/1480 train_time:157238ms step_avg:156.30ms step:1017/1480 train_time:157408ms step_avg:156.31ms step:1018/1480 train_time:157575ms step_avg:156.32ms step:1019/1480 train_time:157741ms step_avg:156.33ms step:1020/1480 train_time:157911ms step_avg:156.35ms step:1021/1480 train_time:158076ms step_avg:156.36ms step:1022/1480 train_time:158243ms step_avg:156.37ms step:1023/1480 train_time:158411ms step_avg:156.38ms step:1024/1480 train_time:158578ms step_avg:156.39ms step:1025/1480 train_time:158750ms step_avg:156.40ms step:1026/1480 train_time:158917ms step_avg:156.41ms step:1027/1480 train_time:159082ms step_avg:156.42ms step:1028/1480 train_time:159255ms 
step_avg:156.44ms step:1029/1480 train_time:159430ms step_avg:156.46ms step:1030/1480 train_time:159596ms step_avg:156.47ms step:1031/1480 train_time:159760ms step_avg:156.47ms step:1032/1480 train_time:159934ms step_avg:156.49ms step:1033/1480 train_time:160101ms step_avg:156.50ms step:1034/1480 train_time:160269ms step_avg:156.51ms step:1035/1480 train_time:160438ms step_avg:156.52ms step:1036/1480 train_time:160604ms step_avg:156.53ms step:1037/1480 train_time:160771ms step_avg:156.54ms step:1038/1480 train_time:160940ms step_avg:156.56ms step:1039/1480 train_time:161112ms step_avg:156.57ms step:1040/1480 train_time:161278ms step_avg:156.58ms step:1041/1480 train_time:161444ms step_avg:156.59ms step:1042/1480 train_time:161608ms step_avg:156.60ms step:1043/1480 train_time:161773ms step_avg:156.61ms step:1044/1480 train_time:161939ms step_avg:156.61ms step:1045/1480 train_time:162110ms step_avg:156.63ms step:1046/1480 train_time:162276ms step_avg:156.64ms step:1047/1480 train_time:162442ms step_avg:156.65ms step:1048/1480 train_time:162608ms step_avg:156.66ms step:1049/1480 train_time:162775ms step_avg:156.66ms step:1050/1480 train_time:162942ms step_avg:156.67ms step:1051/1480 train_time:163112ms step_avg:156.69ms step:1052/1480 train_time:163280ms step_avg:156.70ms step:1053/1480 train_time:163447ms step_avg:156.71ms step:1054/1480 train_time:163616ms step_avg:156.72ms step:1055/1480 train_time:163781ms step_avg:156.73ms step:1056/1480 train_time:163945ms step_avg:156.74ms step:1057/1480 train_time:164113ms step_avg:156.75ms step:1058/1480 train_time:164281ms step_avg:156.76ms step:1059/1480 train_time:164455ms step_avg:156.77ms step:1060/1480 train_time:164622ms step_avg:156.78ms step:1061/1480 train_time:164786ms step_avg:156.79ms step:1062/1480 train_time:164953ms step_avg:156.80ms step:1063/1480 train_time:165118ms step_avg:156.81ms step:1064/1480 train_time:165280ms step_avg:156.81ms step:1065/1480 train_time:165448ms step_avg:156.82ms step:1066/1480 
train_time:165617ms step_avg:156.83ms step:1067/1480 train_time:165786ms step_avg:156.85ms step:1068/1480 train_time:165953ms step_avg:156.86ms step:1069/1480 train_time:166125ms step_avg:156.87ms step:1070/1480 train_time:166291ms step_avg:156.88ms step:1071/1480 train_time:166463ms step_avg:156.89ms step:1072/1480 train_time:166629ms step_avg:156.90ms step:1073/1480 train_time:166793ms step_avg:156.91ms step:1074/1480 train_time:166959ms step_avg:156.92ms step:1075/1480 train_time:167130ms step_avg:156.93ms step:1076/1480 train_time:167297ms step_avg:156.94ms step:1077/1480 train_time:167462ms step_avg:156.95ms step:1078/1480 train_time:167638ms step_avg:156.96ms step:1079/1480 train_time:167810ms step_avg:156.98ms step:1080/1480 train_time:167979ms step_avg:156.99ms step:1081/1480 train_time:168145ms step_avg:157.00ms step:1082/1480 train_time:168313ms step_avg:157.01ms step:1083/1480 train_time:168479ms step_avg:157.02ms step:1084/1480 train_time:168645ms step_avg:157.03ms step:1085/1480 train_time:168816ms step_avg:157.04ms step:1086/1480 train_time:168983ms step_avg:157.05ms step:1087/1480 train_time:169148ms step_avg:157.06ms step:1088/1480 train_time:169318ms step_avg:157.07ms step:1089/1480 train_time:169491ms step_avg:157.08ms step:1090/1480 train_time:169662ms step_avg:157.09ms step:1091/1480 train_time:169829ms step_avg:157.10ms step:1092/1480 train_time:169996ms step_avg:157.11ms step:1093/1480 train_time:170163ms step_avg:157.12ms step:1094/1480 train_time:170329ms step_avg:157.13ms step:1095/1480 train_time:170494ms step_avg:157.14ms step:1096/1480 train_time:170662ms step_avg:157.15ms step:1097/1480 train_time:170832ms step_avg:157.16ms step:1098/1480 train_time:171001ms step_avg:157.17ms step:1099/1480 train_time:171172ms step_avg:157.18ms step:1100/1480 train_time:171342ms step_avg:157.19ms step:1101/1480 train_time:171514ms step_avg:157.21ms step:1102/1480 train_time:171686ms step_avg:157.22ms step:1103/1480 train_time:171861ms step_avg:157.24ms 
step:1104/1480 train_time:172031ms step_avg:157.25ms step:1105/1480 train_time:172200ms step_avg:157.26ms step:1106/1480 train_time:172370ms step_avg:157.27ms step:1107/1480 train_time:172539ms step_avg:157.28ms step:1108/1480 train_time:172702ms step_avg:157.29ms step:1109/1480 train_time:172868ms step_avg:157.30ms step:1110/1480 train_time:173035ms step_avg:157.30ms step:1111/1480 train_time:173201ms step_avg:157.31ms step:1112/1480 train_time:173373ms step_avg:157.33ms step:1113/1480 train_time:173552ms step_avg:157.35ms step:1114/1480 train_time:173725ms step_avg:157.36ms step:1115/1480 train_time:173897ms step_avg:157.37ms step:1116/1480 train_time:174065ms step_avg:157.38ms step:1117/1480 train_time:174238ms step_avg:157.40ms step:1118/1480 train_time:174412ms step_avg:157.41ms step:1119/1480 train_time:174578ms step_avg:157.42ms step:1120/1480 train_time:174746ms step_avg:157.43ms step:1121/1480 train_time:174917ms step_avg:157.44ms step:1122/1480 train_time:175082ms step_avg:157.45ms step:1123/1480 train_time:175249ms step_avg:157.46ms step:1124/1480 train_time:175419ms step_avg:157.47ms step:1125/1480 train_time:175586ms step_avg:157.48ms step:1125/1480 val_loss:3.3841 train_time:175654ms step_avg:157.54ms step:1126/1480 train_time:175760ms step_avg:157.49ms step:1127/1480 train_time:175928ms step_avg:157.50ms step:1128/1480 train_time:176100ms step_avg:157.51ms step:1129/1480 train_time:176275ms step_avg:157.53ms step:1130/1480 train_time:176446ms step_avg:157.54ms step:1131/1480 train_time:176625ms step_avg:157.56ms step:1132/1480 train_time:176791ms step_avg:157.57ms step:1133/1480 train_time:176964ms step_avg:157.58ms step:1134/1480 train_time:177134ms step_avg:157.59ms step:1135/1480 train_time:177302ms step_avg:157.60ms step:1136/1480 train_time:177473ms step_avg:157.61ms step:1137/1480 train_time:177643ms step_avg:157.62ms step:1138/1480 train_time:177815ms step_avg:157.64ms step:1139/1480 train_time:177982ms step_avg:157.65ms step:1140/1480 
train_time:178150ms step_avg:157.65ms step:1141/1480 train_time:178322ms step_avg:157.67ms step:1142/1480 train_time:178489ms step_avg:157.68ms step:1143/1480 train_time:178660ms step_avg:157.69ms step:1144/1480 train_time:178828ms step_avg:157.70ms step:1145/1480 train_time:178993ms step_avg:157.70ms step:1146/1480 train_time:179164ms step_avg:157.71ms step:1147/1480 train_time:179333ms step_avg:157.72ms step:1148/1480 train_time:179501ms step_avg:157.73ms step:1149/1480 train_time:179672ms step_avg:157.75ms step:1150/1480 train_time:179841ms step_avg:157.76ms step:1151/1480 train_time:180012ms step_avg:157.77ms step:1152/1480 train_time:180183ms step_avg:157.78ms step:1153/1480 train_time:180357ms step_avg:157.79ms step:1154/1480 train_time:180525ms step_avg:157.80ms step:1155/1480 train_time:180698ms step_avg:157.81ms step:1156/1480 train_time:180878ms step_avg:157.83ms step:1157/1480 train_time:181047ms step_avg:157.84ms step:1158/1480 train_time:181214ms step_avg:157.85ms step:1159/1480 train_time:181381ms step_avg:157.86ms step:1160/1480 train_time:181546ms step_avg:157.87ms step:1161/1480 train_time:181718ms step_avg:157.88ms step:1162/1480 train_time:181887ms step_avg:157.89ms step:1163/1480 train_time:182058ms step_avg:157.90ms step:1164/1480 train_time:182227ms step_avg:157.91ms step:1165/1480 train_time:182392ms step_avg:157.92ms step:1166/1480 train_time:182561ms step_avg:157.92ms step:1167/1480 train_time:182729ms step_avg:157.93ms step:1168/1480 train_time:182897ms step_avg:157.94ms step:1169/1480 train_time:183065ms step_avg:157.95ms step:1170/1480 train_time:183234ms step_avg:157.96ms step:1171/1480 train_time:183401ms step_avg:157.97ms step:1172/1480 train_time:183566ms step_avg:157.97ms step:1173/1480 train_time:183740ms step_avg:157.99ms step:1174/1480 train_time:183922ms step_avg:158.01ms step:1175/1480 train_time:184093ms step_avg:158.02ms step:1176/1480 train_time:184264ms step_avg:158.03ms step:1177/1480 train_time:184440ms step_avg:158.05ms 
step:1178/1480 train_time:184607ms step_avg:158.05ms step:1179/1480 train_time:184773ms step_avg:158.06ms step:1180/1480 train_time:184953ms step_avg:158.08ms step:1181/1480 train_time:185123ms step_avg:158.09ms step:1182/1480 train_time:185291ms step_avg:158.10ms step:1183/1480 train_time:185462ms step_avg:158.11ms step:1184/1480 train_time:185631ms step_avg:158.12ms step:1185/1480 train_time:185803ms step_avg:158.13ms step:1186/1480 train_time:185973ms step_avg:158.14ms step:1187/1480 train_time:186157ms step_avg:158.16ms step:1188/1480 train_time:186324ms step_avg:158.17ms step:1189/1480 train_time:186495ms step_avg:158.18ms step:1190/1480 train_time:186663ms step_avg:158.19ms step:1191/1480 train_time:186835ms step_avg:158.20ms step:1192/1480 train_time:187003ms step_avg:158.21ms step:1193/1480 train_time:187168ms step_avg:158.21ms step:1194/1480 train_time:187339ms step_avg:158.23ms step:1195/1480 train_time:187513ms step_avg:158.24ms step:1196/1480 train_time:187696ms step_avg:158.26ms step:1197/1480 train_time:187866ms step_avg:158.27ms step:1198/1480 train_time:188047ms step_avg:158.29ms step:1199/1480 train_time:188217ms step_avg:158.30ms step:1200/1480 train_time:188384ms step_avg:158.31ms step:1201/1480 train_time:188552ms step_avg:158.31ms step:1202/1480 train_time:188732ms step_avg:158.33ms step:1203/1480 train_time:188908ms step_avg:158.35ms step:1204/1480 train_time:189082ms step_avg:158.36ms step:1205/1480 train_time:189249ms step_avg:158.37ms step:1206/1480 train_time:189418ms step_avg:158.38ms step:1207/1480 train_time:189588ms step_avg:158.39ms step:1208/1480 train_time:189759ms step_avg:158.40ms step:1209/1480 train_time:189929ms step_avg:158.41ms step:1210/1480 train_time:190105ms step_avg:158.42ms step:1211/1480 train_time:190279ms step_avg:158.43ms step:1212/1480 train_time:190452ms step_avg:158.45ms step:1213/1480 train_time:190625ms step_avg:158.46ms step:1214/1480 train_time:190802ms step_avg:158.47ms step:1215/1480 train_time:190976ms 
step_avg:158.49ms step:1216/1480 train_time:191143ms step_avg:158.49ms step:1217/1480 train_time:191317ms step_avg:158.51ms step:1218/1480 train_time:191485ms step_avg:158.51ms step:1219/1480 train_time:191664ms step_avg:158.53ms step:1220/1480 train_time:191834ms step_avg:158.54ms step:1221/1480 train_time:192002ms step_avg:158.55ms step:1222/1480 train_time:192169ms step_avg:158.56ms step:1223/1480 train_time:192341ms step_avg:158.57ms step:1224/1480 train_time:192520ms step_avg:158.58ms step:1225/1480 train_time:192692ms step_avg:158.59ms step:1226/1480 train_time:192865ms step_avg:158.61ms step:1227/1480 train_time:193038ms step_avg:158.62ms step:1228/1480 train_time:193207ms step_avg:158.63ms step:1229/1480 train_time:193380ms step_avg:158.64ms step:1230/1480 train_time:193561ms step_avg:158.66ms step:1231/1480 train_time:193738ms step_avg:158.67ms step:1232/1480 train_time:193912ms step_avg:158.68ms step:1233/1480 train_time:194083ms step_avg:158.69ms step:1234/1480 train_time:194253ms step_avg:158.70ms step:1235/1480 train_time:194428ms step_avg:158.72ms step:1236/1480 train_time:194596ms step_avg:158.72ms step:1237/1480 train_time:194767ms step_avg:158.73ms step:1238/1480 train_time:194952ms step_avg:158.76ms step:1239/1480 train_time:195123ms step_avg:158.77ms step:1240/1480 train_time:195293ms step_avg:158.77ms step:1241/1480 train_time:195465ms step_avg:158.79ms step:1242/1480 train_time:195634ms step_avg:158.79ms step:1243/1480 train_time:195806ms step_avg:158.80ms step:1244/1480 train_time:195973ms step_avg:158.81ms step:1245/1480 train_time:196141ms step_avg:158.82ms step:1246/1480 train_time:196311ms step_avg:158.83ms step:1247/1480 train_time:196481ms step_avg:158.84ms step:1248/1480 train_time:196650ms step_avg:158.85ms step:1249/1480 train_time:196819ms step_avg:158.85ms step:1250/1480 train_time:196987ms step_avg:158.86ms step:1250/1480 val_loss:3.3348 train_time:197059ms step_avg:158.92ms step:1251/1480 train_time:197169ms step_avg:158.88ms 
step:1252/1480 train_time:197339ms step_avg:158.89ms step:1253/1480 train_time:197508ms step_avg:158.90ms step:1254/1480 train_time:197680ms step_avg:158.91ms step:1255/1480 train_time:197868ms step_avg:158.93ms step:1256/1480 train_time:198043ms step_avg:158.94ms step:1257/1480 train_time:198213ms step_avg:158.95ms step:1258/1480 train_time:198389ms step_avg:158.97ms step:1259/1480 train_time:198560ms step_avg:158.98ms step:1260/1480 train_time:198728ms step_avg:158.98ms step:1261/1480 train_time:198902ms step_avg:158.99ms step:1262/1480 train_time:199077ms step_avg:159.01ms step:1263/1480 train_time:199251ms step_avg:159.02ms step:1264/1480 train_time:199417ms step_avg:159.02ms step:1265/1480 train_time:199586ms step_avg:159.03ms step:1266/1480 train_time:199756ms step_avg:159.04ms step:1267/1480 train_time:199927ms step_avg:159.05ms step:1268/1480 train_time:200099ms step_avg:159.06ms step:1269/1480 train_time:200276ms step_avg:159.08ms step:1270/1480 train_time:200446ms step_avg:159.08ms step:1271/1480 train_time:200616ms step_avg:159.09ms step:1272/1480 train_time:200782ms step_avg:159.10ms step:1273/1480 train_time:200952ms step_avg:159.11ms step:1274/1480 train_time:201126ms step_avg:159.12ms step:1275/1480 train_time:201293ms step_avg:159.12ms step:1276/1480 train_time:201459ms step_avg:159.13ms step:1277/1480 train_time:201630ms step_avg:159.14ms step:1278/1480 train_time:201798ms step_avg:159.15ms step:1279/1480 train_time:201970ms step_avg:159.16ms step:1280/1480 train_time:202148ms step_avg:159.17ms step:1281/1480 train_time:202318ms step_avg:159.18ms step:1282/1480 train_time:202485ms step_avg:159.19ms step:1283/1480 train_time:202655ms step_avg:159.19ms step:1284/1480 train_time:202825ms step_avg:159.20ms step:1285/1480 train_time:202994ms step_avg:159.21ms step:1286/1480 train_time:203165ms step_avg:159.22ms step:1287/1480 train_time:203337ms step_avg:159.23ms step:1288/1480 train_time:203510ms step_avg:159.24ms step:1289/1480 train_time:203693ms 
step_avg:159.26ms step:1290/1480 train_time:203873ms step_avg:159.28ms step:1291/1480 train_time:204046ms step_avg:159.29ms step:1292/1480 train_time:204221ms step_avg:159.30ms step:1293/1480 train_time:204397ms step_avg:159.31ms step:1294/1480 train_time:204569ms step_avg:159.32ms step:1295/1480 train_time:204741ms step_avg:159.33ms step:1296/1480 train_time:204915ms step_avg:159.34ms step:1297/1480 train_time:205087ms step_avg:159.35ms step:1298/1480 train_time:205257ms step_avg:159.36ms step:1299/1480 train_time:205427ms step_avg:159.37ms step:1300/1480 train_time:205593ms step_avg:159.37ms step:1301/1480 train_time:205764ms step_avg:159.38ms step:1302/1480 train_time:205937ms step_avg:159.39ms step:1303/1480 train_time:206113ms step_avg:159.41ms step:1304/1480 train_time:206288ms step_avg:159.42ms step:1305/1480 train_time:206456ms step_avg:159.43ms step:1306/1480 train_time:206632ms step_avg:159.44ms step:1307/1480 train_time:206799ms step_avg:159.44ms step:1308/1480 train_time:206968ms step_avg:159.45ms step:1309/1480 train_time:207141ms step_avg:159.46ms step:1310/1480 train_time:207309ms step_avg:159.47ms step:1311/1480 train_time:207476ms step_avg:159.47ms step:1312/1480 train_time:207649ms step_avg:159.48ms step:1313/1480 train_time:207819ms step_avg:159.49ms step:1314/1480 train_time:207992ms step_avg:159.50ms step:1315/1480 train_time:208162ms step_avg:159.51ms step:1316/1480 train_time:208329ms step_avg:159.52ms step:1317/1480 train_time:208502ms step_avg:159.53ms step:1318/1480 train_time:208683ms step_avg:159.54ms step:1319/1480 train_time:208859ms step_avg:159.56ms step:1320/1480 train_time:209035ms step_avg:159.57ms step:1321/1480 train_time:209208ms step_avg:159.58ms step:1322/1480 train_time:209390ms step_avg:159.60ms step:1323/1480 train_time:209563ms step_avg:159.61ms step:1324/1480 train_time:209737ms step_avg:159.62ms step:1325/1480 train_time:209919ms step_avg:159.63ms step:1326/1480 train_time:210095ms step_avg:159.65ms step:1327/1480 
train_time:210265ms step_avg:159.65ms step:1328/1480 train_time:210434ms step_avg:159.66ms step:1329/1480 train_time:210631ms step_avg:159.69ms step:1330/1480 train_time:210810ms step_avg:159.70ms step:1331/1480 train_time:210981ms step_avg:159.71ms step:1332/1480 train_time:211155ms step_avg:159.72ms step:1333/1480 train_time:211331ms step_avg:159.74ms step:1334/1480 train_time:211502ms step_avg:159.75ms step:1335/1480 train_time:211671ms step_avg:159.75ms step:1336/1480 train_time:211855ms step_avg:159.77ms step:1337/1480 train_time:212030ms step_avg:159.78ms step:1338/1480 train_time:212203ms step_avg:159.79ms step:1339/1480 train_time:212377ms step_avg:159.80ms step:1340/1480 train_time:212549ms step_avg:159.81ms step:1341/1480 train_time:212717ms step_avg:159.82ms step:1342/1480 train_time:212892ms step_avg:159.83ms step:1343/1480 train_time:213062ms step_avg:159.84ms step:1344/1480 train_time:213234ms step_avg:159.85ms step:1345/1480 train_time:213413ms step_avg:159.86ms step:1346/1480 train_time:213584ms step_avg:159.87ms step:1347/1480 train_time:213754ms step_avg:159.88ms step:1348/1480 train_time:213926ms step_avg:159.88ms step:1349/1480 train_time:214095ms step_avg:159.89ms step:1350/1480 train_time:214271ms step_avg:159.90ms step:1351/1480 train_time:214444ms step_avg:159.91ms step:1352/1480 train_time:214613ms step_avg:159.92ms step:1353/1480 train_time:214791ms step_avg:159.93ms step:1354/1480 train_time:214963ms step_avg:159.94ms step:1355/1480 train_time:215132ms step_avg:159.95ms step:1356/1480 train_time:215306ms step_avg:159.96ms step:1357/1480 train_time:215478ms step_avg:159.97ms step:1358/1480 train_time:215651ms step_avg:159.98ms step:1359/1480 train_time:215824ms step_avg:159.99ms step:1360/1480 train_time:215996ms step_avg:160.00ms step:1361/1480 train_time:216173ms step_avg:160.01ms step:1362/1480 train_time:216348ms step_avg:160.02ms step:1363/1480 train_time:216528ms step_avg:160.04ms step:1364/1480 train_time:216697ms step_avg:160.04ms 
step:1365/1480 train_time:216865ms step_avg:160.05ms step:1366/1480 train_time:217036ms step_avg:160.06ms step:1367/1480 train_time:217208ms step_avg:160.06ms step:1368/1480 train_time:217383ms step_avg:160.08ms step:1369/1480 train_time:217563ms step_avg:160.09ms step:1370/1480 train_time:217740ms step_avg:160.10ms step:1371/1480 train_time:217911ms step_avg:160.11ms step:1372/1480 train_time:218088ms step_avg:160.12ms step:1373/1480 train_time:218256ms step_avg:160.13ms step:1374/1480 train_time:218430ms step_avg:160.14ms step:1375/1480 train_time:218601ms step_avg:160.15ms step:1375/1480 val_loss:3.2962 train_time:218669ms step_avg:160.20ms step:1376/1480 train_time:218774ms step_avg:160.16ms step:1377/1480 train_time:218945ms step_avg:160.16ms step:1378/1480 train_time:219115ms step_avg:160.17ms step:1379/1480 train_time:219290ms step_avg:160.18ms step:1380/1480 train_time:219464ms step_avg:160.19ms step:1381/1480 train_time:219645ms step_avg:160.21ms step:1382/1480 train_time:219816ms step_avg:160.22ms step:1383/1480 train_time:219988ms step_avg:160.22ms step:1384/1480 train_time:220164ms step_avg:160.24ms step:1385/1480 train_time:220329ms step_avg:160.24ms step:1386/1480 train_time:220500ms step_avg:160.25ms step:1387/1480 train_time:220671ms step_avg:160.26ms step:1388/1480 train_time:220839ms step_avg:160.26ms step:1389/1480 train_time:221010ms step_avg:160.27ms step:1390/1480 train_time:221180ms step_avg:160.28ms step:1391/1480 train_time:221348ms step_avg:160.28ms step:1392/1480 train_time:221522ms step_avg:160.29ms step:1393/1480 train_time:221692ms step_avg:160.30ms step:1394/1480 train_time:221862ms step_avg:160.31ms step:1395/1480 train_time:222030ms step_avg:160.31ms step:1396/1480 train_time:222200ms step_avg:160.32ms step:1397/1480 train_time:222367ms step_avg:160.32ms step:1398/1480 train_time:222534ms step_avg:160.33ms step:1399/1480 train_time:222703ms step_avg:160.33ms step:1400/1480 train_time:222880ms step_avg:160.35ms step:1401/1480 
train_time:223046ms step_avg:160.35ms step:1402/1480 train_time:223217ms step_avg:160.36ms step:1403/1480 train_time:223394ms step_avg:160.37ms step:1404/1480 train_time:223565ms step_avg:160.38ms step:1405/1480 train_time:223740ms step_avg:160.39ms step:1406/1480 train_time:223915ms step_avg:160.40ms step:1407/1480 train_time:224085ms step_avg:160.40ms step:1408/1480 train_time:224253ms step_avg:160.41ms step:1409/1480 train_time:224437ms step_avg:160.43ms step:1410/1480 train_time:224606ms step_avg:160.43ms step:1411/1480 train_time:224774ms step_avg:160.44ms step:1412/1480 train_time:224942ms step_avg:160.44ms step:1413/1480 train_time:225111ms step_avg:160.45ms step:1414/1480 train_time:225284ms step_avg:160.46ms step:1415/1480 train_time:225458ms step_avg:160.47ms step:1416/1480 train_time:225645ms step_avg:160.49ms step:1417/1480 train_time:225819ms step_avg:160.50ms step:1418/1480 train_time:225989ms step_avg:160.50ms step:1419/1480 train_time:226164ms step_avg:160.51ms step:1420/1480 train_time:226338ms step_avg:160.52ms step:1421/1480 train_time:226511ms step_avg:160.53ms step:1422/1480 train_time:226683ms step_avg:160.54ms step:1423/1480 train_time:226852ms step_avg:160.55ms step:1424/1480 train_time:227028ms step_avg:160.56ms step:1425/1480 train_time:227208ms step_avg:160.57ms step:1426/1480 train_time:227379ms step_avg:160.58ms step:1427/1480 train_time:227553ms step_avg:160.59ms step:1428/1480 train_time:227725ms step_avg:160.60ms step:1429/1480 train_time:227892ms step_avg:160.60ms step:1430/1480 train_time:228067ms step_avg:160.61ms step:1431/1480 train_time:228242ms step_avg:160.62ms step:1432/1480 train_time:228421ms step_avg:160.63ms step:1433/1480 train_time:228602ms step_avg:160.65ms step:1434/1480 train_time:228781ms step_avg:160.66ms step:1435/1480 train_time:228955ms step_avg:160.67ms step:1436/1480 train_time:229128ms step_avg:160.68ms step:1437/1480 train_time:229298ms step_avg:160.69ms step:1438/1480 train_time:229467ms step_avg:160.69ms 
step:1439/1480 train_time:229642ms step_avg:160.70ms step:1440/1480 train_time:229812ms step_avg:160.71ms step:1441/1480 train_time:229984ms step_avg:160.72ms step:1442/1480 train_time:230162ms step_avg:160.73ms step:1443/1480 train_time:230350ms step_avg:160.75ms step:1444/1480 train_time:230522ms step_avg:160.75ms step:1445/1480 train_time:230692ms step_avg:160.76ms step:1446/1480 train_time:230867ms step_avg:160.77ms step:1447/1480 train_time:231046ms step_avg:160.78ms step:1448/1480 train_time:231218ms step_avg:160.79ms step:1449/1480 train_time:231392ms step_avg:160.80ms step:1450/1480 train_time:231565ms step_avg:160.81ms step:1451/1480 train_time:231736ms step_avg:160.82ms step:1452/1480 train_time:231910ms step_avg:160.83ms step:1453/1480 train_time:232080ms step_avg:160.83ms step:1454/1480 train_time:232252ms step_avg:160.84ms step:1455/1480 train_time:232431ms step_avg:160.85ms step:1456/1480 train_time:232604ms step_avg:160.86ms step:1457/1480 train_time:232775ms step_avg:160.87ms step:1458/1480 train_time:232945ms step_avg:160.87ms step:1459/1480 train_time:233122ms step_avg:160.88ms step:1460/1480 train_time:233293ms step_avg:160.89ms step:1461/1480 train_time:233467ms step_avg:160.90ms step:1462/1480 train_time:233638ms step_avg:160.91ms step:1463/1480 train_time:233816ms step_avg:160.92ms step:1464/1480 train_time:233991ms step_avg:160.93ms step:1465/1480 train_time:234165ms step_avg:160.94ms step:1466/1480 train_time:234337ms step_avg:160.95ms step:1467/1480 train_time:234510ms step_avg:160.95ms step:1468/1480 train_time:234681ms step_avg:160.96ms step:1469/1480 train_time:234853ms step_avg:160.97ms step:1470/1480 train_time:235033ms step_avg:160.98ms step:1471/1480 train_time:235222ms step_avg:161.00ms step:1472/1480 train_time:235403ms step_avg:161.01ms step:1473/1480 train_time:235576ms step_avg:161.02ms step:1474/1480 train_time:235752ms step_avg:161.03ms step:1475/1480 train_time:235931ms step_avg:161.05ms step:1476/1480 train_time:236103ms 
step_avg:161.05ms step:1477/1480 train_time:236284ms step_avg:161.07ms step:1478/1480 train_time:236465ms step_avg:161.08ms step:1479/1480 train_time:236641ms step_avg:161.09ms step:1480/1480 train_time:236814ms step_avg:161.10ms step:1480/1480 val_loss:3.2773 train_time:236886ms step_avg:161.15ms