import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G via an SVD: with G = U S V^T, return U @ V^T.

    `steps` is never read; it is accepted so that both entries of `zeropower_backends`
    can be called with the same `steps=` keyword.
    """
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: The matrix to orthogonalize; must be 2D (asserted).
        steps: Number of iterations to run.
        eps: Added to the norm of G before dividing, so an all-zero G does not divide by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # work on the orientation with rows <= cols so that A = X @ X.T is the smaller Gram matrix
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose applied above
    if G.size(0) > G.size(1):
        X = X.T
    return X

# maps the `backend` option of Muon to the function that orthogonalizes an update
zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.

    The constructor reads the WORLD_SIZE and RANK environment variables, and `step` issues
    `dist.all_gather` calls, so a process group has to be initialized before stepping.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        self.num_process = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ["RANK"])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        params: "list[torch.Tensor]" = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct element count, so that every parameter in a group
        # fits the group's flat `update_buffer` tensors (one buffer per process)
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                "params": [p for p in params if p.numel() == size],
                "update_buffer": [
                    torch.empty(size, device="cuda", dtype=torch.bfloat16)
                    for _ in range(self.num_process)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are processed in chunks of `num_process`: this rank
        orthogonalizes the update of the chunk's parameter at offset `self.rank`, and an async
        all_gather fills `update_buffers` with the updates of the whole chunk. The gathered
        updates of a chunk are applied (`update_prev`) only after the next chunk's update has
        been computed, and once more after the loop for the final chunk.
        """
        for group in self.param_groups:
            lr: float = group["lr"]
            momentum: float = group["momentum"]
            nesterov: bool = group["nesterov"]
            zeropower_backend = zeropower_backends[group["backend"]]
            backend_steps: int = group["backend_steps"]
            update_buffers: "list[torch.Tensor]" = group["update_buffer"]
            # generate weight updates in distributed fashion
            params: "list[torch.Tensor]" = group["params"]
            # every chunk must be full, otherwise params[base_i + self.rank] would run off the end
            assert len(params) % self.num_process == 0
            handle = None
            params_world = None
            def update_prev():
                # `handle` and `params_world` are looked up when this runs, so they refer to
                # the most recently launched all_gather and the chunk it belongs to
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # `**` binds tighter than `*`: this is -lr * sqrt(max(1, rows / cols))
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.num_process]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if "momentum_buffer" not in state:
                    state["momentum_buffer"] = torch.zeros_like(g)
                buf: torch.Tensor = state["momentum_buffer"]
                buf.lerp_(g, 1 - momentum)
                # the nesterov branch uses the in-place lerp_, so it overwrites p.grad
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_backend(g, steps=backend_steps).flatten()
                # apply the previous chunk before its buffers are overwritten by the next gather
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.num_process]
            # apply the last chunk of this group
            update_prev()

#
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension; no weight is passed to F.rms_norm."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free nn.Linear whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding.

    The cos/sin tables are cached and rebuilt only when the sequence length (x.shape[1])
    differs from the cached one.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of channels: base ** (-i / dim) for i = 0, 2, 4, ...
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as 4D with the sequence on dim 1 and the rotated channels on dim 3;
        # CausalSelfAttention passes (B, T, n_head, head_dim)
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # the two halves of the last dim form the (x1, x2) pairs that get rotated
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK-norm, rotary embeddings and a learned value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # mix this layer's values with the token value embedding `vi` supplied by GPT.forward
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim), hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block with a 4x hidden width and a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc   = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: learned mix with x0, then attention and MLP residual branches."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # weights of (x, x0) in the input mix; initialised to pass x through unchanged
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) with tanh in GPT.forward

class GPT(nn.Module):
    """
    The language model. `forward` takes one flat (1D) token sequence and returns the
    cross-entropy loss directly rather than logits.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token id 50256, used as a per-token document id
        # (presumably 50256 is the GPT-2 end-of-text token -- confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        # document id at the first / last token of every BLOCK_SIZE-token block;
        # the reshape requires len(idx) to be a multiple of BLOCK_SIZE
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document, and closer than `sliding_window` tokens
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the rule above, built by hand and passed to
            # BlockMask.from_kv_blocks together with the token-level mask_mod
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can share a document only if their document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window length expressed in blocks, rounded up
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort of the 0/1 mask: the kept kv-block indices come first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton dims (presumably batch and head -- confirm against the BlockMask docs)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one n_embd-wide value embedding per encoder layer; decoder layers reuse them below
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() pairs decoder layer i with the most recent encoder output not yet consumed
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the shard header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header: 256 int32 values
        # read straight into the tensor's memory through its numpy view
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Cycles through the token shards matching `filename_pattern` (relative to the current
    working directory). Each call to `next_batch` returns this process's T input tokens
    and the T targets shifted by one position.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full step for all processes
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # -1 so that advance() wraps around to shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        # each process starts at its own offset within the shard
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        # tokens consumed per step by all processes together
        batch_size = self.T * self.num_processes
        # T + 1 tokens, because the targets are the inputs shifted by one
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
# (RANK, LOCAL_RANK and WORLD_SIZE are all read from the environment below)
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
# Only the master process (rank 0) owns a log file; `logfile` stays None everywhere else.
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    # per-run directory name; nothing is written into it by this section
    logdir = 'logs/%s/' % run_id
    # The log file lives directly under `logs/`. Create that directory if it is missing,
    # otherwise the open() below raises FileNotFoundError on a fresh checkout.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """
    Log a line from the master process only; a no-op on every other rank.

    Arguments:
        s: The text to log. A newline is appended when it is written to the log file.
        logonly: If True, write to the log file without echoing to stdout.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
# (each validation step consumes T tokens on each of the ddp_world_size processes)
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not supported: exactly one sequence per process per step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
# cast the whole model to bfloat16, then put the CastedLinear modules back in float32;
# CastedLinear.forward casts its weight to the activation dtype on every call
model = model.cuda().bfloat16()
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# optimizer1: the two embedding tables; optimizer2: the output head;
# optimizer3 (Muon): the 2D parameters of the transformer blocks;
# optimizer4: the remaining (<2D) block parameters plus the U-net skip weights
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the multiplier that LambdaLR applies to each optimizer's base lr at iteration `it`."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        # reaches 0 at it == num_iterations
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the attention window is held in a CUDA tensor that is updated in place with copy_ below;
# sw_size_prev remembers the last value written so the copy only happens on a change
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    # nan makes the step_avg fields print as "nan" until enough timed steps have run
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # (linear ramp from 64 at step 0 to 1792 at the last step, rounded down to a multiple of 64)
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                # the first += turns the float accumulator into a tensor
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average over processes, then over validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE(review): `log` is built but the torch.save call below is commented out,
        # so nothing is written to disk here
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # (linear from 0.85 to 0.95 over the first 300 steps)
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # no CUDA synchronize here, which is why the variable is called approx_time
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 07:34:03 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 124W / 700W | 47MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 40C P0 98W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 38C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 108W / 700W | 27MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 47C P0 128W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID 
Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23079ms step_avg:nanms step:2/1480 train_time:23239ms step_avg:nanms step:3/1480 train_time:23377ms step_avg:nanms step:4/1480 train_time:23520ms step_avg:nanms step:5/1480 train_time:23661ms step_avg:nanms step:6/1480 train_time:23804ms step_avg:nanms step:7/1480 train_time:23946ms step_avg:nanms step:8/1480 train_time:24089ms step_avg:nanms step:9/1480 train_time:24233ms step_avg:nanms step:10/1480 train_time:24375ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:426ms step_avg:141.92ms step:14/1480 train_time:569ms step_avg:142.25ms step:15/1480 train_time:711ms step_avg:142.29ms step:16/1480 train_time:854ms step_avg:142.40ms step:17/1480 train_time:998ms step_avg:142.57ms step:18/1480 train_time:1139ms step_avg:142.44ms step:19/1480 train_time:1281ms step_avg:142.32ms step:20/1480 train_time:1426ms step_avg:142.58ms step:21/1480 train_time:1569ms step_avg:142.67ms step:22/1480 train_time:1712ms step_avg:142.70ms step:23/1480 train_time:1856ms step_avg:142.76ms step:24/1480 train_time:1998ms step_avg:142.72ms step:25/1480 train_time:2140ms step_avg:142.64ms step:26/1480 train_time:2281ms step_avg:142.57ms step:27/1480 train_time:2423ms step_avg:142.53ms step:28/1480 train_time:2567ms step_avg:142.59ms step:29/1480 train_time:2710ms 
step_avg:142.62ms step:30/1480 train_time:2853ms step_avg:142.65ms step:31/1480 train_time:2995ms step_avg:142.64ms step:32/1480 train_time:3137ms step_avg:142.60ms step:33/1480 train_time:3279ms step_avg:142.57ms step:34/1480 train_time:3420ms step_avg:142.52ms step:35/1480 train_time:3563ms step_avg:142.54ms step:36/1480 train_time:3707ms step_avg:142.57ms step:37/1480 train_time:3850ms step_avg:142.59ms step:38/1480 train_time:3994ms step_avg:142.63ms step:39/1480 train_time:4135ms step_avg:142.60ms step:40/1480 train_time:4277ms step_avg:142.56ms step:41/1480 train_time:4419ms step_avg:142.54ms step:42/1480 train_time:4562ms step_avg:142.57ms step:43/1480 train_time:4707ms step_avg:142.63ms step:44/1480 train_time:4851ms step_avg:142.66ms step:45/1480 train_time:4994ms step_avg:142.69ms step:46/1480 train_time:5137ms step_avg:142.69ms step:47/1480 train_time:5279ms step_avg:142.66ms step:48/1480 train_time:5421ms step_avg:142.65ms step:49/1480 train_time:5565ms step_avg:142.68ms step:50/1480 train_time:5709ms step_avg:142.73ms step:51/1480 train_time:5853ms step_avg:142.75ms step:52/1480 train_time:5995ms step_avg:142.73ms step:53/1480 train_time:6136ms step_avg:142.70ms step:54/1480 train_time:6277ms step_avg:142.66ms step:55/1480 train_time:6418ms step_avg:142.63ms step:56/1480 train_time:6561ms step_avg:142.63ms step:57/1480 train_time:6704ms step_avg:142.63ms step:58/1480 train_time:6847ms step_avg:142.64ms step:59/1480 train_time:6989ms step_avg:142.63ms step:60/1480 train_time:7132ms step_avg:142.63ms step:61/1480 train_time:7274ms step_avg:142.62ms step:62/1480 train_time:7416ms step_avg:142.61ms step:63/1480 train_time:7559ms step_avg:142.63ms step:64/1480 train_time:7704ms step_avg:142.67ms step:65/1480 train_time:7847ms step_avg:142.68ms step:66/1480 train_time:7990ms step_avg:142.68ms step:67/1480 train_time:8133ms step_avg:142.68ms step:68/1480 train_time:8274ms step_avg:142.65ms step:69/1480 train_time:8416ms step_avg:142.65ms step:70/1480 
train_time:8559ms step_avg:142.65ms step:71/1480 train_time:8702ms step_avg:142.66ms step:72/1480 train_time:8846ms step_avg:142.68ms step:73/1480 train_time:8989ms step_avg:142.69ms step:74/1480 train_time:9132ms step_avg:142.69ms step:75/1480 train_time:9275ms step_avg:142.68ms step:76/1480 train_time:9416ms step_avg:142.67ms step:77/1480 train_time:9559ms step_avg:142.67ms step:78/1480 train_time:9702ms step_avg:142.68ms step:79/1480 train_time:9847ms step_avg:142.72ms step:80/1480 train_time:9991ms step_avg:142.73ms step:81/1480 train_time:10133ms step_avg:142.72ms step:82/1480 train_time:10273ms step_avg:142.69ms step:83/1480 train_time:10414ms step_avg:142.66ms step:84/1480 train_time:10556ms step_avg:142.65ms step:85/1480 train_time:10699ms step_avg:142.65ms step:86/1480 train_time:10841ms step_avg:142.64ms step:87/1480 train_time:10983ms step_avg:142.64ms step:88/1480 train_time:11126ms step_avg:142.64ms step:89/1480 train_time:11269ms step_avg:142.64ms step:90/1480 train_time:11411ms step_avg:142.63ms step:91/1480 train_time:11552ms step_avg:142.61ms step:92/1480 train_time:11693ms step_avg:142.60ms step:93/1480 train_time:11837ms step_avg:142.61ms step:94/1480 train_time:11979ms step_avg:142.61ms step:95/1480 train_time:12122ms step_avg:142.61ms step:96/1480 train_time:12265ms step_avg:142.61ms step:97/1480 train_time:12407ms step_avg:142.61ms step:98/1480 train_time:12548ms step_avg:142.59ms step:99/1480 train_time:12691ms step_avg:142.60ms step:100/1480 train_time:12835ms step_avg:142.61ms step:101/1480 train_time:12977ms step_avg:142.60ms step:102/1480 train_time:13119ms step_avg:142.59ms step:103/1480 train_time:13262ms step_avg:142.60ms step:104/1480 train_time:13405ms step_avg:142.61ms step:105/1480 train_time:13548ms step_avg:142.61ms step:106/1480 train_time:13690ms step_avg:142.60ms step:107/1480 train_time:13832ms step_avg:142.60ms step:108/1480 train_time:13974ms step_avg:142.60ms step:109/1480 train_time:14115ms step_avg:142.58ms step:110/1480 
train_time:14259ms step_avg:142.59ms step:111/1480 train_time:14403ms step_avg:142.61ms step:112/1480 train_time:14552ms step_avg:142.67ms step:113/1480 train_time:14698ms step_avg:142.70ms step:114/1480 train_time:14844ms step_avg:142.73ms step:115/1480 train_time:14990ms step_avg:142.77ms step:116/1480 train_time:15137ms step_avg:142.80ms step:117/1480 train_time:15282ms step_avg:142.83ms step:118/1480 train_time:15432ms step_avg:142.89ms step:119/1480 train_time:15579ms step_avg:142.92ms step:120/1480 train_time:15725ms step_avg:142.96ms step:121/1480 train_time:15873ms step_avg:143.00ms step:122/1480 train_time:16017ms step_avg:143.01ms step:123/1480 train_time:16164ms step_avg:143.04ms step:124/1480 train_time:16311ms step_avg:143.08ms step:125/1480 train_time:16459ms step_avg:143.12ms step:125/1480 val_loss:4.4101 train_time:16517ms step_avg:143.62ms step:126/1480 train_time:16613ms step_avg:143.22ms step:127/1480 train_time:16763ms step_avg:143.27ms step:128/1480 train_time:16910ms step_avg:143.30ms step:129/1480 train_time:17055ms step_avg:143.32ms step:130/1480 train_time:17201ms step_avg:143.34ms step:131/1480 train_time:17347ms step_avg:143.37ms step:132/1480 train_time:17494ms step_avg:143.39ms step:133/1480 train_time:17642ms step_avg:143.43ms step:134/1480 train_time:17790ms step_avg:143.47ms step:135/1480 train_time:17936ms step_avg:143.49ms step:136/1480 train_time:18083ms step_avg:143.52ms step:137/1480 train_time:18230ms step_avg:143.54ms step:138/1480 train_time:18375ms step_avg:143.56ms step:139/1480 train_time:18525ms step_avg:143.60ms step:140/1480 train_time:18673ms step_avg:143.64ms step:141/1480 train_time:18822ms step_avg:143.68ms step:142/1480 train_time:18970ms step_avg:143.71ms step:143/1480 train_time:19115ms step_avg:143.73ms step:144/1480 train_time:19263ms step_avg:143.75ms step:145/1480 train_time:19410ms step_avg:143.78ms step:146/1480 train_time:19557ms step_avg:143.80ms step:147/1480 train_time:19706ms step_avg:143.84ms 
step:148/1480 train_time:19854ms step_avg:143.87ms step:149/1480 train_time:20002ms step_avg:143.90ms step:150/1480 train_time:20148ms step_avg:143.92ms step:151/1480 train_time:20294ms step_avg:143.93ms step:152/1480 train_time:20442ms step_avg:143.96ms step:153/1480 train_time:20589ms step_avg:143.98ms step:154/1480 train_time:20735ms step_avg:144.00ms step:155/1480 train_time:20884ms step_avg:144.03ms step:156/1480 train_time:21031ms step_avg:144.05ms step:157/1480 train_time:21180ms step_avg:144.08ms step:158/1480 train_time:21325ms step_avg:144.09ms step:159/1480 train_time:21472ms step_avg:144.10ms step:160/1480 train_time:21617ms step_avg:144.11ms step:161/1480 train_time:21766ms step_avg:144.14ms step:162/1480 train_time:21913ms step_avg:144.16ms step:163/1480 train_time:22060ms step_avg:144.19ms step:164/1480 train_time:22207ms step_avg:144.20ms step:165/1480 train_time:22353ms step_avg:144.21ms step:166/1480 train_time:22500ms step_avg:144.23ms step:167/1480 train_time:22648ms step_avg:144.25ms step:168/1480 train_time:22795ms step_avg:144.27ms step:169/1480 train_time:22944ms step_avg:144.30ms step:170/1480 train_time:23091ms step_avg:144.32ms step:171/1480 train_time:23237ms step_avg:144.33ms step:172/1480 train_time:23384ms step_avg:144.34ms step:173/1480 train_time:23530ms step_avg:144.35ms step:174/1480 train_time:23676ms step_avg:144.36ms step:175/1480 train_time:23823ms step_avg:144.38ms step:176/1480 train_time:23971ms step_avg:144.40ms step:177/1480 train_time:24116ms step_avg:144.41ms step:178/1480 train_time:24264ms step_avg:144.43ms step:179/1480 train_time:24412ms step_avg:144.45ms step:180/1480 train_time:24558ms step_avg:144.46ms step:181/1480 train_time:24706ms step_avg:144.48ms step:182/1480 train_time:24853ms step_avg:144.49ms step:183/1480 train_time:25002ms step_avg:144.52ms step:184/1480 train_time:25148ms step_avg:144.53ms step:185/1480 train_time:25294ms step_avg:144.54ms step:186/1480 train_time:25441ms step_avg:144.55ms 
step:187/1480 train_time:25588ms step_avg:144.57ms step:188/1480 train_time:25735ms step_avg:144.58ms step:189/1480 train_time:25883ms step_avg:144.60ms step:190/1480 train_time:26030ms step_avg:144.61ms step:191/1480 train_time:26177ms step_avg:144.63ms step:192/1480 train_time:26325ms step_avg:144.64ms step:193/1480 train_time:26471ms step_avg:144.65ms step:194/1480 train_time:26617ms step_avg:144.66ms step:195/1480 train_time:26764ms step_avg:144.67ms step:196/1480 train_time:26911ms step_avg:144.68ms step:197/1480 train_time:27056ms step_avg:144.69ms step:198/1480 train_time:27206ms step_avg:144.71ms step:199/1480 train_time:27352ms step_avg:144.72ms step:200/1480 train_time:27501ms step_avg:144.74ms step:201/1480 train_time:27647ms step_avg:144.75ms step:202/1480 train_time:27794ms step_avg:144.76ms step:203/1480 train_time:27941ms step_avg:144.77ms step:204/1480 train_time:28089ms step_avg:144.79ms step:205/1480 train_time:28234ms step_avg:144.79ms step:206/1480 train_time:28383ms step_avg:144.81ms step:207/1480 train_time:28528ms step_avg:144.81ms step:208/1480 train_time:28674ms step_avg:144.82ms step:209/1480 train_time:28820ms step_avg:144.83ms step:210/1480 train_time:28968ms step_avg:144.84ms step:211/1480 train_time:29113ms step_avg:144.84ms step:212/1480 train_time:29260ms step_avg:144.85ms step:213/1480 train_time:29408ms step_avg:144.87ms step:214/1480 train_time:29554ms step_avg:144.87ms step:215/1480 train_time:29702ms step_avg:144.89ms step:216/1480 train_time:29849ms step_avg:144.90ms step:217/1480 train_time:29993ms step_avg:144.90ms step:218/1480 train_time:30141ms step_avg:144.91ms step:219/1480 train_time:30287ms step_avg:144.92ms step:220/1480 train_time:30433ms step_avg:144.92ms step:221/1480 train_time:30583ms step_avg:144.94ms step:222/1480 train_time:30733ms step_avg:144.97ms step:223/1480 train_time:30885ms step_avg:145.00ms step:224/1480 train_time:31036ms step_avg:145.03ms step:225/1480 train_time:31189ms step_avg:145.07ms 
step:226/1480 train_time:31337ms step_avg:145.08ms step:227/1480 train_time:31487ms step_avg:145.10ms step:228/1480 train_time:31636ms step_avg:145.12ms step:229/1480 train_time:31787ms step_avg:145.14ms step:230/1480 train_time:31936ms step_avg:145.16ms step:231/1480 train_time:32086ms step_avg:145.19ms step:232/1480 train_time:32235ms step_avg:145.20ms step:233/1480 train_time:32387ms step_avg:145.23ms step:234/1480 train_time:32537ms step_avg:145.25ms step:235/1480 train_time:32689ms step_avg:145.29ms step:236/1480 train_time:32839ms step_avg:145.30ms step:237/1480 train_time:32990ms step_avg:145.33ms step:238/1480 train_time:33140ms step_avg:145.35ms step:239/1480 train_time:33291ms step_avg:145.38ms step:240/1480 train_time:33442ms step_avg:145.40ms step:241/1480 train_time:33592ms step_avg:145.42ms step:242/1480 train_time:33744ms step_avg:145.45ms step:243/1480 train_time:33895ms step_avg:145.47ms step:244/1480 train_time:34047ms step_avg:145.50ms step:245/1480 train_time:34196ms step_avg:145.51ms step:246/1480 train_time:34346ms step_avg:145.53ms step:247/1480 train_time:34496ms step_avg:145.55ms step:248/1480 train_time:34647ms step_avg:145.57ms step:249/1480 train_time:34796ms step_avg:145.59ms step:250/1480 train_time:34946ms step_avg:145.61ms step:250/1480 val_loss:3.9942 train_time:35005ms step_avg:145.85ms step:251/1480 train_time:35104ms step_avg:145.66ms step:252/1480 train_time:35257ms step_avg:145.69ms step:253/1480 train_time:35408ms step_avg:145.71ms step:254/1480 train_time:35557ms step_avg:145.73ms step:255/1480 train_time:35707ms step_avg:145.74ms step:256/1480 train_time:35856ms step_avg:145.76ms step:257/1480 train_time:36005ms step_avg:145.77ms step:258/1480 train_time:36157ms step_avg:145.79ms step:259/1480 train_time:36309ms step_avg:145.82ms step:260/1480 train_time:36460ms step_avg:145.84ms step:261/1480 train_time:36612ms step_avg:145.86ms step:262/1480 train_time:36759ms step_avg:145.87ms step:263/1480 train_time:36910ms 
step_avg:145.89ms step:264/1480 train_time:37060ms step_avg:145.90ms step:265/1480 train_time:37211ms step_avg:145.93ms step:266/1480 train_time:37362ms step_avg:145.95ms step:267/1480 train_time:37514ms step_avg:145.97ms step:268/1480 train_time:37664ms step_avg:145.98ms step:269/1480 train_time:37815ms step_avg:146.00ms step:270/1480 train_time:37964ms step_avg:146.02ms step:271/1480 train_time:38115ms step_avg:146.03ms step:272/1480 train_time:38263ms step_avg:146.04ms step:273/1480 train_time:38415ms step_avg:146.06ms step:274/1480 train_time:38566ms step_avg:146.08ms step:275/1480 train_time:38716ms step_avg:146.10ms step:276/1480 train_time:38866ms step_avg:146.11ms step:277/1480 train_time:39017ms step_avg:146.13ms step:278/1480 train_time:39168ms step_avg:146.15ms step:279/1480 train_time:39317ms step_avg:146.16ms step:280/1480 train_time:39469ms step_avg:146.18ms step:281/1480 train_time:39619ms step_avg:146.20ms step:282/1480 train_time:39771ms step_avg:146.22ms step:283/1480 train_time:39922ms step_avg:146.23ms step:284/1480 train_time:40072ms step_avg:146.25ms step:285/1480 train_time:40222ms step_avg:146.26ms step:286/1480 train_time:40373ms step_avg:146.28ms step:287/1480 train_time:40522ms step_avg:146.29ms step:288/1480 train_time:40672ms step_avg:146.30ms step:289/1480 train_time:40823ms step_avg:146.32ms step:290/1480 train_time:40974ms step_avg:146.34ms step:291/1480 train_time:41126ms step_avg:146.36ms step:292/1480 train_time:41276ms step_avg:146.37ms step:293/1480 train_time:41427ms step_avg:146.39ms step:294/1480 train_time:41577ms step_avg:146.40ms step:295/1480 train_time:41729ms step_avg:146.42ms step:296/1480 train_time:41878ms step_avg:146.43ms step:297/1480 train_time:42030ms step_avg:146.44ms step:298/1480 train_time:42179ms step_avg:146.46ms step:299/1480 train_time:42331ms step_avg:146.48ms step:300/1480 train_time:42481ms step_avg:146.49ms step:301/1480 train_time:42632ms step_avg:146.50ms step:302/1480 train_time:42782ms 
step_avg:146.51ms step:303/1480 train_time:42933ms step_avg:146.53ms step:304/1480 train_time:43081ms step_avg:146.53ms step:305/1480 train_time:43235ms step_avg:146.56ms step:306/1480 train_time:43385ms step_avg:146.57ms step:307/1480 train_time:43536ms step_avg:146.59ms step:308/1480 train_time:43687ms step_avg:146.60ms step:309/1480 train_time:43837ms step_avg:146.61ms step:310/1480 train_time:43989ms step_avg:146.63ms step:311/1480 train_time:44139ms step_avg:146.64ms step:312/1480 train_time:44290ms step_avg:146.65ms step:313/1480 train_time:44440ms step_avg:146.67ms step:314/1480 train_time:44591ms step_avg:146.68ms step:315/1480 train_time:44740ms step_avg:146.69ms step:316/1480 train_time:44890ms step_avg:146.70ms step:317/1480 train_time:45040ms step_avg:146.71ms step:318/1480 train_time:45191ms step_avg:146.72ms step:319/1480 train_time:45340ms step_avg:146.73ms step:320/1480 train_time:45491ms step_avg:146.75ms step:321/1480 train_time:45641ms step_avg:146.76ms step:322/1480 train_time:45793ms step_avg:146.77ms step:323/1480 train_time:45942ms step_avg:146.78ms step:324/1480 train_time:46094ms step_avg:146.80ms step:325/1480 train_time:46243ms step_avg:146.80ms step:326/1480 train_time:46395ms step_avg:146.82ms step:327/1480 train_time:46544ms step_avg:146.83ms step:328/1480 train_time:46695ms step_avg:146.84ms step:329/1480 train_time:46845ms step_avg:146.85ms step:330/1480 train_time:46998ms step_avg:146.87ms step:331/1480 train_time:47151ms step_avg:146.89ms step:332/1480 train_time:47306ms step_avg:146.91ms step:333/1480 train_time:47459ms step_avg:146.93ms step:334/1480 train_time:47613ms step_avg:146.95ms step:335/1480 train_time:47767ms step_avg:146.97ms step:336/1480 train_time:47921ms step_avg:147.00ms step:337/1480 train_time:48075ms step_avg:147.02ms step:338/1480 train_time:48230ms step_avg:147.04ms step:339/1480 train_time:48383ms step_avg:147.06ms step:340/1480 train_time:48537ms step_avg:147.08ms step:341/1480 train_time:48690ms 
step_avg:147.10ms step:342/1480 train_time:48844ms step_avg:147.12ms step:343/1480 train_time:48999ms step_avg:147.14ms step:344/1480 train_time:49153ms step_avg:147.16ms step:345/1480 train_time:49307ms step_avg:147.19ms step:346/1480 train_time:49461ms step_avg:147.21ms step:347/1480 train_time:49615ms step_avg:147.23ms step:348/1480 train_time:49770ms step_avg:147.25ms step:349/1480 train_time:49923ms step_avg:147.27ms step:350/1480 train_time:50078ms step_avg:147.29ms step:351/1480 train_time:50232ms step_avg:147.31ms step:352/1480 train_time:50388ms step_avg:147.33ms step:353/1480 train_time:50542ms step_avg:147.35ms step:354/1480 train_time:50695ms step_avg:147.37ms step:355/1480 train_time:50848ms step_avg:147.39ms step:356/1480 train_time:51002ms step_avg:147.40ms step:357/1480 train_time:51155ms step_avg:147.42ms step:358/1480 train_time:51309ms step_avg:147.44ms step:359/1480 train_time:51464ms step_avg:147.46ms step:360/1480 train_time:51620ms step_avg:147.48ms step:361/1480 train_time:51773ms step_avg:147.50ms step:362/1480 train_time:51928ms step_avg:147.52ms step:363/1480 train_time:52081ms step_avg:147.54ms step:364/1480 train_time:52236ms step_avg:147.56ms step:365/1480 train_time:52390ms step_avg:147.58ms step:366/1480 train_time:52542ms step_avg:147.59ms step:367/1480 train_time:52695ms step_avg:147.61ms step:368/1480 train_time:52848ms step_avg:147.62ms step:369/1480 train_time:53000ms step_avg:147.63ms step:370/1480 train_time:53154ms step_avg:147.65ms step:371/1480 train_time:53308ms step_avg:147.67ms step:372/1480 train_time:53462ms step_avg:147.69ms step:373/1480 train_time:53616ms step_avg:147.70ms step:374/1480 train_time:53770ms step_avg:147.72ms step:375/1480 train_time:53924ms step_avg:147.74ms step:375/1480 val_loss:3.8155 train_time:53984ms step_avg:147.90ms step:376/1480 train_time:54081ms step_avg:147.76ms step:377/1480 train_time:54236ms step_avg:147.78ms step:378/1480 train_time:54388ms step_avg:147.79ms step:379/1480 
train_time:54540ms step_avg:147.81ms step:380/1480 train_time:54694ms step_avg:147.82ms step:381/1480 train_time:54846ms step_avg:147.83ms step:382/1480 train_time:55000ms step_avg:147.85ms step:383/1480 train_time:55156ms step_avg:147.87ms step:384/1480 train_time:55310ms step_avg:147.89ms step:385/1480 train_time:55463ms step_avg:147.90ms step:386/1480 train_time:55616ms step_avg:147.92ms step:387/1480 train_time:55769ms step_avg:147.93ms step:388/1480 train_time:55922ms step_avg:147.94ms step:389/1480 train_time:56075ms step_avg:147.96ms step:390/1480 train_time:56231ms step_avg:147.98ms step:391/1480 train_time:56386ms step_avg:147.99ms step:392/1480 train_time:56539ms step_avg:148.01ms step:393/1480 train_time:56692ms step_avg:148.02ms step:394/1480 train_time:56845ms step_avg:148.03ms step:395/1480 train_time:56998ms step_avg:148.05ms step:396/1480 train_time:57153ms step_avg:148.06ms step:397/1480 train_time:57307ms step_avg:148.08ms step:398/1480 train_time:57461ms step_avg:148.10ms step:399/1480 train_time:57615ms step_avg:148.11ms step:400/1480 train_time:57769ms step_avg:148.12ms step:401/1480 train_time:57921ms step_avg:148.14ms step:402/1480 train_time:58075ms step_avg:148.15ms step:403/1480 train_time:58228ms step_avg:148.16ms step:404/1480 train_time:58382ms step_avg:148.18ms step:405/1480 train_time:58535ms step_avg:148.19ms step:406/1480 train_time:58689ms step_avg:148.21ms step:407/1480 train_time:58843ms step_avg:148.22ms step:408/1480 train_time:58997ms step_avg:148.23ms step:409/1480 train_time:59151ms step_avg:148.25ms step:410/1480 train_time:59305ms step_avg:148.26ms step:411/1480 train_time:59459ms step_avg:148.28ms step:412/1480 train_time:59614ms step_avg:148.29ms step:413/1480 train_time:59767ms step_avg:148.31ms step:414/1480 train_time:59921ms step_avg:148.32ms step:415/1480 train_time:60073ms step_avg:148.33ms step:416/1480 train_time:60228ms step_avg:148.35ms step:417/1480 train_time:60381ms step_avg:148.36ms step:418/1480 
train_time:60535ms step_avg:148.37ms step:419/1480 train_time:60688ms step_avg:148.38ms step:420/1480 train_time:60841ms step_avg:148.39ms step:421/1480 train_time:60994ms step_avg:148.41ms step:422/1480 train_time:61147ms step_avg:148.42ms step:423/1480 train_time:61301ms step_avg:148.43ms step:424/1480 train_time:61455ms step_avg:148.44ms step:425/1480 train_time:61609ms step_avg:148.46ms step:426/1480 train_time:61763ms step_avg:148.47ms step:427/1480 train_time:61918ms step_avg:148.48ms step:428/1480 train_time:62072ms step_avg:148.50ms step:429/1480 train_time:62225ms step_avg:148.51ms step:430/1480 train_time:62379ms step_avg:148.52ms step:431/1480 train_time:62534ms step_avg:148.54ms step:432/1480 train_time:62688ms step_avg:148.55ms step:433/1480 train_time:62841ms step_avg:148.56ms step:434/1480 train_time:62996ms step_avg:148.58ms step:435/1480 train_time:63150ms step_avg:148.59ms step:436/1480 train_time:63304ms step_avg:148.60ms step:437/1480 train_time:63458ms step_avg:148.61ms step:438/1480 train_time:63613ms step_avg:148.63ms step:439/1480 train_time:63766ms step_avg:148.64ms step:440/1480 train_time:63921ms step_avg:148.65ms step:441/1480 train_time:64077ms step_avg:148.67ms step:442/1480 train_time:64235ms step_avg:148.69ms step:443/1480 train_time:64393ms step_avg:148.71ms step:444/1480 train_time:64549ms step_avg:148.73ms step:445/1480 train_time:64705ms step_avg:148.75ms step:446/1480 train_time:64860ms step_avg:148.76ms step:447/1480 train_time:65016ms step_avg:148.78ms step:448/1480 train_time:65173ms step_avg:148.80ms step:449/1480 train_time:65331ms step_avg:148.82ms step:450/1480 train_time:65490ms step_avg:148.84ms step:451/1480 train_time:65649ms step_avg:148.86ms step:452/1480 train_time:65806ms step_avg:148.88ms step:453/1480 train_time:65960ms step_avg:148.89ms step:454/1480 train_time:66118ms step_avg:148.91ms step:455/1480 train_time:66274ms step_avg:148.93ms step:456/1480 train_time:66431ms step_avg:148.95ms step:457/1480 
train_time:66588ms step_avg:148.97ms step:458/1480 train_time:66743ms step_avg:148.98ms step:459/1480 train_time:66901ms step_avg:149.00ms step:460/1480 train_time:67057ms step_avg:149.02ms step:461/1480 train_time:67216ms step_avg:149.04ms step:462/1480 train_time:67373ms step_avg:149.05ms step:463/1480 train_time:67530ms step_avg:149.07ms step:464/1480 train_time:67687ms step_avg:149.09ms step:465/1480 train_time:67842ms step_avg:149.10ms step:466/1480 train_time:67998ms step_avg:149.12ms step:467/1480 train_time:68155ms step_avg:149.14ms step:468/1480 train_time:68312ms step_avg:149.15ms step:469/1480 train_time:68468ms step_avg:149.17ms step:470/1480 train_time:68626ms step_avg:149.19ms step:471/1480 train_time:68782ms step_avg:149.20ms step:472/1480 train_time:68939ms step_avg:149.22ms step:473/1480 train_time:69095ms step_avg:149.23ms step:474/1480 train_time:69251ms step_avg:149.25ms step:475/1480 train_time:69407ms step_avg:149.26ms step:476/1480 train_time:69565ms step_avg:149.28ms step:477/1480 train_time:69721ms step_avg:149.30ms step:478/1480 train_time:69877ms step_avg:149.31ms step:479/1480 train_time:70035ms step_avg:149.33ms step:480/1480 train_time:70193ms step_avg:149.35ms step:481/1480 train_time:70349ms step_avg:149.36ms step:482/1480 train_time:70506ms step_avg:149.38ms step:483/1480 train_time:70661ms step_avg:149.39ms step:484/1480 train_time:70819ms step_avg:149.41ms step:485/1480 train_time:70975ms step_avg:149.42ms step:486/1480 train_time:71133ms step_avg:149.44ms step:487/1480 train_time:71290ms step_avg:149.46ms step:488/1480 train_time:71447ms step_avg:149.47ms step:489/1480 train_time:71602ms step_avg:149.48ms step:490/1480 train_time:71758ms step_avg:149.50ms step:491/1480 train_time:71914ms step_avg:149.51ms step:492/1480 train_time:72070ms step_avg:149.52ms step:493/1480 train_time:72229ms step_avg:149.54ms step:494/1480 train_time:72386ms step_avg:149.56ms step:495/1480 train_time:72542ms step_avg:149.57ms step:496/1480 
train_time:72701ms step_avg:149.59ms step:497/1480 train_time:72857ms step_avg:149.60ms step:498/1480 train_time:73015ms step_avg:149.62ms step:499/1480 train_time:73171ms step_avg:149.63ms step:500/1480 train_time:73330ms step_avg:149.65ms step:500/1480 val_loss:3.6879 train_time:73392ms step_avg:149.78ms step:501/1480 train_time:73490ms step_avg:149.67ms step:502/1480 train_time:73647ms step_avg:149.69ms step:503/1480 train_time:73803ms step_avg:149.70ms step:504/1480 train_time:73958ms step_avg:149.71ms step:505/1480 train_time:74115ms step_avg:149.73ms step:506/1480 train_time:74271ms step_avg:149.74ms step:507/1480 train_time:74429ms step_avg:149.76ms step:508/1480 train_time:74589ms step_avg:149.78ms step:509/1480 train_time:74746ms step_avg:149.79ms step:510/1480 train_time:74902ms step_avg:149.80ms step:511/1480 train_time:75059ms step_avg:149.82ms step:512/1480 train_time:75217ms step_avg:149.83ms step:513/1480 train_time:75374ms step_avg:149.85ms step:514/1480 train_time:75531ms step_avg:149.86ms step:515/1480 train_time:75688ms step_avg:149.88ms step:516/1480 train_time:75846ms step_avg:149.89ms step:517/1480 train_time:76002ms step_avg:149.90ms step:518/1480 train_time:76158ms step_avg:149.92ms step:519/1480 train_time:76315ms step_avg:149.93ms step:520/1480 train_time:76474ms step_avg:149.95ms step:521/1480 train_time:76630ms step_avg:149.96ms step:522/1480 train_time:76785ms step_avg:149.97ms step:523/1480 train_time:76943ms step_avg:149.99ms step:524/1480 train_time:77100ms step_avg:150.00ms step:525/1480 train_time:77256ms step_avg:150.01ms step:526/1480 train_time:77415ms step_avg:150.03ms step:527/1480 train_time:77571ms step_avg:150.04ms step:528/1480 train_time:77727ms step_avg:150.05ms step:529/1480 train_time:77884ms step_avg:150.07ms step:530/1480 train_time:78042ms step_avg:150.08ms step:531/1480 train_time:78200ms step_avg:150.10ms step:532/1480 train_time:78355ms step_avg:150.11ms step:533/1480 train_time:78512ms step_avg:150.12ms 
step:534/1480 train_time:78668ms step_avg:150.13ms step:535/1480 train_time:78824ms step_avg:150.14ms step:536/1480 train_time:78981ms step_avg:150.15ms step:537/1480 train_time:79138ms step_avg:150.17ms step:538/1480 train_time:79298ms step_avg:150.19ms step:539/1480 train_time:79458ms step_avg:150.20ms step:540/1480 train_time:79617ms step_avg:150.22ms step:541/1480 train_time:79774ms step_avg:150.23ms step:542/1480 train_time:79930ms step_avg:150.24ms step:543/1480 train_time:80086ms step_avg:150.26ms step:544/1480 train_time:80242ms step_avg:150.27ms step:545/1480 train_time:80399ms step_avg:150.28ms step:546/1480 train_time:80555ms step_avg:150.29ms step:547/1480 train_time:80712ms step_avg:150.30ms step:548/1480 train_time:80868ms step_avg:150.31ms step:549/1480 train_time:81024ms step_avg:150.32ms step:550/1480 train_time:81183ms step_avg:150.34ms step:551/1480 train_time:81341ms step_avg:150.35ms step:552/1480 train_time:81501ms step_avg:150.37ms step:553/1480 train_time:81662ms step_avg:150.39ms step:554/1480 train_time:81822ms step_avg:150.41ms step:555/1480 train_time:81983ms step_avg:150.43ms step:556/1480 train_time:82141ms step_avg:150.44ms step:557/1480 train_time:82302ms step_avg:150.46ms step:558/1480 train_time:82461ms step_avg:150.48ms step:559/1480 train_time:82621ms step_avg:150.49ms step:560/1480 train_time:82781ms step_avg:150.51ms step:561/1480 train_time:82940ms step_avg:150.53ms step:562/1480 train_time:83101ms step_avg:150.55ms step:563/1480 train_time:83260ms step_avg:150.56ms step:564/1480 train_time:83420ms step_avg:150.58ms step:565/1480 train_time:83580ms step_avg:150.59ms step:566/1480 train_time:83741ms step_avg:150.61ms step:567/1480 train_time:83900ms step_avg:150.63ms step:568/1480 train_time:84060ms step_avg:150.64ms step:569/1480 train_time:84220ms step_avg:150.66ms step:570/1480 train_time:84380ms step_avg:150.68ms step:571/1480 train_time:84540ms step_avg:150.69ms step:572/1480 train_time:84700ms step_avg:150.71ms 
step:573/1480 train_time:84861ms step_avg:150.73ms step:574/1480 train_time:85022ms step_avg:150.75ms step:575/1480 train_time:85182ms step_avg:150.76ms step:576/1480 train_time:85341ms step_avg:150.78ms step:577/1480 train_time:85500ms step_avg:150.79ms step:578/1480 train_time:85659ms step_avg:150.81ms step:579/1480 train_time:85819ms step_avg:150.82ms step:580/1480 train_time:85978ms step_avg:150.84ms step:581/1480 train_time:86137ms step_avg:150.85ms step:582/1480 train_time:86298ms step_avg:150.87ms step:583/1480 train_time:86458ms step_avg:150.89ms step:584/1480 train_time:86618ms step_avg:150.90ms step:585/1480 train_time:86778ms step_avg:150.92ms step:586/1480 train_time:86940ms step_avg:150.94ms step:587/1480 train_time:87100ms step_avg:150.95ms step:588/1480 train_time:87258ms step_avg:150.97ms step:589/1480 train_time:87420ms step_avg:150.98ms step:590/1480 train_time:87581ms step_avg:151.00ms step:591/1480 train_time:87739ms step_avg:151.01ms step:592/1480 train_time:87900ms step_avg:151.03ms step:593/1480 train_time:88061ms step_avg:151.05ms step:594/1480 train_time:88223ms step_avg:151.07ms step:595/1480 train_time:88385ms step_avg:151.09ms step:596/1480 train_time:88545ms step_avg:151.10ms step:597/1480 train_time:88704ms step_avg:151.11ms step:598/1480 train_time:88862ms step_avg:151.13ms step:599/1480 train_time:89020ms step_avg:151.14ms step:600/1480 train_time:89180ms step_avg:151.15ms step:601/1480 train_time:89338ms step_avg:151.16ms step:602/1480 train_time:89498ms step_avg:151.18ms step:603/1480 train_time:89658ms step_avg:151.19ms step:604/1480 train_time:89819ms step_avg:151.21ms step:605/1480 train_time:89979ms step_avg:151.23ms step:606/1480 train_time:90141ms step_avg:151.24ms step:607/1480 train_time:90304ms step_avg:151.26ms step:608/1480 train_time:90463ms step_avg:151.28ms step:609/1480 train_time:90623ms step_avg:151.29ms step:610/1480 train_time:90782ms step_avg:151.30ms step:611/1480 train_time:90941ms step_avg:151.32ms 
step:612/1480 train_time:91102ms step_avg:151.33ms step:613/1480 train_time:91263ms step_avg:151.35ms step:614/1480 train_time:91423ms step_avg:151.36ms step:615/1480 train_time:91582ms step_avg:151.38ms step:616/1480 train_time:91741ms step_avg:151.39ms step:617/1480 train_time:91901ms step_avg:151.40ms step:618/1480 train_time:92061ms step_avg:151.42ms step:619/1480 train_time:92221ms step_avg:151.43ms step:620/1480 train_time:92381ms step_avg:151.44ms step:621/1480 train_time:92540ms step_avg:151.46ms step:622/1480 train_time:92699ms step_avg:151.47ms step:623/1480 train_time:92858ms step_avg:151.48ms step:624/1480 train_time:93018ms step_avg:151.50ms step:625/1480 train_time:93178ms step_avg:151.51ms step:625/1480 val_loss:3.6071 train_time:93242ms step_avg:151.61ms step:626/1480 train_time:93341ms step_avg:151.53ms step:627/1480 train_time:93500ms step_avg:151.54ms step:628/1480 train_time:93659ms step_avg:151.55ms step:629/1480 train_time:93818ms step_avg:151.56ms step:630/1480 train_time:93975ms step_avg:151.57ms step:631/1480 train_time:94132ms step_avg:151.58ms step:632/1480 train_time:94292ms step_avg:151.59ms step:633/1480 train_time:94451ms step_avg:151.61ms step:634/1480 train_time:94611ms step_avg:151.62ms step:635/1480 train_time:94771ms step_avg:151.63ms step:636/1480 train_time:94930ms step_avg:151.65ms step:637/1480 train_time:95092ms step_avg:151.66ms step:638/1480 train_time:95251ms step_avg:151.67ms step:639/1480 train_time:95409ms step_avg:151.68ms step:640/1480 train_time:95569ms step_avg:151.70ms step:641/1480 train_time:95728ms step_avg:151.71ms step:642/1480 train_time:95889ms step_avg:151.72ms step:643/1480 train_time:96049ms step_avg:151.74ms step:644/1480 train_time:96208ms step_avg:151.75ms step:645/1480 train_time:96367ms step_avg:151.76ms step:646/1480 train_time:96526ms step_avg:151.77ms step:647/1480 train_time:96685ms step_avg:151.78ms step:648/1480 train_time:96846ms step_avg:151.80ms step:649/1480 train_time:97006ms 
step_avg:151.81ms step:650/1480 train_time:97166ms step_avg:151.82ms step:651/1480 train_time:97326ms step_avg:151.84ms step:652/1480 train_time:97487ms step_avg:151.85ms step:653/1480 train_time:97645ms step_avg:151.86ms step:654/1480 train_time:97804ms step_avg:151.87ms step:655/1480 train_time:97965ms step_avg:151.88ms step:656/1480 train_time:98125ms step_avg:151.90ms step:657/1480 train_time:98285ms step_avg:151.91ms step:658/1480 train_time:98444ms step_avg:151.92ms step:659/1480 train_time:98605ms step_avg:151.93ms step:660/1480 train_time:98767ms step_avg:151.95ms step:661/1480 train_time:98930ms step_avg:151.97ms step:662/1480 train_time:99092ms step_avg:151.98ms step:663/1480 train_time:99253ms step_avg:151.99ms step:664/1480 train_time:99414ms step_avg:152.01ms step:665/1480 train_time:99576ms step_avg:152.03ms step:666/1480 train_time:99736ms step_avg:152.04ms step:667/1480 train_time:99898ms step_avg:152.05ms step:668/1480 train_time:100059ms step_avg:152.06ms step:669/1480 train_time:100221ms step_avg:152.08ms step:670/1480 train_time:100382ms step_avg:152.09ms step:671/1480 train_time:100543ms step_avg:152.11ms step:672/1480 train_time:100704ms step_avg:152.12ms step:673/1480 train_time:100868ms step_avg:152.14ms step:674/1480 train_time:101031ms step_avg:152.15ms step:675/1480 train_time:101194ms step_avg:152.17ms step:676/1480 train_time:101356ms step_avg:152.19ms step:677/1480 train_time:101516ms step_avg:152.20ms step:678/1480 train_time:101678ms step_avg:152.21ms step:679/1480 train_time:101840ms step_avg:152.23ms step:680/1480 train_time:102001ms step_avg:152.24ms step:681/1480 train_time:102161ms step_avg:152.25ms step:682/1480 train_time:102324ms step_avg:152.27ms step:683/1480 train_time:102487ms step_avg:152.28ms step:684/1480 train_time:102650ms step_avg:152.30ms step:685/1480 train_time:102813ms step_avg:152.32ms step:686/1480 train_time:102974ms step_avg:152.33ms step:687/1480 train_time:103133ms step_avg:152.34ms step:688/1480 
train_time:103296ms step_avg:152.35ms step:689/1480 train_time:103460ms step_avg:152.37ms step:690/1480 train_time:103623ms step_avg:152.39ms step:691/1480 train_time:103784ms step_avg:152.40ms step:692/1480 train_time:103945ms step_avg:152.41ms step:693/1480 train_time:104106ms step_avg:152.42ms step:694/1480 train_time:104268ms step_avg:152.44ms step:695/1480 train_time:104429ms step_avg:152.45ms step:696/1480 train_time:104591ms step_avg:152.47ms step:697/1480 train_time:104754ms step_avg:152.48ms step:698/1480 train_time:104915ms step_avg:152.49ms step:699/1480 train_time:105076ms step_avg:152.51ms step:700/1480 train_time:105237ms step_avg:152.52ms step:701/1480 train_time:105398ms step_avg:152.53ms step:702/1480 train_time:105559ms step_avg:152.54ms step:703/1480 train_time:105720ms step_avg:152.55ms step:704/1480 train_time:105880ms step_avg:152.56ms step:705/1480 train_time:106044ms step_avg:152.58ms step:706/1480 train_time:106209ms step_avg:152.60ms step:707/1480 train_time:106370ms step_avg:152.61ms step:708/1480 train_time:106531ms step_avg:152.62ms step:709/1480 train_time:106694ms step_avg:152.64ms step:710/1480 train_time:106854ms step_avg:152.65ms step:711/1480 train_time:107015ms step_avg:152.66ms step:712/1480 train_time:107180ms step_avg:152.68ms step:713/1480 train_time:107342ms step_avg:152.69ms step:714/1480 train_time:107502ms step_avg:152.70ms step:715/1480 train_time:107663ms step_avg:152.71ms step:716/1480 train_time:107824ms step_avg:152.73ms step:717/1480 train_time:107988ms step_avg:152.74ms step:718/1480 train_time:108148ms step_avg:152.75ms step:719/1480 train_time:108307ms step_avg:152.76ms step:720/1480 train_time:108473ms step_avg:152.78ms step:721/1480 train_time:108635ms step_avg:152.79ms step:722/1480 train_time:108797ms step_avg:152.80ms step:723/1480 train_time:108956ms step_avg:152.81ms step:724/1480 train_time:109117ms step_avg:152.83ms step:725/1480 train_time:109280ms step_avg:152.84ms step:726/1480 train_time:109444ms 
step_avg:152.85ms step:727/1480 train_time:109608ms step_avg:152.87ms step:728/1480 train_time:109770ms step_avg:152.88ms step:729/1480 train_time:109931ms step_avg:152.89ms step:730/1480 train_time:110095ms step_avg:152.91ms step:731/1480 train_time:110256ms step_avg:152.92ms step:732/1480 train_time:110416ms step_avg:152.93ms step:733/1480 train_time:110578ms step_avg:152.94ms step:734/1480 train_time:110740ms step_avg:152.96ms step:735/1480 train_time:110900ms step_avg:152.96ms step:736/1480 train_time:111061ms step_avg:152.98ms step:737/1480 train_time:111223ms step_avg:152.99ms step:738/1480 train_time:111385ms step_avg:153.00ms step:739/1480 train_time:111546ms step_avg:153.01ms step:740/1480 train_time:111711ms step_avg:153.03ms step:741/1480 train_time:111874ms step_avg:153.04ms step:742/1480 train_time:112034ms step_avg:153.05ms step:743/1480 train_time:112196ms step_avg:153.06ms step:744/1480 train_time:112358ms step_avg:153.08ms step:745/1480 train_time:112520ms step_avg:153.09ms step:746/1480 train_time:112680ms step_avg:153.10ms step:747/1480 train_time:112841ms step_avg:153.11ms step:748/1480 train_time:113008ms step_avg:153.13ms step:749/1480 train_time:113173ms step_avg:153.14ms step:750/1480 train_time:113332ms step_avg:153.15ms step:750/1480 val_loss:3.5500 train_time:113396ms step_avg:153.24ms step:751/1480 train_time:113498ms step_avg:153.17ms step:752/1480 train_time:113659ms step_avg:153.18ms step:753/1480 train_time:113820ms step_avg:153.19ms step:754/1480 train_time:113979ms step_avg:153.20ms step:755/1480 train_time:114142ms step_avg:153.21ms step:756/1480 train_time:114302ms step_avg:153.22ms step:757/1480 train_time:114467ms step_avg:153.24ms step:758/1480 train_time:114629ms step_avg:153.25ms step:759/1480 train_time:114791ms step_avg:153.26ms step:760/1480 train_time:114954ms step_avg:153.27ms step:761/1480 train_time:115117ms step_avg:153.28ms step:762/1480 train_time:115278ms step_avg:153.30ms step:763/1480 train_time:115440ms 
step_avg:153.31ms step:764/1480 train_time:115601ms step_avg:153.32ms step:765/1480 train_time:115762ms step_avg:153.33ms step:766/1480 train_time:115926ms step_avg:153.34ms step:767/1480 train_time:116089ms step_avg:153.35ms step:768/1480 train_time:116252ms step_avg:153.37ms step:769/1480 train_time:116416ms step_avg:153.38ms step:770/1480 train_time:116579ms step_avg:153.39ms step:771/1480 train_time:116742ms step_avg:153.41ms step:772/1480 train_time:116903ms step_avg:153.42ms step:773/1480 train_time:117064ms step_avg:153.43ms step:774/1480 train_time:117226ms step_avg:153.44ms step:775/1480 train_time:117389ms step_avg:153.45ms step:776/1480 train_time:117555ms step_avg:153.47ms step:777/1480 train_time:117721ms step_avg:153.48ms step:778/1480 train_time:117884ms step_avg:153.50ms step:779/1480 train_time:118046ms step_avg:153.51ms step:780/1480 train_time:118210ms step_avg:153.52ms step:781/1480 train_time:118375ms step_avg:153.53ms step:782/1480 train_time:118539ms step_avg:153.55ms step:783/1480 train_time:118699ms step_avg:153.56ms step:784/1480 train_time:118863ms step_avg:153.57ms step:785/1480 train_time:119025ms step_avg:153.58ms step:786/1480 train_time:119191ms step_avg:153.60ms step:787/1480 train_time:119356ms step_avg:153.61ms step:788/1480 train_time:119519ms step_avg:153.62ms step:789/1480 train_time:119679ms step_avg:153.63ms step:790/1480 train_time:119844ms step_avg:153.65ms step:791/1480 train_time:120012ms step_avg:153.66ms step:792/1480 train_time:120178ms step_avg:153.68ms step:793/1480 train_time:120339ms step_avg:153.69ms step:794/1480 train_time:120503ms step_avg:153.70ms step:795/1480 train_time:120667ms step_avg:153.72ms step:796/1480 train_time:120833ms step_avg:153.73ms step:797/1480 train_time:120997ms step_avg:153.74ms step:798/1480 train_time:121161ms step_avg:153.76ms step:799/1480 train_time:121329ms step_avg:153.78ms step:800/1480 train_time:121493ms step_avg:153.79ms step:801/1480 train_time:121657ms step_avg:153.80ms 
step:802/1480 train_time:121822ms step_avg:153.82ms step:803/1480 train_time:121984ms step_avg:153.83ms step:804/1480 train_time:122147ms step_avg:153.84ms step:805/1480 train_time:122314ms step_avg:153.85ms step:806/1480 train_time:122476ms step_avg:153.86ms step:807/1480 train_time:122637ms step_avg:153.87ms step:808/1480 train_time:122799ms step_avg:153.88ms step:809/1480 train_time:122962ms step_avg:153.89ms step:810/1480 train_time:123122ms step_avg:153.90ms step:811/1480 train_time:123283ms step_avg:153.91ms step:812/1480 train_time:123446ms step_avg:153.92ms step:813/1480 train_time:123609ms step_avg:153.93ms step:814/1480 train_time:123772ms step_avg:153.94ms step:815/1480 train_time:123936ms step_avg:153.96ms step:816/1480 train_time:124098ms step_avg:153.97ms step:817/1480 train_time:124261ms step_avg:153.98ms step:818/1480 train_time:124422ms step_avg:153.99ms step:819/1480 train_time:124586ms step_avg:154.00ms step:820/1480 train_time:124751ms step_avg:154.01ms step:821/1480 train_time:124914ms step_avg:154.03ms step:822/1480 train_time:125077ms step_avg:154.04ms step:823/1480 train_time:125239ms step_avg:154.05ms step:824/1480 train_time:125400ms step_avg:154.05ms step:825/1480 train_time:125565ms step_avg:154.07ms step:826/1480 train_time:125732ms step_avg:154.08ms step:827/1480 train_time:125897ms step_avg:154.10ms step:828/1480 train_time:126060ms step_avg:154.11ms step:829/1480 train_time:126223ms step_avg:154.12ms step:830/1480 train_time:126387ms step_avg:154.13ms step:831/1480 train_time:126553ms step_avg:154.15ms step:832/1480 train_time:126718ms step_avg:154.16ms step:833/1480 train_time:126881ms step_avg:154.17ms step:834/1480 train_time:127044ms step_avg:154.18ms step:835/1480 train_time:127208ms step_avg:154.19ms step:836/1480 train_time:127374ms step_avg:154.21ms step:837/1480 train_time:127537ms step_avg:154.22ms step:838/1480 train_time:127699ms step_avg:154.23ms step:839/1480 train_time:127863ms step_avg:154.24ms step:840/1480 
train_time:128024ms step_avg:154.25ms step:841/1480 train_time:128185ms step_avg:154.25ms step:842/1480 train_time:128350ms step_avg:154.27ms step:843/1480 train_time:128513ms step_avg:154.28ms step:844/1480 train_time:128676ms step_avg:154.29ms step:845/1480 train_time:128839ms step_avg:154.30ms step:846/1480 train_time:129002ms step_avg:154.31ms step:847/1480 train_time:129166ms step_avg:154.32ms step:848/1480 train_time:129330ms step_avg:154.33ms step:849/1480 train_time:129492ms step_avg:154.34ms step:850/1480 train_time:129655ms step_avg:154.35ms step:851/1480 train_time:129820ms step_avg:154.36ms step:852/1480 train_time:129981ms step_avg:154.37ms step:853/1480 train_time:130143ms step_avg:154.38ms step:854/1480 train_time:130307ms step_avg:154.39ms step:855/1480 train_time:130471ms step_avg:154.40ms step:856/1480 train_time:130633ms step_avg:154.41ms step:857/1480 train_time:130798ms step_avg:154.42ms step:858/1480 train_time:130962ms step_avg:154.44ms step:859/1480 train_time:131126ms step_avg:154.45ms step:860/1480 train_time:131288ms step_avg:154.46ms step:861/1480 train_time:131454ms step_avg:154.47ms step:862/1480 train_time:131621ms step_avg:154.48ms step:863/1480 train_time:131787ms step_avg:154.50ms step:864/1480 train_time:131952ms step_avg:154.51ms step:865/1480 train_time:132114ms step_avg:154.52ms step:866/1480 train_time:132281ms step_avg:154.53ms step:867/1480 train_time:132444ms step_avg:154.54ms step:868/1480 train_time:132606ms step_avg:154.55ms step:869/1480 train_time:132769ms step_avg:154.56ms step:870/1480 train_time:132935ms step_avg:154.58ms step:871/1480 train_time:133097ms step_avg:154.58ms step:872/1480 train_time:133261ms step_avg:154.59ms step:873/1480 train_time:133423ms step_avg:154.60ms step:874/1480 train_time:133588ms step_avg:154.62ms step:875/1480 train_time:133753ms step_avg:154.63ms step:875/1480 val_loss:3.5046 train_time:133817ms step_avg:154.70ms step:876/1480 train_time:133917ms step_avg:154.64ms step:877/1480 
train_time:134081ms step_avg:154.65ms step:878/1480 train_time:134244ms step_avg:154.66ms step:879/1480 train_time:134410ms step_avg:154.67ms step:880/1480 train_time:134573ms step_avg:154.68ms step:881/1480 train_time:134734ms step_avg:154.69ms step:882/1480 train_time:134899ms step_avg:154.70ms step:883/1480 train_time:135064ms step_avg:154.71ms step:884/1480 train_time:135231ms step_avg:154.73ms step:885/1480 train_time:135396ms step_avg:154.74ms step:886/1480 train_time:135562ms step_avg:154.75ms step:887/1480 train_time:135731ms step_avg:154.77ms step:888/1480 train_time:135904ms step_avg:154.79ms step:889/1480 train_time:136072ms step_avg:154.80ms step:890/1480 train_time:136233ms step_avg:154.81ms step:891/1480 train_time:136398ms step_avg:154.82ms step:892/1480 train_time:136562ms step_avg:154.83ms step:893/1480 train_time:136726ms step_avg:154.84ms step:894/1480 train_time:136895ms step_avg:154.86ms step:895/1480 train_time:137059ms step_avg:154.87ms step:896/1480 train_time:137223ms step_avg:154.88ms step:897/1480 train_time:137390ms step_avg:154.89ms step:898/1480 train_time:137556ms step_avg:154.91ms step:899/1480 train_time:137720ms step_avg:154.92ms step:900/1480 train_time:137883ms step_avg:154.92ms step:901/1480 train_time:138049ms step_avg:154.94ms step:902/1480 train_time:138215ms step_avg:154.95ms step:903/1480 train_time:138386ms step_avg:154.97ms step:904/1480 train_time:138552ms step_avg:154.98ms step:905/1480 train_time:138714ms step_avg:154.99ms step:906/1480 train_time:138879ms step_avg:155.00ms step:907/1480 train_time:139047ms step_avg:155.01ms step:908/1480 train_time:139210ms step_avg:155.02ms step:909/1480 train_time:139374ms step_avg:155.03ms step:910/1480 train_time:139545ms step_avg:155.05ms step:911/1480 train_time:139711ms step_avg:155.06ms step:912/1480 train_time:139876ms step_avg:155.07ms step:913/1480 train_time:140043ms step_avg:155.09ms step:914/1480 train_time:140210ms step_avg:155.10ms step:915/1480 train_time:140378ms 
step_avg:155.11ms step:916/1480 train_time:140543ms step_avg:155.12ms step:917/1480 train_time:140708ms step_avg:155.14ms step:918/1480 train_time:140877ms step_avg:155.15ms step:919/1480 train_time:141047ms step_avg:155.17ms step:920/1480 train_time:141213ms step_avg:155.18ms step:921/1480 train_time:141377ms step_avg:155.19ms step:922/1480 train_time:141545ms step_avg:155.20ms step:923/1480 train_time:141708ms step_avg:155.21ms step:924/1480 train_time:141873ms step_avg:155.22ms step:925/1480 train_time:142037ms step_avg:155.23ms step:926/1480 train_time:142200ms step_avg:155.24ms step:927/1480 train_time:142365ms step_avg:155.25ms step:928/1480 train_time:142531ms step_avg:155.26ms step:929/1480 train_time:142695ms step_avg:155.27ms step:930/1480 train_time:142860ms step_avg:155.28ms step:931/1480 train_time:143022ms step_avg:155.29ms step:932/1480 train_time:143189ms step_avg:155.30ms step:933/1480 train_time:143355ms step_avg:155.31ms step:934/1480 train_time:143523ms step_avg:155.33ms step:935/1480 train_time:143695ms step_avg:155.35ms step:936/1480 train_time:143861ms step_avg:155.36ms step:937/1480 train_time:144033ms step_avg:155.38ms step:938/1480 train_time:144196ms step_avg:155.38ms step:939/1480 train_time:144365ms step_avg:155.40ms step:940/1480 train_time:144531ms step_avg:155.41ms step:941/1480 train_time:144695ms step_avg:155.42ms step:942/1480 train_time:144860ms step_avg:155.43ms step:943/1480 train_time:145031ms step_avg:155.45ms step:944/1480 train_time:145202ms step_avg:155.46ms step:945/1480 train_time:145366ms step_avg:155.47ms step:946/1480 train_time:145536ms step_avg:155.49ms step:947/1480 train_time:145703ms step_avg:155.50ms step:948/1480 train_time:145870ms step_avg:155.51ms step:949/1480 train_time:146036ms step_avg:155.52ms step:950/1480 train_time:146198ms step_avg:155.53ms step:951/1480 train_time:146367ms step_avg:155.54ms step:952/1480 train_time:146534ms step_avg:155.56ms step:953/1480 train_time:146700ms step_avg:155.57ms 
step:954/1480 train_time:146870ms step_avg:155.58ms step:955/1480 train_time:147034ms step_avg:155.59ms step:956/1480 train_time:147199ms step_avg:155.60ms step:957/1480 train_time:147368ms step_avg:155.62ms step:958/1480 train_time:147536ms step_avg:155.63ms step:959/1480 train_time:147700ms step_avg:155.64ms step:960/1480 train_time:147868ms step_avg:155.65ms step:961/1480 train_time:148034ms step_avg:155.66ms step:962/1480 train_time:148197ms step_avg:155.67ms step:963/1480 train_time:148362ms step_avg:155.68ms step:964/1480 train_time:148531ms step_avg:155.69ms step:965/1480 train_time:148695ms step_avg:155.70ms step:966/1480 train_time:148859ms step_avg:155.71ms step:967/1480 train_time:149022ms step_avg:155.72ms step:968/1480 train_time:149188ms step_avg:155.73ms step:969/1480 train_time:149354ms step_avg:155.74ms step:970/1480 train_time:149517ms step_avg:155.75ms step:971/1480 train_time:149684ms step_avg:155.76ms step:972/1480 train_time:149849ms step_avg:155.77ms step:973/1480 train_time:150014ms step_avg:155.78ms step:974/1480 train_time:150182ms step_avg:155.79ms step:975/1480 train_time:150348ms step_avg:155.80ms step:976/1480 train_time:150513ms step_avg:155.81ms step:977/1480 train_time:150678ms step_avg:155.82ms step:978/1480 train_time:150842ms step_avg:155.83ms step:979/1480 train_time:151009ms step_avg:155.84ms step:980/1480 train_time:151175ms step_avg:155.85ms step:981/1480 train_time:151344ms step_avg:155.86ms step:982/1480 train_time:151507ms step_avg:155.87ms step:983/1480 train_time:151673ms step_avg:155.88ms step:984/1480 train_time:151837ms step_avg:155.89ms step:985/1480 train_time:152004ms step_avg:155.90ms step:986/1480 train_time:152169ms step_avg:155.91ms step:987/1480 train_time:152334ms step_avg:155.92ms step:988/1480 train_time:152499ms step_avg:155.93ms step:989/1480 train_time:152663ms step_avg:155.94ms step:990/1480 train_time:152833ms step_avg:155.95ms step:991/1480 train_time:153000ms step_avg:155.96ms step:992/1480 
train_time:153177ms step_avg:155.98ms step:993/1480 train_time:153354ms step_avg:156.01ms step:994/1480 train_time:153519ms step_avg:156.02ms step:995/1480 train_time:153684ms step_avg:156.02ms step:996/1480 train_time:153847ms step_avg:156.03ms step:997/1480 train_time:154012ms step_avg:156.04ms step:998/1480 train_time:154175ms step_avg:156.05ms step:999/1480 train_time:154340ms step_avg:156.06ms step:1000/1480 train_time:154511ms step_avg:156.07ms step:1000/1480 val_loss:3.4399 train_time:154578ms step_avg:156.14ms step:1001/1480 train_time:154678ms step_avg:156.08ms step:1002/1480 train_time:154847ms step_avg:156.10ms step:1003/1480 train_time:155017ms step_avg:156.11ms step:1004/1480 train_time:155187ms step_avg:156.12ms step:1005/1480 train_time:155356ms step_avg:156.14ms step:1006/1480 train_time:155524ms step_avg:156.15ms step:1007/1480 train_time:155690ms step_avg:156.16ms step:1008/1480 train_time:155857ms step_avg:156.17ms step:1009/1480 train_time:156032ms step_avg:156.19ms step:1010/1480 train_time:156197ms step_avg:156.20ms step:1011/1480 train_time:156363ms step_avg:156.21ms step:1012/1480 train_time:156530ms step_avg:156.22ms step:1013/1480 train_time:156699ms step_avg:156.23ms step:1014/1480 train_time:156867ms step_avg:156.24ms step:1015/1480 train_time:157036ms step_avg:156.25ms step:1016/1480 train_time:157203ms step_avg:156.27ms step:1017/1480 train_time:157375ms step_avg:156.28ms step:1018/1480 train_time:157545ms step_avg:156.29ms step:1019/1480 train_time:157714ms step_avg:156.31ms step:1020/1480 train_time:157882ms step_avg:156.32ms step:1021/1480 train_time:158048ms step_avg:156.33ms step:1022/1480 train_time:158215ms step_avg:156.34ms step:1023/1480 train_time:158382ms step_avg:156.35ms step:1024/1480 train_time:158550ms step_avg:156.36ms step:1025/1480 train_time:158720ms step_avg:156.37ms step:1026/1480 train_time:158886ms step_avg:156.38ms step:1027/1480 train_time:159052ms step_avg:156.39ms step:1028/1480 train_time:159224ms 
step_avg:156.41ms step:1029/1480 train_time:159397ms step_avg:156.42ms step:1030/1480 train_time:159567ms step_avg:156.44ms step:1031/1480 train_time:159732ms step_avg:156.45ms step:1032/1480 train_time:159905ms step_avg:156.46ms step:1033/1480 train_time:160073ms step_avg:156.47ms step:1034/1480 train_time:160240ms step_avg:156.48ms step:1035/1480 train_time:160409ms step_avg:156.50ms step:1036/1480 train_time:160574ms step_avg:156.50ms step:1037/1480 train_time:160740ms step_avg:156.51ms step:1038/1480 train_time:160909ms step_avg:156.53ms step:1039/1480 train_time:161079ms step_avg:156.54ms step:1040/1480 train_time:161246ms step_avg:156.55ms step:1041/1480 train_time:161415ms step_avg:156.56ms step:1042/1480 train_time:161578ms step_avg:156.57ms step:1043/1480 train_time:161742ms step_avg:156.58ms step:1044/1480 train_time:161908ms step_avg:156.58ms step:1045/1480 train_time:162077ms step_avg:156.60ms step:1046/1480 train_time:162244ms step_avg:156.61ms step:1047/1480 train_time:162411ms step_avg:156.62ms step:1048/1480 train_time:162577ms step_avg:156.63ms step:1049/1480 train_time:162743ms step_avg:156.63ms step:1050/1480 train_time:162913ms step_avg:156.65ms step:1051/1480 train_time:163081ms step_avg:156.66ms step:1052/1480 train_time:163249ms step_avg:156.67ms step:1053/1480 train_time:163416ms step_avg:156.68ms step:1054/1480 train_time:163583ms step_avg:156.69ms step:1055/1480 train_time:163750ms step_avg:156.70ms step:1056/1480 train_time:163914ms step_avg:156.71ms step:1057/1480 train_time:164081ms step_avg:156.71ms step:1058/1480 train_time:164251ms step_avg:156.73ms step:1059/1480 train_time:164424ms step_avg:156.74ms step:1060/1480 train_time:164592ms step_avg:156.75ms step:1061/1480 train_time:164755ms step_avg:156.76ms step:1062/1480 train_time:164919ms step_avg:156.77ms step:1063/1480 train_time:165083ms step_avg:156.77ms step:1064/1480 train_time:165249ms step_avg:156.78ms step:1065/1480 train_time:165415ms step_avg:156.79ms step:1066/1480 
train_time:165582ms step_avg:156.80ms step:1067/1480 train_time:165754ms step_avg:156.82ms step:1068/1480 train_time:165920ms step_avg:156.82ms step:1069/1480 train_time:166091ms step_avg:156.84ms step:1070/1480 train_time:166257ms step_avg:156.85ms step:1071/1480 train_time:166430ms step_avg:156.86ms step:1072/1480 train_time:166596ms step_avg:156.87ms step:1073/1480 train_time:166759ms step_avg:156.88ms step:1074/1480 train_time:166927ms step_avg:156.89ms step:1075/1480 train_time:167097ms step_avg:156.90ms step:1076/1480 train_time:167264ms step_avg:156.91ms step:1077/1480 train_time:167430ms step_avg:156.92ms step:1078/1480 train_time:167604ms step_avg:156.93ms step:1079/1480 train_time:167775ms step_avg:156.95ms step:1080/1480 train_time:167945ms step_avg:156.96ms step:1081/1480 train_time:168113ms step_avg:156.97ms step:1082/1480 train_time:168278ms step_avg:156.98ms step:1083/1480 train_time:168444ms step_avg:156.98ms step:1084/1480 train_time:168612ms step_avg:156.99ms step:1085/1480 train_time:168780ms step_avg:157.00ms step:1086/1480 train_time:168949ms step_avg:157.02ms step:1087/1480 train_time:169116ms step_avg:157.03ms step:1088/1480 train_time:169287ms step_avg:157.04ms step:1089/1480 train_time:169457ms step_avg:157.05ms step:1090/1480 train_time:169630ms step_avg:157.06ms step:1091/1480 train_time:169798ms step_avg:157.07ms step:1092/1480 train_time:169967ms step_avg:157.09ms step:1093/1480 train_time:170134ms step_avg:157.10ms step:1094/1480 train_time:170300ms step_avg:157.10ms step:1095/1480 train_time:170465ms step_avg:157.11ms step:1096/1480 train_time:170635ms step_avg:157.12ms step:1097/1480 train_time:170802ms step_avg:157.13ms step:1098/1480 train_time:170973ms step_avg:157.14ms step:1099/1480 train_time:171145ms step_avg:157.16ms step:1100/1480 train_time:171316ms step_avg:157.17ms step:1101/1480 train_time:171486ms step_avg:157.18ms step:1102/1480 train_time:171658ms step_avg:157.20ms step:1103/1480 train_time:171835ms step_avg:157.21ms 
step:1104/1480 train_time:172003ms step_avg:157.22ms step:1105/1480 train_time:172174ms step_avg:157.24ms step:1106/1480 train_time:172342ms step_avg:157.25ms step:1107/1480 train_time:172512ms step_avg:157.26ms step:1108/1480 train_time:172676ms step_avg:157.26ms step:1109/1480 train_time:172843ms step_avg:157.27ms step:1110/1480 train_time:173009ms step_avg:157.28ms step:1111/1480 train_time:173174ms step_avg:157.29ms step:1112/1480 train_time:173345ms step_avg:157.30ms step:1113/1480 train_time:173523ms step_avg:157.32ms step:1114/1480 train_time:173698ms step_avg:157.33ms step:1115/1480 train_time:173870ms step_avg:157.35ms step:1116/1480 train_time:174037ms step_avg:157.36ms step:1117/1480 train_time:174212ms step_avg:157.37ms step:1118/1480 train_time:174386ms step_avg:157.39ms step:1119/1480 train_time:174553ms step_avg:157.40ms step:1120/1480 train_time:174720ms step_avg:157.41ms step:1121/1480 train_time:174892ms step_avg:157.42ms step:1122/1480 train_time:175058ms step_avg:157.43ms step:1123/1480 train_time:175225ms step_avg:157.43ms step:1124/1480 train_time:175392ms step_avg:157.44ms step:1125/1480 train_time:175559ms step_avg:157.45ms step:1125/1480 val_loss:3.3855 train_time:175627ms step_avg:157.51ms step:1126/1480 train_time:175728ms step_avg:157.46ms step:1127/1480 train_time:175897ms step_avg:157.47ms step:1128/1480 train_time:176069ms step_avg:157.49ms step:1129/1480 train_time:176244ms step_avg:157.50ms step:1130/1480 train_time:176413ms step_avg:157.51ms step:1131/1480 train_time:176592ms step_avg:157.53ms step:1132/1480 train_time:176756ms step_avg:157.54ms step:1133/1480 train_time:176929ms step_avg:157.55ms step:1134/1480 train_time:177099ms step_avg:157.56ms step:1135/1480 train_time:177267ms step_avg:157.57ms step:1136/1480 train_time:177436ms step_avg:157.58ms step:1137/1480 train_time:177607ms step_avg:157.59ms step:1138/1480 train_time:177779ms step_avg:157.61ms step:1139/1480 train_time:177948ms step_avg:157.62ms step:1140/1480 
train_time:178115ms step_avg:157.62ms step:1141/1480 train_time:178288ms step_avg:157.64ms step:1142/1480 train_time:178455ms step_avg:157.65ms step:1143/1480 train_time:178626ms step_avg:157.66ms step:1144/1480 train_time:178793ms step_avg:157.67ms step:1145/1480 train_time:178958ms step_avg:157.67ms step:1146/1480 train_time:179128ms step_avg:157.68ms step:1147/1480 train_time:179297ms step_avg:157.69ms step:1148/1480 train_time:179466ms step_avg:157.70ms step:1149/1480 train_time:179635ms step_avg:157.71ms step:1150/1480 train_time:179804ms step_avg:157.72ms step:1151/1480 train_time:179976ms step_avg:157.74ms step:1152/1480 train_time:180148ms step_avg:157.75ms step:1153/1480 train_time:180321ms step_avg:157.76ms step:1154/1480 train_time:180488ms step_avg:157.77ms step:1155/1480 train_time:180658ms step_avg:157.78ms step:1156/1480 train_time:180838ms step_avg:157.80ms step:1157/1480 train_time:181007ms step_avg:157.81ms step:1158/1480 train_time:181174ms step_avg:157.82ms step:1159/1480 train_time:181341ms step_avg:157.83ms step:1160/1480 train_time:181507ms step_avg:157.83ms step:1161/1480 train_time:181677ms step_avg:157.84ms step:1162/1480 train_time:181848ms step_avg:157.85ms step:1163/1480 train_time:182017ms step_avg:157.86ms step:1164/1480 train_time:182186ms step_avg:157.87ms step:1165/1480 train_time:182352ms step_avg:157.88ms step:1166/1480 train_time:182522ms step_avg:157.89ms step:1167/1480 train_time:182691ms step_avg:157.90ms step:1168/1480 train_time:182859ms step_avg:157.91ms step:1169/1480 train_time:183028ms step_avg:157.92ms step:1170/1480 train_time:183196ms step_avg:157.93ms step:1171/1480 train_time:183364ms step_avg:157.94ms step:1172/1480 train_time:183529ms step_avg:157.94ms step:1173/1480 train_time:183702ms step_avg:157.96ms step:1174/1480 train_time:183885ms step_avg:157.98ms step:1175/1480 train_time:184055ms step_avg:157.99ms step:1176/1480 train_time:184228ms step_avg:158.00ms step:1177/1480 train_time:184403ms step_avg:158.01ms 
step:1178/1480 train_time:184571ms step_avg:158.02ms step:1179/1480 train_time:184737ms step_avg:158.03ms step:1180/1480 train_time:184919ms step_avg:158.05ms step:1181/1480 train_time:185089ms step_avg:158.06ms step:1182/1480 train_time:185256ms step_avg:158.07ms step:1183/1480 train_time:185426ms step_avg:158.08ms step:1184/1480 train_time:185595ms step_avg:158.09ms step:1185/1480 train_time:185769ms step_avg:158.10ms step:1186/1480 train_time:185939ms step_avg:158.11ms step:1187/1480 train_time:186124ms step_avg:158.13ms step:1188/1480 train_time:186290ms step_avg:158.14ms step:1189/1480 train_time:186460ms step_avg:158.15ms step:1190/1480 train_time:186628ms step_avg:158.16ms step:1191/1480 train_time:186799ms step_avg:158.17ms step:1192/1480 train_time:186967ms step_avg:158.18ms step:1193/1480 train_time:187132ms step_avg:158.18ms step:1194/1480 train_time:187303ms step_avg:158.19ms step:1195/1480 train_time:187478ms step_avg:158.21ms step:1196/1480 train_time:187661ms step_avg:158.23ms step:1197/1480 train_time:187831ms step_avg:158.24ms step:1198/1480 train_time:188013ms step_avg:158.26ms step:1199/1480 train_time:188185ms step_avg:158.27ms step:1200/1480 train_time:188352ms step_avg:158.28ms step:1201/1480 train_time:188519ms step_avg:158.29ms step:1202/1480 train_time:188698ms step_avg:158.30ms step:1203/1480 train_time:188874ms step_avg:158.32ms step:1204/1480 train_time:189048ms step_avg:158.33ms step:1205/1480 train_time:189216ms step_avg:158.34ms step:1206/1480 train_time:189386ms step_avg:158.35ms step:1207/1480 train_time:189553ms step_avg:158.36ms step:1208/1480 train_time:189721ms step_avg:158.37ms step:1209/1480 train_time:189895ms step_avg:158.38ms step:1210/1480 train_time:190072ms step_avg:158.39ms step:1211/1480 train_time:190246ms step_avg:158.41ms step:1212/1480 train_time:190417ms step_avg:158.42ms step:1213/1480 train_time:190590ms step_avg:158.43ms step:1214/1480 train_time:190768ms step_avg:158.45ms step:1215/1480 train_time:190941ms 
step_avg:158.46ms step:1216/1480 train_time:191110ms step_avg:158.47ms step:1217/1480 train_time:191284ms step_avg:158.48ms step:1218/1480 train_time:191453ms step_avg:158.49ms step:1219/1480 train_time:191631ms step_avg:158.50ms step:1220/1480 train_time:191800ms step_avg:158.51ms step:1221/1480 train_time:191970ms step_avg:158.52ms step:1222/1480 train_time:192137ms step_avg:158.53ms step:1223/1480 train_time:192308ms step_avg:158.54ms step:1224/1480 train_time:192486ms step_avg:158.56ms step:1225/1480 train_time:192658ms step_avg:158.57ms step:1226/1480 train_time:192830ms step_avg:158.58ms step:1227/1480 train_time:193000ms step_avg:158.59ms step:1228/1480 train_time:193169ms step_avg:158.60ms step:1229/1480 train_time:193341ms step_avg:158.61ms step:1230/1480 train_time:193522ms step_avg:158.62ms step:1231/1480 train_time:193696ms step_avg:158.64ms step:1232/1480 train_time:193874ms step_avg:158.65ms step:1233/1480 train_time:194046ms step_avg:158.66ms step:1234/1480 train_time:194215ms step_avg:158.67ms step:1235/1480 train_time:194390ms step_avg:158.69ms step:1236/1480 train_time:194557ms step_avg:158.69ms step:1237/1480 train_time:194728ms step_avg:158.70ms step:1238/1480 train_time:194912ms step_avg:158.72ms step:1239/1480 train_time:195084ms step_avg:158.73ms step:1240/1480 train_time:195254ms step_avg:158.74ms step:1241/1480 train_time:195427ms step_avg:158.75ms step:1242/1480 train_time:195596ms step_avg:158.76ms step:1243/1480 train_time:195769ms step_avg:158.77ms step:1244/1480 train_time:195935ms step_avg:158.78ms step:1245/1480 train_time:196104ms step_avg:158.79ms step:1246/1480 train_time:196274ms step_avg:158.80ms step:1247/1480 train_time:196445ms step_avg:158.81ms step:1248/1480 train_time:196613ms step_avg:158.82ms step:1249/1480 train_time:196782ms step_avg:158.82ms step:1250/1480 train_time:196951ms step_avg:158.83ms step:1250/1480 val_loss:3.3356 train_time:197022ms step_avg:158.89ms step:1251/1480 train_time:197130ms step_avg:158.85ms 
step:1252/1480 train_time:197299ms step_avg:158.86ms step:1253/1480 train_time:197467ms step_avg:158.86ms step:1254/1480 train_time:197639ms step_avg:158.87ms step:1255/1480 train_time:197826ms step_avg:158.90ms step:1256/1480 train_time:197999ms step_avg:158.91ms step:1257/1480 train_time:198168ms step_avg:158.92ms step:1258/1480 train_time:198342ms step_avg:158.93ms step:1259/1480 train_time:198514ms step_avg:158.94ms step:1260/1480 train_time:198681ms step_avg:158.94ms step:1261/1480 train_time:198854ms step_avg:158.96ms step:1262/1480 train_time:199029ms step_avg:158.97ms step:1263/1480 train_time:199203ms step_avg:158.98ms step:1264/1480 train_time:199371ms step_avg:158.99ms step:1265/1480 train_time:199537ms step_avg:158.99ms step:1266/1480 train_time:199709ms step_avg:159.00ms step:1267/1480 train_time:199878ms step_avg:159.01ms step:1268/1480 train_time:200051ms step_avg:159.02ms step:1269/1480 train_time:200225ms step_avg:159.04ms step:1270/1480 train_time:200395ms step_avg:159.04ms step:1271/1480 train_time:200567ms step_avg:159.05ms step:1272/1480 train_time:200733ms step_avg:159.06ms step:1273/1480 train_time:200904ms step_avg:159.07ms step:1274/1480 train_time:201076ms step_avg:159.08ms step:1275/1480 train_time:201242ms step_avg:159.08ms step:1276/1480 train_time:201408ms step_avg:159.09ms step:1277/1480 train_time:201580ms step_avg:159.10ms step:1278/1480 train_time:201747ms step_avg:159.11ms step:1279/1480 train_time:201919ms step_avg:159.12ms step:1280/1480 train_time:202097ms step_avg:159.13ms step:1281/1480 train_time:202266ms step_avg:159.14ms step:1282/1480 train_time:202431ms step_avg:159.14ms step:1283/1480 train_time:202602ms step_avg:159.15ms step:1284/1480 train_time:202772ms step_avg:159.16ms step:1285/1480 train_time:202941ms step_avg:159.17ms step:1286/1480 train_time:203111ms step_avg:159.18ms step:1287/1480 train_time:203281ms step_avg:159.19ms step:1288/1480 train_time:203454ms step_avg:159.20ms step:1289/1480 train_time:203636ms 
step_avg:159.22ms step:1290/1480 train_time:203815ms step_avg:159.23ms step:1291/1480 train_time:203989ms step_avg:159.24ms step:1292/1480 train_time:204162ms step_avg:159.25ms step:1293/1480 train_time:204338ms step_avg:159.27ms step:1294/1480 train_time:204509ms step_avg:159.27ms step:1295/1480 train_time:204680ms step_avg:159.28ms step:1296/1480 train_time:204853ms step_avg:159.30ms step:1297/1480 train_time:205024ms step_avg:159.30ms step:1298/1480 train_time:205195ms step_avg:159.31ms step:1299/1480 train_time:205364ms step_avg:159.32ms step:1300/1480 train_time:205532ms step_avg:159.33ms step:1301/1480 train_time:205700ms step_avg:159.33ms step:1302/1480 train_time:205874ms step_avg:159.35ms step:1303/1480 train_time:206050ms step_avg:159.36ms step:1304/1480 train_time:206222ms step_avg:159.37ms step:1305/1480 train_time:206391ms step_avg:159.38ms step:1306/1480 train_time:206565ms step_avg:159.39ms step:1307/1480 train_time:206733ms step_avg:159.39ms step:1308/1480 train_time:206902ms step_avg:159.40ms step:1309/1480 train_time:207074ms step_avg:159.41ms step:1310/1480 train_time:207242ms step_avg:159.42ms step:1311/1480 train_time:207411ms step_avg:159.42ms step:1312/1480 train_time:207583ms step_avg:159.43ms step:1313/1480 train_time:207751ms step_avg:159.44ms step:1314/1480 train_time:207923ms step_avg:159.45ms step:1315/1480 train_time:208093ms step_avg:159.46ms step:1316/1480 train_time:208259ms step_avg:159.46ms step:1317/1480 train_time:208429ms step_avg:159.47ms step:1318/1480 train_time:208609ms step_avg:159.49ms step:1319/1480 train_time:208784ms step_avg:159.50ms step:1320/1480 train_time:208960ms step_avg:159.51ms step:1321/1480 train_time:209133ms step_avg:159.52ms step:1322/1480 train_time:209314ms step_avg:159.54ms step:1323/1480 train_time:209486ms step_avg:159.55ms step:1324/1480 train_time:209661ms step_avg:159.56ms step:1325/1480 train_time:209844ms step_avg:159.58ms step:1326/1480 train_time:210020ms step_avg:159.59ms step:1327/1480 
train_time:210189ms step_avg:159.60ms step:1328/1480 train_time:210359ms step_avg:159.60ms step:1329/1480 train_time:210555ms step_avg:159.63ms step:1330/1480 train_time:210735ms step_avg:159.65ms step:1331/1480 train_time:210905ms step_avg:159.66ms step:1332/1480 train_time:211079ms step_avg:159.67ms step:1333/1480 train_time:211254ms step_avg:159.68ms step:1334/1480 train_time:211425ms step_avg:159.69ms step:1335/1480 train_time:211595ms step_avg:159.69ms step:1336/1480 train_time:211777ms step_avg:159.71ms step:1337/1480 train_time:211952ms step_avg:159.72ms step:1338/1480 train_time:212124ms step_avg:159.73ms step:1339/1480 train_time:212299ms step_avg:159.74ms step:1340/1480 train_time:212471ms step_avg:159.75ms step:1341/1480 train_time:212638ms step_avg:159.76ms step:1342/1480 train_time:212813ms step_avg:159.77ms step:1343/1480 train_time:212983ms step_avg:159.78ms step:1344/1480 train_time:213156ms step_avg:159.79ms step:1345/1480 train_time:213335ms step_avg:159.80ms step:1346/1480 train_time:213504ms step_avg:159.81ms step:1347/1480 train_time:213675ms step_avg:159.82ms step:1348/1480 train_time:213845ms step_avg:159.82ms step:1349/1480 train_time:214015ms step_avg:159.83ms step:1350/1480 train_time:214191ms step_avg:159.84ms step:1351/1480 train_time:214361ms step_avg:159.85ms step:1352/1480 train_time:214532ms step_avg:159.86ms step:1353/1480 train_time:214708ms step_avg:159.87ms step:1354/1480 train_time:214877ms step_avg:159.88ms step:1355/1480 train_time:215045ms step_avg:159.88ms step:1356/1480 train_time:215218ms step_avg:159.89ms step:1357/1480 train_time:215392ms step_avg:159.91ms step:1358/1480 train_time:215564ms step_avg:159.91ms step:1359/1480 train_time:215737ms step_avg:159.92ms step:1360/1480 train_time:215914ms step_avg:159.94ms step:1361/1480 train_time:216090ms step_avg:159.95ms step:1362/1480 train_time:216267ms step_avg:159.96ms step:1363/1480 train_time:216447ms step_avg:159.98ms step:1364/1480 train_time:216615ms step_avg:159.98ms 
step:1365/1480 train_time:216783ms step_avg:159.99ms step:1366/1480 train_time:216956ms step_avg:160.00ms step:1367/1480 train_time:217128ms step_avg:160.01ms step:1368/1480 train_time:217302ms step_avg:160.02ms step:1369/1480 train_time:217484ms step_avg:160.03ms step:1370/1480 train_time:217660ms step_avg:160.04ms step:1371/1480 train_time:217832ms step_avg:160.05ms step:1372/1480 train_time:218010ms step_avg:160.07ms step:1373/1480 train_time:218178ms step_avg:160.07ms step:1374/1480 train_time:218354ms step_avg:160.08ms step:1375/1480 train_time:218524ms step_avg:160.09ms step:1375/1480 val_loss:3.2974 train_time:218591ms step_avg:160.14ms step:1376/1480 train_time:218698ms step_avg:160.10ms step:1377/1480 train_time:218870ms step_avg:160.11ms step:1378/1480 train_time:219039ms step_avg:160.12ms step:1379/1480 train_time:219213ms step_avg:160.13ms step:1380/1480 train_time:219387ms step_avg:160.14ms step:1381/1480 train_time:219568ms step_avg:160.15ms step:1382/1480 train_time:219738ms step_avg:160.16ms step:1383/1480 train_time:219909ms step_avg:160.17ms step:1384/1480 train_time:220086ms step_avg:160.18ms step:1385/1480 train_time:220251ms step_avg:160.18ms step:1386/1480 train_time:220422ms step_avg:160.19ms step:1387/1480 train_time:220592ms step_avg:160.20ms step:1388/1480 train_time:220761ms step_avg:160.20ms step:1389/1480 train_time:220932ms step_avg:160.21ms step:1390/1480 train_time:221101ms step_avg:160.22ms step:1391/1480 train_time:221270ms step_avg:160.22ms step:1392/1480 train_time:221444ms step_avg:160.23ms step:1393/1480 train_time:221614ms step_avg:160.24ms step:1394/1480 train_time:221784ms step_avg:160.25ms step:1395/1480 train_time:221953ms step_avg:160.25ms step:1396/1480 train_time:222124ms step_avg:160.26ms step:1397/1480 train_time:222291ms step_avg:160.27ms step:1398/1480 train_time:222459ms step_avg:160.27ms step:1399/1480 train_time:222630ms step_avg:160.28ms step:1400/1480 train_time:222808ms step_avg:160.29ms step:1401/1480 
train_time:222975ms step_avg:160.30ms step:1402/1480 train_time:223146ms step_avg:160.31ms step:1403/1480 train_time:223325ms step_avg:160.32ms step:1404/1480 train_time:223496ms step_avg:160.33ms step:1405/1480 train_time:223670ms step_avg:160.34ms step:1406/1480 train_time:223845ms step_avg:160.35ms step:1407/1480 train_time:224012ms step_avg:160.35ms step:1408/1480 train_time:224179ms step_avg:160.36ms step:1409/1480 train_time:224363ms step_avg:160.37ms step:1410/1480 train_time:224530ms step_avg:160.38ms step:1411/1480 train_time:224699ms step_avg:160.38ms step:1412/1480 train_time:224868ms step_avg:160.39ms step:1413/1480 train_time:225039ms step_avg:160.40ms step:1414/1480 train_time:225210ms step_avg:160.41ms step:1415/1480 train_time:225386ms step_avg:160.42ms step:1416/1480 train_time:225571ms step_avg:160.43ms step:1417/1480 train_time:225746ms step_avg:160.45ms step:1418/1480 train_time:225920ms step_avg:160.45ms step:1419/1480 train_time:226093ms step_avg:160.46ms step:1420/1480 train_time:226266ms step_avg:160.47ms step:1421/1480 train_time:226440ms step_avg:160.48ms step:1422/1480 train_time:226612ms step_avg:160.49ms step:1423/1480 train_time:226781ms step_avg:160.50ms step:1424/1480 train_time:226956ms step_avg:160.51ms step:1425/1480 train_time:227135ms step_avg:160.52ms step:1426/1480 train_time:227307ms step_avg:160.53ms step:1427/1480 train_time:227483ms step_avg:160.54ms step:1428/1480 train_time:227654ms step_avg:160.55ms step:1429/1480 train_time:227823ms step_avg:160.55ms step:1430/1480 train_time:227996ms step_avg:160.56ms step:1431/1480 train_time:228169ms step_avg:160.57ms step:1432/1480 train_time:228347ms step_avg:160.58ms step:1433/1480 train_time:228526ms step_avg:160.59ms step:1434/1480 train_time:228706ms step_avg:160.61ms step:1435/1480 train_time:228882ms step_avg:160.62ms step:1436/1480 train_time:229053ms step_avg:160.63ms step:1437/1480 train_time:229225ms step_avg:160.63ms step:1438/1480 train_time:229394ms step_avg:160.64ms 
step:1439/1480 train_time:229567ms step_avg:160.65ms step:1440/1480 train_time:229736ms step_avg:160.65ms step:1441/1480 train_time:229907ms step_avg:160.66ms step:1442/1480 train_time:230084ms step_avg:160.67ms step:1443/1480 train_time:230272ms step_avg:160.69ms step:1444/1480 train_time:230443ms step_avg:160.70ms step:1445/1480 train_time:230613ms step_avg:160.71ms step:1446/1480 train_time:230788ms step_avg:160.72ms step:1447/1480 train_time:230965ms step_avg:160.73ms step:1448/1480 train_time:231135ms step_avg:160.73ms step:1449/1480 train_time:231309ms step_avg:160.74ms step:1450/1480 train_time:231484ms step_avg:160.75ms step:1451/1480 train_time:231653ms step_avg:160.76ms step:1452/1480 train_time:231829ms step_avg:160.77ms step:1453/1480 train_time:231998ms step_avg:160.77ms step:1454/1480 train_time:232169ms step_avg:160.78ms step:1455/1480 train_time:232350ms step_avg:160.80ms step:1456/1480 train_time:232522ms step_avg:160.80ms step:1457/1480 train_time:232692ms step_avg:160.81ms step:1458/1480 train_time:232865ms step_avg:160.82ms step:1459/1480 train_time:233041ms step_avg:160.83ms step:1460/1480 train_time:233211ms step_avg:160.84ms step:1461/1480 train_time:233387ms step_avg:160.85ms step:1462/1480 train_time:233557ms step_avg:160.85ms step:1463/1480 train_time:233733ms step_avg:160.86ms step:1464/1480 train_time:233907ms step_avg:160.87ms step:1465/1480 train_time:234080ms step_avg:160.88ms step:1466/1480 train_time:234250ms step_avg:160.89ms step:1467/1480 train_time:234425ms step_avg:160.90ms step:1468/1480 train_time:234595ms step_avg:160.90ms step:1469/1480 train_time:234767ms step_avg:160.91ms step:1470/1480 train_time:234947ms step_avg:160.92ms step:1471/1480 train_time:235132ms step_avg:160.94ms step:1472/1480 train_time:235312ms step_avg:160.95ms step:1473/1480 train_time:235484ms step_avg:160.96ms step:1474/1480 train_time:235660ms step_avg:160.97ms step:1475/1480 train_time:235838ms step_avg:160.98ms step:1476/1480 train_time:236010ms 
step_avg:160.99ms step:1477/1480 train_time:236193ms step_avg:161.00ms step:1478/1480 train_time:236375ms step_avg:161.02ms step:1479/1480 train_time:236548ms step_avg:161.03ms step:1480/1480 train_time:236721ms step_avg:161.04ms step:1480/1480 val_loss:3.2788 train_time:236792ms step_avg:161.08ms