import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable scale is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding.

    cos/sin tables are cached and rebuilt only when the sequence length (x.shape[1])
    differs from the cached one; the cache is not keyed on device or dtype.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of channels: base ** (-(2i) / dim)
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as 4D with the sequence on dim 1 and the rotated channels on dim 3;
        # the caller in this file passes (B, T, n_head, head_dim)
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # the two halves of the last dim form the (x1, x2) pairs that are rotated
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention using FlexAttention, with QK-norm, rotary embeddings and a
    learned mix of the layer's own values with an externally supplied value embedding `vi`.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """Return the attention output for x; `vi` must be viewable to the shape of v."""
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head dim moved in front of the sequence dim
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: a learned mix of the running activation with the initial
    embedding x0, followed by pre-norm attention and pre-norm MLP residual branches.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to [1, 0]: at init the block sees x unchanged and ignores x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Architecture hyperparameters consumed by GPT and Block."""
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """
    GPT model with a U-net layout: the first half of the blocks ("encoder") store their
    outputs, which are added back with learned weights in the second half ("decoder").
    forward() returns the cross-entropy loss directly.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the training loss for one flat token sequence.

        Arguments:
            idx: 1D tensor of input token ids (asserted). It is reshaped into rows of
                BLOCK_SIZE=128, so its length must be a multiple of 128.
            target: Target token ids; flattened for the cross-entropy.
            sliding_window: Tensor holding the attention window size, in tokens.

        Returns the scalar cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # document index of every token: running count of token id 50256 seen so far
        docs = (idx == 50256).cumsum(0)
        # document index of the first / last token of every 128-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal, same document, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the mask above, built by hand on the "cuda" device
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can interact only if their document-index ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window size converted from tokens to blocks, rounding up
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the indices of the kept kv blocks first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding chunk per encoder layer; decoder layers reuse them in reverse order
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        # soft-cap the logits to the range (-lm_head_softcap, lm_head_softcap)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens from a shard into a pinned-memory tensor and return it."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the 256 x int32 header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (input, target) batches of T tokens from a set of shard files.

    Each rank starts at offset process_rank * T in the current shard and advances by
    T * num_processes per batch, so ranks read disjoint, interleaved windows.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full global batch plus one token for the targets
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (x, y) on "cuda": T int32 inputs and the T int64 targets shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Run configuration; the script instantiates it with the defaults below."""
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
# (RANK, LOCAL_RANK and WORLD_SIZE are all read from the environment below)
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    # The log file lives directly under logs/, which nothing else creates; without
    # this the open() below raises FileNotFoundError on a fresh checkout.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append `s` and a newline to the log file, on the master process only.

    Arguments:
        s: The string to log.
        logonly: If False (default), `s` is also printed to stdout.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop below runs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches the next one after each backward
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 08:54:42 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 44C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 75W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 38C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 38C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 107W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 86W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 123W / 700W | 45MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type 
Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23329ms step_avg:nanms step:2/1480 train_time:23415ms step_avg:nanms step:3/1480 train_time:23556ms step_avg:nanms step:4/1480 train_time:23699ms step_avg:nanms step:5/1480 train_time:23840ms step_avg:nanms step:6/1480 train_time:23980ms step_avg:nanms step:7/1480 train_time:24122ms step_avg:nanms step:8/1480 train_time:24263ms step_avg:nanms step:9/1480 train_time:24406ms step_avg:nanms step:10/1480 train_time:24547ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:287ms step_avg:nanms step:13/1480 train_time:429ms step_avg:142.96ms step:14/1480 train_time:571ms step_avg:142.71ms step:15/1480 train_time:713ms step_avg:142.57ms step:16/1480 train_time:857ms step_avg:142.82ms step:17/1480 train_time:998ms step_avg:142.58ms step:18/1480 train_time:1139ms step_avg:142.44ms step:19/1480 train_time:1283ms step_avg:142.53ms step:20/1480 train_time:1427ms step_avg:142.70ms step:21/1480 train_time:1571ms step_avg:142.80ms step:22/1480 train_time:1714ms step_avg:142.81ms step:23/1480 train_time:1857ms step_avg:142.85ms step:24/1480 train_time:1998ms step_avg:142.73ms step:25/1480 train_time:2142ms step_avg:142.79ms step:26/1480 train_time:2285ms step_avg:142.84ms step:27/1480 train_time:2429ms step_avg:142.87ms step:28/1480 train_time:2572ms step_avg:142.88ms step:29/1480 train_time:2715ms 
step_avg:142.87ms step:30/1480 train_time:2858ms step_avg:142.88ms step:31/1480 train_time:2999ms step_avg:142.82ms step:32/1480 train_time:3142ms step_avg:142.81ms step:33/1480 train_time:3287ms step_avg:142.91ms step:34/1480 train_time:3431ms step_avg:142.97ms step:35/1480 train_time:3573ms step_avg:142.93ms step:36/1480 train_time:3717ms step_avg:142.95ms step:37/1480 train_time:3858ms step_avg:142.89ms step:38/1480 train_time:4000ms step_avg:142.86ms step:39/1480 train_time:4142ms step_avg:142.82ms step:40/1480 train_time:4287ms step_avg:142.90ms step:41/1480 train_time:4432ms step_avg:142.98ms step:42/1480 train_time:4576ms step_avg:142.99ms step:43/1480 train_time:4717ms step_avg:142.95ms step:44/1480 train_time:4860ms step_avg:142.94ms step:45/1480 train_time:5003ms step_avg:142.94ms step:46/1480 train_time:5145ms step_avg:142.91ms step:47/1480 train_time:5289ms step_avg:142.94ms step:48/1480 train_time:5433ms step_avg:142.97ms step:49/1480 train_time:5575ms step_avg:142.95ms step:50/1480 train_time:5717ms step_avg:142.92ms step:51/1480 train_time:5859ms step_avg:142.90ms step:52/1480 train_time:6001ms step_avg:142.88ms step:53/1480 train_time:6146ms step_avg:142.92ms step:54/1480 train_time:6289ms step_avg:142.93ms step:55/1480 train_time:6433ms step_avg:142.96ms step:56/1480 train_time:6576ms step_avg:142.96ms step:57/1480 train_time:6718ms step_avg:142.94ms step:58/1480 train_time:6861ms step_avg:142.94ms step:59/1480 train_time:7003ms step_avg:142.92ms step:60/1480 train_time:7148ms step_avg:142.97ms step:61/1480 train_time:7293ms step_avg:143.01ms step:62/1480 train_time:7435ms step_avg:142.99ms step:63/1480 train_time:7578ms step_avg:142.97ms step:64/1480 train_time:7720ms step_avg:142.96ms step:65/1480 train_time:7865ms step_avg:143.00ms step:66/1480 train_time:8007ms step_avg:142.99ms step:67/1480 train_time:8150ms step_avg:142.98ms step:68/1480 train_time:8293ms step_avg:142.98ms step:69/1480 train_time:8434ms step_avg:142.95ms step:70/1480 
train_time:8576ms step_avg:142.93ms step:71/1480 train_time:8720ms step_avg:142.95ms step:72/1480 train_time:8865ms step_avg:142.98ms step:73/1480 train_time:9008ms step_avg:142.98ms step:74/1480 train_time:9152ms step_avg:142.99ms step:75/1480 train_time:9295ms step_avg:143.00ms step:76/1480 train_time:9438ms step_avg:142.99ms step:77/1480 train_time:9580ms step_avg:142.98ms step:78/1480 train_time:9722ms step_avg:142.96ms step:79/1480 train_time:9865ms step_avg:142.97ms step:80/1480 train_time:10010ms step_avg:143.00ms step:81/1480 train_time:10154ms step_avg:143.02ms step:82/1480 train_time:10296ms step_avg:143.00ms step:83/1480 train_time:10437ms step_avg:142.97ms step:84/1480 train_time:10581ms step_avg:142.99ms step:85/1480 train_time:10724ms step_avg:142.98ms step:86/1480 train_time:10868ms step_avg:143.00ms step:87/1480 train_time:11011ms step_avg:143.00ms step:88/1480 train_time:11154ms step_avg:143.00ms step:89/1480 train_time:11295ms step_avg:142.98ms step:90/1480 train_time:11436ms step_avg:142.95ms step:91/1480 train_time:11578ms step_avg:142.94ms step:92/1480 train_time:11722ms step_avg:142.95ms step:93/1480 train_time:11867ms step_avg:142.97ms step:94/1480 train_time:12011ms step_avg:142.99ms step:95/1480 train_time:12155ms step_avg:142.99ms step:96/1480 train_time:12296ms step_avg:142.98ms step:97/1480 train_time:12437ms step_avg:142.96ms step:98/1480 train_time:12579ms step_avg:142.94ms step:99/1480 train_time:12721ms step_avg:142.94ms step:100/1480 train_time:12866ms step_avg:142.96ms step:101/1480 train_time:13011ms step_avg:142.97ms step:102/1480 train_time:13155ms step_avg:142.99ms step:103/1480 train_time:13297ms step_avg:142.98ms step:104/1480 train_time:13438ms step_avg:142.95ms step:105/1480 train_time:13580ms step_avg:142.95ms step:106/1480 train_time:13724ms step_avg:142.96ms step:107/1480 train_time:13869ms step_avg:142.98ms step:108/1480 train_time:14012ms step_avg:142.98ms step:109/1480 train_time:14155ms step_avg:142.98ms 
step:110/1480 train_time:14296ms step_avg:142.96ms step:111/1480 train_time:14441ms step_avg:142.98ms step:112/1480 train_time:14590ms step_avg:143.04ms step:113/1480 train_time:14736ms step_avg:143.07ms step:114/1480 train_time:14884ms step_avg:143.11ms step:115/1480 train_time:15031ms step_avg:143.15ms step:116/1480 train_time:15179ms step_avg:143.20ms step:117/1480 train_time:15328ms step_avg:143.25ms step:118/1480 train_time:15476ms step_avg:143.29ms step:119/1480 train_time:15621ms step_avg:143.31ms step:120/1480 train_time:15768ms step_avg:143.34ms step:121/1480 train_time:15915ms step_avg:143.38ms step:122/1480 train_time:16062ms step_avg:143.41ms step:123/1480 train_time:16209ms step_avg:143.44ms step:124/1480 train_time:16357ms step_avg:143.48ms step:125/1480 train_time:16501ms step_avg:143.49ms step:125/1480 val_loss:4.4155 train_time:16559ms step_avg:143.99ms step:126/1480 train_time:16654ms step_avg:143.57ms step:127/1480 train_time:16803ms step_avg:143.62ms step:128/1480 train_time:16949ms step_avg:143.64ms step:129/1480 train_time:17095ms step_avg:143.65ms step:130/1480 train_time:17241ms step_avg:143.67ms step:131/1480 train_time:17388ms step_avg:143.70ms step:132/1480 train_time:17534ms step_avg:143.72ms step:133/1480 train_time:17682ms step_avg:143.76ms step:134/1480 train_time:17830ms step_avg:143.79ms step:135/1480 train_time:17977ms step_avg:143.82ms step:136/1480 train_time:18125ms step_avg:143.85ms step:137/1480 train_time:18271ms step_avg:143.87ms step:138/1480 train_time:18419ms step_avg:143.90ms step:139/1480 train_time:18566ms step_avg:143.92ms step:140/1480 train_time:18713ms step_avg:143.95ms step:141/1480 train_time:18863ms step_avg:143.99ms step:142/1480 train_time:19010ms step_avg:144.01ms step:143/1480 train_time:19156ms step_avg:144.03ms step:144/1480 train_time:19303ms step_avg:144.05ms step:145/1480 train_time:19449ms step_avg:144.07ms step:146/1480 train_time:19595ms step_avg:144.08ms step:147/1480 train_time:19742ms 
step_avg:144.10ms step:148/1480 train_time:19890ms step_avg:144.13ms step:149/1480 train_time:20037ms step_avg:144.15ms step:150/1480 train_time:20185ms step_avg:144.18ms step:151/1480 train_time:20331ms step_avg:144.19ms step:152/1480 train_time:20477ms step_avg:144.20ms step:153/1480 train_time:20624ms step_avg:144.23ms step:154/1480 train_time:20770ms step_avg:144.24ms step:155/1480 train_time:20916ms step_avg:144.25ms step:156/1480 train_time:21065ms step_avg:144.28ms step:157/1480 train_time:21212ms step_avg:144.30ms step:158/1480 train_time:21359ms step_avg:144.32ms step:159/1480 train_time:21506ms step_avg:144.34ms step:160/1480 train_time:21652ms step_avg:144.35ms step:161/1480 train_time:21799ms step_avg:144.37ms step:162/1480 train_time:21945ms step_avg:144.38ms step:163/1480 train_time:22092ms step_avg:144.39ms step:164/1480 train_time:22239ms step_avg:144.41ms step:165/1480 train_time:22388ms step_avg:144.44ms step:166/1480 train_time:22533ms step_avg:144.44ms step:167/1480 train_time:22681ms step_avg:144.46ms step:168/1480 train_time:22828ms step_avg:144.48ms step:169/1480 train_time:22974ms step_avg:144.49ms step:170/1480 train_time:23124ms step_avg:144.53ms step:171/1480 train_time:23270ms step_avg:144.54ms step:172/1480 train_time:23418ms step_avg:144.55ms step:173/1480 train_time:23565ms step_avg:144.57ms step:174/1480 train_time:23712ms step_avg:144.58ms step:175/1480 train_time:23858ms step_avg:144.60ms step:176/1480 train_time:24006ms step_avg:144.61ms step:177/1480 train_time:24152ms step_avg:144.62ms step:178/1480 train_time:24299ms step_avg:144.64ms step:179/1480 train_time:24446ms step_avg:144.65ms step:180/1480 train_time:24591ms step_avg:144.65ms step:181/1480 train_time:24738ms step_avg:144.66ms step:182/1480 train_time:24886ms step_avg:144.69ms step:183/1480 train_time:25032ms step_avg:144.69ms step:184/1480 train_time:25178ms step_avg:144.70ms step:185/1480 train_time:25326ms step_avg:144.72ms step:186/1480 train_time:25473ms 
step_avg:144.73ms step:187/1480 train_time:25619ms step_avg:144.74ms step:188/1480 train_time:25766ms step_avg:144.75ms step:189/1480 train_time:25915ms step_avg:144.78ms step:190/1480 train_time:26062ms step_avg:144.79ms step:191/1480 train_time:26208ms step_avg:144.80ms step:192/1480 train_time:26354ms step_avg:144.80ms step:193/1480 train_time:26501ms step_avg:144.82ms step:194/1480 train_time:26648ms step_avg:144.83ms step:195/1480 train_time:26794ms step_avg:144.84ms step:196/1480 train_time:26941ms step_avg:144.84ms step:197/1480 train_time:27088ms step_avg:144.86ms step:198/1480 train_time:27233ms step_avg:144.86ms step:199/1480 train_time:27382ms step_avg:144.88ms step:200/1480 train_time:27530ms step_avg:144.89ms step:201/1480 train_time:27675ms step_avg:144.89ms step:202/1480 train_time:27822ms step_avg:144.91ms step:203/1480 train_time:27970ms step_avg:144.92ms step:204/1480 train_time:28118ms step_avg:144.94ms step:205/1480 train_time:28266ms step_avg:144.95ms step:206/1480 train_time:28412ms step_avg:144.96ms step:207/1480 train_time:28559ms step_avg:144.97ms step:208/1480 train_time:28706ms step_avg:144.98ms step:209/1480 train_time:28852ms step_avg:144.98ms step:210/1480 train_time:28998ms step_avg:144.99ms step:211/1480 train_time:29146ms step_avg:145.00ms step:212/1480 train_time:29292ms step_avg:145.01ms step:213/1480 train_time:29439ms step_avg:145.02ms step:214/1480 train_time:29587ms step_avg:145.04ms step:215/1480 train_time:29734ms step_avg:145.04ms step:216/1480 train_time:29881ms step_avg:145.06ms step:217/1480 train_time:30028ms step_avg:145.06ms step:218/1480 train_time:30174ms step_avg:145.07ms step:219/1480 train_time:30322ms step_avg:145.08ms step:220/1480 train_time:30469ms step_avg:145.09ms step:221/1480 train_time:30616ms step_avg:145.10ms step:222/1480 train_time:30766ms step_avg:145.12ms step:223/1480 train_time:30918ms step_avg:145.16ms step:224/1480 train_time:31070ms step_avg:145.19ms step:225/1480 train_time:31220ms 
step_avg:145.21ms step:226/1480 train_time:31371ms step_avg:145.23ms step:227/1480 train_time:31521ms step_avg:145.26ms step:228/1480 train_time:31672ms step_avg:145.28ms step:229/1480 train_time:31824ms step_avg:145.31ms step:230/1480 train_time:31974ms step_avg:145.34ms step:231/1480 train_time:32124ms step_avg:145.36ms step:232/1480 train_time:32275ms step_avg:145.38ms step:233/1480 train_time:32427ms step_avg:145.41ms step:234/1480 train_time:32577ms step_avg:145.43ms step:235/1480 train_time:32728ms step_avg:145.46ms step:236/1480 train_time:32879ms step_avg:145.48ms step:237/1480 train_time:33030ms step_avg:145.51ms step:238/1480 train_time:33181ms step_avg:145.53ms step:239/1480 train_time:33331ms step_avg:145.55ms step:240/1480 train_time:33482ms step_avg:145.57ms step:241/1480 train_time:33632ms step_avg:145.59ms step:242/1480 train_time:33784ms step_avg:145.62ms step:243/1480 train_time:33934ms step_avg:145.64ms step:244/1480 train_time:34086ms step_avg:145.67ms step:245/1480 train_time:34235ms step_avg:145.68ms step:246/1480 train_time:34385ms step_avg:145.70ms step:247/1480 train_time:34534ms step_avg:145.72ms step:248/1480 train_time:34686ms step_avg:145.74ms step:249/1480 train_time:34836ms step_avg:145.76ms step:250/1480 train_time:34987ms step_avg:145.78ms step:250/1480 val_loss:3.9966 train_time:35045ms step_avg:146.02ms step:251/1480 train_time:35143ms step_avg:145.82ms step:252/1480 train_time:35294ms step_avg:145.84ms step:253/1480 train_time:35445ms step_avg:145.86ms step:254/1480 train_time:35594ms step_avg:145.88ms step:255/1480 train_time:35744ms step_avg:145.90ms step:256/1480 train_time:35894ms step_avg:145.91ms step:257/1480 train_time:36044ms step_avg:145.93ms step:258/1480 train_time:36197ms step_avg:145.95ms step:259/1480 train_time:36348ms step_avg:145.97ms step:260/1480 train_time:36499ms step_avg:146.00ms step:261/1480 train_time:36648ms step_avg:146.01ms step:262/1480 train_time:36799ms step_avg:146.03ms step:263/1480 
train_time:36949ms step_avg:146.04ms step:264/1480 train_time:37100ms step_avg:146.06ms step:265/1480 train_time:37250ms step_avg:146.08ms step:266/1480 train_time:37401ms step_avg:146.10ms step:267/1480 train_time:37551ms step_avg:146.11ms step:268/1480 train_time:37702ms step_avg:146.13ms step:269/1480 train_time:37851ms step_avg:146.14ms step:270/1480 train_time:38002ms step_avg:146.16ms step:271/1480 train_time:38151ms step_avg:146.17ms step:272/1480 train_time:38303ms step_avg:146.19ms step:273/1480 train_time:38452ms step_avg:146.21ms step:274/1480 train_time:38603ms step_avg:146.22ms step:275/1480 train_time:38752ms step_avg:146.24ms step:276/1480 train_time:38903ms step_avg:146.25ms step:277/1480 train_time:39052ms step_avg:146.26ms step:278/1480 train_time:39203ms step_avg:146.28ms step:279/1480 train_time:39353ms step_avg:146.29ms step:280/1480 train_time:39505ms step_avg:146.31ms step:281/1480 train_time:39655ms step_avg:146.33ms step:282/1480 train_time:39806ms step_avg:146.34ms step:283/1480 train_time:39956ms step_avg:146.36ms step:284/1480 train_time:40106ms step_avg:146.37ms step:285/1480 train_time:40257ms step_avg:146.39ms step:286/1480 train_time:40409ms step_avg:146.41ms step:287/1480 train_time:40560ms step_avg:146.42ms step:288/1480 train_time:40709ms step_avg:146.43ms step:289/1480 train_time:40860ms step_avg:146.45ms step:290/1480 train_time:41010ms step_avg:146.46ms step:291/1480 train_time:41161ms step_avg:146.48ms step:292/1480 train_time:41311ms step_avg:146.49ms step:293/1480 train_time:41463ms step_avg:146.51ms step:294/1480 train_time:41614ms step_avg:146.53ms step:295/1480 train_time:41764ms step_avg:146.54ms step:296/1480 train_time:41914ms step_avg:146.55ms step:297/1480 train_time:42065ms step_avg:146.57ms step:298/1480 train_time:42216ms step_avg:146.58ms step:299/1480 train_time:42366ms step_avg:146.59ms step:300/1480 train_time:42518ms step_avg:146.61ms step:301/1480 train_time:42668ms step_avg:146.62ms step:302/1480 
train_time:42819ms step_avg:146.64ms step:303/1480 train_time:42968ms step_avg:146.65ms step:304/1480 train_time:43119ms step_avg:146.66ms step:305/1480 train_time:43269ms step_avg:146.68ms step:306/1480 train_time:43421ms step_avg:146.69ms step:307/1480 train_time:43572ms step_avg:146.71ms step:308/1480 train_time:43723ms step_avg:146.72ms step:309/1480 train_time:43872ms step_avg:146.73ms step:310/1480 train_time:44023ms step_avg:146.74ms step:311/1480 train_time:44173ms step_avg:146.75ms step:312/1480 train_time:44324ms step_avg:146.77ms step:313/1480 train_time:44475ms step_avg:146.78ms step:314/1480 train_time:44625ms step_avg:146.79ms step:315/1480 train_time:44775ms step_avg:146.80ms step:316/1480 train_time:44926ms step_avg:146.82ms step:317/1480 train_time:45077ms step_avg:146.83ms step:318/1480 train_time:45227ms step_avg:146.84ms step:319/1480 train_time:45378ms step_avg:146.86ms step:320/1480 train_time:45530ms step_avg:146.87ms step:321/1480 train_time:45680ms step_avg:146.88ms step:322/1480 train_time:45830ms step_avg:146.89ms step:323/1480 train_time:45980ms step_avg:146.90ms step:324/1480 train_time:46130ms step_avg:146.91ms step:325/1480 train_time:46280ms step_avg:146.92ms step:326/1480 train_time:46431ms step_avg:146.93ms step:327/1480 train_time:46582ms step_avg:146.95ms step:328/1480 train_time:46733ms step_avg:146.96ms step:329/1480 train_time:46884ms step_avg:146.97ms step:330/1480 train_time:47037ms step_avg:146.99ms step:331/1480 train_time:47190ms step_avg:147.01ms step:332/1480 train_time:47343ms step_avg:147.03ms step:333/1480 train_time:47497ms step_avg:147.05ms step:334/1480 train_time:47651ms step_avg:147.07ms step:335/1480 train_time:47805ms step_avg:147.09ms step:336/1480 train_time:47959ms step_avg:147.11ms step:337/1480 train_time:48114ms step_avg:147.14ms step:338/1480 train_time:48268ms step_avg:147.16ms step:339/1480 train_time:48422ms step_avg:147.18ms step:340/1480 train_time:48576ms step_avg:147.20ms step:341/1480 
train_time:48729ms step_avg:147.22ms step:342/1480 train_time:48882ms step_avg:147.23ms step:343/1480 train_time:49037ms step_avg:147.26ms step:344/1480 train_time:49192ms step_avg:147.28ms step:345/1480 train_time:49346ms step_avg:147.30ms step:346/1480 train_time:49501ms step_avg:147.32ms step:347/1480 train_time:49655ms step_avg:147.34ms step:348/1480 train_time:49808ms step_avg:147.36ms step:349/1480 train_time:49962ms step_avg:147.38ms step:350/1480 train_time:50116ms step_avg:147.40ms step:351/1480 train_time:50270ms step_avg:147.42ms step:352/1480 train_time:50425ms step_avg:147.44ms step:353/1480 train_time:50580ms step_avg:147.46ms step:354/1480 train_time:50734ms step_avg:147.48ms step:355/1480 train_time:50887ms step_avg:147.50ms step:356/1480 train_time:51041ms step_avg:147.52ms step:357/1480 train_time:51195ms step_avg:147.54ms step:358/1480 train_time:51349ms step_avg:147.55ms step:359/1480 train_time:51503ms step_avg:147.57ms step:360/1480 train_time:51659ms step_avg:147.60ms step:361/1480 train_time:51814ms step_avg:147.62ms step:362/1480 train_time:51968ms step_avg:147.64ms step:363/1480 train_time:52121ms step_avg:147.65ms step:364/1480 train_time:52275ms step_avg:147.67ms step:365/1480 train_time:52429ms step_avg:147.69ms step:366/1480 train_time:52582ms step_avg:147.70ms step:367/1480 train_time:52737ms step_avg:147.72ms step:368/1480 train_time:52891ms step_avg:147.74ms step:369/1480 train_time:53044ms step_avg:147.75ms step:370/1480 train_time:53197ms step_avg:147.77ms step:371/1480 train_time:53351ms step_avg:147.79ms step:372/1480 train_time:53505ms step_avg:147.81ms step:373/1480 train_time:53659ms step_avg:147.82ms step:374/1480 train_time:53812ms step_avg:147.84ms step:375/1480 train_time:53966ms step_avg:147.85ms step:375/1480 val_loss:3.8075 train_time:54025ms step_avg:148.01ms step:376/1480 train_time:54121ms step_avg:147.87ms step:377/1480 train_time:54277ms step_avg:147.89ms step:378/1480 train_time:54430ms step_avg:147.91ms 
step:379/1480 train_time:54583ms step_avg:147.92ms step:380/1480 train_time:54736ms step_avg:147.93ms step:381/1480 train_time:54887ms step_avg:147.94ms step:382/1480 train_time:55042ms step_avg:147.96ms step:383/1480 train_time:55198ms step_avg:147.98ms step:384/1480 train_time:55353ms step_avg:148.00ms step:385/1480 train_time:55507ms step_avg:148.02ms step:386/1480 train_time:55660ms step_avg:148.03ms step:387/1480 train_time:55813ms step_avg:148.05ms step:388/1480 train_time:55967ms step_avg:148.06ms step:389/1480 train_time:56120ms step_avg:148.07ms step:390/1480 train_time:56274ms step_avg:148.09ms step:391/1480 train_time:56429ms step_avg:148.11ms step:392/1480 train_time:56581ms step_avg:148.12ms step:393/1480 train_time:56737ms step_avg:148.14ms step:394/1480 train_time:56891ms step_avg:148.15ms step:395/1480 train_time:57044ms step_avg:148.17ms step:396/1480 train_time:57198ms step_avg:148.18ms step:397/1480 train_time:57353ms step_avg:148.20ms step:398/1480 train_time:57508ms step_avg:148.22ms step:399/1480 train_time:57661ms step_avg:148.23ms step:400/1480 train_time:57815ms step_avg:148.24ms step:401/1480 train_time:57970ms step_avg:148.26ms step:402/1480 train_time:58124ms step_avg:148.27ms step:403/1480 train_time:58277ms step_avg:148.29ms step:404/1480 train_time:58433ms step_avg:148.31ms step:405/1480 train_time:58587ms step_avg:148.32ms step:406/1480 train_time:58740ms step_avg:148.33ms step:407/1480 train_time:58894ms step_avg:148.35ms step:408/1480 train_time:59048ms step_avg:148.36ms step:409/1480 train_time:59202ms step_avg:148.38ms step:410/1480 train_time:59356ms step_avg:148.39ms step:411/1480 train_time:59511ms step_avg:148.41ms step:412/1480 train_time:59664ms step_avg:148.42ms step:413/1480 train_time:59818ms step_avg:148.43ms step:414/1480 train_time:59973ms step_avg:148.45ms step:415/1480 train_time:60127ms step_avg:148.46ms step:416/1480 train_time:60280ms step_avg:148.47ms step:417/1480 train_time:60435ms step_avg:148.49ms 
step:418/1480 train_time:60589ms step_avg:148.50ms step:419/1480 train_time:60743ms step_avg:148.52ms step:420/1480 train_time:60896ms step_avg:148.53ms step:421/1480 train_time:61050ms step_avg:148.54ms step:422/1480 train_time:61205ms step_avg:148.56ms step:423/1480 train_time:61359ms step_avg:148.57ms step:424/1480 train_time:61514ms step_avg:148.58ms step:425/1480 train_time:61668ms step_avg:148.60ms step:426/1480 train_time:61822ms step_avg:148.61ms step:427/1480 train_time:61976ms step_avg:148.62ms step:428/1480 train_time:62129ms step_avg:148.63ms step:429/1480 train_time:62283ms step_avg:148.65ms step:430/1480 train_time:62437ms step_avg:148.66ms step:431/1480 train_time:62591ms step_avg:148.67ms step:432/1480 train_time:62743ms step_avg:148.68ms step:433/1480 train_time:62899ms step_avg:148.70ms step:434/1480 train_time:63052ms step_avg:148.71ms step:435/1480 train_time:63206ms step_avg:148.72ms step:436/1480 train_time:63359ms step_avg:148.73ms step:437/1480 train_time:63512ms step_avg:148.74ms step:438/1480 train_time:63665ms step_avg:148.75ms step:439/1480 train_time:63818ms step_avg:148.76ms step:440/1480 train_time:63974ms step_avg:148.78ms step:441/1480 train_time:64131ms step_avg:148.80ms step:442/1480 train_time:64288ms step_avg:148.82ms step:443/1480 train_time:64445ms step_avg:148.83ms step:444/1480 train_time:64601ms step_avg:148.85ms step:445/1480 train_time:64756ms step_avg:148.86ms step:446/1480 train_time:64912ms step_avg:148.88ms step:447/1480 train_time:65067ms step_avg:148.90ms step:448/1480 train_time:65224ms step_avg:148.91ms step:449/1480 train_time:65381ms step_avg:148.93ms step:450/1480 train_time:65539ms step_avg:148.95ms step:451/1480 train_time:65697ms step_avg:148.97ms step:452/1480 train_time:65854ms step_avg:148.99ms step:453/1480 train_time:66010ms step_avg:149.01ms step:454/1480 train_time:66167ms step_avg:149.03ms step:455/1480 train_time:66322ms step_avg:149.04ms step:456/1480 train_time:66479ms step_avg:149.06ms 
step:457/1480 train_time:66637ms step_avg:149.08ms step:458/1480 train_time:66793ms step_avg:149.09ms step:459/1480 train_time:66952ms step_avg:149.11ms step:460/1480 train_time:67110ms step_avg:149.13ms step:461/1480 train_time:67267ms step_avg:149.15ms step:462/1480 train_time:67424ms step_avg:149.17ms step:463/1480 train_time:67581ms step_avg:149.18ms step:464/1480 train_time:67737ms step_avg:149.20ms step:465/1480 train_time:67894ms step_avg:149.22ms step:466/1480 train_time:68052ms step_avg:149.24ms step:467/1480 train_time:68211ms step_avg:149.26ms step:468/1480 train_time:68369ms step_avg:149.28ms step:469/1480 train_time:68525ms step_avg:149.29ms step:470/1480 train_time:68681ms step_avg:149.31ms step:471/1480 train_time:68837ms step_avg:149.32ms step:472/1480 train_time:68996ms step_avg:149.34ms step:473/1480 train_time:69153ms step_avg:149.36ms step:474/1480 train_time:69309ms step_avg:149.37ms step:475/1480 train_time:69464ms step_avg:149.38ms step:476/1480 train_time:69620ms step_avg:149.40ms step:477/1480 train_time:69776ms step_avg:149.41ms step:478/1480 train_time:69933ms step_avg:149.43ms step:479/1480 train_time:70090ms step_avg:149.45ms step:480/1480 train_time:70246ms step_avg:149.46ms step:481/1480 train_time:70402ms step_avg:149.47ms step:482/1480 train_time:70558ms step_avg:149.49ms step:483/1480 train_time:70716ms step_avg:149.51ms step:484/1480 train_time:70874ms step_avg:149.52ms step:485/1480 train_time:71033ms step_avg:149.54ms step:486/1480 train_time:71191ms step_avg:149.56ms step:487/1480 train_time:71347ms step_avg:149.57ms step:488/1480 train_time:71504ms step_avg:149.59ms step:489/1480 train_time:71660ms step_avg:149.60ms step:490/1480 train_time:71817ms step_avg:149.62ms step:491/1480 train_time:71974ms step_avg:149.63ms step:492/1480 train_time:72132ms step_avg:149.65ms step:493/1480 train_time:72290ms step_avg:149.67ms step:494/1480 train_time:72447ms step_avg:149.68ms step:495/1480 train_time:72605ms step_avg:149.70ms 
step:496/1480 train_time:72762ms step_avg:149.72ms step:497/1480 train_time:72917ms step_avg:149.73ms step:498/1480 train_time:73075ms step_avg:149.74ms step:499/1480 train_time:73233ms step_avg:149.76ms step:500/1480 train_time:73391ms step_avg:149.78ms step:500/1480 val_loss:3.6847 train_time:73454ms step_avg:149.91ms step:501/1480 train_time:73551ms step_avg:149.80ms step:502/1480 train_time:73709ms step_avg:149.81ms step:503/1480 train_time:73865ms step_avg:149.83ms step:504/1480 train_time:74021ms step_avg:149.84ms step:505/1480 train_time:74176ms step_avg:149.85ms step:506/1480 train_time:74332ms step_avg:149.86ms step:507/1480 train_time:74489ms step_avg:149.88ms step:508/1480 train_time:74647ms step_avg:149.89ms step:509/1480 train_time:74803ms step_avg:149.91ms step:510/1480 train_time:74960ms step_avg:149.92ms step:511/1480 train_time:75117ms step_avg:149.93ms step:512/1480 train_time:75273ms step_avg:149.95ms step:513/1480 train_time:75428ms step_avg:149.96ms step:514/1480 train_time:75587ms step_avg:149.97ms step:515/1480 train_time:75745ms step_avg:149.99ms step:516/1480 train_time:75902ms step_avg:150.00ms step:517/1480 train_time:76061ms step_avg:150.02ms step:518/1480 train_time:76218ms step_avg:150.04ms step:519/1480 train_time:76374ms step_avg:150.05ms step:520/1480 train_time:76532ms step_avg:150.06ms step:521/1480 train_time:76689ms step_avg:150.08ms step:522/1480 train_time:76846ms step_avg:150.09ms step:523/1480 train_time:77003ms step_avg:150.10ms step:524/1480 train_time:77160ms step_avg:150.12ms step:525/1480 train_time:77317ms step_avg:150.13ms step:526/1480 train_time:77475ms step_avg:150.15ms step:527/1480 train_time:77631ms step_avg:150.16ms step:528/1480 train_time:77788ms step_avg:150.17ms step:529/1480 train_time:77945ms step_avg:150.18ms step:530/1480 train_time:78104ms step_avg:150.20ms step:531/1480 train_time:78263ms step_avg:150.22ms step:532/1480 train_time:78421ms step_avg:150.23ms step:533/1480 train_time:78577ms 
step_avg:150.24ms step:534/1480 train_time:78733ms step_avg:150.25ms step:535/1480 train_time:78890ms step_avg:150.27ms step:536/1480 train_time:79048ms step_avg:150.28ms step:537/1480 train_time:79205ms step_avg:150.29ms step:538/1480 train_time:79363ms step_avg:150.31ms step:539/1480 train_time:79523ms step_avg:150.33ms step:540/1480 train_time:79682ms step_avg:150.34ms step:541/1480 train_time:79838ms step_avg:150.35ms step:542/1480 train_time:79995ms step_avg:150.37ms step:543/1480 train_time:80151ms step_avg:150.38ms step:544/1480 train_time:80309ms step_avg:150.39ms step:545/1480 train_time:80464ms step_avg:150.40ms step:546/1480 train_time:80622ms step_avg:150.41ms step:547/1480 train_time:80780ms step_avg:150.43ms step:548/1480 train_time:80940ms step_avg:150.45ms step:549/1480 train_time:81098ms step_avg:150.46ms step:550/1480 train_time:81255ms step_avg:150.47ms step:551/1480 train_time:81412ms step_avg:150.48ms step:552/1480 train_time:81570ms step_avg:150.50ms step:553/1480 train_time:81729ms step_avg:150.51ms step:554/1480 train_time:81888ms step_avg:150.53ms step:555/1480 train_time:82049ms step_avg:150.55ms step:556/1480 train_time:82207ms step_avg:150.56ms step:557/1480 train_time:82367ms step_avg:150.58ms step:558/1480 train_time:82526ms step_avg:150.60ms step:559/1480 train_time:82685ms step_avg:150.61ms step:560/1480 train_time:82844ms step_avg:150.63ms step:561/1480 train_time:83002ms step_avg:150.64ms step:562/1480 train_time:83162ms step_avg:150.66ms step:563/1480 train_time:83322ms step_avg:150.67ms step:564/1480 train_time:83482ms step_avg:150.69ms step:565/1480 train_time:83640ms step_avg:150.70ms step:566/1480 train_time:83800ms step_avg:150.72ms step:567/1480 train_time:83959ms step_avg:150.73ms step:568/1480 train_time:84118ms step_avg:150.75ms step:569/1480 train_time:84276ms step_avg:150.76ms step:570/1480 train_time:84435ms step_avg:150.78ms step:571/1480 train_time:84595ms step_avg:150.79ms step:572/1480 train_time:84753ms 
step_avg:150.81ms step:573/1480 train_time:84912ms step_avg:150.82ms step:574/1480 train_time:85072ms step_avg:150.84ms step:575/1480 train_time:85231ms step_avg:150.85ms step:576/1480 train_time:85390ms step_avg:150.87ms step:577/1480 train_time:85549ms step_avg:150.88ms step:578/1480 train_time:85707ms step_avg:150.89ms step:579/1480 train_time:85867ms step_avg:150.91ms step:580/1480 train_time:86027ms step_avg:150.92ms step:581/1480 train_time:86187ms step_avg:150.94ms step:582/1480 train_time:86346ms step_avg:150.95ms step:583/1480 train_time:86506ms step_avg:150.97ms step:584/1480 train_time:86666ms step_avg:150.99ms step:585/1480 train_time:86825ms step_avg:151.00ms step:586/1480 train_time:86985ms step_avg:151.02ms step:587/1480 train_time:87144ms step_avg:151.03ms step:588/1480 train_time:87303ms step_avg:151.04ms step:589/1480 train_time:87464ms step_avg:151.06ms step:590/1480 train_time:87625ms step_avg:151.08ms step:591/1480 train_time:87784ms step_avg:151.09ms step:592/1480 train_time:87945ms step_avg:151.11ms step:593/1480 train_time:88106ms step_avg:151.12ms step:594/1480 train_time:88265ms step_avg:151.14ms step:595/1480 train_time:88427ms step_avg:151.16ms step:596/1480 train_time:88587ms step_avg:151.17ms step:597/1480 train_time:88746ms step_avg:151.19ms step:598/1480 train_time:88905ms step_avg:151.20ms step:599/1480 train_time:89063ms step_avg:151.21ms step:600/1480 train_time:89223ms step_avg:151.23ms step:601/1480 train_time:89383ms step_avg:151.24ms step:602/1480 train_time:89544ms step_avg:151.26ms step:603/1480 train_time:89704ms step_avg:151.27ms step:604/1480 train_time:89863ms step_avg:151.29ms step:605/1480 train_time:90024ms step_avg:151.30ms step:606/1480 train_time:90186ms step_avg:151.32ms step:607/1480 train_time:90348ms step_avg:151.34ms step:608/1480 train_time:90507ms step_avg:151.35ms step:609/1480 train_time:90665ms step_avg:151.36ms step:610/1480 train_time:90823ms step_avg:151.37ms step:611/1480 train_time:90984ms 
step_avg:151.39ms step:612/1480 train_time:91144ms step_avg:151.40ms step:613/1480 train_time:91305ms step_avg:151.42ms step:614/1480 train_time:91466ms step_avg:151.43ms step:615/1480 train_time:91625ms step_avg:151.45ms step:616/1480 train_time:91784ms step_avg:151.46ms step:617/1480 train_time:91943ms step_avg:151.47ms step:618/1480 train_time:92103ms step_avg:151.49ms step:619/1480 train_time:92263ms step_avg:151.50ms step:620/1480 train_time:92424ms step_avg:151.52ms step:621/1480 train_time:92584ms step_avg:151.53ms step:622/1480 train_time:92743ms step_avg:151.54ms step:623/1480 train_time:92905ms step_avg:151.56ms step:624/1480 train_time:93064ms step_avg:151.57ms step:625/1480 train_time:93223ms step_avg:151.58ms step:625/1480 val_loss:3.6024 train_time:93286ms step_avg:151.68ms step:626/1480 train_time:93385ms step_avg:151.60ms step:627/1480 train_time:93545ms step_avg:151.61ms step:628/1480 train_time:93703ms step_avg:151.62ms step:629/1480 train_time:93862ms step_avg:151.64ms step:630/1480 train_time:94020ms step_avg:151.64ms step:631/1480 train_time:94178ms step_avg:151.65ms step:632/1480 train_time:94337ms step_avg:151.67ms step:633/1480 train_time:94498ms step_avg:151.68ms step:634/1480 train_time:94658ms step_avg:151.69ms step:635/1480 train_time:94817ms step_avg:151.71ms step:636/1480 train_time:94976ms step_avg:151.72ms step:637/1480 train_time:95136ms step_avg:151.73ms step:638/1480 train_time:95295ms step_avg:151.74ms step:639/1480 train_time:95455ms step_avg:151.76ms step:640/1480 train_time:95614ms step_avg:151.77ms step:641/1480 train_time:95774ms step_avg:151.78ms step:642/1480 train_time:95934ms step_avg:151.79ms step:643/1480 train_time:96095ms step_avg:151.81ms step:644/1480 train_time:96254ms step_avg:151.82ms step:645/1480 train_time:96412ms step_avg:151.83ms step:646/1480 train_time:96573ms step_avg:151.84ms step:647/1480 train_time:96733ms step_avg:151.86ms step:648/1480 train_time:96895ms step_avg:151.87ms step:649/1480 
train_time:97055ms step_avg:151.89ms step:650/1480 train_time:97214ms step_avg:151.90ms step:651/1480 train_time:97375ms step_avg:151.91ms step:652/1480 train_time:97535ms step_avg:151.92ms step:653/1480 train_time:97695ms step_avg:151.94ms step:654/1480 train_time:97855ms step_avg:151.95ms step:655/1480 train_time:98014ms step_avg:151.96ms step:656/1480 train_time:98175ms step_avg:151.97ms step:657/1480 train_time:98335ms step_avg:151.99ms step:658/1480 train_time:98494ms step_avg:152.00ms step:659/1480 train_time:98657ms step_avg:152.01ms step:660/1480 train_time:98819ms step_avg:152.03ms step:661/1480 train_time:98981ms step_avg:152.05ms step:662/1480 train_time:99140ms step_avg:152.06ms step:663/1480 train_time:99300ms step_avg:152.07ms step:664/1480 train_time:99462ms step_avg:152.08ms step:665/1480 train_time:99623ms step_avg:152.10ms step:666/1480 train_time:99783ms step_avg:152.11ms step:667/1480 train_time:99944ms step_avg:152.12ms step:668/1480 train_time:100107ms step_avg:152.14ms step:669/1480 train_time:100271ms step_avg:152.16ms step:670/1480 train_time:100432ms step_avg:152.17ms step:671/1480 train_time:100595ms step_avg:152.19ms step:672/1480 train_time:100759ms step_avg:152.20ms step:673/1480 train_time:100920ms step_avg:152.22ms step:674/1480 train_time:101081ms step_avg:152.23ms step:675/1480 train_time:101243ms step_avg:152.24ms step:676/1480 train_time:101404ms step_avg:152.26ms step:677/1480 train_time:101564ms step_avg:152.27ms step:678/1480 train_time:101723ms step_avg:152.28ms step:679/1480 train_time:101886ms step_avg:152.30ms step:680/1480 train_time:102050ms step_avg:152.31ms step:681/1480 train_time:102211ms step_avg:152.33ms step:682/1480 train_time:102375ms step_avg:152.34ms step:683/1480 train_time:102537ms step_avg:152.36ms step:684/1480 train_time:102698ms step_avg:152.37ms step:685/1480 train_time:102860ms step_avg:152.39ms step:686/1480 train_time:103021ms step_avg:152.40ms step:687/1480 train_time:103181ms step_avg:152.41ms 
step:688/1480 train_time:103344ms step_avg:152.43ms step:689/1480 train_time:103507ms step_avg:152.44ms step:690/1480 train_time:103672ms step_avg:152.46ms step:691/1480 train_time:103833ms step_avg:152.47ms step:692/1480 train_time:103996ms step_avg:152.49ms step:693/1480 train_time:104158ms step_avg:152.50ms step:694/1480 train_time:104318ms step_avg:152.51ms step:695/1480 train_time:104479ms step_avg:152.52ms step:696/1480 train_time:104638ms step_avg:152.53ms step:697/1480 train_time:104802ms step_avg:152.55ms step:698/1480 train_time:104963ms step_avg:152.56ms step:699/1480 train_time:105125ms step_avg:152.58ms step:700/1480 train_time:105287ms step_avg:152.59ms step:701/1480 train_time:105447ms step_avg:152.60ms step:702/1480 train_time:105608ms step_avg:152.61ms step:703/1480 train_time:105770ms step_avg:152.63ms step:704/1480 train_time:105933ms step_avg:152.64ms step:705/1480 train_time:106097ms step_avg:152.66ms step:706/1480 train_time:106261ms step_avg:152.67ms step:707/1480 train_time:106421ms step_avg:152.68ms step:708/1480 train_time:106583ms step_avg:152.70ms step:709/1480 train_time:106745ms step_avg:152.71ms step:710/1480 train_time:106905ms step_avg:152.72ms step:711/1480 train_time:107068ms step_avg:152.74ms step:712/1480 train_time:107234ms step_avg:152.76ms step:713/1480 train_time:107398ms step_avg:152.77ms step:714/1480 train_time:107559ms step_avg:152.78ms step:715/1480 train_time:107718ms step_avg:152.79ms step:716/1480 train_time:107878ms step_avg:152.80ms step:717/1480 train_time:108041ms step_avg:152.82ms step:718/1480 train_time:108201ms step_avg:152.83ms step:719/1480 train_time:108362ms step_avg:152.84ms step:720/1480 train_time:108523ms step_avg:152.85ms step:721/1480 train_time:108684ms step_avg:152.86ms step:722/1480 train_time:108845ms step_avg:152.87ms step:723/1480 train_time:109004ms step_avg:152.88ms step:724/1480 train_time:109167ms step_avg:152.89ms step:725/1480 train_time:109331ms step_avg:152.91ms step:726/1480 
train_time:109497ms step_avg:152.93ms step:727/1480 train_time:109659ms step_avg:152.94ms step:728/1480 train_time:109819ms step_avg:152.95ms step:729/1480 train_time:109979ms step_avg:152.96ms step:730/1480 train_time:110141ms step_avg:152.97ms step:731/1480 train_time:110301ms step_avg:152.98ms step:732/1480 train_time:110462ms step_avg:152.99ms step:733/1480 train_time:110623ms step_avg:153.01ms step:734/1480 train_time:110784ms step_avg:153.02ms step:735/1480 train_time:110945ms step_avg:153.03ms step:736/1480 train_time:111109ms step_avg:153.04ms step:737/1480 train_time:111270ms step_avg:153.05ms step:738/1480 train_time:111432ms step_avg:153.07ms step:739/1480 train_time:111594ms step_avg:153.08ms step:740/1480 train_time:111760ms step_avg:153.10ms step:741/1480 train_time:111923ms step_avg:153.11ms step:742/1480 train_time:112085ms step_avg:153.12ms step:743/1480 train_time:112245ms step_avg:153.13ms step:744/1480 train_time:112411ms step_avg:153.15ms step:745/1480 train_time:112578ms step_avg:153.17ms step:746/1480 train_time:112737ms step_avg:153.18ms step:747/1480 train_time:112898ms step_avg:153.19ms step:748/1480 train_time:113063ms step_avg:153.20ms step:749/1480 train_time:113225ms step_avg:153.21ms step:750/1480 train_time:113385ms step_avg:153.22ms step:750/1480 val_loss:3.5485 train_time:113449ms step_avg:153.31ms step:751/1480 train_time:113550ms step_avg:153.24ms step:752/1480 train_time:113714ms step_avg:153.25ms step:753/1480 train_time:113876ms step_avg:153.27ms step:754/1480 train_time:114037ms step_avg:153.28ms step:755/1480 train_time:114198ms step_avg:153.29ms step:756/1480 train_time:114360ms step_avg:153.30ms step:757/1480 train_time:114524ms step_avg:153.31ms step:758/1480 train_time:114684ms step_avg:153.32ms step:759/1480 train_time:114845ms step_avg:153.33ms step:760/1480 train_time:115005ms step_avg:153.34ms step:761/1480 train_time:115166ms step_avg:153.35ms step:762/1480 train_time:115327ms step_avg:153.36ms step:763/1480 
train_time:115488ms step_avg:153.37ms step:764/1480 train_time:115649ms step_avg:153.38ms step:765/1480 train_time:115811ms step_avg:153.39ms step:766/1480 train_time:115974ms step_avg:153.40ms step:767/1480 train_time:116137ms step_avg:153.42ms step:768/1480 train_time:116299ms step_avg:153.43ms step:769/1480 train_time:116463ms step_avg:153.44ms step:770/1480 train_time:116625ms step_avg:153.45ms step:771/1480 train_time:116790ms step_avg:153.47ms step:772/1480 train_time:116952ms step_avg:153.48ms step:773/1480 train_time:117115ms step_avg:153.49ms step:774/1480 train_time:117279ms step_avg:153.51ms step:775/1480 train_time:117441ms step_avg:153.52ms step:776/1480 train_time:117604ms step_avg:153.53ms step:777/1480 train_time:117769ms step_avg:153.54ms step:778/1480 train_time:117931ms step_avg:153.56ms step:779/1480 train_time:118094ms step_avg:153.57ms step:780/1480 train_time:118259ms step_avg:153.58ms step:781/1480 train_time:118424ms step_avg:153.60ms step:782/1480 train_time:118588ms step_avg:153.61ms step:783/1480 train_time:118748ms step_avg:153.62ms step:784/1480 train_time:118910ms step_avg:153.63ms step:785/1480 train_time:119073ms step_avg:153.64ms step:786/1480 train_time:119239ms step_avg:153.66ms step:787/1480 train_time:119403ms step_avg:153.67ms step:788/1480 train_time:119567ms step_avg:153.69ms step:789/1480 train_time:119729ms step_avg:153.70ms step:790/1480 train_time:119895ms step_avg:153.71ms step:791/1480 train_time:120063ms step_avg:153.73ms step:792/1480 train_time:120228ms step_avg:153.74ms step:793/1480 train_time:120389ms step_avg:153.75ms step:794/1480 train_time:120555ms step_avg:153.77ms step:795/1480 train_time:120720ms step_avg:153.78ms step:796/1480 train_time:120886ms step_avg:153.80ms step:797/1480 train_time:121049ms step_avg:153.81ms step:798/1480 train_time:121214ms step_avg:153.82ms step:799/1480 train_time:121379ms step_avg:153.84ms step:800/1480 train_time:121543ms step_avg:153.85ms step:801/1480 train_time:121705ms 
step_avg:153.86ms step:802/1480 train_time:121874ms step_avg:153.88ms step:803/1480 train_time:122038ms step_avg:153.89ms step:804/1480 train_time:122200ms step_avg:153.90ms step:805/1480 train_time:122365ms step_avg:153.92ms step:806/1480 train_time:122528ms step_avg:153.93ms step:807/1480 train_time:122687ms step_avg:153.94ms step:808/1480 train_time:122850ms step_avg:153.95ms step:809/1480 train_time:123013ms step_avg:153.96ms step:810/1480 train_time:123177ms step_avg:153.97ms step:811/1480 train_time:123341ms step_avg:153.98ms step:812/1480 train_time:123503ms step_avg:153.99ms step:813/1480 train_time:123664ms step_avg:154.00ms step:814/1480 train_time:123826ms step_avg:154.01ms step:815/1480 train_time:123988ms step_avg:154.02ms step:816/1480 train_time:124154ms step_avg:154.04ms step:817/1480 train_time:124318ms step_avg:154.05ms step:818/1480 train_time:124480ms step_avg:154.06ms step:819/1480 train_time:124644ms step_avg:154.07ms step:820/1480 train_time:124809ms step_avg:154.09ms step:821/1480 train_time:124970ms step_avg:154.09ms step:822/1480 train_time:125134ms step_avg:154.11ms step:823/1480 train_time:125297ms step_avg:154.12ms step:824/1480 train_time:125460ms step_avg:154.13ms step:825/1480 train_time:125624ms step_avg:154.14ms step:826/1480 train_time:125789ms step_avg:154.15ms step:827/1480 train_time:125953ms step_avg:154.16ms step:828/1480 train_time:126117ms step_avg:154.18ms step:829/1480 train_time:126282ms step_avg:154.19ms step:830/1480 train_time:126446ms step_avg:154.20ms step:831/1480 train_time:126608ms step_avg:154.21ms step:832/1480 train_time:126773ms step_avg:154.23ms step:833/1480 train_time:126939ms step_avg:154.24ms step:834/1480 train_time:127102ms step_avg:154.25ms step:835/1480 train_time:127266ms step_avg:154.26ms step:836/1480 train_time:127432ms step_avg:154.28ms step:837/1480 train_time:127596ms step_avg:154.29ms step:838/1480 train_time:127762ms step_avg:154.30ms step:839/1480 train_time:127924ms step_avg:154.31ms 
step:840/1480 train_time:128085ms step_avg:154.32ms step:841/1480 train_time:128246ms step_avg:154.33ms step:842/1480 train_time:128409ms step_avg:154.34ms step:843/1480 train_time:128571ms step_avg:154.35ms step:844/1480 train_time:128734ms step_avg:154.36ms step:845/1480 train_time:128898ms step_avg:154.37ms step:846/1480 train_time:129063ms step_avg:154.38ms step:847/1480 train_time:129227ms step_avg:154.39ms step:848/1480 train_time:129389ms step_avg:154.40ms step:849/1480 train_time:129553ms step_avg:154.41ms step:850/1480 train_time:129716ms step_avg:154.42ms step:851/1480 train_time:129881ms step_avg:154.44ms step:852/1480 train_time:130042ms step_avg:154.44ms step:853/1480 train_time:130203ms step_avg:154.45ms step:854/1480 train_time:130367ms step_avg:154.46ms step:855/1480 train_time:130532ms step_avg:154.48ms step:856/1480 train_time:130695ms step_avg:154.49ms step:857/1480 train_time:130861ms step_avg:154.50ms step:858/1480 train_time:131026ms step_avg:154.51ms step:859/1480 train_time:131190ms step_avg:154.52ms step:860/1480 train_time:131351ms step_avg:154.53ms step:861/1480 train_time:131519ms step_avg:154.55ms step:862/1480 train_time:131687ms step_avg:154.56ms step:863/1480 train_time:131856ms step_avg:154.58ms step:864/1480 train_time:132020ms step_avg:154.59ms step:865/1480 train_time:132181ms step_avg:154.60ms step:866/1480 train_time:132348ms step_avg:154.61ms step:867/1480 train_time:132511ms step_avg:154.62ms step:868/1480 train_time:132673ms step_avg:154.63ms step:869/1480 train_time:132836ms step_avg:154.64ms step:870/1480 train_time:133000ms step_avg:154.65ms step:871/1480 train_time:133164ms step_avg:154.66ms step:872/1480 train_time:133327ms step_avg:154.67ms step:873/1480 train_time:133490ms step_avg:154.68ms step:874/1480 train_time:133655ms step_avg:154.69ms step:875/1480 train_time:133821ms step_avg:154.71ms step:875/1480 val_loss:3.5014 train_time:133886ms step_avg:154.78ms step:876/1480 train_time:133985ms step_avg:154.72ms 
step:877/1480 train_time:134148ms step_avg:154.73ms step:878/1480 train_time:134312ms step_avg:154.74ms step:879/1480 train_time:134476ms step_avg:154.75ms step:880/1480 train_time:134638ms step_avg:154.76ms step:881/1480 train_time:134803ms step_avg:154.77ms step:882/1480 train_time:134969ms step_avg:154.78ms step:883/1480 train_time:135133ms step_avg:154.79ms step:884/1480 train_time:135302ms step_avg:154.81ms step:885/1480 train_time:135466ms step_avg:154.82ms step:886/1480 train_time:135632ms step_avg:154.83ms step:887/1480 train_time:135802ms step_avg:154.85ms step:888/1480 train_time:135974ms step_avg:154.87ms step:889/1480 train_time:136141ms step_avg:154.88ms step:890/1480 train_time:136304ms step_avg:154.89ms step:891/1480 train_time:136469ms step_avg:154.90ms step:892/1480 train_time:136633ms step_avg:154.91ms step:893/1480 train_time:136798ms step_avg:154.92ms step:894/1480 train_time:136966ms step_avg:154.94ms step:895/1480 train_time:137132ms step_avg:154.95ms step:896/1480 train_time:137297ms step_avg:154.96ms step:897/1480 train_time:137464ms step_avg:154.98ms step:898/1480 train_time:137631ms step_avg:154.99ms step:899/1480 train_time:137795ms step_avg:155.00ms step:900/1480 train_time:137960ms step_avg:155.01ms step:901/1480 train_time:138124ms step_avg:155.02ms step:902/1480 train_time:138288ms step_avg:155.03ms step:903/1480 train_time:138459ms step_avg:155.05ms step:904/1480 train_time:138624ms step_avg:155.06ms step:905/1480 train_time:138785ms step_avg:155.07ms step:906/1480 train_time:138951ms step_avg:155.08ms step:907/1480 train_time:139121ms step_avg:155.10ms step:908/1480 train_time:139283ms step_avg:155.10ms step:909/1480 train_time:139448ms step_avg:155.11ms step:910/1480 train_time:139618ms step_avg:155.13ms step:911/1480 train_time:139783ms step_avg:155.14ms step:912/1480 train_time:139949ms step_avg:155.15ms step:913/1480 train_time:140116ms step_avg:155.17ms step:914/1480 train_time:140284ms step_avg:155.18ms step:915/1480 
train_time:140452ms step_avg:155.20ms step:916/1480 train_time:140618ms step_avg:155.21ms step:917/1480 train_time:140781ms step_avg:155.22ms step:918/1480 train_time:140949ms step_avg:155.23ms step:919/1480 train_time:141117ms step_avg:155.24ms step:920/1480 train_time:141284ms step_avg:155.26ms step:921/1480 train_time:141450ms step_avg:155.27ms step:922/1480 train_time:141619ms step_avg:155.28ms step:923/1480 train_time:141782ms step_avg:155.29ms step:924/1480 train_time:141945ms step_avg:155.30ms step:925/1480 train_time:142110ms step_avg:155.31ms step:926/1480 train_time:142272ms step_avg:155.32ms step:927/1480 train_time:142437ms step_avg:155.33ms step:928/1480 train_time:142604ms step_avg:155.34ms step:929/1480 train_time:142769ms step_avg:155.35ms step:930/1480 train_time:142932ms step_avg:155.36ms step:931/1480 train_time:143096ms step_avg:155.37ms step:932/1480 train_time:143262ms step_avg:155.38ms step:933/1480 train_time:143429ms step_avg:155.39ms step:934/1480 train_time:143596ms step_avg:155.41ms step:935/1480 train_time:143767ms step_avg:155.42ms step:936/1480 train_time:143934ms step_avg:155.44ms step:937/1480 train_time:144105ms step_avg:155.45ms step:938/1480 train_time:144268ms step_avg:155.46ms step:939/1480 train_time:144437ms step_avg:155.48ms step:940/1480 train_time:144604ms step_avg:155.49ms step:941/1480 train_time:144768ms step_avg:155.50ms step:942/1480 train_time:144932ms step_avg:155.51ms step:943/1480 train_time:145104ms step_avg:155.52ms step:944/1480 train_time:145277ms step_avg:155.54ms step:945/1480 train_time:145441ms step_avg:155.55ms step:946/1480 train_time:145610ms step_avg:155.57ms step:947/1480 train_time:145778ms step_avg:155.58ms step:948/1480 train_time:145945ms step_avg:155.59ms step:949/1480 train_time:146109ms step_avg:155.60ms step:950/1480 train_time:146272ms step_avg:155.61ms step:951/1480 train_time:146440ms step_avg:155.62ms step:952/1480 train_time:146606ms step_avg:155.63ms step:953/1480 train_time:146776ms 
step_avg:155.65ms step:954/1480 train_time:146943ms step_avg:155.66ms step:955/1480 train_time:147107ms step_avg:155.67ms step:956/1480 train_time:147272ms step_avg:155.68ms step:957/1480 train_time:147439ms step_avg:155.69ms step:958/1480 train_time:147611ms step_avg:155.71ms step:959/1480 train_time:147777ms step_avg:155.72ms step:960/1480 train_time:147944ms step_avg:155.73ms step:961/1480 train_time:148109ms step_avg:155.74ms step:962/1480 train_time:148274ms step_avg:155.75ms step:963/1480 train_time:148441ms step_avg:155.76ms step:964/1480 train_time:148609ms step_avg:155.77ms step:965/1480 train_time:148773ms step_avg:155.78ms step:966/1480 train_time:148938ms step_avg:155.79ms step:967/1480 train_time:149103ms step_avg:155.80ms step:968/1480 train_time:149269ms step_avg:155.81ms step:969/1480 train_time:149436ms step_avg:155.82ms step:970/1480 train_time:149601ms step_avg:155.83ms step:971/1480 train_time:149766ms step_avg:155.84ms step:972/1480 train_time:149930ms step_avg:155.85ms step:973/1480 train_time:150094ms step_avg:155.86ms step:974/1480 train_time:150264ms step_avg:155.88ms step:975/1480 train_time:150428ms step_avg:155.88ms step:976/1480 train_time:150593ms step_avg:155.89ms step:977/1480 train_time:150760ms step_avg:155.90ms step:978/1480 train_time:150926ms step_avg:155.92ms step:979/1480 train_time:151091ms step_avg:155.92ms step:980/1480 train_time:151257ms step_avg:155.94ms step:981/1480 train_time:151426ms step_avg:155.95ms step:982/1480 train_time:151589ms step_avg:155.96ms step:983/1480 train_time:151754ms step_avg:155.97ms step:984/1480 train_time:151920ms step_avg:155.98ms step:985/1480 train_time:152088ms step_avg:155.99ms step:986/1480 train_time:152253ms step_avg:156.00ms step:987/1480 train_time:152417ms step_avg:156.00ms step:988/1480 train_time:152586ms step_avg:156.02ms step:989/1480 train_time:152750ms step_avg:156.03ms step:990/1480 train_time:152921ms step_avg:156.04ms step:991/1480 train_time:153088ms step_avg:156.05ms 
step:992/1480 train_time:153264ms step_avg:156.07ms step:993/1480 train_time:153440ms step_avg:156.09ms step:994/1480 train_time:153606ms step_avg:156.10ms step:995/1480 train_time:153769ms step_avg:156.11ms step:996/1480 train_time:153932ms step_avg:156.12ms step:997/1480 train_time:154096ms step_avg:156.13ms step:998/1480 train_time:154260ms step_avg:156.13ms step:999/1480 train_time:154425ms step_avg:156.14ms step:1000/1480 train_time:154595ms step_avg:156.16ms step:1000/1480 val_loss:3.4376 train_time:154663ms step_avg:156.23ms step:1001/1480 train_time:154764ms step_avg:156.17ms step:1002/1480 train_time:154930ms step_avg:156.18ms step:1003/1480 train_time:155101ms step_avg:156.19ms step:1004/1480 train_time:155270ms step_avg:156.21ms step:1005/1480 train_time:155439ms step_avg:156.22ms step:1006/1480 train_time:155605ms step_avg:156.23ms step:1007/1480 train_time:155770ms step_avg:156.24ms step:1008/1480 train_time:155937ms step_avg:156.25ms step:1009/1480 train_time:156110ms step_avg:156.27ms step:1010/1480 train_time:156276ms step_avg:156.28ms step:1011/1480 train_time:156441ms step_avg:156.29ms step:1012/1480 train_time:156607ms step_avg:156.29ms step:1013/1480 train_time:156778ms step_avg:156.31ms step:1014/1480 train_time:156944ms step_avg:156.32ms step:1015/1480 train_time:157114ms step_avg:156.33ms step:1016/1480 train_time:157282ms step_avg:156.34ms step:1017/1480 train_time:157454ms step_avg:156.36ms step:1018/1480 train_time:157623ms step_avg:156.37ms step:1019/1480 train_time:157792ms step_avg:156.38ms step:1020/1480 train_time:157961ms step_avg:156.40ms step:1021/1480 train_time:158125ms step_avg:156.40ms step:1022/1480 train_time:158294ms step_avg:156.42ms step:1023/1480 train_time:158461ms step_avg:156.43ms step:1024/1480 train_time:158628ms step_avg:156.44ms step:1025/1480 train_time:158799ms step_avg:156.45ms step:1026/1480 train_time:158965ms step_avg:156.46ms step:1027/1480 train_time:159131ms step_avg:156.47ms step:1028/1480 
train_time:159303ms step_avg:156.49ms step:1029/1480 train_time:159478ms step_avg:156.50ms step:1030/1480 train_time:159646ms step_avg:156.52ms step:1031/1480 train_time:159811ms step_avg:156.52ms step:1032/1480 train_time:159983ms step_avg:156.54ms step:1033/1480 train_time:160149ms step_avg:156.55ms step:1034/1480 train_time:160319ms step_avg:156.56ms step:1035/1480 train_time:160487ms step_avg:156.57ms step:1036/1480 train_time:160653ms step_avg:156.58ms step:1037/1480 train_time:160820ms step_avg:156.59ms step:1038/1480 train_time:160990ms step_avg:156.60ms step:1039/1480 train_time:161160ms step_avg:156.62ms step:1040/1480 train_time:161326ms step_avg:156.63ms step:1041/1480 train_time:161495ms step_avg:156.64ms step:1042/1480 train_time:161660ms step_avg:156.65ms step:1043/1480 train_time:161825ms step_avg:156.66ms step:1044/1480 train_time:161990ms step_avg:156.66ms step:1045/1480 train_time:162159ms step_avg:156.68ms step:1046/1480 train_time:162327ms step_avg:156.69ms step:1047/1480 train_time:162495ms step_avg:156.70ms step:1048/1480 train_time:162660ms step_avg:156.71ms step:1049/1480 train_time:162826ms step_avg:156.71ms step:1050/1480 train_time:162996ms step_avg:156.73ms step:1051/1480 train_time:163165ms step_avg:156.74ms step:1052/1480 train_time:163334ms step_avg:156.75ms step:1053/1480 train_time:163500ms step_avg:156.76ms step:1054/1480 train_time:163668ms step_avg:156.77ms step:1055/1480 train_time:163834ms step_avg:156.78ms step:1056/1480 train_time:163999ms step_avg:156.79ms step:1057/1480 train_time:164166ms step_avg:156.80ms step:1058/1480 train_time:164338ms step_avg:156.81ms step:1059/1480 train_time:164510ms step_avg:156.83ms step:1060/1480 train_time:164678ms step_avg:156.84ms step:1061/1480 train_time:164841ms step_avg:156.84ms step:1062/1480 train_time:165008ms step_avg:156.85ms step:1063/1480 train_time:165174ms step_avg:156.86ms step:1064/1480 train_time:165338ms step_avg:156.87ms step:1065/1480 train_time:165504ms step_avg:156.88ms 
step:1066/1480 train_time:165672ms step_avg:156.89ms step:1067/1480 train_time:165840ms step_avg:156.90ms step:1068/1480 train_time:166005ms step_avg:156.90ms step:1069/1480 train_time:166176ms step_avg:156.92ms step:1070/1480 train_time:166342ms step_avg:156.93ms step:1071/1480 train_time:166516ms step_avg:156.94ms step:1072/1480 train_time:166681ms step_avg:156.95ms step:1073/1480 train_time:166845ms step_avg:156.96ms step:1074/1480 train_time:167014ms step_avg:156.97ms step:1075/1480 train_time:167183ms step_avg:156.98ms step:1076/1480 train_time:167351ms step_avg:156.99ms step:1077/1480 train_time:167519ms step_avg:157.00ms step:1078/1480 train_time:167695ms step_avg:157.02ms step:1079/1480 train_time:167868ms step_avg:157.03ms step:1080/1480 train_time:168038ms step_avg:157.04ms step:1081/1480 train_time:168203ms step_avg:157.05ms step:1082/1480 train_time:168368ms step_avg:157.06ms step:1083/1480 train_time:168538ms step_avg:157.07ms step:1084/1480 train_time:168705ms step_avg:157.08ms step:1085/1480 train_time:168874ms step_avg:157.09ms step:1086/1480 train_time:169042ms step_avg:157.10ms step:1087/1480 train_time:169210ms step_avg:157.11ms step:1088/1480 train_time:169379ms step_avg:157.12ms step:1089/1480 train_time:169553ms step_avg:157.14ms step:1090/1480 train_time:169725ms step_avg:157.15ms step:1091/1480 train_time:169895ms step_avg:157.16ms step:1092/1480 train_time:170063ms step_avg:157.17ms step:1093/1480 train_time:170232ms step_avg:157.19ms step:1094/1480 train_time:170399ms step_avg:157.19ms step:1095/1480 train_time:170563ms step_avg:157.20ms step:1096/1480 train_time:170733ms step_avg:157.21ms step:1097/1480 train_time:170901ms step_avg:157.22ms step:1098/1480 train_time:171072ms step_avg:157.24ms step:1099/1480 train_time:171244ms step_avg:157.25ms step:1100/1480 train_time:171416ms step_avg:157.26ms step:1101/1480 train_time:171586ms step_avg:157.27ms step:1102/1480 train_time:171759ms step_avg:157.29ms step:1103/1480 train_time:171937ms 
step_avg:157.31ms step:1104/1480 train_time:172105ms step_avg:157.32ms step:1105/1480 train_time:172275ms step_avg:157.33ms step:1106/1480 train_time:172443ms step_avg:157.34ms step:1107/1480 train_time:172612ms step_avg:157.35ms step:1108/1480 train_time:172778ms step_avg:157.36ms step:1109/1480 train_time:172944ms step_avg:157.36ms step:1110/1480 train_time:173111ms step_avg:157.37ms step:1111/1480 train_time:173277ms step_avg:157.38ms step:1112/1480 train_time:173446ms step_avg:157.39ms step:1113/1480 train_time:173626ms step_avg:157.41ms step:1114/1480 train_time:173798ms step_avg:157.43ms step:1115/1480 train_time:173970ms step_avg:157.44ms step:1116/1480 train_time:174138ms step_avg:157.45ms step:1117/1480 train_time:174311ms step_avg:157.46ms step:1118/1480 train_time:174487ms step_avg:157.48ms step:1119/1480 train_time:174652ms step_avg:157.49ms step:1120/1480 train_time:174820ms step_avg:157.50ms step:1121/1480 train_time:174990ms step_avg:157.51ms step:1122/1480 train_time:175157ms step_avg:157.51ms step:1123/1480 train_time:175322ms step_avg:157.52ms step:1124/1480 train_time:175492ms step_avg:157.53ms step:1125/1480 train_time:175660ms step_avg:157.54ms step:1125/1480 val_loss:3.3833 train_time:175728ms step_avg:157.60ms step:1126/1480 train_time:175829ms step_avg:157.55ms step:1127/1480 train_time:175999ms step_avg:157.56ms step:1128/1480 train_time:176169ms step_avg:157.58ms step:1129/1480 train_time:176344ms step_avg:157.59ms step:1130/1480 train_time:176512ms step_avg:157.60ms step:1131/1480 train_time:176689ms step_avg:157.62ms step:1132/1480 train_time:176855ms step_avg:157.62ms step:1133/1480 train_time:177028ms step_avg:157.64ms step:1134/1480 train_time:177200ms step_avg:157.65ms step:1135/1480 train_time:177368ms step_avg:157.66ms step:1136/1480 train_time:177539ms step_avg:157.67ms step:1137/1480 train_time:177709ms step_avg:157.68ms step:1138/1480 train_time:177882ms step_avg:157.70ms step:1139/1480 train_time:178051ms step_avg:157.71ms 
step:1140/1480 train_time:178219ms step_avg:157.72ms step:1141/1480 train_time:178391ms step_avg:157.73ms step:1142/1480 train_time:178560ms step_avg:157.74ms step:1143/1480 train_time:178728ms step_avg:157.75ms step:1144/1480 train_time:178896ms step_avg:157.76ms step:1145/1480 train_time:179063ms step_avg:157.76ms step:1146/1480 train_time:179232ms step_avg:157.78ms step:1147/1480 train_time:179400ms step_avg:157.78ms step:1148/1480 train_time:179569ms step_avg:157.79ms step:1149/1480 train_time:179741ms step_avg:157.81ms step:1150/1480 train_time:179909ms step_avg:157.82ms step:1151/1480 train_time:180084ms step_avg:157.83ms step:1152/1480 train_time:180255ms step_avg:157.84ms step:1153/1480 train_time:180429ms step_avg:157.86ms step:1154/1480 train_time:180594ms step_avg:157.86ms step:1155/1480 train_time:180768ms step_avg:157.88ms step:1156/1480 train_time:180946ms step_avg:157.89ms step:1157/1480 train_time:181116ms step_avg:157.90ms step:1158/1480 train_time:181283ms step_avg:157.91ms step:1159/1480 train_time:181450ms step_avg:157.92ms step:1160/1480 train_time:181616ms step_avg:157.93ms step:1161/1480 train_time:181787ms step_avg:157.94ms step:1162/1480 train_time:181956ms step_avg:157.95ms step:1163/1480 train_time:182126ms step_avg:157.96ms step:1164/1480 train_time:182294ms step_avg:157.97ms step:1165/1480 train_time:182461ms step_avg:157.98ms step:1166/1480 train_time:182629ms step_avg:157.98ms step:1167/1480 train_time:182797ms step_avg:157.99ms step:1168/1480 train_time:182965ms step_avg:158.00ms step:1169/1480 train_time:183133ms step_avg:158.01ms step:1170/1480 train_time:183302ms step_avg:158.02ms step:1171/1480 train_time:183468ms step_avg:158.03ms step:1172/1480 train_time:183635ms step_avg:158.03ms step:1173/1480 train_time:183805ms step_avg:158.04ms step:1174/1480 train_time:183986ms step_avg:158.06ms step:1175/1480 train_time:184157ms step_avg:158.07ms step:1176/1480 train_time:184329ms step_avg:158.09ms step:1177/1480 train_time:184506ms 
step_avg:158.10ms step:1178/1480 train_time:184673ms step_avg:158.11ms step:1179/1480 train_time:184839ms step_avg:158.12ms step:1180/1480 train_time:185020ms step_avg:158.14ms step:1181/1480 train_time:185190ms step_avg:158.15ms step:1182/1480 train_time:185358ms step_avg:158.16ms step:1183/1480 train_time:185529ms step_avg:158.17ms step:1184/1480 train_time:185696ms step_avg:158.17ms step:1185/1480 train_time:185869ms step_avg:158.19ms step:1186/1480 train_time:186040ms step_avg:158.20ms step:1187/1480 train_time:186224ms step_avg:158.22ms step:1188/1480 train_time:186391ms step_avg:158.23ms step:1189/1480 train_time:186563ms step_avg:158.24ms step:1190/1480 train_time:186729ms step_avg:158.25ms step:1191/1480 train_time:186902ms step_avg:158.26ms step:1192/1480 train_time:187069ms step_avg:158.26ms step:1193/1480 train_time:187236ms step_avg:158.27ms step:1194/1480 train_time:187405ms step_avg:158.28ms step:1195/1480 train_time:187580ms step_avg:158.30ms step:1196/1480 train_time:187764ms step_avg:158.32ms step:1197/1480 train_time:187934ms step_avg:158.33ms step:1198/1480 train_time:188116ms step_avg:158.35ms step:1199/1480 train_time:188286ms step_avg:158.36ms step:1200/1480 train_time:188455ms step_avg:158.37ms step:1201/1480 train_time:188622ms step_avg:158.37ms step:1202/1480 train_time:188803ms step_avg:158.39ms step:1203/1480 train_time:188979ms step_avg:158.41ms step:1204/1480 train_time:189154ms step_avg:158.42ms step:1205/1480 train_time:189322ms step_avg:158.43ms step:1206/1480 train_time:189489ms step_avg:158.44ms step:1207/1480 train_time:189660ms step_avg:158.45ms step:1208/1480 train_time:189828ms step_avg:158.45ms step:1209/1480 train_time:190001ms step_avg:158.47ms step:1210/1480 train_time:190173ms step_avg:158.48ms step:1211/1480 train_time:190348ms step_avg:158.49ms step:1212/1480 train_time:190520ms step_avg:158.50ms step:1213/1480 train_time:190692ms step_avg:158.51ms step:1214/1480 train_time:190869ms step_avg:158.53ms step:1215/1480 
train_time:191041ms step_avg:158.54ms step:1216/1480 train_time:191210ms step_avg:158.55ms step:1217/1480 train_time:191384ms step_avg:158.56ms step:1218/1480 train_time:191554ms step_avg:158.57ms step:1219/1480 train_time:191733ms step_avg:158.59ms step:1220/1480 train_time:191903ms step_avg:158.60ms step:1221/1480 train_time:192071ms step_avg:158.61ms step:1222/1480 train_time:192239ms step_avg:158.61ms step:1223/1480 train_time:192407ms step_avg:158.62ms step:1224/1480 train_time:192586ms step_avg:158.64ms step:1225/1480 train_time:192758ms step_avg:158.65ms step:1226/1480 train_time:192931ms step_avg:158.66ms step:1227/1480 train_time:193104ms step_avg:158.67ms step:1228/1480 train_time:193272ms step_avg:158.68ms step:1229/1480 train_time:193445ms step_avg:158.69ms step:1230/1480 train_time:193625ms step_avg:158.71ms step:1231/1480 train_time:193801ms step_avg:158.72ms step:1232/1480 train_time:193977ms step_avg:158.74ms step:1233/1480 train_time:194146ms step_avg:158.75ms step:1234/1480 train_time:194314ms step_avg:158.75ms step:1235/1480 train_time:194489ms step_avg:158.77ms step:1236/1480 train_time:194658ms step_avg:158.78ms step:1237/1480 train_time:194829ms step_avg:158.78ms step:1238/1480 train_time:195013ms step_avg:158.81ms step:1239/1480 train_time:195186ms step_avg:158.82ms step:1240/1480 train_time:195356ms step_avg:158.83ms step:1241/1480 train_time:195528ms step_avg:158.84ms step:1242/1480 train_time:195696ms step_avg:158.84ms step:1243/1480 train_time:195869ms step_avg:158.86ms step:1244/1480 train_time:196037ms step_avg:158.86ms step:1245/1480 train_time:196205ms step_avg:158.87ms step:1246/1480 train_time:196375ms step_avg:158.88ms step:1247/1480 train_time:196544ms step_avg:158.89ms step:1248/1480 train_time:196712ms step_avg:158.89ms step:1249/1480 train_time:196880ms step_avg:158.90ms step:1250/1480 train_time:197050ms step_avg:158.91ms step:1250/1480 val_loss:3.3337 train_time:197123ms step_avg:158.97ms step:1251/1480 train_time:197232ms 
step_avg:158.93ms step:1252/1480 train_time:197402ms step_avg:158.94ms step:1253/1480 train_time:197571ms step_avg:158.95ms step:1254/1480 train_time:197740ms step_avg:158.96ms step:1255/1480 train_time:197927ms step_avg:158.98ms step:1256/1480 train_time:198101ms step_avg:158.99ms step:1257/1480 train_time:198273ms step_avg:159.00ms step:1258/1480 train_time:198449ms step_avg:159.01ms step:1259/1480 train_time:198621ms step_avg:159.02ms step:1260/1480 train_time:198789ms step_avg:159.03ms step:1261/1480 train_time:198961ms step_avg:159.04ms step:1262/1480 train_time:199136ms step_avg:159.05ms step:1263/1480 train_time:199310ms step_avg:159.07ms step:1264/1480 train_time:199478ms step_avg:159.07ms step:1265/1480 train_time:199646ms step_avg:159.08ms step:1266/1480 train_time:199817ms step_avg:159.09ms step:1267/1480 train_time:199986ms step_avg:159.10ms step:1268/1480 train_time:200157ms step_avg:159.11ms step:1269/1480 train_time:200332ms step_avg:159.12ms step:1270/1480 train_time:200501ms step_avg:159.13ms step:1271/1480 train_time:200672ms step_avg:159.14ms step:1272/1480 train_time:200839ms step_avg:159.14ms step:1273/1480 train_time:201010ms step_avg:159.15ms step:1274/1480 train_time:201184ms step_avg:159.16ms step:1275/1480 train_time:201351ms step_avg:159.17ms step:1276/1480 train_time:201516ms step_avg:159.18ms step:1277/1480 train_time:201690ms step_avg:159.19ms step:1278/1480 train_time:201857ms step_avg:159.19ms step:1279/1480 train_time:202029ms step_avg:159.20ms step:1280/1480 train_time:202208ms step_avg:159.22ms step:1281/1480 train_time:202377ms step_avg:159.23ms step:1282/1480 train_time:202543ms step_avg:159.23ms step:1283/1480 train_time:202713ms step_avg:159.24ms step:1284/1480 train_time:202885ms step_avg:159.25ms step:1285/1480 train_time:203054ms step_avg:159.26ms step:1286/1480 train_time:203225ms step_avg:159.27ms step:1287/1480 train_time:203397ms step_avg:159.28ms step:1288/1480 train_time:203569ms step_avg:159.29ms step:1289/1480 
train_time:203750ms step_avg:159.30ms step:1290/1480 train_time:203929ms step_avg:159.32ms step:1291/1480 train_time:204103ms step_avg:159.33ms step:1292/1480 train_time:204275ms step_avg:159.34ms step:1293/1480 train_time:204452ms step_avg:159.35ms step:1294/1480 train_time:204623ms step_avg:159.36ms step:1295/1480 train_time:204794ms step_avg:159.37ms step:1296/1480 train_time:204969ms step_avg:159.39ms step:1297/1480 train_time:205141ms step_avg:159.39ms step:1298/1480 train_time:205312ms step_avg:159.40ms step:1299/1480 train_time:205483ms step_avg:159.41ms step:1300/1480 train_time:205651ms step_avg:159.42ms step:1301/1480 train_time:205819ms step_avg:159.43ms step:1302/1480 train_time:205993ms step_avg:159.44ms step:1303/1480 train_time:206169ms step_avg:159.45ms step:1304/1480 train_time:206343ms step_avg:159.46ms step:1305/1480 train_time:206511ms step_avg:159.47ms step:1306/1480 train_time:206688ms step_avg:159.48ms step:1307/1480 train_time:206856ms step_avg:159.49ms step:1308/1480 train_time:207026ms step_avg:159.50ms step:1309/1480 train_time:207198ms step_avg:159.51ms step:1310/1480 train_time:207367ms step_avg:159.51ms step:1311/1480 train_time:207535ms step_avg:159.52ms step:1312/1480 train_time:207708ms step_avg:159.53ms step:1313/1480 train_time:207877ms step_avg:159.54ms step:1314/1480 train_time:208050ms step_avg:159.55ms step:1315/1480 train_time:208221ms step_avg:159.56ms step:1316/1480 train_time:208388ms step_avg:159.56ms step:1317/1480 train_time:208559ms step_avg:159.57ms step:1318/1480 train_time:208738ms step_avg:159.59ms step:1319/1480 train_time:208913ms step_avg:159.60ms step:1320/1480 train_time:209089ms step_avg:159.61ms step:1321/1480 train_time:209262ms step_avg:159.62ms step:1322/1480 train_time:209444ms step_avg:159.64ms step:1323/1480 train_time:209615ms step_avg:159.65ms step:1324/1480 train_time:209791ms step_avg:159.66ms step:1325/1480 train_time:209972ms step_avg:159.67ms step:1326/1480 train_time:210148ms step_avg:159.69ms 
step:1327/1480 train_time:210319ms step_avg:159.70ms step:1328/1480 train_time:210491ms step_avg:159.70ms step:1329/1480 train_time:210687ms step_avg:159.73ms step:1330/1480 train_time:210869ms step_avg:159.75ms step:1331/1480 train_time:211040ms step_avg:159.76ms step:1332/1480 train_time:211213ms step_avg:159.77ms step:1333/1480 train_time:211388ms step_avg:159.78ms step:1334/1480 train_time:211559ms step_avg:159.79ms step:1335/1480 train_time:211728ms step_avg:159.79ms step:1336/1480 train_time:211911ms step_avg:159.81ms step:1337/1480 train_time:212086ms step_avg:159.82ms step:1338/1480 train_time:212257ms step_avg:159.83ms step:1339/1480 train_time:212431ms step_avg:159.84ms step:1340/1480 train_time:212604ms step_avg:159.85ms step:1341/1480 train_time:212772ms step_avg:159.86ms step:1342/1480 train_time:212947ms step_avg:159.87ms step:1343/1480 train_time:213116ms step_avg:159.88ms step:1344/1480 train_time:213290ms step_avg:159.89ms step:1345/1480 train_time:213467ms step_avg:159.90ms step:1346/1480 train_time:213637ms step_avg:159.91ms step:1347/1480 train_time:213808ms step_avg:159.92ms step:1348/1480 train_time:213977ms step_avg:159.92ms step:1349/1480 train_time:214147ms step_avg:159.93ms step:1350/1480 train_time:214323ms step_avg:159.94ms step:1351/1480 train_time:214494ms step_avg:159.95ms step:1352/1480 train_time:214666ms step_avg:159.96ms step:1353/1480 train_time:214844ms step_avg:159.97ms step:1354/1480 train_time:215017ms step_avg:159.98ms step:1355/1480 train_time:215184ms step_avg:159.99ms step:1356/1480 train_time:215356ms step_avg:160.00ms step:1357/1480 train_time:215529ms step_avg:160.01ms step:1358/1480 train_time:215703ms step_avg:160.02ms step:1359/1480 train_time:215875ms step_avg:160.03ms step:1360/1480 train_time:216049ms step_avg:160.04ms step:1361/1480 train_time:216226ms step_avg:160.05ms step:1362/1480 train_time:216403ms step_avg:160.06ms step:1363/1480 train_time:216582ms step_avg:160.08ms step:1364/1480 train_time:216750ms 
step_avg:160.08ms step:1365/1480 train_time:216918ms step_avg:160.09ms step:1366/1480 train_time:217090ms step_avg:160.10ms step:1367/1480 train_time:217263ms step_avg:160.11ms step:1368/1480 train_time:217436ms step_avg:160.11ms step:1369/1480 train_time:217616ms step_avg:160.13ms step:1370/1480 train_time:217793ms step_avg:160.14ms step:1371/1480 train_time:217965ms step_avg:160.15ms step:1372/1480 train_time:218144ms step_avg:160.16ms step:1373/1480 train_time:218312ms step_avg:160.17ms step:1374/1480 train_time:218488ms step_avg:160.18ms step:1375/1480 train_time:218659ms step_avg:160.19ms step:1375/1480 val_loss:3.2947 train_time:218726ms step_avg:160.24ms step:1376/1480 train_time:218831ms step_avg:160.20ms step:1377/1480 train_time:219004ms step_avg:160.21ms step:1378/1480 train_time:219172ms step_avg:160.21ms step:1379/1480 train_time:219347ms step_avg:160.22ms step:1380/1480 train_time:219521ms step_avg:160.23ms step:1381/1480 train_time:219704ms step_avg:160.25ms step:1382/1480 train_time:219874ms step_avg:160.26ms step:1383/1480 train_time:220046ms step_avg:160.27ms step:1384/1480 train_time:220223ms step_avg:160.28ms step:1385/1480 train_time:220389ms step_avg:160.28ms step:1386/1480 train_time:220560ms step_avg:160.29ms step:1387/1480 train_time:220730ms step_avg:160.30ms step:1388/1480 train_time:220900ms step_avg:160.30ms step:1389/1480 train_time:221073ms step_avg:160.31ms step:1390/1480 train_time:221241ms step_avg:160.32ms step:1391/1480 train_time:221410ms step_avg:160.33ms step:1392/1480 train_time:221585ms step_avg:160.34ms step:1393/1480 train_time:221754ms step_avg:160.34ms step:1394/1480 train_time:221926ms step_avg:160.35ms step:1395/1480 train_time:222094ms step_avg:160.36ms step:1396/1480 train_time:222264ms step_avg:160.36ms step:1397/1480 train_time:222431ms step_avg:160.37ms step:1398/1480 train_time:222600ms step_avg:160.37ms step:1399/1480 train_time:222768ms step_avg:160.38ms step:1400/1480 train_time:222944ms step_avg:160.39ms 
step:1401/1480 train_time:223110ms step_avg:160.40ms step:1402/1480 train_time:223283ms step_avg:160.40ms step:1403/1480 train_time:223459ms step_avg:160.42ms step:1404/1480 train_time:223630ms step_avg:160.42ms step:1405/1480 train_time:223804ms step_avg:160.43ms step:1406/1480 train_time:223978ms step_avg:160.44ms step:1407/1480 train_time:224146ms step_avg:160.45ms step:1408/1480 train_time:224314ms step_avg:160.45ms step:1409/1480 train_time:224497ms step_avg:160.47ms step:1410/1480 train_time:224667ms step_avg:160.48ms step:1411/1480 train_time:224835ms step_avg:160.48ms step:1412/1480 train_time:225006ms step_avg:160.49ms step:1413/1480 train_time:225176ms step_avg:160.50ms step:1414/1480 train_time:225347ms step_avg:160.50ms step:1415/1480 train_time:225523ms step_avg:160.51ms step:1416/1480 train_time:225709ms step_avg:160.53ms step:1417/1480 train_time:225883ms step_avg:160.54ms step:1418/1480 train_time:226053ms step_avg:160.55ms step:1419/1480 train_time:226229ms step_avg:160.56ms step:1420/1480 train_time:226404ms step_avg:160.57ms step:1421/1480 train_time:226576ms step_avg:160.58ms step:1422/1480 train_time:226748ms step_avg:160.59ms step:1423/1480 train_time:226916ms step_avg:160.59ms step:1424/1480 train_time:227093ms step_avg:160.60ms step:1425/1480 train_time:227273ms step_avg:160.62ms step:1426/1480 train_time:227446ms step_avg:160.63ms step:1427/1480 train_time:227622ms step_avg:160.64ms step:1428/1480 train_time:227793ms step_avg:160.64ms step:1429/1480 train_time:227960ms step_avg:160.65ms step:1430/1480 train_time:228133ms step_avg:160.66ms step:1431/1480 train_time:228308ms step_avg:160.67ms step:1432/1480 train_time:228485ms step_avg:160.68ms step:1433/1480 train_time:228663ms step_avg:160.69ms step:1434/1480 train_time:228844ms step_avg:160.70ms step:1435/1480 train_time:229019ms step_avg:160.71ms step:1436/1480 train_time:229192ms step_avg:160.72ms step:1437/1480 train_time:229362ms step_avg:160.73ms step:1438/1480 train_time:229533ms 
step_avg:160.74ms step:1439/1480 train_time:229707ms step_avg:160.75ms step:1440/1480 train_time:229877ms step_avg:160.75ms step:1441/1480 train_time:230047ms step_avg:160.76ms step:1442/1480 train_time:230224ms step_avg:160.77ms step:1443/1480 train_time:230412ms step_avg:160.79ms step:1444/1480 train_time:230584ms step_avg:160.80ms step:1445/1480 train_time:230754ms step_avg:160.80ms step:1446/1480 train_time:230930ms step_avg:160.81ms step:1447/1480 train_time:231109ms step_avg:160.83ms step:1448/1480 train_time:231281ms step_avg:160.84ms step:1449/1480 train_time:231456ms step_avg:160.84ms step:1450/1480 train_time:231628ms step_avg:160.85ms step:1451/1480 train_time:231800ms step_avg:160.86ms step:1452/1480 train_time:231972ms step_avg:160.87ms step:1453/1480 train_time:232142ms step_avg:160.87ms step:1454/1480 train_time:232314ms step_avg:160.88ms step:1455/1480 train_time:232493ms step_avg:160.89ms step:1456/1480 train_time:232667ms step_avg:160.90ms step:1457/1480 train_time:232837ms step_avg:160.91ms step:1458/1480 train_time:233008ms step_avg:160.92ms step:1459/1480 train_time:233185ms step_avg:160.93ms step:1460/1480 train_time:233357ms step_avg:160.94ms step:1461/1480 train_time:233530ms step_avg:160.94ms step:1462/1480 train_time:233703ms step_avg:160.95ms step:1463/1480 train_time:233878ms step_avg:160.96ms step:1464/1480 train_time:234052ms step_avg:160.97ms step:1465/1480 train_time:234225ms step_avg:160.98ms step:1466/1480 train_time:234395ms step_avg:160.99ms step:1467/1480 train_time:234570ms step_avg:161.00ms step:1468/1480 train_time:234741ms step_avg:161.00ms step:1469/1480 train_time:234915ms step_avg:161.01ms step:1470/1480 train_time:235095ms step_avg:161.02ms step:1471/1480 train_time:235282ms step_avg:161.04ms step:1472/1480 train_time:235462ms step_avg:161.05ms step:1473/1480 train_time:235634ms step_avg:161.06ms step:1474/1480 train_time:235812ms step_avg:161.07ms step:1475/1480 train_time:235992ms step_avg:161.09ms step:1476/1480 
train_time:236165ms step_avg:161.10ms step:1477/1480 train_time:236347ms step_avg:161.11ms step:1478/1480 train_time:236529ms step_avg:161.12ms step:1479/1480 train_time:236704ms step_avg:161.13ms step:1480/1480 train_time:236876ms step_avg:161.14ms step:1480/1480 val_loss:3.2755 train_time:236947ms step_avg:161.19ms