import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G via its SVD: with U, S, V = G.svd(), return U @ V.T.

    `steps` is accepted but never read; it lets this function be called with the
    same keyword as the iterative backend in `zeropower_backends`.
    """
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor (asserted).
        steps: Number of iterations to run.
        eps: Added to the norm of G before dividing, so an all-zero G does not divide by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # work on the wide orientation so that X @ X.T is the smaller of the two Gram matrices
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose applied above
    if G.size(0) > G.size(1):
        X = X.T
    return X

# maps the `backend` option of Muon to the function that performs the orthogonalization
zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.

    Distributed behaviour: the constructor reads the WORLD_SIZE and RANK environment variables,
    and `step` exchanges updates between ranks with `dist.all_gather`.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5):
        # world size and rank are taken from the environment rather than from torch.distributed
        self.num_process = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ["RANK"])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        params: "list[torch.Tensor]" = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct element count, so that every parameter in a group
        # fits the group's fixed-size all_gather buffers
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                "params": [p for p in params if p.numel() == size],
                # one flat bfloat16 receive buffer per rank, allocated on "cuda"
                "update_buffer": [
                    torch.empty(size, device="cuda", dtype=torch.bfloat16)
                    for _ in range(self.num_process)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are processed in chunks of `num_process`. This rank
        computes the orthogonalized update for the chunk entry at index `rank`, then all ranks
        exchange their results with an asynchronous all_gather. The gathered updates of a chunk
        are applied only when the next chunk's update has been computed (or after the loop for
        the last chunk), so the communication is in flight while the next update is computed.

        Requires the number of parameters in each group to be a multiple of `num_process`
        and every processed parameter to have a gradient (both asserted).
        """
        for group in self.param_groups:
            lr: float = group["lr"]
            momentum: float = group["momentum"]
            nesterov: bool = group["nesterov"]
            zeropower_backend = zeropower_backends[group["backend"]]
            backend_steps: int = group["backend_steps"]
            update_buffers: "list[torch.Tensor]" = group["update_buffer"]
            # generate weight updates in distributed fashion
            params: "list[torch.Tensor]" = group["params"]
            assert len(params) % self.num_process == 0
            # state of the all_gather that is currently in flight (None before the first one)
            handle = None
            params_world = None
            def update_prev():
                # apply the updates gathered for the previously launched chunk, if any;
                # `handle` and `params_world` are read from the enclosing scope at call time
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    # step size is scaled by sqrt(max(1, rows / cols)) of the parameter
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.num_process]:
                # the parameter this rank is responsible for in the current chunk
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if "momentum_buffer" not in state:
                    state["momentum_buffer"] = torch.zeros_like(g)
                buf: torch.Tensor = state["momentum_buffer"]
                buf.lerp_(g, 1 - momentum)
                # note: with nesterov the gradient tensor itself is modified in place
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_backend(g, steps=backend_steps).flatten()
                # the buffers are about to be overwritten: consume the previous chunk first
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.num_process]
            # apply the last chunk of this group
            update_prev()

#
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no weight is passed to rms_norm)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free nn.Linear whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied along dimension 1 of a 4D input.

    `inv_freq` holds (1 / base) ** (i / dim) for i = 0, 2, 4, ... < dim. The cos/sin tables
    are cached and rebuilt only when the sequence length (x.shape[1]) changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (dim0, seq, dim2, features): the tables are broadcast over dims 0 and 2
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # the last dimension is split into two halves which are rotated against each other
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention using FlexAttention, with QK normalization, rotary
    embeddings and a learned mix of the values with an external value embedding `vi`.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Arguments:
            x: Input whose first two sizes are (batch, sequence); batch must be 1 (asserted).
            vi: Value embedding; it is viewed with the shape of the per-head values.
            block_mask: The FlexAttention mask passed through to `flex_attention`.

        Returns a tensor with the same shape as x.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head dimension moved in front of the sequence dimension
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: a learned mix of the running activation with `x0`, followed by
    attention and MLP sub-layers, each applied to the normalized input and added residually.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to [1, 0], i.e. the block starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) with tanh in GPT.forward

class GPT(nn.Module):
    """
    GPT model with a U-net layout: the first `n_layer // 2` blocks ("encoder") store their
    outputs, and each of the remaining blocks ("decoder") adds one stored output back in,
    scaled by a learned weight, in reverse order. `forward` returns the training loss.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            idx: 1D tensor of token ids (asserted). Its length must be divisible by 128
                because of the block reshape below.
            target: Target token ids, flattened before the cross entropy.
            sliding_window: Attention window size in tokens; compared against index
                differences, so a scalar tensor works.

        Returns the cross-entropy loss of the soft-capped logits against `target`.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of tokens equal to 50256: acts as a per-token document id
        # (presumably 50256 is the end-of-text delimiter of the tokenizer - confirm)
        docs = (idx == 50256).cumsum(0)
        # document id of the first / last token of every block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: a query sees a key that is not in the future, has the
            # same document id, and is fewer than `sliding_window` positions back
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the rule above, evaluated on block indices
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks may share a document when their document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window size converted to blocks, rounded up
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            # per query block: how many kv blocks are kept ...
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # ... and their indices, moved to the front by a stable descending sort of the 0/1 mask
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton dimensions
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value embedding per encoder layer, split along the feature dimension
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() returns the most recent encoder output first, so layers are paired in reverse
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it declares."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the header of `file` into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header of 256 int32 values
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over token shards matching a glob pattern (relative to the current working
    directory). Each call to `next_batch` gives this process T input tokens and the T
    targets shifted by one; the read position then moves forward by T * num_processes.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full batch across all processes (+1 for the target shift)
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # restart from the first shard: advance() increments -1 to 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # wraps around to the first shard after the last one
        self.current_shard = (self.current_shard + 1) % len(self.files)
        # each process starts at its own offset of rank * T tokens
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on "cuda" as int32 / int64 tensors of length T."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # The text log lives directly under 'logs/'. Create that directory if needed,
    # otherwise the open() below raises FileNotFoundError when it does not exist yet.
    # The per-run `logdir` is still not created, as before.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Log `s` from the master process only: append it to the log file and, unless
    `logonly` is set, also print it to the console. Other processes do nothing.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not supported: the batch size must equal the world size
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the training loop fetches the following ones
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# after the bfloat16 cast, the CastedLinear modules are converted back to float32
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# Adam for the two embedding tables, for the lm_head and for the <2D parameters of the blocks
# (plus the skip weights); Muon for the 2D parameters of the blocks
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    # returns the multiplier applied to each optimizer's base lr at iteration `it`
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        # NOTE(review): with cooldown_iters == 0 this branch is reached for
        # it == num_iterations and divides by zero - confirm cooldown_iters > 0 is required
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size is kept in a CUDA tensor that is updated in place during training
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # linear ramp from 64 to 1792 over the run, rounded down to a multiple of 64
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average over processes, then over the validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # (the dict is built, but the torch.save call below is commented out, so nothing is written)
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # linear from 0.85 to 0.95 over the first 300 steps
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # approximate because the clock is read without a torch.cuda.synchronize() here
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
====================================================================================================
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
nvidia-smi:
Sun Dec 8 08:38:36 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M.
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 119W / 700W | 47MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 123W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 88W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 112W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI 
CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23504ms step_avg:nanms step:2/1480 train_time:23631ms step_avg:nanms step:3/1480 train_time:23770ms step_avg:nanms step:4/1480 train_time:23912ms step_avg:nanms step:5/1480 train_time:24053ms step_avg:nanms step:6/1480 train_time:24194ms step_avg:nanms step:7/1480 train_time:24336ms step_avg:nanms step:8/1480 train_time:24477ms step_avg:nanms step:9/1480 train_time:24619ms step_avg:nanms step:10/1480 train_time:24763ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:282ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.54ms step:14/1480 train_time:568ms step_avg:142.00ms step:15/1480 train_time:710ms step_avg:142.00ms step:16/1480 train_time:853ms step_avg:142.11ms step:17/1480 train_time:995ms step_avg:142.16ms step:18/1480 train_time:1139ms step_avg:142.34ms step:19/1480 train_time:1282ms step_avg:142.47ms step:20/1480 train_time:1425ms step_avg:142.54ms step:21/1480 train_time:1571ms step_avg:142.85ms step:22/1480 train_time:1712ms step_avg:142.64ms step:23/1480 train_time:1855ms step_avg:142.69ms step:24/1480 train_time:1996ms step_avg:142.60ms step:25/1480 train_time:2140ms step_avg:142.66ms step:26/1480 train_time:2284ms step_avg:142.77ms step:27/1480 train_time:2429ms step_avg:142.88ms step:28/1480 train_time:2573ms step_avg:142.92ms step:29/1480 train_time:2715ms 
step_avg:142.90ms step:30/1480 train_time:2858ms step_avg:142.88ms step:31/1480 train_time:2999ms step_avg:142.80ms step:32/1480 train_time:3141ms step_avg:142.77ms step:33/1480 train_time:3284ms step_avg:142.79ms step:34/1480 train_time:3429ms step_avg:142.87ms step:35/1480 train_time:3572ms step_avg:142.90ms step:36/1480 train_time:3714ms step_avg:142.86ms step:37/1480 train_time:3856ms step_avg:142.82ms step:38/1480 train_time:3997ms step_avg:142.76ms step:39/1480 train_time:4141ms step_avg:142.79ms step:40/1480 train_time:4284ms step_avg:142.81ms step:41/1480 train_time:4428ms step_avg:142.84ms step:42/1480 train_time:4572ms step_avg:142.86ms step:43/1480 train_time:4714ms step_avg:142.86ms step:44/1480 train_time:4856ms step_avg:142.83ms step:45/1480 train_time:4997ms step_avg:142.77ms step:46/1480 train_time:5139ms step_avg:142.75ms step:47/1480 train_time:5284ms step_avg:142.82ms step:48/1480 train_time:5429ms step_avg:142.88ms step:49/1480 train_time:5572ms step_avg:142.88ms step:50/1480 train_time:5714ms step_avg:142.86ms step:51/1480 train_time:5856ms step_avg:142.83ms step:52/1480 train_time:5997ms step_avg:142.78ms step:53/1480 train_time:6139ms step_avg:142.76ms step:54/1480 train_time:6282ms step_avg:142.78ms step:55/1480 train_time:6427ms step_avg:142.82ms step:56/1480 train_time:6571ms step_avg:142.84ms step:57/1480 train_time:6713ms step_avg:142.83ms step:58/1480 train_time:6854ms step_avg:142.80ms step:59/1480 train_time:6996ms step_avg:142.77ms step:60/1480 train_time:7138ms step_avg:142.76ms step:61/1480 train_time:7280ms step_avg:142.75ms step:62/1480 train_time:7424ms step_avg:142.77ms step:63/1480 train_time:7568ms step_avg:142.80ms step:64/1480 train_time:7711ms step_avg:142.80ms step:65/1480 train_time:7854ms step_avg:142.80ms step:66/1480 train_time:7995ms step_avg:142.77ms step:67/1480 train_time:8138ms step_avg:142.78ms step:68/1480 train_time:8283ms step_avg:142.81ms step:69/1480 train_time:8427ms step_avg:142.83ms step:70/1480 
train_time:8570ms step_avg:142.84ms step:71/1480 train_time:8713ms step_avg:142.83ms step:72/1480 train_time:8855ms step_avg:142.82ms step:73/1480 train_time:8996ms step_avg:142.79ms step:74/1480 train_time:9137ms step_avg:142.77ms step:75/1480 train_time:9281ms step_avg:142.78ms step:76/1480 train_time:9428ms step_avg:142.85ms step:77/1480 train_time:9572ms step_avg:142.87ms step:78/1480 train_time:9714ms step_avg:142.85ms step:79/1480 train_time:9855ms step_avg:142.83ms step:80/1480 train_time:9996ms step_avg:142.80ms step:81/1480 train_time:10139ms step_avg:142.80ms step:82/1480 train_time:10284ms step_avg:142.83ms step:83/1480 train_time:10429ms step_avg:142.86ms step:84/1480 train_time:10573ms step_avg:142.88ms step:85/1480 train_time:10716ms step_avg:142.87ms step:86/1480 train_time:10857ms step_avg:142.85ms step:87/1480 train_time:10997ms step_avg:142.82ms step:88/1480 train_time:11138ms step_avg:142.79ms step:89/1480 train_time:11282ms step_avg:142.81ms step:90/1480 train_time:11427ms step_avg:142.84ms step:91/1480 train_time:11572ms step_avg:142.86ms step:92/1480 train_time:11714ms step_avg:142.86ms step:93/1480 train_time:11855ms step_avg:142.84ms step:94/1480 train_time:11996ms step_avg:142.81ms step:95/1480 train_time:12138ms step_avg:142.80ms step:96/1480 train_time:12280ms step_avg:142.79ms step:97/1480 train_time:12425ms step_avg:142.81ms step:98/1480 train_time:12571ms step_avg:142.85ms step:99/1480 train_time:12712ms step_avg:142.83ms step:100/1480 train_time:12854ms step_avg:142.82ms step:101/1480 train_time:12994ms step_avg:142.79ms step:102/1480 train_time:13136ms step_avg:142.79ms step:103/1480 train_time:13279ms step_avg:142.78ms step:104/1480 train_time:13423ms step_avg:142.80ms step:105/1480 train_time:13568ms step_avg:142.82ms step:106/1480 train_time:13712ms step_avg:142.83ms step:107/1480 train_time:13854ms step_avg:142.82ms step:108/1480 train_time:13995ms step_avg:142.81ms step:109/1480 train_time:14137ms step_avg:142.79ms step:110/1480 
train_time:14280ms step_avg:142.80ms step:111/1480 train_time:14426ms step_avg:142.83ms step:112/1480 train_time:14575ms step_avg:142.89ms step:113/1480 train_time:14720ms step_avg:142.92ms step:114/1480 train_time:14868ms step_avg:142.96ms step:115/1480 train_time:15014ms step_avg:142.99ms step:116/1480 train_time:15159ms step_avg:143.01ms step:117/1480 train_time:15308ms step_avg:143.07ms step:118/1480 train_time:15456ms step_avg:143.11ms step:119/1480 train_time:15603ms step_avg:143.15ms step:120/1480 train_time:15751ms step_avg:143.19ms step:121/1480 train_time:15897ms step_avg:143.22ms step:122/1480 train_time:16044ms step_avg:143.25ms step:123/1480 train_time:16191ms step_avg:143.29ms step:124/1480 train_time:16337ms step_avg:143.31ms step:125/1480 train_time:16484ms step_avg:143.34ms step:125/1480 val_loss:4.4079 train_time:16542ms step_avg:143.84ms step:126/1480 train_time:16639ms step_avg:143.44ms step:127/1480 train_time:16786ms step_avg:143.47ms step:128/1480 train_time:16932ms step_avg:143.49ms step:129/1480 train_time:17079ms step_avg:143.52ms step:130/1480 train_time:17225ms step_avg:143.54ms step:131/1480 train_time:17370ms step_avg:143.56ms step:132/1480 train_time:17518ms step_avg:143.59ms step:133/1480 train_time:17666ms step_avg:143.63ms step:134/1480 train_time:17815ms step_avg:143.67ms step:135/1480 train_time:17961ms step_avg:143.69ms step:136/1480 train_time:18106ms step_avg:143.70ms step:137/1480 train_time:18252ms step_avg:143.71ms step:138/1480 train_time:18400ms step_avg:143.75ms step:139/1480 train_time:18546ms step_avg:143.77ms step:140/1480 train_time:18694ms step_avg:143.80ms step:141/1480 train_time:18843ms step_avg:143.84ms step:142/1480 train_time:18988ms step_avg:143.85ms step:143/1480 train_time:19134ms step_avg:143.87ms step:144/1480 train_time:19282ms step_avg:143.90ms step:145/1480 train_time:19427ms step_avg:143.90ms step:146/1480 train_time:19575ms step_avg:143.93ms step:147/1480 train_time:19722ms step_avg:143.96ms 
step:148/1480 train_time:19868ms step_avg:143.97ms step:149/1480 train_time:20016ms step_avg:144.00ms step:150/1480 train_time:20163ms step_avg:144.02ms step:151/1480 train_time:20309ms step_avg:144.03ms step:152/1480 train_time:20454ms step_avg:144.05ms step:153/1480 train_time:20602ms step_avg:144.07ms step:154/1480 train_time:20748ms step_avg:144.08ms step:155/1480 train_time:20895ms step_avg:144.10ms step:156/1480 train_time:21042ms step_avg:144.12ms step:157/1480 train_time:21188ms step_avg:144.14ms step:158/1480 train_time:21336ms step_avg:144.16ms step:159/1480 train_time:21483ms step_avg:144.18ms step:160/1480 train_time:21629ms step_avg:144.19ms step:161/1480 train_time:21778ms step_avg:144.22ms step:162/1480 train_time:21924ms step_avg:144.24ms step:163/1480 train_time:22071ms step_avg:144.26ms step:164/1480 train_time:22219ms step_avg:144.28ms step:165/1480 train_time:22366ms step_avg:144.30ms step:166/1480 train_time:22514ms step_avg:144.32ms step:167/1480 train_time:22659ms step_avg:144.33ms step:168/1480 train_time:22806ms step_avg:144.34ms step:169/1480 train_time:22954ms step_avg:144.37ms step:170/1480 train_time:23102ms step_avg:144.39ms step:171/1480 train_time:23248ms step_avg:144.40ms step:172/1480 train_time:23395ms step_avg:144.41ms step:173/1480 train_time:23542ms step_avg:144.43ms step:174/1480 train_time:23688ms step_avg:144.44ms step:175/1480 train_time:23834ms step_avg:144.45ms step:176/1480 train_time:23983ms step_avg:144.48ms step:177/1480 train_time:24128ms step_avg:144.48ms step:178/1480 train_time:24276ms step_avg:144.50ms step:179/1480 train_time:24423ms step_avg:144.52ms step:180/1480 train_time:24568ms step_avg:144.52ms step:181/1480 train_time:24716ms step_avg:144.54ms step:182/1480 train_time:24862ms step_avg:144.55ms step:183/1480 train_time:25009ms step_avg:144.56ms step:184/1480 train_time:25155ms step_avg:144.57ms step:185/1480 train_time:25304ms step_avg:144.59ms step:186/1480 train_time:25449ms step_avg:144.60ms 
step:187/1480 train_time:25596ms step_avg:144.61ms step:188/1480 train_time:25743ms step_avg:144.63ms step:189/1480 train_time:25888ms step_avg:144.63ms step:190/1480 train_time:26035ms step_avg:144.64ms step:191/1480 train_time:26182ms step_avg:144.65ms step:192/1480 train_time:26328ms step_avg:144.66ms step:193/1480 train_time:26474ms step_avg:144.67ms step:194/1480 train_time:26622ms step_avg:144.69ms step:195/1480 train_time:26768ms step_avg:144.69ms step:196/1480 train_time:26916ms step_avg:144.71ms step:197/1480 train_time:27062ms step_avg:144.72ms step:198/1480 train_time:27208ms step_avg:144.72ms step:199/1480 train_time:27358ms step_avg:144.75ms step:200/1480 train_time:27509ms step_avg:144.78ms step:201/1480 train_time:27652ms step_avg:144.77ms step:202/1480 train_time:27799ms step_avg:144.79ms step:203/1480 train_time:27945ms step_avg:144.79ms step:204/1480 train_time:28092ms step_avg:144.80ms step:205/1480 train_time:28239ms step_avg:144.82ms step:206/1480 train_time:28386ms step_avg:144.83ms step:207/1480 train_time:28532ms step_avg:144.83ms step:208/1480 train_time:28679ms step_avg:144.84ms step:209/1480 train_time:28825ms step_avg:144.85ms step:210/1480 train_time:28973ms step_avg:144.87ms step:211/1480 train_time:29120ms step_avg:144.88ms step:212/1480 train_time:29266ms step_avg:144.88ms step:213/1480 train_time:29413ms step_avg:144.89ms step:214/1480 train_time:29561ms step_avg:144.91ms step:215/1480 train_time:29707ms step_avg:144.91ms step:216/1480 train_time:29854ms step_avg:144.92ms step:217/1480 train_time:30001ms step_avg:144.93ms step:218/1480 train_time:30146ms step_avg:144.93ms step:219/1480 train_time:30294ms step_avg:144.95ms step:220/1480 train_time:30442ms step_avg:144.96ms step:221/1480 train_time:30590ms step_avg:144.98ms step:222/1480 train_time:30742ms step_avg:145.01ms step:223/1480 train_time:30893ms step_avg:145.04ms step:224/1480 train_time:31043ms step_avg:145.06ms step:225/1480 train_time:31192ms step_avg:145.08ms 
step:226/1480 train_time:31343ms step_avg:145.11ms step:227/1480 train_time:31492ms step_avg:145.13ms step:228/1480 train_time:31643ms step_avg:145.15ms step:229/1480 train_time:31793ms step_avg:145.18ms step:230/1480 train_time:31945ms step_avg:145.20ms step:231/1480 train_time:32095ms step_avg:145.23ms step:232/1480 train_time:32246ms step_avg:145.25ms step:233/1480 train_time:32396ms step_avg:145.27ms step:234/1480 train_time:32546ms step_avg:145.30ms step:235/1480 train_time:32700ms step_avg:145.33ms step:236/1480 train_time:32849ms step_avg:145.35ms step:237/1480 train_time:33000ms step_avg:145.37ms step:238/1480 train_time:33149ms step_avg:145.39ms step:239/1480 train_time:33302ms step_avg:145.42ms step:240/1480 train_time:33451ms step_avg:145.44ms step:241/1480 train_time:33602ms step_avg:145.46ms step:242/1480 train_time:33751ms step_avg:145.48ms step:243/1480 train_time:33901ms step_avg:145.50ms step:244/1480 train_time:34050ms step_avg:145.51ms step:245/1480 train_time:34201ms step_avg:145.53ms step:246/1480 train_time:34351ms step_avg:145.55ms step:247/1480 train_time:34502ms step_avg:145.58ms step:248/1480 train_time:34652ms step_avg:145.60ms step:249/1480 train_time:34804ms step_avg:145.62ms step:250/1480 train_time:34954ms step_avg:145.64ms step:250/1480 val_loss:3.9993 train_time:35012ms step_avg:145.89ms step:251/1480 train_time:35110ms step_avg:145.68ms step:252/1480 train_time:35262ms step_avg:145.71ms step:253/1480 train_time:35414ms step_avg:145.74ms step:254/1480 train_time:35562ms step_avg:145.74ms step:255/1480 train_time:35711ms step_avg:145.76ms step:256/1480 train_time:35860ms step_avg:145.77ms step:257/1480 train_time:36010ms step_avg:145.79ms step:258/1480 train_time:36162ms step_avg:145.81ms step:259/1480 train_time:36314ms step_avg:145.84ms step:260/1480 train_time:36463ms step_avg:145.85ms step:261/1480 train_time:36614ms step_avg:145.87ms step:262/1480 train_time:36763ms step_avg:145.89ms step:263/1480 train_time:36914ms 
step_avg:145.90ms step:264/1480 train_time:37063ms step_avg:145.92ms step:265/1480 train_time:37215ms step_avg:145.94ms step:266/1480 train_time:37365ms step_avg:145.96ms step:267/1480 train_time:37516ms step_avg:145.98ms step:268/1480 train_time:37666ms step_avg:145.99ms step:269/1480 train_time:37817ms step_avg:146.01ms step:270/1480 train_time:37965ms step_avg:146.02ms step:271/1480 train_time:38115ms step_avg:146.03ms step:272/1480 train_time:38265ms step_avg:146.05ms step:273/1480 train_time:38416ms step_avg:146.07ms step:274/1480 train_time:38566ms step_avg:146.08ms step:275/1480 train_time:38720ms step_avg:146.11ms step:276/1480 train_time:38868ms step_avg:146.12ms step:277/1480 train_time:39018ms step_avg:146.14ms step:278/1480 train_time:39169ms step_avg:146.15ms step:279/1480 train_time:39319ms step_avg:146.17ms step:280/1480 train_time:39471ms step_avg:146.19ms step:281/1480 train_time:39621ms step_avg:146.20ms step:282/1480 train_time:39772ms step_avg:146.22ms step:283/1480 train_time:39922ms step_avg:146.24ms step:284/1480 train_time:40074ms step_avg:146.25ms step:285/1480 train_time:40223ms step_avg:146.27ms step:286/1480 train_time:40374ms step_avg:146.28ms step:287/1480 train_time:40525ms step_avg:146.30ms step:288/1480 train_time:40675ms step_avg:146.31ms step:289/1480 train_time:40826ms step_avg:146.33ms step:290/1480 train_time:40976ms step_avg:146.34ms step:291/1480 train_time:41126ms step_avg:146.36ms step:292/1480 train_time:41276ms step_avg:146.37ms step:293/1480 train_time:41426ms step_avg:146.38ms step:294/1480 train_time:41576ms step_avg:146.40ms step:295/1480 train_time:41726ms step_avg:146.41ms step:296/1480 train_time:41876ms step_avg:146.42ms step:297/1480 train_time:42028ms step_avg:146.44ms step:298/1480 train_time:42178ms step_avg:146.45ms step:299/1480 train_time:42330ms step_avg:146.47ms step:300/1480 train_time:42481ms step_avg:146.48ms step:301/1480 train_time:42631ms step_avg:146.50ms step:302/1480 train_time:42781ms 
step_avg:146.51ms step:303/1480 train_time:42934ms step_avg:146.53ms step:304/1480 train_time:43084ms step_avg:146.54ms step:305/1480 train_time:43235ms step_avg:146.56ms step:306/1480 train_time:43384ms step_avg:146.57ms step:307/1480 train_time:43535ms step_avg:146.58ms step:308/1480 train_time:43684ms step_avg:146.59ms step:309/1480 train_time:43835ms step_avg:146.61ms step:310/1480 train_time:43986ms step_avg:146.62ms step:311/1480 train_time:44137ms step_avg:146.64ms step:312/1480 train_time:44286ms step_avg:146.64ms step:313/1480 train_time:44438ms step_avg:146.66ms step:314/1480 train_time:44589ms step_avg:146.67ms step:315/1480 train_time:44740ms step_avg:146.69ms step:316/1480 train_time:44890ms step_avg:146.70ms step:317/1480 train_time:45041ms step_avg:146.71ms step:318/1480 train_time:45193ms step_avg:146.73ms step:319/1480 train_time:45343ms step_avg:146.74ms step:320/1480 train_time:45494ms step_avg:146.76ms step:321/1480 train_time:45644ms step_avg:146.77ms step:322/1480 train_time:45794ms step_avg:146.78ms step:323/1480 train_time:45943ms step_avg:146.78ms step:324/1480 train_time:46095ms step_avg:146.80ms step:325/1480 train_time:46244ms step_avg:146.81ms step:326/1480 train_time:46396ms step_avg:146.82ms step:327/1480 train_time:46547ms step_avg:146.84ms step:328/1480 train_time:46699ms step_avg:146.85ms step:329/1480 train_time:46848ms step_avg:146.86ms step:330/1480 train_time:47000ms step_avg:146.87ms step:331/1480 train_time:47154ms step_avg:146.90ms step:332/1480 train_time:47309ms step_avg:146.92ms step:333/1480 train_time:47463ms step_avg:146.94ms step:334/1480 train_time:47618ms step_avg:146.97ms step:335/1480 train_time:47771ms step_avg:146.99ms step:336/1480 train_time:47925ms step_avg:147.01ms step:337/1480 train_time:48079ms step_avg:147.03ms step:338/1480 train_time:48234ms step_avg:147.05ms step:339/1480 train_time:48387ms step_avg:147.07ms step:340/1480 train_time:48541ms step_avg:147.09ms step:341/1480 train_time:48695ms 
step_avg:147.11ms step:342/1480 train_time:48848ms step_avg:147.13ms step:343/1480 train_time:49003ms step_avg:147.16ms step:344/1480 train_time:49158ms step_avg:147.18ms step:345/1480 train_time:49312ms step_avg:147.20ms step:346/1480 train_time:49466ms step_avg:147.22ms step:347/1480 train_time:49620ms step_avg:147.24ms step:348/1480 train_time:49773ms step_avg:147.26ms step:349/1480 train_time:49927ms step_avg:147.28ms step:350/1480 train_time:50080ms step_avg:147.29ms step:351/1480 train_time:50234ms step_avg:147.31ms step:352/1480 train_time:50389ms step_avg:147.34ms step:353/1480 train_time:50542ms step_avg:147.35ms step:354/1480 train_time:50696ms step_avg:147.37ms step:355/1480 train_time:50849ms step_avg:147.39ms step:356/1480 train_time:51003ms step_avg:147.41ms step:357/1480 train_time:51157ms step_avg:147.43ms step:358/1480 train_time:51311ms step_avg:147.45ms step:359/1480 train_time:51465ms step_avg:147.46ms step:360/1480 train_time:51620ms step_avg:147.49ms step:361/1480 train_time:51774ms step_avg:147.50ms step:362/1480 train_time:51929ms step_avg:147.52ms step:363/1480 train_time:52081ms step_avg:147.54ms step:364/1480 train_time:52236ms step_avg:147.56ms step:365/1480 train_time:52391ms step_avg:147.58ms step:366/1480 train_time:52544ms step_avg:147.60ms step:367/1480 train_time:52697ms step_avg:147.61ms step:368/1480 train_time:52849ms step_avg:147.62ms step:369/1480 train_time:53003ms step_avg:147.64ms step:370/1480 train_time:53156ms step_avg:147.66ms step:371/1480 train_time:53311ms step_avg:147.68ms step:372/1480 train_time:53464ms step_avg:147.69ms step:373/1480 train_time:53618ms step_avg:147.71ms step:374/1480 train_time:53771ms step_avg:147.72ms step:375/1480 train_time:53925ms step_avg:147.74ms step:375/1480 val_loss:3.8105 train_time:53985ms step_avg:147.91ms step:376/1480 train_time:54082ms step_avg:147.76ms step:377/1480 train_time:54238ms step_avg:147.79ms step:378/1480 train_time:54392ms step_avg:147.80ms step:379/1480 
train_time:54544ms step_avg:147.82ms step:380/1480 train_time:54697ms step_avg:147.83ms step:381/1480 train_time:54849ms step_avg:147.84ms step:382/1480 train_time:55003ms step_avg:147.86ms step:383/1480 train_time:55158ms step_avg:147.88ms step:384/1480 train_time:55313ms step_avg:147.89ms step:385/1480 train_time:55467ms step_avg:147.91ms step:386/1480 train_time:55621ms step_avg:147.93ms step:387/1480 train_time:55774ms step_avg:147.94ms step:388/1480 train_time:55929ms step_avg:147.96ms step:389/1480 train_time:56081ms step_avg:147.97ms step:390/1480 train_time:56237ms step_avg:147.99ms step:391/1480 train_time:56391ms step_avg:148.01ms step:392/1480 train_time:56545ms step_avg:148.02ms step:393/1480 train_time:56699ms step_avg:148.04ms step:394/1480 train_time:56852ms step_avg:148.05ms step:395/1480 train_time:57006ms step_avg:148.07ms step:396/1480 train_time:57159ms step_avg:148.08ms step:397/1480 train_time:57313ms step_avg:148.10ms step:398/1480 train_time:57468ms step_avg:148.11ms step:399/1480 train_time:57621ms step_avg:148.13ms step:400/1480 train_time:57775ms step_avg:148.14ms step:401/1480 train_time:57929ms step_avg:148.16ms step:402/1480 train_time:58083ms step_avg:148.17ms step:403/1480 train_time:58237ms step_avg:148.19ms step:404/1480 train_time:58391ms step_avg:148.20ms step:405/1480 train_time:58544ms step_avg:148.21ms step:406/1480 train_time:58698ms step_avg:148.23ms step:407/1480 train_time:58852ms step_avg:148.24ms step:408/1480 train_time:59005ms step_avg:148.25ms step:409/1480 train_time:59158ms step_avg:148.27ms step:410/1480 train_time:59312ms step_avg:148.28ms step:411/1480 train_time:59467ms step_avg:148.30ms step:412/1480 train_time:59620ms step_avg:148.31ms step:413/1480 train_time:59774ms step_avg:148.32ms step:414/1480 train_time:59929ms step_avg:148.34ms step:415/1480 train_time:60084ms step_avg:148.35ms step:416/1480 train_time:60237ms step_avg:148.37ms step:417/1480 train_time:60392ms step_avg:148.38ms step:418/1480 
train_time:60547ms step_avg:148.40ms step:419/1480 train_time:60700ms step_avg:148.41ms step:420/1480 train_time:60853ms step_avg:148.42ms step:421/1480 train_time:61008ms step_avg:148.44ms step:422/1480 train_time:61161ms step_avg:148.45ms step:423/1480 train_time:61315ms step_avg:148.46ms step:424/1480 train_time:61470ms step_avg:148.48ms step:425/1480 train_time:61623ms step_avg:148.49ms step:426/1480 train_time:61777ms step_avg:148.50ms step:427/1480 train_time:61931ms step_avg:148.51ms step:428/1480 train_time:62083ms step_avg:148.52ms step:429/1480 train_time:62237ms step_avg:148.54ms step:430/1480 train_time:62390ms step_avg:148.55ms step:431/1480 train_time:62544ms step_avg:148.56ms step:432/1480 train_time:62697ms step_avg:148.57ms step:433/1480 train_time:62849ms step_avg:148.58ms step:434/1480 train_time:63003ms step_avg:148.59ms step:435/1480 train_time:63158ms step_avg:148.61ms step:436/1480 train_time:63311ms step_avg:148.62ms step:437/1480 train_time:63464ms step_avg:148.63ms step:438/1480 train_time:63618ms step_avg:148.64ms step:439/1480 train_time:63773ms step_avg:148.65ms step:440/1480 train_time:63928ms step_avg:148.67ms step:441/1480 train_time:64084ms step_avg:148.69ms step:442/1480 train_time:64240ms step_avg:148.70ms step:443/1480 train_time:64396ms step_avg:148.72ms step:444/1480 train_time:64552ms step_avg:148.74ms step:445/1480 train_time:64708ms step_avg:148.75ms step:446/1480 train_time:64862ms step_avg:148.77ms step:447/1480 train_time:65018ms step_avg:148.78ms step:448/1480 train_time:65175ms step_avg:148.80ms step:449/1480 train_time:65335ms step_avg:148.83ms step:450/1480 train_time:65494ms step_avg:148.85ms step:451/1480 train_time:65652ms step_avg:148.87ms step:452/1480 train_time:65810ms step_avg:148.89ms step:453/1480 train_time:65965ms step_avg:148.91ms step:454/1480 train_time:66120ms step_avg:148.92ms step:455/1480 train_time:66276ms step_avg:148.93ms step:456/1480 train_time:66432ms step_avg:148.95ms step:457/1480 
train_time:66589ms step_avg:148.97ms step:458/1480 train_time:66744ms step_avg:148.98ms step:459/1480 train_time:66901ms step_avg:149.00ms step:460/1480 train_time:67058ms step_avg:149.02ms step:461/1480 train_time:67215ms step_avg:149.04ms step:462/1480 train_time:67373ms step_avg:149.06ms step:463/1480 train_time:67534ms step_avg:149.08ms step:464/1480 train_time:67691ms step_avg:149.10ms step:465/1480 train_time:67847ms step_avg:149.11ms step:466/1480 train_time:68004ms step_avg:149.13ms step:467/1480 train_time:68163ms step_avg:149.15ms step:468/1480 train_time:68319ms step_avg:149.17ms step:469/1480 train_time:68476ms step_avg:149.18ms step:470/1480 train_time:68634ms step_avg:149.20ms step:471/1480 train_time:68791ms step_avg:149.22ms step:472/1480 train_time:68947ms step_avg:149.24ms step:473/1480 train_time:69102ms step_avg:149.25ms step:474/1480 train_time:69258ms step_avg:149.26ms step:475/1480 train_time:69414ms step_avg:149.28ms step:476/1480 train_time:69571ms step_avg:149.29ms step:477/1480 train_time:69728ms step_avg:149.31ms step:478/1480 train_time:69883ms step_avg:149.32ms step:479/1480 train_time:70039ms step_avg:149.34ms step:480/1480 train_time:70197ms step_avg:149.36ms step:481/1480 train_time:70354ms step_avg:149.37ms step:482/1480 train_time:70512ms step_avg:149.39ms step:483/1480 train_time:70667ms step_avg:149.40ms step:484/1480 train_time:70823ms step_avg:149.42ms step:485/1480 train_time:70980ms step_avg:149.43ms step:486/1480 train_time:71137ms step_avg:149.45ms step:487/1480 train_time:71295ms step_avg:149.46ms step:488/1480 train_time:71452ms step_avg:149.48ms step:489/1480 train_time:71609ms step_avg:149.50ms step:490/1480 train_time:71765ms step_avg:149.51ms step:491/1480 train_time:71923ms step_avg:149.53ms step:492/1480 train_time:72079ms step_avg:149.54ms step:493/1480 train_time:72237ms step_avg:149.56ms step:494/1480 train_time:72395ms step_avg:149.58ms step:495/1480 train_time:72554ms step_avg:149.60ms step:496/1480 
train_time:72712ms step_avg:149.61ms step:497/1480 train_time:72870ms step_avg:149.63ms step:498/1480 train_time:73027ms step_avg:149.64ms step:499/1480 train_time:73184ms step_avg:149.66ms step:500/1480 train_time:73341ms step_avg:149.68ms step:500/1480 val_loss:3.6911 train_time:73402ms step_avg:149.80ms step:501/1480 train_time:73500ms step_avg:149.69ms step:502/1480 train_time:73657ms step_avg:149.71ms step:503/1480 train_time:73815ms step_avg:149.73ms step:504/1480 train_time:73970ms step_avg:149.74ms step:505/1480 train_time:74125ms step_avg:149.75ms step:506/1480 train_time:74281ms step_avg:149.76ms step:507/1480 train_time:74438ms step_avg:149.78ms step:508/1480 train_time:74596ms step_avg:149.79ms step:509/1480 train_time:74754ms step_avg:149.81ms step:510/1480 train_time:74910ms step_avg:149.82ms step:511/1480 train_time:75066ms step_avg:149.83ms step:512/1480 train_time:75222ms step_avg:149.85ms step:513/1480 train_time:75379ms step_avg:149.86ms step:514/1480 train_time:75536ms step_avg:149.87ms step:515/1480 train_time:75695ms step_avg:149.89ms step:516/1480 train_time:75853ms step_avg:149.91ms step:517/1480 train_time:76011ms step_avg:149.92ms step:518/1480 train_time:76168ms step_avg:149.94ms step:519/1480 train_time:76324ms step_avg:149.95ms step:520/1480 train_time:76482ms step_avg:149.96ms step:521/1480 train_time:76638ms step_avg:149.98ms step:522/1480 train_time:76795ms step_avg:149.99ms step:523/1480 train_time:76953ms step_avg:150.01ms step:524/1480 train_time:77111ms step_avg:150.02ms step:525/1480 train_time:77268ms step_avg:150.03ms step:526/1480 train_time:77425ms step_avg:150.05ms step:527/1480 train_time:77582ms step_avg:150.06ms step:528/1480 train_time:77739ms step_avg:150.07ms step:529/1480 train_time:77896ms step_avg:150.09ms step:530/1480 train_time:78053ms step_avg:150.10ms step:531/1480 train_time:78210ms step_avg:150.12ms step:532/1480 train_time:78368ms step_avg:150.13ms step:533/1480 train_time:78524ms step_avg:150.14ms 
step:534/1480 train_time:78680ms step_avg:150.15ms step:535/1480 train_time:78835ms step_avg:150.16ms step:536/1480 train_time:78993ms step_avg:150.18ms step:537/1480 train_time:79151ms step_avg:150.19ms step:538/1480 train_time:79309ms step_avg:150.21ms step:539/1480 train_time:79469ms step_avg:150.22ms step:540/1480 train_time:79625ms step_avg:150.24ms step:541/1480 train_time:79780ms step_avg:150.25ms step:542/1480 train_time:79937ms step_avg:150.26ms step:543/1480 train_time:80095ms step_avg:150.27ms step:544/1480 train_time:80252ms step_avg:150.28ms step:545/1480 train_time:80409ms step_avg:150.30ms step:546/1480 train_time:80566ms step_avg:150.31ms step:547/1480 train_time:80722ms step_avg:150.32ms step:548/1480 train_time:80879ms step_avg:150.33ms step:549/1480 train_time:81036ms step_avg:150.34ms step:550/1480 train_time:81194ms step_avg:150.36ms step:551/1480 train_time:81354ms step_avg:150.38ms step:552/1480 train_time:81513ms step_avg:150.39ms step:553/1480 train_time:81675ms step_avg:150.41ms step:554/1480 train_time:81837ms step_avg:150.44ms step:555/1480 train_time:81997ms step_avg:150.45ms step:556/1480 train_time:82156ms step_avg:150.47ms step:557/1480 train_time:82317ms step_avg:150.49ms step:558/1480 train_time:82477ms step_avg:150.50ms step:559/1480 train_time:82636ms step_avg:150.52ms step:560/1480 train_time:82797ms step_avg:150.54ms step:561/1480 train_time:82957ms step_avg:150.56ms step:562/1480 train_time:83117ms step_avg:150.57ms step:563/1480 train_time:83277ms step_avg:150.59ms step:564/1480 train_time:83436ms step_avg:150.61ms step:565/1480 train_time:83595ms step_avg:150.62ms step:566/1480 train_time:83755ms step_avg:150.64ms step:567/1480 train_time:83915ms step_avg:150.66ms step:568/1480 train_time:84075ms step_avg:150.67ms step:569/1480 train_time:84235ms step_avg:150.69ms step:570/1480 train_time:84395ms step_avg:150.71ms step:571/1480 train_time:84555ms step_avg:150.72ms step:572/1480 train_time:84716ms step_avg:150.74ms 
step:573/1480 train_time:84876ms step_avg:150.76ms step:574/1480 train_time:85036ms step_avg:150.77ms step:575/1480 train_time:85198ms step_avg:150.79ms step:576/1480 train_time:85357ms step_avg:150.81ms step:577/1480 train_time:85517ms step_avg:150.82ms step:578/1480 train_time:85676ms step_avg:150.84ms step:579/1480 train_time:85835ms step_avg:150.85ms step:580/1480 train_time:85994ms step_avg:150.87ms step:581/1480 train_time:86153ms step_avg:150.88ms step:582/1480 train_time:86317ms step_avg:150.90ms step:583/1480 train_time:86476ms step_avg:150.92ms step:584/1480 train_time:86635ms step_avg:150.93ms step:585/1480 train_time:86794ms step_avg:150.95ms step:586/1480 train_time:86953ms step_avg:150.96ms step:587/1480 train_time:87112ms step_avg:150.97ms step:588/1480 train_time:87271ms step_avg:150.99ms step:589/1480 train_time:87432ms step_avg:151.01ms step:590/1480 train_time:87595ms step_avg:151.03ms step:591/1480 train_time:87755ms step_avg:151.04ms step:592/1480 train_time:87915ms step_avg:151.06ms step:593/1480 train_time:88076ms step_avg:151.07ms step:594/1480 train_time:88236ms step_avg:151.09ms step:595/1480 train_time:88398ms step_avg:151.11ms step:596/1480 train_time:88559ms step_avg:151.12ms step:597/1480 train_time:88719ms step_avg:151.14ms step:598/1480 train_time:88878ms step_avg:151.15ms step:599/1480 train_time:89036ms step_avg:151.16ms step:600/1480 train_time:89196ms step_avg:151.18ms step:601/1480 train_time:89355ms step_avg:151.19ms step:602/1480 train_time:89513ms step_avg:151.20ms step:603/1480 train_time:89676ms step_avg:151.22ms step:604/1480 train_time:89836ms step_avg:151.24ms step:605/1480 train_time:89996ms step_avg:151.25ms step:606/1480 train_time:90158ms step_avg:151.27ms step:607/1480 train_time:90322ms step_avg:151.29ms step:608/1480 train_time:90482ms step_avg:151.31ms step:609/1480 train_time:90641ms step_avg:151.32ms step:610/1480 train_time:90798ms step_avg:151.33ms step:611/1480 train_time:90958ms step_avg:151.34ms 
step:612/1480 train_time:91118ms step_avg:151.36ms step:613/1480 train_time:91278ms step_avg:151.37ms step:614/1480 train_time:91437ms step_avg:151.39ms step:615/1480 train_time:91596ms step_avg:151.40ms step:616/1480 train_time:91753ms step_avg:151.41ms step:617/1480 train_time:91915ms step_avg:151.43ms step:618/1480 train_time:92075ms step_avg:151.44ms step:619/1480 train_time:92234ms step_avg:151.45ms step:620/1480 train_time:92395ms step_avg:151.47ms step:621/1480 train_time:92556ms step_avg:151.48ms step:622/1480 train_time:92716ms step_avg:151.50ms step:623/1480 train_time:92877ms step_avg:151.51ms step:624/1480 train_time:93035ms step_avg:151.52ms step:625/1480 train_time:93195ms step_avg:151.54ms step:625/1480 val_loss:3.6097 train_time:93258ms step_avg:151.64ms step:626/1480 train_time:93358ms step_avg:151.55ms step:627/1480 train_time:93517ms step_avg:151.57ms step:628/1480 train_time:93675ms step_avg:151.58ms step:629/1480 train_time:93834ms step_avg:151.59ms step:630/1480 train_time:93992ms step_avg:151.60ms step:631/1480 train_time:94149ms step_avg:151.61ms step:632/1480 train_time:94308ms step_avg:151.62ms step:633/1480 train_time:94467ms step_avg:151.63ms step:634/1480 train_time:94627ms step_avg:151.65ms step:635/1480 train_time:94786ms step_avg:151.66ms step:636/1480 train_time:94945ms step_avg:151.67ms step:637/1480 train_time:95106ms step_avg:151.68ms step:638/1480 train_time:95265ms step_avg:151.70ms step:639/1480 train_time:95424ms step_avg:151.71ms step:640/1480 train_time:95585ms step_avg:151.72ms step:641/1480 train_time:95745ms step_avg:151.74ms step:642/1480 train_time:95904ms step_avg:151.75ms step:643/1480 train_time:96063ms step_avg:151.76ms step:644/1480 train_time:96223ms step_avg:151.77ms step:645/1480 train_time:96382ms step_avg:151.78ms step:646/1480 train_time:96543ms step_avg:151.80ms step:647/1480 train_time:96702ms step_avg:151.81ms step:648/1480 train_time:96864ms step_avg:151.82ms step:649/1480 train_time:97024ms 
step_avg:151.84ms step:650/1480 train_time:97185ms step_avg:151.85ms step:651/1480 train_time:97346ms step_avg:151.87ms step:652/1480 train_time:97506ms step_avg:151.88ms step:653/1480 train_time:97664ms step_avg:151.89ms step:654/1480 train_time:97824ms step_avg:151.90ms step:655/1480 train_time:97985ms step_avg:151.91ms step:656/1480 train_time:98146ms step_avg:151.93ms step:657/1480 train_time:98307ms step_avg:151.94ms step:658/1480 train_time:98466ms step_avg:151.95ms step:659/1480 train_time:98626ms step_avg:151.97ms step:660/1480 train_time:98788ms step_avg:151.98ms step:661/1480 train_time:98951ms step_avg:152.00ms step:662/1480 train_time:99110ms step_avg:152.01ms step:663/1480 train_time:99270ms step_avg:152.02ms step:664/1480 train_time:99432ms step_avg:152.04ms step:665/1480 train_time:99593ms step_avg:152.05ms step:666/1480 train_time:99753ms step_avg:152.06ms step:667/1480 train_time:99914ms step_avg:152.08ms step:668/1480 train_time:100075ms step_avg:152.09ms step:669/1480 train_time:100238ms step_avg:152.11ms step:670/1480 train_time:100398ms step_avg:152.12ms step:671/1480 train_time:100559ms step_avg:152.13ms step:672/1480 train_time:100721ms step_avg:152.15ms step:673/1480 train_time:100886ms step_avg:152.17ms step:674/1480 train_time:101048ms step_avg:152.18ms step:675/1480 train_time:101210ms step_avg:152.20ms step:676/1480 train_time:101371ms step_avg:152.21ms step:677/1480 train_time:101531ms step_avg:152.22ms step:678/1480 train_time:101692ms step_avg:152.23ms step:679/1480 train_time:101854ms step_avg:152.25ms step:680/1480 train_time:102017ms step_avg:152.26ms step:681/1480 train_time:102177ms step_avg:152.28ms step:682/1480 train_time:102340ms step_avg:152.29ms step:683/1480 train_time:102503ms step_avg:152.31ms step:684/1480 train_time:102665ms step_avg:152.32ms step:685/1480 train_time:102828ms step_avg:152.34ms step:686/1480 train_time:102989ms step_avg:152.35ms step:687/1480 train_time:103149ms step_avg:152.36ms step:688/1480 
train_time:103313ms step_avg:152.38ms step:689/1480 train_time:103475ms step_avg:152.39ms step:690/1480 train_time:103639ms step_avg:152.41ms step:691/1480 train_time:103801ms step_avg:152.42ms step:692/1480 train_time:103964ms step_avg:152.44ms step:693/1480 train_time:104127ms step_avg:152.45ms step:694/1480 train_time:104290ms step_avg:152.47ms step:695/1480 train_time:104451ms step_avg:152.48ms step:696/1480 train_time:104611ms step_avg:152.49ms step:697/1480 train_time:104774ms step_avg:152.51ms step:698/1480 train_time:104933ms step_avg:152.52ms step:699/1480 train_time:105095ms step_avg:152.53ms step:700/1480 train_time:105258ms step_avg:152.55ms step:701/1480 train_time:105418ms step_avg:152.56ms step:702/1480 train_time:105579ms step_avg:152.57ms step:703/1480 train_time:105741ms step_avg:152.58ms step:704/1480 train_time:105902ms step_avg:152.60ms step:705/1480 train_time:106066ms step_avg:152.61ms step:706/1480 train_time:106229ms step_avg:152.63ms step:707/1480 train_time:106390ms step_avg:152.64ms step:708/1480 train_time:106551ms step_avg:152.65ms step:709/1480 train_time:106713ms step_avg:152.67ms step:710/1480 train_time:106872ms step_avg:152.67ms step:711/1480 train_time:107034ms step_avg:152.69ms step:712/1480 train_time:107200ms step_avg:152.71ms step:713/1480 train_time:107365ms step_avg:152.72ms step:714/1480 train_time:107527ms step_avg:152.74ms step:715/1480 train_time:107689ms step_avg:152.75ms step:716/1480 train_time:107849ms step_avg:152.76ms step:717/1480 train_time:108011ms step_avg:152.77ms step:718/1480 train_time:108169ms step_avg:152.78ms step:719/1480 train_time:108328ms step_avg:152.79ms step:720/1480 train_time:108490ms step_avg:152.80ms step:721/1480 train_time:108653ms step_avg:152.82ms step:722/1480 train_time:108814ms step_avg:152.83ms step:723/1480 train_time:108974ms step_avg:152.84ms step:724/1480 train_time:109134ms step_avg:152.85ms step:725/1480 train_time:109298ms step_avg:152.86ms step:726/1480 train_time:109463ms 
step_avg:152.88ms step:727/1480 train_time:109626ms step_avg:152.90ms step:728/1480 train_time:109787ms step_avg:152.91ms step:729/1480 train_time:109948ms step_avg:152.92ms step:730/1480 train_time:110111ms step_avg:152.93ms step:731/1480 train_time:110272ms step_avg:152.94ms step:732/1480 train_time:110432ms step_avg:152.95ms step:733/1480 train_time:110594ms step_avg:152.96ms step:734/1480 train_time:110753ms step_avg:152.97ms step:735/1480 train_time:110913ms step_avg:152.98ms step:736/1480 train_time:111074ms step_avg:152.99ms step:737/1480 train_time:111234ms step_avg:153.00ms step:738/1480 train_time:111396ms step_avg:153.02ms step:739/1480 train_time:111557ms step_avg:153.03ms step:740/1480 train_time:111723ms step_avg:153.05ms step:741/1480 train_time:111887ms step_avg:153.06ms step:742/1480 train_time:112049ms step_avg:153.07ms step:743/1480 train_time:112210ms step_avg:153.08ms step:744/1480 train_time:112373ms step_avg:153.10ms step:745/1480 train_time:112537ms step_avg:153.11ms step:746/1480 train_time:112696ms step_avg:153.12ms step:747/1480 train_time:112858ms step_avg:153.13ms step:748/1480 train_time:113023ms step_avg:153.15ms step:749/1480 train_time:113189ms step_avg:153.17ms step:750/1480 train_time:113349ms step_avg:153.17ms step:750/1480 val_loss:3.5555 train_time:113413ms step_avg:153.26ms step:751/1480 train_time:113514ms step_avg:153.19ms step:752/1480 train_time:113678ms step_avg:153.21ms step:753/1480 train_time:113842ms step_avg:153.22ms step:754/1480 train_time:114002ms step_avg:153.23ms step:755/1480 train_time:114165ms step_avg:153.24ms step:756/1480 train_time:114326ms step_avg:153.25ms step:757/1480 train_time:114489ms step_avg:153.27ms step:758/1480 train_time:114648ms step_avg:153.27ms step:759/1480 train_time:114809ms step_avg:153.28ms step:760/1480 train_time:114970ms step_avg:153.29ms step:761/1480 train_time:115134ms step_avg:153.31ms step:762/1480 train_time:115298ms step_avg:153.32ms step:763/1480 train_time:115462ms 
step_avg:153.34ms step:764/1480 train_time:115624ms step_avg:153.35ms step:765/1480 train_time:115785ms step_avg:153.36ms step:766/1480 train_time:115947ms step_avg:153.37ms step:767/1480 train_time:116109ms step_avg:153.38ms step:768/1480 train_time:116271ms step_avg:153.39ms step:769/1480 train_time:116435ms step_avg:153.41ms step:770/1480 train_time:116600ms step_avg:153.42ms step:771/1480 train_time:116763ms step_avg:153.43ms step:772/1480 train_time:116925ms step_avg:153.44ms step:773/1480 train_time:117087ms step_avg:153.46ms step:774/1480 train_time:117249ms step_avg:153.47ms step:775/1480 train_time:117411ms step_avg:153.48ms step:776/1480 train_time:117575ms step_avg:153.49ms step:777/1480 train_time:117742ms step_avg:153.51ms step:778/1480 train_time:117906ms step_avg:153.52ms step:779/1480 train_time:118067ms step_avg:153.53ms step:780/1480 train_time:118230ms step_avg:153.55ms step:781/1480 train_time:118393ms step_avg:153.56ms step:782/1480 train_time:118558ms step_avg:153.57ms step:783/1480 train_time:118721ms step_avg:153.58ms step:784/1480 train_time:118884ms step_avg:153.60ms step:785/1480 train_time:119046ms step_avg:153.61ms step:786/1480 train_time:119213ms step_avg:153.62ms step:787/1480 train_time:119377ms step_avg:153.64ms step:788/1480 train_time:119541ms step_avg:153.65ms step:789/1480 train_time:119703ms step_avg:153.66ms step:790/1480 train_time:119868ms step_avg:153.68ms step:791/1480 train_time:120034ms step_avg:153.69ms step:792/1480 train_time:120201ms step_avg:153.71ms step:793/1480 train_time:120363ms step_avg:153.72ms step:794/1480 train_time:120526ms step_avg:153.73ms step:795/1480 train_time:120689ms step_avg:153.74ms step:796/1480 train_time:120854ms step_avg:153.76ms step:797/1480 train_time:121019ms step_avg:153.77ms step:798/1480 train_time:121183ms step_avg:153.79ms step:799/1480 train_time:121348ms step_avg:153.80ms step:800/1480 train_time:121510ms step_avg:153.81ms step:801/1480 train_time:121672ms step_avg:153.82ms 
step:802/1480 train_time:121843ms step_avg:153.84ms step:803/1480 train_time:122005ms step_avg:153.85ms step:804/1480 train_time:122169ms step_avg:153.87ms step:805/1480 train_time:122335ms step_avg:153.88ms step:806/1480 train_time:122498ms step_avg:153.89ms step:807/1480 train_time:122658ms step_avg:153.90ms step:808/1480 train_time:122822ms step_avg:153.91ms step:809/1480 train_time:122984ms step_avg:153.92ms step:810/1480 train_time:123146ms step_avg:153.93ms step:811/1480 train_time:123309ms step_avg:153.94ms step:812/1480 train_time:123470ms step_avg:153.95ms step:813/1480 train_time:123629ms step_avg:153.96ms step:814/1480 train_time:123793ms step_avg:153.97ms step:815/1480 train_time:123957ms step_avg:153.98ms step:816/1480 train_time:124123ms step_avg:154.00ms step:817/1480 train_time:124284ms step_avg:154.01ms step:818/1480 train_time:124446ms step_avg:154.02ms step:819/1480 train_time:124610ms step_avg:154.03ms step:820/1480 train_time:124773ms step_avg:154.04ms step:821/1480 train_time:124934ms step_avg:154.05ms step:822/1480 train_time:125099ms step_avg:154.06ms step:823/1480 train_time:125263ms step_avg:154.07ms step:824/1480 train_time:125425ms step_avg:154.08ms step:825/1480 train_time:125588ms step_avg:154.10ms step:826/1480 train_time:125754ms step_avg:154.11ms step:827/1480 train_time:125919ms step_avg:154.12ms step:828/1480 train_time:126081ms step_avg:154.13ms step:829/1480 train_time:126246ms step_avg:154.15ms step:830/1480 train_time:126409ms step_avg:154.16ms step:831/1480 train_time:126571ms step_avg:154.17ms step:832/1480 train_time:126738ms step_avg:154.18ms step:833/1480 train_time:126903ms step_avg:154.20ms step:834/1480 train_time:127067ms step_avg:154.21ms step:835/1480 train_time:127231ms step_avg:154.22ms step:836/1480 train_time:127396ms step_avg:154.23ms step:837/1480 train_time:127559ms step_avg:154.24ms step:838/1480 train_time:127724ms step_avg:154.26ms step:839/1480 train_time:127885ms step_avg:154.26ms step:840/1480 
train_time:128047ms step_avg:154.27ms step:841/1480 train_time:128208ms step_avg:154.28ms step:842/1480 train_time:128369ms step_avg:154.29ms step:843/1480 train_time:128530ms step_avg:154.30ms step:844/1480 train_time:128694ms step_avg:154.31ms step:845/1480 train_time:128859ms step_avg:154.32ms step:846/1480 train_time:129023ms step_avg:154.33ms step:847/1480 train_time:129187ms step_avg:154.35ms step:848/1480 train_time:129348ms step_avg:154.35ms step:849/1480 train_time:129510ms step_avg:154.36ms step:850/1480 train_time:129674ms step_avg:154.37ms step:851/1480 train_time:129840ms step_avg:154.39ms step:852/1480 train_time:130003ms step_avg:154.40ms step:853/1480 train_time:130167ms step_avg:154.41ms step:854/1480 train_time:130329ms step_avg:154.42ms step:855/1480 train_time:130490ms step_avg:154.43ms step:856/1480 train_time:130653ms step_avg:154.44ms step:857/1480 train_time:130819ms step_avg:154.45ms step:858/1480 train_time:130984ms step_avg:154.46ms step:859/1480 train_time:131147ms step_avg:154.47ms step:860/1480 train_time:131308ms step_avg:154.48ms step:861/1480 train_time:131472ms step_avg:154.49ms step:862/1480 train_time:131643ms step_avg:154.51ms step:863/1480 train_time:131809ms step_avg:154.52ms step:864/1480 train_time:131972ms step_avg:154.53ms step:865/1480 train_time:132134ms step_avg:154.54ms step:866/1480 train_time:132301ms step_avg:154.56ms step:867/1480 train_time:132465ms step_avg:154.57ms step:868/1480 train_time:132626ms step_avg:154.58ms step:869/1480 train_time:132787ms step_avg:154.58ms step:870/1480 train_time:132953ms step_avg:154.60ms step:871/1480 train_time:133116ms step_avg:154.61ms step:872/1480 train_time:133280ms step_avg:154.62ms step:873/1480 train_time:133444ms step_avg:154.63ms step:874/1480 train_time:133610ms step_avg:154.64ms step:875/1480 train_time:133776ms step_avg:154.65ms step:875/1480 val_loss:3.5077 train_time:133842ms step_avg:154.73ms step:876/1480 train_time:133942ms step_avg:154.67ms step:877/1480 
train_time:134109ms step_avg:154.68ms step:878/1480 train_time:134272ms step_avg:154.69ms step:879/1480 train_time:134435ms step_avg:154.70ms step:880/1480 train_time:134598ms step_avg:154.71ms step:881/1480 train_time:134760ms step_avg:154.72ms step:882/1480 train_time:134927ms step_avg:154.73ms step:883/1480 train_time:135094ms step_avg:154.75ms step:884/1480 train_time:135259ms step_avg:154.76ms step:885/1480 train_time:135424ms step_avg:154.77ms step:886/1480 train_time:135591ms step_avg:154.78ms step:887/1480 train_time:135759ms step_avg:154.80ms step:888/1480 train_time:135933ms step_avg:154.82ms step:889/1480 train_time:136100ms step_avg:154.84ms step:890/1480 train_time:136263ms step_avg:154.84ms step:891/1480 train_time:136429ms step_avg:154.86ms step:892/1480 train_time:136595ms step_avg:154.87ms step:893/1480 train_time:136757ms step_avg:154.88ms step:894/1480 train_time:136922ms step_avg:154.89ms step:895/1480 train_time:137090ms step_avg:154.90ms step:896/1480 train_time:137254ms step_avg:154.91ms step:897/1480 train_time:137419ms step_avg:154.93ms step:898/1480 train_time:137587ms step_avg:154.94ms step:899/1480 train_time:137752ms step_avg:154.95ms step:900/1480 train_time:137914ms step_avg:154.96ms step:901/1480 train_time:138080ms step_avg:154.97ms step:902/1480 train_time:138242ms step_avg:154.98ms step:903/1480 train_time:138415ms step_avg:155.00ms step:904/1480 train_time:138579ms step_avg:155.01ms step:905/1480 train_time:138741ms step_avg:155.02ms step:906/1480 train_time:138909ms step_avg:155.03ms step:907/1480 train_time:139076ms step_avg:155.05ms step:908/1480 train_time:139239ms step_avg:155.05ms step:909/1480 train_time:139405ms step_avg:155.07ms step:910/1480 train_time:139576ms step_avg:155.08ms step:911/1480 train_time:139741ms step_avg:155.10ms step:912/1480 train_time:139907ms step_avg:155.11ms step:913/1480 train_time:140076ms step_avg:155.12ms step:914/1480 train_time:140243ms step_avg:155.14ms step:915/1480 train_time:140414ms 
step_avg:155.15ms step:916/1480 train_time:140578ms step_avg:155.16ms step:917/1480 train_time:140742ms step_avg:155.17ms step:918/1480 train_time:140910ms step_avg:155.19ms step:919/1480 train_time:141081ms step_avg:155.20ms step:920/1480 train_time:141247ms step_avg:155.22ms step:921/1480 train_time:141412ms step_avg:155.23ms step:922/1480 train_time:141579ms step_avg:155.24ms step:923/1480 train_time:141741ms step_avg:155.25ms step:924/1480 train_time:141906ms step_avg:155.26ms step:925/1480 train_time:142073ms step_avg:155.27ms step:926/1480 train_time:142235ms step_avg:155.28ms step:927/1480 train_time:142399ms step_avg:155.29ms step:928/1480 train_time:142567ms step_avg:155.30ms step:929/1480 train_time:142732ms step_avg:155.31ms step:930/1480 train_time:142898ms step_avg:155.32ms step:931/1480 train_time:143062ms step_avg:155.33ms step:932/1480 train_time:143228ms step_avg:155.35ms step:933/1480 train_time:143396ms step_avg:155.36ms step:934/1480 train_time:143561ms step_avg:155.37ms step:935/1480 train_time:143732ms step_avg:155.39ms step:936/1480 train_time:143899ms step_avg:155.40ms step:937/1480 train_time:144070ms step_avg:155.41ms step:938/1480 train_time:144231ms step_avg:155.42ms step:939/1480 train_time:144400ms step_avg:155.44ms step:940/1480 train_time:144568ms step_avg:155.45ms step:941/1480 train_time:144732ms step_avg:155.46ms step:942/1480 train_time:144898ms step_avg:155.47ms step:943/1480 train_time:145068ms step_avg:155.49ms step:944/1480 train_time:145240ms step_avg:155.50ms step:945/1480 train_time:145403ms step_avg:155.51ms step:946/1480 train_time:145573ms step_avg:155.53ms step:947/1480 train_time:145741ms step_avg:155.54ms step:948/1480 train_time:145907ms step_avg:155.55ms step:949/1480 train_time:146074ms step_avg:155.56ms step:950/1480 train_time:146238ms step_avg:155.57ms step:951/1480 train_time:146406ms step_avg:155.59ms step:952/1480 train_time:146573ms step_avg:155.60ms step:953/1480 train_time:146740ms step_avg:155.61ms 
step:954/1480 train_time:146909ms step_avg:155.62ms step:955/1480 train_time:147073ms step_avg:155.63ms step:956/1480 train_time:147237ms step_avg:155.64ms step:957/1480 train_time:147406ms step_avg:155.66ms step:958/1480 train_time:147576ms step_avg:155.67ms step:959/1480 train_time:147741ms step_avg:155.68ms step:960/1480 train_time:147909ms step_avg:155.69ms step:961/1480 train_time:148075ms step_avg:155.70ms step:962/1480 train_time:148238ms step_avg:155.71ms step:963/1480 train_time:148404ms step_avg:155.72ms step:964/1480 train_time:148573ms step_avg:155.74ms step:965/1480 train_time:148738ms step_avg:155.75ms step:966/1480 train_time:148902ms step_avg:155.76ms step:967/1480 train_time:149065ms step_avg:155.76ms step:968/1480 train_time:149231ms step_avg:155.77ms step:969/1480 train_time:149397ms step_avg:155.78ms step:970/1480 train_time:149560ms step_avg:155.79ms step:971/1480 train_time:149725ms step_avg:155.80ms step:972/1480 train_time:149891ms step_avg:155.81ms step:973/1480 train_time:150054ms step_avg:155.82ms step:974/1480 train_time:150222ms step_avg:155.83ms step:975/1480 train_time:150387ms step_avg:155.84ms step:976/1480 train_time:150553ms step_avg:155.85ms step:977/1480 train_time:150717ms step_avg:155.86ms step:978/1480 train_time:150883ms step_avg:155.87ms step:979/1480 train_time:151050ms step_avg:155.88ms step:980/1480 train_time:151216ms step_avg:155.89ms step:981/1480 train_time:151384ms step_avg:155.90ms step:982/1480 train_time:151546ms step_avg:155.91ms step:983/1480 train_time:151712ms step_avg:155.92ms step:984/1480 train_time:151876ms step_avg:155.93ms step:985/1480 train_time:152042ms step_avg:155.94ms step:986/1480 train_time:152206ms step_avg:155.95ms step:987/1480 train_time:152371ms step_avg:155.96ms step:988/1480 train_time:152537ms step_avg:155.97ms step:989/1480 train_time:152703ms step_avg:155.98ms step:990/1480 train_time:152874ms step_avg:155.99ms step:991/1480 train_time:153042ms step_avg:156.01ms step:992/1480 
train_time:153219ms step_avg:156.03ms step:993/1480 train_time:153397ms step_avg:156.05ms step:994/1480 train_time:153561ms step_avg:156.06ms step:995/1480 train_time:153726ms step_avg:156.07ms step:996/1480 train_time:153890ms step_avg:156.07ms step:997/1480 train_time:154054ms step_avg:156.08ms step:998/1480 train_time:154216ms step_avg:156.09ms step:999/1480 train_time:154383ms step_avg:156.10ms step:1000/1480 train_time:154553ms step_avg:156.11ms step:1000/1480 val_loss:3.4463 train_time:154620ms step_avg:156.18ms step:1001/1480 train_time:154723ms step_avg:156.13ms step:1002/1480 train_time:154889ms step_avg:156.14ms step:1003/1480 train_time:155059ms step_avg:156.15ms step:1004/1480 train_time:155227ms step_avg:156.16ms step:1005/1480 train_time:155394ms step_avg:156.17ms step:1006/1480 train_time:155562ms step_avg:156.19ms step:1007/1480 train_time:155728ms step_avg:156.20ms step:1008/1480 train_time:155896ms step_avg:156.21ms step:1009/1480 train_time:156069ms step_avg:156.22ms step:1010/1480 train_time:156234ms step_avg:156.23ms step:1011/1480 train_time:156400ms step_avg:156.24ms step:1012/1480 train_time:156567ms step_avg:156.25ms step:1013/1480 train_time:156736ms step_avg:156.27ms step:1014/1480 train_time:156904ms step_avg:156.28ms step:1015/1480 train_time:157073ms step_avg:156.29ms step:1016/1480 train_time:157242ms step_avg:156.30ms step:1017/1480 train_time:157413ms step_avg:156.32ms step:1018/1480 train_time:157581ms step_avg:156.33ms step:1019/1480 train_time:157749ms step_avg:156.34ms step:1020/1480 train_time:157918ms step_avg:156.35ms step:1021/1480 train_time:158084ms step_avg:156.36ms step:1022/1480 train_time:158250ms step_avg:156.37ms step:1023/1480 train_time:158418ms step_avg:156.39ms step:1024/1480 train_time:158586ms step_avg:156.40ms step:1025/1480 train_time:158756ms step_avg:156.41ms step:1026/1480 train_time:158923ms step_avg:156.42ms step:1027/1480 train_time:159089ms step_avg:156.43ms step:1028/1480 train_time:159263ms 
step_avg:156.45ms step:1029/1480 train_time:159437ms step_avg:156.46ms step:1030/1480 train_time:159605ms step_avg:156.48ms step:1031/1480 train_time:159769ms step_avg:156.48ms step:1032/1480 train_time:159941ms step_avg:156.50ms step:1033/1480 train_time:160107ms step_avg:156.51ms step:1034/1480 train_time:160274ms step_avg:156.52ms step:1035/1480 train_time:160442ms step_avg:156.53ms step:1036/1480 train_time:160608ms step_avg:156.54ms step:1037/1480 train_time:160776ms step_avg:156.55ms step:1038/1480 train_time:160945ms step_avg:156.56ms step:1039/1480 train_time:161116ms step_avg:156.58ms step:1040/1480 train_time:161284ms step_avg:156.59ms step:1041/1480 train_time:161451ms step_avg:156.60ms step:1042/1480 train_time:161614ms step_avg:156.60ms step:1043/1480 train_time:161781ms step_avg:156.61ms step:1044/1480 train_time:161946ms step_avg:156.62ms step:1045/1480 train_time:162116ms step_avg:156.63ms step:1046/1480 train_time:162285ms step_avg:156.65ms step:1047/1480 train_time:162453ms step_avg:156.66ms step:1048/1480 train_time:162620ms step_avg:156.67ms step:1049/1480 train_time:162787ms step_avg:156.68ms step:1050/1480 train_time:162954ms step_avg:156.69ms step:1051/1480 train_time:163125ms step_avg:156.70ms step:1052/1480 train_time:163291ms step_avg:156.71ms step:1053/1480 train_time:163456ms step_avg:156.72ms step:1054/1480 train_time:163626ms step_avg:156.73ms step:1055/1480 train_time:163792ms step_avg:156.74ms step:1056/1480 train_time:163956ms step_avg:156.75ms step:1057/1480 train_time:164125ms step_avg:156.76ms step:1058/1480 train_time:164292ms step_avg:156.77ms step:1059/1480 train_time:164464ms step_avg:156.78ms step:1060/1480 train_time:164632ms step_avg:156.79ms step:1061/1480 train_time:164795ms step_avg:156.80ms step:1062/1480 train_time:164963ms step_avg:156.81ms step:1063/1480 train_time:165128ms step_avg:156.82ms step:1064/1480 train_time:165293ms step_avg:156.82ms step:1065/1480 train_time:165459ms step_avg:156.83ms step:1066/1480 
train_time:165627ms step_avg:156.84ms step:1067/1480 train_time:165795ms step_avg:156.85ms step:1068/1480 train_time:165960ms step_avg:156.86ms step:1069/1480 train_time:166131ms step_avg:156.87ms step:1070/1480 train_time:166296ms step_avg:156.88ms step:1071/1480 train_time:166467ms step_avg:156.90ms step:1072/1480 train_time:166633ms step_avg:156.90ms step:1073/1480 train_time:166797ms step_avg:156.91ms step:1074/1480 train_time:166964ms step_avg:156.92ms step:1075/1480 train_time:167135ms step_avg:156.93ms step:1076/1480 train_time:167305ms step_avg:156.95ms step:1077/1480 train_time:167472ms step_avg:156.96ms step:1078/1480 train_time:167646ms step_avg:156.97ms step:1079/1480 train_time:167820ms step_avg:156.99ms step:1080/1480 train_time:167990ms step_avg:157.00ms step:1081/1480 train_time:168156ms step_avg:157.01ms step:1082/1480 train_time:168322ms step_avg:157.02ms step:1083/1480 train_time:168489ms step_avg:157.03ms step:1084/1480 train_time:168654ms step_avg:157.03ms step:1085/1480 train_time:168823ms step_avg:157.05ms step:1086/1480 train_time:168991ms step_avg:157.06ms step:1087/1480 train_time:169158ms step_avg:157.06ms step:1088/1480 train_time:169329ms step_avg:157.08ms step:1089/1480 train_time:169501ms step_avg:157.09ms step:1090/1480 train_time:169671ms step_avg:157.10ms step:1091/1480 train_time:169838ms step_avg:157.11ms step:1092/1480 train_time:170006ms step_avg:157.12ms step:1093/1480 train_time:170173ms step_avg:157.13ms step:1094/1480 train_time:170338ms step_avg:157.14ms step:1095/1480 train_time:170504ms step_avg:157.15ms step:1096/1480 train_time:170670ms step_avg:157.16ms step:1097/1480 train_time:170838ms step_avg:157.17ms step:1098/1480 train_time:171009ms step_avg:157.18ms step:1099/1480 train_time:171180ms step_avg:157.19ms step:1100/1480 train_time:171351ms step_avg:157.20ms step:1101/1480 train_time:171522ms step_avg:157.22ms step:1102/1480 train_time:171693ms step_avg:157.23ms step:1103/1480 train_time:171868ms step_avg:157.24ms 
step:1104/1480 train_time:172036ms step_avg:157.25ms step:1105/1480 train_time:172207ms step_avg:157.27ms step:1106/1480 train_time:172375ms step_avg:157.28ms step:1107/1480 train_time:172545ms step_avg:157.29ms step:1108/1480 train_time:172710ms step_avg:157.30ms step:1109/1480 train_time:172878ms step_avg:157.31ms step:1110/1480 train_time:173045ms step_avg:157.31ms step:1111/1480 train_time:173212ms step_avg:157.32ms step:1112/1480 train_time:173381ms step_avg:157.33ms step:1113/1480 train_time:173564ms step_avg:157.36ms step:1114/1480 train_time:173735ms step_avg:157.37ms step:1115/1480 train_time:173908ms step_avg:157.38ms step:1116/1480 train_time:174074ms step_avg:157.39ms step:1117/1480 train_time:174247ms step_avg:157.40ms step:1118/1480 train_time:174423ms step_avg:157.42ms step:1119/1480 train_time:174589ms step_avg:157.43ms step:1120/1480 train_time:174757ms step_avg:157.44ms step:1121/1480 train_time:174928ms step_avg:157.45ms step:1122/1480 train_time:175095ms step_avg:157.46ms step:1123/1480 train_time:175261ms step_avg:157.47ms step:1124/1480 train_time:175428ms step_avg:157.48ms step:1125/1480 train_time:175595ms step_avg:157.48ms step:1125/1480 val_loss:3.3897 train_time:175663ms step_avg:157.55ms step:1126/1480 train_time:175765ms step_avg:157.50ms step:1127/1480 train_time:175937ms step_avg:157.51ms step:1128/1480 train_time:176106ms step_avg:157.52ms step:1129/1480 train_time:176280ms step_avg:157.53ms step:1130/1480 train_time:176450ms step_avg:157.54ms step:1131/1480 train_time:176626ms step_avg:157.56ms step:1132/1480 train_time:176793ms step_avg:157.57ms step:1133/1480 train_time:176965ms step_avg:157.58ms step:1134/1480 train_time:177136ms step_avg:157.59ms step:1135/1480 train_time:177303ms step_avg:157.60ms step:1136/1480 train_time:177476ms step_avg:157.62ms step:1137/1480 train_time:177644ms step_avg:157.63ms step:1138/1480 train_time:177817ms step_avg:157.64ms step:1139/1480 train_time:177985ms step_avg:157.65ms step:1140/1480 
train_time:178154ms step_avg:157.66ms step:1141/1480 train_time:178326ms step_avg:157.67ms step:1142/1480 train_time:178493ms step_avg:157.68ms step:1143/1480 train_time:178663ms step_avg:157.69ms step:1144/1480 train_time:178832ms step_avg:157.70ms step:1145/1480 train_time:178997ms step_avg:157.71ms step:1146/1480 train_time:179168ms step_avg:157.72ms step:1147/1480 train_time:179337ms step_avg:157.73ms step:1148/1480 train_time:179504ms step_avg:157.74ms step:1149/1480 train_time:179675ms step_avg:157.75ms step:1150/1480 train_time:179844ms step_avg:157.76ms step:1151/1480 train_time:180018ms step_avg:157.77ms step:1152/1480 train_time:180188ms step_avg:157.78ms step:1153/1480 train_time:180362ms step_avg:157.80ms step:1154/1480 train_time:180528ms step_avg:157.80ms step:1155/1480 train_time:180700ms step_avg:157.82ms step:1156/1480 train_time:180879ms step_avg:157.83ms step:1157/1480 train_time:181050ms step_avg:157.85ms step:1158/1480 train_time:181218ms step_avg:157.86ms step:1159/1480 train_time:181384ms step_avg:157.86ms step:1160/1480 train_time:181552ms step_avg:157.87ms step:1161/1480 train_time:181723ms step_avg:157.88ms step:1162/1480 train_time:181893ms step_avg:157.89ms step:1163/1480 train_time:182063ms step_avg:157.90ms step:1164/1480 train_time:182233ms step_avg:157.91ms step:1165/1480 train_time:182399ms step_avg:157.92ms step:1166/1480 train_time:182569ms step_avg:157.93ms step:1167/1480 train_time:182738ms step_avg:157.94ms step:1168/1480 train_time:182906ms step_avg:157.95ms step:1169/1480 train_time:183076ms step_avg:157.96ms step:1170/1480 train_time:183244ms step_avg:157.97ms step:1171/1480 train_time:183413ms step_avg:157.98ms step:1172/1480 train_time:183580ms step_avg:157.99ms step:1173/1480 train_time:183753ms step_avg:158.00ms step:1174/1480 train_time:183935ms step_avg:158.02ms step:1175/1480 train_time:184105ms step_avg:158.03ms step:1176/1480 train_time:184277ms step_avg:158.04ms step:1177/1480 train_time:184455ms step_avg:158.06ms 
step:1178/1480 train_time:184622ms step_avg:158.07ms step:1179/1480 train_time:184788ms step_avg:158.07ms step:1180/1480 train_time:184967ms step_avg:158.09ms step:1181/1480 train_time:185136ms step_avg:158.10ms step:1182/1480 train_time:185303ms step_avg:158.11ms step:1183/1480 train_time:185474ms step_avg:158.12ms step:1184/1480 train_time:185643ms step_avg:158.13ms step:1185/1480 train_time:185817ms step_avg:158.14ms step:1186/1480 train_time:185988ms step_avg:158.15ms step:1187/1480 train_time:186173ms step_avg:158.18ms step:1188/1480 train_time:186340ms step_avg:158.18ms step:1189/1480 train_time:186512ms step_avg:158.20ms step:1190/1480 train_time:186680ms step_avg:158.20ms step:1191/1480 train_time:186852ms step_avg:158.22ms step:1192/1480 train_time:187018ms step_avg:158.22ms step:1193/1480 train_time:187185ms step_avg:158.23ms step:1194/1480 train_time:187355ms step_avg:158.24ms step:1195/1480 train_time:187527ms step_avg:158.25ms step:1196/1480 train_time:187710ms step_avg:158.27ms step:1197/1480 train_time:187881ms step_avg:158.28ms step:1198/1480 train_time:188061ms step_avg:158.30ms step:1199/1480 train_time:188231ms step_avg:158.31ms step:1200/1480 train_time:188399ms step_avg:158.32ms step:1201/1480 train_time:188566ms step_avg:158.33ms step:1202/1480 train_time:188748ms step_avg:158.35ms step:1203/1480 train_time:188923ms step_avg:158.36ms step:1204/1480 train_time:189097ms step_avg:158.37ms step:1205/1480 train_time:189264ms step_avg:158.38ms step:1206/1480 train_time:189432ms step_avg:158.39ms step:1207/1480 train_time:189601ms step_avg:158.40ms step:1208/1480 train_time:189768ms step_avg:158.40ms step:1209/1480 train_time:189943ms step_avg:158.42ms step:1210/1480 train_time:190119ms step_avg:158.43ms step:1211/1480 train_time:190295ms step_avg:158.45ms step:1212/1480 train_time:190465ms step_avg:158.46ms step:1213/1480 train_time:190639ms step_avg:158.47ms step:1214/1480 train_time:190817ms step_avg:158.49ms step:1215/1480 train_time:190990ms 
step_avg:158.50ms step:1216/1480 train_time:191159ms step_avg:158.51ms step:1217/1480 train_time:191332ms step_avg:158.52ms step:1218/1480 train_time:191503ms step_avg:158.53ms step:1219/1480 train_time:191680ms step_avg:158.54ms step:1220/1480 train_time:191850ms step_avg:158.55ms step:1221/1480 train_time:192019ms step_avg:158.56ms step:1222/1480 train_time:192186ms step_avg:158.57ms step:1223/1480 train_time:192358ms step_avg:158.58ms step:1224/1480 train_time:192536ms step_avg:158.60ms step:1225/1480 train_time:192709ms step_avg:158.61ms step:1226/1480 train_time:192882ms step_avg:158.62ms step:1227/1480 train_time:193057ms step_avg:158.63ms step:1228/1480 train_time:193226ms step_avg:158.64ms step:1229/1480 train_time:193398ms step_avg:158.65ms step:1230/1480 train_time:193578ms step_avg:158.67ms step:1231/1480 train_time:193754ms step_avg:158.68ms step:1232/1480 train_time:193929ms step_avg:158.70ms step:1233/1480 train_time:194097ms step_avg:158.71ms step:1234/1480 train_time:194267ms step_avg:158.71ms step:1235/1480 train_time:194442ms step_avg:158.73ms step:1236/1480 train_time:194610ms step_avg:158.74ms step:1237/1480 train_time:194781ms step_avg:158.75ms step:1238/1480 train_time:194965ms step_avg:158.77ms step:1239/1480 train_time:195136ms step_avg:158.78ms step:1240/1480 train_time:195305ms step_avg:158.78ms step:1241/1480 train_time:195478ms step_avg:158.80ms step:1242/1480 train_time:195647ms step_avg:158.80ms step:1243/1480 train_time:195820ms step_avg:158.82ms step:1244/1480 train_time:195987ms step_avg:158.82ms step:1245/1480 train_time:196158ms step_avg:158.83ms step:1246/1480 train_time:196326ms step_avg:158.84ms step:1247/1480 train_time:196496ms step_avg:158.85ms step:1248/1480 train_time:196664ms step_avg:158.86ms step:1249/1480 train_time:196834ms step_avg:158.87ms step:1250/1480 train_time:197003ms step_avg:158.87ms step:1250/1480 val_loss:3.3398 train_time:197075ms step_avg:158.93ms step:1251/1480 train_time:197185ms step_avg:158.89ms 
step:1252/1480 train_time:197354ms step_avg:158.90ms step:1253/1480 train_time:197520ms step_avg:158.91ms step:1254/1480 train_time:197692ms step_avg:158.92ms step:1255/1480 train_time:197879ms step_avg:158.94ms step:1256/1480 train_time:198051ms step_avg:158.95ms step:1257/1480 train_time:198221ms step_avg:158.96ms step:1258/1480 train_time:198397ms step_avg:158.97ms step:1259/1480 train_time:198568ms step_avg:158.98ms step:1260/1480 train_time:198736ms step_avg:158.99ms step:1261/1480 train_time:198909ms step_avg:159.00ms step:1262/1480 train_time:199085ms step_avg:159.01ms step:1263/1480 train_time:199258ms step_avg:159.02ms step:1264/1480 train_time:199425ms step_avg:159.03ms step:1265/1480 train_time:199591ms step_avg:159.04ms step:1266/1480 train_time:199764ms step_avg:159.05ms step:1267/1480 train_time:199934ms step_avg:159.06ms step:1268/1480 train_time:200106ms step_avg:159.07ms step:1269/1480 train_time:200283ms step_avg:159.08ms step:1270/1480 train_time:200452ms step_avg:159.09ms step:1271/1480 train_time:200621ms step_avg:159.10ms step:1272/1480 train_time:200786ms step_avg:159.10ms step:1273/1480 train_time:200958ms step_avg:159.11ms step:1274/1480 train_time:201129ms step_avg:159.12ms step:1275/1480 train_time:201298ms step_avg:159.13ms step:1276/1480 train_time:201463ms step_avg:159.13ms step:1277/1480 train_time:201634ms step_avg:159.14ms step:1278/1480 train_time:201803ms step_avg:159.15ms step:1279/1480 train_time:201974ms step_avg:159.16ms step:1280/1480 train_time:202157ms step_avg:159.18ms step:1281/1480 train_time:202325ms step_avg:159.19ms step:1282/1480 train_time:202493ms step_avg:159.19ms step:1283/1480 train_time:202664ms step_avg:159.20ms step:1284/1480 train_time:202835ms step_avg:159.21ms step:1285/1480 train_time:203004ms step_avg:159.22ms step:1286/1480 train_time:203176ms step_avg:159.23ms step:1287/1480 train_time:203346ms step_avg:159.24ms step:1288/1480 train_time:203518ms step_avg:159.25ms step:1289/1480 train_time:203701ms 
step_avg:159.27ms step:1290/1480 train_time:203882ms step_avg:159.28ms step:1291/1480 train_time:204055ms step_avg:159.29ms step:1292/1480 train_time:204230ms step_avg:159.31ms step:1293/1480 train_time:204407ms step_avg:159.32ms step:1294/1480 train_time:204580ms step_avg:159.33ms step:1295/1480 train_time:204751ms step_avg:159.34ms step:1296/1480 train_time:204924ms step_avg:159.35ms step:1297/1480 train_time:205096ms step_avg:159.36ms step:1298/1480 train_time:205266ms step_avg:159.37ms step:1299/1480 train_time:205437ms step_avg:159.38ms step:1300/1480 train_time:205604ms step_avg:159.38ms step:1301/1480 train_time:205774ms step_avg:159.39ms step:1302/1480 train_time:205950ms step_avg:159.40ms step:1303/1480 train_time:206127ms step_avg:159.42ms step:1304/1480 train_time:206301ms step_avg:159.43ms step:1305/1480 train_time:206470ms step_avg:159.44ms step:1306/1480 train_time:206644ms step_avg:159.45ms step:1307/1480 train_time:206812ms step_avg:159.45ms step:1308/1480 train_time:206982ms step_avg:159.46ms step:1309/1480 train_time:207154ms step_avg:159.47ms step:1310/1480 train_time:207321ms step_avg:159.48ms step:1311/1480 train_time:207490ms step_avg:159.49ms step:1312/1480 train_time:207664ms step_avg:159.50ms step:1313/1480 train_time:207833ms step_avg:159.50ms step:1314/1480 train_time:208006ms step_avg:159.51ms step:1315/1480 train_time:208178ms step_avg:159.52ms step:1316/1480 train_time:208343ms step_avg:159.53ms step:1317/1480 train_time:208513ms step_avg:159.54ms step:1318/1480 train_time:208693ms step_avg:159.55ms step:1319/1480 train_time:208867ms step_avg:159.56ms step:1320/1480 train_time:209043ms step_avg:159.57ms step:1321/1480 train_time:209216ms step_avg:159.59ms step:1322/1480 train_time:209398ms step_avg:159.60ms step:1323/1480 train_time:209570ms step_avg:159.61ms step:1324/1480 train_time:209744ms step_avg:159.62ms step:1325/1480 train_time:209926ms step_avg:159.64ms step:1326/1480 train_time:210102ms step_avg:159.65ms step:1327/1480 
train_time:210271ms step_avg:159.66ms step:1328/1480 train_time:210442ms step_avg:159.67ms step:1329/1480 train_time:210639ms step_avg:159.70ms step:1330/1480 train_time:210819ms step_avg:159.71ms step:1331/1480 train_time:210989ms step_avg:159.72ms step:1332/1480 train_time:211162ms step_avg:159.73ms step:1333/1480 train_time:211338ms step_avg:159.74ms step:1334/1480 train_time:211509ms step_avg:159.75ms step:1335/1480 train_time:211679ms step_avg:159.76ms step:1336/1480 train_time:211863ms step_avg:159.78ms step:1337/1480 train_time:212039ms step_avg:159.79ms step:1338/1480 train_time:212212ms step_avg:159.80ms step:1339/1480 train_time:212385ms step_avg:159.81ms step:1340/1480 train_time:212558ms step_avg:159.82ms step:1341/1480 train_time:212725ms step_avg:159.82ms step:1342/1480 train_time:212899ms step_avg:159.83ms step:1343/1480 train_time:213069ms step_avg:159.84ms step:1344/1480 train_time:213242ms step_avg:159.85ms step:1345/1480 train_time:213419ms step_avg:159.86ms step:1346/1480 train_time:213589ms step_avg:159.87ms step:1347/1480 train_time:213760ms step_avg:159.88ms step:1348/1480 train_time:213929ms step_avg:159.89ms step:1349/1480 train_time:214100ms step_avg:159.90ms step:1350/1480 train_time:214275ms step_avg:159.91ms step:1351/1480 train_time:214445ms step_avg:159.91ms step:1352/1480 train_time:214615ms step_avg:159.92ms step:1353/1480 train_time:214791ms step_avg:159.93ms step:1354/1480 train_time:214963ms step_avg:159.94ms step:1355/1480 train_time:215132ms step_avg:159.95ms step:1356/1480 train_time:215306ms step_avg:159.96ms step:1357/1480 train_time:215480ms step_avg:159.97ms step:1358/1480 train_time:215653ms step_avg:159.98ms step:1359/1480 train_time:215825ms step_avg:159.99ms step:1360/1480 train_time:215999ms step_avg:160.00ms step:1361/1480 train_time:216176ms step_avg:160.01ms step:1362/1480 train_time:216351ms step_avg:160.02ms step:1363/1480 train_time:216529ms step_avg:160.04ms step:1364/1480 train_time:216699ms step_avg:160.04ms 
step:1365/1480 train_time:216865ms step_avg:160.05ms step:1366/1480 train_time:217037ms step_avg:160.06ms step:1367/1480 train_time:217208ms step_avg:160.06ms step:1368/1480 train_time:217382ms step_avg:160.08ms step:1369/1480 train_time:217564ms step_avg:160.09ms step:1370/1480 train_time:217741ms step_avg:160.10ms step:1371/1480 train_time:217913ms step_avg:160.11ms step:1372/1480 train_time:218090ms step_avg:160.12ms step:1373/1480 train_time:218259ms step_avg:160.13ms step:1374/1480 train_time:218435ms step_avg:160.14ms step:1375/1480 train_time:218605ms step_avg:160.15ms step:1375/1480 val_loss:3.3012 train_time:218671ms step_avg:160.20ms step:1376/1480 train_time:218777ms step_avg:160.16ms step:1377/1480 train_time:218948ms step_avg:160.17ms step:1378/1480 train_time:219117ms step_avg:160.17ms step:1379/1480 train_time:219293ms step_avg:160.18ms step:1380/1480 train_time:219465ms step_avg:160.19ms step:1381/1480 train_time:219645ms step_avg:160.21ms step:1382/1480 train_time:219817ms step_avg:160.22ms step:1383/1480 train_time:219990ms step_avg:160.23ms step:1384/1480 train_time:220166ms step_avg:160.24ms step:1385/1480 train_time:220333ms step_avg:160.24ms step:1386/1480 train_time:220503ms step_avg:160.25ms step:1387/1480 train_time:220676ms step_avg:160.26ms step:1388/1480 train_time:220843ms step_avg:160.26ms step:1389/1480 train_time:221016ms step_avg:160.27ms step:1390/1480 train_time:221184ms step_avg:160.28ms step:1391/1480 train_time:221355ms step_avg:160.29ms step:1392/1480 train_time:221528ms step_avg:160.29ms step:1393/1480 train_time:221698ms step_avg:160.30ms step:1394/1480 train_time:221868ms step_avg:160.31ms step:1395/1480 train_time:222037ms step_avg:160.32ms step:1396/1480 train_time:222205ms step_avg:160.32ms step:1397/1480 train_time:222374ms step_avg:160.33ms step:1398/1480 train_time:222540ms step_avg:160.33ms step:1399/1480 train_time:222708ms step_avg:160.34ms step:1400/1480 train_time:222886ms step_avg:160.35ms step:1401/1480 
train_time:223053ms step_avg:160.35ms step:1402/1480 train_time:223226ms step_avg:160.36ms step:1403/1480 train_time:223401ms step_avg:160.37ms step:1404/1480 train_time:223573ms step_avg:160.38ms step:1405/1480 train_time:223745ms step_avg:160.39ms step:1406/1480 train_time:223921ms step_avg:160.40ms step:1407/1480 train_time:224091ms step_avg:160.41ms step:1408/1480 train_time:224261ms step_avg:160.42ms step:1409/1480 train_time:224443ms step_avg:160.43ms step:1410/1480 train_time:224614ms step_avg:160.44ms step:1411/1480 train_time:224781ms step_avg:160.44ms step:1412/1480 train_time:224952ms step_avg:160.45ms step:1413/1480 train_time:225121ms step_avg:160.46ms step:1414/1480 train_time:225293ms step_avg:160.46ms step:1415/1480 train_time:225466ms step_avg:160.47ms step:1416/1480 train_time:225654ms step_avg:160.49ms step:1417/1480 train_time:225829ms step_avg:160.50ms step:1418/1480 train_time:226000ms step_avg:160.51ms step:1419/1480 train_time:226175ms step_avg:160.52ms step:1420/1480 train_time:226350ms step_avg:160.53ms step:1421/1480 train_time:226522ms step_avg:160.54ms step:1422/1480 train_time:226696ms step_avg:160.55ms step:1423/1480 train_time:226865ms step_avg:160.56ms step:1424/1480 train_time:227042ms step_avg:160.57ms step:1425/1480 train_time:227221ms step_avg:160.58ms step:1426/1480 train_time:227393ms step_avg:160.59ms step:1427/1480 train_time:227568ms step_avg:160.60ms step:1428/1480 train_time:227738ms step_avg:160.60ms step:1429/1480 train_time:227905ms step_avg:160.61ms step:1430/1480 train_time:228079ms step_avg:160.62ms step:1431/1480 train_time:228253ms step_avg:160.63ms step:1432/1480 train_time:228429ms step_avg:160.64ms step:1433/1480 train_time:228606ms step_avg:160.65ms step:1434/1480 train_time:228786ms step_avg:160.66ms step:1435/1480 train_time:228960ms step_avg:160.67ms step:1436/1480 train_time:229135ms step_avg:160.68ms step:1437/1480 train_time:229304ms step_avg:160.69ms step:1438/1480 train_time:229475ms step_avg:160.70ms 
step:1439/1480 train_time:229648ms step_avg:160.71ms step:1440/1480 train_time:229819ms step_avg:160.71ms step:1441/1480 train_time:229993ms step_avg:160.72ms step:1442/1480 train_time:230168ms step_avg:160.73ms step:1443/1480 train_time:230359ms step_avg:160.75ms step:1444/1480 train_time:230530ms step_avg:160.76ms step:1445/1480 train_time:230702ms step_avg:160.77ms step:1446/1480 train_time:230878ms step_avg:160.78ms step:1447/1480 train_time:231057ms step_avg:160.79ms step:1448/1480 train_time:231230ms step_avg:160.80ms step:1449/1480 train_time:231402ms step_avg:160.81ms step:1450/1480 train_time:231576ms step_avg:160.82ms step:1451/1480 train_time:231747ms step_avg:160.82ms step:1452/1480 train_time:231919ms step_avg:160.83ms step:1453/1480 train_time:232088ms step_avg:160.84ms step:1454/1480 train_time:232260ms step_avg:160.85ms step:1455/1480 train_time:232438ms step_avg:160.86ms step:1456/1480 train_time:232611ms step_avg:160.87ms step:1457/1480 train_time:232783ms step_avg:160.87ms step:1458/1480 train_time:232956ms step_avg:160.88ms step:1459/1480 train_time:233133ms step_avg:160.89ms step:1460/1480 train_time:233304ms step_avg:160.90ms step:1461/1480 train_time:233479ms step_avg:160.91ms step:1462/1480 train_time:233650ms step_avg:160.92ms step:1463/1480 train_time:233827ms step_avg:160.93ms step:1464/1480 train_time:234001ms step_avg:160.94ms step:1465/1480 train_time:234174ms step_avg:160.94ms step:1466/1480 train_time:234343ms step_avg:160.95ms step:1467/1480 train_time:234519ms step_avg:160.96ms step:1468/1480 train_time:234691ms step_avg:160.97ms step:1469/1480 train_time:234864ms step_avg:160.98ms step:1470/1480 train_time:235042ms step_avg:160.99ms step:1471/1480 train_time:235229ms step_avg:161.01ms step:1472/1480 train_time:235408ms step_avg:161.02ms step:1473/1480 train_time:235579ms step_avg:161.02ms step:1474/1480 train_time:235757ms step_avg:161.04ms step:1475/1480 train_time:235937ms step_avg:161.05ms step:1476/1480 train_time:236110ms 
step_avg:161.06ms step:1477/1480 train_time:236295ms step_avg:161.07ms step:1478/1480 train_time:236476ms step_avg:161.09ms step:1479/1480 train_time:236650ms step_avg:161.10ms step:1480/1480 train_time:236824ms step_avg:161.11ms step:1480/1480 val_loss:3.2822 train_time:236896ms step_avg:161.15ms