import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight)."""
    normalized_shape = (x.size(-1),)
    return F.rms_norm(x, normalized_shape)


class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        weight = self.weight.to(x.dtype)
        return F.linear(x, weight)


class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        exponents = torch.arange(0, dim, 2) / dim
        self.register_buffer('inv_freq', (1 / base) ** exponents)
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # the sequence length is read from dim 1 and the rotated features live on dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # rebuild the tables only when the sequence length changes
            positions = torch.arange(seq_len, device=x.device)
            angles = torch.outer(positions, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = angles.cos()
            self.sin_cached = angles.sin()
        cos = self.cos_cached[None, :, None, :]
        sin = self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        first_half, second_half = x.chunk(2, dim=3)
        rotated_first = first_half * cos + second_half * sin
        rotated_second = first_half * (-sin) + second_half * cos
        return torch.cat((rotated_first, rotated_second), 3).type_as(x)


class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK-norm, rotary embeddings and a value residual, run through FlexAttention."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        head_dim = dim // n_head
        self.rotary = Rotary(head_dim)
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        per_head = (B, T, self.n_head, -1)
        q: torch.Tensor = self.c_q(x).view(*per_head)
        k: torch.Tensor = self.c_k(x).view(*per_head)
        v: torch.Tensor = self.c_v(x).view(*per_head)
        # mix this layer's values with the token value embedding
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        attended = flex_attention(
            q.transpose(1, 2),
            k.transpose(1, 2),
            v.transpose(1, 2),
            block_mask=block_mask,
        )
        # re-assemble all head outputs side by side
        merged = attended.transpose(1, 2).contiguous().view_as(x)
        return self.c_proj(merged)


class MLP(nn.Module):
    """Feed-forward block: expand 4x, squared ReLU, project back."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.c_fc(x)
        # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        hidden = F.relu(hidden).square()
        return self.c_proj(hidden)


class Block(nn.Module):
    """Transformer block whose input is a learned mix of the running stream `x` and the initial embedding `x0`."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size: int = 50304
    n_layer: int = 12
    n_head: int = 6 # head dim 128 suggested by @Grad62304977
    n_embd: int = 768
    lm_head_softcap: int = 30


class GPT(nn.Module):
    """GPT-2 style model with U-net skip connections, token value embeddings and a soft-capped LM head."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """Return the cross-entropy loss of the 1D token sequence `idx` against `target`."""
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token id 50256, used as a per-token document id
        docs = (idx == 50256).cumsum(0)
        docs_per_block = docs.reshape(-1, BLOCK_SIZE)
        docs_low = docs_per_block[:, 0].contiguous()
        docs_high = docs_per_block[:, -1].contiguous()

        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal, same document, inside the sliding window
            is_causal = q_idx >= kv_idx
            same_document = docs[q_idx] == docs[kv_idx]
            in_window = q_idx - kv_idx < sliding_window
            return is_causal & same_document & in_window

        S = len(idx)

        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the mask above: keep a (q block, kv block) pair
            # only if it is causal, the blocks' document ranges overlap, and the
            # block distance fits inside the window rounded up to whole blocks
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            is_causal = q_idx >= kv_idx
            docs_overlap = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_blocks = (sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE
            in_window = q_idx - kv_idx < window_blocks
            dense_mask = (is_causal & docs_overlap & in_window).to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort lists the kept kv blocks first, in index order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)

        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        blocks = self.transformer.h
        n_enc = self.num_encoder_layers
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        vi = self.transformer.vte(idx[None]).chunk(n_enc, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for layer in range(n_enc):
            x = blocks[layer](x, vi[layer], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for layer in range(self.num_decoder_layers):
            x = x + self.skip_weights[layer] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = blocks[n_enc + layer](x, vi[n_enc-1-layer], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        flat_logits = logits.view(-1, logits.size(-1))
        return F.cross_entropy(flat_logits, target.view(-1))

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Read only the 256-int32 header of a shard and return its claimed token count."""
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    ntok = int(header[2]) # number of tokens (claimed)
    return ntok


def _load_data_shard(file: Path, ntok: int):
    """Load `ntok` uint16 tokens from a shard into a pinned CPU tensor, skipping the header."""
    header_bytes = 256 * 4
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(header_bytes)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens


class DistributedDataLoader:
    """Cycles through token shards, giving each rank its own `T`-token slice of every global batch."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # -1 so that advance() lands on shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        shard = (self.current_shard + 1) % len(self.files)
        self.current_shard = shard
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[shard], self.ntoks[shard])

    def next_batch(self):
        batch_size = self.T * self.num_processes
        start = self.current_position
        # one extra token so the targets can be the inputs shifted by one
        buf = self.tokens[start:start+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin: str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin: str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size: int = 8 # batch size, in sequences, across all devices
    sequence_length: int = 64*1024 # sequence length, in tokens
    num_iterations: int = 1480 # number of iterations to run
    warmup_iters: int = 0
    cooldown_iters: int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay: float = 0
    # evaluation and logging hyperparams
    val_loss_every: int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every: int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # The log file lives directly under logs/, so that directory must exist
    # before it is opened; on a fresh checkout open() would otherwise raise
    # FileNotFoundError. The per-run logdir is only needed by the (disabled)
    # checkpoint save, so it is not created here.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """Append `s` to the run's log file on the master process only.

    Args:
        s: The text to log; a newline is appended when writing to the file.
        logonly: When False (the default) the text is also printed to stdout.

    Non-master processes do nothing.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop below runs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches each later one
# right after the backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 13:23:32 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 87W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 123W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 105W / 700W | 35MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 121W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI 
CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23027ms step_avg:nanms step:2/1480 train_time:23126ms step_avg:nanms step:3/1480 train_time:23265ms step_avg:nanms step:4/1480 train_time:23408ms step_avg:nanms step:5/1480 train_time:23549ms step_avg:nanms step:6/1480 train_time:23691ms step_avg:nanms step:7/1480 train_time:23832ms step_avg:nanms step:8/1480 train_time:23974ms step_avg:nanms step:9/1480 train_time:24119ms step_avg:nanms step:10/1480 train_time:24263ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:286ms step_avg:nanms step:13/1480 train_time:429ms step_avg:142.91ms step:14/1480 train_time:569ms step_avg:142.37ms step:15/1480 train_time:711ms step_avg:142.17ms step:16/1480 train_time:854ms step_avg:142.37ms step:17/1480 train_time:998ms step_avg:142.59ms step:18/1480 train_time:1142ms step_avg:142.80ms step:19/1480 train_time:1286ms step_avg:142.84ms step:20/1480 train_time:1428ms step_avg:142.79ms step:21/1480 train_time:1570ms step_avg:142.71ms step:22/1480 train_time:1711ms step_avg:142.60ms step:23/1480 train_time:1854ms step_avg:142.60ms step:24/1480 train_time:1996ms step_avg:142.58ms step:25/1480 train_time:2141ms step_avg:142.72ms step:26/1480 train_time:2286ms step_avg:142.85ms step:27/1480 train_time:2428ms step_avg:142.81ms step:28/1480 train_time:2570ms step_avg:142.80ms step:29/1480 train_time:2714ms 
step_avg:142.83ms step:30/1480 train_time:2858ms step_avg:142.90ms step:31/1480 train_time:3001ms step_avg:142.90ms step:32/1480 train_time:3145ms step_avg:142.97ms step:33/1480 train_time:3289ms step_avg:143.02ms step:34/1480 train_time:3431ms step_avg:142.95ms step:35/1480 train_time:3571ms step_avg:142.85ms step:36/1480 train_time:3713ms step_avg:142.82ms step:37/1480 train_time:3855ms step_avg:142.79ms step:38/1480 train_time:4000ms step_avg:142.87ms step:39/1480 train_time:4146ms step_avg:142.95ms step:40/1480 train_time:4289ms step_avg:142.97ms step:41/1480 train_time:4431ms step_avg:142.94ms step:42/1480 train_time:4572ms step_avg:142.88ms step:43/1480 train_time:4714ms step_avg:142.84ms step:44/1480 train_time:4856ms step_avg:142.82ms step:45/1480 train_time:5000ms step_avg:142.85ms step:46/1480 train_time:5144ms step_avg:142.89ms step:47/1480 train_time:5287ms step_avg:142.89ms step:48/1480 train_time:5430ms step_avg:142.89ms step:49/1480 train_time:5571ms step_avg:142.84ms step:50/1480 train_time:5712ms step_avg:142.80ms step:51/1480 train_time:5854ms step_avg:142.78ms step:52/1480 train_time:5997ms step_avg:142.78ms step:53/1480 train_time:6140ms step_avg:142.80ms step:54/1480 train_time:6285ms step_avg:142.83ms step:55/1480 train_time:6429ms step_avg:142.87ms step:56/1480 train_time:6570ms step_avg:142.84ms step:57/1480 train_time:6712ms step_avg:142.81ms step:58/1480 train_time:6854ms step_avg:142.80ms step:59/1480 train_time:6996ms step_avg:142.77ms step:60/1480 train_time:7138ms step_avg:142.77ms step:61/1480 train_time:7282ms step_avg:142.79ms step:62/1480 train_time:7426ms step_avg:142.80ms step:63/1480 train_time:7569ms step_avg:142.81ms step:64/1480 train_time:7712ms step_avg:142.81ms step:65/1480 train_time:7853ms step_avg:142.79ms step:66/1480 train_time:7996ms step_avg:142.78ms step:67/1480 train_time:8139ms step_avg:142.79ms step:68/1480 train_time:8283ms step_avg:142.80ms step:69/1480 train_time:8426ms step_avg:142.82ms step:70/1480 
train_time:8568ms step_avg:142.80ms step:71/1480 train_time:8710ms step_avg:142.78ms step:72/1480 train_time:8852ms step_avg:142.77ms step:73/1480 train_time:8993ms step_avg:142.75ms step:74/1480 train_time:9136ms step_avg:142.75ms step:75/1480 train_time:9279ms step_avg:142.75ms step:76/1480 train_time:9423ms step_avg:142.78ms step:77/1480 train_time:9567ms step_avg:142.79ms step:78/1480 train_time:9710ms step_avg:142.79ms step:79/1480 train_time:9852ms step_avg:142.79ms step:80/1480 train_time:9994ms step_avg:142.77ms step:81/1480 train_time:10135ms step_avg:142.75ms step:82/1480 train_time:10277ms step_avg:142.74ms step:83/1480 train_time:10422ms step_avg:142.77ms step:84/1480 train_time:10565ms step_avg:142.77ms step:85/1480 train_time:10708ms step_avg:142.77ms step:86/1480 train_time:10851ms step_avg:142.78ms step:87/1480 train_time:10994ms step_avg:142.78ms step:88/1480 train_time:11136ms step_avg:142.77ms step:89/1480 train_time:11278ms step_avg:142.76ms step:90/1480 train_time:11421ms step_avg:142.77ms step:91/1480 train_time:11566ms step_avg:142.79ms step:92/1480 train_time:11709ms step_avg:142.80ms step:93/1480 train_time:11852ms step_avg:142.80ms step:94/1480 train_time:11994ms step_avg:142.78ms step:95/1480 train_time:12134ms step_avg:142.76ms step:96/1480 train_time:12275ms step_avg:142.74ms step:97/1480 train_time:12418ms step_avg:142.73ms step:98/1480 train_time:12561ms step_avg:142.74ms step:99/1480 train_time:12705ms step_avg:142.75ms step:100/1480 train_time:12848ms step_avg:142.76ms step:101/1480 train_time:12991ms step_avg:142.76ms step:102/1480 train_time:13133ms step_avg:142.75ms step:103/1480 train_time:13273ms step_avg:142.72ms step:104/1480 train_time:13415ms step_avg:142.72ms step:105/1480 train_time:13557ms step_avg:142.71ms step:106/1480 train_time:13701ms step_avg:142.72ms step:107/1480 train_time:13844ms step_avg:142.72ms step:108/1480 train_time:13987ms step_avg:142.72ms step:109/1480 train_time:14129ms step_avg:142.72ms step:110/1480 
train_time:14272ms step_avg:142.72ms step:111/1480 train_time:14417ms step_avg:142.74ms step:112/1480 train_time:14566ms step_avg:142.80ms step:113/1480 train_time:14713ms step_avg:142.84ms step:114/1480 train_time:14860ms step_avg:142.88ms step:115/1480 train_time:15008ms step_avg:142.93ms step:116/1480 train_time:15155ms step_avg:142.97ms step:117/1480 train_time:15301ms step_avg:143.00ms step:118/1480 train_time:15449ms step_avg:143.04ms step:119/1480 train_time:15595ms step_avg:143.07ms step:120/1480 train_time:15743ms step_avg:143.12ms step:121/1480 train_time:15890ms step_avg:143.16ms step:122/1480 train_time:16037ms step_avg:143.19ms step:123/1480 train_time:16184ms step_avg:143.22ms step:124/1480 train_time:16331ms step_avg:143.26ms step:125/1480 train_time:16478ms step_avg:143.28ms step:125/1480 val_loss:4.4197 train_time:16535ms step_avg:143.78ms step:126/1480 train_time:16630ms step_avg:143.36ms step:127/1480 train_time:16779ms step_avg:143.41ms step:128/1480 train_time:16926ms step_avg:143.44ms step:129/1480 train_time:17072ms step_avg:143.46ms step:130/1480 train_time:17217ms step_avg:143.47ms step:131/1480 train_time:17364ms step_avg:143.50ms step:132/1480 train_time:17511ms step_avg:143.53ms step:133/1480 train_time:17660ms step_avg:143.58ms step:134/1480 train_time:17809ms step_avg:143.62ms step:135/1480 train_time:17955ms step_avg:143.64ms step:136/1480 train_time:18103ms step_avg:143.67ms step:137/1480 train_time:18249ms step_avg:143.69ms step:138/1480 train_time:18396ms step_avg:143.72ms step:139/1480 train_time:18543ms step_avg:143.74ms step:140/1480 train_time:18691ms step_avg:143.78ms step:141/1480 train_time:18840ms step_avg:143.81ms step:142/1480 train_time:18986ms step_avg:143.84ms step:143/1480 train_time:19132ms step_avg:143.85ms step:144/1480 train_time:19278ms step_avg:143.87ms step:145/1480 train_time:19425ms step_avg:143.89ms step:146/1480 train_time:19572ms step_avg:143.91ms step:147/1480 train_time:19718ms step_avg:143.93ms 
step:148/1480 train_time:19866ms step_avg:143.96ms step:149/1480 train_time:20012ms step_avg:143.97ms step:150/1480 train_time:20159ms step_avg:143.99ms step:151/1480 train_time:20307ms step_avg:144.02ms step:152/1480 train_time:20453ms step_avg:144.04ms step:153/1480 train_time:20601ms step_avg:144.07ms step:154/1480 train_time:20749ms step_avg:144.09ms step:155/1480 train_time:20895ms step_avg:144.10ms step:156/1480 train_time:21043ms step_avg:144.13ms step:157/1480 train_time:21189ms step_avg:144.14ms step:158/1480 train_time:21334ms step_avg:144.15ms step:159/1480 train_time:21482ms step_avg:144.18ms step:160/1480 train_time:21629ms step_avg:144.19ms step:161/1480 train_time:21775ms step_avg:144.21ms step:162/1480 train_time:21922ms step_avg:144.23ms step:163/1480 train_time:22070ms step_avg:144.25ms step:164/1480 train_time:22215ms step_avg:144.25ms step:165/1480 train_time:22362ms step_avg:144.27ms step:166/1480 train_time:22509ms step_avg:144.29ms step:167/1480 train_time:22656ms step_avg:144.30ms step:168/1480 train_time:22803ms step_avg:144.32ms step:169/1480 train_time:22950ms step_avg:144.34ms step:170/1480 train_time:23096ms step_avg:144.35ms step:171/1480 train_time:23244ms step_avg:144.37ms step:172/1480 train_time:23391ms step_avg:144.39ms step:173/1480 train_time:23537ms step_avg:144.40ms step:174/1480 train_time:23684ms step_avg:144.42ms step:175/1480 train_time:23830ms step_avg:144.42ms step:176/1480 train_time:23976ms step_avg:144.43ms step:177/1480 train_time:24124ms step_avg:144.45ms step:178/1480 train_time:24271ms step_avg:144.47ms step:179/1480 train_time:24417ms step_avg:144.48ms step:180/1480 train_time:24564ms step_avg:144.50ms step:181/1480 train_time:24711ms step_avg:144.51ms step:182/1480 train_time:24858ms step_avg:144.52ms step:183/1480 train_time:25006ms step_avg:144.54ms step:184/1480 train_time:25151ms step_avg:144.55ms step:185/1480 train_time:25298ms step_avg:144.56ms step:186/1480 train_time:25444ms step_avg:144.57ms 
step:187/1480 train_time:25592ms step_avg:144.59ms step:188/1480 train_time:25739ms step_avg:144.60ms step:189/1480 train_time:25886ms step_avg:144.61ms step:190/1480 train_time:26032ms step_avg:144.62ms step:191/1480 train_time:26179ms step_avg:144.64ms step:192/1480 train_time:26327ms step_avg:144.65ms step:193/1480 train_time:26473ms step_avg:144.66ms step:194/1480 train_time:26619ms step_avg:144.67ms step:195/1480 train_time:26767ms step_avg:144.69ms step:196/1480 train_time:26912ms step_avg:144.69ms step:197/1480 train_time:27060ms step_avg:144.70ms step:198/1480 train_time:27207ms step_avg:144.72ms step:199/1480 train_time:27352ms step_avg:144.72ms step:200/1480 train_time:27500ms step_avg:144.73ms step:201/1480 train_time:27645ms step_avg:144.74ms step:202/1480 train_time:27792ms step_avg:144.75ms step:203/1480 train_time:27939ms step_avg:144.76ms step:204/1480 train_time:28086ms step_avg:144.78ms step:205/1480 train_time:28232ms step_avg:144.78ms step:206/1480 train_time:28379ms step_avg:144.79ms step:207/1480 train_time:28526ms step_avg:144.80ms step:208/1480 train_time:28673ms step_avg:144.81ms step:209/1480 train_time:28819ms step_avg:144.82ms step:210/1480 train_time:28967ms step_avg:144.84ms step:211/1480 train_time:29113ms step_avg:144.84ms step:212/1480 train_time:29261ms step_avg:144.86ms step:213/1480 train_time:29408ms step_avg:144.87ms step:214/1480 train_time:29555ms step_avg:144.88ms step:215/1480 train_time:29702ms step_avg:144.89ms step:216/1480 train_time:29849ms step_avg:144.90ms step:217/1480 train_time:29995ms step_avg:144.90ms step:218/1480 train_time:30142ms step_avg:144.92ms step:219/1480 train_time:30290ms step_avg:144.93ms step:220/1480 train_time:30436ms step_avg:144.93ms step:221/1480 train_time:30585ms step_avg:144.95ms step:222/1480 train_time:30735ms step_avg:144.98ms step:223/1480 train_time:30886ms step_avg:145.01ms step:224/1480 train_time:31035ms step_avg:145.02ms step:225/1480 train_time:31187ms step_avg:145.05ms 
step:226/1480 train_time:31337ms step_avg:145.08ms step:227/1480 train_time:31488ms step_avg:145.11ms step:228/1480 train_time:31637ms step_avg:145.12ms step:229/1480 train_time:31787ms step_avg:145.15ms step:230/1480 train_time:31937ms step_avg:145.17ms step:231/1480 train_time:32087ms step_avg:145.19ms step:232/1480 train_time:32237ms step_avg:145.21ms step:233/1480 train_time:32388ms step_avg:145.24ms step:234/1480 train_time:32539ms step_avg:145.26ms step:235/1480 train_time:32689ms step_avg:145.29ms step:236/1480 train_time:32839ms step_avg:145.31ms step:237/1480 train_time:32990ms step_avg:145.33ms step:238/1480 train_time:33141ms step_avg:145.36ms step:239/1480 train_time:33291ms step_avg:145.37ms step:240/1480 train_time:33441ms step_avg:145.39ms step:241/1480 train_time:33591ms step_avg:145.42ms step:242/1480 train_time:33742ms step_avg:145.44ms step:243/1480 train_time:33892ms step_avg:145.46ms step:244/1480 train_time:34042ms step_avg:145.48ms step:245/1480 train_time:34193ms step_avg:145.50ms step:246/1480 train_time:34343ms step_avg:145.52ms step:247/1480 train_time:34494ms step_avg:145.54ms step:248/1480 train_time:34647ms step_avg:145.57ms step:249/1480 train_time:34798ms step_avg:145.60ms step:250/1480 train_time:34949ms step_avg:145.62ms step:250/1480 val_loss:4.0031 train_time:35007ms step_avg:145.86ms step:251/1480 train_time:35104ms step_avg:145.66ms step:252/1480 train_time:35254ms step_avg:145.68ms step:253/1480 train_time:35405ms step_avg:145.70ms step:254/1480 train_time:35553ms step_avg:145.71ms step:255/1480 train_time:35702ms step_avg:145.72ms step:256/1480 train_time:35851ms step_avg:145.74ms step:257/1480 train_time:36003ms step_avg:145.76ms step:258/1480 train_time:36156ms step_avg:145.79ms step:259/1480 train_time:36307ms step_avg:145.81ms step:260/1480 train_time:36457ms step_avg:145.83ms step:261/1480 train_time:36607ms step_avg:145.84ms step:262/1480 train_time:36756ms step_avg:145.86ms step:263/1480 train_time:36906ms 
step_avg:145.87ms step:264/1480 train_time:37057ms step_avg:145.89ms step:265/1480 train_time:37209ms step_avg:145.92ms step:266/1480 train_time:37360ms step_avg:145.94ms step:267/1480 train_time:37510ms step_avg:145.95ms step:268/1480 train_time:37660ms step_avg:145.97ms step:269/1480 train_time:37810ms step_avg:145.99ms step:270/1480 train_time:37961ms step_avg:146.00ms step:271/1480 train_time:38112ms step_avg:146.02ms step:272/1480 train_time:38263ms step_avg:146.04ms step:273/1480 train_time:38414ms step_avg:146.06ms step:274/1480 train_time:38564ms step_avg:146.08ms step:275/1480 train_time:38717ms step_avg:146.10ms step:276/1480 train_time:38867ms step_avg:146.12ms step:277/1480 train_time:39017ms step_avg:146.13ms step:278/1480 train_time:39167ms step_avg:146.14ms step:279/1480 train_time:39318ms step_avg:146.16ms step:280/1480 train_time:39468ms step_avg:146.18ms step:281/1480 train_time:39619ms step_avg:146.19ms step:282/1480 train_time:39771ms step_avg:146.22ms step:283/1480 train_time:39922ms step_avg:146.24ms step:284/1480 train_time:40072ms step_avg:146.25ms step:285/1480 train_time:40223ms step_avg:146.27ms step:286/1480 train_time:40373ms step_avg:146.28ms step:287/1480 train_time:40523ms step_avg:146.29ms step:288/1480 train_time:40674ms step_avg:146.31ms step:289/1480 train_time:40826ms step_avg:146.33ms step:290/1480 train_time:40978ms step_avg:146.35ms step:291/1480 train_time:41128ms step_avg:146.36ms step:292/1480 train_time:41278ms step_avg:146.38ms step:293/1480 train_time:41428ms step_avg:146.39ms step:294/1480 train_time:41578ms step_avg:146.40ms step:295/1480 train_time:41729ms step_avg:146.42ms step:296/1480 train_time:41881ms step_avg:146.44ms step:297/1480 train_time:42032ms step_avg:146.45ms step:298/1480 train_time:42183ms step_avg:146.47ms step:299/1480 train_time:42334ms step_avg:146.48ms step:300/1480 train_time:42485ms step_avg:146.50ms step:301/1480 train_time:42634ms step_avg:146.51ms step:302/1480 train_time:42784ms 
step_avg:146.52ms step:303/1480 train_time:42935ms step_avg:146.53ms step:304/1480 train_time:43086ms step_avg:146.55ms step:305/1480 train_time:43235ms step_avg:146.56ms step:306/1480 train_time:43386ms step_avg:146.57ms step:307/1480 train_time:43536ms step_avg:146.59ms step:308/1480 train_time:43687ms step_avg:146.60ms step:309/1480 train_time:43837ms step_avg:146.61ms step:310/1480 train_time:43988ms step_avg:146.63ms step:311/1480 train_time:44139ms step_avg:146.64ms step:312/1480 train_time:44290ms step_avg:146.66ms step:313/1480 train_time:44441ms step_avg:146.67ms step:314/1480 train_time:44590ms step_avg:146.68ms step:315/1480 train_time:44739ms step_avg:146.69ms step:316/1480 train_time:44890ms step_avg:146.70ms step:317/1480 train_time:45040ms step_avg:146.71ms step:318/1480 train_time:45191ms step_avg:146.72ms step:319/1480 train_time:45342ms step_avg:146.74ms step:320/1480 train_time:45492ms step_avg:146.75ms step:321/1480 train_time:45643ms step_avg:146.76ms step:322/1480 train_time:45794ms step_avg:146.78ms step:323/1480 train_time:45945ms step_avg:146.79ms step:324/1480 train_time:46098ms step_avg:146.81ms step:325/1480 train_time:46248ms step_avg:146.82ms step:326/1480 train_time:46398ms step_avg:146.83ms step:327/1480 train_time:46549ms step_avg:146.84ms step:328/1480 train_time:46700ms step_avg:146.85ms step:329/1480 train_time:46850ms step_avg:146.86ms step:330/1480 train_time:47002ms step_avg:146.88ms step:331/1480 train_time:47157ms step_avg:146.91ms step:332/1480 train_time:47311ms step_avg:146.93ms step:333/1480 train_time:47465ms step_avg:146.95ms step:334/1480 train_time:47619ms step_avg:146.97ms step:335/1480 train_time:47772ms step_avg:146.99ms step:336/1480 train_time:47926ms step_avg:147.01ms step:337/1480 train_time:48079ms step_avg:147.03ms step:338/1480 train_time:48233ms step_avg:147.05ms step:339/1480 train_time:48387ms step_avg:147.07ms step:340/1480 train_time:48543ms step_avg:147.10ms step:341/1480 train_time:48698ms 
step_avg:147.12ms step:342/1480 train_time:48852ms step_avg:147.14ms step:343/1480 train_time:49007ms step_avg:147.17ms step:344/1480 train_time:49161ms step_avg:147.19ms step:345/1480 train_time:49316ms step_avg:147.21ms step:346/1480 train_time:49470ms step_avg:147.23ms step:347/1480 train_time:49624ms step_avg:147.25ms step:348/1480 train_time:49778ms step_avg:147.27ms step:349/1480 train_time:49931ms step_avg:147.29ms step:350/1480 train_time:50085ms step_avg:147.31ms step:351/1480 train_time:50240ms step_avg:147.33ms step:352/1480 train_time:50394ms step_avg:147.35ms step:353/1480 train_time:50548ms step_avg:147.37ms step:354/1480 train_time:50700ms step_avg:147.39ms step:355/1480 train_time:50856ms step_avg:147.41ms step:356/1480 train_time:51010ms step_avg:147.43ms step:357/1480 train_time:51164ms step_avg:147.45ms step:358/1480 train_time:51318ms step_avg:147.47ms step:359/1480 train_time:51471ms step_avg:147.48ms step:360/1480 train_time:51627ms step_avg:147.51ms step:361/1480 train_time:51783ms step_avg:147.53ms step:362/1480 train_time:51938ms step_avg:147.55ms step:363/1480 train_time:52091ms step_avg:147.57ms step:364/1480 train_time:52244ms step_avg:147.58ms step:365/1480 train_time:52398ms step_avg:147.60ms step:366/1480 train_time:52550ms step_avg:147.61ms step:367/1480 train_time:52704ms step_avg:147.63ms step:368/1480 train_time:52856ms step_avg:147.64ms step:369/1480 train_time:53010ms step_avg:147.66ms step:370/1480 train_time:53164ms step_avg:147.68ms step:371/1480 train_time:53320ms step_avg:147.70ms step:372/1480 train_time:53473ms step_avg:147.72ms step:373/1480 train_time:53627ms step_avg:147.73ms step:374/1480 train_time:53780ms step_avg:147.75ms step:375/1480 train_time:53934ms step_avg:147.76ms step:375/1480 val_loss:3.8185 train_time:53995ms step_avg:147.93ms step:376/1480 train_time:54092ms step_avg:147.79ms step:377/1480 train_time:54246ms step_avg:147.81ms step:378/1480 train_time:54399ms step_avg:147.82ms step:379/1480 
train_time:54552ms step_avg:147.84ms step:380/1480 train_time:54705ms step_avg:147.85ms step:381/1480 train_time:54857ms step_avg:147.86ms step:382/1480 train_time:55011ms step_avg:147.88ms step:383/1480 train_time:55167ms step_avg:147.90ms step:384/1480 train_time:55321ms step_avg:147.92ms step:385/1480 train_time:55475ms step_avg:147.93ms step:386/1480 train_time:55628ms step_avg:147.95ms step:387/1480 train_time:55782ms step_avg:147.96ms step:388/1480 train_time:55935ms step_avg:147.98ms step:389/1480 train_time:56091ms step_avg:148.00ms step:390/1480 train_time:56246ms step_avg:148.02ms step:391/1480 train_time:56402ms step_avg:148.04ms step:392/1480 train_time:56554ms step_avg:148.05ms step:393/1480 train_time:56706ms step_avg:148.06ms step:394/1480 train_time:56860ms step_avg:148.07ms step:395/1480 train_time:57013ms step_avg:148.09ms step:396/1480 train_time:57167ms step_avg:148.10ms step:397/1480 train_time:57322ms step_avg:148.12ms step:398/1480 train_time:57476ms step_avg:148.13ms step:399/1480 train_time:57630ms step_avg:148.15ms step:400/1480 train_time:57784ms step_avg:148.16ms step:401/1480 train_time:57939ms step_avg:148.18ms step:402/1480 train_time:58093ms step_avg:148.20ms step:403/1480 train_time:58246ms step_avg:148.21ms step:404/1480 train_time:58400ms step_avg:148.22ms step:405/1480 train_time:58554ms step_avg:148.24ms step:406/1480 train_time:58708ms step_avg:148.25ms step:407/1480 train_time:58861ms step_avg:148.26ms step:408/1480 train_time:59013ms step_avg:148.28ms step:409/1480 train_time:59168ms step_avg:148.29ms step:410/1480 train_time:59324ms step_avg:148.31ms step:411/1480 train_time:59476ms step_avg:148.32ms step:412/1480 train_time:59631ms step_avg:148.34ms step:413/1480 train_time:59784ms step_avg:148.35ms step:414/1480 train_time:59940ms step_avg:148.37ms step:415/1480 train_time:60093ms step_avg:148.38ms step:416/1480 train_time:60246ms step_avg:148.39ms step:417/1480 train_time:60400ms step_avg:148.40ms step:418/1480 
train_time:60554ms step_avg:148.42ms step:419/1480 train_time:60707ms step_avg:148.43ms step:420/1480 train_time:60860ms step_avg:148.44ms step:421/1480 train_time:61014ms step_avg:148.45ms step:422/1480 train_time:61167ms step_avg:148.46ms step:423/1480 train_time:61320ms step_avg:148.47ms step:424/1480 train_time:61473ms step_avg:148.49ms step:425/1480 train_time:61629ms step_avg:148.50ms step:426/1480 train_time:61784ms step_avg:148.52ms step:427/1480 train_time:61937ms step_avg:148.53ms step:428/1480 train_time:62090ms step_avg:148.54ms step:429/1480 train_time:62244ms step_avg:148.55ms step:430/1480 train_time:62398ms step_avg:148.57ms step:431/1480 train_time:62551ms step_avg:148.58ms step:432/1480 train_time:62705ms step_avg:148.59ms step:433/1480 train_time:62857ms step_avg:148.60ms step:434/1480 train_time:63011ms step_avg:148.61ms step:435/1480 train_time:63166ms step_avg:148.63ms step:436/1480 train_time:63320ms step_avg:148.64ms step:437/1480 train_time:63474ms step_avg:148.65ms step:438/1480 train_time:63628ms step_avg:148.66ms step:439/1480 train_time:63782ms step_avg:148.68ms step:440/1480 train_time:63937ms step_avg:148.69ms step:441/1480 train_time:64094ms step_avg:148.71ms step:442/1480 train_time:64251ms step_avg:148.73ms step:443/1480 train_time:64409ms step_avg:148.75ms step:444/1480 train_time:64565ms step_avg:148.77ms step:445/1480 train_time:64722ms step_avg:148.79ms step:446/1480 train_time:64877ms step_avg:148.80ms step:447/1480 train_time:65032ms step_avg:148.82ms step:448/1480 train_time:65188ms step_avg:148.83ms step:449/1480 train_time:65347ms step_avg:148.86ms step:450/1480 train_time:65507ms step_avg:148.88ms step:451/1480 train_time:65664ms step_avg:148.90ms step:452/1480 train_time:65822ms step_avg:148.92ms step:453/1480 train_time:65977ms step_avg:148.93ms step:454/1480 train_time:66133ms step_avg:148.95ms step:455/1480 train_time:66289ms step_avg:148.96ms step:456/1480 train_time:66446ms step_avg:148.98ms step:457/1480 
train_time:66603ms step_avg:149.00ms step:458/1480 train_time:66760ms step_avg:149.02ms step:459/1480 train_time:66917ms step_avg:149.03ms step:460/1480 train_time:67073ms step_avg:149.05ms step:461/1480 train_time:67231ms step_avg:149.07ms step:462/1480 train_time:67388ms step_avg:149.09ms step:463/1480 train_time:67546ms step_avg:149.11ms step:464/1480 train_time:67704ms step_avg:149.13ms step:465/1480 train_time:67860ms step_avg:149.14ms step:466/1480 train_time:68016ms step_avg:149.16ms step:467/1480 train_time:68173ms step_avg:149.17ms step:468/1480 train_time:68329ms step_avg:149.19ms step:469/1480 train_time:68486ms step_avg:149.21ms step:470/1480 train_time:68643ms step_avg:149.22ms step:471/1480 train_time:68800ms step_avg:149.24ms step:472/1480 train_time:68956ms step_avg:149.26ms step:473/1480 train_time:69113ms step_avg:149.27ms step:474/1480 train_time:69269ms step_avg:149.29ms step:475/1480 train_time:69426ms step_avg:149.30ms step:476/1480 train_time:69583ms step_avg:149.32ms step:477/1480 train_time:69742ms step_avg:149.34ms step:478/1480 train_time:69899ms step_avg:149.36ms step:479/1480 train_time:70055ms step_avg:149.37ms step:480/1480 train_time:70214ms step_avg:149.39ms step:481/1480 train_time:70370ms step_avg:149.40ms step:482/1480 train_time:70528ms step_avg:149.42ms step:483/1480 train_time:70686ms step_avg:149.44ms step:484/1480 train_time:70843ms step_avg:149.46ms step:485/1480 train_time:71001ms step_avg:149.48ms step:486/1480 train_time:71158ms step_avg:149.49ms step:487/1480 train_time:71315ms step_avg:149.51ms step:488/1480 train_time:71471ms step_avg:149.52ms step:489/1480 train_time:71627ms step_avg:149.53ms step:490/1480 train_time:71784ms step_avg:149.55ms step:491/1480 train_time:71942ms step_avg:149.57ms step:492/1480 train_time:72099ms step_avg:149.58ms step:493/1480 train_time:72255ms step_avg:149.60ms step:494/1480 train_time:72411ms step_avg:149.61ms step:495/1480 train_time:72568ms step_avg:149.63ms step:496/1480 
train_time:72727ms step_avg:149.64ms step:497/1480 train_time:72886ms step_avg:149.66ms step:498/1480 train_time:73043ms step_avg:149.68ms step:499/1480 train_time:73202ms step_avg:149.70ms step:500/1480 train_time:73360ms step_avg:149.71ms step:500/1480 val_loss:3.6931 train_time:73421ms step_avg:149.84ms step:501/1480 train_time:73521ms step_avg:149.74ms step:502/1480 train_time:73679ms step_avg:149.75ms step:503/1480 train_time:73836ms step_avg:149.77ms step:504/1480 train_time:73993ms step_avg:149.78ms step:505/1480 train_time:74148ms step_avg:149.79ms step:506/1480 train_time:74304ms step_avg:149.81ms step:507/1480 train_time:74459ms step_avg:149.82ms step:508/1480 train_time:74619ms step_avg:149.84ms step:509/1480 train_time:74777ms step_avg:149.85ms step:510/1480 train_time:74934ms step_avg:149.87ms step:511/1480 train_time:75092ms step_avg:149.88ms step:512/1480 train_time:75250ms step_avg:149.90ms step:513/1480 train_time:75406ms step_avg:149.91ms step:514/1480 train_time:75563ms step_avg:149.93ms step:515/1480 train_time:75721ms step_avg:149.94ms step:516/1480 train_time:75879ms step_avg:149.96ms step:517/1480 train_time:76037ms step_avg:149.97ms step:518/1480 train_time:76196ms step_avg:149.99ms step:519/1480 train_time:76354ms step_avg:150.01ms step:520/1480 train_time:76513ms step_avg:150.03ms step:521/1480 train_time:76671ms step_avg:150.04ms step:522/1480 train_time:76827ms step_avg:150.05ms step:523/1480 train_time:76984ms step_avg:150.07ms step:524/1480 train_time:77140ms step_avg:150.08ms step:525/1480 train_time:77296ms step_avg:150.09ms step:526/1480 train_time:77453ms step_avg:150.10ms step:527/1480 train_time:77610ms step_avg:150.12ms step:528/1480 train_time:77766ms step_avg:150.13ms step:529/1480 train_time:77922ms step_avg:150.14ms step:530/1480 train_time:78079ms step_avg:150.15ms step:531/1480 train_time:78238ms step_avg:150.17ms step:532/1480 train_time:78396ms step_avg:150.18ms step:533/1480 train_time:78554ms step_avg:150.20ms 
step:534/1480 train_time:78711ms step_avg:150.21ms step:535/1480 train_time:78868ms step_avg:150.22ms step:536/1480 train_time:79025ms step_avg:150.24ms step:537/1480 train_time:79182ms step_avg:150.25ms step:538/1480 train_time:79339ms step_avg:150.26ms step:539/1480 train_time:79497ms step_avg:150.28ms step:540/1480 train_time:79655ms step_avg:150.29ms step:541/1480 train_time:79812ms step_avg:150.30ms step:542/1480 train_time:79969ms step_avg:150.32ms step:543/1480 train_time:80124ms step_avg:150.33ms step:544/1480 train_time:80281ms step_avg:150.34ms step:545/1480 train_time:80437ms step_avg:150.35ms step:546/1480 train_time:80596ms step_avg:150.37ms step:547/1480 train_time:80754ms step_avg:150.38ms step:548/1480 train_time:80911ms step_avg:150.39ms step:549/1480 train_time:81066ms step_avg:150.40ms step:550/1480 train_time:81223ms step_avg:150.41ms step:551/1480 train_time:81380ms step_avg:150.43ms step:552/1480 train_time:81539ms step_avg:150.44ms step:553/1480 train_time:81699ms step_avg:150.46ms step:554/1480 train_time:81859ms step_avg:150.48ms step:555/1480 train_time:82021ms step_avg:150.50ms step:556/1480 train_time:82180ms step_avg:150.51ms step:557/1480 train_time:82341ms step_avg:150.53ms step:558/1480 train_time:82500ms step_avg:150.55ms step:559/1480 train_time:82658ms step_avg:150.56ms step:560/1480 train_time:82819ms step_avg:150.58ms step:561/1480 train_time:82979ms step_avg:150.60ms step:562/1480 train_time:83139ms step_avg:150.61ms step:563/1480 train_time:83299ms step_avg:150.63ms step:564/1480 train_time:83457ms step_avg:150.64ms step:565/1480 train_time:83618ms step_avg:150.66ms step:566/1480 train_time:83779ms step_avg:150.68ms step:567/1480 train_time:83938ms step_avg:150.70ms step:568/1480 train_time:84098ms step_avg:150.71ms step:569/1480 train_time:84257ms step_avg:150.73ms step:570/1480 train_time:84418ms step_avg:150.75ms step:571/1480 train_time:84579ms step_avg:150.76ms step:572/1480 train_time:84739ms step_avg:150.78ms 
step:573/1480 train_time:84899ms step_avg:150.80ms step:574/1480 train_time:85060ms step_avg:150.81ms step:575/1480 train_time:85221ms step_avg:150.83ms step:576/1480 train_time:85380ms step_avg:150.85ms step:577/1480 train_time:85541ms step_avg:150.87ms step:578/1480 train_time:85699ms step_avg:150.88ms step:579/1480 train_time:85859ms step_avg:150.89ms step:580/1480 train_time:86019ms step_avg:150.91ms step:581/1480 train_time:86180ms step_avg:150.93ms step:582/1480 train_time:86340ms step_avg:150.94ms step:583/1480 train_time:86500ms step_avg:150.96ms step:584/1480 train_time:86659ms step_avg:150.97ms step:585/1480 train_time:86817ms step_avg:150.99ms step:586/1480 train_time:86978ms step_avg:151.00ms step:587/1480 train_time:87138ms step_avg:151.02ms step:588/1480 train_time:87298ms step_avg:151.03ms step:589/1480 train_time:87457ms step_avg:151.05ms step:590/1480 train_time:87619ms step_avg:151.07ms step:591/1480 train_time:87778ms step_avg:151.08ms step:592/1480 train_time:87939ms step_avg:151.10ms step:593/1480 train_time:88100ms step_avg:151.12ms step:594/1480 train_time:88260ms step_avg:151.13ms step:595/1480 train_time:88421ms step_avg:151.15ms step:596/1480 train_time:88582ms step_avg:151.16ms step:597/1480 train_time:88742ms step_avg:151.18ms step:598/1480 train_time:88900ms step_avg:151.19ms step:599/1480 train_time:89058ms step_avg:151.20ms step:600/1480 train_time:89220ms step_avg:151.22ms step:601/1480 train_time:89379ms step_avg:151.23ms step:602/1480 train_time:89538ms step_avg:151.25ms step:603/1480 train_time:89700ms step_avg:151.26ms step:604/1480 train_time:89859ms step_avg:151.28ms step:605/1480 train_time:90019ms step_avg:151.29ms step:606/1480 train_time:90181ms step_avg:151.31ms step:607/1480 train_time:90344ms step_avg:151.33ms step:608/1480 train_time:90503ms step_avg:151.34ms step:609/1480 train_time:90661ms step_avg:151.35ms step:610/1480 train_time:90819ms step_avg:151.37ms step:611/1480 train_time:90979ms step_avg:151.38ms 
step:612/1480 train_time:91139ms step_avg:151.39ms step:613/1480 train_time:91300ms step_avg:151.41ms step:614/1480 train_time:91459ms step_avg:151.42ms step:615/1480 train_time:91619ms step_avg:151.44ms step:616/1480 train_time:91777ms step_avg:151.45ms step:617/1480 train_time:91938ms step_avg:151.46ms step:618/1480 train_time:92098ms step_avg:151.48ms step:619/1480 train_time:92258ms step_avg:151.49ms step:620/1480 train_time:92419ms step_avg:151.51ms step:621/1480 train_time:92579ms step_avg:151.52ms step:622/1480 train_time:92739ms step_avg:151.53ms step:623/1480 train_time:92900ms step_avg:151.55ms step:624/1480 train_time:93059ms step_avg:151.56ms step:625/1480 train_time:93220ms step_avg:151.58ms step:625/1480 val_loss:3.6069 train_time:93284ms step_avg:151.68ms step:626/1480 train_time:93382ms step_avg:151.59ms step:627/1480 train_time:93540ms step_avg:151.60ms step:628/1480 train_time:93697ms step_avg:151.61ms step:629/1480 train_time:93854ms step_avg:151.62ms step:630/1480 train_time:94013ms step_avg:151.63ms step:631/1480 train_time:94172ms step_avg:151.65ms step:632/1480 train_time:94332ms step_avg:151.66ms step:633/1480 train_time:94493ms step_avg:151.67ms step:634/1480 train_time:94654ms step_avg:151.69ms step:635/1480 train_time:94814ms step_avg:151.70ms step:636/1480 train_time:94974ms step_avg:151.72ms step:637/1480 train_time:95133ms step_avg:151.73ms step:638/1480 train_time:95292ms step_avg:151.74ms step:639/1480 train_time:95452ms step_avg:151.75ms step:640/1480 train_time:95613ms step_avg:151.77ms step:641/1480 train_time:95774ms step_avg:151.78ms step:642/1480 train_time:95934ms step_avg:151.79ms step:643/1480 train_time:96094ms step_avg:151.81ms step:644/1480 train_time:96253ms step_avg:151.82ms step:645/1480 train_time:96412ms step_avg:151.83ms step:646/1480 train_time:96573ms step_avg:151.84ms step:647/1480 train_time:96733ms step_avg:151.86ms step:648/1480 train_time:96896ms step_avg:151.87ms step:649/1480 train_time:97055ms 
step_avg:151.89ms step:650/1480 train_time:97215ms step_avg:151.90ms step:651/1480 train_time:97374ms step_avg:151.91ms step:652/1480 train_time:97533ms step_avg:151.92ms step:653/1480 train_time:97693ms step_avg:151.93ms step:654/1480 train_time:97854ms step_avg:151.95ms step:655/1480 train_time:98014ms step_avg:151.96ms step:656/1480 train_time:98174ms step_avg:151.97ms step:657/1480 train_time:98333ms step_avg:151.98ms step:658/1480 train_time:98493ms step_avg:152.00ms step:659/1480 train_time:98655ms step_avg:152.01ms step:660/1480 train_time:98818ms step_avg:152.03ms step:661/1480 train_time:98980ms step_avg:152.04ms step:662/1480 train_time:99140ms step_avg:152.06ms step:663/1480 train_time:99299ms step_avg:152.07ms step:664/1480 train_time:99461ms step_avg:152.08ms step:665/1480 train_time:99622ms step_avg:152.09ms step:666/1480 train_time:99781ms step_avg:152.11ms step:667/1480 train_time:99942ms step_avg:152.12ms step:668/1480 train_time:100103ms step_avg:152.13ms step:669/1480 train_time:100266ms step_avg:152.15ms step:670/1480 train_time:100428ms step_avg:152.16ms step:671/1480 train_time:100590ms step_avg:152.18ms step:672/1480 train_time:100752ms step_avg:152.19ms step:673/1480 train_time:100915ms step_avg:152.21ms step:674/1480 train_time:101077ms step_avg:152.22ms step:675/1480 train_time:101239ms step_avg:152.24ms step:676/1480 train_time:101400ms step_avg:152.25ms step:677/1480 train_time:101561ms step_avg:152.27ms step:678/1480 train_time:101722ms step_avg:152.28ms step:679/1480 train_time:101882ms step_avg:152.29ms step:680/1480 train_time:102044ms step_avg:152.30ms step:681/1480 train_time:102205ms step_avg:152.32ms step:682/1480 train_time:102369ms step_avg:152.33ms step:683/1480 train_time:102530ms step_avg:152.35ms step:684/1480 train_time:102692ms step_avg:152.36ms step:685/1480 train_time:102856ms step_avg:152.38ms step:686/1480 train_time:103018ms step_avg:152.39ms step:687/1480 train_time:103178ms step_avg:152.40ms step:688/1480 
train_time:103340ms step_avg:152.42ms step:689/1480 train_time:103503ms step_avg:152.43ms step:690/1480 train_time:103668ms step_avg:152.45ms step:691/1480 train_time:103829ms step_avg:152.47ms step:692/1480 train_time:103992ms step_avg:152.48ms step:693/1480 train_time:104155ms step_avg:152.50ms step:694/1480 train_time:104316ms step_avg:152.51ms step:695/1480 train_time:104477ms step_avg:152.52ms step:696/1480 train_time:104637ms step_avg:152.53ms step:697/1480 train_time:104799ms step_avg:152.55ms step:698/1480 train_time:104959ms step_avg:152.56ms step:699/1480 train_time:105122ms step_avg:152.57ms step:700/1480 train_time:105284ms step_avg:152.59ms step:701/1480 train_time:105445ms step_avg:152.60ms step:702/1480 train_time:105606ms step_avg:152.61ms step:703/1480 train_time:105767ms step_avg:152.62ms step:704/1480 train_time:105928ms step_avg:152.63ms step:705/1480 train_time:106093ms step_avg:152.65ms step:706/1480 train_time:106255ms step_avg:152.67ms step:707/1480 train_time:106418ms step_avg:152.68ms step:708/1480 train_time:106578ms step_avg:152.69ms step:709/1480 train_time:106739ms step_avg:152.70ms step:710/1480 train_time:106898ms step_avg:152.71ms step:711/1480 train_time:107061ms step_avg:152.73ms step:712/1480 train_time:107226ms step_avg:152.74ms step:713/1480 train_time:107389ms step_avg:152.76ms step:714/1480 train_time:107551ms step_avg:152.77ms step:715/1480 train_time:107712ms step_avg:152.78ms step:716/1480 train_time:107874ms step_avg:152.80ms step:717/1480 train_time:108036ms step_avg:152.81ms step:718/1480 train_time:108196ms step_avg:152.82ms step:719/1480 train_time:108356ms step_avg:152.83ms step:720/1480 train_time:108519ms step_avg:152.84ms step:721/1480 train_time:108681ms step_avg:152.86ms step:722/1480 train_time:108842ms step_avg:152.87ms step:723/1480 train_time:109001ms step_avg:152.88ms step:724/1480 train_time:109164ms step_avg:152.89ms step:725/1480 train_time:109325ms step_avg:152.90ms step:726/1480 train_time:109491ms 
step_avg:152.92ms step:727/1480 train_time:109654ms step_avg:152.93ms step:728/1480 train_time:109815ms step_avg:152.95ms step:729/1480 train_time:109975ms step_avg:152.96ms step:730/1480 train_time:110138ms step_avg:152.97ms step:731/1480 train_time:110299ms step_avg:152.98ms step:732/1480 train_time:110459ms step_avg:152.99ms step:733/1480 train_time:110621ms step_avg:153.00ms step:734/1480 train_time:110782ms step_avg:153.01ms step:735/1480 train_time:110942ms step_avg:153.02ms step:736/1480 train_time:111104ms step_avg:153.04ms step:737/1480 train_time:111265ms step_avg:153.05ms step:738/1480 train_time:111426ms step_avg:153.06ms step:739/1480 train_time:111588ms step_avg:153.07ms step:740/1480 train_time:111755ms step_avg:153.09ms step:741/1480 train_time:111920ms step_avg:153.11ms step:742/1480 train_time:112081ms step_avg:153.12ms step:743/1480 train_time:112241ms step_avg:153.13ms step:744/1480 train_time:112403ms step_avg:153.14ms step:745/1480 train_time:112568ms step_avg:153.15ms step:746/1480 train_time:112729ms step_avg:153.16ms step:747/1480 train_time:112892ms step_avg:153.18ms step:748/1480 train_time:113057ms step_avg:153.19ms step:749/1480 train_time:113221ms step_avg:153.21ms step:750/1480 train_time:113383ms step_avg:153.22ms step:750/1480 val_loss:3.5536 train_time:113445ms step_avg:153.30ms step:751/1480 train_time:113545ms step_avg:153.23ms step:752/1480 train_time:113706ms step_avg:153.24ms step:753/1480 train_time:113866ms step_avg:153.25ms step:754/1480 train_time:114026ms step_avg:153.26ms step:755/1480 train_time:114187ms step_avg:153.27ms step:756/1480 train_time:114349ms step_avg:153.28ms step:757/1480 train_time:114513ms step_avg:153.30ms step:758/1480 train_time:114672ms step_avg:153.31ms step:759/1480 train_time:114836ms step_avg:153.32ms step:760/1480 train_time:114998ms step_avg:153.33ms step:761/1480 train_time:115162ms step_avg:153.35ms step:762/1480 train_time:115324ms step_avg:153.36ms step:763/1480 train_time:115486ms 
step_avg:153.37ms step:764/1480 train_time:115647ms step_avg:153.38ms step:765/1480 train_time:115808ms step_avg:153.39ms step:766/1480 train_time:115969ms step_avg:153.40ms step:767/1480 train_time:116130ms step_avg:153.41ms step:768/1480 train_time:116292ms step_avg:153.42ms step:769/1480 train_time:116457ms step_avg:153.43ms step:770/1480 train_time:116620ms step_avg:153.45ms step:771/1480 train_time:116784ms step_avg:153.46ms step:772/1480 train_time:116946ms step_avg:153.47ms step:773/1480 train_time:117108ms step_avg:153.48ms step:774/1480 train_time:117269ms step_avg:153.49ms step:775/1480 train_time:117430ms step_avg:153.50ms step:776/1480 train_time:117598ms step_avg:153.52ms step:777/1480 train_time:117765ms step_avg:153.54ms step:778/1480 train_time:117927ms step_avg:153.55ms step:779/1480 train_time:118089ms step_avg:153.56ms step:780/1480 train_time:118252ms step_avg:153.57ms step:781/1480 train_time:118415ms step_avg:153.59ms step:782/1480 train_time:118579ms step_avg:153.60ms step:783/1480 train_time:118742ms step_avg:153.61ms step:784/1480 train_time:118905ms step_avg:153.62ms step:785/1480 train_time:119066ms step_avg:153.63ms step:786/1480 train_time:119230ms step_avg:153.65ms step:787/1480 train_time:119394ms step_avg:153.66ms step:788/1480 train_time:119558ms step_avg:153.67ms step:789/1480 train_time:119720ms step_avg:153.68ms step:790/1480 train_time:119887ms step_avg:153.70ms step:791/1480 train_time:120054ms step_avg:153.72ms step:792/1480 train_time:120219ms step_avg:153.73ms step:793/1480 train_time:120382ms step_avg:153.75ms step:794/1480 train_time:120546ms step_avg:153.76ms step:795/1480 train_time:120710ms step_avg:153.77ms step:796/1480 train_time:120875ms step_avg:153.79ms step:797/1480 train_time:121040ms step_avg:153.80ms step:798/1480 train_time:121203ms step_avg:153.81ms step:799/1480 train_time:121369ms step_avg:153.83ms step:800/1480 train_time:121531ms step_avg:153.84ms step:801/1480 train_time:121697ms step_avg:153.85ms 
step:802/1480 train_time:121865ms step_avg:153.87ms step:803/1480 train_time:122027ms step_avg:153.88ms step:804/1480 train_time:122188ms step_avg:153.89ms step:805/1480 train_time:122354ms step_avg:153.90ms step:806/1480 train_time:122516ms step_avg:153.91ms step:807/1480 train_time:122679ms step_avg:153.93ms step:808/1480 train_time:122844ms step_avg:153.94ms step:809/1480 train_time:123005ms step_avg:153.95ms step:810/1480 train_time:123167ms step_avg:153.96ms step:811/1480 train_time:123329ms step_avg:153.97ms step:812/1480 train_time:123492ms step_avg:153.98ms step:813/1480 train_time:123652ms step_avg:153.99ms step:814/1480 train_time:123818ms step_avg:154.00ms step:815/1480 train_time:123981ms step_avg:154.01ms step:816/1480 train_time:124146ms step_avg:154.03ms step:817/1480 train_time:124308ms step_avg:154.04ms step:818/1480 train_time:124469ms step_avg:154.05ms step:819/1480 train_time:124633ms step_avg:154.06ms step:820/1480 train_time:124797ms step_avg:154.07ms step:821/1480 train_time:124960ms step_avg:154.08ms step:822/1480 train_time:125123ms step_avg:154.09ms step:823/1480 train_time:125285ms step_avg:154.10ms step:824/1480 train_time:125446ms step_avg:154.11ms step:825/1480 train_time:125609ms step_avg:154.12ms step:826/1480 train_time:125775ms step_avg:154.14ms step:827/1480 train_time:125940ms step_avg:154.15ms step:828/1480 train_time:126103ms step_avg:154.16ms step:829/1480 train_time:126268ms step_avg:154.17ms step:830/1480 train_time:126431ms step_avg:154.18ms step:831/1480 train_time:126595ms step_avg:154.20ms step:832/1480 train_time:126761ms step_avg:154.21ms step:833/1480 train_time:126925ms step_avg:154.22ms step:834/1480 train_time:127090ms step_avg:154.24ms step:835/1480 train_time:127254ms step_avg:154.25ms step:836/1480 train_time:127420ms step_avg:154.26ms step:837/1480 train_time:127582ms step_avg:154.27ms step:838/1480 train_time:127746ms step_avg:154.28ms step:839/1480 train_time:127908ms step_avg:154.29ms step:840/1480 
train_time:128069ms step_avg:154.30ms step:841/1480 train_time:128231ms step_avg:154.31ms step:842/1480 train_time:128397ms step_avg:154.32ms step:843/1480 train_time:128561ms step_avg:154.33ms step:844/1480 train_time:128723ms step_avg:154.34ms step:845/1480 train_time:128887ms step_avg:154.36ms step:846/1480 train_time:129053ms step_avg:154.37ms step:847/1480 train_time:129218ms step_avg:154.38ms step:848/1480 train_time:129381ms step_avg:154.39ms step:849/1480 train_time:129544ms step_avg:154.40ms step:850/1480 train_time:129707ms step_avg:154.41ms step:851/1480 train_time:129872ms step_avg:154.43ms step:852/1480 train_time:130035ms step_avg:154.44ms step:853/1480 train_time:130198ms step_avg:154.45ms step:854/1480 train_time:130362ms step_avg:154.46ms step:855/1480 train_time:130526ms step_avg:154.47ms step:856/1480 train_time:130687ms step_avg:154.48ms step:857/1480 train_time:130852ms step_avg:154.49ms step:858/1480 train_time:131018ms step_avg:154.50ms step:859/1480 train_time:131183ms step_avg:154.51ms step:860/1480 train_time:131344ms step_avg:154.52ms step:861/1480 train_time:131510ms step_avg:154.54ms step:862/1480 train_time:131680ms step_avg:154.55ms step:863/1480 train_time:131848ms step_avg:154.57ms step:864/1480 train_time:132012ms step_avg:154.58ms step:865/1480 train_time:132173ms step_avg:154.59ms step:866/1480 train_time:132342ms step_avg:154.60ms step:867/1480 train_time:132505ms step_avg:154.61ms step:868/1480 train_time:132666ms step_avg:154.62ms step:869/1480 train_time:132829ms step_avg:154.63ms step:870/1480 train_time:132995ms step_avg:154.64ms step:871/1480 train_time:133158ms step_avg:154.65ms step:872/1480 train_time:133323ms step_avg:154.67ms step:873/1480 train_time:133486ms step_avg:154.68ms step:874/1480 train_time:133651ms step_avg:154.69ms step:875/1480 train_time:133816ms step_avg:154.70ms step:875/1480 val_loss:3.5074 train_time:133882ms step_avg:154.78ms step:876/1480 train_time:133982ms step_avg:154.71ms step:877/1480 
train_time:134146ms step_avg:154.72ms step:878/1480 train_time:134308ms step_avg:154.73ms step:879/1480 train_time:134474ms step_avg:154.75ms step:880/1480 train_time:134638ms step_avg:154.76ms step:881/1480 train_time:134801ms step_avg:154.77ms step:882/1480 train_time:134966ms step_avg:154.78ms step:883/1480 train_time:135131ms step_avg:154.79ms step:884/1480 train_time:135299ms step_avg:154.80ms step:885/1480 train_time:135464ms step_avg:154.82ms step:886/1480 train_time:135629ms step_avg:154.83ms step:887/1480 train_time:135799ms step_avg:154.84ms step:888/1480 train_time:135971ms step_avg:154.86ms step:889/1480 train_time:136139ms step_avg:154.88ms step:890/1480 train_time:136301ms step_avg:154.89ms step:891/1480 train_time:136467ms step_avg:154.90ms step:892/1480 train_time:136631ms step_avg:154.91ms step:893/1480 train_time:136795ms step_avg:154.92ms step:894/1480 train_time:136960ms step_avg:154.93ms step:895/1480 train_time:137125ms step_avg:154.94ms step:896/1480 train_time:137291ms step_avg:154.96ms step:897/1480 train_time:137457ms step_avg:154.97ms step:898/1480 train_time:137624ms step_avg:154.98ms step:899/1480 train_time:137787ms step_avg:154.99ms step:900/1480 train_time:137950ms step_avg:155.00ms step:901/1480 train_time:138116ms step_avg:155.01ms step:902/1480 train_time:138280ms step_avg:155.02ms step:903/1480 train_time:138452ms step_avg:155.04ms step:904/1480 train_time:138618ms step_avg:155.05ms step:905/1480 train_time:138780ms step_avg:155.06ms step:906/1480 train_time:138946ms step_avg:155.07ms step:907/1480 train_time:139115ms step_avg:155.09ms step:908/1480 train_time:139278ms step_avg:155.10ms step:909/1480 train_time:139443ms step_avg:155.11ms step:910/1480 train_time:139614ms step_avg:155.13ms step:911/1480 train_time:139779ms step_avg:155.14ms step:912/1480 train_time:139944ms step_avg:155.15ms step:913/1480 train_time:140113ms step_avg:155.16ms step:914/1480 train_time:140281ms step_avg:155.18ms step:915/1480 train_time:140449ms 
step_avg:155.19ms step:916/1480 train_time:140614ms step_avg:155.20ms step:917/1480 train_time:140778ms step_avg:155.21ms step:918/1480 train_time:140946ms step_avg:155.23ms step:919/1480 train_time:141117ms step_avg:155.24ms step:920/1480 train_time:141282ms step_avg:155.26ms step:921/1480 train_time:141447ms step_avg:155.27ms step:922/1480 train_time:141618ms step_avg:155.28ms step:923/1480 train_time:141780ms step_avg:155.29ms step:924/1480 train_time:141944ms step_avg:155.30ms step:925/1480 train_time:142109ms step_avg:155.31ms step:926/1480 train_time:142273ms step_avg:155.32ms step:927/1480 train_time:142438ms step_avg:155.33ms step:928/1480 train_time:142604ms step_avg:155.34ms step:929/1480 train_time:142769ms step_avg:155.35ms step:930/1480 train_time:142935ms step_avg:155.36ms step:931/1480 train_time:143099ms step_avg:155.37ms step:932/1480 train_time:143264ms step_avg:155.38ms step:933/1480 train_time:143431ms step_avg:155.40ms step:934/1480 train_time:143599ms step_avg:155.41ms step:935/1480 train_time:143769ms step_avg:155.43ms step:936/1480 train_time:143938ms step_avg:155.44ms step:937/1480 train_time:144107ms step_avg:155.46ms step:938/1480 train_time:144270ms step_avg:155.46ms step:939/1480 train_time:144440ms step_avg:155.48ms step:940/1480 train_time:144606ms step_avg:155.49ms step:941/1480 train_time:144770ms step_avg:155.50ms step:942/1480 train_time:144936ms step_avg:155.51ms step:943/1480 train_time:145106ms step_avg:155.53ms step:944/1480 train_time:145279ms step_avg:155.55ms step:945/1480 train_time:145442ms step_avg:155.55ms step:946/1480 train_time:145610ms step_avg:155.57ms step:947/1480 train_time:145779ms step_avg:155.58ms step:948/1480 train_time:145944ms step_avg:155.59ms step:949/1480 train_time:146109ms step_avg:155.60ms step:950/1480 train_time:146274ms step_avg:155.61ms step:951/1480 train_time:146441ms step_avg:155.62ms step:952/1480 train_time:146605ms step_avg:155.63ms step:953/1480 train_time:146773ms step_avg:155.64ms 
step:954/1480 train_time:146941ms step_avg:155.66ms step:955/1480 train_time:147104ms step_avg:155.67ms step:956/1480 train_time:147268ms step_avg:155.67ms step:957/1480 train_time:147437ms step_avg:155.69ms step:958/1480 train_time:147604ms step_avg:155.70ms step:959/1480 train_time:147769ms step_avg:155.71ms step:960/1480 train_time:147935ms step_avg:155.72ms step:961/1480 train_time:148100ms step_avg:155.73ms step:962/1480 train_time:148264ms step_avg:155.74ms step:963/1480 train_time:148429ms step_avg:155.75ms step:964/1480 train_time:148598ms step_avg:155.76ms step:965/1480 train_time:148763ms step_avg:155.77ms step:966/1480 train_time:148927ms step_avg:155.78ms step:967/1480 train_time:149093ms step_avg:155.79ms step:968/1480 train_time:149258ms step_avg:155.80ms step:969/1480 train_time:149425ms step_avg:155.81ms step:970/1480 train_time:149589ms step_avg:155.82ms step:971/1480 train_time:149754ms step_avg:155.83ms step:972/1480 train_time:149920ms step_avg:155.84ms step:973/1480 train_time:150083ms step_avg:155.85ms step:974/1480 train_time:150253ms step_avg:155.86ms step:975/1480 train_time:150419ms step_avg:155.87ms step:976/1480 train_time:150583ms step_avg:155.88ms step:977/1480 train_time:150746ms step_avg:155.89ms step:978/1480 train_time:150913ms step_avg:155.90ms step:979/1480 train_time:151080ms step_avg:155.91ms step:980/1480 train_time:151245ms step_avg:155.92ms step:981/1480 train_time:151415ms step_avg:155.94ms step:982/1480 train_time:151578ms step_avg:155.94ms step:983/1480 train_time:151742ms step_avg:155.95ms step:984/1480 train_time:151906ms step_avg:155.96ms step:985/1480 train_time:152077ms step_avg:155.98ms step:986/1480 train_time:152242ms step_avg:155.99ms step:987/1480 train_time:152405ms step_avg:155.99ms step:988/1480 train_time:152572ms step_avg:156.00ms step:989/1480 train_time:152738ms step_avg:156.01ms step:990/1480 train_time:152908ms step_avg:156.03ms step:991/1480 train_time:153076ms step_avg:156.04ms step:992/1480 
train_time:153250ms step_avg:156.06ms step:993/1480 train_time:153425ms step_avg:156.08ms step:994/1480 train_time:153590ms step_avg:156.09ms step:995/1480 train_time:153756ms step_avg:156.10ms step:996/1480 train_time:153919ms step_avg:156.10ms step:997/1480 train_time:154084ms step_avg:156.11ms step:998/1480 train_time:154246ms step_avg:156.12ms step:999/1480 train_time:154414ms step_avg:156.13ms step:1000/1480 train_time:154582ms step_avg:156.14ms step:1000/1480 val_loss:3.4441 train_time:154650ms step_avg:156.21ms step:1001/1480 train_time:154752ms step_avg:156.16ms step:1002/1480 train_time:154920ms step_avg:156.17ms step:1003/1480 train_time:155091ms step_avg:156.18ms step:1004/1480 train_time:155260ms step_avg:156.20ms step:1005/1480 train_time:155427ms step_avg:156.21ms step:1006/1480 train_time:155596ms step_avg:156.22ms step:1007/1480 train_time:155761ms step_avg:156.23ms step:1008/1480 train_time:155928ms step_avg:156.24ms step:1009/1480 train_time:156101ms step_avg:156.26ms step:1010/1480 train_time:156265ms step_avg:156.26ms step:1011/1480 train_time:156431ms step_avg:156.27ms step:1012/1480 train_time:156597ms step_avg:156.28ms step:1013/1480 train_time:156769ms step_avg:156.30ms step:1014/1480 train_time:156936ms step_avg:156.31ms step:1015/1480 train_time:157105ms step_avg:156.32ms step:1016/1480 train_time:157274ms step_avg:156.34ms step:1017/1480 train_time:157445ms step_avg:156.35ms step:1018/1480 train_time:157615ms step_avg:156.36ms step:1019/1480 train_time:157783ms step_avg:156.38ms step:1020/1480 train_time:157953ms step_avg:156.39ms step:1021/1480 train_time:158119ms step_avg:156.40ms step:1022/1480 train_time:158286ms step_avg:156.41ms step:1023/1480 train_time:158455ms step_avg:156.42ms step:1024/1480 train_time:158621ms step_avg:156.43ms step:1025/1480 train_time:158793ms step_avg:156.45ms step:1026/1480 train_time:158960ms step_avg:156.46ms step:1027/1480 train_time:159125ms step_avg:156.47ms step:1028/1480 train_time:159299ms 
step_avg:156.48ms step:1029/1480 train_time:159473ms step_avg:156.50ms step:1030/1480 train_time:159641ms step_avg:156.51ms step:1031/1480 train_time:159805ms step_avg:156.52ms step:1032/1480 train_time:159979ms step_avg:156.53ms step:1033/1480 train_time:160145ms step_avg:156.54ms step:1034/1480 train_time:160313ms step_avg:156.56ms step:1035/1480 train_time:160481ms step_avg:156.57ms step:1036/1480 train_time:160645ms step_avg:156.57ms step:1037/1480 train_time:160811ms step_avg:156.58ms step:1038/1480 train_time:160980ms step_avg:156.60ms step:1039/1480 train_time:161151ms step_avg:156.61ms step:1040/1480 train_time:161318ms step_avg:156.62ms step:1041/1480 train_time:161484ms step_avg:156.63ms step:1042/1480 train_time:161647ms step_avg:156.63ms step:1043/1480 train_time:161813ms step_avg:156.64ms step:1044/1480 train_time:161978ms step_avg:156.65ms step:1045/1480 train_time:162147ms step_avg:156.66ms step:1046/1480 train_time:162316ms step_avg:156.68ms step:1047/1480 train_time:162482ms step_avg:156.68ms step:1048/1480 train_time:162648ms step_avg:156.69ms step:1049/1480 train_time:162814ms step_avg:156.70ms step:1050/1480 train_time:162983ms step_avg:156.71ms step:1051/1480 train_time:163152ms step_avg:156.73ms step:1052/1480 train_time:163320ms step_avg:156.74ms step:1053/1480 train_time:163485ms step_avg:156.75ms step:1054/1480 train_time:163655ms step_avg:156.76ms step:1055/1480 train_time:163821ms step_avg:156.77ms step:1056/1480 train_time:163986ms step_avg:156.77ms step:1057/1480 train_time:164155ms step_avg:156.79ms step:1058/1480 train_time:164324ms step_avg:156.80ms step:1059/1480 train_time:164496ms step_avg:156.81ms step:1060/1480 train_time:164664ms step_avg:156.82ms step:1061/1480 train_time:164826ms step_avg:156.83ms step:1062/1480 train_time:164992ms step_avg:156.84ms step:1063/1480 train_time:165158ms step_avg:156.85ms step:1064/1480 train_time:165322ms step_avg:156.85ms step:1065/1480 train_time:165488ms step_avg:156.86ms step:1066/1480 
train_time:165656ms step_avg:156.87ms step:1067/1480 train_time:165824ms step_avg:156.88ms step:1068/1480 train_time:165988ms step_avg:156.89ms step:1069/1480 train_time:166161ms step_avg:156.90ms step:1070/1480 train_time:166326ms step_avg:156.91ms step:1071/1480 train_time:166498ms step_avg:156.93ms step:1072/1480 train_time:166665ms step_avg:156.93ms step:1073/1480 train_time:166829ms step_avg:156.94ms step:1074/1480 train_time:166998ms step_avg:156.95ms step:1075/1480 train_time:167170ms step_avg:156.97ms step:1076/1480 train_time:167339ms step_avg:156.98ms step:1077/1480 train_time:167504ms step_avg:156.99ms step:1078/1480 train_time:167676ms step_avg:157.00ms step:1079/1480 train_time:167848ms step_avg:157.01ms step:1080/1480 train_time:168019ms step_avg:157.03ms step:1081/1480 train_time:168186ms step_avg:157.04ms step:1082/1480 train_time:168353ms step_avg:157.05ms step:1083/1480 train_time:168520ms step_avg:157.05ms step:1084/1480 train_time:168685ms step_avg:157.06ms step:1085/1480 train_time:168853ms step_avg:157.07ms step:1086/1480 train_time:169022ms step_avg:157.08ms step:1087/1480 train_time:169188ms step_avg:157.09ms step:1088/1480 train_time:169358ms step_avg:157.10ms step:1089/1480 train_time:169530ms step_avg:157.12ms step:1090/1480 train_time:169702ms step_avg:157.13ms step:1091/1480 train_time:169871ms step_avg:157.14ms step:1092/1480 train_time:170039ms step_avg:157.15ms step:1093/1480 train_time:170205ms step_avg:157.16ms step:1094/1480 train_time:170371ms step_avg:157.17ms step:1095/1480 train_time:170536ms step_avg:157.18ms step:1096/1480 train_time:170704ms step_avg:157.19ms step:1097/1480 train_time:170873ms step_avg:157.20ms step:1098/1480 train_time:171044ms step_avg:157.21ms step:1099/1480 train_time:171215ms step_avg:157.22ms step:1100/1480 train_time:171386ms step_avg:157.23ms step:1101/1480 train_time:171558ms step_avg:157.25ms step:1102/1480 train_time:171729ms step_avg:157.26ms step:1103/1480 train_time:171907ms step_avg:157.28ms 
step:1104/1480 train_time:172076ms step_avg:157.29ms step:1105/1480 train_time:172246ms step_avg:157.30ms step:1106/1480 train_time:172414ms step_avg:157.31ms step:1107/1480 train_time:172583ms step_avg:157.32ms step:1108/1480 train_time:172748ms step_avg:157.33ms step:1109/1480 train_time:172915ms step_avg:157.34ms step:1110/1480 train_time:173080ms step_avg:157.35ms step:1111/1480 train_time:173247ms step_avg:157.35ms step:1112/1480 train_time:173419ms step_avg:157.37ms step:1113/1480 train_time:173598ms step_avg:157.39ms step:1114/1480 train_time:173770ms step_avg:157.40ms step:1115/1480 train_time:173943ms step_avg:157.41ms step:1116/1480 train_time:174110ms step_avg:157.42ms step:1117/1480 train_time:174282ms step_avg:157.44ms step:1118/1480 train_time:174458ms step_avg:157.45ms step:1119/1480 train_time:174624ms step_avg:157.46ms step:1120/1480 train_time:174791ms step_avg:157.47ms step:1121/1480 train_time:174961ms step_avg:157.48ms step:1122/1480 train_time:175126ms step_avg:157.49ms step:1123/1480 train_time:175295ms step_avg:157.50ms step:1124/1480 train_time:175464ms step_avg:157.51ms step:1125/1480 train_time:175632ms step_avg:157.52ms step:1125/1480 val_loss:3.3888 train_time:175700ms step_avg:157.58ms step:1126/1480 train_time:175801ms step_avg:157.53ms step:1127/1480 train_time:175973ms step_avg:157.54ms step:1128/1480 train_time:176143ms step_avg:157.55ms step:1129/1480 train_time:176317ms step_avg:157.57ms step:1130/1480 train_time:176485ms step_avg:157.58ms step:1131/1480 train_time:176664ms step_avg:157.60ms step:1132/1480 train_time:176831ms step_avg:157.60ms step:1133/1480 train_time:177002ms step_avg:157.62ms step:1134/1480 train_time:177173ms step_avg:157.63ms step:1135/1480 train_time:177341ms step_avg:157.64ms step:1136/1480 train_time:177512ms step_avg:157.65ms step:1137/1480 train_time:177682ms step_avg:157.66ms step:1138/1480 train_time:177855ms step_avg:157.67ms step:1139/1480 train_time:178023ms step_avg:157.68ms step:1140/1480 
train_time:178192ms step_avg:157.69ms step:1141/1480 train_time:178363ms step_avg:157.70ms step:1142/1480 train_time:178530ms step_avg:157.71ms step:1143/1480 train_time:178700ms step_avg:157.72ms step:1144/1480 train_time:178869ms step_avg:157.73ms step:1145/1480 train_time:179035ms step_avg:157.74ms step:1146/1480 train_time:179204ms step_avg:157.75ms step:1147/1480 train_time:179375ms step_avg:157.76ms step:1148/1480 train_time:179543ms step_avg:157.77ms step:1149/1480 train_time:179715ms step_avg:157.78ms step:1150/1480 train_time:179883ms step_avg:157.79ms step:1151/1480 train_time:180056ms step_avg:157.81ms step:1152/1480 train_time:180227ms step_avg:157.82ms step:1153/1480 train_time:180400ms step_avg:157.83ms step:1154/1480 train_time:180568ms step_avg:157.84ms step:1155/1480 train_time:180740ms step_avg:157.85ms step:1156/1480 train_time:180920ms step_avg:157.87ms step:1157/1480 train_time:181089ms step_avg:157.88ms step:1158/1480 train_time:181256ms step_avg:157.89ms step:1159/1480 train_time:181422ms step_avg:157.90ms step:1160/1480 train_time:181590ms step_avg:157.90ms step:1161/1480 train_time:181761ms step_avg:157.92ms step:1162/1480 train_time:181931ms step_avg:157.93ms step:1163/1480 train_time:182100ms step_avg:157.94ms step:1164/1480 train_time:182269ms step_avg:157.95ms step:1165/1480 train_time:182435ms step_avg:157.95ms step:1166/1480 train_time:182603ms step_avg:157.96ms step:1167/1480 train_time:182774ms step_avg:157.97ms step:1168/1480 train_time:182941ms step_avg:157.98ms step:1169/1480 train_time:183110ms step_avg:157.99ms step:1170/1480 train_time:183279ms step_avg:158.00ms step:1171/1480 train_time:183446ms step_avg:158.01ms step:1172/1480 train_time:183613ms step_avg:158.01ms step:1173/1480 train_time:183784ms step_avg:158.03ms step:1174/1480 train_time:183964ms step_avg:158.04ms step:1175/1480 train_time:184136ms step_avg:158.06ms step:1176/1480 train_time:184308ms step_avg:158.07ms step:1177/1480 train_time:184486ms step_avg:158.09ms 
step:1178/1480 train_time:184654ms step_avg:158.09ms step:1179/1480 train_time:184820ms step_avg:158.10ms step:1180/1480 train_time:185003ms step_avg:158.12ms step:1181/1480 train_time:185173ms step_avg:158.13ms step:1182/1480 train_time:185340ms step_avg:158.14ms step:1183/1480 train_time:185511ms step_avg:158.15ms step:1184/1480 train_time:185679ms step_avg:158.16ms step:1185/1480 train_time:185851ms step_avg:158.17ms step:1186/1480 train_time:186022ms step_avg:158.18ms step:1187/1480 train_time:186203ms step_avg:158.20ms step:1188/1480 train_time:186370ms step_avg:158.21ms step:1189/1480 train_time:186540ms step_avg:158.22ms step:1190/1480 train_time:186708ms step_avg:158.23ms step:1191/1480 train_time:186880ms step_avg:158.24ms step:1192/1480 train_time:187047ms step_avg:158.25ms step:1193/1480 train_time:187214ms step_avg:158.25ms step:1194/1480 train_time:187382ms step_avg:158.26ms step:1195/1480 train_time:187555ms step_avg:158.27ms step:1196/1480 train_time:187735ms step_avg:158.29ms step:1197/1480 train_time:187906ms step_avg:158.30ms step:1198/1480 train_time:188088ms step_avg:158.32ms step:1199/1480 train_time:188259ms step_avg:158.33ms step:1200/1480 train_time:188429ms step_avg:158.34ms step:1201/1480 train_time:188596ms step_avg:158.35ms step:1202/1480 train_time:188778ms step_avg:158.37ms step:1203/1480 train_time:188956ms step_avg:158.39ms step:1204/1480 train_time:189131ms step_avg:158.40ms step:1205/1480 train_time:189299ms step_avg:158.41ms step:1206/1480 train_time:189466ms step_avg:158.42ms step:1207/1480 train_time:189635ms step_avg:158.43ms step:1208/1480 train_time:189803ms step_avg:158.43ms step:1209/1480 train_time:189977ms step_avg:158.45ms step:1210/1480 train_time:190152ms step_avg:158.46ms step:1211/1480 train_time:190325ms step_avg:158.47ms step:1212/1480 train_time:190497ms step_avg:158.48ms step:1213/1480 train_time:190671ms step_avg:158.50ms step:1214/1480 train_time:190848ms step_avg:158.51ms step:1215/1480 train_time:191020ms 
step_avg:158.52ms step:1216/1480 train_time:191189ms step_avg:158.53ms step:1217/1480 train_time:191365ms step_avg:158.55ms step:1218/1480 train_time:191534ms step_avg:158.55ms step:1219/1480 train_time:191713ms step_avg:158.57ms step:1220/1480 train_time:191881ms step_avg:158.58ms step:1221/1480 train_time:192050ms step_avg:158.59ms step:1222/1480 train_time:192217ms step_avg:158.59ms step:1223/1480 train_time:192387ms step_avg:158.60ms step:1224/1480 train_time:192563ms step_avg:158.62ms step:1225/1480 train_time:192734ms step_avg:158.63ms step:1226/1480 train_time:192906ms step_avg:158.64ms step:1227/1480 train_time:193080ms step_avg:158.65ms step:1228/1480 train_time:193249ms step_avg:158.66ms step:1229/1480 train_time:193422ms step_avg:158.67ms step:1230/1480 train_time:193603ms step_avg:158.69ms step:1231/1480 train_time:193778ms step_avg:158.70ms step:1232/1480 train_time:193954ms step_avg:158.72ms step:1233/1480 train_time:194124ms step_avg:158.73ms step:1234/1480 train_time:194294ms step_avg:158.74ms step:1235/1480 train_time:194467ms step_avg:158.75ms step:1236/1480 train_time:194636ms step_avg:158.76ms step:1237/1480 train_time:194807ms step_avg:158.77ms step:1238/1480 train_time:194995ms step_avg:158.79ms step:1239/1480 train_time:195164ms step_avg:158.80ms step:1240/1480 train_time:195334ms step_avg:158.81ms step:1241/1480 train_time:195506ms step_avg:158.82ms step:1242/1480 train_time:195676ms step_avg:158.83ms step:1243/1480 train_time:195849ms step_avg:158.84ms step:1244/1480 train_time:196015ms step_avg:158.85ms step:1245/1480 train_time:196184ms step_avg:158.85ms step:1246/1480 train_time:196355ms step_avg:158.86ms step:1247/1480 train_time:196524ms step_avg:158.87ms step:1248/1480 train_time:196694ms step_avg:158.88ms step:1249/1480 train_time:196862ms step_avg:158.89ms step:1250/1480 train_time:197031ms step_avg:158.90ms step:1250/1480 val_loss:3.3379 train_time:197104ms step_avg:158.96ms step:1251/1480 train_time:197214ms step_avg:158.92ms 
step:1252/1480 train_time:197383ms step_avg:158.92ms step:1253/1480 train_time:197553ms step_avg:158.93ms step:1254/1480 train_time:197723ms step_avg:158.94ms step:1255/1480 train_time:197910ms step_avg:158.96ms step:1256/1480 train_time:198086ms step_avg:158.98ms step:1257/1480 train_time:198255ms step_avg:158.99ms step:1258/1480 train_time:198431ms step_avg:159.00ms step:1259/1480 train_time:198603ms step_avg:159.01ms step:1260/1480 train_time:198770ms step_avg:159.02ms step:1261/1480 train_time:198943ms step_avg:159.03ms step:1262/1480 train_time:199118ms step_avg:159.04ms step:1263/1480 train_time:199293ms step_avg:159.05ms step:1264/1480 train_time:199460ms step_avg:159.06ms step:1265/1480 train_time:199627ms step_avg:159.07ms step:1266/1480 train_time:199799ms step_avg:159.08ms step:1267/1480 train_time:199969ms step_avg:159.08ms step:1268/1480 train_time:200140ms step_avg:159.09ms step:1269/1480 train_time:200316ms step_avg:159.11ms step:1270/1480 train_time:200486ms step_avg:159.12ms step:1271/1480 train_time:200655ms step_avg:159.12ms step:1272/1480 train_time:200819ms step_avg:159.13ms step:1273/1480 train_time:200991ms step_avg:159.14ms step:1274/1480 train_time:201166ms step_avg:159.15ms step:1275/1480 train_time:201333ms step_avg:159.16ms step:1276/1480 train_time:201499ms step_avg:159.16ms step:1277/1480 train_time:201671ms step_avg:159.17ms step:1278/1480 train_time:201840ms step_avg:159.18ms step:1279/1480 train_time:202013ms step_avg:159.19ms step:1280/1480 train_time:202191ms step_avg:159.21ms step:1281/1480 train_time:202358ms step_avg:159.21ms step:1282/1480 train_time:202527ms step_avg:159.22ms step:1283/1480 train_time:202695ms step_avg:159.23ms step:1284/1480 train_time:202865ms step_avg:159.23ms step:1285/1480 train_time:203034ms step_avg:159.24ms step:1286/1480 train_time:203205ms step_avg:159.25ms step:1287/1480 train_time:203377ms step_avg:159.26ms step:1288/1480 train_time:203550ms step_avg:159.27ms step:1289/1480 train_time:203734ms 
step_avg:159.29ms step:1290/1480 train_time:203912ms step_avg:159.31ms step:1291/1480 train_time:204086ms step_avg:159.32ms step:1292/1480 train_time:204260ms step_avg:159.33ms step:1293/1480 train_time:204434ms step_avg:159.34ms step:1294/1480 train_time:204607ms step_avg:159.35ms step:1295/1480 train_time:204779ms step_avg:159.36ms step:1296/1480 train_time:204952ms step_avg:159.37ms step:1297/1480 train_time:205125ms step_avg:159.38ms step:1298/1480 train_time:205295ms step_avg:159.39ms step:1299/1480 train_time:205466ms step_avg:159.40ms step:1300/1480 train_time:205633ms step_avg:159.41ms step:1301/1480 train_time:205802ms step_avg:159.41ms step:1302/1480 train_time:205975ms step_avg:159.42ms step:1303/1480 train_time:206151ms step_avg:159.44ms step:1304/1480 train_time:206326ms step_avg:159.45ms step:1305/1480 train_time:206495ms step_avg:159.46ms step:1306/1480 train_time:206669ms step_avg:159.47ms step:1307/1480 train_time:206837ms step_avg:159.47ms step:1308/1480 train_time:207007ms step_avg:159.48ms step:1309/1480 train_time:207179ms step_avg:159.49ms step:1310/1480 train_time:207348ms step_avg:159.50ms step:1311/1480 train_time:207516ms step_avg:159.51ms step:1312/1480 train_time:207691ms step_avg:159.52ms step:1313/1480 train_time:207859ms step_avg:159.52ms step:1314/1480 train_time:208032ms step_avg:159.53ms step:1315/1480 train_time:208203ms step_avg:159.54ms step:1316/1480 train_time:208370ms step_avg:159.55ms step:1317/1480 train_time:208541ms step_avg:159.56ms step:1318/1480 train_time:208723ms step_avg:159.57ms step:1319/1480 train_time:208900ms step_avg:159.59ms step:1320/1480 train_time:209076ms step_avg:159.60ms step:1321/1480 train_time:209248ms step_avg:159.61ms step:1322/1480 train_time:209430ms step_avg:159.63ms step:1323/1480 train_time:209602ms step_avg:159.64ms step:1324/1480 train_time:209776ms step_avg:159.65ms step:1325/1480 train_time:209959ms step_avg:159.66ms step:1326/1480 train_time:210135ms step_avg:159.68ms step:1327/1480 
train_time:210305ms step_avg:159.69ms step:1328/1480 train_time:210475ms step_avg:159.69ms step:1329/1480 train_time:210671ms step_avg:159.72ms step:1330/1480 train_time:210851ms step_avg:159.74ms step:1331/1480 train_time:211022ms step_avg:159.74ms step:1332/1480 train_time:211196ms step_avg:159.75ms step:1333/1480 train_time:211372ms step_avg:159.77ms step:1334/1480 train_time:211543ms step_avg:159.78ms step:1335/1480 train_time:211711ms step_avg:159.78ms step:1336/1480 train_time:211893ms step_avg:159.80ms step:1337/1480 train_time:212068ms step_avg:159.81ms step:1338/1480 train_time:212238ms step_avg:159.82ms step:1339/1480 train_time:212413ms step_avg:159.83ms step:1340/1480 train_time:212584ms step_avg:159.84ms step:1341/1480 train_time:212752ms step_avg:159.84ms step:1342/1480 train_time:212927ms step_avg:159.85ms step:1343/1480 train_time:213096ms step_avg:159.86ms step:1344/1480 train_time:213269ms step_avg:159.87ms step:1345/1480 train_time:213447ms step_avg:159.89ms step:1346/1480 train_time:213615ms step_avg:159.89ms step:1347/1480 train_time:213786ms step_avg:159.90ms step:1348/1480 train_time:213955ms step_avg:159.91ms step:1349/1480 train_time:214124ms step_avg:159.91ms step:1350/1480 train_time:214297ms step_avg:159.92ms step:1351/1480 train_time:214468ms step_avg:159.93ms step:1352/1480 train_time:214641ms step_avg:159.94ms step:1353/1480 train_time:214817ms step_avg:159.95ms step:1354/1480 train_time:214989ms step_avg:159.96ms step:1355/1480 train_time:215156ms step_avg:159.97ms step:1356/1480 train_time:215329ms step_avg:159.98ms step:1357/1480 train_time:215504ms step_avg:159.99ms step:1358/1480 train_time:215675ms step_avg:160.00ms step:1359/1480 train_time:215847ms step_avg:160.01ms step:1360/1480 train_time:216023ms step_avg:160.02ms step:1361/1480 train_time:216199ms step_avg:160.03ms step:1362/1480 train_time:216375ms step_avg:160.04ms step:1363/1480 train_time:216555ms step_avg:160.06ms step:1364/1480 train_time:216724ms step_avg:160.06ms 
step:1365/1480 train_time:216890ms step_avg:160.07ms step:1366/1480 train_time:217062ms step_avg:160.08ms step:1367/1480 train_time:217234ms step_avg:160.08ms step:1368/1480 train_time:217408ms step_avg:160.09ms step:1369/1480 train_time:217588ms step_avg:160.11ms step:1370/1480 train_time:217766ms step_avg:160.12ms step:1371/1480 train_time:217937ms step_avg:160.13ms step:1372/1480 train_time:218116ms step_avg:160.14ms step:1373/1480 train_time:218288ms step_avg:160.15ms step:1374/1480 train_time:218463ms step_avg:160.16ms step:1375/1480 train_time:218634ms step_avg:160.17ms step:1375/1480 val_loss:3.2996 train_time:218702ms step_avg:160.22ms step:1376/1480 train_time:218811ms step_avg:160.18ms step:1377/1480 train_time:218984ms step_avg:160.19ms step:1378/1480 train_time:219152ms step_avg:160.20ms step:1379/1480 train_time:219327ms step_avg:160.21ms step:1380/1480 train_time:219500ms step_avg:160.22ms step:1381/1480 train_time:219679ms step_avg:160.23ms step:1382/1480 train_time:219850ms step_avg:160.24ms step:1383/1480 train_time:220021ms step_avg:160.25ms step:1384/1480 train_time:220198ms step_avg:160.26ms step:1385/1480 train_time:220364ms step_avg:160.26ms step:1386/1480 train_time:220534ms step_avg:160.27ms step:1387/1480 train_time:220705ms step_avg:160.28ms step:1388/1480 train_time:220875ms step_avg:160.29ms step:1389/1480 train_time:221050ms step_avg:160.30ms step:1390/1480 train_time:221217ms step_avg:160.30ms step:1391/1480 train_time:221388ms step_avg:160.31ms step:1392/1480 train_time:221559ms step_avg:160.32ms step:1393/1480 train_time:221731ms step_avg:160.33ms step:1394/1480 train_time:221901ms step_avg:160.33ms step:1395/1480 train_time:222070ms step_avg:160.34ms step:1396/1480 train_time:222238ms step_avg:160.35ms step:1397/1480 train_time:222407ms step_avg:160.35ms step:1398/1480 train_time:222573ms step_avg:160.36ms step:1399/1480 train_time:222743ms step_avg:160.36ms step:1400/1480 train_time:222919ms step_avg:160.37ms step:1401/1480 
train_time:223086ms step_avg:160.38ms step:1402/1480 train_time:223256ms step_avg:160.38ms step:1403/1480 train_time:223434ms step_avg:160.40ms step:1404/1480 train_time:223605ms step_avg:160.41ms step:1405/1480 train_time:223777ms step_avg:160.41ms step:1406/1480 train_time:223952ms step_avg:160.42ms step:1407/1480 train_time:224119ms step_avg:160.43ms step:1408/1480 train_time:224288ms step_avg:160.44ms step:1409/1480 train_time:224471ms step_avg:160.45ms step:1410/1480 train_time:224639ms step_avg:160.46ms step:1411/1480 train_time:224808ms step_avg:160.46ms step:1412/1480 train_time:224979ms step_avg:160.47ms step:1413/1480 train_time:225150ms step_avg:160.48ms step:1414/1480 train_time:225322ms step_avg:160.49ms step:1415/1480 train_time:225495ms step_avg:160.49ms step:1416/1480 train_time:225681ms step_avg:160.51ms step:1417/1480 train_time:225855ms step_avg:160.52ms step:1418/1480 train_time:226027ms step_avg:160.53ms step:1419/1480 train_time:226199ms step_avg:160.54ms step:1420/1480 train_time:226374ms step_avg:160.55ms step:1421/1480 train_time:226548ms step_avg:160.56ms step:1422/1480 train_time:226720ms step_avg:160.57ms step:1423/1480 train_time:226890ms step_avg:160.57ms step:1424/1480 train_time:227066ms step_avg:160.58ms step:1425/1480 train_time:227249ms step_avg:160.60ms step:1426/1480 train_time:227422ms step_avg:160.61ms step:1427/1480 train_time:227596ms step_avg:160.62ms step:1428/1480 train_time:227768ms step_avg:160.63ms step:1429/1480 train_time:227934ms step_avg:160.63ms step:1430/1480 train_time:228109ms step_avg:160.64ms step:1431/1480 train_time:228286ms step_avg:160.65ms step:1432/1480 train_time:228462ms step_avg:160.66ms step:1433/1480 train_time:228640ms step_avg:160.67ms step:1434/1480 train_time:228821ms step_avg:160.69ms step:1435/1480 train_time:228996ms step_avg:160.70ms step:1436/1480 train_time:229172ms step_avg:160.71ms step:1437/1480 train_time:229342ms step_avg:160.72ms step:1438/1480 train_time:229511ms step_avg:160.72ms 
step:1439/1480 train_time:229688ms step_avg:160.73ms step:1440/1480 train_time:229857ms step_avg:160.74ms step:1441/1480 train_time:230029ms step_avg:160.75ms step:1442/1480 train_time:230207ms step_avg:160.76ms step:1443/1480 train_time:230397ms step_avg:160.78ms step:1444/1480 train_time:230567ms step_avg:160.79ms step:1445/1480 train_time:230741ms step_avg:160.79ms step:1446/1480 train_time:230916ms step_avg:160.80ms step:1447/1480 train_time:231093ms step_avg:160.82ms step:1448/1480 train_time:231265ms step_avg:160.82ms step:1449/1480 train_time:231439ms step_avg:160.83ms step:1450/1480 train_time:231613ms step_avg:160.84ms step:1451/1480 train_time:231784ms step_avg:160.85ms step:1452/1480 train_time:231958ms step_avg:160.86ms step:1453/1480 train_time:232127ms step_avg:160.86ms step:1454/1480 train_time:232300ms step_avg:160.87ms step:1455/1480 train_time:232478ms step_avg:160.88ms step:1456/1480 train_time:232651ms step_avg:160.89ms step:1457/1480 train_time:232820ms step_avg:160.90ms step:1458/1480 train_time:232992ms step_avg:160.91ms step:1459/1480 train_time:233168ms step_avg:160.92ms step:1460/1480 train_time:233339ms step_avg:160.92ms step:1461/1480 train_time:233514ms step_avg:160.93ms step:1462/1480 train_time:233685ms step_avg:160.94ms step:1463/1480 train_time:233862ms step_avg:160.95ms step:1464/1480 train_time:234038ms step_avg:160.96ms step:1465/1480 train_time:234213ms step_avg:160.97ms step:1466/1480 train_time:234384ms step_avg:160.98ms step:1467/1480 train_time:234557ms step_avg:160.99ms step:1468/1480 train_time:234728ms step_avg:160.99ms step:1469/1480 train_time:234901ms step_avg:161.00ms step:1470/1480 train_time:235082ms step_avg:161.01ms step:1471/1480 train_time:235271ms step_avg:161.03ms step:1472/1480 train_time:235453ms step_avg:161.05ms step:1473/1480 train_time:235624ms step_avg:161.06ms step:1474/1480 train_time:235800ms step_avg:161.07ms step:1475/1480 train_time:235980ms step_avg:161.08ms step:1476/1480 train_time:236153ms 
step_avg:161.09ms step:1477/1480 train_time:236334ms step_avg:161.10ms step:1478/1480 train_time:236517ms step_avg:161.12ms step:1479/1480 train_time:236693ms step_avg:161.12ms step:1480/1480 train_time:236867ms step_avg:161.13ms step:1480/1480 val_loss:3.2805 train_time:236939ms step_avg:161.18ms