import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. """ assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. """ def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, n_head): super().__init__() assert dim % n_head == 0 self.n_head = n_head self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) # value residual lambda self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977 # rotary embeddings self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim # output projection self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor: B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1) k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1) v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1) v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977 q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim: int): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.n_embd, config.n_head) self.mlp = MLP(config.n_embd) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor: x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 n_layer : int = 12 n_head : int = 6 # head dim 128 suggested by @Grad62304977 n_embd : int = 768 lm_head_softcap : int = 30 class GPT(nn.Module): def __init__(self, config: GPTConfig): super().__init__() self.n_layer = config.n_layer self.lm_head_softcap = config.lm_head_softcap # U-net design by @brendanh0gan self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection weights for decoder layers self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers)) self.transformer = nn.ModuleDict(dict( wte = nn.Embedding(config.vocab_size, config.n_embd), # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning # U-net structure on token value embeddings by @leloykun vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers), h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), )) self.lm_head = CastedLinear(config.n_embd, config.vocab_size) self.lm_head.weight.data.zero_() # @Grad62304977 def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor: BLOCK_SIZE = 128 assert idx.ndim == 1 docs = (idx == 50256).cumsum(0) docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous() docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous() def document_sliding_window_causal(b, h, q_idx, kv_idx): causal_mask = q_idx >= kv_idx document_mask = docs[q_idx] == docs[kv_idx] window_mask = q_idx - kv_idx < sliding_window return causal_mask & document_mask & window_mask S = len(idx) def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor): kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda") q_idx = block_idx[:, None] causal_mask = q_idx >= kv_idx document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx]) window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE) dense_mask = causal_mask & document_mask & window_mask dense_mask = dense_mask.to(torch.int32) num_blocks = dense_mask.sum(dim=-1).to(torch.int32) indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32) num_blocks = num_blocks[None, None, :].contiguous() indices = indices[None, None, :].contiguous() return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal) block_mask = create_sliding_window_causal_mask(S, sliding_window) # forward the GPT model itself x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd) x = norm(x) # @Grad62304977 x0 = x vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1) # Store outputs for U-Net skip connections skip_connections = [] # Encoder pass - process only the first half of the blocks for i in range(self.num_encoder_layers): x = self.transformer.h[i](x, vi[i], x0, block_mask) skip_connections.append(x) # Decoder pass - process the remaining blocks with weighted skip connections for i in range(self.num_decoder_layers): x = x + self.skip_weights[i] * skip_connections.pop() # U-net structure on token value embeddings by @leloykun x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask) x = norm(x) logits = self.lm_head(x) logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977 logits = logits.float() loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1)) return loss # ----------------------------------------------------------------------------- # Our own simple Distributed Data Loader def _peek_data_shard(file: Path): # only reads the header, returns header data # header is 256 int32 header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) assert header[0] == 20240520, "magic number mismatch in the data .bin file" assert header[1] == 1, "unsupported version" return int(header[2]) # number of tokens (claimed) def _load_data_shard(file: Path, ntok: int): with file.open("rb") as f: tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True) f.seek(256 * 4) nbytes = f.readinto(tokens.numpy()) assert nbytes == 2 * ntok, "number of tokens read does not match header?" return tokens class DistributedDataLoader: def __init__(self, filename_pattern, T, process_rank, num_processes): self.process_rank = process_rank self.num_processes = num_processes self.T = T # glob files that match the pattern self.files = sorted(Path.cwd().glob(filename_pattern)) assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" # load and validate all data shards, count number of tokens in total self.ntoks = [_peek_data_shard(file) for file in self.files] assert min(self.ntoks) >= num_processes * T + 1 self.ntok_total = sum(self.ntoks) self.reset() def reset(self): self.current_shard = -1 self.advance() def advance(self): # advance to next data shard self.current_shard = (self.current_shard + 1) % len(self.files) self.current_position = self.process_rank * self.T self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard]) def next_batch(self): batch_size = self.T * self.num_processes buf = self.tokens[self.current_position:self.current_position+self.T+1] # host side async is sufficient; # no performance improvement was observed when introducing a separate stream. x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets # advance current position and load next shard if necessary self.current_position += batch_size if self.current_position + batch_size + 1 >= len(self.tokens): self.advance() return x, y # ----------------------------------------------------------------------------- # int main @dataclass class Hyperparameters: # data hyperparams input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on # optimization hyperparams batch_size : int = 8 # batch size, in sequences, across all devices sequence_length : int = 64*1024 # sequence length, in tokens num_iterations : int = 1480 # number of iterations to run warmup_iters : int = 0 cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule weight_decay : float = 0 # evaluation and logging hyperparams val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end args = Hyperparameters() # set up DDP (distributed data parallel). torchrun sets this env variable assert torch.cuda.is_available() dist.init_process_group(backend='nccl') ddp_rank = int(os.environ['RANK']) ddp_local_rank = int(os.environ['LOCAL_RANK']) ddp_world_size = int(os.environ['WORLD_SIZE']) device = f'cuda:{ddp_local_rank}' torch.cuda.set_device(device) print(f"using device: {device}") master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc. # begin logging logfile = None if master_process: run_id = str(uuid.uuid4()) logdir = 'logs/%s/' % run_id # os.makedirs(logdir, exist_ok=True) logfile = 'logs/%s.txt' % run_id # create the log file with open(logfile, "w") as f: # begin the log by printing this file (the Python code) f.write(code) f.write('='*100 + '\n') def print0(s, logonly=False): if master_process: with open(logfile, "a") as f: if not logonly: print(s) f.write(s+'\n') # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # convenience variables T = args.sequence_length # calculate the number of steps to take in the val loop. assert args.val_tokens % (T * ddp_world_size) == 0 val_steps = args.val_tokens // (T * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size assert train_accumulation_steps == 1 # load tokens train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files") print0('='*100, logonly=True) x, y = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 11:14:34 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 86W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 123W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 94W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 121W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 123W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23164ms step_avg:nanms step:2/1480 train_time:23315ms step_avg:nanms step:3/1480 train_time:23454ms step_avg:nanms step:4/1480 train_time:23595ms step_avg:nanms step:5/1480 train_time:23735ms step_avg:nanms step:6/1480 train_time:23876ms step_avg:nanms step:7/1480 train_time:24017ms step_avg:nanms step:8/1480 train_time:24161ms step_avg:nanms step:9/1480 train_time:24304ms step_avg:nanms step:10/1480 train_time:24448ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.74ms step:14/1480 train_time:566ms step_avg:141.56ms step:15/1480 train_time:708ms step_avg:141.69ms step:16/1480 train_time:851ms step_avg:141.80ms step:17/1480 train_time:995ms step_avg:142.21ms step:18/1480 train_time:1138ms step_avg:142.29ms step:19/1480 train_time:1281ms step_avg:142.38ms step:20/1480 train_time:1424ms step_avg:142.44ms step:21/1480 train_time:1566ms step_avg:142.39ms step:22/1480 train_time:1709ms step_avg:142.40ms step:23/1480 train_time:1851ms step_avg:142.37ms step:24/1480 train_time:1994ms step_avg:142.41ms step:25/1480 train_time:2138ms step_avg:142.53ms step:26/1480 train_time:2282ms step_avg:142.60ms step:27/1480 train_time:2424ms step_avg:142.61ms step:28/1480 train_time:2566ms step_avg:142.55ms step:29/1480 train_time:2707ms step_avg:142.49ms step:30/1480 train_time:2849ms step_avg:142.43ms step:31/1480 train_time:2992ms step_avg:142.48ms step:32/1480 train_time:3136ms step_avg:142.54ms step:33/1480 train_time:3281ms step_avg:142.67ms step:34/1480 train_time:3426ms step_avg:142.74ms step:35/1480 train_time:3568ms step_avg:142.71ms step:36/1480 train_time:3709ms step_avg:142.67ms step:37/1480 train_time:3850ms step_avg:142.60ms step:38/1480 train_time:3995ms step_avg:142.66ms step:39/1480 train_time:4138ms step_avg:142.69ms step:40/1480 train_time:4282ms step_avg:142.74ms step:41/1480 train_time:4425ms step_avg:142.76ms step:42/1480 train_time:4567ms step_avg:142.72ms step:43/1480 train_time:4711ms step_avg:142.75ms step:44/1480 train_time:4852ms step_avg:142.69ms step:45/1480 train_time:4993ms step_avg:142.65ms step:46/1480 train_time:5136ms step_avg:142.67ms step:47/1480 train_time:5281ms step_avg:142.72ms step:48/1480 train_time:5426ms step_avg:142.79ms step:49/1480 train_time:5569ms step_avg:142.78ms step:50/1480 train_time:5710ms step_avg:142.74ms step:51/1480 train_time:5851ms step_avg:142.71ms step:52/1480 train_time:5994ms step_avg:142.71ms step:53/1480 train_time:6138ms step_avg:142.74ms step:54/1480 train_time:6282ms step_avg:142.76ms step:55/1480 train_time:6426ms step_avg:142.81ms step:56/1480 train_time:6569ms step_avg:142.80ms step:57/1480 train_time:6710ms step_avg:142.77ms step:58/1480 train_time:6850ms step_avg:142.71ms step:59/1480 train_time:6992ms step_avg:142.69ms step:60/1480 train_time:7136ms step_avg:142.72ms step:61/1480 train_time:7278ms step_avg:142.71ms step:62/1480 train_time:7423ms step_avg:142.75ms step:63/1480 train_time:7566ms step_avg:142.76ms step:64/1480 train_time:7708ms step_avg:142.75ms step:65/1480 train_time:7849ms step_avg:142.72ms step:66/1480 train_time:7991ms step_avg:142.70ms step:67/1480 train_time:8133ms step_avg:142.69ms step:68/1480 train_time:8277ms step_avg:142.71ms step:69/1480 train_time:8421ms step_avg:142.73ms step:70/1480 train_time:8564ms step_avg:142.74ms step:71/1480 train_time:8708ms step_avg:142.75ms step:72/1480 train_time:8849ms step_avg:142.73ms step:73/1480 train_time:8990ms step_avg:142.70ms step:74/1480 train_time:9132ms step_avg:142.69ms step:75/1480 train_time:9275ms step_avg:142.69ms step:76/1480 train_time:9419ms step_avg:142.71ms step:77/1480 train_time:9561ms step_avg:142.70ms step:78/1480 train_time:9705ms step_avg:142.72ms step:79/1480 train_time:9847ms step_avg:142.71ms step:80/1480 train_time:9989ms step_avg:142.70ms step:81/1480 train_time:10132ms step_avg:142.71ms step:82/1480 train_time:10276ms step_avg:142.72ms step:83/1480 train_time:10419ms step_avg:142.73ms step:84/1480 train_time:10561ms step_avg:142.71ms step:85/1480 train_time:10704ms step_avg:142.72ms step:86/1480 train_time:10847ms step_avg:142.72ms step:87/1480 train_time:10987ms step_avg:142.69ms step:88/1480 train_time:11130ms step_avg:142.69ms step:89/1480 train_time:11271ms step_avg:142.67ms step:90/1480 train_time:11413ms step_avg:142.66ms step:91/1480 train_time:11556ms step_avg:142.66ms step:92/1480 train_time:11700ms step_avg:142.68ms step:93/1480 train_time:11842ms step_avg:142.68ms step:94/1480 train_time:11984ms step_avg:142.67ms step:95/1480 train_time:12127ms step_avg:142.67ms step:96/1480 train_time:12269ms step_avg:142.66ms step:97/1480 train_time:12411ms step_avg:142.65ms step:98/1480 train_time:12552ms step_avg:142.63ms step:99/1480 train_time:12693ms step_avg:142.62ms step:100/1480 train_time:12838ms step_avg:142.64ms step:101/1480 train_time:12980ms step_avg:142.64ms step:102/1480 train_time:13123ms step_avg:142.64ms step:103/1480 train_time:13266ms step_avg:142.65ms step:104/1480 train_time:13408ms step_avg:142.64ms step:105/1480 train_time:13551ms step_avg:142.64ms step:106/1480 train_time:13691ms step_avg:142.62ms step:107/1480 train_time:13835ms step_avg:142.63ms step:108/1480 train_time:13979ms step_avg:142.65ms step:109/1480 train_time:14123ms step_avg:142.66ms step:110/1480 train_time:14266ms step_avg:142.66ms step:111/1480 train_time:14411ms step_avg:142.68ms step:112/1480 train_time:14556ms step_avg:142.70ms step:113/1480 train_time:14703ms step_avg:142.74ms step:114/1480 train_time:14850ms step_avg:142.79ms step:115/1480 train_time:14995ms step_avg:142.81ms step:116/1480 train_time:15143ms step_avg:142.86ms step:117/1480 train_time:15291ms step_avg:142.91ms step:118/1480 train_time:15437ms step_avg:142.94ms step:119/1480 train_time:15585ms step_avg:142.98ms step:120/1480 train_time:15730ms step_avg:143.00ms step:121/1480 train_time:15875ms step_avg:143.02ms step:122/1480 train_time:16022ms step_avg:143.06ms step:123/1480 train_time:16170ms step_avg:143.10ms step:124/1480 train_time:16316ms step_avg:143.12ms step:125/1480 train_time:16463ms step_avg:143.15ms step:125/1480 val_loss:4.4160 train_time:16522ms step_avg:143.67ms step:126/1480 train_time:16618ms step_avg:143.26ms step:127/1480 train_time:16767ms step_avg:143.31ms step:128/1480 train_time:16913ms step_avg:143.33ms step:129/1480 train_time:17060ms step_avg:143.36ms step:130/1480 train_time:17206ms step_avg:143.38ms step:131/1480 train_time:17350ms step_avg:143.39ms step:132/1480 train_time:17496ms step_avg:143.41ms step:133/1480 train_time:17645ms step_avg:143.46ms step:134/1480 train_time:17793ms step_avg:143.49ms step:135/1480 train_time:17942ms step_avg:143.54ms step:136/1480 train_time:18087ms step_avg:143.55ms step:137/1480 train_time:18235ms step_avg:143.58ms step:138/1480 train_time:18382ms step_avg:143.61ms step:139/1480 train_time:18528ms step_avg:143.63ms step:140/1480 train_time:18676ms step_avg:143.66ms step:141/1480 train_time:18823ms step_avg:143.69ms step:142/1480 train_time:18971ms step_avg:143.72ms step:143/1480 train_time:19119ms step_avg:143.75ms step:144/1480 train_time:19266ms step_avg:143.77ms step:145/1480 train_time:19413ms step_avg:143.80ms step:146/1480 train_time:19561ms step_avg:143.83ms step:147/1480 train_time:19707ms step_avg:143.85ms step:148/1480 train_time:19854ms step_avg:143.87ms step:149/1480 train_time:20002ms step_avg:143.90ms step:150/1480 train_time:20148ms step_avg:143.92ms step:151/1480 train_time:20295ms step_avg:143.94ms step:152/1480 train_time:20443ms step_avg:143.97ms step:153/1480 train_time:20590ms step_avg:143.98ms step:154/1480 train_time:20737ms step_avg:144.01ms step:155/1480 train_time:20884ms step_avg:144.03ms step:156/1480 train_time:21030ms step_avg:144.04ms step:157/1480 train_time:21178ms step_avg:144.07ms step:158/1480 train_time:21324ms step_avg:144.08ms step:159/1480 train_time:21469ms step_avg:144.09ms step:160/1480 train_time:21616ms step_avg:144.11ms step:161/1480 train_time:21763ms step_avg:144.13ms step:162/1480 train_time:21908ms step_avg:144.13ms step:163/1480 train_time:22055ms step_avg:144.15ms step:164/1480 train_time:22203ms step_avg:144.17ms step:165/1480 train_time:22348ms step_avg:144.18ms step:166/1480 train_time:22494ms step_avg:144.19ms step:167/1480 train_time:22641ms step_avg:144.21ms step:168/1480 train_time:22787ms step_avg:144.22ms step:169/1480 train_time:22934ms step_avg:144.24ms step:170/1480 train_time:23081ms step_avg:144.26ms step:171/1480 train_time:23227ms step_avg:144.27ms step:172/1480 train_time:23375ms step_avg:144.29ms step:173/1480 train_time:23522ms step_avg:144.30ms step:174/1480 train_time:23667ms step_avg:144.31ms step:175/1480 train_time:23814ms step_avg:144.33ms step:176/1480 train_time:23962ms step_avg:144.35ms step:177/1480 train_time:24108ms step_avg:144.36ms step:178/1480 train_time:24255ms step_avg:144.37ms step:179/1480 train_time:24402ms step_avg:144.39ms step:180/1480 train_time:24549ms step_avg:144.40ms step:181/1480 train_time:24695ms step_avg:144.42ms step:182/1480 train_time:24843ms step_avg:144.43ms step:183/1480 train_time:24988ms step_avg:144.44ms step:184/1480 train_time:25135ms step_avg:144.45ms step:185/1480 train_time:25282ms step_avg:144.47ms step:186/1480 train_time:25428ms step_avg:144.48ms step:187/1480 train_time:25577ms step_avg:144.51ms step:188/1480 train_time:25724ms step_avg:144.52ms step:189/1480 train_time:25870ms step_avg:144.53ms step:190/1480 train_time:26018ms step_avg:144.54ms step:191/1480 train_time:26164ms step_avg:144.55ms step:192/1480 train_time:26310ms step_avg:144.56ms step:193/1480 train_time:26457ms step_avg:144.57ms step:194/1480 train_time:26604ms step_avg:144.59ms step:195/1480 train_time:26750ms step_avg:144.60ms step:196/1480 train_time:26897ms step_avg:144.61ms step:197/1480 train_time:27045ms step_avg:144.62ms step:198/1480 train_time:27190ms step_avg:144.63ms step:199/1480 train_time:27338ms step_avg:144.64ms step:200/1480 train_time:27485ms step_avg:144.66ms step:201/1480 train_time:27630ms step_avg:144.66ms step:202/1480 train_time:27778ms step_avg:144.68ms step:203/1480 train_time:27924ms step_avg:144.69ms step:204/1480 train_time:28070ms step_avg:144.69ms step:205/1480 train_time:28217ms step_avg:144.70ms step:206/1480 train_time:28364ms step_avg:144.71ms step:207/1480 train_time:28509ms step_avg:144.72ms step:208/1480 train_time:28658ms step_avg:144.74ms step:209/1480 train_time:28804ms step_avg:144.75ms step:210/1480 train_time:28950ms step_avg:144.75ms step:211/1480 train_time:29097ms step_avg:144.76ms step:212/1480 train_time:29244ms step_avg:144.77ms step:213/1480 train_time:29390ms step_avg:144.78ms step:214/1480 train_time:29537ms step_avg:144.79ms step:215/1480 train_time:29683ms step_avg:144.80ms step:216/1480 train_time:29829ms step_avg:144.80ms step:217/1480 train_time:29976ms step_avg:144.81ms step:218/1480 train_time:30122ms step_avg:144.82ms step:219/1480 train_time:30267ms step_avg:144.82ms step:220/1480 train_time:30413ms step_avg:144.82ms step:221/1480 train_time:30562ms step_avg:144.84ms step:222/1480 train_time:30712ms step_avg:144.87ms step:223/1480 train_time:30862ms step_avg:144.89ms step:224/1480 train_time:31011ms step_avg:144.91ms step:225/1480 train_time:31162ms step_avg:144.94ms step:226/1480 train_time:31312ms step_avg:144.96ms step:227/1480 train_time:31462ms step_avg:144.99ms step:228/1480 train_time:31612ms step_avg:145.01ms step:229/1480 train_time:31763ms step_avg:145.03ms step:230/1480 train_time:31912ms step_avg:145.05ms step:231/1480 train_time:32063ms step_avg:145.08ms step:232/1480 train_time:32212ms step_avg:145.10ms step:233/1480 train_time:32362ms step_avg:145.12ms step:234/1480 train_time:32512ms step_avg:145.14ms step:235/1480 train_time:32663ms step_avg:145.17ms step:236/1480 train_time:32812ms step_avg:145.19ms step:237/1480 train_time:32963ms step_avg:145.21ms step:238/1480 train_time:33111ms step_avg:145.22ms step:239/1480 train_time:33263ms step_avg:145.25ms step:240/1480 train_time:33414ms step_avg:145.28ms step:241/1480 train_time:33565ms step_avg:145.30ms step:242/1480 train_time:33715ms step_avg:145.32ms step:243/1480 train_time:33865ms step_avg:145.35ms step:244/1480 train_time:34015ms step_avg:145.36ms step:245/1480 train_time:34165ms step_avg:145.38ms step:246/1480 train_time:34316ms step_avg:145.41ms step:247/1480 train_time:34466ms step_avg:145.43ms step:248/1480 train_time:34617ms step_avg:145.45ms step:249/1480 train_time:34767ms step_avg:145.47ms step:250/1480 train_time:34917ms step_avg:145.49ms step:250/1480 val_loss:4.0030 train_time:34976ms step_avg:145.73ms step:251/1480 train_time:35073ms step_avg:145.53ms step:252/1480 train_time:35225ms step_avg:145.56ms step:253/1480 train_time:35374ms step_avg:145.57ms step:254/1480 train_time:35524ms step_avg:145.59ms step:255/1480 train_time:35673ms step_avg:145.60ms step:256/1480 train_time:35822ms step_avg:145.62ms step:257/1480 train_time:35972ms step_avg:145.64ms step:258/1480 train_time:36124ms step_avg:145.66ms step:259/1480 train_time:36276ms step_avg:145.69ms step:260/1480 train_time:36427ms step_avg:145.71ms step:261/1480 train_time:36577ms step_avg:145.73ms step:262/1480 train_time:36727ms step_avg:145.74ms step:263/1480 train_time:36877ms step_avg:145.76ms step:264/1480 train_time:37027ms step_avg:145.78ms step:265/1480 train_time:37178ms step_avg:145.80ms step:266/1480 train_time:37329ms step_avg:145.82ms step:267/1480 train_time:37481ms step_avg:145.84ms step:268/1480 train_time:37630ms step_avg:145.85ms step:269/1480 train_time:37781ms step_avg:145.87ms step:270/1480 train_time:37929ms step_avg:145.88ms step:271/1480 train_time:38077ms step_avg:145.89ms step:272/1480 train_time:38228ms step_avg:145.91ms step:273/1480 train_time:38378ms step_avg:145.92ms step:274/1480 train_time:38528ms step_avg:145.94ms step:275/1480 train_time:38678ms step_avg:145.96ms step:276/1480 train_time:38829ms step_avg:145.97ms step:277/1480 train_time:38980ms step_avg:145.99ms step:278/1480 train_time:39129ms step_avg:146.00ms step:279/1480 train_time:39280ms step_avg:146.02ms step:280/1480 train_time:39429ms step_avg:146.03ms step:281/1480 train_time:39580ms step_avg:146.05ms step:282/1480 train_time:39730ms step_avg:146.07ms step:283/1480 train_time:39881ms step_avg:146.08ms step:284/1480 train_time:40030ms step_avg:146.10ms step:285/1480 train_time:40182ms step_avg:146.12ms step:286/1480 train_time:40330ms step_avg:146.12ms step:287/1480 train_time:40483ms step_avg:146.15ms step:288/1480 train_time:40631ms step_avg:146.15ms step:289/1480 train_time:40783ms step_avg:146.18ms step:290/1480 train_time:40932ms step_avg:146.19ms step:291/1480 train_time:41083ms step_avg:146.20ms step:292/1480 train_time:41232ms step_avg:146.21ms step:293/1480 train_time:41383ms step_avg:146.23ms step:294/1480 train_time:41532ms step_avg:146.24ms step:295/1480 train_time:41682ms step_avg:146.25ms step:296/1480 train_time:41833ms step_avg:146.27ms step:297/1480 train_time:41983ms step_avg:146.28ms step:298/1480 train_time:42134ms step_avg:146.30ms step:299/1480 train_time:42284ms step_avg:146.31ms step:300/1480 train_time:42434ms step_avg:146.32ms step:301/1480 train_time:42584ms step_avg:146.34ms step:302/1480 train_time:42735ms step_avg:146.35ms step:303/1480 train_time:42886ms step_avg:146.37ms step:304/1480 train_time:43036ms step_avg:146.38ms step:305/1480 train_time:43186ms step_avg:146.39ms step:306/1480 train_time:43336ms step_avg:146.41ms step:307/1480 train_time:43487ms step_avg:146.42ms step:308/1480 train_time:43637ms step_avg:146.43ms step:309/1480 train_time:43788ms step_avg:146.45ms step:310/1480 train_time:43940ms step_avg:146.47ms step:311/1480 train_time:44090ms step_avg:146.48ms step:312/1480 train_time:44241ms step_avg:146.49ms step:313/1480 train_time:44391ms step_avg:146.51ms step:314/1480 train_time:44542ms step_avg:146.52ms step:315/1480 train_time:44691ms step_avg:146.53ms step:316/1480 train_time:44842ms step_avg:146.54ms step:317/1480 train_time:44993ms step_avg:146.56ms step:318/1480 train_time:45143ms step_avg:146.57ms step:319/1480 train_time:45295ms step_avg:146.59ms step:320/1480 train_time:45445ms step_avg:146.60ms step:321/1480 train_time:45596ms step_avg:146.61ms step:322/1480 train_time:45747ms step_avg:146.62ms step:323/1480 train_time:45897ms step_avg:146.64ms step:324/1480 train_time:46047ms step_avg:146.64ms step:325/1480 train_time:46198ms step_avg:146.66ms step:326/1480 train_time:46350ms step_avg:146.68ms step:327/1480 train_time:46502ms step_avg:146.69ms step:328/1480 train_time:46650ms step_avg:146.70ms step:329/1480 train_time:46800ms step_avg:146.71ms step:330/1480 train_time:46951ms step_avg:146.72ms step:331/1480 train_time:47105ms step_avg:146.74ms step:332/1480 train_time:47259ms step_avg:146.77ms step:333/1480 train_time:47412ms step_avg:146.79ms step:334/1480 train_time:47565ms step_avg:146.81ms step:335/1480 train_time:47719ms step_avg:146.83ms step:336/1480 train_time:47873ms step_avg:146.85ms step:337/1480 train_time:48027ms step_avg:146.87ms step:338/1480 train_time:48180ms step_avg:146.89ms step:339/1480 train_time:48333ms step_avg:146.91ms step:340/1480 train_time:48486ms step_avg:146.93ms step:341/1480 train_time:48640ms step_avg:146.95ms step:342/1480 train_time:48793ms step_avg:146.97ms step:343/1480 train_time:48946ms step_avg:146.99ms step:344/1480 train_time:49101ms step_avg:147.01ms step:345/1480 train_time:49257ms step_avg:147.04ms step:346/1480 train_time:49411ms step_avg:147.06ms step:347/1480 train_time:49565ms step_avg:147.08ms step:348/1480 train_time:49719ms step_avg:147.10ms step:349/1480 train_time:49873ms step_avg:147.12ms step:350/1480 train_time:50026ms step_avg:147.13ms step:351/1480 train_time:50179ms step_avg:147.15ms step:352/1480 train_time:50331ms step_avg:147.17ms step:353/1480 train_time:50484ms step_avg:147.18ms step:354/1480 train_time:50639ms step_avg:147.21ms step:355/1480 train_time:50795ms step_avg:147.23ms step:356/1480 train_time:50949ms step_avg:147.25ms step:357/1480 train_time:51103ms step_avg:147.27ms step:358/1480 train_time:51256ms step_avg:147.29ms step:359/1480 train_time:51409ms step_avg:147.31ms step:360/1480 train_time:51563ms step_avg:147.32ms step:361/1480 train_time:51719ms step_avg:147.35ms step:362/1480 train_time:51874ms step_avg:147.37ms step:363/1480 train_time:52028ms step_avg:147.39ms step:364/1480 train_time:52182ms step_avg:147.41ms step:365/1480 train_time:52337ms step_avg:147.43ms step:366/1480 train_time:52490ms step_avg:147.44ms step:367/1480 train_time:52642ms step_avg:147.46ms step:368/1480 train_time:52796ms step_avg:147.47ms step:369/1480 train_time:52949ms step_avg:147.49ms step:370/1480 train_time:53102ms step_avg:147.51ms step:371/1480 train_time:53255ms step_avg:147.52ms step:372/1480 train_time:53409ms step_avg:147.54ms step:373/1480 train_time:53562ms step_avg:147.55ms step:374/1480 train_time:53715ms step_avg:147.57ms step:375/1480 train_time:53868ms step_avg:147.58ms step:375/1480 val_loss:3.8105 train_time:53928ms step_avg:147.75ms step:376/1480 train_time:54027ms step_avg:147.62ms step:377/1480 train_time:54185ms step_avg:147.64ms step:378/1480 train_time:54337ms step_avg:147.66ms step:379/1480 train_time:54490ms step_avg:147.67ms step:380/1480 train_time:54644ms step_avg:147.69ms step:381/1480 train_time:54796ms step_avg:147.70ms step:382/1480 train_time:54950ms step_avg:147.71ms step:383/1480 train_time:55106ms step_avg:147.74ms step:384/1480 train_time:55261ms step_avg:147.76ms step:385/1480 train_time:55414ms step_avg:147.77ms step:386/1480 train_time:55568ms step_avg:147.79ms step:387/1480 train_time:55721ms step_avg:147.80ms step:388/1480 train_time:55875ms step_avg:147.82ms step:389/1480 train_time:56029ms step_avg:147.83ms step:390/1480 train_time:56183ms step_avg:147.85ms step:391/1480 train_time:56336ms step_avg:147.86ms step:392/1480 train_time:56490ms step_avg:147.88ms step:393/1480 train_time:56644ms step_avg:147.89ms step:394/1480 train_time:56797ms step_avg:147.91ms step:395/1480 train_time:56950ms step_avg:147.92ms step:396/1480 train_time:57105ms step_avg:147.94ms step:397/1480 train_time:57260ms step_avg:147.96ms step:398/1480 train_time:57412ms step_avg:147.97ms step:399/1480 train_time:57565ms step_avg:147.98ms step:400/1480 train_time:57718ms step_avg:147.99ms step:401/1480 train_time:57872ms step_avg:148.01ms step:402/1480 train_time:58027ms step_avg:148.03ms step:403/1480 train_time:58182ms step_avg:148.05ms step:404/1480 train_time:58336ms step_avg:148.06ms step:405/1480 train_time:58489ms step_avg:148.07ms step:406/1480 train_time:58642ms step_avg:148.09ms step:407/1480 train_time:58795ms step_avg:148.10ms step:408/1480 train_time:58949ms step_avg:148.11ms step:409/1480 train_time:59104ms step_avg:148.13ms step:410/1480 train_time:59257ms step_avg:148.14ms step:411/1480 train_time:59411ms step_avg:148.16ms step:412/1480 train_time:59564ms step_avg:148.17ms step:413/1480 train_time:59717ms step_avg:148.18ms step:414/1480 train_time:59871ms step_avg:148.20ms step:415/1480 train_time:60024ms step_avg:148.21ms step:416/1480 train_time:60178ms step_avg:148.22ms step:417/1480 train_time:60332ms step_avg:148.24ms step:418/1480 train_time:60487ms step_avg:148.25ms step:419/1480 train_time:60641ms step_avg:148.27ms step:420/1480 train_time:60794ms step_avg:148.28ms step:421/1480 train_time:60946ms step_avg:148.29ms step:422/1480 train_time:61100ms step_avg:148.30ms step:423/1480 train_time:61253ms step_avg:148.31ms step:424/1480 train_time:61407ms step_avg:148.33ms step:425/1480 train_time:61561ms step_avg:148.34ms step:426/1480 train_time:61714ms step_avg:148.35ms step:427/1480 train_time:61868ms step_avg:148.36ms step:428/1480 train_time:62022ms step_avg:148.38ms step:429/1480 train_time:62174ms step_avg:148.39ms step:430/1480 train_time:62327ms step_avg:148.40ms step:431/1480 train_time:62481ms step_avg:148.41ms step:432/1480 train_time:62635ms step_avg:148.42ms step:433/1480 train_time:62789ms step_avg:148.44ms step:434/1480 train_time:62944ms step_avg:148.45ms step:435/1480 train_time:63098ms step_avg:148.47ms step:436/1480 train_time:63250ms step_avg:148.47ms step:437/1480 train_time:63404ms step_avg:148.49ms step:438/1480 train_time:63558ms step_avg:148.50ms step:439/1480 train_time:63712ms step_avg:148.51ms step:440/1480 train_time:63867ms step_avg:148.53ms step:441/1480 train_time:64025ms step_avg:148.55ms step:442/1480 train_time:64183ms step_avg:148.57ms step:443/1480 train_time:64340ms step_avg:148.59ms step:444/1480 train_time:64495ms step_avg:148.61ms step:445/1480 train_time:64650ms step_avg:148.62ms step:446/1480 train_time:64807ms step_avg:148.64ms step:447/1480 train_time:64965ms step_avg:148.66ms step:448/1480 train_time:65121ms step_avg:148.68ms step:449/1480 train_time:65280ms step_avg:148.70ms step:450/1480 train_time:65438ms step_avg:148.72ms step:451/1480 train_time:65595ms step_avg:148.74ms step:452/1480 train_time:65750ms step_avg:148.76ms step:453/1480 train_time:65907ms step_avg:148.78ms step:454/1480 train_time:66065ms step_avg:148.80ms step:455/1480 train_time:66222ms step_avg:148.81ms step:456/1480 train_time:66379ms step_avg:148.83ms step:457/1480 train_time:66535ms step_avg:148.85ms step:458/1480 train_time:66691ms step_avg:148.86ms step:459/1480 train_time:66848ms step_avg:148.88ms step:460/1480 train_time:67006ms step_avg:148.90ms step:461/1480 train_time:67166ms step_avg:148.93ms step:462/1480 train_time:67324ms step_avg:148.95ms step:463/1480 train_time:67482ms step_avg:148.97ms step:464/1480 train_time:67638ms step_avg:148.98ms step:465/1480 train_time:67793ms step_avg:149.00ms step:466/1480 train_time:67949ms step_avg:149.01ms step:467/1480 train_time:68106ms step_avg:149.03ms step:468/1480 train_time:68262ms step_avg:149.04ms step:469/1480 train_time:68417ms step_avg:149.06ms step:470/1480 train_time:68574ms step_avg:149.07ms step:471/1480 train_time:68731ms step_avg:149.09ms step:472/1480 train_time:68887ms step_avg:149.11ms step:473/1480 train_time:69044ms step_avg:149.12ms step:474/1480 train_time:69201ms step_avg:149.14ms step:475/1480 train_time:69355ms step_avg:149.15ms step:476/1480 train_time:69512ms step_avg:149.17ms step:477/1480 train_time:69669ms step_avg:149.19ms step:478/1480 train_time:69826ms step_avg:149.20ms step:479/1480 train_time:69984ms step_avg:149.22ms step:480/1480 train_time:70142ms step_avg:149.24ms step:481/1480 train_time:70299ms step_avg:149.26ms step:482/1480 train_time:70455ms step_avg:149.27ms step:483/1480 train_time:70611ms step_avg:149.28ms step:484/1480 train_time:70769ms step_avg:149.30ms step:485/1480 train_time:70927ms step_avg:149.32ms step:486/1480 train_time:71084ms step_avg:149.34ms step:487/1480 train_time:71242ms step_avg:149.35ms step:488/1480 train_time:71400ms step_avg:149.37ms step:489/1480 train_time:71556ms step_avg:149.39ms step:490/1480 train_time:71713ms step_avg:149.40ms step:491/1480 train_time:71871ms step_avg:149.42ms step:492/1480 train_time:72028ms step_avg:149.44ms step:493/1480 train_time:72187ms step_avg:149.46ms step:494/1480 train_time:72344ms step_avg:149.47ms step:495/1480 train_time:72502ms step_avg:149.49ms step:496/1480 train_time:72661ms step_avg:149.51ms step:497/1480 train_time:72819ms step_avg:149.53ms step:498/1480 train_time:72975ms step_avg:149.54ms step:499/1480 train_time:73131ms step_avg:149.55ms step:500/1480 train_time:73288ms step_avg:149.57ms step:500/1480 val_loss:3.6867 train_time:73349ms step_avg:149.69ms step:501/1480 train_time:73448ms step_avg:149.59ms step:502/1480 train_time:73606ms step_avg:149.61ms step:503/1480 train_time:73762ms step_avg:149.62ms step:504/1480 train_time:73917ms step_avg:149.63ms step:505/1480 train_time:74073ms step_avg:149.64ms step:506/1480 train_time:74229ms step_avg:149.65ms step:507/1480 train_time:74385ms step_avg:149.67ms step:508/1480 train_time:74543ms step_avg:149.69ms step:509/1480 train_time:74700ms step_avg:149.70ms step:510/1480 train_time:74857ms step_avg:149.71ms step:511/1480 train_time:75014ms step_avg:149.73ms step:512/1480 train_time:75173ms step_avg:149.75ms step:513/1480 train_time:75328ms step_avg:149.76ms step:514/1480 train_time:75484ms step_avg:149.77ms step:515/1480 train_time:75641ms step_avg:149.78ms step:516/1480 train_time:75800ms step_avg:149.80ms step:517/1480 train_time:75957ms step_avg:149.82ms step:518/1480 train_time:76114ms step_avg:149.83ms step:519/1480 train_time:76273ms step_avg:149.85ms step:520/1480 train_time:76431ms step_avg:149.86ms step:521/1480 train_time:76588ms step_avg:149.88ms step:522/1480 train_time:76743ms step_avg:149.89ms step:523/1480 train_time:76900ms step_avg:149.90ms step:524/1480 train_time:77057ms step_avg:149.92ms step:525/1480 train_time:77216ms step_avg:149.93ms step:526/1480 train_time:77375ms step_avg:149.95ms step:527/1480 train_time:77532ms step_avg:149.97ms step:528/1480 train_time:77689ms step_avg:149.98ms step:529/1480 train_time:77845ms step_avg:149.99ms step:530/1480 train_time:78002ms step_avg:150.00ms step:531/1480 train_time:78158ms step_avg:150.02ms step:532/1480 train_time:78314ms step_avg:150.03ms step:533/1480 train_time:78471ms step_avg:150.04ms step:534/1480 train_time:78629ms step_avg:150.05ms step:535/1480 train_time:78784ms step_avg:150.06ms step:536/1480 train_time:78942ms step_avg:150.08ms step:537/1480 train_time:79099ms step_avg:150.09ms step:538/1480 train_time:79257ms step_avg:150.11ms step:539/1480 train_time:79416ms step_avg:150.12ms step:540/1480 train_time:79573ms step_avg:150.14ms step:541/1480 train_time:79729ms step_avg:150.15ms step:542/1480 train_time:79885ms step_avg:150.16ms step:543/1480 train_time:80040ms step_avg:150.17ms step:544/1480 train_time:80197ms step_avg:150.18ms step:545/1480 train_time:80355ms step_avg:150.20ms step:546/1480 train_time:80511ms step_avg:150.21ms step:547/1480 train_time:80667ms step_avg:150.22ms step:548/1480 train_time:80825ms step_avg:150.23ms step:549/1480 train_time:80981ms step_avg:150.24ms step:550/1480 train_time:81139ms step_avg:150.26ms step:551/1480 train_time:81298ms step_avg:150.27ms step:552/1480 train_time:81458ms step_avg:150.29ms step:553/1480 train_time:81619ms step_avg:150.31ms step:554/1480 train_time:81780ms step_avg:150.33ms step:555/1480 train_time:81939ms step_avg:150.35ms step:556/1480 train_time:82099ms step_avg:150.36ms step:557/1480 train_time:82258ms step_avg:150.38ms step:558/1480 train_time:82418ms step_avg:150.40ms step:559/1480 train_time:82577ms step_avg:150.41ms step:560/1480 train_time:82737ms step_avg:150.43ms step:561/1480 train_time:82897ms step_avg:150.45ms step:562/1480 train_time:83058ms step_avg:150.47ms step:563/1480 train_time:83217ms step_avg:150.48ms step:564/1480 train_time:83377ms step_avg:150.50ms step:565/1480 train_time:83537ms step_avg:150.52ms step:566/1480 train_time:83699ms step_avg:150.54ms step:567/1480 train_time:83859ms step_avg:150.55ms step:568/1480 train_time:84018ms step_avg:150.57ms step:569/1480 train_time:84177ms step_avg:150.58ms step:570/1480 train_time:84336ms step_avg:150.60ms step:571/1480 train_time:84497ms step_avg:150.62ms step:572/1480 train_time:84657ms step_avg:150.63ms step:573/1480 train_time:84817ms step_avg:150.65ms step:574/1480 train_time:84979ms step_avg:150.67ms step:575/1480 train_time:85139ms step_avg:150.69ms step:576/1480 train_time:85299ms step_avg:150.70ms step:577/1480 train_time:85458ms step_avg:150.72ms step:578/1480 train_time:85618ms step_avg:150.74ms step:579/1480 train_time:85778ms step_avg:150.75ms step:580/1480 train_time:85938ms step_avg:150.77ms step:581/1480 train_time:86098ms step_avg:150.79ms step:582/1480 train_time:86258ms step_avg:150.80ms step:583/1480 train_time:86418ms step_avg:150.82ms step:584/1480 train_time:86578ms step_avg:150.83ms step:585/1480 train_time:86737ms step_avg:150.85ms step:586/1480 train_time:86898ms step_avg:150.87ms step:587/1480 train_time:87058ms step_avg:150.88ms step:588/1480 train_time:87217ms step_avg:150.89ms step:589/1480 train_time:87378ms step_avg:150.91ms step:590/1480 train_time:87538ms step_avg:150.93ms step:591/1480 train_time:87698ms step_avg:150.94ms step:592/1480 train_time:87858ms step_avg:150.96ms step:593/1480 train_time:88019ms step_avg:150.98ms step:594/1480 train_time:88180ms step_avg:150.99ms step:595/1480 train_time:88340ms step_avg:151.01ms step:596/1480 train_time:88501ms step_avg:151.03ms step:597/1480 train_time:88660ms step_avg:151.04ms step:598/1480 train_time:88817ms step_avg:151.05ms step:599/1480 train_time:88976ms step_avg:151.06ms step:600/1480 train_time:89136ms step_avg:151.08ms step:601/1480 train_time:89296ms step_avg:151.09ms step:602/1480 train_time:89456ms step_avg:151.11ms step:603/1480 train_time:89616ms step_avg:151.12ms step:604/1480 train_time:89776ms step_avg:151.14ms step:605/1480 train_time:89937ms step_avg:151.15ms step:606/1480 train_time:90100ms step_avg:151.18ms step:607/1480 train_time:90261ms step_avg:151.19ms step:608/1480 train_time:90420ms step_avg:151.20ms step:609/1480 train_time:90580ms step_avg:151.22ms step:610/1480 train_time:90737ms step_avg:151.23ms step:611/1480 train_time:90898ms step_avg:151.24ms step:612/1480 train_time:91058ms step_avg:151.26ms step:613/1480 train_time:91220ms step_avg:151.28ms step:614/1480 train_time:91379ms step_avg:151.29ms step:615/1480 train_time:91538ms step_avg:151.30ms step:616/1480 train_time:91698ms step_avg:151.32ms step:617/1480 train_time:91857ms step_avg:151.33ms step:618/1480 train_time:92016ms step_avg:151.34ms step:619/1480 train_time:92176ms step_avg:151.36ms step:620/1480 train_time:92336ms step_avg:151.37ms step:621/1480 train_time:92497ms step_avg:151.39ms step:622/1480 train_time:92657ms step_avg:151.40ms step:623/1480 train_time:92818ms step_avg:151.42ms step:624/1480 train_time:92978ms step_avg:151.43ms step:625/1480 train_time:93137ms step_avg:151.44ms step:625/1480 val_loss:3.6057 train_time:93201ms step_avg:151.55ms step:626/1480 train_time:93300ms step_avg:151.46ms step:627/1480 train_time:93458ms step_avg:151.47ms step:628/1480 train_time:93615ms step_avg:151.48ms step:629/1480 train_time:93774ms step_avg:151.49ms step:630/1480 train_time:93933ms step_avg:151.50ms step:631/1480 train_time:94091ms step_avg:151.51ms step:632/1480 train_time:94252ms step_avg:151.53ms step:633/1480 train_time:94412ms step_avg:151.54ms step:634/1480 train_time:94574ms step_avg:151.56ms step:635/1480 train_time:94733ms step_avg:151.57ms step:636/1480 train_time:94892ms step_avg:151.59ms step:637/1480 train_time:95053ms step_avg:151.60ms step:638/1480 train_time:95213ms step_avg:151.61ms step:639/1480 train_time:95374ms step_avg:151.63ms step:640/1480 train_time:95535ms step_avg:151.64ms step:641/1480 train_time:95694ms step_avg:151.65ms step:642/1480 train_time:95853ms step_avg:151.67ms step:643/1480 train_time:96011ms step_avg:151.68ms step:644/1480 train_time:96170ms step_avg:151.69ms step:645/1480 train_time:96330ms step_avg:151.70ms step:646/1480 train_time:96489ms step_avg:151.71ms step:647/1480 train_time:96648ms step_avg:151.72ms step:648/1480 train_time:96808ms step_avg:151.74ms step:649/1480 train_time:96968ms step_avg:151.75ms step:650/1480 train_time:97128ms step_avg:151.76ms step:651/1480 train_time:97287ms step_avg:151.77ms step:652/1480 train_time:97446ms step_avg:151.79ms step:653/1480 train_time:97604ms step_avg:151.79ms step:654/1480 train_time:97762ms step_avg:151.80ms step:655/1480 train_time:97919ms step_avg:151.81ms step:656/1480 train_time:98079ms step_avg:151.82ms step:657/1480 train_time:98239ms step_avg:151.84ms step:658/1480 train_time:98398ms step_avg:151.85ms step:659/1480 train_time:98558ms step_avg:151.86ms step:660/1480 train_time:98719ms step_avg:151.88ms step:661/1480 train_time:98880ms step_avg:151.89ms step:662/1480 train_time:99040ms step_avg:151.90ms step:663/1480 train_time:99200ms step_avg:151.91ms step:664/1480 train_time:99360ms step_avg:151.93ms step:665/1480 train_time:99521ms step_avg:151.94ms step:666/1480 train_time:99681ms step_avg:151.95ms step:667/1480 train_time:99843ms step_avg:151.97ms step:668/1480 train_time:100004ms step_avg:151.98ms step:669/1480 train_time:100167ms step_avg:152.00ms step:670/1480 train_time:100328ms step_avg:152.01ms step:671/1480 train_time:100488ms step_avg:152.02ms step:672/1480 train_time:100649ms step_avg:152.04ms step:673/1480 train_time:100812ms step_avg:152.05ms step:674/1480 train_time:100975ms step_avg:152.07ms step:675/1480 train_time:101136ms step_avg:152.08ms step:676/1480 train_time:101298ms step_avg:152.10ms step:677/1480 train_time:101459ms step_avg:152.11ms step:678/1480 train_time:101620ms step_avg:152.13ms step:679/1480 train_time:101782ms step_avg:152.14ms step:680/1480 train_time:101945ms step_avg:152.16ms step:681/1480 train_time:102105ms step_avg:152.17ms step:682/1480 train_time:102268ms step_avg:152.18ms step:683/1480 train_time:102429ms step_avg:152.20ms step:684/1480 train_time:102590ms step_avg:152.21ms step:685/1480 train_time:102755ms step_avg:152.23ms step:686/1480 train_time:102915ms step_avg:152.24ms step:687/1480 train_time:103076ms step_avg:152.25ms step:688/1480 train_time:103239ms step_avg:152.27ms step:689/1480 train_time:103401ms step_avg:152.28ms step:690/1480 train_time:103564ms step_avg:152.30ms step:691/1480 train_time:103724ms step_avg:152.31ms step:692/1480 train_time:103885ms step_avg:152.32ms step:693/1480 train_time:104047ms step_avg:152.34ms step:694/1480 train_time:104210ms step_avg:152.35ms step:695/1480 train_time:104372ms step_avg:152.37ms step:696/1480 train_time:104534ms step_avg:152.38ms step:697/1480 train_time:104697ms step_avg:152.40ms step:698/1480 train_time:104857ms step_avg:152.41ms step:699/1480 train_time:105019ms step_avg:152.42ms step:700/1480 train_time:105181ms step_avg:152.44ms step:701/1480 train_time:105340ms step_avg:152.45ms step:702/1480 train_time:105502ms step_avg:152.46ms step:703/1480 train_time:105662ms step_avg:152.47ms step:704/1480 train_time:105822ms step_avg:152.48ms step:705/1480 train_time:105984ms step_avg:152.49ms step:706/1480 train_time:106148ms step_avg:152.51ms step:707/1480 train_time:106310ms step_avg:152.52ms step:708/1480 train_time:106472ms step_avg:152.54ms step:709/1480 train_time:106635ms step_avg:152.55ms step:710/1480 train_time:106795ms step_avg:152.56ms step:711/1480 train_time:106957ms step_avg:152.58ms step:712/1480 train_time:107121ms step_avg:152.59ms step:713/1480 train_time:107284ms step_avg:152.61ms step:714/1480 train_time:107444ms step_avg:152.62ms step:715/1480 train_time:107603ms step_avg:152.63ms step:716/1480 train_time:107762ms step_avg:152.64ms step:717/1480 train_time:107925ms step_avg:152.65ms step:718/1480 train_time:108084ms step_avg:152.66ms step:719/1480 train_time:108244ms step_avg:152.67ms step:720/1480 train_time:108407ms step_avg:152.69ms step:721/1480 train_time:108571ms step_avg:152.70ms step:722/1480 train_time:108735ms step_avg:152.72ms step:723/1480 train_time:108896ms step_avg:152.73ms step:724/1480 train_time:109057ms step_avg:152.74ms step:725/1480 train_time:109219ms step_avg:152.75ms step:726/1480 train_time:109381ms step_avg:152.77ms step:727/1480 train_time:109545ms step_avg:152.78ms step:728/1480 train_time:109705ms step_avg:152.79ms step:729/1480 train_time:109867ms step_avg:152.81ms step:730/1480 train_time:110031ms step_avg:152.82ms step:731/1480 train_time:110192ms step_avg:152.83ms step:732/1480 train_time:110353ms step_avg:152.84ms step:733/1480 train_time:110514ms step_avg:152.86ms step:734/1480 train_time:110676ms step_avg:152.87ms step:735/1480 train_time:110837ms step_avg:152.88ms step:736/1480 train_time:110999ms step_avg:152.89ms step:737/1480 train_time:111159ms step_avg:152.90ms step:738/1480 train_time:111320ms step_avg:152.91ms step:739/1480 train_time:111478ms step_avg:152.92ms step:740/1480 train_time:111643ms step_avg:152.94ms step:741/1480 train_time:111806ms step_avg:152.95ms step:742/1480 train_time:111969ms step_avg:152.96ms step:743/1480 train_time:112131ms step_avg:152.98ms step:744/1480 train_time:112296ms step_avg:152.99ms step:745/1480 train_time:112459ms step_avg:153.01ms step:746/1480 train_time:112618ms step_avg:153.01ms step:747/1480 train_time:112781ms step_avg:153.03ms step:748/1480 train_time:112947ms step_avg:153.04ms step:749/1480 train_time:113111ms step_avg:153.06ms step:750/1480 train_time:113272ms step_avg:153.07ms step:750/1480 val_loss:3.5480 train_time:113336ms step_avg:153.16ms step:751/1480 train_time:113439ms step_avg:153.09ms step:752/1480 train_time:113599ms step_avg:153.10ms step:753/1480 train_time:113760ms step_avg:153.11ms step:754/1480 train_time:113919ms step_avg:153.12ms step:755/1480 train_time:114080ms step_avg:153.13ms step:756/1480 train_time:114242ms step_avg:153.14ms step:757/1480 train_time:114406ms step_avg:153.15ms step:758/1480 train_time:114566ms step_avg:153.16ms step:759/1480 train_time:114728ms step_avg:153.18ms step:760/1480 train_time:114890ms step_avg:153.19ms step:761/1480 train_time:115054ms step_avg:153.20ms step:762/1480 train_time:115216ms step_avg:153.21ms step:763/1480 train_time:115378ms step_avg:153.22ms step:764/1480 train_time:115541ms step_avg:153.24ms step:765/1480 train_time:115701ms step_avg:153.25ms step:766/1480 train_time:115864ms step_avg:153.26ms step:767/1480 train_time:116025ms step_avg:153.27ms step:768/1480 train_time:116186ms step_avg:153.28ms step:769/1480 train_time:116347ms step_avg:153.29ms step:770/1480 train_time:116512ms step_avg:153.30ms step:771/1480 train_time:116676ms step_avg:153.32ms step:772/1480 train_time:116840ms step_avg:153.33ms step:773/1480 train_time:117001ms step_avg:153.34ms step:774/1480 train_time:117164ms step_avg:153.36ms step:775/1480 train_time:117325ms step_avg:153.37ms step:776/1480 train_time:117489ms step_avg:153.38ms step:777/1480 train_time:117656ms step_avg:153.40ms step:778/1480 train_time:117820ms step_avg:153.41ms step:779/1480 train_time:117983ms step_avg:153.42ms step:780/1480 train_time:118145ms step_avg:153.44ms step:781/1480 train_time:118310ms step_avg:153.45ms step:782/1480 train_time:118474ms step_avg:153.46ms step:783/1480 train_time:118637ms step_avg:153.48ms step:784/1480 train_time:118800ms step_avg:153.49ms step:785/1480 train_time:118962ms step_avg:153.50ms step:786/1480 train_time:119125ms step_avg:153.51ms step:787/1480 train_time:119287ms step_avg:153.52ms step:788/1480 train_time:119451ms step_avg:153.54ms step:789/1480 train_time:119615ms step_avg:153.55ms step:790/1480 train_time:119779ms step_avg:153.56ms step:791/1480 train_time:119945ms step_avg:153.58ms step:792/1480 train_time:120110ms step_avg:153.59ms step:793/1480 train_time:120272ms step_avg:153.60ms step:794/1480 train_time:120438ms step_avg:153.62ms step:795/1480 train_time:120601ms step_avg:153.63ms step:796/1480 train_time:120766ms step_avg:153.65ms step:797/1480 train_time:120929ms step_avg:153.66ms step:798/1480 train_time:121093ms step_avg:153.67ms step:799/1480 train_time:121259ms step_avg:153.69ms step:800/1480 train_time:121423ms step_avg:153.70ms step:801/1480 train_time:121584ms step_avg:153.71ms step:802/1480 train_time:121752ms step_avg:153.73ms step:803/1480 train_time:121915ms step_avg:153.74ms step:804/1480 train_time:122077ms step_avg:153.75ms step:805/1480 train_time:122241ms step_avg:153.76ms step:806/1480 train_time:122403ms step_avg:153.77ms step:807/1480 train_time:122564ms step_avg:153.78ms step:808/1480 train_time:122728ms step_avg:153.79ms step:809/1480 train_time:122891ms step_avg:153.81ms step:810/1480 train_time:123053ms step_avg:153.82ms step:811/1480 train_time:123216ms step_avg:153.83ms step:812/1480 train_time:123379ms step_avg:153.84ms step:813/1480 train_time:123540ms step_avg:153.85ms step:814/1480 train_time:123702ms step_avg:153.86ms step:815/1480 train_time:123867ms step_avg:153.87ms step:816/1480 train_time:124033ms step_avg:153.89ms step:817/1480 train_time:124197ms step_avg:153.90ms step:818/1480 train_time:124359ms step_avg:153.91ms step:819/1480 train_time:124523ms step_avg:153.92ms step:820/1480 train_time:124688ms step_avg:153.94ms step:821/1480 train_time:124849ms step_avg:153.94ms step:822/1480 train_time:125014ms step_avg:153.96ms step:823/1480 train_time:125177ms step_avg:153.97ms step:824/1480 train_time:125340ms step_avg:153.98ms step:825/1480 train_time:125504ms step_avg:153.99ms step:826/1480 train_time:125671ms step_avg:154.01ms step:827/1480 train_time:125836ms step_avg:154.02ms step:828/1480 train_time:126000ms step_avg:154.03ms step:829/1480 train_time:126163ms step_avg:154.05ms step:830/1480 train_time:126327ms step_avg:154.06ms step:831/1480 train_time:126491ms step_avg:154.07ms step:832/1480 train_time:126655ms step_avg:154.08ms step:833/1480 train_time:126820ms step_avg:154.10ms step:834/1480 train_time:126984ms step_avg:154.11ms step:835/1480 train_time:127146ms step_avg:154.12ms step:836/1480 train_time:127311ms step_avg:154.13ms step:837/1480 train_time:127474ms step_avg:154.14ms step:838/1480 train_time:127639ms step_avg:154.15ms step:839/1480 train_time:127801ms step_avg:154.16ms step:840/1480 train_time:127962ms step_avg:154.17ms step:841/1480 train_time:128122ms step_avg:154.18ms step:842/1480 train_time:128287ms step_avg:154.19ms step:843/1480 train_time:128447ms step_avg:154.20ms step:844/1480 train_time:128610ms step_avg:154.21ms step:845/1480 train_time:128774ms step_avg:154.22ms step:846/1480 train_time:128938ms step_avg:154.23ms step:847/1480 train_time:129101ms step_avg:154.24ms step:848/1480 train_time:129264ms step_avg:154.25ms step:849/1480 train_time:129426ms step_avg:154.26ms step:850/1480 train_time:129589ms step_avg:154.27ms step:851/1480 train_time:129755ms step_avg:154.29ms step:852/1480 train_time:129918ms step_avg:154.30ms step:853/1480 train_time:130080ms step_avg:154.31ms step:854/1480 train_time:130244ms step_avg:154.32ms step:855/1480 train_time:130408ms step_avg:154.33ms step:856/1480 train_time:130570ms step_avg:154.34ms step:857/1480 train_time:130736ms step_avg:154.35ms step:858/1480 train_time:130902ms step_avg:154.37ms step:859/1480 train_time:131065ms step_avg:154.38ms step:860/1480 train_time:131225ms step_avg:154.38ms step:861/1480 train_time:131392ms step_avg:154.40ms step:862/1480 train_time:131559ms step_avg:154.41ms step:863/1480 train_time:131726ms step_avg:154.43ms step:864/1480 train_time:131890ms step_avg:154.44ms step:865/1480 train_time:132052ms step_avg:154.45ms step:866/1480 train_time:132219ms step_avg:154.46ms step:867/1480 train_time:132382ms step_avg:154.47ms step:868/1480 train_time:132542ms step_avg:154.48ms step:869/1480 train_time:132703ms step_avg:154.49ms step:870/1480 train_time:132867ms step_avg:154.50ms step:871/1480 train_time:133032ms step_avg:154.51ms step:872/1480 train_time:133197ms step_avg:154.52ms step:873/1480 train_time:133360ms step_avg:154.53ms step:874/1480 train_time:133525ms step_avg:154.54ms step:875/1480 train_time:133691ms step_avg:154.56ms step:875/1480 val_loss:3.5033 train_time:133757ms step_avg:154.63ms step:876/1480 train_time:133857ms step_avg:154.57ms step:877/1480 train_time:134021ms step_avg:154.58ms step:878/1480 train_time:134183ms step_avg:154.59ms step:879/1480 train_time:134346ms step_avg:154.60ms step:880/1480 train_time:134508ms step_avg:154.61ms step:881/1480 train_time:134671ms step_avg:154.62ms step:882/1480 train_time:134838ms step_avg:154.63ms step:883/1480 train_time:135002ms step_avg:154.64ms step:884/1480 train_time:135170ms step_avg:154.66ms step:885/1480 train_time:135336ms step_avg:154.67ms step:886/1480 train_time:135501ms step_avg:154.68ms step:887/1480 train_time:135669ms step_avg:154.70ms step:888/1480 train_time:135842ms step_avg:154.72ms step:889/1480 train_time:136011ms step_avg:154.73ms step:890/1480 train_time:136174ms step_avg:154.74ms step:891/1480 train_time:136339ms step_avg:154.75ms step:892/1480 train_time:136502ms step_avg:154.76ms step:893/1480 train_time:136665ms step_avg:154.77ms step:894/1480 train_time:136832ms step_avg:154.79ms step:895/1480 train_time:136997ms step_avg:154.80ms step:896/1480 train_time:137162ms step_avg:154.81ms step:897/1480 train_time:137329ms step_avg:154.82ms step:898/1480 train_time:137497ms step_avg:154.84ms step:899/1480 train_time:137660ms step_avg:154.85ms step:900/1480 train_time:137823ms step_avg:154.86ms step:901/1480 train_time:137988ms step_avg:154.87ms step:902/1480 train_time:138153ms step_avg:154.88ms step:903/1480 train_time:138325ms step_avg:154.90ms step:904/1480 train_time:138489ms step_avg:154.91ms step:905/1480 train_time:138653ms step_avg:154.92ms step:906/1480 train_time:138820ms step_avg:154.93ms step:907/1480 train_time:138991ms step_avg:154.95ms step:908/1480 train_time:139155ms step_avg:154.96ms step:909/1480 train_time:139320ms step_avg:154.97ms step:910/1480 train_time:139492ms step_avg:154.99ms step:911/1480 train_time:139657ms step_avg:155.00ms step:912/1480 train_time:139825ms step_avg:155.02ms step:913/1480 train_time:139993ms step_avg:155.03ms step:914/1480 train_time:140160ms step_avg:155.04ms step:915/1480 train_time:140331ms step_avg:155.06ms step:916/1480 train_time:140495ms step_avg:155.07ms step:917/1480 train_time:140658ms step_avg:155.08ms step:918/1480 train_time:140827ms step_avg:155.10ms step:919/1480 train_time:140997ms step_avg:155.11ms step:920/1480 train_time:141161ms step_avg:155.12ms step:921/1480 train_time:141325ms step_avg:155.13ms step:922/1480 train_time:141493ms step_avg:155.15ms step:923/1480 train_time:141655ms step_avg:155.15ms step:924/1480 train_time:141819ms step_avg:155.16ms step:925/1480 train_time:141988ms step_avg:155.18ms step:926/1480 train_time:142153ms step_avg:155.19ms step:927/1480 train_time:142317ms step_avg:155.20ms step:928/1480 train_time:142481ms step_avg:155.21ms step:929/1480 train_time:142645ms step_avg:155.22ms step:930/1480 train_time:142810ms step_avg:155.23ms step:931/1480 train_time:142974ms step_avg:155.24ms step:932/1480 train_time:143141ms step_avg:155.25ms step:933/1480 train_time:143308ms step_avg:155.26ms step:934/1480 train_time:143475ms step_avg:155.28ms step:935/1480 train_time:143643ms step_avg:155.29ms step:936/1480 train_time:143810ms step_avg:155.30ms step:937/1480 train_time:143980ms step_avg:155.32ms step:938/1480 train_time:144141ms step_avg:155.32ms step:939/1480 train_time:144309ms step_avg:155.34ms step:940/1480 train_time:144475ms step_avg:155.35ms step:941/1480 train_time:144638ms step_avg:155.36ms step:942/1480 train_time:144803ms step_avg:155.37ms step:943/1480 train_time:144974ms step_avg:155.38ms step:944/1480 train_time:145147ms step_avg:155.40ms step:945/1480 train_time:145311ms step_avg:155.41ms step:946/1480 train_time:145480ms step_avg:155.43ms step:947/1480 train_time:145648ms step_avg:155.44ms step:948/1480 train_time:145814ms step_avg:155.45ms step:949/1480 train_time:145978ms step_avg:155.46ms step:950/1480 train_time:146142ms step_avg:155.47ms step:951/1480 train_time:146312ms step_avg:155.49ms step:952/1480 train_time:146477ms step_avg:155.50ms step:953/1480 train_time:146645ms step_avg:155.51ms step:954/1480 train_time:146814ms step_avg:155.52ms step:955/1480 train_time:146977ms step_avg:155.53ms step:956/1480 train_time:147140ms step_avg:155.54ms step:957/1480 train_time:147308ms step_avg:155.55ms step:958/1480 train_time:147477ms step_avg:155.57ms step:959/1480 train_time:147642ms step_avg:155.58ms step:960/1480 train_time:147808ms step_avg:155.59ms step:961/1480 train_time:147974ms step_avg:155.60ms step:962/1480 train_time:148137ms step_avg:155.61ms step:963/1480 train_time:148302ms step_avg:155.62ms step:964/1480 train_time:148472ms step_avg:155.63ms step:965/1480 train_time:148636ms step_avg:155.64ms step:966/1480 train_time:148801ms step_avg:155.65ms step:967/1480 train_time:148965ms step_avg:155.66ms step:968/1480 train_time:149131ms step_avg:155.67ms step:969/1480 train_time:149296ms step_avg:155.68ms step:970/1480 train_time:149459ms step_avg:155.69ms step:971/1480 train_time:149624ms step_avg:155.70ms step:972/1480 train_time:149791ms step_avg:155.71ms step:973/1480 train_time:149956ms step_avg:155.72ms step:974/1480 train_time:150124ms step_avg:155.73ms step:975/1480 train_time:150291ms step_avg:155.74ms step:976/1480 train_time:150457ms step_avg:155.75ms step:977/1480 train_time:150620ms step_avg:155.76ms step:978/1480 train_time:150785ms step_avg:155.77ms step:979/1480 train_time:150951ms step_avg:155.78ms step:980/1480 train_time:151117ms step_avg:155.79ms step:981/1480 train_time:151287ms step_avg:155.81ms step:982/1480 train_time:151451ms step_avg:155.81ms step:983/1480 train_time:151616ms step_avg:155.82ms step:984/1480 train_time:151780ms step_avg:155.83ms step:985/1480 train_time:151949ms step_avg:155.84ms step:986/1480 train_time:152114ms step_avg:155.85ms step:987/1480 train_time:152278ms step_avg:155.86ms step:988/1480 train_time:152445ms step_avg:155.87ms step:989/1480 train_time:152611ms step_avg:155.88ms step:990/1480 train_time:152781ms step_avg:155.90ms step:991/1480 train_time:152949ms step_avg:155.91ms step:992/1480 train_time:153123ms step_avg:155.93ms step:993/1480 train_time:153299ms step_avg:155.95ms step:994/1480 train_time:153463ms step_avg:155.96ms step:995/1480 train_time:153627ms step_avg:155.97ms step:996/1480 train_time:153792ms step_avg:155.98ms step:997/1480 train_time:153957ms step_avg:155.98ms step:998/1480 train_time:154119ms step_avg:155.99ms step:999/1480 train_time:154285ms step_avg:156.00ms step:1000/1480 train_time:154454ms step_avg:156.01ms step:1000/1480 val_loss:3.4404 train_time:154521ms step_avg:156.08ms step:1001/1480 train_time:154623ms step_avg:156.03ms step:1002/1480 train_time:154789ms step_avg:156.04ms step:1003/1480 train_time:154959ms step_avg:156.05ms step:1004/1480 train_time:155128ms step_avg:156.06ms step:1005/1480 train_time:155296ms step_avg:156.08ms step:1006/1480 train_time:155464ms step_avg:156.09ms step:1007/1480 train_time:155630ms step_avg:156.10ms step:1008/1480 train_time:155797ms step_avg:156.11ms step:1009/1480 train_time:155971ms step_avg:156.13ms step:1010/1480 train_time:156136ms step_avg:156.14ms step:1011/1480 train_time:156300ms step_avg:156.14ms step:1012/1480 train_time:156464ms step_avg:156.15ms step:1013/1480 train_time:156635ms step_avg:156.17ms step:1014/1480 train_time:156801ms step_avg:156.18ms step:1015/1480 train_time:156973ms step_avg:156.19ms step:1016/1480 train_time:157141ms step_avg:156.20ms step:1017/1480 train_time:157314ms step_avg:156.22ms step:1018/1480 train_time:157482ms step_avg:156.23ms step:1019/1480 train_time:157651ms step_avg:156.24ms step:1020/1480 train_time:157821ms step_avg:156.26ms step:1021/1480 train_time:157987ms step_avg:156.27ms step:1022/1480 train_time:158155ms step_avg:156.28ms step:1023/1480 train_time:158323ms step_avg:156.29ms step:1024/1480 train_time:158489ms step_avg:156.30ms step:1025/1480 train_time:158660ms step_avg:156.32ms step:1026/1480 train_time:158826ms step_avg:156.32ms step:1027/1480 train_time:158993ms step_avg:156.33ms step:1028/1480 train_time:159166ms step_avg:156.35ms step:1029/1480 train_time:159340ms step_avg:156.37ms step:1030/1480 train_time:159508ms step_avg:156.38ms step:1031/1480 train_time:159673ms step_avg:156.39ms step:1032/1480 train_time:159845ms step_avg:156.40ms step:1033/1480 train_time:160011ms step_avg:156.41ms step:1034/1480 train_time:160178ms step_avg:156.42ms step:1035/1480 train_time:160344ms step_avg:156.43ms step:1036/1480 train_time:160509ms step_avg:156.44ms step:1037/1480 train_time:160675ms step_avg:156.45ms step:1038/1480 train_time:160842ms step_avg:156.46ms step:1039/1480 train_time:161014ms step_avg:156.48ms step:1040/1480 train_time:161179ms step_avg:156.48ms step:1041/1480 train_time:161347ms step_avg:156.50ms step:1042/1480 train_time:161512ms step_avg:156.50ms step:1043/1480 train_time:161676ms step_avg:156.51ms step:1044/1480 train_time:161842ms step_avg:156.52ms step:1045/1480 train_time:162013ms step_avg:156.53ms step:1046/1480 train_time:162180ms step_avg:156.54ms step:1047/1480 train_time:162346ms step_avg:156.55ms step:1048/1480 train_time:162514ms step_avg:156.56ms step:1049/1480 train_time:162680ms step_avg:156.57ms step:1050/1480 train_time:162849ms step_avg:156.59ms step:1051/1480 train_time:163019ms step_avg:156.60ms step:1052/1480 train_time:163187ms step_avg:156.61ms step:1053/1480 train_time:163354ms step_avg:156.62ms step:1054/1480 train_time:163522ms step_avg:156.63ms step:1055/1480 train_time:163688ms step_avg:156.64ms step:1056/1480 train_time:163853ms step_avg:156.65ms step:1057/1480 train_time:164018ms step_avg:156.66ms step:1058/1480 train_time:164189ms step_avg:156.67ms step:1059/1480 train_time:164362ms step_avg:156.68ms step:1060/1480 train_time:164532ms step_avg:156.70ms step:1061/1480 train_time:164696ms step_avg:156.70ms step:1062/1480 train_time:164861ms step_avg:156.71ms step:1063/1480 train_time:165027ms step_avg:156.72ms step:1064/1480 train_time:165192ms step_avg:156.73ms step:1065/1480 train_time:165359ms step_avg:156.74ms step:1066/1480 train_time:165529ms step_avg:156.75ms step:1067/1480 train_time:165697ms step_avg:156.76ms step:1068/1480 train_time:165862ms step_avg:156.77ms step:1069/1480 train_time:166034ms step_avg:156.78ms step:1070/1480 train_time:166200ms step_avg:156.79ms step:1071/1480 train_time:166373ms step_avg:156.81ms step:1072/1480 train_time:166539ms step_avg:156.82ms step:1073/1480 train_time:166702ms step_avg:156.82ms step:1074/1480 train_time:166869ms step_avg:156.83ms step:1075/1480 train_time:167039ms step_avg:156.84ms step:1076/1480 train_time:167204ms step_avg:156.85ms step:1077/1480 train_time:167370ms step_avg:156.86ms step:1078/1480 train_time:167545ms step_avg:156.88ms step:1079/1480 train_time:167717ms step_avg:156.89ms step:1080/1480 train_time:167886ms step_avg:156.90ms step:1081/1480 train_time:168054ms step_avg:156.91ms step:1082/1480 train_time:168220ms step_avg:156.92ms step:1083/1480 train_time:168385ms step_avg:156.93ms step:1084/1480 train_time:168551ms step_avg:156.94ms step:1085/1480 train_time:168720ms step_avg:156.95ms step:1086/1480 train_time:168885ms step_avg:156.96ms step:1087/1480 train_time:169052ms step_avg:156.97ms step:1088/1480 train_time:169222ms step_avg:156.98ms step:1089/1480 train_time:169395ms step_avg:156.99ms step:1090/1480 train_time:169566ms step_avg:157.01ms step:1091/1480 train_time:169735ms step_avg:157.02ms step:1092/1480 train_time:169902ms step_avg:157.03ms step:1093/1480 train_time:170071ms step_avg:157.04ms step:1094/1480 train_time:170238ms step_avg:157.05ms step:1095/1480 train_time:170402ms step_avg:157.05ms step:1096/1480 train_time:170572ms step_avg:157.06ms step:1097/1480 train_time:170740ms step_avg:157.07ms step:1098/1480 train_time:170914ms step_avg:157.09ms step:1099/1480 train_time:171085ms step_avg:157.10ms step:1100/1480 train_time:171257ms step_avg:157.12ms step:1101/1480 train_time:171429ms step_avg:157.13ms step:1102/1480 train_time:171601ms step_avg:157.14ms step:1103/1480 train_time:171777ms step_avg:157.16ms step:1104/1480 train_time:171944ms step_avg:157.17ms step:1105/1480 train_time:172115ms step_avg:157.18ms step:1106/1480 train_time:172283ms step_avg:157.19ms step:1107/1480 train_time:172451ms step_avg:157.20ms step:1108/1480 train_time:172617ms step_avg:157.21ms step:1109/1480 train_time:172783ms step_avg:157.22ms step:1110/1480 train_time:172949ms step_avg:157.23ms step:1111/1480 train_time:173117ms step_avg:157.24ms step:1112/1480 train_time:173288ms step_avg:157.25ms step:1113/1480 train_time:173466ms step_avg:157.27ms step:1114/1480 train_time:173639ms step_avg:157.28ms step:1115/1480 train_time:173811ms step_avg:157.30ms step:1116/1480 train_time:173978ms step_avg:157.30ms step:1117/1480 train_time:174151ms step_avg:157.32ms step:1118/1480 train_time:174328ms step_avg:157.34ms step:1119/1480 train_time:174495ms step_avg:157.34ms step:1120/1480 train_time:174663ms step_avg:157.35ms step:1121/1480 train_time:174834ms step_avg:157.37ms step:1122/1480 train_time:175000ms step_avg:157.37ms step:1123/1480 train_time:175167ms step_avg:157.38ms step:1124/1480 train_time:175335ms step_avg:157.39ms step:1125/1480 train_time:175502ms step_avg:157.40ms step:1125/1480 val_loss:3.3845 train_time:175570ms step_avg:157.46ms step:1126/1480 train_time:175672ms step_avg:157.41ms step:1127/1480 train_time:175842ms step_avg:157.42ms step:1128/1480 train_time:176012ms step_avg:157.43ms step:1129/1480 train_time:176185ms step_avg:157.45ms step:1130/1480 train_time:176354ms step_avg:157.46ms step:1131/1480 train_time:176533ms step_avg:157.48ms step:1132/1480 train_time:176697ms step_avg:157.48ms step:1133/1480 train_time:176871ms step_avg:157.50ms step:1134/1480 train_time:177042ms step_avg:157.51ms step:1135/1480 train_time:177209ms step_avg:157.52ms step:1136/1480 train_time:177381ms step_avg:157.53ms step:1137/1480 train_time:177550ms step_avg:157.54ms step:1138/1480 train_time:177721ms step_avg:157.55ms step:1139/1480 train_time:177890ms step_avg:157.56ms step:1140/1480 train_time:178058ms step_avg:157.57ms step:1141/1480 train_time:178230ms step_avg:157.59ms step:1142/1480 train_time:178397ms step_avg:157.59ms step:1143/1480 train_time:178568ms step_avg:157.61ms step:1144/1480 train_time:178736ms step_avg:157.62ms step:1145/1480 train_time:178900ms step_avg:157.62ms step:1146/1480 train_time:179071ms step_avg:157.63ms step:1147/1480 train_time:179238ms step_avg:157.64ms step:1148/1480 train_time:179406ms step_avg:157.65ms step:1149/1480 train_time:179576ms step_avg:157.66ms step:1150/1480 train_time:179745ms step_avg:157.67ms step:1151/1480 train_time:179915ms step_avg:157.68ms step:1152/1480 train_time:180090ms step_avg:157.70ms step:1153/1480 train_time:180263ms step_avg:157.71ms step:1154/1480 train_time:180430ms step_avg:157.72ms step:1155/1480 train_time:180602ms step_avg:157.73ms step:1156/1480 train_time:180782ms step_avg:157.75ms step:1157/1480 train_time:180953ms step_avg:157.76ms step:1158/1480 train_time:181119ms step_avg:157.77ms step:1159/1480 train_time:181286ms step_avg:157.78ms step:1160/1480 train_time:181451ms step_avg:157.78ms step:1161/1480 train_time:181621ms step_avg:157.79ms step:1162/1480 train_time:181790ms step_avg:157.80ms step:1163/1480 train_time:181959ms step_avg:157.81ms step:1164/1480 train_time:182129ms step_avg:157.82ms step:1165/1480 train_time:182294ms step_avg:157.83ms step:1166/1480 train_time:182466ms step_avg:157.84ms step:1167/1480 train_time:182633ms step_avg:157.85ms step:1168/1480 train_time:182802ms step_avg:157.86ms step:1169/1480 train_time:182971ms step_avg:157.87ms step:1170/1480 train_time:183139ms step_avg:157.88ms step:1171/1480 train_time:183307ms step_avg:157.89ms step:1172/1480 train_time:183473ms step_avg:157.89ms step:1173/1480 train_time:183646ms step_avg:157.91ms step:1174/1480 train_time:183827ms step_avg:157.93ms step:1175/1480 train_time:183998ms step_avg:157.94ms step:1176/1480 train_time:184170ms step_avg:157.95ms step:1177/1480 train_time:184347ms step_avg:157.97ms step:1178/1480 train_time:184513ms step_avg:157.97ms step:1179/1480 train_time:184678ms step_avg:157.98ms step:1180/1480 train_time:184858ms step_avg:158.00ms step:1181/1480 train_time:185029ms step_avg:158.01ms step:1182/1480 train_time:185195ms step_avg:158.02ms step:1183/1480 train_time:185366ms step_avg:158.03ms step:1184/1480 train_time:185533ms step_avg:158.03ms step:1185/1480 train_time:185707ms step_avg:158.05ms step:1186/1480 train_time:185877ms step_avg:158.06ms step:1187/1480 train_time:186061ms step_avg:158.08ms step:1188/1480 train_time:186227ms step_avg:158.09ms step:1189/1480 train_time:186396ms step_avg:158.10ms step:1190/1480 train_time:186564ms step_avg:158.11ms step:1191/1480 train_time:186735ms step_avg:158.12ms step:1192/1480 train_time:186902ms step_avg:158.12ms step:1193/1480 train_time:187070ms step_avg:158.13ms step:1194/1480 train_time:187238ms step_avg:158.14ms step:1195/1480 train_time:187411ms step_avg:158.15ms step:1196/1480 train_time:187595ms step_avg:158.17ms step:1197/1480 train_time:187769ms step_avg:158.19ms step:1198/1480 train_time:187948ms step_avg:158.21ms step:1199/1480 train_time:188119ms step_avg:158.22ms step:1200/1480 train_time:188289ms step_avg:158.23ms step:1201/1480 train_time:188457ms step_avg:158.23ms step:1202/1480 train_time:188638ms step_avg:158.25ms step:1203/1480 train_time:188814ms step_avg:158.27ms step:1204/1480 train_time:188989ms step_avg:158.28ms step:1205/1480 train_time:189156ms step_avg:158.29ms step:1206/1480 train_time:189324ms step_avg:158.30ms step:1207/1480 train_time:189493ms step_avg:158.31ms step:1208/1480 train_time:189660ms step_avg:158.31ms step:1209/1480 train_time:189834ms step_avg:158.33ms step:1210/1480 train_time:190009ms step_avg:158.34ms step:1211/1480 train_time:190183ms step_avg:158.35ms step:1212/1480 train_time:190356ms step_avg:158.37ms step:1213/1480 train_time:190530ms step_avg:158.38ms step:1214/1480 train_time:190707ms step_avg:158.39ms step:1215/1480 train_time:190880ms step_avg:158.41ms step:1216/1480 train_time:191051ms step_avg:158.42ms step:1217/1480 train_time:191224ms step_avg:158.43ms step:1218/1480 train_time:191394ms step_avg:158.44ms step:1219/1480 train_time:191573ms step_avg:158.46ms step:1220/1480 train_time:191743ms step_avg:158.47ms step:1221/1480 train_time:191910ms step_avg:158.47ms step:1222/1480 train_time:192077ms step_avg:158.48ms step:1223/1480 train_time:192249ms step_avg:158.49ms step:1224/1480 train_time:192427ms step_avg:158.51ms step:1225/1480 train_time:192599ms step_avg:158.52ms step:1226/1480 train_time:192772ms step_avg:158.53ms step:1227/1480 train_time:192944ms step_avg:158.54ms step:1228/1480 train_time:193114ms step_avg:158.55ms step:1229/1480 train_time:193286ms step_avg:158.56ms step:1230/1480 train_time:193465ms step_avg:158.58ms step:1231/1480 train_time:193641ms step_avg:158.59ms step:1232/1480 train_time:193816ms step_avg:158.61ms step:1233/1480 train_time:193985ms step_avg:158.61ms step:1234/1480 train_time:194155ms step_avg:158.62ms step:1235/1480 train_time:194330ms step_avg:158.64ms step:1236/1480 train_time:194497ms step_avg:158.64ms step:1237/1480 train_time:194670ms step_avg:158.66ms step:1238/1480 train_time:194855ms step_avg:158.68ms step:1239/1480 train_time:195027ms step_avg:158.69ms step:1240/1480 train_time:195197ms step_avg:158.70ms step:1241/1480 train_time:195370ms step_avg:158.71ms step:1242/1480 train_time:195538ms step_avg:158.72ms step:1243/1480 train_time:195713ms step_avg:158.73ms step:1244/1480 train_time:195879ms step_avg:158.74ms step:1245/1480 train_time:196049ms step_avg:158.74ms step:1246/1480 train_time:196217ms step_avg:158.75ms step:1247/1480 train_time:196386ms step_avg:158.76ms step:1248/1480 train_time:196556ms step_avg:158.77ms step:1249/1480 train_time:196726ms step_avg:158.78ms step:1250/1480 train_time:196894ms step_avg:158.79ms step:1250/1480 val_loss:3.3346 train_time:196967ms step_avg:158.84ms step:1251/1480 train_time:197078ms step_avg:158.81ms step:1252/1480 train_time:197248ms step_avg:158.82ms step:1253/1480 train_time:197417ms step_avg:158.82ms step:1254/1480 train_time:197588ms step_avg:158.83ms step:1255/1480 train_time:197776ms step_avg:158.86ms step:1256/1480 train_time:197951ms step_avg:158.87ms step:1257/1480 train_time:198121ms step_avg:158.88ms step:1258/1480 train_time:198297ms step_avg:158.89ms step:1259/1480 train_time:198469ms step_avg:158.90ms step:1260/1480 train_time:198637ms step_avg:158.91ms step:1261/1480 train_time:198808ms step_avg:158.92ms step:1262/1480 train_time:198984ms step_avg:158.93ms step:1263/1480 train_time:199158ms step_avg:158.95ms step:1264/1480 train_time:199324ms step_avg:158.95ms step:1265/1480 train_time:199492ms step_avg:158.96ms step:1266/1480 train_time:199662ms step_avg:158.97ms step:1267/1480 train_time:199834ms step_avg:158.98ms step:1268/1480 train_time:200004ms step_avg:158.99ms step:1269/1480 train_time:200180ms step_avg:159.00ms step:1270/1480 train_time:200350ms step_avg:159.01ms step:1271/1480 train_time:200520ms step_avg:159.02ms step:1272/1480 train_time:200686ms step_avg:159.02ms step:1273/1480 train_time:200857ms step_avg:159.03ms step:1274/1480 train_time:201030ms step_avg:159.04ms step:1275/1480 train_time:201198ms step_avg:159.05ms step:1276/1480 train_time:201363ms step_avg:159.05ms step:1277/1480 train_time:201535ms step_avg:159.06ms step:1278/1480 train_time:201702ms step_avg:159.07ms step:1279/1480 train_time:201876ms step_avg:159.08ms step:1280/1480 train_time:202054ms step_avg:159.10ms step:1281/1480 train_time:202223ms step_avg:159.11ms step:1282/1480 train_time:202388ms step_avg:159.11ms step:1283/1480 train_time:202559ms step_avg:159.12ms step:1284/1480 train_time:202730ms step_avg:159.13ms step:1285/1480 train_time:202900ms step_avg:159.14ms step:1286/1480 train_time:203069ms step_avg:159.15ms step:1287/1480 train_time:203240ms step_avg:159.15ms step:1288/1480 train_time:203413ms step_avg:159.16ms step:1289/1480 train_time:203598ms step_avg:159.19ms step:1290/1480 train_time:203778ms step_avg:159.20ms step:1291/1480 train_time:203952ms step_avg:159.21ms step:1292/1480 train_time:204125ms step_avg:159.22ms step:1293/1480 train_time:204302ms step_avg:159.24ms step:1294/1480 train_time:204474ms step_avg:159.25ms step:1295/1480 train_time:204645ms step_avg:159.26ms step:1296/1480 train_time:204818ms step_avg:159.27ms step:1297/1480 train_time:204990ms step_avg:159.28ms step:1298/1480 train_time:205159ms step_avg:159.29ms step:1299/1480 train_time:205330ms step_avg:159.29ms step:1300/1480 train_time:205498ms step_avg:159.30ms step:1301/1480 train_time:205666ms step_avg:159.31ms step:1302/1480 train_time:205841ms step_avg:159.32ms step:1303/1480 train_time:206019ms step_avg:159.33ms step:1304/1480 train_time:206193ms step_avg:159.35ms step:1305/1480 train_time:206362ms step_avg:159.35ms step:1306/1480 train_time:206536ms step_avg:159.36ms step:1307/1480 train_time:206704ms step_avg:159.37ms step:1308/1480 train_time:206874ms step_avg:159.38ms step:1309/1480 train_time:207045ms step_avg:159.39ms step:1310/1480 train_time:207215ms step_avg:159.40ms step:1311/1480 train_time:207383ms step_avg:159.40ms step:1312/1480 train_time:207557ms step_avg:159.41ms step:1313/1480 train_time:207724ms step_avg:159.42ms step:1314/1480 train_time:207898ms step_avg:159.43ms step:1315/1480 train_time:208068ms step_avg:159.44ms step:1316/1480 train_time:208235ms step_avg:159.45ms step:1317/1480 train_time:208406ms step_avg:159.45ms step:1318/1480 train_time:208587ms step_avg:159.47ms step:1319/1480 train_time:208763ms step_avg:159.48ms step:1320/1480 train_time:208940ms step_avg:159.50ms step:1321/1480 train_time:209112ms step_avg:159.51ms step:1322/1480 train_time:209294ms step_avg:159.52ms step:1323/1480 train_time:209465ms step_avg:159.53ms step:1324/1480 train_time:209639ms step_avg:159.54ms step:1325/1480 train_time:209821ms step_avg:159.56ms step:1326/1480 train_time:209998ms step_avg:159.57ms step:1327/1480 train_time:210167ms step_avg:159.58ms step:1328/1480 train_time:210338ms step_avg:159.59ms step:1329/1480 train_time:210533ms step_avg:159.62ms step:1330/1480 train_time:210712ms step_avg:159.63ms step:1331/1480 train_time:210882ms step_avg:159.64ms step:1332/1480 train_time:211057ms step_avg:159.65ms step:1333/1480 train_time:211233ms step_avg:159.66ms step:1334/1480 train_time:211403ms step_avg:159.67ms step:1335/1480 train_time:211572ms step_avg:159.68ms step:1336/1480 train_time:211756ms step_avg:159.69ms step:1337/1480 train_time:211930ms step_avg:159.71ms step:1338/1480 train_time:212102ms step_avg:159.72ms step:1339/1480 train_time:212277ms step_avg:159.73ms step:1340/1480 train_time:212448ms step_avg:159.74ms step:1341/1480 train_time:212616ms step_avg:159.74ms step:1342/1480 train_time:212790ms step_avg:159.75ms step:1343/1480 train_time:212960ms step_avg:159.76ms step:1344/1480 train_time:213133ms step_avg:159.77ms step:1345/1480 train_time:213311ms step_avg:159.78ms step:1346/1480 train_time:213480ms step_avg:159.79ms step:1347/1480 train_time:213650ms step_avg:159.80ms step:1348/1480 train_time:213820ms step_avg:159.81ms step:1349/1480 train_time:213990ms step_avg:159.81ms step:1350/1480 train_time:214163ms step_avg:159.82ms step:1351/1480 train_time:214335ms step_avg:159.83ms step:1352/1480 train_time:214505ms step_avg:159.84ms step:1353/1480 train_time:214681ms step_avg:159.85ms step:1354/1480 train_time:214853ms step_avg:159.86ms step:1355/1480 train_time:215020ms step_avg:159.87ms step:1356/1480 train_time:215195ms step_avg:159.88ms step:1357/1480 train_time:215367ms step_avg:159.89ms step:1358/1480 train_time:215538ms step_avg:159.89ms step:1359/1480 train_time:215710ms step_avg:159.90ms step:1360/1480 train_time:215885ms step_avg:159.92ms step:1361/1480 train_time:216063ms step_avg:159.93ms step:1362/1480 train_time:216238ms step_avg:159.94ms step:1363/1480 train_time:216418ms step_avg:159.95ms step:1364/1480 train_time:216585ms step_avg:159.96ms step:1365/1480 train_time:216753ms step_avg:159.97ms step:1366/1480 train_time:216924ms step_avg:159.97ms step:1367/1480 train_time:217095ms step_avg:159.98ms step:1368/1480 train_time:217271ms step_avg:159.99ms step:1369/1480 train_time:217451ms step_avg:160.01ms step:1370/1480 train_time:217629ms step_avg:160.02ms step:1371/1480 train_time:217799ms step_avg:160.03ms step:1372/1480 train_time:217976ms step_avg:160.04ms step:1373/1480 train_time:218145ms step_avg:160.05ms step:1374/1480 train_time:218321ms step_avg:160.06ms step:1375/1480 train_time:218493ms step_avg:160.07ms step:1375/1480 val_loss:3.2965 train_time:218560ms step_avg:160.12ms step:1376/1480 train_time:218665ms step_avg:160.08ms step:1377/1480 train_time:218838ms step_avg:160.09ms step:1378/1480 train_time:219007ms step_avg:160.09ms step:1379/1480 train_time:219182ms step_avg:160.10ms step:1380/1480 train_time:219356ms step_avg:160.11ms step:1381/1480 train_time:219537ms step_avg:160.13ms step:1382/1480 train_time:219708ms step_avg:160.14ms step:1383/1480 train_time:219879ms step_avg:160.15ms step:1384/1480 train_time:220056ms step_avg:160.16ms step:1385/1480 train_time:220221ms step_avg:160.16ms step:1386/1480 train_time:220391ms step_avg:160.17ms step:1387/1480 train_time:220562ms step_avg:160.18ms step:1388/1480 train_time:220729ms step_avg:160.18ms step:1389/1480 train_time:220902ms step_avg:160.19ms step:1390/1480 train_time:221070ms step_avg:160.20ms step:1391/1480 train_time:221240ms step_avg:160.20ms step:1392/1480 train_time:221414ms step_avg:160.21ms step:1393/1480 train_time:221584ms step_avg:160.22ms step:1394/1480 train_time:221755ms step_avg:160.23ms step:1395/1480 train_time:221923ms step_avg:160.23ms step:1396/1480 train_time:222092ms step_avg:160.24ms step:1397/1480 train_time:222260ms step_avg:160.24ms step:1398/1480 train_time:222426ms step_avg:160.25ms step:1399/1480 train_time:222596ms step_avg:160.26ms step:1400/1480 train_time:222774ms step_avg:160.27ms step:1401/1480 train_time:222940ms step_avg:160.27ms step:1402/1480 train_time:223112ms step_avg:160.28ms step:1403/1480 train_time:223289ms step_avg:160.29ms step:1404/1480 train_time:223459ms step_avg:160.30ms step:1405/1480 train_time:223635ms step_avg:160.31ms step:1406/1480 train_time:223811ms step_avg:160.32ms step:1407/1480 train_time:223979ms step_avg:160.33ms step:1408/1480 train_time:224147ms step_avg:160.33ms step:1409/1480 train_time:224330ms step_avg:160.35ms step:1410/1480 train_time:224499ms step_avg:160.36ms step:1411/1480 train_time:224667ms step_avg:160.36ms step:1412/1480 train_time:224838ms step_avg:160.37ms step:1413/1480 train_time:225009ms step_avg:160.38ms step:1414/1480 train_time:225180ms step_avg:160.38ms step:1415/1480 train_time:225355ms step_avg:160.39ms step:1416/1480 train_time:225541ms step_avg:160.41ms step:1417/1480 train_time:225716ms step_avg:160.42ms step:1418/1480 train_time:225888ms step_avg:160.43ms step:1419/1480 train_time:226061ms step_avg:160.44ms step:1420/1480 train_time:226236ms step_avg:160.45ms step:1421/1480 train_time:226411ms step_avg:160.46ms step:1422/1480 train_time:226581ms step_avg:160.47ms step:1423/1480 train_time:226750ms step_avg:160.47ms step:1424/1480 train_time:226927ms step_avg:160.49ms step:1425/1480 train_time:227106ms step_avg:160.50ms step:1426/1480 train_time:227278ms step_avg:160.51ms step:1427/1480 train_time:227454ms step_avg:160.52ms step:1428/1480 train_time:227624ms step_avg:160.52ms step:1429/1480 train_time:227792ms step_avg:160.53ms step:1430/1480 train_time:227964ms step_avg:160.54ms step:1431/1480 train_time:228141ms step_avg:160.55ms step:1432/1480 train_time:228319ms step_avg:160.56ms step:1433/1480 train_time:228498ms step_avg:160.57ms step:1434/1480 train_time:228679ms step_avg:160.59ms step:1435/1480 train_time:228854ms step_avg:160.60ms step:1436/1480 train_time:229025ms step_avg:160.61ms step:1437/1480 train_time:229197ms step_avg:160.61ms step:1438/1480 train_time:229366ms step_avg:160.62ms step:1439/1480 train_time:229541ms step_avg:160.63ms step:1440/1480 train_time:229712ms step_avg:160.64ms step:1441/1480 train_time:229883ms step_avg:160.65ms step:1442/1480 train_time:230060ms step_avg:160.66ms step:1443/1480 train_time:230249ms step_avg:160.68ms step:1444/1480 train_time:230419ms step_avg:160.68ms step:1445/1480 train_time:230590ms step_avg:160.69ms step:1446/1480 train_time:230765ms step_avg:160.70ms step:1447/1480 train_time:230942ms step_avg:160.71ms step:1448/1480 train_time:231115ms step_avg:160.72ms step:1449/1480 train_time:231288ms step_avg:160.73ms step:1450/1480 train_time:231461ms step_avg:160.74ms step:1451/1480 train_time:231633ms step_avg:160.74ms step:1452/1480 train_time:231807ms step_avg:160.75ms step:1453/1480 train_time:231978ms step_avg:160.76ms step:1454/1480 train_time:232149ms step_avg:160.77ms step:1455/1480 train_time:232329ms step_avg:160.78ms step:1456/1480 train_time:232502ms step_avg:160.79ms step:1457/1480 train_time:232674ms step_avg:160.80ms step:1458/1480 train_time:232845ms step_avg:160.80ms step:1459/1480 train_time:233022ms step_avg:160.82ms step:1460/1480 train_time:233194ms step_avg:160.82ms step:1461/1480 train_time:233368ms step_avg:160.83ms step:1462/1480 train_time:233539ms step_avg:160.84ms step:1463/1480 train_time:233717ms step_avg:160.85ms step:1464/1480 train_time:233892ms step_avg:160.86ms step:1465/1480 train_time:234064ms step_avg:160.87ms step:1466/1480 train_time:234235ms step_avg:160.88ms step:1467/1480 train_time:234410ms step_avg:160.89ms step:1468/1480 train_time:234580ms step_avg:160.89ms step:1469/1480 train_time:234752ms step_avg:160.90ms step:1470/1480 train_time:234933ms step_avg:160.91ms step:1471/1480 train_time:235120ms step_avg:160.93ms step:1472/1480 train_time:235301ms step_avg:160.94ms step:1473/1480 train_time:235473ms step_avg:160.95ms step:1474/1480 train_time:235650ms step_avg:160.96ms step:1475/1480 train_time:235830ms step_avg:160.98ms step:1476/1480 train_time:236002ms step_avg:160.98ms step:1477/1480 train_time:236183ms step_avg:161.00ms step:1478/1480 train_time:236365ms step_avg:161.01ms step:1479/1480 train_time:236539ms step_avg:161.02ms step:1480/1480 train_time:236712ms step_avg:161.03ms step:1480/1480 val_loss:3.2775 train_time:236783ms step_avg:161.08ms