import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable scale is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; cos/sin tables are cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as 4-D below, with the sequence on dim 1 and the rotated features on dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # rebuild the tables only when the sequence length changes
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention using QK-norm, rotary embeddings, a value residual and FlexAttention."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # blend this layer's values with the token value embedding vi
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: expand to 4 * dim, apply squared ReLU, project back to dim."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: learned mix of x and x0, then attention and MLP residual branches."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # weights of (x, x0) in the mix below; the initial values pass x through unchanged
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """GPT model with U-net style skip connections; `forward` returns the training loss."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss of predicting `target` from the 1-D token sequence `idx`.

        Attention is causal, restricted to keys in the same document as the query, and
        limited to keys fewer than `sliding_window` positions back. len(idx) must be a
        multiple of BLOCK_SIZE (required by the reshape below).
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token id 50256 (presumably GPT-2's end-of-text token), used as a document id
        docs = (idx == 50256).cumsum(0)
        # first and last document id within every block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)

        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the mask above: a kv block is kept when it is causal,
            # its document range overlaps the query block's, and it lies inside the window
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort moves the indices of the kept blocks to the front
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)

        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value embedding per encoder layer; the decoder reuses them in reverse order
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the header of `file` into a pinned tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header of 256 int32
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Serves each process its own T-token slice of the shards matching a glob pattern."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # wraps around to the first shard after the last one
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) for this process; targets are the inputs shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    # the per-run directory above is disabled, but the log file below still lives
    # under logs/, so make sure that directory exists before opening the file
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, print it too."""
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches the next one after each backward
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 09:10:49 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 131W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 123W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 40C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 99W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23212ms step_avg:nanms step:2/1480 train_time:23304ms step_avg:nanms step:3/1480 train_time:23444ms step_avg:nanms step:4/1480 train_time:23584ms step_avg:nanms step:5/1480 train_time:23726ms step_avg:nanms step:6/1480 train_time:23866ms step_avg:nanms step:7/1480 train_time:24007ms step_avg:nanms step:8/1480 train_time:24149ms step_avg:nanms step:9/1480 train_time:24296ms step_avg:nanms step:10/1480 train_time:24440ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.76ms step:14/1480 train_time:566ms step_avg:141.56ms step:15/1480 train_time:708ms step_avg:141.57ms step:16/1480 train_time:852ms step_avg:142.03ms step:17/1480 train_time:997ms step_avg:142.42ms step:18/1480 train_time:1141ms step_avg:142.58ms step:19/1480 train_time:1282ms step_avg:142.49ms step:20/1480 train_time:1423ms step_avg:142.31ms step:21/1480 train_time:1564ms step_avg:142.21ms step:22/1480 train_time:1706ms step_avg:142.15ms step:23/1480 train_time:1850ms step_avg:142.28ms step:24/1480 train_time:1994ms step_avg:142.42ms step:25/1480 train_time:2139ms step_avg:142.59ms step:26/1480 train_time:2283ms step_avg:142.69ms step:27/1480 train_time:2425ms step_avg:142.64ms step:28/1480 train_time:2566ms step_avg:142.53ms step:29/1480 
train_time:2706ms step_avg:142.45ms step:30/1480 train_time:2848ms step_avg:142.39ms step:31/1480 train_time:2990ms step_avg:142.39ms step:32/1480 train_time:3135ms step_avg:142.49ms step:33/1480 train_time:3278ms step_avg:142.52ms step:34/1480 train_time:3420ms step_avg:142.50ms step:35/1480 train_time:3563ms step_avg:142.52ms step:36/1480 train_time:3704ms step_avg:142.48ms step:37/1480 train_time:3846ms step_avg:142.45ms step:38/1480 train_time:3990ms step_avg:142.49ms step:39/1480 train_time:4133ms step_avg:142.51ms step:40/1480 train_time:4278ms step_avg:142.60ms step:41/1480 train_time:4422ms step_avg:142.64ms step:42/1480 train_time:4565ms step_avg:142.65ms step:43/1480 train_time:4706ms step_avg:142.61ms step:44/1480 train_time:4848ms step_avg:142.58ms step:45/1480 train_time:4992ms step_avg:142.62ms step:46/1480 train_time:5137ms step_avg:142.70ms step:47/1480 train_time:5282ms step_avg:142.77ms step:48/1480 train_time:5425ms step_avg:142.77ms step:49/1480 train_time:5567ms step_avg:142.74ms step:50/1480 train_time:5708ms step_avg:142.70ms step:51/1480 train_time:5850ms step_avg:142.67ms step:52/1480 train_time:5993ms step_avg:142.69ms step:53/1480 train_time:6135ms step_avg:142.67ms step:54/1480 train_time:6279ms step_avg:142.72ms step:55/1480 train_time:6422ms step_avg:142.72ms step:56/1480 train_time:6566ms step_avg:142.73ms step:57/1480 train_time:6707ms step_avg:142.71ms step:58/1480 train_time:6848ms step_avg:142.68ms step:59/1480 train_time:6990ms step_avg:142.66ms step:60/1480 train_time:7133ms step_avg:142.67ms step:61/1480 train_time:7278ms step_avg:142.70ms step:62/1480 train_time:7422ms step_avg:142.73ms step:63/1480 train_time:7565ms step_avg:142.73ms step:64/1480 train_time:7706ms step_avg:142.70ms step:65/1480 train_time:7847ms step_avg:142.68ms step:66/1480 train_time:7990ms step_avg:142.67ms step:67/1480 train_time:8132ms step_avg:142.66ms step:68/1480 train_time:8276ms step_avg:142.68ms step:69/1480 train_time:8419ms step_avg:142.69ms 
step:70/1480 train_time:8563ms step_avg:142.71ms step:71/1480 train_time:8705ms step_avg:142.71ms step:72/1480 train_time:8846ms step_avg:142.69ms step:73/1480 train_time:8989ms step_avg:142.69ms step:74/1480 train_time:9132ms step_avg:142.69ms step:75/1480 train_time:9276ms step_avg:142.71ms step:76/1480 train_time:9420ms step_avg:142.73ms step:77/1480 train_time:9564ms step_avg:142.75ms step:78/1480 train_time:9706ms step_avg:142.73ms step:79/1480 train_time:9848ms step_avg:142.73ms step:80/1480 train_time:9992ms step_avg:142.75ms step:81/1480 train_time:10137ms step_avg:142.77ms step:82/1480 train_time:10281ms step_avg:142.79ms step:83/1480 train_time:10424ms step_avg:142.79ms step:84/1480 train_time:10566ms step_avg:142.78ms step:85/1480 train_time:10707ms step_avg:142.77ms step:86/1480 train_time:10848ms step_avg:142.74ms step:87/1480 train_time:10990ms step_avg:142.72ms step:88/1480 train_time:11133ms step_avg:142.73ms step:89/1480 train_time:11279ms step_avg:142.77ms step:90/1480 train_time:11423ms step_avg:142.79ms step:91/1480 train_time:11565ms step_avg:142.78ms step:92/1480 train_time:11707ms step_avg:142.77ms step:93/1480 train_time:11849ms step_avg:142.76ms step:94/1480 train_time:11992ms step_avg:142.76ms step:95/1480 train_time:12133ms step_avg:142.74ms step:96/1480 train_time:12276ms step_avg:142.75ms step:97/1480 train_time:12419ms step_avg:142.74ms step:98/1480 train_time:12562ms step_avg:142.75ms step:99/1480 train_time:12703ms step_avg:142.74ms step:100/1480 train_time:12845ms step_avg:142.72ms step:101/1480 train_time:12987ms step_avg:142.72ms step:102/1480 train_time:13129ms step_avg:142.70ms step:103/1480 train_time:13270ms step_avg:142.69ms step:104/1480 train_time:13412ms step_avg:142.68ms step:105/1480 train_time:13556ms step_avg:142.69ms step:106/1480 train_time:13698ms step_avg:142.69ms step:107/1480 train_time:13841ms step_avg:142.69ms step:108/1480 train_time:13984ms step_avg:142.69ms step:109/1480 train_time:14126ms step_avg:142.69ms 
step:110/1480 train_time:14269ms step_avg:142.69ms step:111/1480 train_time:14413ms step_avg:142.70ms step:112/1480 train_time:14562ms step_avg:142.77ms step:113/1480 train_time:14709ms step_avg:142.81ms step:114/1480 train_time:14857ms step_avg:142.86ms step:115/1480 train_time:15005ms step_avg:142.91ms step:116/1480 train_time:15151ms step_avg:142.93ms step:117/1480 train_time:15299ms step_avg:142.98ms step:118/1480 train_time:15446ms step_avg:143.02ms step:119/1480 train_time:15593ms step_avg:143.05ms step:120/1480 train_time:15740ms step_avg:143.10ms step:121/1480 train_time:15888ms step_avg:143.13ms step:122/1480 train_time:16034ms step_avg:143.16ms step:123/1480 train_time:16184ms step_avg:143.22ms step:124/1480 train_time:16330ms step_avg:143.25ms step:125/1480 train_time:16478ms step_avg:143.28ms step:125/1480 val_loss:4.4025 train_time:16535ms step_avg:143.78ms step:126/1480 train_time:16631ms step_avg:143.37ms step:127/1480 train_time:16779ms step_avg:143.41ms step:128/1480 train_time:16927ms step_avg:143.45ms step:129/1480 train_time:17072ms step_avg:143.46ms step:130/1480 train_time:17219ms step_avg:143.49ms step:131/1480 train_time:17366ms step_avg:143.52ms step:132/1480 train_time:17512ms step_avg:143.54ms step:133/1480 train_time:17659ms step_avg:143.57ms step:134/1480 train_time:17808ms step_avg:143.61ms step:135/1480 train_time:17954ms step_avg:143.63ms step:136/1480 train_time:18101ms step_avg:143.66ms step:137/1480 train_time:18248ms step_avg:143.69ms step:138/1480 train_time:18394ms step_avg:143.70ms step:139/1480 train_time:18540ms step_avg:143.72ms step:140/1480 train_time:18690ms step_avg:143.77ms step:141/1480 train_time:18837ms step_avg:143.80ms step:142/1480 train_time:18986ms step_avg:143.83ms step:143/1480 train_time:19133ms step_avg:143.86ms step:144/1480 train_time:19279ms step_avg:143.87ms step:145/1480 train_time:19426ms step_avg:143.90ms step:146/1480 train_time:19573ms step_avg:143.92ms step:147/1480 train_time:19719ms 
step_avg:143.94ms step:148/1480 train_time:19868ms step_avg:143.97ms step:149/1480 train_time:20014ms step_avg:143.98ms step:150/1480 train_time:20162ms step_avg:144.01ms step:151/1480 train_time:20309ms step_avg:144.04ms step:152/1480 train_time:20455ms step_avg:144.05ms step:153/1480 train_time:20601ms step_avg:144.06ms step:154/1480 train_time:20748ms step_avg:144.08ms step:155/1480 train_time:20894ms step_avg:144.10ms step:156/1480 train_time:21041ms step_avg:144.11ms step:157/1480 train_time:21189ms step_avg:144.15ms step:158/1480 train_time:21335ms step_avg:144.16ms step:159/1480 train_time:21484ms step_avg:144.19ms step:160/1480 train_time:21631ms step_avg:144.21ms step:161/1480 train_time:21777ms step_avg:144.22ms step:162/1480 train_time:21926ms step_avg:144.25ms step:163/1480 train_time:22073ms step_avg:144.27ms step:164/1480 train_time:22221ms step_avg:144.29ms step:165/1480 train_time:22369ms step_avg:144.31ms step:166/1480 train_time:22515ms step_avg:144.33ms step:167/1480 train_time:22663ms step_avg:144.35ms step:168/1480 train_time:22809ms step_avg:144.36ms step:169/1480 train_time:22955ms step_avg:144.37ms step:170/1480 train_time:23104ms step_avg:144.40ms step:171/1480 train_time:23251ms step_avg:144.42ms step:172/1480 train_time:23398ms step_avg:144.43ms step:173/1480 train_time:23546ms step_avg:144.45ms step:174/1480 train_time:23693ms step_avg:144.47ms step:175/1480 train_time:23839ms step_avg:144.48ms step:176/1480 train_time:23987ms step_avg:144.50ms step:177/1480 train_time:24133ms step_avg:144.51ms step:178/1480 train_time:24281ms step_avg:144.53ms step:179/1480 train_time:24429ms step_avg:144.55ms step:180/1480 train_time:24574ms step_avg:144.55ms step:181/1480 train_time:24724ms step_avg:144.59ms step:182/1480 train_time:24872ms step_avg:144.60ms step:183/1480 train_time:25019ms step_avg:144.62ms step:184/1480 train_time:25166ms step_avg:144.63ms step:185/1480 train_time:25313ms step_avg:144.64ms step:186/1480 train_time:25459ms 
step_avg:144.66ms step:187/1480 train_time:25607ms step_avg:144.67ms step:188/1480 train_time:25752ms step_avg:144.68ms step:189/1480 train_time:25899ms step_avg:144.69ms step:190/1480 train_time:26047ms step_avg:144.70ms step:191/1480 train_time:26194ms step_avg:144.72ms step:192/1480 train_time:26340ms step_avg:144.72ms step:193/1480 train_time:26488ms step_avg:144.74ms step:194/1480 train_time:26634ms step_avg:144.75ms step:195/1480 train_time:26780ms step_avg:144.76ms step:196/1480 train_time:26927ms step_avg:144.77ms step:197/1480 train_time:27073ms step_avg:144.78ms step:198/1480 train_time:27220ms step_avg:144.79ms step:199/1480 train_time:27366ms step_avg:144.79ms step:200/1480 train_time:27513ms step_avg:144.80ms step:201/1480 train_time:27658ms step_avg:144.81ms step:202/1480 train_time:27807ms step_avg:144.83ms step:203/1480 train_time:27953ms step_avg:144.83ms step:204/1480 train_time:28098ms step_avg:144.84ms step:205/1480 train_time:28244ms step_avg:144.84ms step:206/1480 train_time:28392ms step_avg:144.86ms step:207/1480 train_time:28537ms step_avg:144.86ms step:208/1480 train_time:28684ms step_avg:144.87ms step:209/1480 train_time:28832ms step_avg:144.88ms step:210/1480 train_time:28978ms step_avg:144.89ms step:211/1480 train_time:29126ms step_avg:144.91ms step:212/1480 train_time:29272ms step_avg:144.91ms step:213/1480 train_time:29420ms step_avg:144.92ms step:214/1480 train_time:29567ms step_avg:144.94ms step:215/1480 train_time:29713ms step_avg:144.94ms step:216/1480 train_time:29859ms step_avg:144.95ms step:217/1480 train_time:30007ms step_avg:144.96ms step:218/1480 train_time:30153ms step_avg:144.96ms step:219/1480 train_time:30300ms step_avg:144.98ms step:220/1480 train_time:30448ms step_avg:144.99ms step:221/1480 train_time:30595ms step_avg:145.00ms step:222/1480 train_time:30745ms step_avg:145.03ms step:223/1480 train_time:30896ms step_avg:145.05ms step:224/1480 train_time:31046ms step_avg:145.08ms step:225/1480 train_time:31195ms 
step_avg:145.09ms step:226/1480 train_time:31345ms step_avg:145.12ms step:227/1480 train_time:31496ms step_avg:145.14ms step:228/1480 train_time:31647ms step_avg:145.17ms step:229/1480 train_time:31798ms step_avg:145.20ms step:230/1480 train_time:31948ms step_avg:145.22ms step:231/1480 train_time:32098ms step_avg:145.24ms step:232/1480 train_time:32248ms step_avg:145.26ms step:233/1480 train_time:32399ms step_avg:145.29ms step:234/1480 train_time:32550ms step_avg:145.31ms step:235/1480 train_time:32700ms step_avg:145.33ms step:236/1480 train_time:32851ms step_avg:145.36ms step:237/1480 train_time:32999ms step_avg:145.37ms step:238/1480 train_time:33150ms step_avg:145.39ms step:239/1480 train_time:33300ms step_avg:145.41ms step:240/1480 train_time:33450ms step_avg:145.44ms step:241/1480 train_time:33600ms step_avg:145.46ms step:242/1480 train_time:33751ms step_avg:145.48ms step:243/1480 train_time:33901ms step_avg:145.50ms step:244/1480 train_time:34051ms step_avg:145.52ms step:245/1480 train_time:34199ms step_avg:145.53ms step:246/1480 train_time:34349ms step_avg:145.55ms step:247/1480 train_time:34499ms step_avg:145.57ms step:248/1480 train_time:34650ms step_avg:145.59ms step:249/1480 train_time:34799ms step_avg:145.60ms step:250/1480 train_time:34950ms step_avg:145.62ms step:250/1480 val_loss:4.0029 train_time:35008ms step_avg:145.87ms step:251/1480 train_time:35108ms step_avg:145.67ms step:252/1480 train_time:35258ms step_avg:145.69ms step:253/1480 train_time:35408ms step_avg:145.71ms step:254/1480 train_time:35557ms step_avg:145.72ms step:255/1480 train_time:35707ms step_avg:145.74ms step:256/1480 train_time:35855ms step_avg:145.75ms step:257/1480 train_time:36006ms step_avg:145.77ms step:258/1480 train_time:36157ms step_avg:145.79ms step:259/1480 train_time:36308ms step_avg:145.81ms step:260/1480 train_time:36458ms step_avg:145.83ms step:261/1480 train_time:36609ms step_avg:145.85ms step:262/1480 train_time:36758ms step_avg:145.87ms step:263/1480 
train_time:36909ms step_avg:145.88ms step:264/1480 train_time:37059ms step_avg:145.90ms step:265/1480 train_time:37211ms step_avg:145.93ms step:266/1480 train_time:37362ms step_avg:145.95ms step:267/1480 train_time:37513ms step_avg:145.96ms step:268/1480 train_time:37664ms step_avg:145.98ms step:269/1480 train_time:37813ms step_avg:146.00ms step:270/1480 train_time:37963ms step_avg:146.01ms step:271/1480 train_time:38113ms step_avg:146.03ms step:272/1480 train_time:38263ms step_avg:146.04ms step:273/1480 train_time:38413ms step_avg:146.06ms step:274/1480 train_time:38564ms step_avg:146.08ms step:275/1480 train_time:38714ms step_avg:146.09ms step:276/1480 train_time:38864ms step_avg:146.11ms step:277/1480 train_time:39014ms step_avg:146.12ms step:278/1480 train_time:39164ms step_avg:146.14ms step:279/1480 train_time:39315ms step_avg:146.15ms step:280/1480 train_time:39467ms step_avg:146.17ms step:281/1480 train_time:39616ms step_avg:146.19ms step:282/1480 train_time:39768ms step_avg:146.21ms step:283/1480 train_time:39919ms step_avg:146.22ms step:284/1480 train_time:40069ms step_avg:146.24ms step:285/1480 train_time:40219ms step_avg:146.25ms step:286/1480 train_time:40370ms step_avg:146.27ms step:287/1480 train_time:40520ms step_avg:146.28ms step:288/1480 train_time:40671ms step_avg:146.30ms step:289/1480 train_time:40821ms step_avg:146.31ms step:290/1480 train_time:40972ms step_avg:146.33ms step:291/1480 train_time:41123ms step_avg:146.35ms step:292/1480 train_time:41274ms step_avg:146.36ms step:293/1480 train_time:41425ms step_avg:146.38ms step:294/1480 train_time:41574ms step_avg:146.39ms step:295/1480 train_time:41726ms step_avg:146.41ms step:296/1480 train_time:41876ms step_avg:146.42ms step:297/1480 train_time:42026ms step_avg:146.43ms step:298/1480 train_time:42176ms step_avg:146.44ms step:299/1480 train_time:42326ms step_avg:146.46ms step:300/1480 train_time:42477ms step_avg:146.47ms step:301/1480 train_time:42627ms step_avg:146.48ms step:302/1480 
train_time:42775ms step_avg:146.49ms step:303/1480 train_time:42928ms step_avg:146.51ms step:304/1480 train_time:43078ms step_avg:146.52ms step:305/1480 train_time:43230ms step_avg:146.54ms step:306/1480 train_time:43379ms step_avg:146.55ms step:307/1480 train_time:43530ms step_avg:146.57ms step:308/1480 train_time:43680ms step_avg:146.58ms step:309/1480 train_time:43833ms step_avg:146.60ms step:310/1480 train_time:43983ms step_avg:146.61ms step:311/1480 train_time:44133ms step_avg:146.62ms step:312/1480 train_time:44285ms step_avg:146.64ms step:313/1480 train_time:44435ms step_avg:146.65ms step:314/1480 train_time:44586ms step_avg:146.66ms step:315/1480 train_time:44734ms step_avg:146.67ms step:316/1480 train_time:44885ms step_avg:146.68ms step:317/1480 train_time:45035ms step_avg:146.69ms step:318/1480 train_time:45186ms step_avg:146.71ms step:319/1480 train_time:45335ms step_avg:146.72ms step:320/1480 train_time:45486ms step_avg:146.73ms step:321/1480 train_time:45636ms step_avg:146.74ms step:322/1480 train_time:45787ms step_avg:146.75ms step:323/1480 train_time:45937ms step_avg:146.76ms step:324/1480 train_time:46088ms step_avg:146.78ms step:325/1480 train_time:46238ms step_avg:146.79ms step:326/1480 train_time:46388ms step_avg:146.80ms step:327/1480 train_time:46538ms step_avg:146.81ms step:328/1480 train_time:46688ms step_avg:146.82ms step:329/1480 train_time:46837ms step_avg:146.82ms step:330/1480 train_time:46990ms step_avg:146.84ms step:331/1480 train_time:47145ms step_avg:146.87ms step:332/1480 train_time:47299ms step_avg:146.89ms step:333/1480 train_time:47453ms step_avg:146.91ms step:334/1480 train_time:47608ms step_avg:146.94ms step:335/1480 train_time:47762ms step_avg:146.96ms step:336/1480 train_time:47916ms step_avg:146.98ms step:337/1480 train_time:48070ms step_avg:147.00ms step:338/1480 train_time:48223ms step_avg:147.02ms step:339/1480 train_time:48377ms step_avg:147.04ms step:340/1480 train_time:48530ms step_avg:147.06ms step:341/1480 
train_time:48685ms step_avg:147.09ms step:342/1480 train_time:48839ms step_avg:147.11ms step:343/1480 train_time:48994ms step_avg:147.13ms step:344/1480 train_time:49150ms step_avg:147.16ms step:345/1480 train_time:49305ms step_avg:147.18ms step:346/1480 train_time:49458ms step_avg:147.20ms step:347/1480 train_time:49611ms step_avg:147.21ms step:348/1480 train_time:49765ms step_avg:147.23ms step:349/1480 train_time:49918ms step_avg:147.25ms step:350/1480 train_time:50073ms step_avg:147.27ms step:351/1480 train_time:50226ms step_avg:147.29ms step:352/1480 train_time:50382ms step_avg:147.31ms step:353/1480 train_time:50535ms step_avg:147.33ms step:354/1480 train_time:50689ms step_avg:147.35ms step:355/1480 train_time:50844ms step_avg:147.37ms step:356/1480 train_time:50996ms step_avg:147.39ms step:357/1480 train_time:51150ms step_avg:147.41ms step:358/1480 train_time:51304ms step_avg:147.43ms step:359/1480 train_time:51459ms step_avg:147.45ms step:360/1480 train_time:51614ms step_avg:147.47ms step:361/1480 train_time:51768ms step_avg:147.49ms step:362/1480 train_time:51922ms step_avg:147.51ms step:363/1480 train_time:52075ms step_avg:147.52ms step:364/1480 train_time:52230ms step_avg:147.54ms step:365/1480 train_time:52384ms step_avg:147.56ms step:366/1480 train_time:52538ms step_avg:147.58ms step:367/1480 train_time:52691ms step_avg:147.59ms step:368/1480 train_time:52844ms step_avg:147.61ms step:369/1480 train_time:52996ms step_avg:147.62ms step:370/1480 train_time:53149ms step_avg:147.63ms step:371/1480 train_time:53303ms step_avg:147.65ms step:372/1480 train_time:53457ms step_avg:147.67ms step:373/1480 train_time:53612ms step_avg:147.69ms step:374/1480 train_time:53765ms step_avg:147.71ms step:375/1480 train_time:53919ms step_avg:147.72ms step:375/1480 val_loss:3.8137 train_time:53980ms step_avg:147.89ms step:376/1480 train_time:54078ms step_avg:147.75ms step:377/1480 train_time:54233ms step_avg:147.77ms step:378/1480 train_time:54386ms step_avg:147.79ms 
step:379/1480 train_time:54540ms step_avg:147.81ms step:380/1480 train_time:54693ms step_avg:147.82ms step:381/1480 train_time:54845ms step_avg:147.83ms step:382/1480 train_time:54999ms step_avg:147.85ms step:383/1480 train_time:55154ms step_avg:147.87ms step:384/1480 train_time:55308ms step_avg:147.88ms step:385/1480 train_time:55462ms step_avg:147.90ms step:386/1480 train_time:55614ms step_avg:147.91ms step:387/1480 train_time:55767ms step_avg:147.92ms step:388/1480 train_time:55921ms step_avg:147.94ms step:389/1480 train_time:56075ms step_avg:147.95ms step:390/1480 train_time:56229ms step_avg:147.97ms step:391/1480 train_time:56384ms step_avg:147.99ms step:392/1480 train_time:56537ms step_avg:148.00ms step:393/1480 train_time:56689ms step_avg:148.01ms step:394/1480 train_time:56844ms step_avg:148.03ms step:395/1480 train_time:56996ms step_avg:148.04ms step:396/1480 train_time:57150ms step_avg:148.06ms step:397/1480 train_time:57305ms step_avg:148.07ms step:398/1480 train_time:57460ms step_avg:148.09ms step:399/1480 train_time:57614ms step_avg:148.11ms step:400/1480 train_time:57769ms step_avg:148.12ms step:401/1480 train_time:57922ms step_avg:148.14ms step:402/1480 train_time:58076ms step_avg:148.15ms step:403/1480 train_time:58230ms step_avg:148.17ms step:404/1480 train_time:58384ms step_avg:148.18ms step:405/1480 train_time:58538ms step_avg:148.20ms step:406/1480 train_time:58692ms step_avg:148.21ms step:407/1480 train_time:58846ms step_avg:148.23ms step:408/1480 train_time:59001ms step_avg:148.24ms step:409/1480 train_time:59153ms step_avg:148.25ms step:410/1480 train_time:59307ms step_avg:148.27ms step:411/1480 train_time:59460ms step_avg:148.28ms step:412/1480 train_time:59614ms step_avg:148.29ms step:413/1480 train_time:59768ms step_avg:148.31ms step:414/1480 train_time:59922ms step_avg:148.32ms step:415/1480 train_time:60076ms step_avg:148.33ms step:416/1480 train_time:60229ms step_avg:148.35ms step:417/1480 train_time:60383ms step_avg:148.36ms 
step:418/1480 train_time:60539ms step_avg:148.38ms step:419/1480 train_time:60692ms step_avg:148.39ms step:420/1480 train_time:60845ms step_avg:148.40ms step:421/1480 train_time:60998ms step_avg:148.41ms step:422/1480 train_time:61152ms step_avg:148.43ms step:423/1480 train_time:61307ms step_avg:148.44ms step:424/1480 train_time:61462ms step_avg:148.46ms step:425/1480 train_time:61617ms step_avg:148.47ms step:426/1480 train_time:61770ms step_avg:148.49ms step:427/1480 train_time:61925ms step_avg:148.50ms step:428/1480 train_time:62077ms step_avg:148.51ms step:429/1480 train_time:62231ms step_avg:148.52ms step:430/1480 train_time:62384ms step_avg:148.53ms step:431/1480 train_time:62538ms step_avg:148.55ms step:432/1480 train_time:62691ms step_avg:148.56ms step:433/1480 train_time:62846ms step_avg:148.57ms step:434/1480 train_time:63000ms step_avg:148.58ms step:435/1480 train_time:63153ms step_avg:148.60ms step:436/1480 train_time:63308ms step_avg:148.61ms step:437/1480 train_time:63461ms step_avg:148.62ms step:438/1480 train_time:63614ms step_avg:148.63ms step:439/1480 train_time:63769ms step_avg:148.65ms step:440/1480 train_time:63924ms step_avg:148.66ms step:441/1480 train_time:64082ms step_avg:148.68ms step:442/1480 train_time:64239ms step_avg:148.70ms step:443/1480 train_time:64394ms step_avg:148.72ms step:444/1480 train_time:64549ms step_avg:148.73ms step:445/1480 train_time:64705ms step_avg:148.75ms step:446/1480 train_time:64861ms step_avg:148.76ms step:447/1480 train_time:65017ms step_avg:148.78ms step:448/1480 train_time:65173ms step_avg:148.80ms step:449/1480 train_time:65330ms step_avg:148.81ms step:450/1480 train_time:65486ms step_avg:148.83ms step:451/1480 train_time:65645ms step_avg:148.86ms step:452/1480 train_time:65802ms step_avg:148.87ms step:453/1480 train_time:65958ms step_avg:148.89ms step:454/1480 train_time:66113ms step_avg:148.90ms step:455/1480 train_time:66269ms step_avg:148.92ms step:456/1480 train_time:66426ms step_avg:148.94ms 
step:457/1480 train_time:66582ms step_avg:148.95ms step:458/1480 train_time:66737ms step_avg:148.97ms step:459/1480 train_time:66893ms step_avg:148.98ms step:460/1480 train_time:67049ms step_avg:149.00ms step:461/1480 train_time:67207ms step_avg:149.02ms step:462/1480 train_time:67365ms step_avg:149.04ms step:463/1480 train_time:67524ms step_avg:149.06ms step:464/1480 train_time:67681ms step_avg:149.08ms step:465/1480 train_time:67836ms step_avg:149.09ms step:466/1480 train_time:67992ms step_avg:149.10ms step:467/1480 train_time:68150ms step_avg:149.13ms step:468/1480 train_time:68307ms step_avg:149.14ms step:469/1480 train_time:68464ms step_avg:149.16ms step:470/1480 train_time:68623ms step_avg:149.18ms step:471/1480 train_time:68781ms step_avg:149.20ms step:472/1480 train_time:68938ms step_avg:149.22ms step:473/1480 train_time:69094ms step_avg:149.23ms step:474/1480 train_time:69251ms step_avg:149.25ms step:475/1480 train_time:69409ms step_avg:149.27ms step:476/1480 train_time:69566ms step_avg:149.28ms step:477/1480 train_time:69723ms step_avg:149.30ms step:478/1480 train_time:69880ms step_avg:149.32ms step:479/1480 train_time:70036ms step_avg:149.33ms step:480/1480 train_time:70193ms step_avg:149.35ms step:481/1480 train_time:70350ms step_avg:149.36ms step:482/1480 train_time:70507ms step_avg:149.38ms step:483/1480 train_time:70665ms step_avg:149.40ms step:484/1480 train_time:70823ms step_avg:149.42ms step:485/1480 train_time:70981ms step_avg:149.43ms step:486/1480 train_time:71138ms step_avg:149.45ms step:487/1480 train_time:71294ms step_avg:149.46ms step:488/1480 train_time:71451ms step_avg:149.48ms step:489/1480 train_time:71607ms step_avg:149.49ms step:490/1480 train_time:71764ms step_avg:149.51ms step:491/1480 train_time:71922ms step_avg:149.53ms step:492/1480 train_time:72079ms step_avg:149.54ms step:493/1480 train_time:72237ms step_avg:149.56ms step:494/1480 train_time:72393ms step_avg:149.57ms step:495/1480 train_time:72550ms step_avg:149.59ms 
step:496/1480 train_time:72708ms step_avg:149.60ms step:497/1480 train_time:72864ms step_avg:149.62ms step:498/1480 train_time:73023ms step_avg:149.64ms step:499/1480 train_time:73181ms step_avg:149.65ms step:500/1480 train_time:73339ms step_avg:149.67ms step:500/1480 val_loss:3.6929 train_time:73401ms step_avg:149.80ms step:501/1480 train_time:73500ms step_avg:149.69ms step:502/1480 train_time:73657ms step_avg:149.71ms step:503/1480 train_time:73815ms step_avg:149.73ms step:504/1480 train_time:73971ms step_avg:149.74ms step:505/1480 train_time:74126ms step_avg:149.75ms step:506/1480 train_time:74281ms step_avg:149.76ms step:507/1480 train_time:74438ms step_avg:149.78ms step:508/1480 train_time:74598ms step_avg:149.79ms step:509/1480 train_time:74754ms step_avg:149.81ms step:510/1480 train_time:74910ms step_avg:149.82ms step:511/1480 train_time:75067ms step_avg:149.83ms step:512/1480 train_time:75225ms step_avg:149.85ms step:513/1480 train_time:75380ms step_avg:149.86ms step:514/1480 train_time:75538ms step_avg:149.88ms step:515/1480 train_time:75695ms step_avg:149.89ms step:516/1480 train_time:75853ms step_avg:149.91ms step:517/1480 train_time:76011ms step_avg:149.92ms step:518/1480 train_time:76169ms step_avg:149.94ms step:519/1480 train_time:76326ms step_avg:149.95ms step:520/1480 train_time:76483ms step_avg:149.97ms step:521/1480 train_time:76639ms step_avg:149.98ms step:522/1480 train_time:76796ms step_avg:149.99ms step:523/1480 train_time:76954ms step_avg:150.01ms step:524/1480 train_time:77111ms step_avg:150.02ms step:525/1480 train_time:77267ms step_avg:150.03ms step:526/1480 train_time:77424ms step_avg:150.05ms step:527/1480 train_time:77580ms step_avg:150.06ms step:528/1480 train_time:77737ms step_avg:150.07ms step:529/1480 train_time:77895ms step_avg:150.09ms step:530/1480 train_time:78052ms step_avg:150.10ms step:531/1480 train_time:78210ms step_avg:150.11ms step:532/1480 train_time:78366ms step_avg:150.13ms step:533/1480 train_time:78522ms 
step_avg:150.14ms step:534/1480 train_time:78678ms step_avg:150.15ms step:535/1480 train_time:78836ms step_avg:150.16ms step:536/1480 train_time:78994ms step_avg:150.18ms step:537/1480 train_time:79151ms step_avg:150.19ms step:538/1480 train_time:79309ms step_avg:150.21ms step:539/1480 train_time:79469ms step_avg:150.22ms step:540/1480 train_time:79625ms step_avg:150.24ms step:541/1480 train_time:79781ms step_avg:150.25ms step:542/1480 train_time:79937ms step_avg:150.26ms step:543/1480 train_time:80095ms step_avg:150.27ms step:544/1480 train_time:80251ms step_avg:150.28ms step:545/1480 train_time:80409ms step_avg:150.30ms step:546/1480 train_time:80565ms step_avg:150.31ms step:547/1480 train_time:80721ms step_avg:150.32ms step:548/1480 train_time:80878ms step_avg:150.33ms step:549/1480 train_time:81035ms step_avg:150.34ms step:550/1480 train_time:81193ms step_avg:150.36ms step:551/1480 train_time:81350ms step_avg:150.37ms step:552/1480 train_time:81511ms step_avg:150.39ms step:553/1480 train_time:81673ms step_avg:150.41ms step:554/1480 train_time:81835ms step_avg:150.43ms step:555/1480 train_time:81996ms step_avg:150.45ms step:556/1480 train_time:82155ms step_avg:150.47ms step:557/1480 train_time:82316ms step_avg:150.49ms step:558/1480 train_time:82476ms step_avg:150.50ms step:559/1480 train_time:82636ms step_avg:150.52ms step:560/1480 train_time:82796ms step_avg:150.54ms step:561/1480 train_time:82956ms step_avg:150.56ms step:562/1480 train_time:83117ms step_avg:150.57ms step:563/1480 train_time:83276ms step_avg:150.59ms step:564/1480 train_time:83437ms step_avg:150.61ms step:565/1480 train_time:83596ms step_avg:150.62ms step:566/1480 train_time:83756ms step_avg:150.64ms step:567/1480 train_time:83917ms step_avg:150.66ms step:568/1480 train_time:84076ms step_avg:150.67ms step:569/1480 train_time:84234ms step_avg:150.69ms step:570/1480 train_time:84392ms step_avg:150.70ms step:571/1480 train_time:84552ms step_avg:150.72ms step:572/1480 train_time:84711ms 
step_avg:150.73ms step:573/1480 train_time:84869ms step_avg:150.74ms step:574/1480 train_time:85032ms step_avg:150.77ms step:575/1480 train_time:85193ms step_avg:150.78ms step:576/1480 train_time:85353ms step_avg:150.80ms step:577/1480 train_time:85514ms step_avg:150.82ms step:578/1480 train_time:85673ms step_avg:150.83ms step:579/1480 train_time:85832ms step_avg:150.85ms step:580/1480 train_time:85990ms step_avg:150.86ms step:581/1480 train_time:86150ms step_avg:150.88ms step:582/1480 train_time:86309ms step_avg:150.89ms step:583/1480 train_time:86467ms step_avg:150.90ms step:584/1480 train_time:86626ms step_avg:150.92ms step:585/1480 train_time:86783ms step_avg:150.93ms step:586/1480 train_time:86943ms step_avg:150.94ms step:587/1480 train_time:87101ms step_avg:150.95ms step:588/1480 train_time:87259ms step_avg:150.97ms step:589/1480 train_time:87418ms step_avg:150.98ms step:590/1480 train_time:87578ms step_avg:151.00ms step:591/1480 train_time:87738ms step_avg:151.01ms step:592/1480 train_time:87898ms step_avg:151.03ms step:593/1480 train_time:88059ms step_avg:151.04ms step:594/1480 train_time:88219ms step_avg:151.06ms step:595/1480 train_time:88380ms step_avg:151.08ms step:596/1480 train_time:88541ms step_avg:151.09ms step:597/1480 train_time:88699ms step_avg:151.11ms step:598/1480 train_time:88857ms step_avg:151.12ms step:599/1480 train_time:89016ms step_avg:151.13ms step:600/1480 train_time:89176ms step_avg:151.15ms step:601/1480 train_time:89337ms step_avg:151.16ms step:602/1480 train_time:89497ms step_avg:151.18ms step:603/1480 train_time:89657ms step_avg:151.19ms step:604/1480 train_time:89817ms step_avg:151.21ms step:605/1480 train_time:89976ms step_avg:151.22ms step:606/1480 train_time:90138ms step_avg:151.24ms step:607/1480 train_time:90299ms step_avg:151.25ms step:608/1480 train_time:90458ms step_avg:151.27ms step:609/1480 train_time:90618ms step_avg:151.28ms step:610/1480 train_time:90776ms step_avg:151.29ms step:611/1480 train_time:90938ms 
step_avg:151.31ms step:612/1480 train_time:91098ms step_avg:151.32ms step:613/1480 train_time:91258ms step_avg:151.34ms step:614/1480 train_time:91418ms step_avg:151.35ms step:615/1480 train_time:91578ms step_avg:151.37ms step:616/1480 train_time:91738ms step_avg:151.38ms step:617/1480 train_time:91897ms step_avg:151.40ms step:618/1480 train_time:92056ms step_avg:151.41ms step:619/1480 train_time:92216ms step_avg:151.42ms step:620/1480 train_time:92377ms step_avg:151.44ms step:621/1480 train_time:92536ms step_avg:151.45ms step:622/1480 train_time:92696ms step_avg:151.46ms step:623/1480 train_time:92857ms step_avg:151.48ms step:624/1480 train_time:93017ms step_avg:151.49ms step:625/1480 train_time:93176ms step_avg:151.51ms step:625/1480 val_loss:3.6097 train_time:93239ms step_avg:151.61ms step:626/1480 train_time:93338ms step_avg:151.52ms step:627/1480 train_time:93500ms step_avg:151.54ms step:628/1480 train_time:93659ms step_avg:151.55ms step:629/1480 train_time:93816ms step_avg:151.56ms step:630/1480 train_time:93974ms step_avg:151.57ms step:631/1480 train_time:94131ms step_avg:151.58ms step:632/1480 train_time:94292ms step_avg:151.59ms step:633/1480 train_time:94450ms step_avg:151.61ms step:634/1480 train_time:94611ms step_avg:151.62ms step:635/1480 train_time:94770ms step_avg:151.63ms step:636/1480 train_time:94928ms step_avg:151.64ms step:637/1480 train_time:95088ms step_avg:151.66ms step:638/1480 train_time:95247ms step_avg:151.67ms step:639/1480 train_time:95406ms step_avg:151.68ms step:640/1480 train_time:95567ms step_avg:151.69ms step:641/1480 train_time:95727ms step_avg:151.71ms step:642/1480 train_time:95887ms step_avg:151.72ms step:643/1480 train_time:96046ms step_avg:151.73ms step:644/1480 train_time:96206ms step_avg:151.74ms step:645/1480 train_time:96366ms step_avg:151.76ms step:646/1480 train_time:96525ms step_avg:151.77ms step:647/1480 train_time:96685ms step_avg:151.78ms step:648/1480 train_time:96844ms step_avg:151.79ms step:649/1480 
train_time:97005ms step_avg:151.81ms step:650/1480 train_time:97165ms step_avg:151.82ms step:651/1480 train_time:97326ms step_avg:151.83ms step:652/1480 train_time:97487ms step_avg:151.85ms step:653/1480 train_time:97646ms step_avg:151.86ms step:654/1480 train_time:97807ms step_avg:151.87ms step:655/1480 train_time:97966ms step_avg:151.89ms step:656/1480 train_time:98126ms step_avg:151.90ms step:657/1480 train_time:98288ms step_avg:151.91ms step:658/1480 train_time:98447ms step_avg:151.92ms step:659/1480 train_time:98609ms step_avg:151.94ms step:660/1480 train_time:98770ms step_avg:151.95ms step:661/1480 train_time:98932ms step_avg:151.97ms step:662/1480 train_time:99092ms step_avg:151.98ms step:663/1480 train_time:99251ms step_avg:151.99ms step:664/1480 train_time:99414ms step_avg:152.01ms step:665/1480 train_time:99576ms step_avg:152.02ms step:666/1480 train_time:99737ms step_avg:152.04ms step:667/1480 train_time:99899ms step_avg:152.05ms step:668/1480 train_time:100062ms step_avg:152.07ms step:669/1480 train_time:100224ms step_avg:152.08ms step:670/1480 train_time:100385ms step_avg:152.10ms step:671/1480 train_time:100546ms step_avg:152.11ms step:672/1480 train_time:100708ms step_avg:152.13ms step:673/1480 train_time:100870ms step_avg:152.14ms step:674/1480 train_time:101031ms step_avg:152.15ms step:675/1480 train_time:101193ms step_avg:152.17ms step:676/1480 train_time:101355ms step_avg:152.18ms step:677/1480 train_time:101515ms step_avg:152.20ms step:678/1480 train_time:101675ms step_avg:152.21ms step:679/1480 train_time:101839ms step_avg:152.23ms step:680/1480 train_time:102003ms step_avg:152.24ms step:681/1480 train_time:102164ms step_avg:152.26ms step:682/1480 train_time:102327ms step_avg:152.27ms step:683/1480 train_time:102489ms step_avg:152.29ms step:684/1480 train_time:102650ms step_avg:152.30ms step:685/1480 train_time:102812ms step_avg:152.31ms step:686/1480 train_time:102973ms step_avg:152.33ms step:687/1480 train_time:103133ms step_avg:152.34ms 
step:688/1480 train_time:103297ms step_avg:152.35ms step:689/1480 train_time:103460ms step_avg:152.37ms step:690/1480 train_time:103623ms step_avg:152.39ms step:691/1480 train_time:103786ms step_avg:152.40ms step:692/1480 train_time:103948ms step_avg:152.42ms step:693/1480 train_time:104110ms step_avg:152.43ms step:694/1480 train_time:104271ms step_avg:152.44ms step:695/1480 train_time:104431ms step_avg:152.45ms step:696/1480 train_time:104592ms step_avg:152.47ms step:697/1480 train_time:104753ms step_avg:152.48ms step:698/1480 train_time:104913ms step_avg:152.49ms step:699/1480 train_time:105076ms step_avg:152.50ms step:700/1480 train_time:105237ms step_avg:152.52ms step:701/1480 train_time:105399ms step_avg:152.53ms step:702/1480 train_time:105559ms step_avg:152.54ms step:703/1480 train_time:105722ms step_avg:152.56ms step:704/1480 train_time:105883ms step_avg:152.57ms step:705/1480 train_time:106046ms step_avg:152.58ms step:706/1480 train_time:106211ms step_avg:152.60ms step:707/1480 train_time:106373ms step_avg:152.61ms step:708/1480 train_time:106532ms step_avg:152.63ms step:709/1480 train_time:106693ms step_avg:152.64ms step:710/1480 train_time:106852ms step_avg:152.65ms step:711/1480 train_time:107014ms step_avg:152.66ms step:712/1480 train_time:107180ms step_avg:152.68ms step:713/1480 train_time:107343ms step_avg:152.69ms step:714/1480 train_time:107505ms step_avg:152.71ms step:715/1480 train_time:107666ms step_avg:152.72ms step:716/1480 train_time:107826ms step_avg:152.73ms step:717/1480 train_time:107990ms step_avg:152.74ms step:718/1480 train_time:108149ms step_avg:152.75ms step:719/1480 train_time:108310ms step_avg:152.76ms step:720/1480 train_time:108472ms step_avg:152.78ms step:721/1480 train_time:108633ms step_avg:152.79ms step:722/1480 train_time:108796ms step_avg:152.80ms step:723/1480 train_time:108956ms step_avg:152.81ms step:724/1480 train_time:109118ms step_avg:152.83ms step:725/1480 train_time:109282ms step_avg:152.84ms step:726/1480 
train_time:109445ms step_avg:152.86ms step:727/1480 train_time:109609ms step_avg:152.87ms step:728/1480 train_time:109769ms step_avg:152.88ms step:729/1480 train_time:109930ms step_avg:152.89ms step:730/1480 train_time:110092ms step_avg:152.91ms step:731/1480 train_time:110253ms step_avg:152.92ms step:732/1480 train_time:110413ms step_avg:152.93ms step:733/1480 train_time:110576ms step_avg:152.94ms step:734/1480 train_time:110737ms step_avg:152.95ms step:735/1480 train_time:110898ms step_avg:152.96ms step:736/1480 train_time:111060ms step_avg:152.97ms step:737/1480 train_time:111222ms step_avg:152.99ms step:738/1480 train_time:111386ms step_avg:153.00ms step:739/1480 train_time:111547ms step_avg:153.01ms step:740/1480 train_time:111713ms step_avg:153.03ms step:741/1480 train_time:111875ms step_avg:153.04ms step:742/1480 train_time:112036ms step_avg:153.06ms step:743/1480 train_time:112197ms step_avg:153.07ms step:744/1480 train_time:112362ms step_avg:153.08ms step:745/1480 train_time:112527ms step_avg:153.10ms step:746/1480 train_time:112688ms step_avg:153.11ms step:747/1480 train_time:112849ms step_avg:153.12ms step:748/1480 train_time:113016ms step_avg:153.14ms step:749/1480 train_time:113178ms step_avg:153.15ms step:750/1480 train_time:113336ms step_avg:153.16ms step:750/1480 val_loss:3.5551 train_time:113401ms step_avg:153.25ms step:751/1480 train_time:113501ms step_avg:153.17ms step:752/1480 train_time:113662ms step_avg:153.18ms step:753/1480 train_time:113823ms step_avg:153.19ms step:754/1480 train_time:113983ms step_avg:153.20ms step:755/1480 train_time:114145ms step_avg:153.21ms step:756/1480 train_time:114306ms step_avg:153.23ms step:757/1480 train_time:114472ms step_avg:153.24ms step:758/1480 train_time:114634ms step_avg:153.25ms step:759/1480 train_time:114797ms step_avg:153.27ms step:760/1480 train_time:114958ms step_avg:153.28ms step:761/1480 train_time:115119ms step_avg:153.29ms step:762/1480 train_time:115280ms step_avg:153.30ms step:763/1480 
train_time:115441ms step_avg:153.31ms step:764/1480 train_time:115601ms step_avg:153.32ms step:765/1480 train_time:115763ms step_avg:153.33ms step:766/1480 train_time:115926ms step_avg:153.34ms step:767/1480 train_time:116088ms step_avg:153.35ms step:768/1480 train_time:116251ms step_avg:153.37ms step:769/1480 train_time:116414ms step_avg:153.38ms step:770/1480 train_time:116577ms step_avg:153.39ms step:771/1480 train_time:116741ms step_avg:153.40ms step:772/1480 train_time:116903ms step_avg:153.42ms step:773/1480 train_time:117066ms step_avg:153.43ms step:774/1480 train_time:117229ms step_avg:153.44ms step:775/1480 train_time:117393ms step_avg:153.45ms step:776/1480 train_time:117557ms step_avg:153.47ms step:777/1480 train_time:117723ms step_avg:153.48ms step:778/1480 train_time:117885ms step_avg:153.50ms step:779/1480 train_time:118048ms step_avg:153.51ms step:780/1480 train_time:118212ms step_avg:153.52ms step:781/1480 train_time:118377ms step_avg:153.54ms step:782/1480 train_time:118539ms step_avg:153.55ms step:783/1480 train_time:118701ms step_avg:153.56ms step:784/1480 train_time:118864ms step_avg:153.57ms step:785/1480 train_time:119025ms step_avg:153.58ms step:786/1480 train_time:119190ms step_avg:153.60ms step:787/1480 train_time:119354ms step_avg:153.61ms step:788/1480 train_time:119517ms step_avg:153.62ms step:789/1480 train_time:119679ms step_avg:153.63ms step:790/1480 train_time:119845ms step_avg:153.65ms step:791/1480 train_time:120011ms step_avg:153.66ms step:792/1480 train_time:120177ms step_avg:153.68ms step:793/1480 train_time:120338ms step_avg:153.69ms step:794/1480 train_time:120501ms step_avg:153.70ms step:795/1480 train_time:120665ms step_avg:153.71ms step:796/1480 train_time:120832ms step_avg:153.73ms step:797/1480 train_time:120997ms step_avg:153.74ms step:798/1480 train_time:121161ms step_avg:153.76ms step:799/1480 train_time:121329ms step_avg:153.78ms step:800/1480 train_time:121493ms step_avg:153.79ms step:801/1480 train_time:121657ms 
step_avg:153.80ms step:802/1480 train_time:121823ms step_avg:153.82ms step:803/1480 train_time:121985ms step_avg:153.83ms step:804/1480 train_time:122148ms step_avg:153.84ms step:805/1480 train_time:122313ms step_avg:153.85ms step:806/1480 train_time:122475ms step_avg:153.86ms step:807/1480 train_time:122637ms step_avg:153.87ms step:808/1480 train_time:122801ms step_avg:153.89ms step:809/1480 train_time:122963ms step_avg:153.90ms step:810/1480 train_time:123124ms step_avg:153.91ms step:811/1480 train_time:123286ms step_avg:153.92ms step:812/1480 train_time:123452ms step_avg:153.93ms step:813/1480 train_time:123614ms step_avg:153.94ms step:814/1480 train_time:123777ms step_avg:153.95ms step:815/1480 train_time:123938ms step_avg:153.96ms step:816/1480 train_time:124103ms step_avg:153.97ms step:817/1480 train_time:124264ms step_avg:153.98ms step:818/1480 train_time:124424ms step_avg:153.99ms step:819/1480 train_time:124588ms step_avg:154.00ms step:820/1480 train_time:124754ms step_avg:154.02ms step:821/1480 train_time:124916ms step_avg:154.03ms step:822/1480 train_time:125080ms step_avg:154.04ms step:823/1480 train_time:125242ms step_avg:154.05ms step:824/1480 train_time:125404ms step_avg:154.06ms step:825/1480 train_time:125570ms step_avg:154.07ms step:826/1480 train_time:125737ms step_avg:154.09ms step:827/1480 train_time:125901ms step_avg:154.10ms step:828/1480 train_time:126065ms step_avg:154.11ms step:829/1480 train_time:126229ms step_avg:154.13ms step:830/1480 train_time:126395ms step_avg:154.14ms step:831/1480 train_time:126558ms step_avg:154.15ms step:832/1480 train_time:126720ms step_avg:154.16ms step:833/1480 train_time:126884ms step_avg:154.17ms step:834/1480 train_time:127049ms step_avg:154.19ms step:835/1480 train_time:127212ms step_avg:154.20ms step:836/1480 train_time:127377ms step_avg:154.21ms step:837/1480 train_time:127540ms step_avg:154.22ms step:838/1480 train_time:127702ms step_avg:154.23ms step:839/1480 train_time:127864ms step_avg:154.24ms 
step:840/1480 train_time:128024ms step_avg:154.25ms step:841/1480 train_time:128184ms step_avg:154.25ms step:842/1480 train_time:128350ms step_avg:154.27ms step:843/1480 train_time:128513ms step_avg:154.28ms step:844/1480 train_time:128675ms step_avg:154.29ms step:845/1480 train_time:128839ms step_avg:154.30ms step:846/1480 train_time:129003ms step_avg:154.31ms step:847/1480 train_time:129167ms step_avg:154.32ms step:848/1480 train_time:129329ms step_avg:154.33ms step:849/1480 train_time:129493ms step_avg:154.34ms step:850/1480 train_time:129656ms step_avg:154.35ms step:851/1480 train_time:129819ms step_avg:154.36ms step:852/1480 train_time:129980ms step_avg:154.37ms step:853/1480 train_time:130142ms step_avg:154.38ms step:854/1480 train_time:130306ms step_avg:154.39ms step:855/1480 train_time:130471ms step_avg:154.40ms step:856/1480 train_time:130633ms step_avg:154.41ms step:857/1480 train_time:130799ms step_avg:154.43ms step:858/1480 train_time:130965ms step_avg:154.44ms step:859/1480 train_time:131130ms step_avg:154.45ms step:860/1480 train_time:131292ms step_avg:154.46ms step:861/1480 train_time:131458ms step_avg:154.47ms step:862/1480 train_time:131628ms step_avg:154.49ms step:863/1480 train_time:131796ms step_avg:154.51ms step:864/1480 train_time:131960ms step_avg:154.52ms step:865/1480 train_time:132120ms step_avg:154.53ms step:866/1480 train_time:132286ms step_avg:154.54ms step:867/1480 train_time:132449ms step_avg:154.55ms step:868/1480 train_time:132612ms step_avg:154.56ms step:869/1480 train_time:132776ms step_avg:154.57ms step:870/1480 train_time:132939ms step_avg:154.58ms step:871/1480 train_time:133102ms step_avg:154.59ms step:872/1480 train_time:133268ms step_avg:154.60ms step:873/1480 train_time:133431ms step_avg:154.61ms step:874/1480 train_time:133597ms step_avg:154.63ms step:875/1480 train_time:133760ms step_avg:154.64ms step:875/1480 val_loss:3.5092 train_time:133824ms step_avg:154.71ms step:876/1480 train_time:133924ms step_avg:154.65ms 
step:877/1480 train_time:134089ms step_avg:154.66ms step:878/1480 train_time:134252ms step_avg:154.67ms step:879/1480 train_time:134417ms step_avg:154.68ms step:880/1480 train_time:134582ms step_avg:154.69ms step:881/1480 train_time:134744ms step_avg:154.70ms step:882/1480 train_time:134909ms step_avg:154.71ms step:883/1480 train_time:135073ms step_avg:154.72ms step:884/1480 train_time:135241ms step_avg:154.74ms step:885/1480 train_time:135408ms step_avg:154.75ms step:886/1480 train_time:135574ms step_avg:154.76ms step:887/1480 train_time:135743ms step_avg:154.78ms step:888/1480 train_time:135915ms step_avg:154.80ms step:889/1480 train_time:136083ms step_avg:154.82ms step:890/1480 train_time:136245ms step_avg:154.82ms step:891/1480 train_time:136410ms step_avg:154.84ms step:892/1480 train_time:136575ms step_avg:154.85ms step:893/1480 train_time:136738ms step_avg:154.86ms step:894/1480 train_time:136906ms step_avg:154.87ms step:895/1480 train_time:137072ms step_avg:154.88ms step:896/1480 train_time:137237ms step_avg:154.89ms step:897/1480 train_time:137403ms step_avg:154.91ms step:898/1480 train_time:137570ms step_avg:154.92ms step:899/1480 train_time:137734ms step_avg:154.93ms step:900/1480 train_time:137900ms step_avg:154.94ms step:901/1480 train_time:138063ms step_avg:154.95ms step:902/1480 train_time:138226ms step_avg:154.96ms step:903/1480 train_time:138396ms step_avg:154.98ms step:904/1480 train_time:138561ms step_avg:154.99ms step:905/1480 train_time:138724ms step_avg:155.00ms step:906/1480 train_time:138890ms step_avg:155.01ms step:907/1480 train_time:139057ms step_avg:155.03ms step:908/1480 train_time:139222ms step_avg:155.04ms step:909/1480 train_time:139386ms step_avg:155.05ms step:910/1480 train_time:139559ms step_avg:155.07ms step:911/1480 train_time:139724ms step_avg:155.08ms step:912/1480 train_time:139891ms step_avg:155.09ms step:913/1480 train_time:140058ms step_avg:155.10ms step:914/1480 train_time:140226ms step_avg:155.12ms step:915/1480 
train_time:140396ms step_avg:155.13ms step:916/1480 train_time:140559ms step_avg:155.14ms step:917/1480 train_time:140722ms step_avg:155.15ms step:918/1480 train_time:140890ms step_avg:155.16ms step:919/1480 train_time:141059ms step_avg:155.18ms step:920/1480 train_time:141225ms step_avg:155.19ms step:921/1480 train_time:141391ms step_avg:155.20ms step:922/1480 train_time:141559ms step_avg:155.22ms step:923/1480 train_time:141722ms step_avg:155.23ms step:924/1480 train_time:141886ms step_avg:155.24ms step:925/1480 train_time:142050ms step_avg:155.25ms step:926/1480 train_time:142213ms step_avg:155.25ms step:927/1480 train_time:142379ms step_avg:155.27ms step:928/1480 train_time:142545ms step_avg:155.28ms step:929/1480 train_time:142710ms step_avg:155.29ms step:930/1480 train_time:142876ms step_avg:155.30ms step:931/1480 train_time:143039ms step_avg:155.31ms step:932/1480 train_time:143205ms step_avg:155.32ms step:933/1480 train_time:143371ms step_avg:155.33ms step:934/1480 train_time:143538ms step_avg:155.34ms step:935/1480 train_time:143710ms step_avg:155.36ms step:936/1480 train_time:143878ms step_avg:155.38ms step:937/1480 train_time:144046ms step_avg:155.39ms step:938/1480 train_time:144210ms step_avg:155.40ms step:939/1480 train_time:144381ms step_avg:155.42ms step:940/1480 train_time:144548ms step_avg:155.43ms step:941/1480 train_time:144713ms step_avg:155.44ms step:942/1480 train_time:144880ms step_avg:155.45ms step:943/1480 train_time:145048ms step_avg:155.46ms step:944/1480 train_time:145222ms step_avg:155.48ms step:945/1480 train_time:145385ms step_avg:155.49ms step:946/1480 train_time:145554ms step_avg:155.51ms step:947/1480 train_time:145722ms step_avg:155.52ms step:948/1480 train_time:145887ms step_avg:155.53ms step:949/1480 train_time:146052ms step_avg:155.54ms step:950/1480 train_time:146217ms step_avg:155.55ms step:951/1480 train_time:146384ms step_avg:155.56ms step:952/1480 train_time:146548ms step_avg:155.57ms step:953/1480 train_time:146718ms 
step_avg:155.59ms step:954/1480 train_time:146887ms step_avg:155.60ms step:955/1480 train_time:147050ms step_avg:155.61ms step:956/1480 train_time:147217ms step_avg:155.62ms step:957/1480 train_time:147385ms step_avg:155.63ms step:958/1480 train_time:147556ms step_avg:155.65ms step:959/1480 train_time:147722ms step_avg:155.66ms step:960/1480 train_time:147889ms step_avg:155.67ms step:961/1480 train_time:148053ms step_avg:155.68ms step:962/1480 train_time:148218ms step_avg:155.69ms step:963/1480 train_time:148383ms step_avg:155.70ms step:964/1480 train_time:148551ms step_avg:155.71ms step:965/1480 train_time:148716ms step_avg:155.72ms step:966/1480 train_time:148881ms step_avg:155.73ms step:967/1480 train_time:149045ms step_avg:155.74ms step:968/1480 train_time:149211ms step_avg:155.75ms step:969/1480 train_time:149377ms step_avg:155.76ms step:970/1480 train_time:149541ms step_avg:155.77ms step:971/1480 train_time:149705ms step_avg:155.78ms step:972/1480 train_time:149868ms step_avg:155.79ms step:973/1480 train_time:150031ms step_avg:155.80ms step:974/1480 train_time:150200ms step_avg:155.81ms step:975/1480 train_time:150366ms step_avg:155.82ms step:976/1480 train_time:150530ms step_avg:155.83ms step:977/1480 train_time:150693ms step_avg:155.84ms step:978/1480 train_time:150858ms step_avg:155.85ms step:979/1480 train_time:151025ms step_avg:155.86ms step:980/1480 train_time:151191ms step_avg:155.87ms step:981/1480 train_time:151360ms step_avg:155.88ms step:982/1480 train_time:151525ms step_avg:155.89ms step:983/1480 train_time:151690ms step_avg:155.90ms step:984/1480 train_time:151853ms step_avg:155.91ms step:985/1480 train_time:152021ms step_avg:155.92ms step:986/1480 train_time:152186ms step_avg:155.93ms step:987/1480 train_time:152348ms step_avg:155.93ms step:988/1480 train_time:152517ms step_avg:155.95ms step:989/1480 train_time:152684ms step_avg:155.96ms step:990/1480 train_time:152852ms step_avg:155.97ms step:991/1480 train_time:153019ms step_avg:155.98ms 
step:992/1480 train_time:153192ms step_avg:156.00ms step:993/1480 train_time:153368ms step_avg:156.02ms step:994/1480 train_time:153533ms step_avg:156.03ms step:995/1480 train_time:153698ms step_avg:156.04ms step:996/1480 train_time:153861ms step_avg:156.05ms step:997/1480 train_time:154027ms step_avg:156.06ms step:998/1480 train_time:154191ms step_avg:156.06ms step:999/1480 train_time:154358ms step_avg:156.07ms step:1000/1480 train_time:154527ms step_avg:156.09ms step:1000/1480 val_loss:3.4467 train_time:154596ms step_avg:156.16ms step:1001/1480 train_time:154698ms step_avg:156.10ms step:1002/1480 train_time:154863ms step_avg:156.11ms step:1003/1480 train_time:155037ms step_avg:156.13ms step:1004/1480 train_time:155206ms step_avg:156.14ms step:1005/1480 train_time:155374ms step_avg:156.16ms step:1006/1480 train_time:155543ms step_avg:156.17ms step:1007/1480 train_time:155708ms step_avg:156.18ms step:1008/1480 train_time:155876ms step_avg:156.19ms step:1009/1480 train_time:156051ms step_avg:156.21ms step:1010/1480 train_time:156218ms step_avg:156.22ms step:1011/1480 train_time:156383ms step_avg:156.23ms step:1012/1480 train_time:156546ms step_avg:156.23ms step:1013/1480 train_time:156717ms step_avg:156.25ms step:1014/1480 train_time:156885ms step_avg:156.26ms step:1015/1480 train_time:157056ms step_avg:156.27ms step:1016/1480 train_time:157222ms step_avg:156.28ms step:1017/1480 train_time:157393ms step_avg:156.30ms step:1018/1480 train_time:157561ms step_avg:156.31ms step:1019/1480 train_time:157731ms step_avg:156.32ms step:1020/1480 train_time:157899ms step_avg:156.34ms step:1021/1480 train_time:158064ms step_avg:156.34ms step:1022/1480 train_time:158228ms step_avg:156.35ms step:1023/1480 train_time:158396ms step_avg:156.36ms step:1024/1480 train_time:158563ms step_avg:156.37ms step:1025/1480 train_time:158735ms step_avg:156.39ms step:1026/1480 train_time:158901ms step_avg:156.40ms step:1027/1480 train_time:159066ms step_avg:156.41ms step:1028/1480 
train_time:159240ms step_avg:156.42ms step:1029/1480 train_time:159415ms step_avg:156.44ms step:1030/1480 train_time:159581ms step_avg:156.45ms step:1031/1480 train_time:159745ms step_avg:156.46ms step:1032/1480 train_time:159917ms step_avg:156.47ms step:1033/1480 train_time:160084ms step_avg:156.48ms step:1034/1480 train_time:160253ms step_avg:156.50ms step:1035/1480 train_time:160421ms step_avg:156.51ms step:1036/1480 train_time:160587ms step_avg:156.52ms step:1037/1480 train_time:160756ms step_avg:156.53ms step:1038/1480 train_time:160924ms step_avg:156.54ms step:1039/1480 train_time:161095ms step_avg:156.55ms step:1040/1480 train_time:161260ms step_avg:156.56ms step:1041/1480 train_time:161427ms step_avg:156.57ms step:1042/1480 train_time:161591ms step_avg:156.58ms step:1043/1480 train_time:161757ms step_avg:156.59ms step:1044/1480 train_time:161921ms step_avg:156.60ms step:1045/1480 train_time:162092ms step_avg:156.61ms step:1046/1480 train_time:162260ms step_avg:156.62ms step:1047/1480 train_time:162425ms step_avg:156.63ms step:1048/1480 train_time:162591ms step_avg:156.64ms step:1049/1480 train_time:162757ms step_avg:156.65ms step:1050/1480 train_time:162927ms step_avg:156.66ms step:1051/1480 train_time:163097ms step_avg:156.67ms step:1052/1480 train_time:163266ms step_avg:156.68ms step:1053/1480 train_time:163432ms step_avg:156.69ms step:1054/1480 train_time:163599ms step_avg:156.70ms step:1055/1480 train_time:163763ms step_avg:156.71ms step:1056/1480 train_time:163929ms step_avg:156.72ms step:1057/1480 train_time:164095ms step_avg:156.73ms step:1058/1480 train_time:164264ms step_avg:156.74ms step:1059/1480 train_time:164438ms step_avg:156.76ms step:1060/1480 train_time:164607ms step_avg:156.77ms step:1061/1480 train_time:164771ms step_avg:156.78ms step:1062/1480 train_time:164939ms step_avg:156.79ms step:1063/1480 train_time:165103ms step_avg:156.79ms step:1064/1480 train_time:165267ms step_avg:156.80ms step:1065/1480 train_time:165436ms step_avg:156.81ms 
step:1066/1480 train_time:165604ms step_avg:156.82ms step:1067/1480 train_time:165776ms step_avg:156.84ms step:1068/1480 train_time:165942ms step_avg:156.84ms step:1069/1480 train_time:166115ms step_avg:156.86ms step:1070/1480 train_time:166281ms step_avg:156.87ms step:1071/1480 train_time:166455ms step_avg:156.89ms step:1072/1480 train_time:166621ms step_avg:156.89ms step:1073/1480 train_time:166784ms step_avg:156.90ms step:1074/1480 train_time:166952ms step_avg:156.91ms step:1075/1480 train_time:167121ms step_avg:156.92ms step:1076/1480 train_time:167287ms step_avg:156.93ms step:1077/1480 train_time:167455ms step_avg:156.94ms step:1078/1480 train_time:167628ms step_avg:156.96ms step:1079/1480 train_time:167800ms step_avg:156.97ms step:1080/1480 train_time:167971ms step_avg:156.98ms step:1081/1480 train_time:168139ms step_avg:156.99ms step:1082/1480 train_time:168305ms step_avg:157.00ms step:1083/1480 train_time:168471ms step_avg:157.01ms step:1084/1480 train_time:168639ms step_avg:157.02ms step:1085/1480 train_time:168805ms step_avg:157.03ms step:1086/1480 train_time:168974ms step_avg:157.04ms step:1087/1480 train_time:169140ms step_avg:157.05ms step:1088/1480 train_time:169309ms step_avg:157.06ms step:1089/1480 train_time:169482ms step_avg:157.07ms step:1090/1480 train_time:169656ms step_avg:157.09ms step:1091/1480 train_time:169824ms step_avg:157.10ms step:1092/1480 train_time:169991ms step_avg:157.11ms step:1093/1480 train_time:170160ms step_avg:157.12ms step:1094/1480 train_time:170325ms step_avg:157.13ms step:1095/1480 train_time:170489ms step_avg:157.13ms step:1096/1480 train_time:170659ms step_avg:157.14ms step:1097/1480 train_time:170829ms step_avg:157.16ms step:1098/1480 train_time:170999ms step_avg:157.17ms step:1099/1480 train_time:171170ms step_avg:157.18ms step:1100/1480 train_time:171341ms step_avg:157.19ms step:1101/1480 train_time:171512ms step_avg:157.21ms step:1102/1480 train_time:171682ms step_avg:157.22ms step:1103/1480 train_time:171859ms 
step_avg:157.24ms step:1104/1480 train_time:172027ms step_avg:157.25ms step:1105/1480 train_time:172197ms step_avg:157.26ms step:1106/1480 train_time:172365ms step_avg:157.27ms step:1107/1480 train_time:172535ms step_avg:157.28ms step:1108/1480 train_time:172700ms step_avg:157.29ms step:1109/1480 train_time:172866ms step_avg:157.29ms step:1110/1480 train_time:173035ms step_avg:157.30ms step:1111/1480 train_time:173201ms step_avg:157.31ms step:1112/1480 train_time:173371ms step_avg:157.32ms step:1113/1480 train_time:173552ms step_avg:157.35ms step:1114/1480 train_time:173724ms step_avg:157.36ms step:1115/1480 train_time:173897ms step_avg:157.37ms step:1116/1480 train_time:174064ms step_avg:157.38ms step:1117/1480 train_time:174237ms step_avg:157.40ms step:1118/1480 train_time:174410ms step_avg:157.41ms step:1119/1480 train_time:174575ms step_avg:157.42ms step:1120/1480 train_time:174745ms step_avg:157.43ms step:1121/1480 train_time:174916ms step_avg:157.44ms step:1122/1480 train_time:175082ms step_avg:157.45ms step:1123/1480 train_time:175249ms step_avg:157.46ms step:1124/1480 train_time:175417ms step_avg:157.47ms step:1125/1480 train_time:175585ms step_avg:157.47ms step:1125/1480 val_loss:3.3902 train_time:175653ms step_avg:157.54ms step:1126/1480 train_time:175754ms step_avg:157.49ms step:1127/1480 train_time:175923ms step_avg:157.50ms step:1128/1480 train_time:176095ms step_avg:157.51ms step:1129/1480 train_time:176267ms step_avg:157.52ms step:1130/1480 train_time:176438ms step_avg:157.53ms step:1131/1480 train_time:176617ms step_avg:157.55ms step:1132/1480 train_time:176781ms step_avg:157.56ms step:1133/1480 train_time:176954ms step_avg:157.57ms step:1134/1480 train_time:177125ms step_avg:157.58ms step:1135/1480 train_time:177293ms step_avg:157.59ms step:1136/1480 train_time:177462ms step_avg:157.60ms step:1137/1480 train_time:177632ms step_avg:157.61ms step:1138/1480 train_time:177802ms step_avg:157.63ms step:1139/1480 train_time:177970ms step_avg:157.64ms 
step:1140/1480 train_time:178138ms step_avg:157.64ms step:1141/1480 train_time:178312ms step_avg:157.66ms step:1142/1480 train_time:178480ms step_avg:157.67ms step:1143/1480 train_time:178653ms step_avg:157.68ms step:1144/1480 train_time:178820ms step_avg:157.69ms step:1145/1480 train_time:178986ms step_avg:157.70ms step:1146/1480 train_time:179157ms step_avg:157.71ms step:1147/1480 train_time:179324ms step_avg:157.72ms step:1148/1480 train_time:179494ms step_avg:157.73ms step:1149/1480 train_time:179664ms step_avg:157.74ms step:1150/1480 train_time:179834ms step_avg:157.75ms step:1151/1480 train_time:180005ms step_avg:157.76ms step:1152/1480 train_time:180178ms step_avg:157.77ms step:1153/1480 train_time:180351ms step_avg:157.79ms step:1154/1480 train_time:180518ms step_avg:157.80ms step:1155/1480 train_time:180690ms step_avg:157.81ms step:1156/1480 train_time:180872ms step_avg:157.83ms step:1157/1480 train_time:181040ms step_avg:157.84ms step:1158/1480 train_time:181206ms step_avg:157.85ms step:1159/1480 train_time:181375ms step_avg:157.85ms step:1160/1480 train_time:181540ms step_avg:157.86ms step:1161/1480 train_time:181710ms step_avg:157.87ms step:1162/1480 train_time:181880ms step_avg:157.88ms step:1163/1480 train_time:182049ms step_avg:157.89ms step:1164/1480 train_time:182217ms step_avg:157.90ms step:1165/1480 train_time:182381ms step_avg:157.91ms step:1166/1480 train_time:182551ms step_avg:157.92ms step:1167/1480 train_time:182719ms step_avg:157.92ms step:1168/1480 train_time:182888ms step_avg:157.93ms step:1169/1480 train_time:183057ms step_avg:157.94ms step:1170/1480 train_time:183224ms step_avg:157.95ms step:1171/1480 train_time:183391ms step_avg:157.96ms step:1172/1480 train_time:183560ms step_avg:157.97ms step:1173/1480 train_time:183732ms step_avg:157.98ms step:1174/1480 train_time:183915ms step_avg:158.00ms step:1175/1480 train_time:184086ms step_avg:158.01ms step:1176/1480 train_time:184259ms step_avg:158.03ms step:1177/1480 train_time:184437ms 
step_avg:158.04ms step:1178/1480 train_time:184602ms step_avg:158.05ms step:1179/1480 train_time:184770ms step_avg:158.06ms step:1180/1480 train_time:184949ms step_avg:158.08ms step:1181/1480 train_time:185118ms step_avg:158.09ms step:1182/1480 train_time:185287ms step_avg:158.10ms step:1183/1480 train_time:185459ms step_avg:158.11ms step:1184/1480 train_time:185627ms step_avg:158.11ms step:1185/1480 train_time:185799ms step_avg:158.13ms step:1186/1480 train_time:185969ms step_avg:158.14ms step:1187/1480 train_time:186152ms step_avg:158.16ms step:1188/1480 train_time:186317ms step_avg:158.16ms step:1189/1480 train_time:186487ms step_avg:158.17ms step:1190/1480 train_time:186655ms step_avg:158.18ms step:1191/1480 train_time:186826ms step_avg:158.19ms step:1192/1480 train_time:186992ms step_avg:158.20ms step:1193/1480 train_time:187159ms step_avg:158.21ms step:1194/1480 train_time:187329ms step_avg:158.22ms step:1195/1480 train_time:187503ms step_avg:158.23ms step:1196/1480 train_time:187685ms step_avg:158.25ms step:1197/1480 train_time:187858ms step_avg:158.26ms step:1198/1480 train_time:188037ms step_avg:158.28ms step:1199/1480 train_time:188207ms step_avg:158.29ms step:1200/1480 train_time:188375ms step_avg:158.30ms step:1201/1480 train_time:188543ms step_avg:158.31ms step:1202/1480 train_time:188725ms step_avg:158.33ms step:1203/1480 train_time:188901ms step_avg:158.34ms step:1204/1480 train_time:189076ms step_avg:158.35ms step:1205/1480 train_time:189244ms step_avg:158.36ms step:1206/1480 train_time:189412ms step_avg:158.37ms step:1207/1480 train_time:189582ms step_avg:158.38ms step:1208/1480 train_time:189748ms step_avg:158.39ms step:1209/1480 train_time:189921ms step_avg:158.40ms step:1210/1480 train_time:190098ms step_avg:158.41ms step:1211/1480 train_time:190271ms step_avg:158.43ms step:1212/1480 train_time:190444ms step_avg:158.44ms step:1213/1480 train_time:190616ms step_avg:158.45ms step:1214/1480 train_time:190794ms step_avg:158.47ms step:1215/1480 
train_time:190967ms step_avg:158.48ms step:1216/1480 train_time:191137ms step_avg:158.49ms step:1217/1480 train_time:191311ms step_avg:158.50ms step:1218/1480 train_time:191483ms step_avg:158.51ms step:1219/1480 train_time:191662ms step_avg:158.53ms step:1220/1480 train_time:191831ms step_avg:158.54ms step:1221/1480 train_time:192001ms step_avg:158.55ms step:1222/1480 train_time:192169ms step_avg:158.56ms step:1223/1480 train_time:192339ms step_avg:158.56ms step:1224/1480 train_time:192519ms step_avg:158.58ms step:1225/1480 train_time:192690ms step_avg:158.59ms step:1226/1480 train_time:192863ms step_avg:158.60ms step:1227/1480 train_time:193037ms step_avg:158.62ms step:1228/1480 train_time:193208ms step_avg:158.63ms step:1229/1480 train_time:193381ms step_avg:158.64ms step:1230/1480 train_time:193562ms step_avg:158.66ms step:1231/1480 train_time:193737ms step_avg:158.67ms step:1232/1480 train_time:193913ms step_avg:158.68ms step:1233/1480 train_time:194081ms step_avg:158.69ms step:1234/1480 train_time:194252ms step_avg:158.70ms step:1235/1480 train_time:194423ms step_avg:158.71ms step:1236/1480 train_time:194593ms step_avg:158.72ms step:1237/1480 train_time:194764ms step_avg:158.73ms step:1238/1480 train_time:194951ms step_avg:158.76ms step:1239/1480 train_time:195121ms step_avg:158.76ms step:1240/1480 train_time:195291ms step_avg:158.77ms step:1241/1480 train_time:195463ms step_avg:158.78ms step:1242/1480 train_time:195633ms step_avg:158.79ms step:1243/1480 train_time:195805ms step_avg:158.80ms step:1244/1480 train_time:195973ms step_avg:158.81ms step:1245/1480 train_time:196141ms step_avg:158.82ms step:1246/1480 train_time:196312ms step_avg:158.83ms step:1247/1480 train_time:196480ms step_avg:158.84ms step:1248/1480 train_time:196647ms step_avg:158.84ms step:1249/1480 train_time:196816ms step_avg:158.85ms step:1250/1480 train_time:196985ms step_avg:158.86ms step:1250/1480 val_loss:3.3409 train_time:197058ms step_avg:158.92ms step:1251/1480 train_time:197167ms 
step_avg:158.88ms step:1252/1480 train_time:197337ms step_avg:158.89ms step:1253/1480 train_time:197505ms step_avg:158.89ms step:1254/1480 train_time:197676ms step_avg:158.90ms step:1255/1480 train_time:197863ms step_avg:158.93ms step:1256/1480 train_time:198039ms step_avg:158.94ms step:1257/1480 train_time:198208ms step_avg:158.95ms step:1258/1480 train_time:198386ms step_avg:158.96ms step:1259/1480 train_time:198559ms step_avg:158.97ms step:1260/1480 train_time:198726ms step_avg:158.98ms step:1261/1480 train_time:198897ms step_avg:158.99ms step:1262/1480 train_time:199071ms step_avg:159.00ms step:1263/1480 train_time:199244ms step_avg:159.01ms step:1264/1480 train_time:199410ms step_avg:159.02ms step:1265/1480 train_time:199578ms step_avg:159.03ms step:1266/1480 train_time:199750ms step_avg:159.04ms step:1267/1480 train_time:199921ms step_avg:159.05ms step:1268/1480 train_time:200091ms step_avg:159.05ms step:1269/1480 train_time:200265ms step_avg:159.07ms step:1270/1480 train_time:200435ms step_avg:159.08ms step:1271/1480 train_time:200606ms step_avg:159.08ms step:1272/1480 train_time:200772ms step_avg:159.09ms step:1273/1480 train_time:200943ms step_avg:159.10ms step:1274/1480 train_time:201114ms step_avg:159.11ms step:1275/1480 train_time:201284ms step_avg:159.12ms step:1276/1480 train_time:201448ms step_avg:159.12ms step:1277/1480 train_time:201621ms step_avg:159.13ms step:1278/1480 train_time:201790ms step_avg:159.14ms step:1279/1480 train_time:201963ms step_avg:159.15ms step:1280/1480 train_time:202141ms step_avg:159.17ms step:1281/1480 train_time:202309ms step_avg:159.17ms step:1282/1480 train_time:202476ms step_avg:159.18ms step:1283/1480 train_time:202646ms step_avg:159.19ms step:1284/1480 train_time:202815ms step_avg:159.20ms step:1285/1480 train_time:202986ms step_avg:159.20ms step:1286/1480 train_time:203155ms step_avg:159.21ms step:1287/1480 train_time:203329ms step_avg:159.22ms step:1288/1480 train_time:203503ms step_avg:159.24ms step:1289/1480 
train_time:203684ms step_avg:159.25ms step:1290/1480 train_time:203863ms step_avg:159.27ms step:1291/1480 train_time:204036ms step_avg:159.28ms step:1292/1480 train_time:204210ms step_avg:159.29ms step:1293/1480 train_time:204386ms step_avg:159.30ms step:1294/1480 train_time:204558ms step_avg:159.31ms step:1295/1480 train_time:204729ms step_avg:159.32ms step:1296/1480 train_time:204903ms step_avg:159.33ms step:1297/1480 train_time:205074ms step_avg:159.34ms step:1298/1480 train_time:205245ms step_avg:159.35ms step:1299/1480 train_time:205414ms step_avg:159.36ms step:1300/1480 train_time:205582ms step_avg:159.37ms step:1301/1480 train_time:205750ms step_avg:159.37ms step:1302/1480 train_time:205924ms step_avg:159.38ms step:1303/1480 train_time:206101ms step_avg:159.40ms step:1304/1480 train_time:206274ms step_avg:159.41ms step:1305/1480 train_time:206444ms step_avg:159.42ms step:1306/1480 train_time:206619ms step_avg:159.43ms step:1307/1480 train_time:206788ms step_avg:159.44ms step:1308/1480 train_time:206958ms step_avg:159.44ms step:1309/1480 train_time:207130ms step_avg:159.45ms step:1310/1480 train_time:207300ms step_avg:159.46ms step:1311/1480 train_time:207469ms step_avg:159.47ms step:1312/1480 train_time:207643ms step_avg:159.48ms step:1313/1480 train_time:207811ms step_avg:159.49ms step:1314/1480 train_time:207985ms step_avg:159.50ms step:1315/1480 train_time:208156ms step_avg:159.51ms step:1316/1480 train_time:208323ms step_avg:159.51ms step:1317/1480 train_time:208494ms step_avg:159.52ms step:1318/1480 train_time:208674ms step_avg:159.54ms step:1319/1480 train_time:208849ms step_avg:159.55ms step:1320/1480 train_time:209027ms step_avg:159.56ms step:1321/1480 train_time:209200ms step_avg:159.57ms step:1322/1480 train_time:209379ms step_avg:159.59ms step:1323/1480 train_time:209551ms step_avg:159.60ms step:1324/1480 train_time:209727ms step_avg:159.61ms step:1325/1480 train_time:209909ms step_avg:159.63ms step:1326/1480 train_time:210086ms step_avg:159.64ms 
step:1327/1480 train_time:210255ms step_avg:159.65ms step:1328/1480 train_time:210427ms step_avg:159.66ms step:1329/1480 train_time:210623ms step_avg:159.68ms step:1330/1480 train_time:210804ms step_avg:159.70ms step:1331/1480 train_time:210973ms step_avg:159.71ms step:1332/1480 train_time:211148ms step_avg:159.72ms step:1333/1480 train_time:211325ms step_avg:159.73ms step:1334/1480 train_time:211496ms step_avg:159.74ms step:1335/1480 train_time:211665ms step_avg:159.75ms step:1336/1480 train_time:211847ms step_avg:159.76ms step:1337/1480 train_time:212025ms step_avg:159.78ms step:1338/1480 train_time:212197ms step_avg:159.79ms step:1339/1480 train_time:212370ms step_avg:159.80ms step:1340/1480 train_time:212542ms step_avg:159.81ms step:1341/1480 train_time:212711ms step_avg:159.81ms step:1342/1480 train_time:212886ms step_avg:159.82ms step:1343/1480 train_time:213055ms step_avg:159.83ms step:1344/1480 train_time:213228ms step_avg:159.84ms step:1345/1480 train_time:213408ms step_avg:159.86ms step:1346/1480 train_time:213578ms step_avg:159.86ms step:1347/1480 train_time:213747ms step_avg:159.87ms step:1348/1480 train_time:213918ms step_avg:159.88ms step:1349/1480 train_time:214087ms step_avg:159.89ms step:1350/1480 train_time:214262ms step_avg:159.90ms step:1351/1480 train_time:214433ms step_avg:159.91ms step:1352/1480 train_time:214605ms step_avg:159.91ms step:1353/1480 train_time:214780ms step_avg:159.93ms step:1354/1480 train_time:214951ms step_avg:159.93ms step:1355/1480 train_time:215118ms step_avg:159.94ms step:1356/1480 train_time:215290ms step_avg:159.95ms step:1357/1480 train_time:215465ms step_avg:159.96ms step:1358/1480 train_time:215636ms step_avg:159.97ms step:1359/1480 train_time:215808ms step_avg:159.98ms step:1360/1480 train_time:215984ms step_avg:159.99ms step:1361/1480 train_time:216159ms step_avg:160.00ms step:1362/1480 train_time:216332ms step_avg:160.01ms step:1363/1480 train_time:216513ms step_avg:160.02ms step:1364/1480 train_time:216682ms 
step_avg:160.03ms step:1365/1480 train_time:216848ms step_avg:160.04ms step:1366/1480 train_time:217020ms step_avg:160.04ms step:1367/1480 train_time:217191ms step_avg:160.05ms step:1368/1480 train_time:217365ms step_avg:160.06ms step:1369/1480 train_time:217547ms step_avg:160.08ms step:1370/1480 train_time:217724ms step_avg:160.09ms step:1371/1480 train_time:217894ms step_avg:160.10ms step:1372/1480 train_time:218072ms step_avg:160.11ms step:1373/1480 train_time:218242ms step_avg:160.12ms step:1374/1480 train_time:218417ms step_avg:160.13ms step:1375/1480 train_time:218587ms step_avg:160.14ms step:1375/1480 val_loss:3.3022 train_time:218655ms step_avg:160.19ms step:1376/1480 train_time:218759ms step_avg:160.15ms step:1377/1480 train_time:218930ms step_avg:160.15ms step:1378/1480 train_time:219098ms step_avg:160.16ms step:1379/1480 train_time:219276ms step_avg:160.17ms step:1380/1480 train_time:219450ms step_avg:160.18ms step:1381/1480 train_time:219632ms step_avg:160.20ms step:1382/1480 train_time:219803ms step_avg:160.21ms step:1383/1480 train_time:219976ms step_avg:160.22ms step:1384/1480 train_time:220153ms step_avg:160.23ms step:1385/1480 train_time:220318ms step_avg:160.23ms step:1386/1480 train_time:220490ms step_avg:160.24ms step:1387/1480 train_time:220660ms step_avg:160.25ms step:1388/1480 train_time:220829ms step_avg:160.25ms step:1389/1480 train_time:221002ms step_avg:160.26ms step:1390/1480 train_time:221170ms step_avg:160.27ms step:1391/1480 train_time:221341ms step_avg:160.28ms step:1392/1480 train_time:221514ms step_avg:160.28ms step:1393/1480 train_time:221683ms step_avg:160.29ms step:1394/1480 train_time:221854ms step_avg:160.30ms step:1395/1480 train_time:222022ms step_avg:160.30ms step:1396/1480 train_time:222190ms step_avg:160.31ms step:1397/1480 train_time:222357ms step_avg:160.31ms step:1398/1480 train_time:222525ms step_avg:160.32ms step:1399/1480 train_time:222696ms step_avg:160.33ms step:1400/1480 train_time:222873ms step_avg:160.34ms 
step:1401/1480 train_time:223039ms step_avg:160.34ms step:1402/1480 train_time:223212ms step_avg:160.35ms step:1403/1480 train_time:223387ms step_avg:160.36ms step:1404/1480 train_time:223558ms step_avg:160.37ms step:1405/1480 train_time:223734ms step_avg:160.38ms step:1406/1480 train_time:223908ms step_avg:160.39ms step:1407/1480 train_time:224076ms step_avg:160.40ms step:1408/1480 train_time:224243ms step_avg:160.40ms step:1409/1480 train_time:224427ms step_avg:160.42ms step:1410/1480 train_time:224595ms step_avg:160.43ms step:1411/1480 train_time:224764ms step_avg:160.43ms step:1412/1480 train_time:224934ms step_avg:160.44ms step:1413/1480 train_time:225104ms step_avg:160.44ms step:1414/1480 train_time:225276ms step_avg:160.45ms step:1415/1480 train_time:225450ms step_avg:160.46ms step:1416/1480 train_time:225637ms step_avg:160.48ms step:1417/1480 train_time:225812ms step_avg:160.49ms step:1418/1480 train_time:225981ms step_avg:160.50ms step:1419/1480 train_time:226156ms step_avg:160.51ms step:1420/1480 train_time:226330ms step_avg:160.52ms step:1421/1480 train_time:226504ms step_avg:160.53ms step:1422/1480 train_time:226676ms step_avg:160.54ms step:1423/1480 train_time:226845ms step_avg:160.54ms step:1424/1480 train_time:227022ms step_avg:160.55ms step:1425/1480 train_time:227202ms step_avg:160.57ms step:1426/1480 train_time:227375ms step_avg:160.58ms step:1427/1480 train_time:227551ms step_avg:160.59ms step:1428/1480 train_time:227720ms step_avg:160.59ms step:1429/1480 train_time:227889ms step_avg:160.60ms step:1430/1480 train_time:228063ms step_avg:160.61ms step:1431/1480 train_time:228238ms step_avg:160.62ms step:1432/1480 train_time:228415ms step_avg:160.63ms step:1433/1480 train_time:228595ms step_avg:160.64ms step:1434/1480 train_time:228775ms step_avg:160.66ms step:1435/1480 train_time:228950ms step_avg:160.67ms step:1436/1480 train_time:229124ms step_avg:160.68ms step:1437/1480 train_time:229296ms step_avg:160.68ms step:1438/1480 train_time:229464ms 
step_avg:160.69ms step:1439/1480 train_time:229637ms step_avg:160.70ms step:1440/1480 train_time:229807ms step_avg:160.70ms step:1441/1480 train_time:229977ms step_avg:160.71ms step:1442/1480 train_time:230155ms step_avg:160.72ms step:1443/1480 train_time:230344ms step_avg:160.74ms step:1444/1480 train_time:230515ms step_avg:160.75ms step:1445/1480 train_time:230684ms step_avg:160.76ms step:1446/1480 train_time:230860ms step_avg:160.77ms step:1447/1480 train_time:231038ms step_avg:160.78ms step:1448/1480 train_time:231211ms step_avg:160.79ms step:1449/1480 train_time:231385ms step_avg:160.80ms step:1450/1480 train_time:231558ms step_avg:160.80ms step:1451/1480 train_time:231730ms step_avg:160.81ms step:1452/1480 train_time:231904ms step_avg:160.82ms step:1453/1480 train_time:232074ms step_avg:160.83ms step:1454/1480 train_time:232246ms step_avg:160.84ms step:1455/1480 train_time:232426ms step_avg:160.85ms step:1456/1480 train_time:232600ms step_avg:160.86ms step:1457/1480 train_time:232771ms step_avg:160.86ms step:1458/1480 train_time:232941ms step_avg:160.87ms step:1459/1480 train_time:233118ms step_avg:160.88ms step:1460/1480 train_time:233291ms step_avg:160.89ms step:1461/1480 train_time:233466ms step_avg:160.90ms step:1462/1480 train_time:233637ms step_avg:160.91ms step:1463/1480 train_time:233815ms step_avg:160.92ms step:1464/1480 train_time:233988ms step_avg:160.93ms step:1465/1480 train_time:234160ms step_avg:160.93ms step:1466/1480 train_time:234332ms step_avg:160.94ms step:1467/1480 train_time:234506ms step_avg:160.95ms step:1468/1480 train_time:234676ms step_avg:160.96ms step:1469/1480 train_time:234848ms step_avg:160.96ms step:1470/1480 train_time:235026ms step_avg:160.98ms step:1471/1480 train_time:235214ms step_avg:161.00ms step:1472/1480 train_time:235395ms step_avg:161.01ms step:1473/1480 train_time:235566ms step_avg:161.02ms step:1474/1480 train_time:235744ms step_avg:161.03ms step:1475/1480 train_time:235922ms step_avg:161.04ms step:1476/1480 
train_time:236095ms step_avg:161.05ms step:1477/1480 train_time:236277ms step_avg:161.06ms step:1478/1480 train_time:236459ms step_avg:161.08ms step:1479/1480 train_time:236632ms step_avg:161.08ms step:1480/1480 train_time:236806ms step_avg:161.09ms step:1480/1480 val_loss:3.2835 train_time:236877ms step_avg:161.14ms