import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight is passed)."""
    feature_shape = (x.size(-1),)
    return F.rms_norm(x, feature_shape)

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        weight = self.weight.to(x.dtype)
        return F.linear(x, weight)

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # the sequence length is taken from dim 1; the rotated feature axis is dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # rebuild the tables only when the sequence length changes
            positions = torch.arange(seq_len, device=x.device)
            angles = torch.outer(positions, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = angles.cos()
            self.sin_cached = angles.sin()
        cos = self.cos_cached[None, :, None, :]
        sin = self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        first, second = x.chunk(2, dim=3)
        rot_first = first * cos + second * sin
        rot_second = first * (-sin) + second * cos
        return torch.cat((rot_first, rot_second), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK-norm, rotary embeddings and a learned value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        head_shape = (B, T, self.n_head, -1)
        q = self.c_q(x).view(*head_shape)
        k = self.c_k(x).view(*head_shape)
        v = self.c_v(x).view(*head_shape)
        # mix this layer's values with the token value embedding
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        return self.c_proj(y)

class MLP(nn.Module):
    """Feed-forward block: expand to 4*dim, squared ReLU, project back to dim."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.c_fc(x)
        # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        hidden = F.relu(hidden).square()
        return self.c_proj(hidden)

class Block(nn.Module):
    """Transformer block: learned mix of x with x0, then attention and MLP residual branches."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """GPT with a U-net layer layout, token value embeddings and a block-sparse attention mask."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Return the cross-entropy loss of the model on one 1-D token sequence.

        idx is a 1-D tensor of token ids, target holds the labels compared
        against the logits, and sliding_window bounds how far back (in tokens)
        a query may attend.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running document id: increments at every token equal to 50256
        # (presumably the GPT-2 end-of-text token - confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        docs_per_block = docs.reshape(-1, BLOCK_SIZE)
        docs_low = docs_per_block[:, 0].contiguous()
        docs_high = docs_per_block[:, -1].contiguous()

        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document, inside the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)

        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the rule above: keep a (q block, kv block)
            # pair when it is causal, the blocks' document-id ranges overlap,
            # and the kv block lies within the window rounded up to whole blocks
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = (causal_mask & document_mask & window_mask).to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort lists the kept kv blocks first, in index order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)

        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        return F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate a shard's header (256 int32 values) and return its claimed token count."""
    # only reads the header, returns header data
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read ntok uint16 tokens that follow the 256-int32 header into a pinned tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Cycles through token shards, giving each process its own T-token slice of every batch."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # -1 so that advance() lands on shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        shard = self.current_shard
        self.tokens = _load_data_shard(self.files[shard], self.ntoks[shard])

    def next_batch(self):
        """Return (inputs, targets) for this process and move to the next global batch."""
        batch_size = self.T * self.num_processes
        start = self.current_position
        buf = self.tokens[start:start+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    # The log file lives directly under logs/; create that directory so a fresh
    # checkout does not fail on rank 0 with FileNotFoundError.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """
    Append s to the log file on the master process; other ranks do nothing.

    Arguments:
        s: the text to log (a newline is appended in the file).
        logonly: when True, write to the log file without echoing to stdout.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not implemented below, so exactly one sequence per device is required
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches each following one
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 12:40:34 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 129W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 105W / 700W | 35MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23226ms step_avg:nanms step:2/1480 train_time:23318ms step_avg:nanms step:3/1480 train_time:23456ms step_avg:nanms step:4/1480 train_time:23598ms step_avg:nanms step:5/1480 train_time:23738ms step_avg:nanms step:6/1480 train_time:23880ms step_avg:nanms step:7/1480 train_time:24022ms step_avg:nanms step:8/1480 train_time:24165ms step_avg:nanms step:9/1480 train_time:24310ms step_avg:nanms step:10/1480 train_time:24453ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:424ms step_avg:141.19ms step:14/1480 train_time:566ms step_avg:141.48ms step:15/1480 train_time:709ms step_avg:141.86ms step:16/1480 train_time:854ms step_avg:142.28ms step:17/1480 train_time:997ms step_avg:142.47ms step:18/1480 train_time:1139ms step_avg:142.41ms step:19/1480 train_time:1281ms step_avg:142.29ms step:20/1480 train_time:1421ms step_avg:142.14ms step:21/1480 train_time:1563ms step_avg:142.09ms step:22/1480 train_time:1706ms step_avg:142.19ms step:23/1480 train_time:1850ms step_avg:142.34ms step:24/1480 train_time:1995ms step_avg:142.48ms step:25/1480 train_time:2137ms step_avg:142.46ms step:26/1480 train_time:2278ms step_avg:142.39ms step:27/1480 train_time:2420ms step_avg:142.33ms step:28/1480 train_time:2561ms step_avg:142.28ms step:29/1480 
train_time:2703ms step_avg:142.24ms step:30/1480 train_time:2848ms step_avg:142.42ms step:31/1480 train_time:2993ms step_avg:142.53ms step:32/1480 train_time:3136ms step_avg:142.55ms step:33/1480 train_time:3278ms step_avg:142.50ms step:34/1480 train_time:3419ms step_avg:142.44ms step:35/1480 train_time:3562ms step_avg:142.49ms step:36/1480 train_time:3707ms step_avg:142.59ms step:37/1480 train_time:3851ms step_avg:142.63ms step:38/1480 train_time:3995ms step_avg:142.67ms step:39/1480 train_time:4137ms step_avg:142.67ms step:40/1480 train_time:4280ms step_avg:142.66ms step:41/1480 train_time:4422ms step_avg:142.63ms step:42/1480 train_time:4562ms step_avg:142.56ms step:43/1480 train_time:4705ms step_avg:142.57ms step:44/1480 train_time:4849ms step_avg:142.63ms step:45/1480 train_time:4993ms step_avg:142.66ms step:46/1480 train_time:5136ms step_avg:142.67ms step:47/1480 train_time:5278ms step_avg:142.64ms step:48/1480 train_time:5418ms step_avg:142.59ms step:49/1480 train_time:5562ms step_avg:142.62ms step:50/1480 train_time:5706ms step_avg:142.65ms step:51/1480 train_time:5850ms step_avg:142.67ms step:52/1480 train_time:5992ms step_avg:142.67ms step:53/1480 train_time:6135ms step_avg:142.66ms step:54/1480 train_time:6277ms step_avg:142.65ms step:55/1480 train_time:6417ms step_avg:142.61ms step:56/1480 train_time:6559ms step_avg:142.60ms step:57/1480 train_time:6704ms step_avg:142.64ms step:58/1480 train_time:6849ms step_avg:142.68ms step:59/1480 train_time:6993ms step_avg:142.71ms step:60/1480 train_time:7135ms step_avg:142.71ms step:61/1480 train_time:7278ms step_avg:142.71ms step:62/1480 train_time:7419ms step_avg:142.68ms step:63/1480 train_time:7563ms step_avg:142.70ms step:64/1480 train_time:7707ms step_avg:142.73ms step:65/1480 train_time:7851ms step_avg:142.75ms step:66/1480 train_time:7995ms step_avg:142.77ms step:67/1480 train_time:8137ms step_avg:142.75ms step:68/1480 train_time:8278ms step_avg:142.72ms step:69/1480 train_time:8419ms step_avg:142.69ms 
step:70/1480 train_time:8560ms step_avg:142.66ms step:71/1480 train_time:8705ms step_avg:142.70ms step:72/1480 train_time:8849ms step_avg:142.72ms step:73/1480 train_time:8993ms step_avg:142.75ms step:74/1480 train_time:9135ms step_avg:142.74ms step:75/1480 train_time:9279ms step_avg:142.75ms step:76/1480 train_time:9421ms step_avg:142.75ms step:77/1480 train_time:9563ms step_avg:142.74ms step:78/1480 train_time:9706ms step_avg:142.73ms step:79/1480 train_time:9849ms step_avg:142.74ms step:80/1480 train_time:9994ms step_avg:142.76ms step:81/1480 train_time:10136ms step_avg:142.75ms step:82/1480 train_time:10277ms step_avg:142.74ms step:83/1480 train_time:10419ms step_avg:142.72ms step:84/1480 train_time:10561ms step_avg:142.72ms step:85/1480 train_time:10704ms step_avg:142.73ms step:86/1480 train_time:10848ms step_avg:142.74ms step:87/1480 train_time:10992ms step_avg:142.76ms step:88/1480 train_time:11134ms step_avg:142.75ms step:89/1480 train_time:11278ms step_avg:142.76ms step:90/1480 train_time:11419ms step_avg:142.73ms step:91/1480 train_time:11561ms step_avg:142.73ms step:92/1480 train_time:11705ms step_avg:142.74ms step:93/1480 train_time:11848ms step_avg:142.74ms step:94/1480 train_time:11991ms step_avg:142.75ms step:95/1480 train_time:12134ms step_avg:142.76ms step:96/1480 train_time:12275ms step_avg:142.74ms step:97/1480 train_time:12418ms step_avg:142.73ms step:98/1480 train_time:12558ms step_avg:142.71ms step:99/1480 train_time:12701ms step_avg:142.70ms step:100/1480 train_time:12844ms step_avg:142.71ms step:101/1480 train_time:12986ms step_avg:142.71ms step:102/1480 train_time:13128ms step_avg:142.70ms step:103/1480 train_time:13270ms step_avg:142.69ms step:104/1480 train_time:13413ms step_avg:142.69ms step:105/1480 train_time:13554ms step_avg:142.68ms step:106/1480 train_time:13697ms step_avg:142.67ms step:107/1480 train_time:13838ms step_avg:142.66ms step:108/1480 train_time:13981ms step_avg:142.66ms step:109/1480 train_time:14124ms step_avg:142.67ms 
step:110/1480 train_time:14268ms step_avg:142.68ms step:111/1480 train_time:14414ms step_avg:142.71ms step:112/1480 train_time:14560ms step_avg:142.74ms step:113/1480 train_time:14707ms step_avg:142.79ms step:114/1480 train_time:14854ms step_avg:142.83ms step:115/1480 train_time:15000ms step_avg:142.85ms step:116/1480 train_time:15148ms step_avg:142.91ms step:117/1480 train_time:15295ms step_avg:142.94ms step:118/1480 train_time:15441ms step_avg:142.97ms step:119/1480 train_time:15588ms step_avg:143.01ms step:120/1480 train_time:15735ms step_avg:143.05ms step:121/1480 train_time:15882ms step_avg:143.08ms step:122/1480 train_time:16029ms step_avg:143.12ms step:123/1480 train_time:16177ms step_avg:143.16ms step:124/1480 train_time:16324ms step_avg:143.19ms step:125/1480 train_time:16472ms step_avg:143.23ms step:125/1480 val_loss:4.4086 train_time:16529ms step_avg:143.73ms step:126/1480 train_time:16626ms step_avg:143.33ms step:127/1480 train_time:16774ms step_avg:143.37ms step:128/1480 train_time:16921ms step_avg:143.40ms step:129/1480 train_time:17067ms step_avg:143.42ms step:130/1480 train_time:17213ms step_avg:143.44ms step:131/1480 train_time:17359ms step_avg:143.47ms step:132/1480 train_time:17506ms step_avg:143.49ms step:133/1480 train_time:17654ms step_avg:143.53ms step:134/1480 train_time:17804ms step_avg:143.58ms step:135/1480 train_time:17950ms step_avg:143.60ms step:136/1480 train_time:18100ms step_avg:143.65ms step:137/1480 train_time:18246ms step_avg:143.67ms step:138/1480 train_time:18392ms step_avg:143.68ms step:139/1480 train_time:18540ms step_avg:143.72ms step:140/1480 train_time:18687ms step_avg:143.75ms step:141/1480 train_time:18836ms step_avg:143.78ms step:142/1480 train_time:18982ms step_avg:143.80ms step:143/1480 train_time:19130ms step_avg:143.83ms step:144/1480 train_time:19277ms step_avg:143.86ms step:145/1480 train_time:19424ms step_avg:143.88ms step:146/1480 train_time:19569ms step_avg:143.89ms step:147/1480 train_time:19717ms 
step_avg:143.92ms step:148/1480 train_time:19864ms step_avg:143.94ms step:149/1480 train_time:20011ms step_avg:143.96ms step:150/1480 train_time:20159ms step_avg:143.99ms step:151/1480 train_time:20306ms step_avg:144.01ms step:152/1480 train_time:20451ms step_avg:144.02ms step:153/1480 train_time:20599ms step_avg:144.05ms step:154/1480 train_time:20746ms step_avg:144.07ms step:155/1480 train_time:20892ms step_avg:144.08ms step:156/1480 train_time:21040ms step_avg:144.11ms step:157/1480 train_time:21187ms step_avg:144.13ms step:158/1480 train_time:21334ms step_avg:144.15ms step:159/1480 train_time:21482ms step_avg:144.17ms step:160/1480 train_time:21628ms step_avg:144.19ms step:161/1480 train_time:21777ms step_avg:144.22ms step:162/1480 train_time:21925ms step_avg:144.24ms step:163/1480 train_time:22072ms step_avg:144.26ms step:164/1480 train_time:22219ms step_avg:144.28ms step:165/1480 train_time:22367ms step_avg:144.30ms step:166/1480 train_time:22514ms step_avg:144.32ms step:167/1480 train_time:22661ms step_avg:144.34ms step:168/1480 train_time:22808ms step_avg:144.35ms step:169/1480 train_time:22953ms step_avg:144.36ms step:170/1480 train_time:23102ms step_avg:144.39ms step:171/1480 train_time:23248ms step_avg:144.40ms step:172/1480 train_time:23395ms step_avg:144.41ms step:173/1480 train_time:23542ms step_avg:144.43ms step:174/1480 train_time:23689ms step_avg:144.44ms step:175/1480 train_time:23837ms step_avg:144.46ms step:176/1480 train_time:23983ms step_avg:144.48ms step:177/1480 train_time:24131ms step_avg:144.50ms step:178/1480 train_time:24279ms step_avg:144.52ms step:179/1480 train_time:24426ms step_avg:144.53ms step:180/1480 train_time:24573ms step_avg:144.55ms step:181/1480 train_time:24721ms step_avg:144.57ms step:182/1480 train_time:24867ms step_avg:144.57ms step:183/1480 train_time:25013ms step_avg:144.58ms step:184/1480 train_time:25160ms step_avg:144.60ms step:185/1480 train_time:25307ms step_avg:144.61ms step:186/1480 train_time:25453ms 
step_avg:144.62ms step:187/1480 train_time:25600ms step_avg:144.63ms step:188/1480 train_time:25747ms step_avg:144.65ms step:189/1480 train_time:25895ms step_avg:144.67ms step:190/1480 train_time:26041ms step_avg:144.67ms step:191/1480 train_time:26187ms step_avg:144.68ms step:192/1480 train_time:26335ms step_avg:144.70ms step:193/1480 train_time:26482ms step_avg:144.71ms step:194/1480 train_time:26627ms step_avg:144.71ms step:195/1480 train_time:26775ms step_avg:144.73ms step:196/1480 train_time:26923ms step_avg:144.74ms step:197/1480 train_time:27069ms step_avg:144.75ms step:198/1480 train_time:27216ms step_avg:144.77ms step:199/1480 train_time:27364ms step_avg:144.78ms step:200/1480 train_time:27511ms step_avg:144.79ms step:201/1480 train_time:27658ms step_avg:144.81ms step:202/1480 train_time:27806ms step_avg:144.82ms step:203/1480 train_time:27955ms step_avg:144.84ms step:204/1480 train_time:28101ms step_avg:144.85ms step:205/1480 train_time:28247ms step_avg:144.86ms step:206/1480 train_time:28396ms step_avg:144.88ms step:207/1480 train_time:28543ms step_avg:144.89ms step:208/1480 train_time:28690ms step_avg:144.90ms step:209/1480 train_time:28838ms step_avg:144.91ms step:210/1480 train_time:28984ms step_avg:144.92ms step:211/1480 train_time:29132ms step_avg:144.94ms step:212/1480 train_time:29279ms step_avg:144.95ms step:213/1480 train_time:29426ms step_avg:144.95ms step:214/1480 train_time:29572ms step_avg:144.96ms step:215/1480 train_time:29719ms step_avg:144.97ms step:216/1480 train_time:29866ms step_avg:144.98ms step:217/1480 train_time:30013ms step_avg:144.99ms step:218/1480 train_time:30161ms step_avg:145.00ms step:219/1480 train_time:30307ms step_avg:145.01ms step:220/1480 train_time:30452ms step_avg:145.01ms step:221/1480 train_time:30602ms step_avg:145.03ms step:222/1480 train_time:30752ms step_avg:145.06ms step:223/1480 train_time:30902ms step_avg:145.08ms step:224/1480 train_time:31051ms step_avg:145.10ms step:225/1480 train_time:31204ms 
step_avg:145.13ms step:226/1480 train_time:31355ms step_avg:145.16ms step:227/1480 train_time:31505ms step_avg:145.18ms step:228/1480 train_time:31654ms step_avg:145.20ms step:229/1480 train_time:31806ms step_avg:145.23ms step:230/1480 train_time:31955ms step_avg:145.25ms step:231/1480 train_time:32106ms step_avg:145.27ms step:232/1480 train_time:32255ms step_avg:145.30ms step:233/1480 train_time:32405ms step_avg:145.32ms step:234/1480 train_time:32554ms step_avg:145.33ms step:235/1480 train_time:32705ms step_avg:145.36ms step:236/1480 train_time:32855ms step_avg:145.38ms step:237/1480 train_time:33005ms step_avg:145.40ms step:238/1480 train_time:33156ms step_avg:145.42ms step:239/1480 train_time:33306ms step_avg:145.44ms step:240/1480 train_time:33456ms step_avg:145.46ms step:241/1480 train_time:33606ms step_avg:145.48ms step:242/1480 train_time:33756ms step_avg:145.50ms step:243/1480 train_time:33906ms step_avg:145.52ms step:244/1480 train_time:34056ms step_avg:145.54ms step:245/1480 train_time:34207ms step_avg:145.56ms step:246/1480 train_time:34358ms step_avg:145.58ms step:247/1480 train_time:34508ms step_avg:145.60ms step:248/1480 train_time:34659ms step_avg:145.63ms step:249/1480 train_time:34810ms step_avg:145.65ms step:250/1480 train_time:34961ms step_avg:145.67ms step:250/1480 val_loss:3.9954 train_time:35019ms step_avg:145.91ms step:251/1480 train_time:35117ms step_avg:145.71ms step:252/1480 train_time:35268ms step_avg:145.74ms step:253/1480 train_time:35418ms step_avg:145.75ms step:254/1480 train_time:35567ms step_avg:145.77ms step:255/1480 train_time:35716ms step_avg:145.78ms step:256/1480 train_time:35865ms step_avg:145.79ms step:257/1480 train_time:36015ms step_avg:145.81ms step:258/1480 train_time:36168ms step_avg:145.84ms step:259/1480 train_time:36320ms step_avg:145.86ms step:260/1480 train_time:36470ms step_avg:145.88ms step:261/1480 train_time:36620ms step_avg:145.90ms step:262/1480 train_time:36769ms step_avg:145.91ms step:263/1480 
train_time:36919ms step_avg:145.92ms step:264/1480 train_time:37069ms step_avg:145.94ms step:265/1480 train_time:37221ms step_avg:145.96ms step:266/1480 train_time:37372ms step_avg:145.98ms step:267/1480 train_time:37522ms step_avg:146.00ms step:268/1480 train_time:37673ms step_avg:146.02ms step:269/1480 train_time:37823ms step_avg:146.04ms step:270/1480 train_time:37974ms step_avg:146.05ms step:271/1480 train_time:38124ms step_avg:146.07ms step:272/1480 train_time:38274ms step_avg:146.09ms step:273/1480 train_time:38425ms step_avg:146.10ms step:274/1480 train_time:38575ms step_avg:146.12ms step:275/1480 train_time:38727ms step_avg:146.14ms step:276/1480 train_time:38878ms step_avg:146.16ms step:277/1480 train_time:39027ms step_avg:146.17ms step:278/1480 train_time:39177ms step_avg:146.18ms step:279/1480 train_time:39328ms step_avg:146.20ms step:280/1480 train_time:39479ms step_avg:146.22ms step:281/1480 train_time:39629ms step_avg:146.23ms step:282/1480 train_time:39780ms step_avg:146.25ms step:283/1480 train_time:39931ms step_avg:146.27ms step:284/1480 train_time:40080ms step_avg:146.28ms step:285/1480 train_time:40231ms step_avg:146.30ms step:286/1480 train_time:40382ms step_avg:146.31ms step:287/1480 train_time:40533ms step_avg:146.33ms step:288/1480 train_time:40682ms step_avg:146.34ms step:289/1480 train_time:40834ms step_avg:146.36ms step:290/1480 train_time:40984ms step_avg:146.37ms step:291/1480 train_time:41134ms step_avg:146.39ms step:292/1480 train_time:41284ms step_avg:146.40ms step:293/1480 train_time:41435ms step_avg:146.41ms step:294/1480 train_time:41585ms step_avg:146.43ms step:295/1480 train_time:41736ms step_avg:146.44ms step:296/1480 train_time:41886ms step_avg:146.45ms step:297/1480 train_time:42037ms step_avg:146.47ms step:298/1480 train_time:42187ms step_avg:146.48ms step:299/1480 train_time:42338ms step_avg:146.50ms step:300/1480 train_time:42488ms step_avg:146.51ms step:301/1480 train_time:42638ms step_avg:146.52ms step:302/1480 
train_time:42788ms step_avg:146.53ms step:303/1480 train_time:42939ms step_avg:146.55ms step:304/1480 train_time:43088ms step_avg:146.56ms step:305/1480 train_time:43240ms step_avg:146.58ms step:306/1480 train_time:43391ms step_avg:146.59ms step:307/1480 train_time:43541ms step_avg:146.60ms step:308/1480 train_time:43690ms step_avg:146.61ms step:309/1480 train_time:43841ms step_avg:146.63ms step:310/1480 train_time:43992ms step_avg:146.64ms step:311/1480 train_time:44142ms step_avg:146.65ms step:312/1480 train_time:44292ms step_avg:146.66ms step:313/1480 train_time:44442ms step_avg:146.67ms step:314/1480 train_time:44592ms step_avg:146.68ms step:315/1480 train_time:44742ms step_avg:146.70ms step:316/1480 train_time:44893ms step_avg:146.71ms step:317/1480 train_time:45044ms step_avg:146.72ms step:318/1480 train_time:45194ms step_avg:146.73ms step:319/1480 train_time:45345ms step_avg:146.75ms step:320/1480 train_time:45495ms step_avg:146.76ms step:321/1480 train_time:45645ms step_avg:146.77ms step:322/1480 train_time:45796ms step_avg:146.78ms step:323/1480 train_time:45946ms step_avg:146.79ms step:324/1480 train_time:46098ms step_avg:146.81ms step:325/1480 train_time:46248ms step_avg:146.82ms step:326/1480 train_time:46398ms step_avg:146.83ms step:327/1480 train_time:46550ms step_avg:146.84ms step:328/1480 train_time:46700ms step_avg:146.85ms step:329/1480 train_time:46850ms step_avg:146.87ms step:330/1480 train_time:47002ms step_avg:146.88ms step:331/1480 train_time:47156ms step_avg:146.90ms step:332/1480 train_time:47311ms step_avg:146.93ms step:333/1480 train_time:47463ms step_avg:146.94ms step:334/1480 train_time:47617ms step_avg:146.96ms step:335/1480 train_time:47771ms step_avg:146.99ms step:336/1480 train_time:47925ms step_avg:147.01ms step:337/1480 train_time:48078ms step_avg:147.03ms step:338/1480 train_time:48233ms step_avg:147.05ms step:339/1480 train_time:48386ms step_avg:147.07ms step:340/1480 train_time:48540ms step_avg:147.09ms step:341/1480 
train_time:48693ms step_avg:147.11ms step:342/1480 train_time:48847ms step_avg:147.13ms step:343/1480 train_time:49000ms step_avg:147.15ms step:344/1480 train_time:49155ms step_avg:147.17ms step:345/1480 train_time:49310ms step_avg:147.19ms step:346/1480 train_time:49462ms step_avg:147.21ms step:347/1480 train_time:49618ms step_avg:147.23ms step:348/1480 train_time:49773ms step_avg:147.26ms step:349/1480 train_time:49927ms step_avg:147.28ms step:350/1480 train_time:50081ms step_avg:147.30ms step:351/1480 train_time:50235ms step_avg:147.32ms step:352/1480 train_time:50390ms step_avg:147.34ms step:353/1480 train_time:50543ms step_avg:147.36ms step:354/1480 train_time:50696ms step_avg:147.37ms step:355/1480 train_time:50852ms step_avg:147.40ms step:356/1480 train_time:51006ms step_avg:147.42ms step:357/1480 train_time:51160ms step_avg:147.43ms step:358/1480 train_time:51313ms step_avg:147.45ms step:359/1480 train_time:51467ms step_avg:147.47ms step:360/1480 train_time:51621ms step_avg:147.49ms step:361/1480 train_time:51776ms step_avg:147.51ms step:362/1480 train_time:51931ms step_avg:147.53ms step:363/1480 train_time:52084ms step_avg:147.55ms step:364/1480 train_time:52238ms step_avg:147.56ms step:365/1480 train_time:52391ms step_avg:147.58ms step:366/1480 train_time:52543ms step_avg:147.59ms step:367/1480 train_time:52696ms step_avg:147.61ms step:368/1480 train_time:52853ms step_avg:147.63ms step:369/1480 train_time:53006ms step_avg:147.65ms step:370/1480 train_time:53159ms step_avg:147.66ms step:371/1480 train_time:53312ms step_avg:147.68ms step:372/1480 train_time:53465ms step_avg:147.69ms step:373/1480 train_time:53618ms step_avg:147.71ms step:374/1480 train_time:53772ms step_avg:147.73ms step:375/1480 train_time:53926ms step_avg:147.74ms step:375/1480 val_loss:3.8085 train_time:53986ms step_avg:147.91ms step:376/1480 train_time:54082ms step_avg:147.77ms step:377/1480 train_time:54237ms step_avg:147.78ms step:378/1480 train_time:54391ms step_avg:147.80ms 
step:379/1480 train_time:54543ms step_avg:147.81ms step:380/1480 train_time:54695ms step_avg:147.82ms step:381/1480 train_time:54847ms step_avg:147.84ms step:382/1480 train_time:55001ms step_avg:147.85ms step:383/1480 train_time:55156ms step_avg:147.87ms step:384/1480 train_time:55311ms step_avg:147.89ms step:385/1480 train_time:55463ms step_avg:147.90ms step:386/1480 train_time:55617ms step_avg:147.92ms step:387/1480 train_time:55770ms step_avg:147.93ms step:388/1480 train_time:55924ms step_avg:147.95ms step:389/1480 train_time:56077ms step_avg:147.96ms step:390/1480 train_time:56232ms step_avg:147.98ms step:391/1480 train_time:56386ms step_avg:148.00ms step:392/1480 train_time:56539ms step_avg:148.01ms step:393/1480 train_time:56692ms step_avg:148.02ms step:394/1480 train_time:56846ms step_avg:148.04ms step:395/1480 train_time:56999ms step_avg:148.05ms step:396/1480 train_time:57153ms step_avg:148.06ms step:397/1480 train_time:57307ms step_avg:148.08ms step:398/1480 train_time:57460ms step_avg:148.09ms step:399/1480 train_time:57613ms step_avg:148.11ms step:400/1480 train_time:57769ms step_avg:148.12ms step:401/1480 train_time:57921ms step_avg:148.13ms step:402/1480 train_time:58074ms step_avg:148.15ms step:403/1480 train_time:58229ms step_avg:148.16ms step:404/1480 train_time:58382ms step_avg:148.18ms step:405/1480 train_time:58537ms step_avg:148.19ms step:406/1480 train_time:58691ms step_avg:148.21ms step:407/1480 train_time:58845ms step_avg:148.22ms step:408/1480 train_time:58997ms step_avg:148.23ms step:409/1480 train_time:59153ms step_avg:148.25ms step:410/1480 train_time:59305ms step_avg:148.26ms step:411/1480 train_time:59459ms step_avg:148.28ms step:412/1480 train_time:59612ms step_avg:148.29ms step:413/1480 train_time:59766ms step_avg:148.30ms step:414/1480 train_time:59920ms step_avg:148.32ms step:415/1480 train_time:60074ms step_avg:148.33ms step:416/1480 train_time:60230ms step_avg:148.35ms step:417/1480 train_time:60384ms step_avg:148.36ms 
step:418/1480 train_time:60537ms step_avg:148.37ms step:419/1480 train_time:60689ms step_avg:148.38ms step:420/1480 train_time:60844ms step_avg:148.40ms step:421/1480 train_time:60997ms step_avg:148.41ms step:422/1480 train_time:61152ms step_avg:148.43ms step:423/1480 train_time:61305ms step_avg:148.44ms step:424/1480 train_time:61458ms step_avg:148.45ms step:425/1480 train_time:61612ms step_avg:148.46ms step:426/1480 train_time:61765ms step_avg:148.47ms step:427/1480 train_time:61919ms step_avg:148.49ms step:428/1480 train_time:62071ms step_avg:148.50ms step:429/1480 train_time:62224ms step_avg:148.51ms step:430/1480 train_time:62377ms step_avg:148.52ms step:431/1480 train_time:62532ms step_avg:148.53ms step:432/1480 train_time:62684ms step_avg:148.54ms step:433/1480 train_time:62837ms step_avg:148.55ms step:434/1480 train_time:62991ms step_avg:148.56ms step:435/1480 train_time:63145ms step_avg:148.58ms step:436/1480 train_time:63298ms step_avg:148.59ms step:437/1480 train_time:63453ms step_avg:148.60ms step:438/1480 train_time:63607ms step_avg:148.61ms step:439/1480 train_time:63760ms step_avg:148.63ms step:440/1480 train_time:63915ms step_avg:148.64ms step:441/1480 train_time:64072ms step_avg:148.66ms step:442/1480 train_time:64232ms step_avg:148.68ms step:443/1480 train_time:64388ms step_avg:148.70ms step:444/1480 train_time:64544ms step_avg:148.72ms step:445/1480 train_time:64700ms step_avg:148.73ms step:446/1480 train_time:64856ms step_avg:148.75ms step:447/1480 train_time:65013ms step_avg:148.77ms step:448/1480 train_time:65169ms step_avg:148.79ms step:449/1480 train_time:65328ms step_avg:148.81ms step:450/1480 train_time:65487ms step_avg:148.83ms step:451/1480 train_time:65643ms step_avg:148.85ms step:452/1480 train_time:65798ms step_avg:148.86ms step:453/1480 train_time:65956ms step_avg:148.88ms step:454/1480 train_time:66113ms step_avg:148.90ms step:455/1480 train_time:66269ms step_avg:148.92ms step:456/1480 train_time:66425ms step_avg:148.93ms 
step:457/1480 train_time:66581ms step_avg:148.95ms step:458/1480 train_time:66737ms step_avg:148.97ms step:459/1480 train_time:66894ms step_avg:148.98ms step:460/1480 train_time:67053ms step_avg:149.01ms step:461/1480 train_time:67213ms step_avg:149.03ms step:462/1480 train_time:67370ms step_avg:149.05ms step:463/1480 train_time:67527ms step_avg:149.07ms step:464/1480 train_time:67682ms step_avg:149.08ms step:465/1480 train_time:67838ms step_avg:149.10ms step:466/1480 train_time:67994ms step_avg:149.11ms step:467/1480 train_time:68153ms step_avg:149.13ms step:468/1480 train_time:68311ms step_avg:149.15ms step:469/1480 train_time:68467ms step_avg:149.17ms step:470/1480 train_time:68625ms step_avg:149.19ms step:471/1480 train_time:68781ms step_avg:149.20ms step:472/1480 train_time:68938ms step_avg:149.22ms step:473/1480 train_time:69094ms step_avg:149.23ms step:474/1480 train_time:69253ms step_avg:149.25ms step:475/1480 train_time:69411ms step_avg:149.27ms step:476/1480 train_time:69568ms step_avg:149.29ms step:477/1480 train_time:69725ms step_avg:149.30ms step:478/1480 train_time:69882ms step_avg:149.32ms step:479/1480 train_time:70037ms step_avg:149.33ms step:480/1480 train_time:70194ms step_avg:149.35ms step:481/1480 train_time:70353ms step_avg:149.37ms step:482/1480 train_time:70509ms step_avg:149.38ms step:483/1480 train_time:70663ms step_avg:149.39ms step:484/1480 train_time:70821ms step_avg:149.41ms step:485/1480 train_time:70978ms step_avg:149.43ms step:486/1480 train_time:71134ms step_avg:149.44ms step:487/1480 train_time:71292ms step_avg:149.46ms step:488/1480 train_time:71449ms step_avg:149.47ms step:489/1480 train_time:71604ms step_avg:149.49ms step:490/1480 train_time:71760ms step_avg:149.50ms step:491/1480 train_time:71917ms step_avg:149.52ms step:492/1480 train_time:72073ms step_avg:149.53ms step:493/1480 train_time:72231ms step_avg:149.55ms step:494/1480 train_time:72388ms step_avg:149.56ms step:495/1480 train_time:72544ms step_avg:149.58ms 
step:496/1480 train_time:72700ms step_avg:149.59ms step:497/1480 train_time:72856ms step_avg:149.60ms step:498/1480 train_time:73015ms step_avg:149.62ms step:499/1480 train_time:73173ms step_avg:149.64ms step:500/1480 train_time:73331ms step_avg:149.65ms step:500/1480 val_loss:3.6886 train_time:73392ms step_avg:149.78ms step:501/1480 train_time:73492ms step_avg:149.68ms step:502/1480 train_time:73650ms step_avg:149.70ms step:503/1480 train_time:73806ms step_avg:149.71ms step:504/1480 train_time:73961ms step_avg:149.72ms step:505/1480 train_time:74117ms step_avg:149.73ms step:506/1480 train_time:74274ms step_avg:149.75ms step:507/1480 train_time:74431ms step_avg:149.76ms step:508/1480 train_time:74590ms step_avg:149.78ms step:509/1480 train_time:74746ms step_avg:149.79ms step:510/1480 train_time:74902ms step_avg:149.80ms step:511/1480 train_time:75059ms step_avg:149.82ms step:512/1480 train_time:75216ms step_avg:149.83ms step:513/1480 train_time:75373ms step_avg:149.85ms step:514/1480 train_time:75530ms step_avg:149.86ms step:515/1480 train_time:75687ms step_avg:149.88ms step:516/1480 train_time:75845ms step_avg:149.89ms step:517/1480 train_time:76001ms step_avg:149.90ms step:518/1480 train_time:76159ms step_avg:149.92ms step:519/1480 train_time:76316ms step_avg:149.93ms step:520/1480 train_time:76474ms step_avg:149.95ms step:521/1480 train_time:76633ms step_avg:149.97ms step:522/1480 train_time:76791ms step_avg:149.98ms step:523/1480 train_time:76948ms step_avg:150.00ms step:524/1480 train_time:77104ms step_avg:150.01ms step:525/1480 train_time:77259ms step_avg:150.02ms step:526/1480 train_time:77416ms step_avg:150.03ms step:527/1480 train_time:77572ms step_avg:150.04ms step:528/1480 train_time:77729ms step_avg:150.06ms step:529/1480 train_time:77886ms step_avg:150.07ms step:530/1480 train_time:78042ms step_avg:150.08ms step:531/1480 train_time:78200ms step_avg:150.10ms step:532/1480 train_time:78356ms step_avg:150.11ms step:533/1480 train_time:78514ms 
step_avg:150.12ms step:534/1480 train_time:78671ms step_avg:150.13ms step:535/1480 train_time:78828ms step_avg:150.15ms step:536/1480 train_time:78985ms step_avg:150.16ms step:537/1480 train_time:79141ms step_avg:150.17ms step:538/1480 train_time:79297ms step_avg:150.18ms step:539/1480 train_time:79456ms step_avg:150.20ms step:540/1480 train_time:79613ms step_avg:150.21ms step:541/1480 train_time:79770ms step_avg:150.23ms step:542/1480 train_time:79927ms step_avg:150.24ms step:543/1480 train_time:80083ms step_avg:150.25ms step:544/1480 train_time:80238ms step_avg:150.26ms step:545/1480 train_time:80395ms step_avg:150.27ms step:546/1480 train_time:80552ms step_avg:150.28ms step:547/1480 train_time:80708ms step_avg:150.29ms step:548/1480 train_time:80866ms step_avg:150.31ms step:549/1480 train_time:81021ms step_avg:150.32ms step:550/1480 train_time:81179ms step_avg:150.33ms step:551/1480 train_time:81336ms step_avg:150.34ms step:552/1480 train_time:81496ms step_avg:150.36ms step:553/1480 train_time:81656ms step_avg:150.38ms step:554/1480 train_time:81818ms step_avg:150.40ms step:555/1480 train_time:81977ms step_avg:150.42ms step:556/1480 train_time:82137ms step_avg:150.43ms step:557/1480 train_time:82297ms step_avg:150.45ms step:558/1480 train_time:82456ms step_avg:150.47ms step:559/1480 train_time:82616ms step_avg:150.48ms step:560/1480 train_time:82775ms step_avg:150.50ms step:561/1480 train_time:82935ms step_avg:150.52ms step:562/1480 train_time:83095ms step_avg:150.53ms step:563/1480 train_time:83254ms step_avg:150.55ms step:564/1480 train_time:83415ms step_avg:150.57ms step:565/1480 train_time:83575ms step_avg:150.59ms step:566/1480 train_time:83737ms step_avg:150.61ms step:567/1480 train_time:83896ms step_avg:150.62ms step:568/1480 train_time:84056ms step_avg:150.64ms step:569/1480 train_time:84215ms step_avg:150.65ms step:570/1480 train_time:84374ms step_avg:150.67ms step:571/1480 train_time:84533ms step_avg:150.68ms step:572/1480 train_time:84692ms 
step_avg:150.70ms step:573/1480 train_time:84851ms step_avg:150.71ms step:574/1480 train_time:85013ms step_avg:150.73ms step:575/1480 train_time:85175ms step_avg:150.75ms step:576/1480 train_time:85336ms step_avg:150.77ms step:577/1480 train_time:85497ms step_avg:150.79ms step:578/1480 train_time:85656ms step_avg:150.80ms step:579/1480 train_time:85816ms step_avg:150.82ms step:580/1480 train_time:85976ms step_avg:150.83ms step:581/1480 train_time:86137ms step_avg:150.85ms step:582/1480 train_time:86297ms step_avg:150.87ms step:583/1480 train_time:86457ms step_avg:150.88ms step:584/1480 train_time:86616ms step_avg:150.90ms step:585/1480 train_time:86775ms step_avg:150.91ms step:586/1480 train_time:86936ms step_avg:150.93ms step:587/1480 train_time:87096ms step_avg:150.95ms step:588/1480 train_time:87256ms step_avg:150.96ms step:589/1480 train_time:87417ms step_avg:150.98ms step:590/1480 train_time:87577ms step_avg:151.00ms step:591/1480 train_time:87736ms step_avg:151.01ms step:592/1480 train_time:87897ms step_avg:151.03ms step:593/1480 train_time:88058ms step_avg:151.04ms step:594/1480 train_time:88218ms step_avg:151.06ms step:595/1480 train_time:88378ms step_avg:151.07ms step:596/1480 train_time:88539ms step_avg:151.09ms step:597/1480 train_time:88698ms step_avg:151.10ms step:598/1480 train_time:88856ms step_avg:151.12ms step:599/1480 train_time:89016ms step_avg:151.13ms step:600/1480 train_time:89176ms step_avg:151.14ms step:601/1480 train_time:89336ms step_avg:151.16ms step:602/1480 train_time:89496ms step_avg:151.18ms step:603/1480 train_time:89657ms step_avg:151.19ms step:604/1480 train_time:89817ms step_avg:151.21ms step:605/1480 train_time:89977ms step_avg:151.22ms step:606/1480 train_time:90139ms step_avg:151.24ms step:607/1480 train_time:90300ms step_avg:151.26ms step:608/1480 train_time:90458ms step_avg:151.27ms step:609/1480 train_time:90617ms step_avg:151.28ms step:610/1480 train_time:90775ms step_avg:151.29ms step:611/1480 train_time:90937ms 
step_avg:151.31ms step:612/1480 train_time:91097ms step_avg:151.32ms step:613/1480 train_time:91257ms step_avg:151.34ms step:614/1480 train_time:91418ms step_avg:151.35ms step:615/1480 train_time:91576ms step_avg:151.37ms step:616/1480 train_time:91735ms step_avg:151.38ms step:617/1480 train_time:91895ms step_avg:151.39ms step:618/1480 train_time:92055ms step_avg:151.41ms step:619/1480 train_time:92215ms step_avg:151.42ms step:620/1480 train_time:92375ms step_avg:151.43ms step:621/1480 train_time:92535ms step_avg:151.45ms step:622/1480 train_time:92695ms step_avg:151.46ms step:623/1480 train_time:92856ms step_avg:151.48ms step:624/1480 train_time:93016ms step_avg:151.49ms step:625/1480 train_time:93176ms step_avg:151.51ms step:625/1480 val_loss:3.6052 train_time:93239ms step_avg:151.61ms step:626/1480 train_time:93340ms step_avg:151.53ms step:627/1480 train_time:93500ms step_avg:151.54ms step:628/1480 train_time:93659ms step_avg:151.55ms step:629/1480 train_time:93817ms step_avg:151.56ms step:630/1480 train_time:93974ms step_avg:151.57ms step:631/1480 train_time:94132ms step_avg:151.58ms step:632/1480 train_time:94290ms step_avg:151.59ms step:633/1480 train_time:94448ms step_avg:151.60ms step:634/1480 train_time:94608ms step_avg:151.61ms step:635/1480 train_time:94767ms step_avg:151.63ms step:636/1480 train_time:94927ms step_avg:151.64ms step:637/1480 train_time:95087ms step_avg:151.65ms step:638/1480 train_time:95246ms step_avg:151.66ms step:639/1480 train_time:95406ms step_avg:151.68ms step:640/1480 train_time:95566ms step_avg:151.69ms step:641/1480 train_time:95726ms step_avg:151.70ms step:642/1480 train_time:95886ms step_avg:151.72ms step:643/1480 train_time:96046ms step_avg:151.73ms step:644/1480 train_time:96205ms step_avg:151.74ms step:645/1480 train_time:96365ms step_avg:151.76ms step:646/1480 train_time:96525ms step_avg:151.77ms step:647/1480 train_time:96686ms step_avg:151.78ms step:648/1480 train_time:96846ms step_avg:151.80ms step:649/1480 
train_time:97006ms step_avg:151.81ms step:650/1480 train_time:97167ms step_avg:151.82ms step:651/1480 train_time:97327ms step_avg:151.84ms step:652/1480 train_time:97488ms step_avg:151.85ms step:653/1480 train_time:97646ms step_avg:151.86ms step:654/1480 train_time:97807ms step_avg:151.87ms step:655/1480 train_time:97967ms step_avg:151.89ms step:656/1480 train_time:98127ms step_avg:151.90ms step:657/1480 train_time:98287ms step_avg:151.91ms step:658/1480 train_time:98447ms step_avg:151.92ms step:659/1480 train_time:98610ms step_avg:151.94ms step:660/1480 train_time:98772ms step_avg:151.96ms step:661/1480 train_time:98933ms step_avg:151.97ms step:662/1480 train_time:99092ms step_avg:151.98ms step:663/1480 train_time:99251ms step_avg:151.99ms step:664/1480 train_time:99413ms step_avg:152.01ms step:665/1480 train_time:99575ms step_avg:152.02ms step:666/1480 train_time:99735ms step_avg:152.03ms step:667/1480 train_time:99897ms step_avg:152.05ms step:668/1480 train_time:100059ms step_avg:152.07ms step:669/1480 train_time:100220ms step_avg:152.08ms step:670/1480 train_time:100381ms step_avg:152.09ms step:671/1480 train_time:100543ms step_avg:152.11ms step:672/1480 train_time:100705ms step_avg:152.12ms step:673/1480 train_time:100868ms step_avg:152.14ms step:674/1480 train_time:101031ms step_avg:152.16ms step:675/1480 train_time:101193ms step_avg:152.17ms step:676/1480 train_time:101354ms step_avg:152.18ms step:677/1480 train_time:101514ms step_avg:152.19ms step:678/1480 train_time:101674ms step_avg:152.21ms step:679/1480 train_time:101836ms step_avg:152.22ms step:680/1480 train_time:101998ms step_avg:152.24ms step:681/1480 train_time:102158ms step_avg:152.25ms step:682/1480 train_time:102320ms step_avg:152.26ms step:683/1480 train_time:102481ms step_avg:152.28ms step:684/1480 train_time:102642ms step_avg:152.29ms step:685/1480 train_time:102807ms step_avg:152.31ms step:686/1480 train_time:102969ms step_avg:152.32ms step:687/1480 train_time:103130ms step_avg:152.33ms 
step:688/1480 train_time:103294ms step_avg:152.35ms step:689/1480 train_time:103456ms step_avg:152.37ms step:690/1480 train_time:103618ms step_avg:152.38ms step:691/1480 train_time:103778ms step_avg:152.39ms step:692/1480 train_time:103939ms step_avg:152.40ms step:693/1480 train_time:104101ms step_avg:152.42ms step:694/1480 train_time:104263ms step_avg:152.43ms step:695/1480 train_time:104426ms step_avg:152.45ms step:696/1480 train_time:104588ms step_avg:152.46ms step:697/1480 train_time:104750ms step_avg:152.48ms step:698/1480 train_time:104911ms step_avg:152.49ms step:699/1480 train_time:105074ms step_avg:152.50ms step:700/1480 train_time:105236ms step_avg:152.52ms step:701/1480 train_time:105395ms step_avg:152.53ms step:702/1480 train_time:105556ms step_avg:152.54ms step:703/1480 train_time:105715ms step_avg:152.55ms step:704/1480 train_time:105876ms step_avg:152.56ms step:705/1480 train_time:106039ms step_avg:152.57ms step:706/1480 train_time:106206ms step_avg:152.59ms step:707/1480 train_time:106368ms step_avg:152.61ms step:708/1480 train_time:106529ms step_avg:152.62ms step:709/1480 train_time:106690ms step_avg:152.63ms step:710/1480 train_time:106849ms step_avg:152.64ms step:711/1480 train_time:107011ms step_avg:152.65ms step:712/1480 train_time:107175ms step_avg:152.67ms step:713/1480 train_time:107337ms step_avg:152.68ms step:714/1480 train_time:107498ms step_avg:152.70ms step:715/1480 train_time:107658ms step_avg:152.71ms step:716/1480 train_time:107817ms step_avg:152.72ms step:717/1480 train_time:107980ms step_avg:152.73ms step:718/1480 train_time:108141ms step_avg:152.74ms step:719/1480 train_time:108301ms step_avg:152.75ms step:720/1480 train_time:108465ms step_avg:152.77ms step:721/1480 train_time:108628ms step_avg:152.78ms step:722/1480 train_time:108791ms step_avg:152.80ms step:723/1480 train_time:108950ms step_avg:152.80ms step:724/1480 train_time:109112ms step_avg:152.82ms step:725/1480 train_time:109275ms step_avg:152.83ms step:726/1480 
train_time:109438ms step_avg:152.85ms step:727/1480 train_time:109604ms step_avg:152.86ms step:728/1480 train_time:109766ms step_avg:152.88ms step:729/1480 train_time:109927ms step_avg:152.89ms step:730/1480 train_time:110091ms step_avg:152.90ms step:731/1480 train_time:110252ms step_avg:152.91ms step:732/1480 train_time:110412ms step_avg:152.93ms step:733/1480 train_time:110573ms step_avg:152.94ms step:734/1480 train_time:110734ms step_avg:152.95ms step:735/1480 train_time:110894ms step_avg:152.96ms step:736/1480 train_time:111057ms step_avg:152.97ms step:737/1480 train_time:111219ms step_avg:152.98ms step:738/1480 train_time:111381ms step_avg:153.00ms step:739/1480 train_time:111542ms step_avg:153.01ms step:740/1480 train_time:111709ms step_avg:153.03ms step:741/1480 train_time:111872ms step_avg:153.04ms step:742/1480 train_time:112033ms step_avg:153.05ms step:743/1480 train_time:112194ms step_avg:153.06ms step:744/1480 train_time:112357ms step_avg:153.07ms step:745/1480 train_time:112521ms step_avg:153.09ms step:746/1480 train_time:112681ms step_avg:153.10ms step:747/1480 train_time:112843ms step_avg:153.11ms step:748/1480 train_time:113010ms step_avg:153.13ms step:749/1480 train_time:113174ms step_avg:153.14ms step:750/1480 train_time:113333ms step_avg:153.15ms step:750/1480 val_loss:3.5503 train_time:113397ms step_avg:153.24ms step:751/1480 train_time:113500ms step_avg:153.17ms step:752/1480 train_time:113665ms step_avg:153.19ms step:753/1480 train_time:113827ms step_avg:153.20ms step:754/1480 train_time:113987ms step_avg:153.21ms step:755/1480 train_time:114149ms step_avg:153.22ms step:756/1480 train_time:114310ms step_avg:153.23ms step:757/1480 train_time:114476ms step_avg:153.25ms step:758/1480 train_time:114636ms step_avg:153.26ms step:759/1480 train_time:114798ms step_avg:153.27ms step:760/1480 train_time:114961ms step_avg:153.28ms step:761/1480 train_time:115125ms step_avg:153.30ms step:762/1480 train_time:115287ms step_avg:153.31ms step:763/1480 
train_time:115449ms step_avg:153.32ms step:764/1480 train_time:115609ms step_avg:153.33ms step:765/1480 train_time:115771ms step_avg:153.34ms step:766/1480 train_time:115933ms step_avg:153.35ms step:767/1480 train_time:116095ms step_avg:153.36ms step:768/1480 train_time:116259ms step_avg:153.38ms step:769/1480 train_time:116423ms step_avg:153.39ms step:770/1480 train_time:116585ms step_avg:153.40ms step:771/1480 train_time:116749ms step_avg:153.42ms step:772/1480 train_time:116911ms step_avg:153.43ms step:773/1480 train_time:117071ms step_avg:153.44ms step:774/1480 train_time:117233ms step_avg:153.45ms step:775/1480 train_time:117394ms step_avg:153.46ms step:776/1480 train_time:117559ms step_avg:153.47ms step:777/1480 train_time:117727ms step_avg:153.49ms step:778/1480 train_time:117890ms step_avg:153.50ms step:779/1480 train_time:118052ms step_avg:153.51ms step:780/1480 train_time:118215ms step_avg:153.53ms step:781/1480 train_time:118379ms step_avg:153.54ms step:782/1480 train_time:118544ms step_avg:153.55ms step:783/1480 train_time:118706ms step_avg:153.57ms step:784/1480 train_time:118870ms step_avg:153.58ms step:785/1480 train_time:119031ms step_avg:153.59ms step:786/1480 train_time:119194ms step_avg:153.60ms step:787/1480 train_time:119356ms step_avg:153.61ms step:788/1480 train_time:119521ms step_avg:153.63ms step:789/1480 train_time:119683ms step_avg:153.64ms step:790/1480 train_time:119849ms step_avg:153.65ms step:791/1480 train_time:120015ms step_avg:153.67ms step:792/1480 train_time:120180ms step_avg:153.68ms step:793/1480 train_time:120343ms step_avg:153.69ms step:794/1480 train_time:120506ms step_avg:153.71ms step:795/1480 train_time:120671ms step_avg:153.72ms step:796/1480 train_time:120836ms step_avg:153.74ms step:797/1480 train_time:121001ms step_avg:153.75ms step:798/1480 train_time:121166ms step_avg:153.76ms step:799/1480 train_time:121333ms step_avg:153.78ms step:800/1480 train_time:121494ms step_avg:153.79ms step:801/1480 train_time:121658ms 
step_avg:153.80ms step:802/1480 train_time:121825ms step_avg:153.82ms step:803/1480 train_time:121988ms step_avg:153.83ms step:804/1480 train_time:122150ms step_avg:153.84ms step:805/1480 train_time:122315ms step_avg:153.86ms step:806/1480 train_time:122479ms step_avg:153.87ms step:807/1480 train_time:122641ms step_avg:153.88ms step:808/1480 train_time:122805ms step_avg:153.89ms step:809/1480 train_time:122969ms step_avg:153.90ms step:810/1480 train_time:123130ms step_avg:153.91ms step:811/1480 train_time:123292ms step_avg:153.92ms step:812/1480 train_time:123454ms step_avg:153.93ms step:813/1480 train_time:123615ms step_avg:153.94ms step:814/1480 train_time:123777ms step_avg:153.95ms step:815/1480 train_time:123939ms step_avg:153.96ms step:816/1480 train_time:124105ms step_avg:153.98ms step:817/1480 train_time:124269ms step_avg:153.99ms step:818/1480 train_time:124430ms step_avg:154.00ms step:819/1480 train_time:124593ms step_avg:154.01ms step:820/1480 train_time:124756ms step_avg:154.02ms step:821/1480 train_time:124917ms step_avg:154.03ms step:822/1480 train_time:125080ms step_avg:154.04ms step:823/1480 train_time:125244ms step_avg:154.05ms step:824/1480 train_time:125407ms step_avg:154.06ms step:825/1480 train_time:125572ms step_avg:154.08ms step:826/1480 train_time:125737ms step_avg:154.09ms step:827/1480 train_time:125901ms step_avg:154.10ms step:828/1480 train_time:126065ms step_avg:154.11ms step:829/1480 train_time:126230ms step_avg:154.13ms step:830/1480 train_time:126394ms step_avg:154.14ms step:831/1480 train_time:126559ms step_avg:154.15ms step:832/1480 train_time:126724ms step_avg:154.17ms step:833/1480 train_time:126889ms step_avg:154.18ms step:834/1480 train_time:127053ms step_avg:154.19ms step:835/1480 train_time:127216ms step_avg:154.20ms step:836/1480 train_time:127381ms step_avg:154.21ms step:837/1480 train_time:127545ms step_avg:154.23ms step:838/1480 train_time:127707ms step_avg:154.24ms step:839/1480 train_time:127870ms step_avg:154.25ms 
step:840/1480 train_time:128031ms step_avg:154.25ms step:841/1480 train_time:128191ms step_avg:154.26ms step:842/1480 train_time:128354ms step_avg:154.27ms step:843/1480 train_time:128515ms step_avg:154.28ms step:844/1480 train_time:128677ms step_avg:154.29ms step:845/1480 train_time:128845ms step_avg:154.30ms step:846/1480 train_time:129008ms step_avg:154.32ms step:847/1480 train_time:129171ms step_avg:154.33ms step:848/1480 train_time:129332ms step_avg:154.33ms step:849/1480 train_time:129495ms step_avg:154.34ms step:850/1480 train_time:129659ms step_avg:154.36ms step:851/1480 train_time:129824ms step_avg:154.37ms step:852/1480 train_time:129987ms step_avg:154.38ms step:853/1480 train_time:130149ms step_avg:154.39ms step:854/1480 train_time:130313ms step_avg:154.40ms step:855/1480 train_time:130476ms step_avg:154.41ms step:856/1480 train_time:130637ms step_avg:154.42ms step:857/1480 train_time:130803ms step_avg:154.43ms step:858/1480 train_time:130969ms step_avg:154.44ms step:859/1480 train_time:131132ms step_avg:154.46ms step:860/1480 train_time:131293ms step_avg:154.46ms step:861/1480 train_time:131459ms step_avg:154.48ms step:862/1480 train_time:131629ms step_avg:154.49ms step:863/1480 train_time:131797ms step_avg:154.51ms step:864/1480 train_time:131962ms step_avg:154.52ms step:865/1480 train_time:132124ms step_avg:154.53ms step:866/1480 train_time:132291ms step_avg:154.55ms step:867/1480 train_time:132454ms step_avg:154.56ms step:868/1480 train_time:132615ms step_avg:154.56ms step:869/1480 train_time:132776ms step_avg:154.57ms step:870/1480 train_time:132941ms step_avg:154.58ms step:871/1480 train_time:133104ms step_avg:154.59ms step:872/1480 train_time:133269ms step_avg:154.60ms step:873/1480 train_time:133432ms step_avg:154.61ms step:874/1480 train_time:133597ms step_avg:154.63ms step:875/1480 train_time:133763ms step_avg:154.64ms step:875/1480 val_loss:3.5040 train_time:133827ms step_avg:154.71ms step:876/1480 train_time:133928ms step_avg:154.65ms 
step:877/1480 train_time:134096ms step_avg:154.67ms step:878/1480 train_time:134259ms step_avg:154.68ms step:879/1480 train_time:134422ms step_avg:154.69ms step:880/1480 train_time:134584ms step_avg:154.69ms step:881/1480 train_time:134745ms step_avg:154.70ms step:882/1480 train_time:134910ms step_avg:154.71ms step:883/1480 train_time:135076ms step_avg:154.73ms step:884/1480 train_time:135243ms step_avg:154.74ms step:885/1480 train_time:135407ms step_avg:154.75ms step:886/1480 train_time:135574ms step_avg:154.77ms step:887/1480 train_time:135743ms step_avg:154.78ms step:888/1480 train_time:135916ms step_avg:154.80ms step:889/1480 train_time:136084ms step_avg:154.82ms step:890/1480 train_time:136247ms step_avg:154.83ms step:891/1480 train_time:136413ms step_avg:154.84ms step:892/1480 train_time:136578ms step_avg:154.85ms step:893/1480 train_time:136740ms step_avg:154.86ms step:894/1480 train_time:136906ms step_avg:154.87ms step:895/1480 train_time:137074ms step_avg:154.89ms step:896/1480 train_time:137239ms step_avg:154.90ms step:897/1480 train_time:137406ms step_avg:154.91ms step:898/1480 train_time:137575ms step_avg:154.93ms step:899/1480 train_time:137740ms step_avg:154.94ms step:900/1480 train_time:137903ms step_avg:154.95ms step:901/1480 train_time:138067ms step_avg:154.96ms step:902/1480 train_time:138229ms step_avg:154.97ms step:903/1480 train_time:138403ms step_avg:154.99ms step:904/1480 train_time:138567ms step_avg:155.00ms step:905/1480 train_time:138728ms step_avg:155.00ms step:906/1480 train_time:138897ms step_avg:155.02ms step:907/1480 train_time:139064ms step_avg:155.03ms step:908/1480 train_time:139226ms step_avg:155.04ms step:909/1480 train_time:139392ms step_avg:155.05ms step:910/1480 train_time:139561ms step_avg:155.07ms step:911/1480 train_time:139726ms step_avg:155.08ms step:912/1480 train_time:139893ms step_avg:155.09ms step:913/1480 train_time:140059ms step_avg:155.10ms step:914/1480 train_time:140225ms step_avg:155.12ms step:915/1480 
train_time:140395ms step_avg:155.13ms step:916/1480 train_time:140559ms step_avg:155.14ms step:917/1480 train_time:140722ms step_avg:155.15ms step:918/1480 train_time:140890ms step_avg:155.16ms step:919/1480 train_time:141059ms step_avg:155.18ms step:920/1480 train_time:141223ms step_avg:155.19ms step:921/1480 train_time:141392ms step_avg:155.20ms step:922/1480 train_time:141558ms step_avg:155.22ms step:923/1480 train_time:141720ms step_avg:155.22ms step:924/1480 train_time:141884ms step_avg:155.23ms step:925/1480 train_time:142049ms step_avg:155.24ms step:926/1480 train_time:142214ms step_avg:155.26ms step:927/1480 train_time:142378ms step_avg:155.27ms step:928/1480 train_time:142543ms step_avg:155.28ms step:929/1480 train_time:142708ms step_avg:155.29ms step:930/1480 train_time:142875ms step_avg:155.30ms step:931/1480 train_time:143039ms step_avg:155.31ms step:932/1480 train_time:143205ms step_avg:155.32ms step:933/1480 train_time:143372ms step_avg:155.33ms step:934/1480 train_time:143539ms step_avg:155.35ms step:935/1480 train_time:143711ms step_avg:155.36ms step:936/1480 train_time:143879ms step_avg:155.38ms step:937/1480 train_time:144048ms step_avg:155.39ms step:938/1480 train_time:144214ms step_avg:155.40ms step:939/1480 train_time:144382ms step_avg:155.42ms step:940/1480 train_time:144548ms step_avg:155.43ms step:941/1480 train_time:144713ms step_avg:155.44ms step:942/1480 train_time:144879ms step_avg:155.45ms step:943/1480 train_time:145048ms step_avg:155.46ms step:944/1480 train_time:145222ms step_avg:155.48ms step:945/1480 train_time:145386ms step_avg:155.49ms step:946/1480 train_time:145556ms step_avg:155.51ms step:947/1480 train_time:145723ms step_avg:155.52ms step:948/1480 train_time:145890ms step_avg:155.53ms step:949/1480 train_time:146055ms step_avg:155.54ms step:950/1480 train_time:146219ms step_avg:155.55ms step:951/1480 train_time:146388ms step_avg:155.57ms step:952/1480 train_time:146554ms step_avg:155.58ms step:953/1480 train_time:146723ms 
step_avg:155.59ms step:954/1480 train_time:146891ms step_avg:155.60ms step:955/1480 train_time:147055ms step_avg:155.61ms step:956/1480 train_time:147221ms step_avg:155.62ms step:957/1480 train_time:147388ms step_avg:155.64ms step:958/1480 train_time:147558ms step_avg:155.65ms step:959/1480 train_time:147721ms step_avg:155.66ms step:960/1480 train_time:147888ms step_avg:155.67ms step:961/1480 train_time:148054ms step_avg:155.68ms step:962/1480 train_time:148219ms step_avg:155.69ms step:963/1480 train_time:148384ms step_avg:155.70ms step:964/1480 train_time:148553ms step_avg:155.72ms step:965/1480 train_time:148718ms step_avg:155.73ms step:966/1480 train_time:148883ms step_avg:155.74ms step:967/1480 train_time:149046ms step_avg:155.74ms step:968/1480 train_time:149212ms step_avg:155.75ms step:969/1480 train_time:149378ms step_avg:155.76ms step:970/1480 train_time:149542ms step_avg:155.77ms step:971/1480 train_time:149707ms step_avg:155.78ms step:972/1480 train_time:149871ms step_avg:155.79ms step:973/1480 train_time:150035ms step_avg:155.80ms step:974/1480 train_time:150204ms step_avg:155.81ms step:975/1480 train_time:150369ms step_avg:155.82ms step:976/1480 train_time:150535ms step_avg:155.83ms step:977/1480 train_time:150700ms step_avg:155.84ms step:978/1480 train_time:150864ms step_avg:155.85ms step:979/1480 train_time:151029ms step_avg:155.86ms step:980/1480 train_time:151196ms step_avg:155.87ms step:981/1480 train_time:151364ms step_avg:155.88ms step:982/1480 train_time:151526ms step_avg:155.89ms step:983/1480 train_time:151691ms step_avg:155.90ms step:984/1480 train_time:151856ms step_avg:155.91ms step:985/1480 train_time:152023ms step_avg:155.92ms step:986/1480 train_time:152188ms step_avg:155.93ms step:987/1480 train_time:152353ms step_avg:155.94ms step:988/1480 train_time:152520ms step_avg:155.95ms step:989/1480 train_time:152685ms step_avg:155.96ms step:990/1480 train_time:152855ms step_avg:155.97ms step:991/1480 train_time:153022ms step_avg:155.99ms 
step:992/1480 train_time:153197ms step_avg:156.01ms step:993/1480 train_time:153376ms step_avg:156.03ms step:994/1480 train_time:153541ms step_avg:156.04ms step:995/1480 train_time:153706ms step_avg:156.05ms step:996/1480 train_time:153869ms step_avg:156.05ms step:997/1480 train_time:154033ms step_avg:156.06ms step:998/1480 train_time:154197ms step_avg:156.07ms step:999/1480 train_time:154362ms step_avg:156.08ms step:1000/1480 train_time:154530ms step_avg:156.09ms step:1000/1480 val_loss:3.4419 train_time:154598ms step_avg:156.16ms step:1001/1480 train_time:154699ms step_avg:156.10ms step:1002/1480 train_time:154866ms step_avg:156.12ms step:1003/1480 train_time:155038ms step_avg:156.13ms step:1004/1480 train_time:155206ms step_avg:156.14ms step:1005/1480 train_time:155374ms step_avg:156.16ms step:1006/1480 train_time:155542ms step_avg:156.17ms step:1007/1480 train_time:155707ms step_avg:156.18ms step:1008/1480 train_time:155874ms step_avg:156.19ms step:1009/1480 train_time:156049ms step_avg:156.20ms step:1010/1480 train_time:156213ms step_avg:156.21ms step:1011/1480 train_time:156377ms step_avg:156.22ms step:1012/1480 train_time:156545ms step_avg:156.23ms step:1013/1480 train_time:156714ms step_avg:156.25ms step:1014/1480 train_time:156881ms step_avg:156.26ms step:1015/1480 train_time:157051ms step_avg:156.27ms step:1016/1480 train_time:157218ms step_avg:156.28ms step:1017/1480 train_time:157390ms step_avg:156.30ms step:1018/1480 train_time:157559ms step_avg:156.31ms step:1019/1480 train_time:157729ms step_avg:156.32ms step:1020/1480 train_time:157897ms step_avg:156.33ms step:1021/1480 train_time:158061ms step_avg:156.34ms step:1022/1480 train_time:158230ms step_avg:156.35ms step:1023/1480 train_time:158398ms step_avg:156.36ms step:1024/1480 train_time:158563ms step_avg:156.37ms step:1025/1480 train_time:158735ms step_avg:156.39ms step:1026/1480 train_time:158900ms step_avg:156.40ms step:1027/1480 train_time:159067ms step_avg:156.41ms step:1028/1480 
train_time:159239ms step_avg:156.42ms step:1029/1480 train_time:159412ms step_avg:156.44ms step:1030/1480 train_time:159578ms step_avg:156.45ms step:1031/1480 train_time:159743ms step_avg:156.46ms step:1032/1480 train_time:159915ms step_avg:156.47ms step:1033/1480 train_time:160080ms step_avg:156.48ms step:1034/1480 train_time:160249ms step_avg:156.49ms step:1035/1480 train_time:160416ms step_avg:156.50ms step:1036/1480 train_time:160582ms step_avg:156.51ms step:1037/1480 train_time:160749ms step_avg:156.52ms step:1038/1480 train_time:160917ms step_avg:156.53ms step:1039/1480 train_time:161087ms step_avg:156.55ms step:1040/1480 train_time:161254ms step_avg:156.56ms step:1041/1480 train_time:161420ms step_avg:156.57ms step:1042/1480 train_time:161586ms step_avg:156.58ms step:1043/1480 train_time:161752ms step_avg:156.59ms step:1044/1480 train_time:161918ms step_avg:156.59ms step:1045/1480 train_time:162089ms step_avg:156.61ms step:1046/1480 train_time:162256ms step_avg:156.62ms step:1047/1480 train_time:162422ms step_avg:156.63ms step:1048/1480 train_time:162589ms step_avg:156.64ms step:1049/1480 train_time:162754ms step_avg:156.64ms step:1050/1480 train_time:162923ms step_avg:156.66ms step:1051/1480 train_time:163093ms step_avg:156.67ms step:1052/1480 train_time:163260ms step_avg:156.68ms step:1053/1480 train_time:163427ms step_avg:156.69ms step:1054/1480 train_time:163594ms step_avg:156.70ms step:1055/1480 train_time:163757ms step_avg:156.71ms step:1056/1480 train_time:163922ms step_avg:156.71ms step:1057/1480 train_time:164088ms step_avg:156.72ms step:1058/1480 train_time:164256ms step_avg:156.73ms step:1059/1480 train_time:164431ms step_avg:156.75ms step:1060/1480 train_time:164600ms step_avg:156.76ms step:1061/1480 train_time:164763ms step_avg:156.77ms step:1062/1480 train_time:164930ms step_avg:156.78ms step:1063/1480 train_time:165094ms step_avg:156.78ms step:1064/1480 train_time:165257ms step_avg:156.79ms step:1065/1480 train_time:165424ms step_avg:156.80ms 
step:1066/1480 train_time:165591ms step_avg:156.81ms step:1067/1480 train_time:165762ms step_avg:156.82ms step:1068/1480 train_time:165929ms step_avg:156.83ms step:1069/1480 train_time:166099ms step_avg:156.84ms step:1070/1480 train_time:166265ms step_avg:156.85ms step:1071/1480 train_time:166437ms step_avg:156.87ms step:1072/1480 train_time:166603ms step_avg:156.88ms step:1073/1480 train_time:166767ms step_avg:156.88ms step:1074/1480 train_time:166933ms step_avg:156.89ms step:1075/1480 train_time:167103ms step_avg:156.90ms step:1076/1480 train_time:167271ms step_avg:156.91ms step:1077/1480 train_time:167436ms step_avg:156.92ms step:1078/1480 train_time:167612ms step_avg:156.94ms step:1079/1480 train_time:167786ms step_avg:156.96ms step:1080/1480 train_time:167955ms step_avg:156.97ms step:1081/1480 train_time:168122ms step_avg:156.98ms step:1082/1480 train_time:168289ms step_avg:156.99ms step:1083/1480 train_time:168455ms step_avg:156.99ms step:1084/1480 train_time:168622ms step_avg:157.00ms step:1085/1480 train_time:168790ms step_avg:157.01ms step:1086/1480 train_time:168958ms step_avg:157.02ms step:1087/1480 train_time:169123ms step_avg:157.03ms step:1088/1480 train_time:169292ms step_avg:157.04ms step:1089/1480 train_time:169464ms step_avg:157.06ms step:1090/1480 train_time:169637ms step_avg:157.07ms step:1091/1480 train_time:169805ms step_avg:157.08ms step:1092/1480 train_time:169972ms step_avg:157.09ms step:1093/1480 train_time:170139ms step_avg:157.10ms step:1094/1480 train_time:170306ms step_avg:157.11ms step:1095/1480 train_time:170471ms step_avg:157.12ms step:1096/1480 train_time:170640ms step_avg:157.13ms step:1097/1480 train_time:170810ms step_avg:157.14ms step:1098/1480 train_time:170979ms step_avg:157.15ms step:1099/1480 train_time:171151ms step_avg:157.16ms step:1100/1480 train_time:171324ms step_avg:157.18ms step:1101/1480 train_time:171494ms step_avg:157.19ms step:1102/1480 train_time:171664ms step_avg:157.20ms step:1103/1480 train_time:171839ms 
step_avg:157.22ms step:1104/1480 train_time:172008ms step_avg:157.23ms step:1105/1480 train_time:172178ms step_avg:157.24ms step:1106/1480 train_time:172346ms step_avg:157.25ms step:1107/1480 train_time:172515ms step_avg:157.26ms step:1108/1480 train_time:172679ms step_avg:157.27ms step:1109/1480 train_time:172847ms step_avg:157.28ms step:1110/1480 train_time:173012ms step_avg:157.28ms step:1111/1480 train_time:173178ms step_avg:157.29ms step:1112/1480 train_time:173349ms step_avg:157.30ms step:1113/1480 train_time:173528ms step_avg:157.32ms step:1114/1480 train_time:173701ms step_avg:157.34ms step:1115/1480 train_time:173874ms step_avg:157.35ms step:1116/1480 train_time:174042ms step_avg:157.36ms step:1117/1480 train_time:174215ms step_avg:157.38ms step:1118/1480 train_time:174389ms step_avg:157.39ms step:1119/1480 train_time:174555ms step_avg:157.40ms step:1120/1480 train_time:174725ms step_avg:157.41ms step:1121/1480 train_time:174895ms step_avg:157.42ms step:1122/1480 train_time:175061ms step_avg:157.43ms step:1123/1480 train_time:175228ms step_avg:157.44ms step:1124/1480 train_time:175396ms step_avg:157.45ms step:1125/1480 train_time:175562ms step_avg:157.45ms step:1125/1480 val_loss:3.3859 train_time:175631ms step_avg:157.52ms step:1126/1480 train_time:175735ms step_avg:157.47ms step:1127/1480 train_time:175904ms step_avg:157.48ms step:1128/1480 train_time:176075ms step_avg:157.49ms step:1129/1480 train_time:176248ms step_avg:157.51ms step:1130/1480 train_time:176417ms step_avg:157.51ms step:1131/1480 train_time:176595ms step_avg:157.53ms step:1132/1480 train_time:176760ms step_avg:157.54ms step:1133/1480 train_time:176932ms step_avg:157.55ms step:1134/1480 train_time:177102ms step_avg:157.56ms step:1135/1480 train_time:177271ms step_avg:157.57ms step:1136/1480 train_time:177439ms step_avg:157.58ms step:1137/1480 train_time:177609ms step_avg:157.59ms step:1138/1480 train_time:177779ms step_avg:157.61ms step:1139/1480 train_time:177948ms step_avg:157.62ms 
step:1140/1480 train_time:178116ms step_avg:157.62ms step:1141/1480 train_time:178288ms step_avg:157.64ms step:1142/1480 train_time:178457ms step_avg:157.65ms step:1143/1480 train_time:178627ms step_avg:157.66ms step:1144/1480 train_time:178797ms step_avg:157.67ms step:1145/1480 train_time:178961ms step_avg:157.67ms step:1146/1480 train_time:179132ms step_avg:157.69ms step:1147/1480 train_time:179299ms step_avg:157.69ms step:1148/1480 train_time:179469ms step_avg:157.71ms step:1149/1480 train_time:179640ms step_avg:157.72ms step:1150/1480 train_time:179807ms step_avg:157.73ms step:1151/1480 train_time:179979ms step_avg:157.74ms step:1152/1480 train_time:180151ms step_avg:157.75ms step:1153/1480 train_time:180325ms step_avg:157.76ms step:1154/1480 train_time:180493ms step_avg:157.77ms step:1155/1480 train_time:180664ms step_avg:157.78ms step:1156/1480 train_time:180842ms step_avg:157.80ms step:1157/1480 train_time:181011ms step_avg:157.81ms step:1158/1480 train_time:181177ms step_avg:157.82ms step:1159/1480 train_time:181345ms step_avg:157.83ms step:1160/1480 train_time:181511ms step_avg:157.84ms step:1161/1480 train_time:181681ms step_avg:157.85ms step:1162/1480 train_time:181853ms step_avg:157.86ms step:1163/1480 train_time:182021ms step_avg:157.87ms step:1164/1480 train_time:182190ms step_avg:157.88ms step:1165/1480 train_time:182355ms step_avg:157.88ms step:1166/1480 train_time:182524ms step_avg:157.89ms step:1167/1480 train_time:182693ms step_avg:157.90ms step:1168/1480 train_time:182862ms step_avg:157.91ms step:1169/1480 train_time:183030ms step_avg:157.92ms step:1170/1480 train_time:183200ms step_avg:157.93ms step:1171/1480 train_time:183366ms step_avg:157.94ms step:1172/1480 train_time:183533ms step_avg:157.95ms step:1173/1480 train_time:183703ms step_avg:157.96ms step:1174/1480 train_time:183884ms step_avg:157.98ms step:1175/1480 train_time:184056ms step_avg:157.99ms step:1176/1480 train_time:184227ms step_avg:158.00ms step:1177/1480 train_time:184404ms 
step_avg:158.02ms step:1178/1480 train_time:184573ms step_avg:158.02ms step:1179/1480 train_time:184739ms step_avg:158.03ms step:1180/1480 train_time:184920ms step_avg:158.05ms step:1181/1480 train_time:185089ms step_avg:158.06ms step:1182/1480 train_time:185257ms step_avg:158.07ms step:1183/1480 train_time:185426ms step_avg:158.08ms step:1184/1480 train_time:185593ms step_avg:158.09ms step:1185/1480 train_time:185765ms step_avg:158.10ms step:1186/1480 train_time:185936ms step_avg:158.11ms step:1187/1480 train_time:186119ms step_avg:158.13ms step:1188/1480 train_time:186286ms step_avg:158.14ms step:1189/1480 train_time:186456ms step_avg:158.15ms step:1190/1480 train_time:186623ms step_avg:158.16ms step:1191/1480 train_time:186794ms step_avg:158.17ms step:1192/1480 train_time:186959ms step_avg:158.17ms step:1193/1480 train_time:187126ms step_avg:158.18ms step:1194/1480 train_time:187295ms step_avg:158.19ms step:1195/1480 train_time:187470ms step_avg:158.20ms step:1196/1480 train_time:187653ms step_avg:158.22ms step:1197/1480 train_time:187824ms step_avg:158.23ms step:1198/1480 train_time:188007ms step_avg:158.25ms step:1199/1480 train_time:188177ms step_avg:158.27ms step:1200/1480 train_time:188347ms step_avg:158.27ms step:1201/1480 train_time:188514ms step_avg:158.28ms step:1202/1480 train_time:188697ms step_avg:158.30ms step:1203/1480 train_time:188874ms step_avg:158.32ms step:1204/1480 train_time:189049ms step_avg:158.33ms step:1205/1480 train_time:189217ms step_avg:158.34ms step:1206/1480 train_time:189384ms step_avg:158.35ms step:1207/1480 train_time:189556ms step_avg:158.36ms step:1208/1480 train_time:189723ms step_avg:158.37ms step:1209/1480 train_time:189897ms step_avg:158.38ms step:1210/1480 train_time:190074ms step_avg:158.39ms step:1211/1480 train_time:190247ms step_avg:158.41ms step:1212/1480 train_time:190418ms step_avg:158.42ms step:1213/1480 train_time:190590ms step_avg:158.43ms step:1214/1480 train_time:190766ms step_avg:158.44ms step:1215/1480 
train_time:190938ms step_avg:158.45ms step:1216/1480 train_time:191108ms step_avg:158.46ms step:1217/1480 train_time:191281ms step_avg:158.48ms step:1218/1480 train_time:191451ms step_avg:158.49ms step:1219/1480 train_time:191631ms step_avg:158.50ms step:1220/1480 train_time:191799ms step_avg:158.51ms step:1221/1480 train_time:191969ms step_avg:158.52ms step:1222/1480 train_time:192137ms step_avg:158.53ms step:1223/1480 train_time:192306ms step_avg:158.54ms step:1224/1480 train_time:192484ms step_avg:158.55ms step:1225/1480 train_time:192655ms step_avg:158.56ms step:1226/1480 train_time:192830ms step_avg:158.58ms step:1227/1480 train_time:193002ms step_avg:158.59ms step:1228/1480 train_time:193173ms step_avg:158.60ms step:1229/1480 train_time:193345ms step_avg:158.61ms step:1230/1480 train_time:193523ms step_avg:158.63ms step:1231/1480 train_time:193699ms step_avg:158.64ms step:1232/1480 train_time:193874ms step_avg:158.65ms step:1233/1480 train_time:194043ms step_avg:158.66ms step:1234/1480 train_time:194212ms step_avg:158.67ms step:1235/1480 train_time:194387ms step_avg:158.68ms step:1236/1480 train_time:194556ms step_avg:158.69ms step:1237/1480 train_time:194726ms step_avg:158.70ms step:1238/1480 train_time:194911ms step_avg:158.72ms step:1239/1480 train_time:195080ms step_avg:158.73ms step:1240/1480 train_time:195252ms step_avg:158.74ms step:1241/1480 train_time:195425ms step_avg:158.75ms step:1242/1480 train_time:195595ms step_avg:158.76ms step:1243/1480 train_time:195770ms step_avg:158.77ms step:1244/1480 train_time:195936ms step_avg:158.78ms step:1245/1480 train_time:196104ms step_avg:158.79ms step:1246/1480 train_time:196274ms step_avg:158.80ms step:1247/1480 train_time:196444ms step_avg:158.81ms step:1248/1480 train_time:196613ms step_avg:158.81ms step:1249/1480 train_time:196780ms step_avg:158.82ms step:1250/1480 train_time:196951ms step_avg:158.83ms step:1250/1480 val_loss:3.3363 train_time:197021ms step_avg:158.89ms step:1251/1480 train_time:197132ms 
step_avg:158.85ms step:1252/1480 train_time:197300ms step_avg:158.86ms step:1253/1480 train_time:197469ms step_avg:158.86ms step:1254/1480 train_time:197640ms step_avg:158.87ms step:1255/1480 train_time:197829ms step_avg:158.90ms step:1256/1480 train_time:198002ms step_avg:158.91ms step:1257/1480 train_time:198174ms step_avg:158.92ms step:1258/1480 train_time:198349ms step_avg:158.93ms step:1259/1480 train_time:198520ms step_avg:158.94ms step:1260/1480 train_time:198687ms step_avg:158.95ms step:1261/1480 train_time:198859ms step_avg:158.96ms step:1262/1480 train_time:199035ms step_avg:158.97ms step:1263/1480 train_time:199209ms step_avg:158.99ms step:1264/1480 train_time:199375ms step_avg:158.99ms step:1265/1480 train_time:199542ms step_avg:159.00ms step:1266/1480 train_time:199714ms step_avg:159.01ms step:1267/1480 train_time:199885ms step_avg:159.02ms step:1268/1480 train_time:200055ms step_avg:159.03ms step:1269/1480 train_time:200232ms step_avg:159.04ms step:1270/1480 train_time:200402ms step_avg:159.05ms step:1271/1480 train_time:200573ms step_avg:159.06ms step:1272/1480 train_time:200739ms step_avg:159.06ms step:1273/1480 train_time:200910ms step_avg:159.07ms step:1274/1480 train_time:201082ms step_avg:159.08ms step:1275/1480 train_time:201248ms step_avg:159.09ms step:1276/1480 train_time:201414ms step_avg:159.09ms step:1277/1480 train_time:201586ms step_avg:159.10ms step:1278/1480 train_time:201753ms step_avg:159.11ms step:1279/1480 train_time:201924ms step_avg:159.12ms step:1280/1480 train_time:202104ms step_avg:159.14ms step:1281/1480 train_time:202275ms step_avg:159.15ms step:1282/1480 train_time:202439ms step_avg:159.15ms step:1283/1480 train_time:202609ms step_avg:159.16ms step:1284/1480 train_time:202779ms step_avg:159.17ms step:1285/1480 train_time:202949ms step_avg:159.18ms step:1286/1480 train_time:203119ms step_avg:159.18ms step:1287/1480 train_time:203291ms step_avg:159.19ms step:1288/1480 train_time:203461ms step_avg:159.20ms step:1289/1480 
train_time:203644ms step_avg:159.22ms step:1290/1480 train_time:203822ms step_avg:159.24ms step:1291/1480 train_time:203996ms step_avg:159.25ms step:1292/1480 train_time:204171ms step_avg:159.26ms step:1293/1480 train_time:204346ms step_avg:159.27ms step:1294/1480 train_time:204517ms step_avg:159.28ms step:1295/1480 train_time:204689ms step_avg:159.29ms step:1296/1480 train_time:204862ms step_avg:159.30ms step:1297/1480 train_time:205034ms step_avg:159.31ms step:1298/1480 train_time:205204ms step_avg:159.32ms step:1299/1480 train_time:205375ms step_avg:159.33ms step:1300/1480 train_time:205542ms step_avg:159.33ms step:1301/1480 train_time:205711ms step_avg:159.34ms step:1302/1480 train_time:205885ms step_avg:159.35ms step:1303/1480 train_time:206061ms step_avg:159.37ms step:1304/1480 train_time:206234ms step_avg:159.38ms step:1305/1480 train_time:206403ms step_avg:159.38ms step:1306/1480 train_time:206579ms step_avg:159.40ms step:1307/1480 train_time:206747ms step_avg:159.40ms step:1308/1480 train_time:206915ms step_avg:159.41ms step:1309/1480 train_time:207088ms step_avg:159.42ms step:1310/1480 train_time:207257ms step_avg:159.43ms step:1311/1480 train_time:207424ms step_avg:159.43ms step:1312/1480 train_time:207597ms step_avg:159.44ms step:1313/1480 train_time:207767ms step_avg:159.45ms step:1314/1480 train_time:207938ms step_avg:159.46ms step:1315/1480 train_time:208108ms step_avg:159.47ms step:1316/1480 train_time:208275ms step_avg:159.48ms step:1317/1480 train_time:208446ms step_avg:159.48ms step:1318/1480 train_time:208627ms step_avg:159.50ms step:1319/1480 train_time:208802ms step_avg:159.51ms step:1320/1480 train_time:208979ms step_avg:159.53ms step:1321/1480 train_time:209153ms step_avg:159.54ms step:1322/1480 train_time:209333ms step_avg:159.55ms step:1323/1480 train_time:209504ms step_avg:159.56ms step:1324/1480 train_time:209679ms step_avg:159.57ms step:1325/1480 train_time:209861ms step_avg:159.59ms step:1326/1480 train_time:210037ms step_avg:159.60ms 
step:1327/1480 train_time:210207ms step_avg:159.61ms step:1328/1480 train_time:210379ms step_avg:159.62ms step:1329/1480 train_time:210575ms step_avg:159.65ms step:1330/1480 train_time:210755ms step_avg:159.66ms step:1331/1480 train_time:210925ms step_avg:159.67ms step:1332/1480 train_time:211100ms step_avg:159.68ms step:1333/1480 train_time:211275ms step_avg:159.69ms step:1334/1480 train_time:211447ms step_avg:159.70ms step:1335/1480 train_time:211615ms step_avg:159.71ms step:1336/1480 train_time:211800ms step_avg:159.73ms step:1337/1480 train_time:211976ms step_avg:159.74ms step:1338/1480 train_time:212148ms step_avg:159.75ms step:1339/1480 train_time:212321ms step_avg:159.76ms step:1340/1480 train_time:212494ms step_avg:159.77ms step:1341/1480 train_time:212663ms step_avg:159.78ms step:1342/1480 train_time:212837ms step_avg:159.79ms step:1343/1480 train_time:213008ms step_avg:159.80ms step:1344/1480 train_time:213181ms step_avg:159.81ms step:1345/1480 train_time:213358ms step_avg:159.82ms step:1346/1480 train_time:213527ms step_avg:159.83ms step:1347/1480 train_time:213698ms step_avg:159.83ms step:1348/1480 train_time:213867ms step_avg:159.84ms step:1349/1480 train_time:214036ms step_avg:159.85ms step:1350/1480 train_time:214212ms step_avg:159.86ms step:1351/1480 train_time:214385ms step_avg:159.87ms step:1352/1480 train_time:214556ms step_avg:159.88ms step:1353/1480 train_time:214733ms step_avg:159.89ms step:1354/1480 train_time:214905ms step_avg:159.90ms step:1355/1480 train_time:215075ms step_avg:159.91ms step:1356/1480 train_time:215248ms step_avg:159.92ms step:1357/1480 train_time:215420ms step_avg:159.93ms step:1358/1480 train_time:215594ms step_avg:159.94ms step:1359/1480 train_time:215767ms step_avg:159.95ms step:1360/1480 train_time:215941ms step_avg:159.96ms step:1361/1480 train_time:216119ms step_avg:159.97ms step:1362/1480 train_time:216295ms step_avg:159.98ms step:1363/1480 train_time:216475ms step_avg:160.00ms step:1364/1480 train_time:216645ms 
step_avg:160.00ms step:1365/1480 train_time:216812ms step_avg:160.01ms step:1366/1480 train_time:216984ms step_avg:160.02ms step:1367/1480 train_time:217155ms step_avg:160.03ms step:1368/1480 train_time:217330ms step_avg:160.04ms step:1369/1480 train_time:217511ms step_avg:160.05ms step:1370/1480 train_time:217690ms step_avg:160.07ms step:1371/1480 train_time:217860ms step_avg:160.07ms step:1372/1480 train_time:218037ms step_avg:160.09ms step:1373/1480 train_time:218206ms step_avg:160.09ms step:1374/1480 train_time:218381ms step_avg:160.10ms step:1375/1480 train_time:218553ms step_avg:160.11ms step:1375/1480 val_loss:3.2980 train_time:218620ms step_avg:160.16ms step:1376/1480 train_time:218728ms step_avg:160.12ms step:1377/1480 train_time:218902ms step_avg:160.13ms step:1378/1480 train_time:219070ms step_avg:160.14ms step:1379/1480 train_time:219245ms step_avg:160.15ms step:1380/1480 train_time:219419ms step_avg:160.16ms step:1381/1480 train_time:219603ms step_avg:160.18ms step:1382/1480 train_time:219774ms step_avg:160.19ms step:1383/1480 train_time:219946ms step_avg:160.19ms step:1384/1480 train_time:220122ms step_avg:160.20ms step:1385/1480 train_time:220288ms step_avg:160.21ms step:1386/1480 train_time:220459ms step_avg:160.22ms step:1387/1480 train_time:220630ms step_avg:160.22ms step:1388/1480 train_time:220800ms step_avg:160.23ms step:1389/1480 train_time:220973ms step_avg:160.24ms step:1390/1480 train_time:221142ms step_avg:160.25ms step:1391/1480 train_time:221311ms step_avg:160.25ms step:1392/1480 train_time:221485ms step_avg:160.26ms step:1393/1480 train_time:221655ms step_avg:160.27ms step:1394/1480 train_time:221825ms step_avg:160.28ms step:1395/1480 train_time:221993ms step_avg:160.28ms step:1396/1480 train_time:222162ms step_avg:160.29ms step:1397/1480 train_time:222329ms step_avg:160.29ms step:1398/1480 train_time:222495ms step_avg:160.30ms step:1399/1480 train_time:222664ms step_avg:160.31ms step:1400/1480 train_time:222840ms step_avg:160.32ms 
step:1401/1480 train_time:223006ms step_avg:160.32ms step:1402/1480 train_time:223177ms step_avg:160.33ms step:1403/1480 train_time:223353ms step_avg:160.34ms step:1404/1480 train_time:223524ms step_avg:160.35ms step:1405/1480 train_time:223698ms step_avg:160.36ms step:1406/1480 train_time:223872ms step_avg:160.37ms step:1407/1480 train_time:224042ms step_avg:160.37ms step:1408/1480 train_time:224211ms step_avg:160.38ms step:1409/1480 train_time:224393ms step_avg:160.39ms step:1410/1480 train_time:224562ms step_avg:160.40ms step:1411/1480 train_time:224731ms step_avg:160.41ms step:1412/1480 train_time:224902ms step_avg:160.42ms step:1413/1480 train_time:225072ms step_avg:160.42ms step:1414/1480 train_time:225244ms step_avg:160.43ms step:1415/1480 train_time:225419ms step_avg:160.44ms step:1416/1480 train_time:225609ms step_avg:160.46ms step:1417/1480 train_time:225783ms step_avg:160.47ms step:1418/1480 train_time:225955ms step_avg:160.48ms step:1419/1480 train_time:226128ms step_avg:160.49ms step:1420/1480 train_time:226304ms step_avg:160.50ms step:1421/1480 train_time:226475ms step_avg:160.51ms step:1422/1480 train_time:226646ms step_avg:160.51ms step:1423/1480 train_time:226815ms step_avg:160.52ms step:1424/1480 train_time:226991ms step_avg:160.53ms step:1425/1480 train_time:227171ms step_avg:160.54ms step:1426/1480 train_time:227342ms step_avg:160.55ms step:1427/1480 train_time:227517ms step_avg:160.56ms step:1428/1480 train_time:227688ms step_avg:160.57ms step:1429/1480 train_time:227857ms step_avg:160.58ms step:1430/1480 train_time:228032ms step_avg:160.59ms step:1431/1480 train_time:228207ms step_avg:160.60ms step:1432/1480 train_time:228383ms step_avg:160.61ms step:1433/1480 train_time:228563ms step_avg:160.62ms step:1434/1480 train_time:228744ms step_avg:160.63ms step:1435/1480 train_time:228917ms step_avg:160.64ms step:1436/1480 train_time:229090ms step_avg:160.65ms step:1437/1480 train_time:229262ms step_avg:160.66ms step:1438/1480 train_time:229431ms 
step_avg:160.67ms step:1439/1480 train_time:229605ms step_avg:160.68ms step:1440/1480 train_time:229775ms step_avg:160.68ms step:1441/1480 train_time:229945ms step_avg:160.69ms step:1442/1480 train_time:230122ms step_avg:160.70ms step:1443/1480 train_time:230313ms step_avg:160.72ms step:1444/1480 train_time:230484ms step_avg:160.73ms step:1445/1480 train_time:230655ms step_avg:160.73ms step:1446/1480 train_time:230832ms step_avg:160.75ms step:1447/1480 train_time:231011ms step_avg:160.76ms step:1448/1480 train_time:231185ms step_avg:160.77ms step:1449/1480 train_time:231358ms step_avg:160.78ms step:1450/1480 train_time:231530ms step_avg:160.78ms step:1451/1480 train_time:231702ms step_avg:160.79ms step:1452/1480 train_time:231874ms step_avg:160.80ms step:1453/1480 train_time:232044ms step_avg:160.81ms step:1454/1480 train_time:232216ms step_avg:160.81ms step:1455/1480 train_time:232394ms step_avg:160.83ms step:1456/1480 train_time:232567ms step_avg:160.83ms step:1457/1480 train_time:232738ms step_avg:160.84ms step:1458/1480 train_time:232909ms step_avg:160.85ms step:1459/1480 train_time:233086ms step_avg:160.86ms step:1460/1480 train_time:233258ms step_avg:160.87ms step:1461/1480 train_time:233431ms step_avg:160.88ms step:1462/1480 train_time:233603ms step_avg:160.88ms step:1463/1480 train_time:233778ms step_avg:160.89ms step:1464/1480 train_time:233954ms step_avg:160.90ms step:1465/1480 train_time:234126ms step_avg:160.91ms step:1466/1480 train_time:234296ms step_avg:160.92ms step:1467/1480 train_time:234471ms step_avg:160.93ms step:1468/1480 train_time:234640ms step_avg:160.93ms step:1469/1480 train_time:234813ms step_avg:160.94ms step:1470/1480 train_time:234992ms step_avg:160.95ms step:1471/1480 train_time:235179ms step_avg:160.97ms step:1472/1480 train_time:235358ms step_avg:160.98ms step:1473/1480 train_time:235528ms step_avg:160.99ms step:1474/1480 train_time:235706ms step_avg:161.00ms step:1475/1480 train_time:235887ms step_avg:161.01ms step:1476/1480 
train_time:236058ms step_avg:161.02ms step:1477/1480 train_time:236244ms step_avg:161.04ms step:1478/1480 train_time:236428ms step_avg:161.05ms step:1479/1480 train_time:236603ms step_avg:161.06ms step:1480/1480 train_time:236776ms step_avg:161.07ms step:1480/1480 val_loss:3.2793 train_time:236846ms step_avg:161.12ms