import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learned scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding for inputs laid out as (batch, seq, head, head_dim).

    The cos/sin tables are cached and recomputed only when the sequence length changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # broadcast the (seq, head_dim/2) tables over the batch and head dimensions
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention using FlexAttention, with QK-norm, rotary embeddings and a
    learned mix between the layer's own values and externally supplied value embeddings.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Arguments:
            x: input of shape (1, T, dim); the batch size must be 1 (asserted).
            vi: value embeddings, reshaped to match the per-head value tensor.
            block_mask: FlexAttention block mask to apply.

        Returns a tensor with the same shape as x.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    Transformer block: a learned mix of the running activation with the initial embedding
    x0, followed by pre-norm attention and pre-norm MLP, each with a residual connection.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialized so that the block starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """
    GPT-style language model with a U-net layout: the first half of the blocks acts as an
    encoder whose outputs are fed, through learned skip weights, into the second half.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss for one flat token sequence.

        Arguments:
            idx: 1D tensor of input token ids (asserted); its length must be reshapeable
                into rows of BLOCK_SIZE (128) tokens.
            target: target token ids, flattened before the loss.
            sliding_window: tensor holding the attention window size in tokens.

        Attention is causal, restricted to a sliding window, and never crosses a document
        boundary; a boundary is counted at every token equal to 50256
        (presumably the GPT-2 end-of-text id -- confirm against the tokenizer).
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running document index of every token
        docs = (idx == 50256).cumsum(0)
        # first / last document index found in each block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)

        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level mask: a kv block is kept when it may contain any token the
            # token-level mask allows for the query block
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # the document ranges of the two blocks overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the kept kv block indices first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)

        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding slice per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256-int32 header into a pinned tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Streams (input, target) sequences of T tokens from a set of .bin shards.

    Within a shard, rank r starts at offset r*T and advances by T*num_processes per
    batch, so the ranks read disjoint interleaved windows. Shards are visited in sorted
    order and wrap around after the last one.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    # open() does not create parent directories, so make sure 'logs/' exists
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """
    Append `s` plus a newline to the log file, on the master process only.

    Unless `logonly` is true the message is also printed to stdout. All other
    processes do nothing.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not implemented: each rank processes exactly one sequence per step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# prefetch the first training batch
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# keep the CastedLinear weights in float32; they are cast to the activation dtype on the fly
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
# 2D weights of the blocks go to Muon, everything lower-dimensional to Adam
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]

# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the learning-rate multiplier for iteration `it`.

    The schedule is a linear warmup over `args.warmup_iters`, then a constant 1.0, then a
    linear cooldown to 0 over the last `args.cooldown_iters` iterations. With
    `cooldown_iters == 0` the multiplier stays at 1.0 through the final iteration
    instead of dividing by zero.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif args.cooldown_iters == 0 or it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size lives in a GPU tensor that is updated in place when the size changes
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE: the state is assembled, but the torch.save call below is commented out
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()

# Captured run log (verbatim text, commented out here; it continues on the following lines):
# ====================================================================================================
# Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
# nvidia-smi:
# Sun Dec 8 13:28:54 2024
# +---------------------------------------------------------------------------------------+
# | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 |
# |-----------------------------------------+----------------------+----------------------+
# | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
# | | | MIG M.
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 126W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 88W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 109W / 700W | 47MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 122W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI 
CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23674ms step_avg:nanms step:2/1480 train_time:23760ms step_avg:nanms step:3/1480 train_time:23899ms step_avg:nanms step:4/1480 train_time:24041ms step_avg:nanms step:5/1480 train_time:24183ms step_avg:nanms step:6/1480 train_time:24325ms step_avg:nanms step:7/1480 train_time:24465ms step_avg:nanms step:8/1480 train_time:24608ms step_avg:nanms step:9/1480 train_time:24756ms step_avg:nanms step:10/1480 train_time:24899ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.73ms step:14/1480 train_time:566ms step_avg:141.62ms step:15/1480 train_time:710ms step_avg:142.10ms step:16/1480 train_time:856ms step_avg:142.68ms step:17/1480 train_time:999ms step_avg:142.65ms step:18/1480 train_time:1140ms step_avg:142.54ms step:19/1480 train_time:1282ms step_avg:142.41ms step:20/1480 train_time:1426ms step_avg:142.57ms step:21/1480 train_time:1568ms step_avg:142.54ms step:22/1480 train_time:1711ms step_avg:142.61ms step:23/1480 train_time:1856ms step_avg:142.81ms step:24/1480 train_time:1999ms step_avg:142.77ms step:25/1480 train_time:2141ms step_avg:142.74ms step:26/1480 train_time:2283ms step_avg:142.70ms step:27/1480 train_time:2426ms step_avg:142.69ms step:28/1480 train_time:2569ms step_avg:142.75ms step:29/1480 train_time:2713ms 
step_avg:142.79ms step:30/1480 train_time:2856ms step_avg:142.80ms step:31/1480 train_time:2999ms step_avg:142.82ms step:32/1480 train_time:3141ms step_avg:142.78ms step:33/1480 train_time:3285ms step_avg:142.81ms step:34/1480 train_time:3427ms step_avg:142.77ms step:35/1480 train_time:3570ms step_avg:142.79ms step:36/1480 train_time:3714ms step_avg:142.84ms step:37/1480 train_time:3857ms step_avg:142.85ms step:38/1480 train_time:3998ms step_avg:142.79ms step:39/1480 train_time:4141ms step_avg:142.80ms step:40/1480 train_time:4285ms step_avg:142.85ms step:41/1480 train_time:4430ms step_avg:142.91ms step:42/1480 train_time:4574ms step_avg:142.94ms step:43/1480 train_time:4717ms step_avg:142.94ms step:44/1480 train_time:4860ms step_avg:142.94ms step:45/1480 train_time:5002ms step_avg:142.93ms step:46/1480 train_time:5145ms step_avg:142.91ms step:47/1480 train_time:5287ms step_avg:142.89ms step:48/1480 train_time:5431ms step_avg:142.93ms step:49/1480 train_time:5576ms step_avg:142.96ms step:50/1480 train_time:5717ms step_avg:142.93ms step:51/1480 train_time:5860ms step_avg:142.92ms step:52/1480 train_time:6001ms step_avg:142.89ms step:53/1480 train_time:6144ms step_avg:142.89ms step:54/1480 train_time:6289ms step_avg:142.92ms step:55/1480 train_time:6433ms step_avg:142.95ms step:56/1480 train_time:6580ms step_avg:143.04ms step:57/1480 train_time:6722ms step_avg:143.01ms step:58/1480 train_time:6865ms step_avg:143.03ms step:59/1480 train_time:7008ms step_avg:143.03ms step:60/1480 train_time:7151ms step_avg:143.03ms step:61/1480 train_time:7293ms step_avg:143.01ms step:62/1480 train_time:7436ms step_avg:143.00ms step:63/1480 train_time:7577ms step_avg:142.97ms step:64/1480 train_time:7719ms step_avg:142.95ms step:65/1480 train_time:7863ms step_avg:142.96ms step:66/1480 train_time:8006ms step_avg:142.96ms step:67/1480 train_time:8151ms step_avg:142.99ms step:68/1480 train_time:8294ms step_avg:143.00ms step:69/1480 train_time:8437ms step_avg:142.99ms step:70/1480 
train_time:8577ms step_avg:142.96ms step:71/1480 train_time:8721ms step_avg:142.96ms step:72/1480 train_time:8863ms step_avg:142.95ms step:73/1480 train_time:9008ms step_avg:142.98ms step:74/1480 train_time:9151ms step_avg:142.99ms step:75/1480 train_time:9296ms step_avg:143.02ms step:76/1480 train_time:9439ms step_avg:143.01ms step:77/1480 train_time:9580ms step_avg:142.98ms step:78/1480 train_time:9722ms step_avg:142.97ms step:79/1480 train_time:9864ms step_avg:142.95ms step:80/1480 train_time:10009ms step_avg:142.98ms step:81/1480 train_time:10153ms step_avg:142.99ms step:82/1480 train_time:10296ms step_avg:143.00ms step:83/1480 train_time:10438ms step_avg:142.99ms step:84/1480 train_time:10579ms step_avg:142.97ms step:85/1480 train_time:10721ms step_avg:142.95ms step:86/1480 train_time:10862ms step_avg:142.93ms step:87/1480 train_time:11005ms step_avg:142.93ms step:88/1480 train_time:11150ms step_avg:142.95ms step:89/1480 train_time:11294ms step_avg:142.96ms step:90/1480 train_time:11437ms step_avg:142.96ms step:91/1480 train_time:11579ms step_avg:142.94ms step:92/1480 train_time:11722ms step_avg:142.95ms step:93/1480 train_time:11865ms step_avg:142.96ms step:94/1480 train_time:12009ms step_avg:142.97ms step:95/1480 train_time:12154ms step_avg:142.99ms step:96/1480 train_time:12297ms step_avg:142.99ms step:97/1480 train_time:12440ms step_avg:142.99ms step:98/1480 train_time:12581ms step_avg:142.96ms step:99/1480 train_time:12722ms step_avg:142.95ms step:100/1480 train_time:12864ms step_avg:142.94ms step:101/1480 train_time:13007ms step_avg:142.93ms step:102/1480 train_time:13152ms step_avg:142.95ms step:103/1480 train_time:13295ms step_avg:142.96ms step:104/1480 train_time:13438ms step_avg:142.96ms step:105/1480 train_time:13579ms step_avg:142.94ms step:106/1480 train_time:13720ms step_avg:142.92ms step:107/1480 train_time:13862ms step_avg:142.91ms step:108/1480 train_time:14005ms step_avg:142.91ms step:109/1480 train_time:14149ms step_avg:142.92ms 
step:110/1480 train_time:14293ms step_avg:142.93ms step:111/1480 train_time:14439ms step_avg:142.96ms step:112/1480 train_time:14585ms step_avg:142.99ms step:113/1480 train_time:14733ms step_avg:143.04ms step:114/1480 train_time:14879ms step_avg:143.07ms step:115/1480 train_time:15025ms step_avg:143.09ms step:116/1480 train_time:15172ms step_avg:143.14ms step:117/1480 train_time:15320ms step_avg:143.17ms step:118/1480 train_time:15466ms step_avg:143.21ms step:119/1480 train_time:15614ms step_avg:143.25ms step:120/1480 train_time:15761ms step_avg:143.28ms step:121/1480 train_time:15907ms step_avg:143.30ms step:122/1480 train_time:16054ms step_avg:143.34ms step:123/1480 train_time:16201ms step_avg:143.37ms step:124/1480 train_time:16351ms step_avg:143.43ms step:125/1480 train_time:16498ms step_avg:143.46ms step:125/1480 val_loss:4.4153 train_time:16555ms step_avg:143.95ms step:126/1480 train_time:16651ms step_avg:143.54ms step:127/1480 train_time:16801ms step_avg:143.60ms step:128/1480 train_time:16949ms step_avg:143.63ms step:129/1480 train_time:17094ms step_avg:143.65ms step:130/1480 train_time:17240ms step_avg:143.67ms step:131/1480 train_time:17387ms step_avg:143.69ms step:132/1480 train_time:17533ms step_avg:143.72ms step:133/1480 train_time:17682ms step_avg:143.76ms step:134/1480 train_time:17830ms step_avg:143.79ms step:135/1480 train_time:17977ms step_avg:143.82ms step:136/1480 train_time:18124ms step_avg:143.84ms step:137/1480 train_time:18270ms step_avg:143.86ms step:138/1480 train_time:18416ms step_avg:143.88ms step:139/1480 train_time:18564ms step_avg:143.91ms step:140/1480 train_time:18710ms step_avg:143.93ms step:141/1480 train_time:18860ms step_avg:143.97ms step:142/1480 train_time:19008ms step_avg:144.00ms step:143/1480 train_time:19157ms step_avg:144.04ms step:144/1480 train_time:19304ms step_avg:144.06ms step:145/1480 train_time:19450ms step_avg:144.08ms step:146/1480 train_time:19598ms step_avg:144.10ms step:147/1480 train_time:19746ms 
step_avg:144.13ms step:148/1480 train_time:19893ms step_avg:144.15ms step:149/1480 train_time:20041ms step_avg:144.18ms step:150/1480 train_time:20188ms step_avg:144.20ms step:151/1480 train_time:20334ms step_avg:144.21ms step:152/1480 train_time:20481ms step_avg:144.23ms step:153/1480 train_time:20628ms step_avg:144.25ms step:154/1480 train_time:20776ms step_avg:144.27ms step:155/1480 train_time:20923ms step_avg:144.30ms step:156/1480 train_time:21071ms step_avg:144.32ms step:157/1480 train_time:21217ms step_avg:144.34ms step:158/1480 train_time:21366ms step_avg:144.36ms step:159/1480 train_time:21511ms step_avg:144.37ms step:160/1480 train_time:21659ms step_avg:144.39ms step:161/1480 train_time:21807ms step_avg:144.41ms step:162/1480 train_time:21955ms step_avg:144.44ms step:163/1480 train_time:22103ms step_avg:144.46ms step:164/1480 train_time:22249ms step_avg:144.48ms step:165/1480 train_time:22397ms step_avg:144.50ms step:166/1480 train_time:22544ms step_avg:144.51ms step:167/1480 train_time:22690ms step_avg:144.52ms step:168/1480 train_time:22839ms step_avg:144.55ms step:169/1480 train_time:22986ms step_avg:144.57ms step:170/1480 train_time:23132ms step_avg:144.57ms step:171/1480 train_time:23279ms step_avg:144.59ms step:172/1480 train_time:23426ms step_avg:144.60ms step:173/1480 train_time:23572ms step_avg:144.61ms step:174/1480 train_time:23719ms step_avg:144.63ms step:175/1480 train_time:23867ms step_avg:144.65ms step:176/1480 train_time:24014ms step_avg:144.66ms step:177/1480 train_time:24162ms step_avg:144.68ms step:178/1480 train_time:24310ms step_avg:144.70ms step:179/1480 train_time:24456ms step_avg:144.71ms step:180/1480 train_time:24603ms step_avg:144.73ms step:181/1480 train_time:24750ms step_avg:144.74ms step:182/1480 train_time:24896ms step_avg:144.75ms step:183/1480 train_time:25044ms step_avg:144.76ms step:184/1480 train_time:25191ms step_avg:144.78ms step:185/1480 train_time:25339ms step_avg:144.79ms step:186/1480 train_time:25487ms 
step_avg:144.81ms step:187/1480 train_time:25633ms step_avg:144.82ms step:188/1480 train_time:25780ms step_avg:144.83ms step:189/1480 train_time:25926ms step_avg:144.84ms step:190/1480 train_time:26071ms step_avg:144.84ms step:191/1480 train_time:26218ms step_avg:144.85ms step:192/1480 train_time:26366ms step_avg:144.87ms step:193/1480 train_time:26512ms step_avg:144.88ms step:194/1480 train_time:26662ms step_avg:144.90ms step:195/1480 train_time:26809ms step_avg:144.91ms step:196/1480 train_time:26956ms step_avg:144.92ms step:197/1480 train_time:27104ms step_avg:144.94ms step:198/1480 train_time:27251ms step_avg:144.95ms step:199/1480 train_time:27399ms step_avg:144.97ms step:200/1480 train_time:27547ms step_avg:144.98ms step:201/1480 train_time:27694ms step_avg:145.00ms step:202/1480 train_time:27842ms step_avg:145.01ms step:203/1480 train_time:27989ms step_avg:145.02ms step:204/1480 train_time:28136ms step_avg:145.03ms step:205/1480 train_time:28284ms step_avg:145.04ms step:206/1480 train_time:28430ms step_avg:145.05ms step:207/1480 train_time:28577ms step_avg:145.06ms step:208/1480 train_time:28725ms step_avg:145.07ms step:209/1480 train_time:28872ms step_avg:145.08ms step:210/1480 train_time:29019ms step_avg:145.09ms step:211/1480 train_time:29166ms step_avg:145.10ms step:212/1480 train_time:29311ms step_avg:145.10ms step:213/1480 train_time:29460ms step_avg:145.12ms step:214/1480 train_time:29607ms step_avg:145.13ms step:215/1480 train_time:29755ms step_avg:145.15ms step:216/1480 train_time:29903ms step_avg:145.16ms step:217/1480 train_time:30050ms step_avg:145.17ms step:218/1480 train_time:30196ms step_avg:145.17ms step:219/1480 train_time:30344ms step_avg:145.19ms step:220/1480 train_time:30491ms step_avg:145.20ms step:221/1480 train_time:30641ms step_avg:145.22ms step:222/1480 train_time:30793ms step_avg:145.25ms step:223/1480 train_time:30943ms step_avg:145.27ms step:224/1480 train_time:31094ms step_avg:145.30ms step:225/1480 train_time:31245ms 
step_avg:145.33ms step:226/1480 train_time:31395ms step_avg:145.35ms step:227/1480 train_time:31545ms step_avg:145.37ms step:228/1480 train_time:31697ms step_avg:145.40ms step:229/1480 train_time:31848ms step_avg:145.43ms step:230/1480 train_time:31998ms step_avg:145.45ms step:231/1480 train_time:32148ms step_avg:145.47ms step:232/1480 train_time:32300ms step_avg:145.50ms step:233/1480 train_time:32451ms step_avg:145.52ms step:234/1480 train_time:32603ms step_avg:145.55ms step:235/1480 train_time:32754ms step_avg:145.57ms step:236/1480 train_time:32905ms step_avg:145.60ms step:237/1480 train_time:33056ms step_avg:145.62ms step:238/1480 train_time:33206ms step_avg:145.64ms step:239/1480 train_time:33356ms step_avg:145.66ms step:240/1480 train_time:33506ms step_avg:145.68ms step:241/1480 train_time:33657ms step_avg:145.70ms step:242/1480 train_time:33807ms step_avg:145.72ms step:243/1480 train_time:33957ms step_avg:145.74ms step:244/1480 train_time:34108ms step_avg:145.76ms step:245/1480 train_time:34259ms step_avg:145.78ms step:246/1480 train_time:34412ms step_avg:145.81ms step:247/1480 train_time:34561ms step_avg:145.83ms step:248/1480 train_time:34710ms step_avg:145.84ms step:249/1480 train_time:34863ms step_avg:145.87ms step:250/1480 train_time:35015ms step_avg:145.90ms step:250/1480 val_loss:3.9977 train_time:35074ms step_avg:146.14ms step:251/1480 train_time:35172ms step_avg:145.94ms step:252/1480 train_time:35323ms step_avg:145.96ms step:253/1480 train_time:35474ms step_avg:145.98ms step:254/1480 train_time:35623ms step_avg:145.99ms step:255/1480 train_time:35772ms step_avg:146.01ms step:256/1480 train_time:35922ms step_avg:146.02ms step:257/1480 train_time:36072ms step_avg:146.04ms step:258/1480 train_time:36223ms step_avg:146.06ms step:259/1480 train_time:36375ms step_avg:146.09ms step:260/1480 train_time:36526ms step_avg:146.10ms step:261/1480 train_time:36677ms step_avg:146.12ms step:262/1480 train_time:36826ms step_avg:146.14ms step:263/1480 
train_time:36977ms step_avg:146.15ms step:264/1480 train_time:37127ms step_avg:146.17ms step:265/1480 train_time:37279ms step_avg:146.19ms step:266/1480 train_time:37430ms step_avg:146.21ms step:267/1480 train_time:37581ms step_avg:146.23ms step:268/1480 train_time:37731ms step_avg:146.24ms step:269/1480 train_time:37881ms step_avg:146.26ms step:270/1480 train_time:38031ms step_avg:146.27ms step:271/1480 train_time:38181ms step_avg:146.29ms step:272/1480 train_time:38331ms step_avg:146.30ms step:273/1480 train_time:38482ms step_avg:146.32ms step:274/1480 train_time:38633ms step_avg:146.34ms step:275/1480 train_time:38785ms step_avg:146.36ms step:276/1480 train_time:38935ms step_avg:146.37ms step:277/1480 train_time:39085ms step_avg:146.39ms step:278/1480 train_time:39235ms step_avg:146.40ms step:279/1480 train_time:39384ms step_avg:146.41ms step:280/1480 train_time:39536ms step_avg:146.43ms step:281/1480 train_time:39686ms step_avg:146.44ms step:282/1480 train_time:39838ms step_avg:146.46ms step:283/1480 train_time:39989ms step_avg:146.48ms step:284/1480 train_time:40140ms step_avg:146.50ms step:285/1480 train_time:40291ms step_avg:146.51ms step:286/1480 train_time:40441ms step_avg:146.53ms step:287/1480 train_time:40594ms step_avg:146.55ms step:288/1480 train_time:40744ms step_avg:146.56ms step:289/1480 train_time:40896ms step_avg:146.58ms step:290/1480 train_time:41047ms step_avg:146.60ms step:291/1480 train_time:41197ms step_avg:146.61ms step:292/1480 train_time:41346ms step_avg:146.62ms step:293/1480 train_time:41497ms step_avg:146.63ms step:294/1480 train_time:41647ms step_avg:146.64ms step:295/1480 train_time:41797ms step_avg:146.66ms step:296/1480 train_time:41948ms step_avg:146.67ms step:297/1480 train_time:42099ms step_avg:146.69ms step:298/1480 train_time:42249ms step_avg:146.70ms step:299/1480 train_time:42399ms step_avg:146.71ms step:300/1480 train_time:42549ms step_avg:146.72ms step:301/1480 train_time:42700ms step_avg:146.73ms step:302/1480 
train_time:42850ms step_avg:146.75ms step:303/1480 train_time:43001ms step_avg:146.76ms step:304/1480 train_time:43151ms step_avg:146.77ms step:305/1480 train_time:43302ms step_avg:146.79ms step:306/1480 train_time:43452ms step_avg:146.80ms step:307/1480 train_time:43604ms step_avg:146.82ms step:308/1480 train_time:43755ms step_avg:146.83ms step:309/1480 train_time:43906ms step_avg:146.84ms step:310/1480 train_time:44057ms step_avg:146.86ms step:311/1480 train_time:44207ms step_avg:146.87ms step:312/1480 train_time:44358ms step_avg:146.88ms step:313/1480 train_time:44509ms step_avg:146.89ms step:314/1480 train_time:44659ms step_avg:146.90ms step:315/1480 train_time:44809ms step_avg:146.92ms step:316/1480 train_time:44959ms step_avg:146.93ms step:317/1480 train_time:45110ms step_avg:146.94ms step:318/1480 train_time:45262ms step_avg:146.95ms step:319/1480 train_time:45413ms step_avg:146.97ms step:320/1480 train_time:45564ms step_avg:146.98ms step:321/1480 train_time:45714ms step_avg:146.99ms step:322/1480 train_time:45865ms step_avg:147.00ms step:323/1480 train_time:46016ms step_avg:147.02ms step:324/1480 train_time:46165ms step_avg:147.02ms step:325/1480 train_time:46315ms step_avg:147.03ms step:326/1480 train_time:46465ms step_avg:147.04ms step:327/1480 train_time:46617ms step_avg:147.06ms step:328/1480 train_time:46766ms step_avg:147.06ms step:329/1480 train_time:46917ms step_avg:147.07ms step:330/1480 train_time:47069ms step_avg:147.09ms step:331/1480 train_time:47223ms step_avg:147.11ms step:332/1480 train_time:47378ms step_avg:147.14ms step:333/1480 train_time:47533ms step_avg:147.16ms step:334/1480 train_time:47686ms step_avg:147.18ms step:335/1480 train_time:47840ms step_avg:147.20ms step:336/1480 train_time:47994ms step_avg:147.22ms step:337/1480 train_time:48147ms step_avg:147.24ms step:338/1480 train_time:48301ms step_avg:147.26ms step:339/1480 train_time:48455ms step_avg:147.28ms step:340/1480 train_time:48610ms step_avg:147.30ms step:341/1480 
train_time:48763ms step_avg:147.32ms step:342/1480 train_time:48917ms step_avg:147.34ms step:343/1480 train_time:49071ms step_avg:147.36ms step:344/1480 train_time:49225ms step_avg:147.38ms step:345/1480 train_time:49382ms step_avg:147.41ms step:346/1480 train_time:49537ms step_avg:147.43ms step:347/1480 train_time:49693ms step_avg:147.46ms step:348/1480 train_time:49845ms step_avg:147.47ms step:349/1480 train_time:49998ms step_avg:147.49ms step:350/1480 train_time:50153ms step_avg:147.51ms step:351/1480 train_time:50308ms step_avg:147.53ms step:352/1480 train_time:50461ms step_avg:147.55ms step:353/1480 train_time:50618ms step_avg:147.57ms step:354/1480 train_time:50771ms step_avg:147.59ms step:355/1480 train_time:50924ms step_avg:147.61ms step:356/1480 train_time:51078ms step_avg:147.62ms step:357/1480 train_time:51233ms step_avg:147.65ms step:358/1480 train_time:51386ms step_avg:147.66ms step:359/1480 train_time:51541ms step_avg:147.68ms step:360/1480 train_time:51696ms step_avg:147.70ms step:361/1480 train_time:51851ms step_avg:147.72ms step:362/1480 train_time:52004ms step_avg:147.74ms step:363/1480 train_time:52157ms step_avg:147.75ms step:364/1480 train_time:52311ms step_avg:147.77ms step:365/1480 train_time:52465ms step_avg:147.79ms step:366/1480 train_time:52618ms step_avg:147.80ms step:367/1480 train_time:52773ms step_avg:147.82ms step:368/1480 train_time:52928ms step_avg:147.84ms step:369/1480 train_time:53082ms step_avg:147.86ms step:370/1480 train_time:53234ms step_avg:147.87ms step:371/1480 train_time:53388ms step_avg:147.89ms step:372/1480 train_time:53542ms step_avg:147.91ms step:373/1480 train_time:53696ms step_avg:147.92ms step:374/1480 train_time:53851ms step_avg:147.94ms step:375/1480 train_time:54005ms step_avg:147.96ms step:375/1480 val_loss:3.8109 train_time:54065ms step_avg:148.12ms step:376/1480 train_time:54162ms step_avg:147.98ms step:377/1480 train_time:54317ms step_avg:148.00ms step:378/1480 train_time:54471ms step_avg:148.02ms 
step:379/1480 train_time:54624ms step_avg:148.03ms step:380/1480 train_time:54778ms step_avg:148.05ms step:381/1480 train_time:54930ms step_avg:148.06ms step:382/1480 train_time:55083ms step_avg:148.07ms step:383/1480 train_time:55238ms step_avg:148.09ms step:384/1480 train_time:55393ms step_avg:148.11ms step:385/1480 train_time:55545ms step_avg:148.12ms step:386/1480 train_time:55699ms step_avg:148.14ms step:387/1480 train_time:55852ms step_avg:148.15ms step:388/1480 train_time:56006ms step_avg:148.16ms step:389/1480 train_time:56160ms step_avg:148.18ms step:390/1480 train_time:56315ms step_avg:148.20ms step:391/1480 train_time:56470ms step_avg:148.22ms step:392/1480 train_time:56623ms step_avg:148.23ms step:393/1480 train_time:56777ms step_avg:148.24ms step:394/1480 train_time:56930ms step_avg:148.26ms step:395/1480 train_time:57084ms step_avg:148.27ms step:396/1480 train_time:57238ms step_avg:148.28ms step:397/1480 train_time:57392ms step_avg:148.30ms step:398/1480 train_time:57545ms step_avg:148.31ms step:399/1480 train_time:57699ms step_avg:148.33ms step:400/1480 train_time:57853ms step_avg:148.34ms step:401/1480 train_time:58006ms step_avg:148.35ms step:402/1480 train_time:58159ms step_avg:148.37ms step:403/1480 train_time:58317ms step_avg:148.39ms step:404/1480 train_time:58470ms step_avg:148.40ms step:405/1480 train_time:58624ms step_avg:148.41ms step:406/1480 train_time:58778ms step_avg:148.43ms step:407/1480 train_time:58931ms step_avg:148.44ms step:408/1480 train_time:59084ms step_avg:148.45ms step:409/1480 train_time:59237ms step_avg:148.46ms step:410/1480 train_time:59392ms step_avg:148.48ms step:411/1480 train_time:59547ms step_avg:148.50ms step:412/1480 train_time:59701ms step_avg:148.51ms step:413/1480 train_time:59854ms step_avg:148.52ms step:414/1480 train_time:60009ms step_avg:148.54ms step:415/1480 train_time:60161ms step_avg:148.55ms step:416/1480 train_time:60314ms step_avg:148.56ms step:417/1480 train_time:60469ms step_avg:148.57ms 
step:418/1480 train_time:60623ms step_avg:148.59ms step:419/1480 train_time:60778ms step_avg:148.60ms step:420/1480 train_time:60933ms step_avg:148.62ms step:421/1480 train_time:61087ms step_avg:148.63ms step:422/1480 train_time:61239ms step_avg:148.64ms step:423/1480 train_time:61393ms step_avg:148.65ms step:424/1480 train_time:61547ms step_avg:148.66ms step:425/1480 train_time:61701ms step_avg:148.68ms step:426/1480 train_time:61855ms step_avg:148.69ms step:427/1480 train_time:62011ms step_avg:148.71ms step:428/1480 train_time:62163ms step_avg:148.72ms step:429/1480 train_time:62319ms step_avg:148.73ms step:430/1480 train_time:62470ms step_avg:148.74ms step:431/1480 train_time:62624ms step_avg:148.75ms step:432/1480 train_time:62778ms step_avg:148.76ms step:433/1480 train_time:62931ms step_avg:148.77ms step:434/1480 train_time:63084ms step_avg:148.78ms step:435/1480 train_time:63238ms step_avg:148.79ms step:436/1480 train_time:63393ms step_avg:148.81ms step:437/1480 train_time:63545ms step_avg:148.82ms step:438/1480 train_time:63699ms step_avg:148.83ms step:439/1480 train_time:63853ms step_avg:148.84ms step:440/1480 train_time:64010ms step_avg:148.86ms step:441/1480 train_time:64166ms step_avg:148.88ms step:442/1480 train_time:64322ms step_avg:148.89ms step:443/1480 train_time:64477ms step_avg:148.91ms step:444/1480 train_time:64633ms step_avg:148.92ms step:445/1480 train_time:64789ms step_avg:148.94ms step:446/1480 train_time:64945ms step_avg:148.96ms step:447/1480 train_time:65101ms step_avg:148.97ms step:448/1480 train_time:65258ms step_avg:148.99ms step:449/1480 train_time:65417ms step_avg:149.01ms step:450/1480 train_time:65575ms step_avg:149.03ms step:451/1480 train_time:65734ms step_avg:149.06ms step:452/1480 train_time:65892ms step_avg:149.08ms step:453/1480 train_time:66047ms step_avg:149.09ms step:454/1480 train_time:66203ms step_avg:149.11ms step:455/1480 train_time:66359ms step_avg:149.12ms step:456/1480 train_time:66516ms step_avg:149.14ms 
step:457/1480 train_time:66673ms step_avg:149.16ms step:458/1480 train_time:66828ms step_avg:149.17ms step:459/1480 train_time:66987ms step_avg:149.19ms step:460/1480 train_time:67143ms step_avg:149.21ms step:461/1480 train_time:67301ms step_avg:149.23ms step:462/1480 train_time:67457ms step_avg:149.24ms step:463/1480 train_time:67615ms step_avg:149.26ms step:464/1480 train_time:67771ms step_avg:149.28ms step:465/1480 train_time:67927ms step_avg:149.29ms step:466/1480 train_time:68082ms step_avg:149.30ms step:467/1480 train_time:68239ms step_avg:149.32ms step:468/1480 train_time:68396ms step_avg:149.34ms step:469/1480 train_time:68553ms step_avg:149.35ms step:470/1480 train_time:68712ms step_avg:149.37ms step:471/1480 train_time:68868ms step_avg:149.39ms step:472/1480 train_time:69026ms step_avg:149.41ms step:473/1480 train_time:69183ms step_avg:149.42ms step:474/1480 train_time:69339ms step_avg:149.44ms step:475/1480 train_time:69495ms step_avg:149.45ms step:476/1480 train_time:69653ms step_avg:149.47ms step:477/1480 train_time:69813ms step_avg:149.49ms step:478/1480 train_time:69970ms step_avg:149.51ms step:479/1480 train_time:70126ms step_avg:149.52ms step:480/1480 train_time:70283ms step_avg:149.54ms step:481/1480 train_time:70439ms step_avg:149.55ms step:482/1480 train_time:70595ms step_avg:149.57ms step:483/1480 train_time:70753ms step_avg:149.58ms step:484/1480 train_time:70911ms step_avg:149.60ms step:485/1480 train_time:71069ms step_avg:149.62ms step:486/1480 train_time:71226ms step_avg:149.63ms step:487/1480 train_time:71383ms step_avg:149.65ms step:488/1480 train_time:71539ms step_avg:149.66ms step:489/1480 train_time:71695ms step_avg:149.68ms step:490/1480 train_time:71852ms step_avg:149.69ms step:491/1480 train_time:72009ms step_avg:149.71ms step:492/1480 train_time:72165ms step_avg:149.72ms step:493/1480 train_time:72322ms step_avg:149.73ms step:494/1480 train_time:72479ms step_avg:149.75ms step:495/1480 train_time:72636ms step_avg:149.76ms 
step:496/1480 train_time:72794ms step_avg:149.78ms step:497/1480 train_time:72952ms step_avg:149.80ms step:498/1480 train_time:73110ms step_avg:149.82ms step:499/1480 train_time:73268ms step_avg:149.83ms step:500/1480 train_time:73425ms step_avg:149.85ms step:500/1480 val_loss:3.6890 train_time:73487ms step_avg:149.97ms step:501/1480 train_time:73586ms step_avg:149.87ms step:502/1480 train_time:73743ms step_avg:149.88ms step:503/1480 train_time:73900ms step_avg:149.90ms step:504/1480 train_time:74056ms step_avg:149.91ms step:505/1480 train_time:74210ms step_avg:149.92ms step:506/1480 train_time:74367ms step_avg:149.93ms step:507/1480 train_time:74525ms step_avg:149.95ms step:508/1480 train_time:74684ms step_avg:149.97ms step:509/1480 train_time:74841ms step_avg:149.98ms step:510/1480 train_time:75001ms step_avg:150.00ms step:511/1480 train_time:75157ms step_avg:150.01ms step:512/1480 train_time:75315ms step_avg:150.03ms step:513/1480 train_time:75470ms step_avg:150.04ms step:514/1480 train_time:75627ms step_avg:150.05ms step:515/1480 train_time:75784ms step_avg:150.07ms step:516/1480 train_time:75943ms step_avg:150.09ms step:517/1480 train_time:76102ms step_avg:150.10ms step:518/1480 train_time:76260ms step_avg:150.12ms step:519/1480 train_time:76417ms step_avg:150.13ms step:520/1480 train_time:76575ms step_avg:150.15ms step:521/1480 train_time:76731ms step_avg:150.16ms step:522/1480 train_time:76887ms step_avg:150.17ms step:523/1480 train_time:77044ms step_avg:150.18ms step:524/1480 train_time:77202ms step_avg:150.20ms step:525/1480 train_time:77360ms step_avg:150.21ms step:526/1480 train_time:77517ms step_avg:150.23ms step:527/1480 train_time:77675ms step_avg:150.24ms step:528/1480 train_time:77831ms step_avg:150.25ms step:529/1480 train_time:77987ms step_avg:150.26ms step:530/1480 train_time:78145ms step_avg:150.28ms step:531/1480 train_time:78304ms step_avg:150.29ms step:532/1480 train_time:78462ms step_avg:150.31ms step:533/1480 train_time:78618ms 
step_avg:150.32ms step:534/1480 train_time:78775ms step_avg:150.33ms step:535/1480 train_time:78935ms step_avg:150.35ms step:536/1480 train_time:79088ms step_avg:150.36ms step:537/1480 train_time:79244ms step_avg:150.37ms step:538/1480 train_time:79403ms step_avg:150.38ms step:539/1480 train_time:79561ms step_avg:150.40ms step:540/1480 train_time:79719ms step_avg:150.41ms step:541/1480 train_time:79876ms step_avg:150.43ms step:542/1480 train_time:80033ms step_avg:150.44ms step:543/1480 train_time:80189ms step_avg:150.45ms step:544/1480 train_time:80344ms step_avg:150.46ms step:545/1480 train_time:80502ms step_avg:150.47ms step:546/1480 train_time:80659ms step_avg:150.48ms step:547/1480 train_time:80815ms step_avg:150.49ms step:548/1480 train_time:80972ms step_avg:150.51ms step:549/1480 train_time:81129ms step_avg:150.52ms step:550/1480 train_time:81287ms step_avg:150.53ms step:551/1480 train_time:81444ms step_avg:150.54ms step:552/1480 train_time:81604ms step_avg:150.56ms step:553/1480 train_time:81765ms step_avg:150.58ms step:554/1480 train_time:81925ms step_avg:150.60ms step:555/1480 train_time:82086ms step_avg:150.62ms step:556/1480 train_time:82244ms step_avg:150.63ms step:557/1480 train_time:82406ms step_avg:150.65ms step:558/1480 train_time:82564ms step_avg:150.67ms step:559/1480 train_time:82725ms step_avg:150.68ms step:560/1480 train_time:82884ms step_avg:150.70ms step:561/1480 train_time:83043ms step_avg:150.71ms step:562/1480 train_time:83204ms step_avg:150.73ms step:563/1480 train_time:83364ms step_avg:150.75ms step:564/1480 train_time:83522ms step_avg:150.76ms step:565/1480 train_time:83683ms step_avg:150.78ms step:566/1480 train_time:83844ms step_avg:150.80ms step:567/1480 train_time:84004ms step_avg:150.81ms step:568/1480 train_time:84164ms step_avg:150.83ms step:569/1480 train_time:84323ms step_avg:150.85ms step:570/1480 train_time:84483ms step_avg:150.86ms step:571/1480 train_time:84643ms step_avg:150.88ms step:572/1480 train_time:84803ms 
step_avg:150.90ms step:573/1480 train_time:84963ms step_avg:150.91ms step:574/1480 train_time:85124ms step_avg:150.93ms step:575/1480 train_time:85285ms step_avg:150.95ms step:576/1480 train_time:85444ms step_avg:150.96ms step:577/1480 train_time:85604ms step_avg:150.98ms step:578/1480 train_time:85763ms step_avg:150.99ms step:579/1480 train_time:85924ms step_avg:151.01ms step:580/1480 train_time:86084ms step_avg:151.02ms step:581/1480 train_time:86245ms step_avg:151.04ms step:582/1480 train_time:86406ms step_avg:151.06ms step:583/1480 train_time:86565ms step_avg:151.07ms step:584/1480 train_time:86723ms step_avg:151.09ms step:585/1480 train_time:86883ms step_avg:151.10ms step:586/1480 train_time:87043ms step_avg:151.12ms step:587/1480 train_time:87203ms step_avg:151.13ms step:588/1480 train_time:87362ms step_avg:151.14ms step:589/1480 train_time:87522ms step_avg:151.16ms step:590/1480 train_time:87683ms step_avg:151.18ms step:591/1480 train_time:87842ms step_avg:151.19ms step:592/1480 train_time:88003ms step_avg:151.21ms step:593/1480 train_time:88163ms step_avg:151.22ms step:594/1480 train_time:88324ms step_avg:151.24ms step:595/1480 train_time:88485ms step_avg:151.26ms step:596/1480 train_time:88646ms step_avg:151.27ms step:597/1480 train_time:88805ms step_avg:151.29ms step:598/1480 train_time:88963ms step_avg:151.30ms step:599/1480 train_time:89122ms step_avg:151.31ms step:600/1480 train_time:89283ms step_avg:151.33ms step:601/1480 train_time:89443ms step_avg:151.34ms step:602/1480 train_time:89603ms step_avg:151.36ms step:603/1480 train_time:89763ms step_avg:151.37ms step:604/1480 train_time:89923ms step_avg:151.39ms step:605/1480 train_time:90083ms step_avg:151.40ms step:606/1480 train_time:90245ms step_avg:151.42ms step:607/1480 train_time:90407ms step_avg:151.44ms step:608/1480 train_time:90565ms step_avg:151.45ms step:609/1480 train_time:90725ms step_avg:151.46ms step:610/1480 train_time:90884ms step_avg:151.47ms step:611/1480 train_time:91043ms 
step_avg:151.49ms step:612/1480 train_time:91205ms step_avg:151.50ms step:613/1480 train_time:91366ms step_avg:151.52ms step:614/1480 train_time:91526ms step_avg:151.53ms step:615/1480 train_time:91684ms step_avg:151.54ms step:616/1480 train_time:91843ms step_avg:151.56ms step:617/1480 train_time:92004ms step_avg:151.57ms step:618/1480 train_time:92163ms step_avg:151.58ms step:619/1480 train_time:92323ms step_avg:151.60ms step:620/1480 train_time:92483ms step_avg:151.61ms step:621/1480 train_time:92643ms step_avg:151.62ms step:622/1480 train_time:92803ms step_avg:151.64ms step:623/1480 train_time:92965ms step_avg:151.66ms step:624/1480 train_time:93124ms step_avg:151.67ms step:625/1480 train_time:93284ms step_avg:151.68ms step:625/1480 val_loss:3.6068 train_time:93347ms step_avg:151.78ms step:626/1480 train_time:93448ms step_avg:151.70ms step:627/1480 train_time:93609ms step_avg:151.72ms step:628/1480 train_time:93768ms step_avg:151.73ms step:629/1480 train_time:93927ms step_avg:151.74ms step:630/1480 train_time:94086ms step_avg:151.75ms step:631/1480 train_time:94244ms step_avg:151.76ms step:632/1480 train_time:94404ms step_avg:151.77ms step:633/1480 train_time:94563ms step_avg:151.79ms step:634/1480 train_time:94723ms step_avg:151.80ms step:635/1480 train_time:94883ms step_avg:151.81ms step:636/1480 train_time:95042ms step_avg:151.82ms step:637/1480 train_time:95202ms step_avg:151.84ms step:638/1480 train_time:95362ms step_avg:151.85ms step:639/1480 train_time:95520ms step_avg:151.86ms step:640/1480 train_time:95677ms step_avg:151.87ms step:641/1480 train_time:95836ms step_avg:151.88ms step:642/1480 train_time:95994ms step_avg:151.89ms step:643/1480 train_time:96153ms step_avg:151.90ms step:644/1480 train_time:96310ms step_avg:151.91ms step:645/1480 train_time:96470ms step_avg:151.92ms step:646/1480 train_time:96630ms step_avg:151.93ms step:647/1480 train_time:96790ms step_avg:151.95ms step:648/1480 train_time:96952ms step_avg:151.96ms step:649/1480 
train_time:97111ms step_avg:151.97ms step:650/1480 train_time:97273ms step_avg:151.99ms step:651/1480 train_time:97432ms step_avg:152.00ms step:652/1480 train_time:97592ms step_avg:152.01ms step:653/1480 train_time:97751ms step_avg:152.02ms step:654/1480 train_time:97910ms step_avg:152.03ms step:655/1480 train_time:98071ms step_avg:152.05ms step:656/1480 train_time:98231ms step_avg:152.06ms step:657/1480 train_time:98392ms step_avg:152.07ms step:658/1480 train_time:98552ms step_avg:152.09ms step:659/1480 train_time:98713ms step_avg:152.10ms step:660/1480 train_time:98875ms step_avg:152.12ms step:661/1480 train_time:99036ms step_avg:152.13ms step:662/1480 train_time:99196ms step_avg:152.14ms step:663/1480 train_time:99356ms step_avg:152.15ms step:664/1480 train_time:99517ms step_avg:152.17ms step:665/1480 train_time:99678ms step_avg:152.18ms step:666/1480 train_time:99839ms step_avg:152.19ms step:667/1480 train_time:100000ms step_avg:152.21ms step:668/1480 train_time:100162ms step_avg:152.22ms step:669/1480 train_time:100325ms step_avg:152.24ms step:670/1480 train_time:100484ms step_avg:152.25ms step:671/1480 train_time:100646ms step_avg:152.26ms step:672/1480 train_time:100807ms step_avg:152.28ms step:673/1480 train_time:100972ms step_avg:152.30ms step:674/1480 train_time:101134ms step_avg:152.31ms step:675/1480 train_time:101297ms step_avg:152.33ms step:676/1480 train_time:101457ms step_avg:152.34ms step:677/1480 train_time:101617ms step_avg:152.35ms step:678/1480 train_time:101778ms step_avg:152.36ms step:679/1480 train_time:101939ms step_avg:152.37ms step:680/1480 train_time:102100ms step_avg:152.39ms step:681/1480 train_time:102261ms step_avg:152.40ms step:682/1480 train_time:102425ms step_avg:152.42ms step:683/1480 train_time:102587ms step_avg:152.43ms step:684/1480 train_time:102749ms step_avg:152.45ms step:685/1480 train_time:102913ms step_avg:152.46ms step:686/1480 train_time:103074ms step_avg:152.48ms step:687/1480 train_time:103235ms step_avg:152.49ms 
step:688/1480 train_time:103397ms step_avg:152.50ms step:689/1480 train_time:103559ms step_avg:152.52ms step:690/1480 train_time:103726ms step_avg:152.54ms step:691/1480 train_time:103887ms step_avg:152.55ms step:692/1480 train_time:104049ms step_avg:152.56ms step:693/1480 train_time:104211ms step_avg:152.58ms step:694/1480 train_time:104374ms step_avg:152.59ms step:695/1480 train_time:104535ms step_avg:152.61ms step:696/1480 train_time:104696ms step_avg:152.62ms step:697/1480 train_time:104861ms step_avg:152.64ms step:698/1480 train_time:105022ms step_avg:152.65ms step:699/1480 train_time:105184ms step_avg:152.66ms step:700/1480 train_time:105348ms step_avg:152.68ms step:701/1480 train_time:105508ms step_avg:152.69ms step:702/1480 train_time:105670ms step_avg:152.70ms step:703/1480 train_time:105832ms step_avg:152.72ms step:704/1480 train_time:105992ms step_avg:152.73ms step:705/1480 train_time:106155ms step_avg:152.74ms step:706/1480 train_time:106319ms step_avg:152.76ms step:707/1480 train_time:106481ms step_avg:152.77ms step:708/1480 train_time:106643ms step_avg:152.78ms step:709/1480 train_time:106805ms step_avg:152.80ms step:710/1480 train_time:106967ms step_avg:152.81ms step:711/1480 train_time:107129ms step_avg:152.82ms step:712/1480 train_time:107295ms step_avg:152.84ms step:713/1480 train_time:107458ms step_avg:152.86ms step:714/1480 train_time:107618ms step_avg:152.87ms step:715/1480 train_time:107777ms step_avg:152.88ms step:716/1480 train_time:107937ms step_avg:152.89ms step:717/1480 train_time:108099ms step_avg:152.90ms step:718/1480 train_time:108258ms step_avg:152.91ms step:719/1480 train_time:108419ms step_avg:152.92ms step:720/1480 train_time:108580ms step_avg:152.93ms step:721/1480 train_time:108741ms step_avg:152.94ms step:722/1480 train_time:108902ms step_avg:152.95ms step:723/1480 train_time:109063ms step_avg:152.96ms step:724/1480 train_time:109225ms step_avg:152.98ms step:725/1480 train_time:109391ms step_avg:152.99ms step:726/1480 
train_time:109555ms step_avg:153.01ms step:727/1480 train_time:109719ms step_avg:153.02ms step:728/1480 train_time:109879ms step_avg:153.03ms step:729/1480 train_time:110040ms step_avg:153.05ms step:730/1480 train_time:110201ms step_avg:153.06ms step:731/1480 train_time:110361ms step_avg:153.07ms step:732/1480 train_time:110522ms step_avg:153.08ms step:733/1480 train_time:110684ms step_avg:153.09ms step:734/1480 train_time:110847ms step_avg:153.10ms step:735/1480 train_time:111010ms step_avg:153.12ms step:736/1480 train_time:111173ms step_avg:153.13ms step:737/1480 train_time:111334ms step_avg:153.14ms step:738/1480 train_time:111495ms step_avg:153.15ms step:739/1480 train_time:111655ms step_avg:153.16ms step:740/1480 train_time:111820ms step_avg:153.18ms step:741/1480 train_time:111981ms step_avg:153.19ms step:742/1480 train_time:112144ms step_avg:153.20ms step:743/1480 train_time:112305ms step_avg:153.21ms step:744/1480 train_time:112470ms step_avg:153.23ms step:745/1480 train_time:112634ms step_avg:153.24ms step:746/1480 train_time:112794ms step_avg:153.25ms step:747/1480 train_time:112955ms step_avg:153.26ms step:748/1480 train_time:113120ms step_avg:153.28ms step:749/1480 train_time:113283ms step_avg:153.29ms step:750/1480 train_time:113443ms step_avg:153.30ms step:750/1480 val_loss:3.5532 train_time:113507ms step_avg:153.39ms step:751/1480 train_time:113608ms step_avg:153.32ms step:752/1480 train_time:113770ms step_avg:153.33ms step:753/1480 train_time:113933ms step_avg:153.34ms step:754/1480 train_time:114094ms step_avg:153.35ms step:755/1480 train_time:114255ms step_avg:153.36ms step:756/1480 train_time:114418ms step_avg:153.38ms step:757/1480 train_time:114583ms step_avg:153.39ms step:758/1480 train_time:114743ms step_avg:153.40ms step:759/1480 train_time:114907ms step_avg:153.41ms step:760/1480 train_time:115069ms step_avg:153.43ms step:761/1480 train_time:115233ms step_avg:153.44ms step:762/1480 train_time:115396ms step_avg:153.45ms step:763/1480 
train_time:115558ms step_avg:153.46ms step:764/1480 train_time:115720ms step_avg:153.47ms step:765/1480 train_time:115880ms step_avg:153.48ms step:766/1480 train_time:116042ms step_avg:153.49ms step:767/1480 train_time:116203ms step_avg:153.50ms step:768/1480 train_time:116363ms step_avg:153.51ms step:769/1480 train_time:116528ms step_avg:153.53ms step:770/1480 train_time:116692ms step_avg:153.54ms step:771/1480 train_time:116856ms step_avg:153.56ms step:772/1480 train_time:117019ms step_avg:153.57ms step:773/1480 train_time:117180ms step_avg:153.58ms step:774/1480 train_time:117342ms step_avg:153.59ms step:775/1480 train_time:117503ms step_avg:153.60ms step:776/1480 train_time:117668ms step_avg:153.61ms step:777/1480 train_time:117836ms step_avg:153.63ms step:778/1480 train_time:117998ms step_avg:153.64ms step:779/1480 train_time:118162ms step_avg:153.66ms step:780/1480 train_time:118324ms step_avg:153.67ms step:781/1480 train_time:118485ms step_avg:153.68ms step:782/1480 train_time:118649ms step_avg:153.69ms step:783/1480 train_time:118811ms step_avg:153.70ms step:784/1480 train_time:118975ms step_avg:153.71ms step:785/1480 train_time:119138ms step_avg:153.73ms step:786/1480 train_time:119302ms step_avg:153.74ms step:787/1480 train_time:119466ms step_avg:153.75ms step:788/1480 train_time:119633ms step_avg:153.77ms step:789/1480 train_time:119795ms step_avg:153.78ms step:790/1480 train_time:119961ms step_avg:153.80ms step:791/1480 train_time:120127ms step_avg:153.81ms step:792/1480 train_time:120294ms step_avg:153.83ms step:793/1480 train_time:120456ms step_avg:153.84ms step:794/1480 train_time:120621ms step_avg:153.85ms step:795/1480 train_time:120786ms step_avg:153.87ms step:796/1480 train_time:120953ms step_avg:153.88ms step:797/1480 train_time:121117ms step_avg:153.90ms step:798/1480 train_time:121281ms step_avg:153.91ms step:799/1480 train_time:121445ms step_avg:153.92ms step:800/1480 train_time:121608ms step_avg:153.93ms step:801/1480 train_time:121772ms 
step_avg:153.95ms step:802/1480 train_time:121941ms step_avg:153.97ms step:803/1480 train_time:122103ms step_avg:153.98ms step:804/1480 train_time:122265ms step_avg:153.99ms step:805/1480 train_time:122432ms step_avg:154.00ms step:806/1480 train_time:122595ms step_avg:154.01ms step:807/1480 train_time:122757ms step_avg:154.02ms step:808/1480 train_time:122920ms step_avg:154.04ms step:809/1480 train_time:123082ms step_avg:154.04ms step:810/1480 train_time:123242ms step_avg:154.05ms step:811/1480 train_time:123405ms step_avg:154.06ms step:812/1480 train_time:123568ms step_avg:154.07ms step:813/1480 train_time:123729ms step_avg:154.08ms step:814/1480 train_time:123892ms step_avg:154.10ms step:815/1480 train_time:124055ms step_avg:154.11ms step:816/1480 train_time:124220ms step_avg:154.12ms step:817/1480 train_time:124382ms step_avg:154.13ms step:818/1480 train_time:124543ms step_avg:154.14ms step:819/1480 train_time:124708ms step_avg:154.15ms step:820/1480 train_time:124872ms step_avg:154.16ms step:821/1480 train_time:125035ms step_avg:154.17ms step:822/1480 train_time:125198ms step_avg:154.18ms step:823/1480 train_time:125359ms step_avg:154.19ms step:824/1480 train_time:125522ms step_avg:154.20ms step:825/1480 train_time:125686ms step_avg:154.22ms step:826/1480 train_time:125851ms step_avg:154.23ms step:827/1480 train_time:126017ms step_avg:154.24ms step:828/1480 train_time:126179ms step_avg:154.25ms step:829/1480 train_time:126343ms step_avg:154.27ms step:830/1480 train_time:126507ms step_avg:154.28ms step:831/1480 train_time:126671ms step_avg:154.29ms step:832/1480 train_time:126835ms step_avg:154.30ms step:833/1480 train_time:127000ms step_avg:154.31ms step:834/1480 train_time:127164ms step_avg:154.33ms step:835/1480 train_time:127329ms step_avg:154.34ms step:836/1480 train_time:127495ms step_avg:154.35ms step:837/1480 train_time:127657ms step_avg:154.36ms step:838/1480 train_time:127821ms step_avg:154.37ms step:839/1480 train_time:127982ms step_avg:154.38ms 
step:840/1480 train_time:128143ms step_avg:154.39ms step:841/1480 train_time:128303ms step_avg:154.40ms step:842/1480 train_time:128470ms step_avg:154.41ms step:843/1480 train_time:128632ms step_avg:154.42ms step:844/1480 train_time:128795ms step_avg:154.43ms step:845/1480 train_time:128960ms step_avg:154.44ms step:846/1480 train_time:129125ms step_avg:154.46ms step:847/1480 train_time:129289ms step_avg:154.47ms step:848/1480 train_time:129451ms step_avg:154.48ms step:849/1480 train_time:129615ms step_avg:154.49ms step:850/1480 train_time:129778ms step_avg:154.50ms step:851/1480 train_time:129943ms step_avg:154.51ms step:852/1480 train_time:130104ms step_avg:154.52ms step:853/1480 train_time:130267ms step_avg:154.53ms step:854/1480 train_time:130431ms step_avg:154.54ms step:855/1480 train_time:130596ms step_avg:154.55ms step:856/1480 train_time:130758ms step_avg:154.56ms step:857/1480 train_time:130924ms step_avg:154.57ms step:858/1480 train_time:131091ms step_avg:154.59ms step:859/1480 train_time:131256ms step_avg:154.60ms step:860/1480 train_time:131418ms step_avg:154.61ms step:861/1480 train_time:131584ms step_avg:154.62ms step:862/1480 train_time:131755ms step_avg:154.64ms step:863/1480 train_time:131923ms step_avg:154.66ms step:864/1480 train_time:132087ms step_avg:154.67ms step:865/1480 train_time:132249ms step_avg:154.68ms step:866/1480 train_time:132417ms step_avg:154.69ms step:867/1480 train_time:132579ms step_avg:154.70ms step:868/1480 train_time:132740ms step_avg:154.71ms step:869/1480 train_time:132902ms step_avg:154.72ms step:870/1480 train_time:133067ms step_avg:154.73ms step:871/1480 train_time:133230ms step_avg:154.74ms step:872/1480 train_time:133394ms step_avg:154.75ms step:873/1480 train_time:133557ms step_avg:154.76ms step:874/1480 train_time:133724ms step_avg:154.77ms step:875/1480 train_time:133889ms step_avg:154.78ms step:875/1480 val_loss:3.5064 train_time:133953ms step_avg:154.86ms step:876/1480 train_time:134053ms step_avg:154.80ms 
step:877/1480 train_time:134219ms step_avg:154.81ms step:878/1480 train_time:134383ms step_avg:154.82ms step:879/1480 train_time:134546ms step_avg:154.83ms step:880/1480 train_time:134708ms step_avg:154.84ms step:881/1480 train_time:134870ms step_avg:154.84ms step:882/1480 train_time:135034ms step_avg:154.86ms step:883/1480 train_time:135201ms step_avg:154.87ms step:884/1480 train_time:135367ms step_avg:154.88ms step:885/1480 train_time:135531ms step_avg:154.89ms step:886/1480 train_time:135697ms step_avg:154.91ms step:887/1480 train_time:135867ms step_avg:154.92ms step:888/1480 train_time:136040ms step_avg:154.94ms step:889/1480 train_time:136209ms step_avg:154.96ms step:890/1480 train_time:136371ms step_avg:154.97ms step:891/1480 train_time:136537ms step_avg:154.98ms step:892/1480 train_time:136703ms step_avg:154.99ms step:893/1480 train_time:136865ms step_avg:155.00ms step:894/1480 train_time:137032ms step_avg:155.01ms step:895/1480 train_time:137200ms step_avg:155.03ms step:896/1480 train_time:137365ms step_avg:155.04ms step:897/1480 train_time:137531ms step_avg:155.05ms step:898/1480 train_time:137697ms step_avg:155.06ms step:899/1480 train_time:137862ms step_avg:155.07ms step:900/1480 train_time:138025ms step_avg:155.08ms step:901/1480 train_time:138189ms step_avg:155.09ms step:902/1480 train_time:138352ms step_avg:155.10ms step:903/1480 train_time:138523ms step_avg:155.12ms step:904/1480 train_time:138688ms step_avg:155.13ms step:905/1480 train_time:138849ms step_avg:155.14ms step:906/1480 train_time:139015ms step_avg:155.15ms step:907/1480 train_time:139183ms step_avg:155.17ms step:908/1480 train_time:139346ms step_avg:155.17ms step:909/1480 train_time:139510ms step_avg:155.18ms step:910/1480 train_time:139681ms step_avg:155.20ms step:911/1480 train_time:139846ms step_avg:155.21ms step:912/1480 train_time:140011ms step_avg:155.22ms step:913/1480 train_time:140178ms step_avg:155.24ms step:914/1480 train_time:140345ms step_avg:155.25ms step:915/1480 
train_time:140514ms step_avg:155.26ms step:916/1480 train_time:140680ms step_avg:155.28ms step:917/1480 train_time:140843ms step_avg:155.28ms step:918/1480 train_time:141010ms step_avg:155.30ms step:919/1480 train_time:141181ms step_avg:155.31ms step:920/1480 train_time:141346ms step_avg:155.33ms step:921/1480 train_time:141512ms step_avg:155.34ms step:922/1480 train_time:141681ms step_avg:155.35ms step:923/1480 train_time:141844ms step_avg:155.36ms step:924/1480 train_time:142008ms step_avg:155.37ms step:925/1480 train_time:142173ms step_avg:155.38ms step:926/1480 train_time:142336ms step_avg:155.39ms step:927/1480 train_time:142501ms step_avg:155.40ms step:928/1480 train_time:142667ms step_avg:155.41ms step:929/1480 train_time:142832ms step_avg:155.42ms step:930/1480 train_time:142998ms step_avg:155.43ms step:931/1480 train_time:143162ms step_avg:155.44ms step:932/1480 train_time:143327ms step_avg:155.45ms step:933/1480 train_time:143495ms step_avg:155.47ms step:934/1480 train_time:143663ms step_avg:155.48ms step:935/1480 train_time:143834ms step_avg:155.50ms step:936/1480 train_time:144000ms step_avg:155.51ms step:937/1480 train_time:144170ms step_avg:155.52ms step:938/1480 train_time:144332ms step_avg:155.53ms step:939/1480 train_time:144501ms step_avg:155.55ms step:940/1480 train_time:144668ms step_avg:155.56ms step:941/1480 train_time:144832ms step_avg:155.57ms step:942/1480 train_time:144998ms step_avg:155.58ms step:943/1480 train_time:145168ms step_avg:155.59ms step:944/1480 train_time:145341ms step_avg:155.61ms step:945/1480 train_time:145504ms step_avg:155.62ms step:946/1480 train_time:145672ms step_avg:155.63ms step:947/1480 train_time:145841ms step_avg:155.65ms step:948/1480 train_time:146007ms step_avg:155.66ms step:949/1480 train_time:146171ms step_avg:155.67ms step:950/1480 train_time:146335ms step_avg:155.68ms step:951/1480 train_time:146503ms step_avg:155.69ms step:952/1480 train_time:146668ms step_avg:155.70ms step:953/1480 train_time:146836ms 
step_avg:155.71ms step:954/1480 train_time:147004ms step_avg:155.73ms step:955/1480 train_time:147169ms step_avg:155.73ms step:956/1480 train_time:147334ms step_avg:155.74ms step:957/1480 train_time:147502ms step_avg:155.76ms step:958/1480 train_time:147670ms step_avg:155.77ms step:959/1480 train_time:147836ms step_avg:155.78ms step:960/1480 train_time:148003ms step_avg:155.79ms step:961/1480 train_time:148168ms step_avg:155.80ms step:962/1480 train_time:148332ms step_avg:155.81ms step:963/1480 train_time:148497ms step_avg:155.82ms step:964/1480 train_time:148666ms step_avg:155.83ms step:965/1480 train_time:148829ms step_avg:155.84ms step:966/1480 train_time:148994ms step_avg:155.85ms step:967/1480 train_time:149157ms step_avg:155.86ms step:968/1480 train_time:149322ms step_avg:155.87ms step:969/1480 train_time:149488ms step_avg:155.88ms step:970/1480 train_time:149651ms step_avg:155.89ms step:971/1480 train_time:149817ms step_avg:155.90ms step:972/1480 train_time:149982ms step_avg:155.91ms step:973/1480 train_time:150146ms step_avg:155.91ms step:974/1480 train_time:150314ms step_avg:155.93ms step:975/1480 train_time:150482ms step_avg:155.94ms step:976/1480 train_time:150648ms step_avg:155.95ms step:977/1480 train_time:150811ms step_avg:155.96ms step:978/1480 train_time:150978ms step_avg:155.97ms step:979/1480 train_time:151144ms step_avg:155.98ms step:980/1480 train_time:151310ms step_avg:155.99ms step:981/1480 train_time:151481ms step_avg:156.00ms step:982/1480 train_time:151643ms step_avg:156.01ms step:983/1480 train_time:151807ms step_avg:156.02ms step:984/1480 train_time:151971ms step_avg:156.03ms step:985/1480 train_time:152140ms step_avg:156.04ms step:986/1480 train_time:152305ms step_avg:156.05ms step:987/1480 train_time:152468ms step_avg:156.06ms step:988/1480 train_time:152635ms step_avg:156.07ms step:989/1480 train_time:152802ms step_avg:156.08ms step:990/1480 train_time:152971ms step_avg:156.09ms step:991/1480 train_time:153138ms step_avg:156.10ms 
step:992/1480 train_time:153313ms step_avg:156.12ms step:993/1480 train_time:153488ms step_avg:156.14ms step:994/1480 train_time:153653ms step_avg:156.15ms step:995/1480 train_time:153819ms step_avg:156.16ms step:996/1480 train_time:153982ms step_avg:156.17ms step:997/1480 train_time:154146ms step_avg:156.18ms step:998/1480 train_time:154308ms step_avg:156.18ms step:999/1480 train_time:154475ms step_avg:156.19ms step:1000/1480 train_time:154643ms step_avg:156.21ms step:1000/1480 val_loss:3.4428 train_time:154712ms step_avg:156.27ms step:1001/1480 train_time:154814ms step_avg:156.22ms step:1002/1480 train_time:154980ms step_avg:156.23ms step:1003/1480 train_time:155151ms step_avg:156.25ms step:1004/1480 train_time:155320ms step_avg:156.26ms step:1005/1480 train_time:155489ms step_avg:156.27ms step:1006/1480 train_time:155657ms step_avg:156.28ms step:1007/1480 train_time:155823ms step_avg:156.29ms step:1008/1480 train_time:155991ms step_avg:156.30ms step:1009/1480 train_time:156163ms step_avg:156.32ms step:1010/1480 train_time:156329ms step_avg:156.33ms step:1011/1480 train_time:156496ms step_avg:156.34ms step:1012/1480 train_time:156661ms step_avg:156.35ms step:1013/1480 train_time:156832ms step_avg:156.36ms step:1014/1480 train_time:156999ms step_avg:156.37ms step:1015/1480 train_time:157169ms step_avg:156.39ms step:1016/1480 train_time:157338ms step_avg:156.40ms step:1017/1480 train_time:157509ms step_avg:156.41ms step:1018/1480 train_time:157678ms step_avg:156.43ms step:1019/1480 train_time:157846ms step_avg:156.44ms step:1020/1480 train_time:158016ms step_avg:156.45ms step:1021/1480 train_time:158181ms step_avg:156.46ms step:1022/1480 train_time:158347ms step_avg:156.47ms step:1023/1480 train_time:158516ms step_avg:156.48ms step:1024/1480 train_time:158681ms step_avg:156.49ms step:1025/1480 train_time:158854ms step_avg:156.51ms step:1026/1480 train_time:159020ms step_avg:156.52ms step:1027/1480 train_time:159185ms step_avg:156.52ms step:1028/1480 
train_time:159358ms step_avg:156.54ms step:1029/1480 train_time:159532ms step_avg:156.56ms step:1030/1480 train_time:159701ms step_avg:156.57ms step:1031/1480 train_time:159865ms step_avg:156.58ms step:1032/1480 train_time:160039ms step_avg:156.59ms step:1033/1480 train_time:160205ms step_avg:156.60ms step:1034/1480 train_time:160373ms step_avg:156.61ms step:1035/1480 train_time:160541ms step_avg:156.63ms step:1036/1480 train_time:160706ms step_avg:156.63ms step:1037/1480 train_time:160873ms step_avg:156.64ms step:1038/1480 train_time:161041ms step_avg:156.65ms step:1039/1480 train_time:161212ms step_avg:156.67ms step:1040/1480 train_time:161379ms step_avg:156.68ms step:1041/1480 train_time:161546ms step_avg:156.69ms step:1042/1480 train_time:161709ms step_avg:156.70ms step:1043/1480 train_time:161876ms step_avg:156.70ms step:1044/1480 train_time:162041ms step_avg:156.71ms step:1045/1480 train_time:162211ms step_avg:156.73ms step:1046/1480 train_time:162379ms step_avg:156.74ms step:1047/1480 train_time:162546ms step_avg:156.75ms step:1048/1480 train_time:162713ms step_avg:156.76ms step:1049/1480 train_time:162879ms step_avg:156.77ms step:1050/1480 train_time:163046ms step_avg:156.77ms step:1051/1480 train_time:163217ms step_avg:156.79ms step:1052/1480 train_time:163385ms step_avg:156.80ms step:1053/1480 train_time:163552ms step_avg:156.81ms step:1054/1480 train_time:163720ms step_avg:156.82ms step:1055/1480 train_time:163885ms step_avg:156.83ms step:1056/1480 train_time:164049ms step_avg:156.83ms step:1057/1480 train_time:164217ms step_avg:156.85ms step:1058/1480 train_time:164385ms step_avg:156.86ms step:1059/1480 train_time:164559ms step_avg:156.87ms step:1060/1480 train_time:164728ms step_avg:156.88ms step:1061/1480 train_time:164891ms step_avg:156.89ms step:1062/1480 train_time:165057ms step_avg:156.90ms step:1063/1480 train_time:165223ms step_avg:156.91ms step:1064/1480 train_time:165387ms step_avg:156.91ms step:1065/1480 train_time:165554ms step_avg:156.92ms 
step:1066/1480 train_time:165722ms step_avg:156.93ms step:1067/1480 train_time:165892ms step_avg:156.95ms step:1068/1480 train_time:166059ms step_avg:156.96ms step:1069/1480 train_time:166229ms step_avg:156.97ms step:1070/1480 train_time:166395ms step_avg:156.98ms step:1071/1480 train_time:166566ms step_avg:156.99ms step:1072/1480 train_time:166733ms step_avg:157.00ms step:1073/1480 train_time:166897ms step_avg:157.01ms step:1074/1480 train_time:167063ms step_avg:157.01ms step:1075/1480 train_time:167235ms step_avg:157.03ms step:1076/1480 train_time:167402ms step_avg:157.04ms step:1077/1480 train_time:167566ms step_avg:157.04ms step:1078/1480 train_time:167740ms step_avg:157.06ms step:1079/1480 train_time:167912ms step_avg:157.07ms step:1080/1480 train_time:168082ms step_avg:157.09ms step:1081/1480 train_time:168250ms step_avg:157.10ms step:1082/1480 train_time:168418ms step_avg:157.11ms step:1083/1480 train_time:168584ms step_avg:157.12ms step:1084/1480 train_time:168750ms step_avg:157.12ms step:1085/1480 train_time:168918ms step_avg:157.13ms step:1086/1480 train_time:169086ms step_avg:157.14ms step:1087/1480 train_time:169252ms step_avg:157.15ms step:1088/1480 train_time:169421ms step_avg:157.16ms step:1089/1480 train_time:169593ms step_avg:157.18ms step:1090/1480 train_time:169764ms step_avg:157.19ms step:1091/1480 train_time:169932ms step_avg:157.20ms step:1092/1480 train_time:170100ms step_avg:157.21ms step:1093/1480 train_time:170267ms step_avg:157.22ms step:1094/1480 train_time:170433ms step_avg:157.23ms step:1095/1480 train_time:170598ms step_avg:157.23ms step:1096/1480 train_time:170767ms step_avg:157.24ms step:1097/1480 train_time:170937ms step_avg:157.26ms step:1098/1480 train_time:171107ms step_avg:157.27ms step:1099/1480 train_time:171277ms step_avg:157.28ms step:1100/1480 train_time:171448ms step_avg:157.29ms step:1101/1480 train_time:171619ms step_avg:157.30ms step:1102/1480 train_time:171791ms step_avg:157.32ms step:1103/1480 train_time:171968ms 
step_avg:157.34ms step:1104/1480 train_time:172137ms step_avg:157.35ms step:1105/1480 train_time:172305ms step_avg:157.36ms step:1106/1480 train_time:172475ms step_avg:157.37ms step:1107/1480 train_time:172644ms step_avg:157.38ms step:1108/1480 train_time:172810ms step_avg:157.39ms step:1109/1480 train_time:172976ms step_avg:157.39ms step:1110/1480 train_time:173141ms step_avg:157.40ms step:1111/1480 train_time:173309ms step_avg:157.41ms step:1112/1480 train_time:173480ms step_avg:157.42ms step:1113/1480 train_time:173660ms step_avg:157.44ms step:1114/1480 train_time:173833ms step_avg:157.46ms step:1115/1480 train_time:174005ms step_avg:157.47ms step:1116/1480 train_time:174172ms step_avg:157.48ms step:1117/1480 train_time:174345ms step_avg:157.49ms step:1118/1480 train_time:174520ms step_avg:157.51ms step:1119/1480 train_time:174686ms step_avg:157.52ms step:1120/1480 train_time:174854ms step_avg:157.53ms step:1121/1480 train_time:175023ms step_avg:157.54ms step:1122/1480 train_time:175190ms step_avg:157.55ms step:1123/1480 train_time:175356ms step_avg:157.55ms step:1124/1480 train_time:175524ms step_avg:157.56ms step:1125/1480 train_time:175693ms step_avg:157.57ms step:1125/1480 val_loss:3.3857 train_time:175761ms step_avg:157.63ms step:1126/1480 train_time:175864ms step_avg:157.58ms step:1127/1480 train_time:176033ms step_avg:157.59ms step:1128/1480 train_time:176204ms step_avg:157.61ms step:1129/1480 train_time:176378ms step_avg:157.62ms step:1130/1480 train_time:176547ms step_avg:157.63ms step:1131/1480 train_time:176725ms step_avg:157.65ms step:1132/1480 train_time:176890ms step_avg:157.66ms step:1133/1480 train_time:177064ms step_avg:157.67ms step:1134/1480 train_time:177236ms step_avg:157.68ms step:1135/1480 train_time:177404ms step_avg:157.69ms step:1136/1480 train_time:177574ms step_avg:157.70ms step:1137/1480 train_time:177745ms step_avg:157.71ms step:1138/1480 train_time:177917ms step_avg:157.73ms step:1139/1480 train_time:178086ms step_avg:157.74ms 
step:1140/1480 train_time:178254ms step_avg:157.75ms step:1141/1480 train_time:178428ms step_avg:157.76ms step:1142/1480 train_time:178595ms step_avg:157.77ms step:1143/1480 train_time:178766ms step_avg:157.78ms step:1144/1480 train_time:178934ms step_avg:157.79ms step:1145/1480 train_time:179098ms step_avg:157.80ms step:1146/1480 train_time:179268ms step_avg:157.81ms step:1147/1480 train_time:179437ms step_avg:157.82ms step:1148/1480 train_time:179605ms step_avg:157.83ms step:1149/1480 train_time:179775ms step_avg:157.84ms step:1150/1480 train_time:179944ms step_avg:157.85ms step:1151/1480 train_time:180117ms step_avg:157.86ms step:1152/1480 train_time:180289ms step_avg:157.87ms step:1153/1480 train_time:180462ms step_avg:157.88ms step:1154/1480 train_time:180629ms step_avg:157.89ms step:1155/1480 train_time:180803ms step_avg:157.91ms step:1156/1480 train_time:180983ms step_avg:157.93ms step:1157/1480 train_time:181150ms step_avg:157.93ms step:1158/1480 train_time:181317ms step_avg:157.94ms step:1159/1480 train_time:181484ms step_avg:157.95ms step:1160/1480 train_time:181650ms step_avg:157.96ms step:1161/1480 train_time:181821ms step_avg:157.97ms step:1162/1480 train_time:181991ms step_avg:157.98ms step:1163/1480 train_time:182160ms step_avg:157.99ms step:1164/1480 train_time:182330ms step_avg:158.00ms step:1165/1480 train_time:182495ms step_avg:158.00ms step:1166/1480 train_time:182665ms step_avg:158.01ms step:1167/1480 train_time:182833ms step_avg:158.02ms step:1168/1480 train_time:183002ms step_avg:158.03ms step:1169/1480 train_time:183169ms step_avg:158.04ms step:1170/1480 train_time:183338ms step_avg:158.05ms step:1171/1480 train_time:183506ms step_avg:158.06ms step:1172/1480 train_time:183674ms step_avg:158.07ms step:1173/1480 train_time:183848ms step_avg:158.08ms step:1174/1480 train_time:184029ms step_avg:158.10ms step:1175/1480 train_time:184201ms step_avg:158.11ms step:1176/1480 train_time:184373ms step_avg:158.12ms step:1177/1480 train_time:184550ms 
step_avg:158.14ms step:1178/1480 train_time:184719ms step_avg:158.15ms step:1179/1480 train_time:184885ms step_avg:158.16ms step:1180/1480 train_time:185065ms step_avg:158.18ms step:1181/1480 train_time:185235ms step_avg:158.19ms step:1182/1480 train_time:185404ms step_avg:158.19ms step:1183/1480 train_time:185575ms step_avg:158.21ms step:1184/1480 train_time:185743ms step_avg:158.21ms step:1185/1480 train_time:185915ms step_avg:158.23ms step:1186/1480 train_time:186087ms step_avg:158.24ms step:1187/1480 train_time:186269ms step_avg:158.26ms step:1188/1480 train_time:186436ms step_avg:158.26ms step:1189/1480 train_time:186608ms step_avg:158.28ms step:1190/1480 train_time:186775ms step_avg:158.28ms step:1191/1480 train_time:186946ms step_avg:158.29ms step:1192/1480 train_time:187112ms step_avg:158.30ms step:1193/1480 train_time:187280ms step_avg:158.31ms step:1194/1480 train_time:187448ms step_avg:158.32ms step:1195/1480 train_time:187622ms step_avg:158.33ms step:1196/1480 train_time:187803ms step_avg:158.35ms step:1197/1480 train_time:187973ms step_avg:158.36ms step:1198/1480 train_time:188155ms step_avg:158.38ms step:1199/1480 train_time:188327ms step_avg:158.39ms step:1200/1480 train_time:188495ms step_avg:158.40ms step:1201/1480 train_time:188663ms step_avg:158.41ms step:1202/1480 train_time:188845ms step_avg:158.43ms step:1203/1480 train_time:189022ms step_avg:158.44ms step:1204/1480 train_time:189196ms step_avg:158.46ms step:1205/1480 train_time:189364ms step_avg:158.46ms step:1206/1480 train_time:189530ms step_avg:158.47ms step:1207/1480 train_time:189700ms step_avg:158.48ms step:1208/1480 train_time:189869ms step_avg:158.49ms step:1209/1480 train_time:190042ms step_avg:158.50ms step:1210/1480 train_time:190218ms step_avg:158.51ms step:1211/1480 train_time:190391ms step_avg:158.53ms step:1212/1480 train_time:190564ms step_avg:158.54ms step:1213/1480 train_time:190737ms step_avg:158.55ms step:1214/1480 train_time:190914ms step_avg:158.57ms step:1215/1480 
train_time:191085ms step_avg:158.58ms step:1216/1480 train_time:191253ms step_avg:158.58ms step:1217/1480 train_time:191427ms step_avg:158.60ms step:1218/1480 train_time:191596ms step_avg:158.61ms step:1219/1480 train_time:191774ms step_avg:158.62ms step:1220/1480 train_time:191944ms step_avg:158.63ms step:1221/1480 train_time:192113ms step_avg:158.64ms step:1222/1480 train_time:192281ms step_avg:158.65ms step:1223/1480 train_time:192451ms step_avg:158.66ms step:1224/1480 train_time:192628ms step_avg:158.67ms step:1225/1480 train_time:192799ms step_avg:158.68ms step:1226/1480 train_time:192971ms step_avg:158.69ms step:1227/1480 train_time:193145ms step_avg:158.71ms step:1228/1480 train_time:193315ms step_avg:158.71ms step:1229/1480 train_time:193487ms step_avg:158.73ms step:1230/1480 train_time:193668ms step_avg:158.74ms step:1231/1480 train_time:193844ms step_avg:158.76ms step:1232/1480 train_time:194018ms step_avg:158.77ms step:1233/1480 train_time:194186ms step_avg:158.78ms step:1234/1480 train_time:194357ms step_avg:158.79ms step:1235/1480 train_time:194531ms step_avg:158.80ms step:1236/1480 train_time:194699ms step_avg:158.81ms step:1237/1480 train_time:194871ms step_avg:158.82ms step:1238/1480 train_time:195055ms step_avg:158.84ms step:1239/1480 train_time:195226ms step_avg:158.85ms step:1240/1480 train_time:195397ms step_avg:158.86ms step:1241/1480 train_time:195570ms step_avg:158.87ms step:1242/1480 train_time:195739ms step_avg:158.88ms step:1243/1480 train_time:195913ms step_avg:158.89ms step:1244/1480 train_time:196079ms step_avg:158.90ms step:1245/1480 train_time:196247ms step_avg:158.90ms step:1246/1480 train_time:196417ms step_avg:158.91ms step:1247/1480 train_time:196586ms step_avg:158.92ms step:1248/1480 train_time:196755ms step_avg:158.93ms step:1249/1480 train_time:196925ms step_avg:158.94ms step:1250/1480 train_time:197095ms step_avg:158.95ms step:1250/1480 val_loss:3.3370 train_time:197167ms step_avg:159.01ms step:1251/1480 train_time:197276ms 
step_avg:158.97ms step:1252/1480 train_time:197446ms step_avg:158.97ms step:1253/1480 train_time:197614ms step_avg:158.98ms step:1254/1480 train_time:197786ms step_avg:158.99ms step:1255/1480 train_time:197974ms step_avg:159.02ms step:1256/1480 train_time:198150ms step_avg:159.03ms step:1257/1480 train_time:198319ms step_avg:159.04ms step:1258/1480 train_time:198494ms step_avg:159.05ms step:1259/1480 train_time:198667ms step_avg:159.06ms step:1260/1480 train_time:198834ms step_avg:159.07ms step:1261/1480 train_time:199007ms step_avg:159.08ms step:1262/1480 train_time:199182ms step_avg:159.09ms step:1263/1480 train_time:199357ms step_avg:159.10ms step:1264/1480 train_time:199524ms step_avg:159.11ms step:1265/1480 train_time:199691ms step_avg:159.12ms step:1266/1480 train_time:199864ms step_avg:159.13ms step:1267/1480 train_time:200034ms step_avg:159.14ms step:1268/1480 train_time:200205ms step_avg:159.15ms step:1269/1480 train_time:200380ms step_avg:159.16ms step:1270/1480 train_time:200549ms step_avg:159.17ms step:1271/1480 train_time:200718ms step_avg:159.17ms step:1272/1480 train_time:200884ms step_avg:159.18ms step:1273/1480 train_time:201055ms step_avg:159.19ms step:1274/1480 train_time:201230ms step_avg:159.20ms step:1275/1480 train_time:201397ms step_avg:159.21ms step:1276/1480 train_time:201564ms step_avg:159.21ms step:1277/1480 train_time:201737ms step_avg:159.22ms step:1278/1480 train_time:201906ms step_avg:159.23ms step:1279/1480 train_time:202078ms step_avg:159.24ms step:1280/1480 train_time:202257ms step_avg:159.26ms step:1281/1480 train_time:202428ms step_avg:159.27ms step:1282/1480 train_time:202594ms step_avg:159.27ms step:1283/1480 train_time:202766ms step_avg:159.28ms step:1284/1480 train_time:202935ms step_avg:159.29ms step:1285/1480 train_time:203104ms step_avg:159.30ms step:1286/1480 train_time:203274ms step_avg:159.31ms step:1287/1480 train_time:203446ms step_avg:159.32ms step:1288/1480 train_time:203616ms step_avg:159.32ms step:1289/1480 
train_time:203800ms step_avg:159.34ms step:1290/1480 train_time:203979ms step_avg:159.36ms step:1291/1480 train_time:204154ms step_avg:159.37ms step:1292/1480 train_time:204330ms step_avg:159.38ms step:1293/1480 train_time:204506ms step_avg:159.40ms step:1294/1480 train_time:204676ms step_avg:159.40ms step:1295/1480 train_time:204847ms step_avg:159.41ms step:1296/1480 train_time:205021ms step_avg:159.43ms step:1297/1480 train_time:205194ms step_avg:159.44ms step:1298/1480 train_time:205364ms step_avg:159.44ms step:1299/1480 train_time:205535ms step_avg:159.45ms step:1300/1480 train_time:205701ms step_avg:159.46ms step:1301/1480 train_time:205870ms step_avg:159.47ms step:1302/1480 train_time:206043ms step_avg:159.48ms step:1303/1480 train_time:206219ms step_avg:159.49ms step:1304/1480 train_time:206393ms step_avg:159.50ms step:1305/1480 train_time:206561ms step_avg:159.51ms step:1306/1480 train_time:206736ms step_avg:159.52ms step:1307/1480 train_time:206904ms step_avg:159.53ms step:1308/1480 train_time:207074ms step_avg:159.53ms step:1309/1480 train_time:207246ms step_avg:159.54ms step:1310/1480 train_time:207414ms step_avg:159.55ms step:1311/1480 train_time:207582ms step_avg:159.56ms step:1312/1480 train_time:207756ms step_avg:159.57ms step:1313/1480 train_time:207924ms step_avg:159.57ms step:1314/1480 train_time:208097ms step_avg:159.58ms step:1315/1480 train_time:208268ms step_avg:159.59ms step:1316/1480 train_time:208435ms step_avg:159.60ms step:1317/1480 train_time:208607ms step_avg:159.61ms step:1318/1480 train_time:208788ms step_avg:159.62ms step:1319/1480 train_time:208963ms step_avg:159.64ms step:1320/1480 train_time:209140ms step_avg:159.65ms step:1321/1480 train_time:209313ms step_avg:159.66ms step:1322/1480 train_time:209494ms step_avg:159.68ms step:1323/1480 train_time:209669ms step_avg:159.69ms step:1324/1480 train_time:209844ms step_avg:159.70ms step:1325/1480 train_time:210027ms step_avg:159.72ms step:1326/1480 train_time:210201ms step_avg:159.73ms 
step:1327/1480 train_time:210371ms step_avg:159.74ms step:1328/1480 train_time:210540ms step_avg:159.74ms step:1329/1480 train_time:210739ms step_avg:159.77ms step:1330/1480 train_time:210916ms step_avg:159.79ms step:1331/1480 train_time:211086ms step_avg:159.79ms step:1332/1480 train_time:211259ms step_avg:159.80ms step:1333/1480 train_time:211436ms step_avg:159.82ms step:1334/1480 train_time:211607ms step_avg:159.82ms step:1335/1480 train_time:211776ms step_avg:159.83ms step:1336/1480 train_time:211959ms step_avg:159.85ms step:1337/1480 train_time:212134ms step_avg:159.86ms step:1338/1480 train_time:212306ms step_avg:159.87ms step:1339/1480 train_time:212480ms step_avg:159.88ms step:1340/1480 train_time:212654ms step_avg:159.89ms step:1341/1480 train_time:212823ms step_avg:159.90ms step:1342/1480 train_time:212997ms step_avg:159.91ms step:1343/1480 train_time:213167ms step_avg:159.92ms step:1344/1480 train_time:213339ms step_avg:159.92ms step:1345/1480 train_time:213518ms step_avg:159.94ms step:1346/1480 train_time:213687ms step_avg:159.95ms step:1347/1480 train_time:213857ms step_avg:159.95ms step:1348/1480 train_time:214027ms step_avg:159.96ms step:1349/1480 train_time:214196ms step_avg:159.97ms step:1350/1480 train_time:214372ms step_avg:159.98ms step:1351/1480 train_time:214544ms step_avg:159.99ms step:1352/1480 train_time:214713ms step_avg:160.00ms step:1353/1480 train_time:214890ms step_avg:160.01ms step:1354/1480 train_time:215063ms step_avg:160.02ms step:1355/1480 train_time:215230ms step_avg:160.02ms step:1356/1480 train_time:215403ms step_avg:160.03ms step:1357/1480 train_time:215577ms step_avg:160.04ms step:1358/1480 train_time:215750ms step_avg:160.05ms step:1359/1480 train_time:215921ms step_avg:160.06ms step:1360/1480 train_time:216096ms step_avg:160.07ms step:1361/1480 train_time:216274ms step_avg:160.08ms step:1362/1480 train_time:216451ms step_avg:160.10ms step:1363/1480 train_time:216631ms step_avg:160.11ms step:1364/1480 train_time:216799ms 
step_avg:160.12ms step:1365/1480 train_time:216967ms step_avg:160.12ms step:1366/1480 train_time:217138ms step_avg:160.13ms step:1367/1480 train_time:217310ms step_avg:160.14ms step:1368/1480 train_time:217485ms step_avg:160.15ms step:1369/1480 train_time:217667ms step_avg:160.17ms step:1370/1480 train_time:217844ms step_avg:160.18ms step:1371/1480 train_time:218015ms step_avg:160.19ms step:1372/1480 train_time:218192ms step_avg:160.20ms step:1373/1480 train_time:218361ms step_avg:160.21ms step:1374/1480 train_time:218537ms step_avg:160.22ms step:1375/1480 train_time:218709ms step_avg:160.23ms step:1375/1480 val_loss:3.2983 train_time:218777ms step_avg:160.28ms step:1376/1480 train_time:218882ms step_avg:160.24ms step:1377/1480 train_time:219054ms step_avg:160.24ms step:1378/1480 train_time:219222ms step_avg:160.25ms step:1379/1480 train_time:219396ms step_avg:160.26ms step:1380/1480 train_time:219568ms step_avg:160.27ms step:1381/1480 train_time:219748ms step_avg:160.28ms step:1382/1480 train_time:219919ms step_avg:160.29ms step:1383/1480 train_time:220089ms step_avg:160.30ms step:1384/1480 train_time:220266ms step_avg:160.31ms step:1385/1480 train_time:220433ms step_avg:160.31ms step:1386/1480 train_time:220604ms step_avg:160.32ms step:1387/1480 train_time:220776ms step_avg:160.33ms step:1388/1480 train_time:220944ms step_avg:160.34ms step:1389/1480 train_time:221119ms step_avg:160.35ms step:1390/1480 train_time:221286ms step_avg:160.35ms step:1391/1480 train_time:221458ms step_avg:160.36ms step:1392/1480 train_time:221629ms step_avg:160.37ms step:1393/1480 train_time:221800ms step_avg:160.38ms step:1394/1480 train_time:221972ms step_avg:160.38ms step:1395/1480 train_time:222141ms step_avg:160.39ms step:1396/1480 train_time:222310ms step_avg:160.40ms step:1397/1480 train_time:222478ms step_avg:160.40ms step:1398/1480 train_time:222645ms step_avg:160.41ms step:1399/1480 train_time:222815ms step_avg:160.41ms step:1400/1480 train_time:222992ms step_avg:160.43ms 
step:1401/1480 train_time:223158ms step_avg:160.43ms step:1402/1480 train_time:223328ms step_avg:160.44ms step:1403/1480 train_time:223504ms step_avg:160.45ms step:1404/1480 train_time:223674ms step_avg:160.46ms step:1405/1480 train_time:223848ms step_avg:160.46ms step:1406/1480 train_time:224022ms step_avg:160.47ms step:1407/1480 train_time:224190ms step_avg:160.48ms step:1408/1480 train_time:224359ms step_avg:160.49ms step:1409/1480 train_time:224541ms step_avg:160.50ms step:1410/1480 train_time:224710ms step_avg:160.51ms step:1411/1480 train_time:224879ms step_avg:160.51ms step:1412/1480 train_time:225048ms step_avg:160.52ms step:1413/1480 train_time:225219ms step_avg:160.53ms step:1414/1480 train_time:225391ms step_avg:160.53ms step:1415/1480 train_time:225566ms step_avg:160.55ms step:1416/1480 train_time:225753ms step_avg:160.56ms step:1417/1480 train_time:225927ms step_avg:160.57ms step:1418/1480 train_time:226099ms step_avg:160.58ms step:1419/1480 train_time:226273ms step_avg:160.59ms step:1420/1480 train_time:226448ms step_avg:160.60ms step:1421/1480 train_time:226622ms step_avg:160.61ms step:1422/1480 train_time:226794ms step_avg:160.62ms step:1423/1480 train_time:226965ms step_avg:160.63ms step:1424/1480 train_time:227143ms step_avg:160.64ms step:1425/1480 train_time:227323ms step_avg:160.65ms step:1426/1480 train_time:227494ms step_avg:160.66ms step:1427/1480 train_time:227669ms step_avg:160.67ms step:1428/1480 train_time:227839ms step_avg:160.68ms step:1429/1480 train_time:228006ms step_avg:160.68ms step:1430/1480 train_time:228181ms step_avg:160.69ms step:1431/1480 train_time:228358ms step_avg:160.70ms step:1432/1480 train_time:228535ms step_avg:160.71ms step:1433/1480 train_time:228714ms step_avg:160.73ms step:1434/1480 train_time:228894ms step_avg:160.74ms step:1435/1480 train_time:229069ms step_avg:160.75ms step:1436/1480 train_time:229245ms step_avg:160.76ms step:1437/1480 train_time:229415ms step_avg:160.77ms step:1438/1480 train_time:229584ms 
step_avg:160.77ms step:1439/1480 train_time:229759ms step_avg:160.78ms step:1440/1480 train_time:229928ms step_avg:160.79ms step:1441/1480 train_time:230099ms step_avg:160.80ms step:1442/1480 train_time:230275ms step_avg:160.81ms step:1443/1480 train_time:230465ms step_avg:160.83ms step:1444/1480 train_time:230637ms step_avg:160.84ms step:1445/1480 train_time:230810ms step_avg:160.84ms step:1446/1480 train_time:230985ms step_avg:160.85ms step:1447/1480 train_time:231164ms step_avg:160.87ms step:1448/1480 train_time:231337ms step_avg:160.87ms step:1449/1480 train_time:231512ms step_avg:160.88ms step:1450/1480 train_time:231684ms step_avg:160.89ms step:1451/1480 train_time:231855ms step_avg:160.90ms step:1452/1480 train_time:232029ms step_avg:160.91ms step:1453/1480 train_time:232198ms step_avg:160.91ms step:1454/1480 train_time:232371ms step_avg:160.92ms step:1455/1480 train_time:232549ms step_avg:160.93ms step:1456/1480 train_time:232722ms step_avg:160.94ms step:1457/1480 train_time:232894ms step_avg:160.95ms step:1458/1480 train_time:233066ms step_avg:160.96ms step:1459/1480 train_time:233242ms step_avg:160.97ms step:1460/1480 train_time:233414ms step_avg:160.97ms step:1461/1480 train_time:233589ms step_avg:160.98ms step:1462/1480 train_time:233759ms step_avg:160.99ms step:1463/1480 train_time:233935ms step_avg:161.00ms step:1464/1480 train_time:234110ms step_avg:161.01ms step:1465/1480 train_time:234283ms step_avg:161.02ms step:1466/1480 train_time:234453ms step_avg:161.03ms step:1467/1480 train_time:234626ms step_avg:161.03ms step:1468/1480 train_time:234797ms step_avg:161.04ms step:1469/1480 train_time:234971ms step_avg:161.05ms step:1470/1480 train_time:235151ms step_avg:161.06ms step:1471/1480 train_time:235338ms step_avg:161.08ms step:1472/1480 train_time:235519ms step_avg:161.09ms step:1473/1480 train_time:235690ms step_avg:161.10ms step:1474/1480 train_time:235868ms step_avg:161.11ms step:1475/1480 train_time:236049ms step_avg:161.13ms step:1476/1480 
train_time:236221ms step_avg:161.13ms step:1477/1480 train_time:236404ms step_avg:161.15ms step:1478/1480 train_time:236587ms step_avg:161.16ms step:1479/1480 train_time:236762ms step_avg:161.17ms step:1480/1480 train_time:236936ms step_avg:161.18ms step:1480/1480 val_loss:3.2796 train_time:237007ms step_avg:161.23ms