import os
import sys
# Snapshot this script's own source immediately so it can be written into the run log later.
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G exactly through its SVD: returns U @ V.T where G = U S V^T.

    `steps` is accepted only so that this function shares a call signature with the
    iterative backend; it is ignored.
    """
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor (asserted).
        steps: Number of Newton-Schulz iterations to run.
        eps: Added to the norm before dividing, so an all-zero G does not divide by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # Work on the wide orientation so that A = X @ X.T is the smaller of the two Gram matrices.
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # Undo the transpose applied above so the result matches G's shape.
    if G.size(0) > G.size(1):
        X = X.T
    return X

# Lookup table from the `backend` option of Muon to the function implementing it.
zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Distributed behaviour (as implemented below):
    - The world size and rank are read from the WORLD_SIZE and RANK environment variables.
    - Parameters are grouped by element count; within each group the number of parameters must be
    a multiple of the world size, and each rank orthogonalizes one parameter out of every
    world-size-long chunk before the results are exchanged with all_gather.

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        # Raises KeyError if these variables are not set in the environment.
        self.num_process = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ["RANK"])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        params: "list[torch.Tensor]" = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One param group per distinct element count, so every parameter in a group fits the
        # same flat communication buffers.
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                "params": [p for p in params if p.numel() == size],
                # One flat bfloat16 CUDA buffer per rank; these receive the all_gather output in step().
                "update_buffer": [
                    torch.empty(size, device="cuda", dtype=torch.bfloat16)
                    for _ in range(self.num_process)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        For each chunk of `num_process` consecutive parameters in a group, this rank computes the
        orthogonalized momentum update for the parameter at its own offset, then all ranks exchange
        their updates with an asynchronous all_gather. Applying a chunk's gathered updates is
        deferred until just before the next all_gather is issued (or until the group is finished).
        """
        for group in self.param_groups:
            lr: float = group["lr"]
            momentum: float = group["momentum"]
            nesterov: bool = group["nesterov"]
            zeropower_backend = zeropower_backends[group["backend"]]
            backend_steps: int = group["backend_steps"]
            update_buffers: "list[torch.Tensor]" = group["update_buffer"]
            # generate weight updates in distributed fashion
            params: "list[torch.Tensor]" = group["params"]
            assert len(params) % self.num_process == 0
            # State of the all_gather still in flight: its work handle and the chunk of
            # parameters its results belong to. Both are None until the first gather is issued.
            handle = None
            params_world = None
            def update_prev():
                # Reads `handle` and `params_world` from the enclosing scope at call time.
                if params_world is None:
                    return
                assert handle is not None
                # The buffers are only safe to read once the pending all_gather has completed.
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    # Step size is lr scaled by sqrt(max(1, rows / cols)) of the parameter.
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.num_process]:
                # This rank is responsible for exactly one parameter of the current chunk.
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if "momentum_buffer" not in state:
                    state["momentum_buffer"] = torch.zeros_like(g)
                buf: torch.Tensor = state["momentum_buffer"]
                buf.lerp_(g, 1 - momentum)
                # With nesterov the gradient tensor itself is modified in place (p.grad is overwritten).
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_backend(g, steps=backend_steps).flatten()
                # Consume the previous chunk's results before the buffers are overwritten by the next gather.
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.num_process]
            # Apply the updates of the final chunk of this group.
            update_prev()

#
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        # The stored weight keeps its own dtype; only the copy used for the matmul is cast.
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied to a 4-D tensor whose dim 1 is the sequence position
    and whose dim 3 is split into two halves that are rotated against each other.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # One inverse frequency per pair of channels: base ** (-2i / dim) for i in [0, dim/2).
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        # cos/sin tables are cached and rebuilt only when the sequence length changes.
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        # NOTE: the cache is keyed on sequence length only (not on device or dtype).
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # Add singleton axes at dims 0 and 2 so the tables broadcast against x.
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention using FlexAttention, with QK normalization, rotary embeddings,
    and a learned mix between the layer's own values and an externally supplied value tensor.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Arguments:
            x: Input of shape (B, T, dim); B must be 1.
            vi: Extra value tensor, reshaped to (B, T, n_head, head_dim) and mixed into v.
            block_mask: FlexAttention block mask restricting which keys each query may attend to.

        Returns a tensor with the same shape as x.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head axis moved in front of the sequence axis.
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward network (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: a learned blend of the running activation with the initial
    embedding x0, followed by pre-normed attention and MLP sub-layers with residual adds.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # Initialised to [1, 0]: at the start the block sees x unchanged and ignores x0.
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Architecture hyperparameters consumed by GPT and Block."""
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) with tanh

class GPT(nn.Module):
    """
    Decoder-only transformer arranged as a U-net: the first half of the layers ("encoder")
    store their outputs, and the second half ("decoder") add them back in reverse order,
    each scaled by a learned weight. forward() returns the training loss directly.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            # (a single table of width n_embd * num_encoder_layers, split per encoder layer in forward)
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            idx: 1-D tensor of input token ids (asserted); its length must be a multiple of 128
                because it is reshaped into blocks of BLOCK_SIZE.
            target: Target token ids, flattened before the loss is computed.
            sliding_window: Scalar tensor; a query may only attend to keys fewer than this many
                positions behind it.

        Returns the cross-entropy loss over all positions.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running count of token id 50256 gives every position a document index.
        # NOTE(review): 50256 is presumably the end-of-text separator of the tokenizer - confirm.
        docs = (idx == 50256).cumsum(0)
        # Lowest and highest document index found inside each block of BLOCK_SIZE tokens.
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # Token-level rule: causal, same document, and within the sliding window.
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the rule above: a (query block, key block) pair is kept
            # when it is causal, the document ranges of the two blocks overlap, and the
            # blocks are within the window measured in (rounded-up) blocks.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # Stable descending sort puts the indices of kept key blocks first, in ascending order.
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # Add leading singleton axes (indexed with None twice) before handing the tensors to BlockMask.
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # One value-embedding slice per encoder layer.
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() returns the most recently stored encoder output first.
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            # (decoder layer i reuses the value embedding of encoder layer num_encoder_layers-1-i)
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count stored in it."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256 * 4 byte header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        # Read straight into the tensor's memory through its NumPy view.
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over token shards, handing each rank its own T-token slice of every
    (T * num_processes)-token step and moving to the next shard when the current one runs out.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        # (the pattern is resolved relative to the current working directory)
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # Every shard must hold at least one full step for all ranks plus one token for the shifted targets.
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # Wraps around to the first shard after the last one.
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) for this rank as CUDA tensors of T tokens each; targets are inputs shifted by one."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Run configuration; the script instantiates it with its defaults."""
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
# (RANK, LOCAL_RANK and WORLD_SIZE are all read from the environment below)
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
# Only the master process owns a log file; on every other rank `logfile` stays None
# and print0() below is a no-op.
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id # per-run directory name; it is not created here
    logfile = 'logs/%s.txt' % run_id
    # Make sure the directory that holds the log file exists; without this, opening the
    # file fails with FileNotFoundError on a fresh checkout that has no logs/ directory.
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """
    Append the string `s` (plus a newline) to the run's log file, on the master process only.

    Arguments:
        s: The text to log.
        logonly: When False (the default) the text is also printed to stdout.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not implemented: each rank must process exactly one sequence per step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# Convert the CastedLinear modules back to float32 after the whole model was cast to bfloat16;
# CastedLinear.forward casts the weight to the activation dtype when it is used.
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# optimizer1: both embedding tables; optimizer2: the output head;
# optimizer3 (Muon): every 2-D parameter inside the transformer blocks;
# optimizer4: the remaining (<2-D) block parameters plus the U-net skip weights.
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the multiplier applied to each optimizer's base learning rate at iteration `it`.

    The multiplier ramps up linearly for `args.warmup_iters` steps, stays at 1.0, and then
    decays linearly towards 0 over the last `args.cooldown_iters` steps.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# The attention window is kept in a CUDA tensor and updated in place (see copy_ below).
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    # Divisor for the step_avg figures; NaN until enough timed steps have accumulated.
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # (grows linearly from 64 to 1792 over the run, rounded down to a multiple of 64)
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # Average the summed loss across ranks, then over the number of validation steps.
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE: the checkpoint dict is assembled but the torch.save call below is commented out,
        # so nothing is written to disk.
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # (linear from 0.85 to 0.95 over the first 300 steps)
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # approx_time is read without a CUDA synchronize, so it is only an estimate.
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
====================================================================================================
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
nvidia-smi:
Sun Dec 8 13:50:24 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M.
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 129W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 97W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 106W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI 
CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23341ms step_avg:nanms step:2/1480 train_time:23427ms step_avg:nanms step:3/1480 train_time:23565ms step_avg:nanms step:4/1480 train_time:23705ms step_avg:nanms step:5/1480 train_time:23846ms step_avg:nanms step:6/1480 train_time:23986ms step_avg:nanms step:7/1480 train_time:24126ms step_avg:nanms step:8/1480 train_time:24269ms step_avg:nanms step:9/1480 train_time:24414ms step_avg:nanms step:10/1480 train_time:24559ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:286ms step_avg:nanms step:13/1480 train_time:426ms step_avg:142.13ms step:14/1480 train_time:567ms step_avg:141.68ms step:15/1480 train_time:709ms step_avg:141.88ms step:16/1480 train_time:853ms step_avg:142.18ms step:17/1480 train_time:998ms step_avg:142.56ms step:18/1480 train_time:1141ms step_avg:142.64ms step:19/1480 train_time:1284ms step_avg:142.68ms step:20/1480 train_time:1426ms step_avg:142.61ms step:21/1480 train_time:1567ms step_avg:142.50ms step:22/1480 train_time:1709ms step_avg:142.43ms step:23/1480 train_time:1851ms step_avg:142.40ms step:24/1480 train_time:1992ms step_avg:142.32ms step:25/1480 train_time:2135ms step_avg:142.33ms step:26/1480 train_time:2279ms step_avg:142.47ms step:27/1480 train_time:2423ms step_avg:142.51ms step:28/1480 train_time:2565ms step_avg:142.53ms step:29/1480 train_time:2709ms 
step_avg:142.57ms step:30/1480 train_time:2852ms step_avg:142.61ms step:31/1480 train_time:2997ms step_avg:142.72ms step:32/1480 train_time:3141ms step_avg:142.77ms step:33/1480 train_time:3284ms step_avg:142.77ms step:34/1480 train_time:3425ms step_avg:142.73ms step:35/1480 train_time:3566ms step_avg:142.66ms step:36/1480 train_time:3709ms step_avg:142.67ms step:37/1480 train_time:3852ms step_avg:142.68ms step:38/1480 train_time:3995ms step_avg:142.69ms step:39/1480 train_time:4139ms step_avg:142.71ms step:40/1480 train_time:4282ms step_avg:142.73ms step:41/1480 train_time:4425ms step_avg:142.74ms step:42/1480 train_time:4568ms step_avg:142.74ms step:43/1480 train_time:4711ms step_avg:142.76ms step:44/1480 train_time:4853ms step_avg:142.75ms step:45/1480 train_time:4997ms step_avg:142.78ms step:46/1480 train_time:5141ms step_avg:142.81ms step:47/1480 train_time:5286ms step_avg:142.85ms step:48/1480 train_time:5427ms step_avg:142.82ms step:49/1480 train_time:5569ms step_avg:142.80ms step:50/1480 train_time:5711ms step_avg:142.77ms step:51/1480 train_time:5853ms step_avg:142.76ms step:52/1480 train_time:5995ms step_avg:142.74ms step:53/1480 train_time:6138ms step_avg:142.74ms step:54/1480 train_time:6282ms step_avg:142.77ms step:55/1480 train_time:6425ms step_avg:142.78ms step:56/1480 train_time:6569ms step_avg:142.79ms step:57/1480 train_time:6710ms step_avg:142.76ms step:58/1480 train_time:6852ms step_avg:142.74ms step:59/1480 train_time:6994ms step_avg:142.73ms step:60/1480 train_time:7135ms step_avg:142.71ms step:61/1480 train_time:7280ms step_avg:142.74ms step:62/1480 train_time:7424ms step_avg:142.76ms step:63/1480 train_time:7566ms step_avg:142.75ms step:64/1480 train_time:7708ms step_avg:142.75ms step:65/1480 train_time:7851ms step_avg:142.74ms step:66/1480 train_time:7992ms step_avg:142.71ms step:67/1480 train_time:8133ms step_avg:142.68ms step:68/1480 train_time:8275ms step_avg:142.68ms step:69/1480 train_time:8419ms step_avg:142.70ms step:70/1480 
train_time:8562ms step_avg:142.69ms step:71/1480 train_time:8704ms step_avg:142.69ms step:72/1480 train_time:8847ms step_avg:142.69ms step:73/1480 train_time:8988ms step_avg:142.67ms step:74/1480 train_time:9129ms step_avg:142.64ms step:75/1480 train_time:9271ms step_avg:142.63ms step:76/1480 train_time:9413ms step_avg:142.62ms step:77/1480 train_time:9557ms step_avg:142.64ms step:78/1480 train_time:9700ms step_avg:142.64ms step:79/1480 train_time:9843ms step_avg:142.65ms step:80/1480 train_time:9985ms step_avg:142.65ms step:81/1480 train_time:10127ms step_avg:142.63ms step:82/1480 train_time:10268ms step_avg:142.62ms step:83/1480 train_time:10411ms step_avg:142.62ms step:84/1480 train_time:10555ms step_avg:142.64ms step:85/1480 train_time:10699ms step_avg:142.66ms step:86/1480 train_time:10842ms step_avg:142.65ms step:87/1480 train_time:10985ms step_avg:142.66ms step:88/1480 train_time:11126ms step_avg:142.65ms step:89/1480 train_time:11267ms step_avg:142.63ms step:90/1480 train_time:11410ms step_avg:142.63ms step:91/1480 train_time:11552ms step_avg:142.62ms step:92/1480 train_time:11697ms step_avg:142.64ms step:93/1480 train_time:11838ms step_avg:142.63ms step:94/1480 train_time:11981ms step_avg:142.63ms step:95/1480 train_time:12124ms step_avg:142.63ms step:96/1480 train_time:12266ms step_avg:142.62ms step:97/1480 train_time:12408ms step_avg:142.62ms step:98/1480 train_time:12549ms step_avg:142.60ms step:99/1480 train_time:12692ms step_avg:142.61ms step:100/1480 train_time:12834ms step_avg:142.60ms step:101/1480 train_time:12979ms step_avg:142.62ms step:102/1480 train_time:13123ms step_avg:142.64ms step:103/1480 train_time:13265ms step_avg:142.63ms step:104/1480 train_time:13407ms step_avg:142.63ms step:105/1480 train_time:13549ms step_avg:142.62ms step:106/1480 train_time:13692ms step_avg:142.62ms step:107/1480 train_time:13835ms step_avg:142.62ms step:108/1480 train_time:13976ms step_avg:142.62ms step:109/1480 train_time:14121ms step_avg:142.64ms step:110/1480 
train_time:14263ms step_avg:142.63ms step:111/1480 train_time:14409ms step_avg:142.66ms step:112/1480 train_time:14556ms step_avg:142.71ms step:113/1480 train_time:14704ms step_avg:142.75ms step:114/1480 train_time:14849ms step_avg:142.78ms step:115/1480 train_time:14996ms step_avg:142.82ms step:116/1480 train_time:15143ms step_avg:142.86ms step:117/1480 train_time:15290ms step_avg:142.90ms step:118/1480 train_time:15436ms step_avg:142.93ms step:119/1480 train_time:15584ms step_avg:142.97ms step:120/1480 train_time:15730ms step_avg:143.00ms step:121/1480 train_time:15878ms step_avg:143.04ms step:122/1480 train_time:16025ms step_avg:143.08ms step:123/1480 train_time:16171ms step_avg:143.11ms step:124/1480 train_time:16319ms step_avg:143.15ms step:125/1480 train_time:16466ms step_avg:143.19ms step:125/1480 val_loss:4.4281 train_time:16523ms step_avg:143.68ms step:126/1480 train_time:16621ms step_avg:143.28ms step:127/1480 train_time:16769ms step_avg:143.32ms step:128/1480 train_time:16915ms step_avg:143.35ms step:129/1480 train_time:17061ms step_avg:143.37ms step:130/1480 train_time:17205ms step_avg:143.38ms step:131/1480 train_time:17351ms step_avg:143.39ms step:132/1480 train_time:17498ms step_avg:143.43ms step:133/1480 train_time:17647ms step_avg:143.47ms step:134/1480 train_time:17796ms step_avg:143.51ms step:135/1480 train_time:17943ms step_avg:143.54ms step:136/1480 train_time:18089ms step_avg:143.56ms step:137/1480 train_time:18235ms step_avg:143.58ms step:138/1480 train_time:18381ms step_avg:143.61ms step:139/1480 train_time:18528ms step_avg:143.63ms step:140/1480 train_time:18676ms step_avg:143.66ms step:141/1480 train_time:18824ms step_avg:143.69ms step:142/1480 train_time:18970ms step_avg:143.71ms step:143/1480 train_time:19118ms step_avg:143.74ms step:144/1480 train_time:19265ms step_avg:143.77ms step:145/1480 train_time:19410ms step_avg:143.78ms step:146/1480 train_time:19557ms step_avg:143.80ms step:147/1480 train_time:19704ms step_avg:143.82ms 
step:148/1480 train_time:19852ms step_avg:143.85ms step:149/1480 train_time:19999ms step_avg:143.88ms step:150/1480 train_time:20147ms step_avg:143.91ms step:151/1480 train_time:20295ms step_avg:143.94ms step:152/1480 train_time:20442ms step_avg:143.96ms step:153/1480 train_time:20588ms step_avg:143.97ms step:154/1480 train_time:20735ms step_avg:143.99ms step:155/1480 train_time:20883ms step_avg:144.02ms step:156/1480 train_time:21030ms step_avg:144.04ms step:157/1480 train_time:21177ms step_avg:144.06ms step:158/1480 train_time:21324ms step_avg:144.08ms step:159/1480 train_time:21469ms step_avg:144.09ms step:160/1480 train_time:21617ms step_avg:144.11ms step:161/1480 train_time:21764ms step_avg:144.13ms step:162/1480 train_time:21910ms step_avg:144.14ms step:163/1480 train_time:22058ms step_avg:144.17ms step:164/1480 train_time:22205ms step_avg:144.19ms step:165/1480 train_time:22352ms step_avg:144.21ms step:166/1480 train_time:22499ms step_avg:144.22ms step:167/1480 train_time:22646ms step_avg:144.24ms step:168/1480 train_time:22792ms step_avg:144.25ms step:169/1480 train_time:22939ms step_avg:144.27ms step:170/1480 train_time:23086ms step_avg:144.29ms step:171/1480 train_time:23232ms step_avg:144.30ms step:172/1480 train_time:23379ms step_avg:144.32ms step:173/1480 train_time:23526ms step_avg:144.33ms step:174/1480 train_time:23673ms step_avg:144.35ms step:175/1480 train_time:23821ms step_avg:144.37ms step:176/1480 train_time:23967ms step_avg:144.38ms step:177/1480 train_time:24115ms step_avg:144.40ms step:178/1480 train_time:24263ms step_avg:144.42ms step:179/1480 train_time:24409ms step_avg:144.43ms step:180/1480 train_time:24555ms step_avg:144.44ms step:181/1480 train_time:24701ms step_avg:144.45ms step:182/1480 train_time:24849ms step_avg:144.47ms step:183/1480 train_time:24996ms step_avg:144.49ms step:184/1480 train_time:25144ms step_avg:144.50ms step:185/1480 train_time:25290ms step_avg:144.52ms step:186/1480 train_time:25438ms step_avg:144.54ms 
step:187/1480 train_time:25585ms step_avg:144.55ms step:188/1480 train_time:25731ms step_avg:144.56ms step:189/1480 train_time:25878ms step_avg:144.57ms step:190/1480 train_time:26025ms step_avg:144.58ms step:191/1480 train_time:26171ms step_avg:144.59ms step:192/1480 train_time:26319ms step_avg:144.61ms step:193/1480 train_time:26465ms step_avg:144.62ms step:194/1480 train_time:26611ms step_avg:144.62ms step:195/1480 train_time:26758ms step_avg:144.64ms step:196/1480 train_time:26904ms step_avg:144.65ms step:197/1480 train_time:27050ms step_avg:144.65ms step:198/1480 train_time:27198ms step_avg:144.67ms step:199/1480 train_time:27345ms step_avg:144.68ms step:200/1480 train_time:27491ms step_avg:144.69ms step:201/1480 train_time:27638ms step_avg:144.70ms step:202/1480 train_time:27784ms step_avg:144.71ms step:203/1480 train_time:27931ms step_avg:144.72ms step:204/1480 train_time:28081ms step_avg:144.75ms step:205/1480 train_time:28225ms step_avg:144.74ms step:206/1480 train_time:28372ms step_avg:144.76ms step:207/1480 train_time:28519ms step_avg:144.76ms step:208/1480 train_time:28666ms step_avg:144.78ms step:209/1480 train_time:28814ms step_avg:144.79ms step:210/1480 train_time:28961ms step_avg:144.80ms step:211/1480 train_time:29107ms step_avg:144.81ms step:212/1480 train_time:29252ms step_avg:144.81ms step:213/1480 train_time:29400ms step_avg:144.83ms step:214/1480 train_time:29547ms step_avg:144.84ms step:215/1480 train_time:29694ms step_avg:144.85ms step:216/1480 train_time:29841ms step_avg:144.86ms step:217/1480 train_time:29987ms step_avg:144.87ms step:218/1480 train_time:30135ms step_avg:144.88ms step:219/1480 train_time:30282ms step_avg:144.89ms step:220/1480 train_time:30428ms step_avg:144.90ms step:221/1480 train_time:30576ms step_avg:144.91ms step:222/1480 train_time:30727ms step_avg:144.94ms step:223/1480 train_time:30877ms step_avg:144.96ms step:224/1480 train_time:31028ms step_avg:144.99ms step:225/1480 train_time:31181ms step_avg:145.03ms 
step:226/1480 train_time:31329ms step_avg:145.04ms step:227/1480 train_time:31479ms step_avg:145.07ms step:228/1480 train_time:31630ms step_avg:145.09ms step:229/1480 train_time:31781ms step_avg:145.12ms step:230/1480 train_time:31931ms step_avg:145.14ms step:231/1480 train_time:32082ms step_avg:145.17ms step:232/1480 train_time:32231ms step_avg:145.19ms step:233/1480 train_time:32382ms step_avg:145.21ms step:234/1480 train_time:32533ms step_avg:145.24ms step:235/1480 train_time:32684ms step_avg:145.26ms step:236/1480 train_time:32835ms step_avg:145.29ms step:237/1480 train_time:32985ms step_avg:145.31ms step:238/1480 train_time:33135ms step_avg:145.33ms step:239/1480 train_time:33285ms step_avg:145.35ms step:240/1480 train_time:33435ms step_avg:145.37ms step:241/1480 train_time:33585ms step_avg:145.39ms step:242/1480 train_time:33736ms step_avg:145.42ms step:243/1480 train_time:33887ms step_avg:145.44ms step:244/1480 train_time:34037ms step_avg:145.46ms step:245/1480 train_time:34187ms step_avg:145.48ms step:246/1480 train_time:34338ms step_avg:145.50ms step:247/1480 train_time:34488ms step_avg:145.52ms step:248/1480 train_time:34639ms step_avg:145.54ms step:249/1480 train_time:34789ms step_avg:145.56ms step:250/1480 train_time:34939ms step_avg:145.58ms step:250/1480 val_loss:4.0008 train_time:34998ms step_avg:145.83ms step:251/1480 train_time:35095ms step_avg:145.62ms step:252/1480 train_time:35246ms step_avg:145.65ms step:253/1480 train_time:35397ms step_avg:145.67ms step:254/1480 train_time:35547ms step_avg:145.68ms step:255/1480 train_time:35696ms step_avg:145.70ms step:256/1480 train_time:35846ms step_avg:145.72ms step:257/1480 train_time:35996ms step_avg:145.73ms step:258/1480 train_time:36149ms step_avg:145.76ms step:259/1480 train_time:36300ms step_avg:145.78ms step:260/1480 train_time:36451ms step_avg:145.80ms step:261/1480 train_time:36600ms step_avg:145.82ms step:262/1480 train_time:36751ms step_avg:145.84ms step:263/1480 train_time:36901ms 
step_avg:145.85ms step:264/1480 train_time:37052ms step_avg:145.87ms step:265/1480 train_time:37203ms step_avg:145.90ms step:266/1480 train_time:37354ms step_avg:145.92ms step:267/1480 train_time:37505ms step_avg:145.94ms step:268/1480 train_time:37656ms step_avg:145.95ms step:269/1480 train_time:37806ms step_avg:145.97ms step:270/1480 train_time:37957ms step_avg:145.99ms step:271/1480 train_time:38107ms step_avg:146.00ms step:272/1480 train_time:38257ms step_avg:146.02ms step:273/1480 train_time:38408ms step_avg:146.04ms step:274/1480 train_time:38557ms step_avg:146.05ms step:275/1480 train_time:38707ms step_avg:146.07ms step:276/1480 train_time:38857ms step_avg:146.08ms step:277/1480 train_time:39009ms step_avg:146.10ms step:278/1480 train_time:39159ms step_avg:146.12ms step:279/1480 train_time:39310ms step_avg:146.13ms step:280/1480 train_time:39461ms step_avg:146.15ms step:281/1480 train_time:39612ms step_avg:146.17ms step:282/1480 train_time:39763ms step_avg:146.19ms step:283/1480 train_time:39914ms step_avg:146.20ms step:284/1480 train_time:40065ms step_avg:146.22ms step:285/1480 train_time:40216ms step_avg:146.24ms step:286/1480 train_time:40367ms step_avg:146.26ms step:287/1480 train_time:40515ms step_avg:146.26ms step:288/1480 train_time:40667ms step_avg:146.29ms step:289/1480 train_time:40819ms step_avg:146.30ms step:290/1480 train_time:40969ms step_avg:146.32ms step:291/1480 train_time:41118ms step_avg:146.33ms step:292/1480 train_time:41269ms step_avg:146.35ms step:293/1480 train_time:41420ms step_avg:146.36ms step:294/1480 train_time:41571ms step_avg:146.38ms step:295/1480 train_time:41721ms step_avg:146.39ms step:296/1480 train_time:41871ms step_avg:146.40ms step:297/1480 train_time:42022ms step_avg:146.42ms step:298/1480 train_time:42172ms step_avg:146.43ms step:299/1480 train_time:42323ms step_avg:146.45ms step:300/1480 train_time:42473ms step_avg:146.46ms step:301/1480 train_time:42623ms step_avg:146.47ms step:302/1480 train_time:42773ms 
step_avg:146.48ms step:303/1480 train_time:42924ms step_avg:146.50ms step:304/1480 train_time:43075ms step_avg:146.51ms step:305/1480 train_time:43226ms step_avg:146.53ms step:306/1480 train_time:43377ms step_avg:146.54ms step:307/1480 train_time:43527ms step_avg:146.55ms step:308/1480 train_time:43678ms step_avg:146.57ms step:309/1480 train_time:43829ms step_avg:146.58ms step:310/1480 train_time:43981ms step_avg:146.60ms step:311/1480 train_time:44132ms step_avg:146.62ms step:312/1480 train_time:44284ms step_avg:146.64ms step:313/1480 train_time:44433ms step_avg:146.65ms step:314/1480 train_time:44585ms step_avg:146.66ms step:315/1480 train_time:44735ms step_avg:146.67ms step:316/1480 train_time:44886ms step_avg:146.68ms step:317/1480 train_time:45036ms step_avg:146.70ms step:318/1480 train_time:45187ms step_avg:146.71ms step:319/1480 train_time:45338ms step_avg:146.72ms step:320/1480 train_time:45488ms step_avg:146.74ms step:321/1480 train_time:45638ms step_avg:146.75ms step:322/1480 train_time:45789ms step_avg:146.76ms step:323/1480 train_time:45939ms step_avg:146.77ms step:324/1480 train_time:46090ms step_avg:146.78ms step:325/1480 train_time:46241ms step_avg:146.80ms step:326/1480 train_time:46391ms step_avg:146.81ms step:327/1480 train_time:46543ms step_avg:146.82ms step:328/1480 train_time:46693ms step_avg:146.83ms step:329/1480 train_time:46844ms step_avg:146.85ms step:330/1480 train_time:46998ms step_avg:146.87ms step:331/1480 train_time:47152ms step_avg:146.89ms step:332/1480 train_time:47306ms step_avg:146.91ms step:333/1480 train_time:47461ms step_avg:146.94ms step:334/1480 train_time:47615ms step_avg:146.96ms step:335/1480 train_time:47769ms step_avg:146.98ms step:336/1480 train_time:47923ms step_avg:147.00ms step:337/1480 train_time:48077ms step_avg:147.02ms step:338/1480 train_time:48229ms step_avg:147.04ms step:339/1480 train_time:48384ms step_avg:147.06ms step:340/1480 train_time:48539ms step_avg:147.09ms step:341/1480 train_time:48693ms 
step_avg:147.11ms step:342/1480 train_time:48847ms step_avg:147.13ms step:343/1480 train_time:49002ms step_avg:147.15ms step:344/1480 train_time:49156ms step_avg:147.17ms step:345/1480 train_time:49310ms step_avg:147.19ms step:346/1480 train_time:49464ms step_avg:147.21ms step:347/1480 train_time:49619ms step_avg:147.24ms step:348/1480 train_time:49773ms step_avg:147.26ms step:349/1480 train_time:49925ms step_avg:147.27ms step:350/1480 train_time:50081ms step_avg:147.30ms step:351/1480 train_time:50236ms step_avg:147.32ms step:352/1480 train_time:50389ms step_avg:147.34ms step:353/1480 train_time:50543ms step_avg:147.36ms step:354/1480 train_time:50697ms step_avg:147.37ms step:355/1480 train_time:50851ms step_avg:147.39ms step:356/1480 train_time:51004ms step_avg:147.41ms step:357/1480 train_time:51161ms step_avg:147.44ms step:358/1480 train_time:51315ms step_avg:147.46ms step:359/1480 train_time:51468ms step_avg:147.47ms step:360/1480 train_time:51623ms step_avg:147.50ms step:361/1480 train_time:51780ms step_avg:147.52ms step:362/1480 train_time:51933ms step_avg:147.54ms step:363/1480 train_time:52086ms step_avg:147.55ms step:364/1480 train_time:52241ms step_avg:147.57ms step:365/1480 train_time:52394ms step_avg:147.59ms step:366/1480 train_time:52548ms step_avg:147.61ms step:367/1480 train_time:52702ms step_avg:147.63ms step:368/1480 train_time:52856ms step_avg:147.64ms step:369/1480 train_time:53010ms step_avg:147.66ms step:370/1480 train_time:53165ms step_avg:147.68ms step:371/1480 train_time:53318ms step_avg:147.69ms step:372/1480 train_time:53472ms step_avg:147.71ms step:373/1480 train_time:53625ms step_avg:147.73ms step:374/1480 train_time:53780ms step_avg:147.75ms step:375/1480 train_time:53934ms step_avg:147.77ms step:375/1480 val_loss:3.8119 train_time:53994ms step_avg:147.93ms step:376/1480 train_time:54094ms step_avg:147.80ms step:377/1480 train_time:54250ms step_avg:147.82ms step:378/1480 train_time:54404ms step_avg:147.84ms step:379/1480 
train_time:54555ms step_avg:147.85ms step:380/1480 train_time:54708ms step_avg:147.86ms step:381/1480 train_time:54860ms step_avg:147.87ms step:382/1480 train_time:55013ms step_avg:147.89ms step:383/1480 train_time:55170ms step_avg:147.91ms step:384/1480 train_time:55325ms step_avg:147.93ms step:385/1480 train_time:55478ms step_avg:147.94ms step:386/1480 train_time:55631ms step_avg:147.95ms step:387/1480 train_time:55785ms step_avg:147.97ms step:388/1480 train_time:55939ms step_avg:147.99ms step:389/1480 train_time:56093ms step_avg:148.00ms step:390/1480 train_time:56247ms step_avg:148.02ms step:391/1480 train_time:56401ms step_avg:148.04ms step:392/1480 train_time:56555ms step_avg:148.05ms step:393/1480 train_time:56708ms step_avg:148.06ms step:394/1480 train_time:56863ms step_avg:148.08ms step:395/1480 train_time:57016ms step_avg:148.09ms step:396/1480 train_time:57169ms step_avg:148.11ms step:397/1480 train_time:57324ms step_avg:148.12ms step:398/1480 train_time:57478ms step_avg:148.14ms step:399/1480 train_time:57631ms step_avg:148.15ms step:400/1480 train_time:57787ms step_avg:148.17ms step:401/1480 train_time:57941ms step_avg:148.19ms step:402/1480 train_time:58097ms step_avg:148.21ms step:403/1480 train_time:58249ms step_avg:148.22ms step:404/1480 train_time:58403ms step_avg:148.23ms step:405/1480 train_time:58556ms step_avg:148.24ms step:406/1480 train_time:58710ms step_avg:148.26ms step:407/1480 train_time:58865ms step_avg:148.28ms step:408/1480 train_time:59019ms step_avg:148.29ms step:409/1480 train_time:59174ms step_avg:148.31ms step:410/1480 train_time:59327ms step_avg:148.32ms step:411/1480 train_time:59484ms step_avg:148.34ms step:412/1480 train_time:59638ms step_avg:148.35ms step:413/1480 train_time:59791ms step_avg:148.36ms step:414/1480 train_time:59945ms step_avg:148.38ms step:415/1480 train_time:60098ms step_avg:148.39ms step:416/1480 train_time:60253ms step_avg:148.41ms step:417/1480 train_time:60408ms step_avg:148.42ms step:418/1480 
train_time:60562ms step_avg:148.44ms step:419/1480 train_time:60715ms step_avg:148.45ms step:420/1480 train_time:60869ms step_avg:148.46ms step:421/1480 train_time:61023ms step_avg:148.47ms step:422/1480 train_time:61176ms step_avg:148.49ms step:423/1480 train_time:61330ms step_avg:148.50ms step:424/1480 train_time:61487ms step_avg:148.52ms step:425/1480 train_time:61640ms step_avg:148.53ms step:426/1480 train_time:61793ms step_avg:148.54ms step:427/1480 train_time:61946ms step_avg:148.55ms step:428/1480 train_time:62100ms step_avg:148.56ms step:429/1480 train_time:62254ms step_avg:148.58ms step:430/1480 train_time:62408ms step_avg:148.59ms step:431/1480 train_time:62563ms step_avg:148.61ms step:432/1480 train_time:62717ms step_avg:148.62ms step:433/1480 train_time:62870ms step_avg:148.63ms step:434/1480 train_time:63023ms step_avg:148.64ms step:435/1480 train_time:63175ms step_avg:148.65ms step:436/1480 train_time:63329ms step_avg:148.66ms step:437/1480 train_time:63483ms step_avg:148.67ms step:438/1480 train_time:63637ms step_avg:148.68ms step:439/1480 train_time:63793ms step_avg:148.70ms step:440/1480 train_time:63947ms step_avg:148.71ms step:441/1480 train_time:64105ms step_avg:148.74ms step:442/1480 train_time:64263ms step_avg:148.76ms step:443/1480 train_time:64420ms step_avg:148.78ms step:444/1480 train_time:64575ms step_avg:148.79ms step:445/1480 train_time:64731ms step_avg:148.81ms step:446/1480 train_time:64889ms step_avg:148.83ms step:447/1480 train_time:65047ms step_avg:148.85ms step:448/1480 train_time:65203ms step_avg:148.87ms step:449/1480 train_time:65363ms step_avg:148.89ms step:450/1480 train_time:65519ms step_avg:148.91ms step:451/1480 train_time:65676ms step_avg:148.92ms step:452/1480 train_time:65831ms step_avg:148.94ms step:453/1480 train_time:65988ms step_avg:148.96ms step:454/1480 train_time:66143ms step_avg:148.97ms step:455/1480 train_time:66298ms step_avg:148.98ms step:456/1480 train_time:66456ms step_avg:149.00ms step:457/1480 
train_time:66613ms step_avg:149.02ms step:458/1480 train_time:66770ms step_avg:149.04ms step:459/1480 train_time:66929ms step_avg:149.06ms step:460/1480 train_time:67087ms step_avg:149.08ms step:461/1480 train_time:67244ms step_avg:149.10ms step:462/1480 train_time:67401ms step_avg:149.12ms step:463/1480 train_time:67557ms step_avg:149.13ms step:464/1480 train_time:67713ms step_avg:149.15ms step:465/1480 train_time:67869ms step_avg:149.16ms step:466/1480 train_time:68027ms step_avg:149.18ms step:467/1480 train_time:68186ms step_avg:149.20ms step:468/1480 train_time:68344ms step_avg:149.22ms step:469/1480 train_time:68498ms step_avg:149.23ms step:470/1480 train_time:68654ms step_avg:149.25ms step:471/1480 train_time:68810ms step_avg:149.26ms step:472/1480 train_time:68969ms step_avg:149.28ms step:473/1480 train_time:69125ms step_avg:149.30ms step:474/1480 train_time:69281ms step_avg:149.31ms step:475/1480 train_time:69437ms step_avg:149.33ms step:476/1480 train_time:69594ms step_avg:149.34ms step:477/1480 train_time:69751ms step_avg:149.36ms step:478/1480 train_time:69908ms step_avg:149.38ms step:479/1480 train_time:70066ms step_avg:149.39ms step:480/1480 train_time:70223ms step_avg:149.41ms step:481/1480 train_time:70379ms step_avg:149.42ms step:482/1480 train_time:70535ms step_avg:149.44ms step:483/1480 train_time:70690ms step_avg:149.45ms step:484/1480 train_time:70848ms step_avg:149.47ms step:485/1480 train_time:71006ms step_avg:149.49ms step:486/1480 train_time:71163ms step_avg:149.50ms step:487/1480 train_time:71319ms step_avg:149.51ms step:488/1480 train_time:71476ms step_avg:149.53ms step:489/1480 train_time:71633ms step_avg:149.55ms step:490/1480 train_time:71790ms step_avg:149.56ms step:491/1480 train_time:71948ms step_avg:149.58ms step:492/1480 train_time:72105ms step_avg:149.60ms step:493/1480 train_time:72262ms step_avg:149.61ms step:494/1480 train_time:72418ms step_avg:149.62ms step:495/1480 train_time:72575ms step_avg:149.64ms step:496/1480 
train_time:72732ms step_avg:149.65ms step:497/1480 train_time:72889ms step_avg:149.67ms step:498/1480 train_time:73048ms step_avg:149.69ms step:499/1480 train_time:73206ms step_avg:149.70ms step:500/1480 train_time:73364ms step_avg:149.72ms step:500/1480 val_loss:3.6913 train_time:73426ms step_avg:149.85ms step:501/1480 train_time:73523ms step_avg:149.74ms step:502/1480 train_time:73682ms step_avg:149.76ms step:503/1480 train_time:73837ms step_avg:149.77ms step:504/1480 train_time:73993ms step_avg:149.78ms step:505/1480 train_time:74149ms step_avg:149.80ms step:506/1480 train_time:74305ms step_avg:149.81ms step:507/1480 train_time:74461ms step_avg:149.82ms step:508/1480 train_time:74619ms step_avg:149.84ms step:509/1480 train_time:74776ms step_avg:149.85ms step:510/1480 train_time:74934ms step_avg:149.87ms step:511/1480 train_time:75091ms step_avg:149.88ms step:512/1480 train_time:75250ms step_avg:149.90ms step:513/1480 train_time:75408ms step_avg:149.92ms step:514/1480 train_time:75564ms step_avg:149.93ms step:515/1480 train_time:75720ms step_avg:149.94ms step:516/1480 train_time:75878ms step_avg:149.96ms step:517/1480 train_time:76035ms step_avg:149.97ms step:518/1480 train_time:76193ms step_avg:149.99ms step:519/1480 train_time:76351ms step_avg:150.00ms step:520/1480 train_time:76509ms step_avg:150.02ms step:521/1480 train_time:76667ms step_avg:150.03ms step:522/1480 train_time:76824ms step_avg:150.05ms step:523/1480 train_time:76981ms step_avg:150.06ms step:524/1480 train_time:77138ms step_avg:150.07ms step:525/1480 train_time:77296ms step_avg:150.09ms step:526/1480 train_time:77454ms step_avg:150.11ms step:527/1480 train_time:77612ms step_avg:150.12ms step:528/1480 train_time:77769ms step_avg:150.13ms step:529/1480 train_time:77926ms step_avg:150.15ms step:530/1480 train_time:78082ms step_avg:150.16ms step:531/1480 train_time:78239ms step_avg:150.17ms step:532/1480 train_time:78396ms step_avg:150.18ms step:533/1480 train_time:78555ms step_avg:150.20ms 
step:534/1480 train_time:78713ms step_avg:150.21ms step:535/1480 train_time:78870ms step_avg:150.23ms step:536/1480 train_time:79029ms step_avg:150.25ms step:537/1480 train_time:79186ms step_avg:150.26ms step:538/1480 train_time:79342ms step_avg:150.27ms step:539/1480 train_time:79499ms step_avg:150.28ms step:540/1480 train_time:79656ms step_avg:150.29ms step:541/1480 train_time:79813ms step_avg:150.31ms step:542/1480 train_time:79971ms step_avg:150.32ms step:543/1480 train_time:80128ms step_avg:150.33ms step:544/1480 train_time:80284ms step_avg:150.35ms step:545/1480 train_time:80441ms step_avg:150.36ms step:546/1480 train_time:80597ms step_avg:150.37ms step:547/1480 train_time:80754ms step_avg:150.38ms step:548/1480 train_time:80911ms step_avg:150.39ms step:549/1480 train_time:81067ms step_avg:150.40ms step:550/1480 train_time:81225ms step_avg:150.42ms step:551/1480 train_time:81383ms step_avg:150.43ms step:552/1480 train_time:81543ms step_avg:150.45ms step:553/1480 train_time:81703ms step_avg:150.47ms step:554/1480 train_time:81861ms step_avg:150.48ms step:555/1480 train_time:82020ms step_avg:150.50ms step:556/1480 train_time:82179ms step_avg:150.51ms step:557/1480 train_time:82338ms step_avg:150.53ms step:558/1480 train_time:82498ms step_avg:150.54ms step:559/1480 train_time:82657ms step_avg:150.56ms step:560/1480 train_time:82817ms step_avg:150.58ms step:561/1480 train_time:82977ms step_avg:150.59ms step:562/1480 train_time:83137ms step_avg:150.61ms step:563/1480 train_time:83296ms step_avg:150.63ms step:564/1480 train_time:83456ms step_avg:150.64ms step:565/1480 train_time:83616ms step_avg:150.66ms step:566/1480 train_time:83777ms step_avg:150.68ms step:567/1480 train_time:83937ms step_avg:150.69ms step:568/1480 train_time:84096ms step_avg:150.71ms step:569/1480 train_time:84255ms step_avg:150.72ms step:570/1480 train_time:84413ms step_avg:150.74ms step:571/1480 train_time:84574ms step_avg:150.76ms step:572/1480 train_time:84733ms step_avg:150.77ms 
step:573/1480 train_time:84894ms step_avg:150.79ms step:574/1480 train_time:85057ms step_avg:150.81ms step:575/1480 train_time:85217ms step_avg:150.83ms step:576/1480 train_time:85377ms step_avg:150.84ms step:577/1480 train_time:85537ms step_avg:150.86ms step:578/1480 train_time:85696ms step_avg:150.87ms step:579/1480 train_time:85856ms step_avg:150.89ms step:580/1480 train_time:86016ms step_avg:150.91ms step:581/1480 train_time:86178ms step_avg:150.92ms step:582/1480 train_time:86339ms step_avg:150.94ms step:583/1480 train_time:86497ms step_avg:150.96ms step:584/1480 train_time:86657ms step_avg:150.97ms step:585/1480 train_time:86816ms step_avg:150.98ms step:586/1480 train_time:86976ms step_avg:151.00ms step:587/1480 train_time:87136ms step_avg:151.02ms step:588/1480 train_time:87295ms step_avg:151.03ms step:589/1480 train_time:87456ms step_avg:151.05ms step:590/1480 train_time:87616ms step_avg:151.06ms step:591/1480 train_time:87776ms step_avg:151.08ms step:592/1480 train_time:87936ms step_avg:151.09ms step:593/1480 train_time:88097ms step_avg:151.11ms step:594/1480 train_time:88258ms step_avg:151.13ms step:595/1480 train_time:88418ms step_avg:151.14ms step:596/1480 train_time:88579ms step_avg:151.16ms step:597/1480 train_time:88739ms step_avg:151.17ms step:598/1480 train_time:88897ms step_avg:151.19ms step:599/1480 train_time:89056ms step_avg:151.20ms step:600/1480 train_time:89217ms step_avg:151.22ms step:601/1480 train_time:89377ms step_avg:151.23ms step:602/1480 train_time:89537ms step_avg:151.24ms step:603/1480 train_time:89698ms step_avg:151.26ms step:604/1480 train_time:89857ms step_avg:151.27ms step:605/1480 train_time:90017ms step_avg:151.29ms step:606/1480 train_time:90183ms step_avg:151.31ms step:607/1480 train_time:90342ms step_avg:151.33ms step:608/1480 train_time:90501ms step_avg:151.34ms step:609/1480 train_time:90660ms step_avg:151.35ms step:610/1480 train_time:90818ms step_avg:151.36ms step:611/1480 train_time:90978ms step_avg:151.38ms 
step:612/1480 train_time:91137ms step_avg:151.39ms step:613/1480 train_time:91298ms step_avg:151.41ms step:614/1480 train_time:91458ms step_avg:151.42ms step:615/1480 train_time:91617ms step_avg:151.43ms step:616/1480 train_time:91777ms step_avg:151.45ms step:617/1480 train_time:91937ms step_avg:151.46ms step:618/1480 train_time:92095ms step_avg:151.47ms step:619/1480 train_time:92256ms step_avg:151.49ms step:620/1480 train_time:92416ms step_avg:151.50ms step:621/1480 train_time:92576ms step_avg:151.51ms step:622/1480 train_time:92735ms step_avg:151.53ms step:623/1480 train_time:92896ms step_avg:151.54ms step:624/1480 train_time:93057ms step_avg:151.56ms step:625/1480 train_time:93216ms step_avg:151.57ms step:625/1480 val_loss:3.6083 train_time:93280ms step_avg:151.68ms step:626/1480 train_time:93380ms step_avg:151.59ms step:627/1480 train_time:93539ms step_avg:151.60ms step:628/1480 train_time:93696ms step_avg:151.61ms step:629/1480 train_time:93856ms step_avg:151.63ms step:630/1480 train_time:94015ms step_avg:151.64ms step:631/1480 train_time:94173ms step_avg:151.65ms step:632/1480 train_time:94333ms step_avg:151.66ms step:633/1480 train_time:94494ms step_avg:151.68ms step:634/1480 train_time:94655ms step_avg:151.69ms step:635/1480 train_time:94816ms step_avg:151.71ms step:636/1480 train_time:94975ms step_avg:151.72ms step:637/1480 train_time:95136ms step_avg:151.73ms step:638/1480 train_time:95295ms step_avg:151.74ms step:639/1480 train_time:95455ms step_avg:151.76ms step:640/1480 train_time:95615ms step_avg:151.77ms step:641/1480 train_time:95776ms step_avg:151.78ms step:642/1480 train_time:95935ms step_avg:151.80ms step:643/1480 train_time:96095ms step_avg:151.81ms step:644/1480 train_time:96254ms step_avg:151.82ms step:645/1480 train_time:96413ms step_avg:151.83ms step:646/1480 train_time:96571ms step_avg:151.84ms step:647/1480 train_time:96730ms step_avg:151.85ms step:648/1480 train_time:96892ms step_avg:151.87ms step:649/1480 train_time:97051ms 
step_avg:151.88ms step:650/1480 train_time:97212ms step_avg:151.89ms step:651/1480 train_time:97372ms step_avg:151.91ms step:652/1480 train_time:97533ms step_avg:151.92ms step:653/1480 train_time:97692ms step_avg:151.93ms step:654/1480 train_time:97852ms step_avg:151.94ms step:655/1480 train_time:98012ms step_avg:151.96ms step:656/1480 train_time:98171ms step_avg:151.97ms step:657/1480 train_time:98333ms step_avg:151.98ms step:658/1480 train_time:98493ms step_avg:152.00ms step:659/1480 train_time:98656ms step_avg:152.01ms step:660/1480 train_time:98818ms step_avg:152.03ms step:661/1480 train_time:98980ms step_avg:152.04ms step:662/1480 train_time:99139ms step_avg:152.05ms step:663/1480 train_time:99298ms step_avg:152.06ms step:664/1480 train_time:99460ms step_avg:152.08ms step:665/1480 train_time:99623ms step_avg:152.10ms step:666/1480 train_time:99782ms step_avg:152.11ms step:667/1480 train_time:99944ms step_avg:152.12ms step:668/1480 train_time:100106ms step_avg:152.14ms step:669/1480 train_time:100268ms step_avg:152.15ms step:670/1480 train_time:100429ms step_avg:152.16ms step:671/1480 train_time:100590ms step_avg:152.18ms step:672/1480 train_time:100754ms step_avg:152.20ms step:673/1480 train_time:100917ms step_avg:152.21ms step:674/1480 train_time:101080ms step_avg:152.23ms step:675/1480 train_time:101242ms step_avg:152.24ms step:676/1480 train_time:101402ms step_avg:152.26ms step:677/1480 train_time:101562ms step_avg:152.27ms step:678/1480 train_time:101724ms step_avg:152.28ms step:679/1480 train_time:101884ms step_avg:152.29ms step:680/1480 train_time:102047ms step_avg:152.31ms step:681/1480 train_time:102208ms step_avg:152.32ms step:682/1480 train_time:102373ms step_avg:152.34ms step:683/1480 train_time:102535ms step_avg:152.36ms step:684/1480 train_time:102697ms step_avg:152.37ms step:685/1480 train_time:102861ms step_avg:152.39ms step:686/1480 train_time:103023ms step_avg:152.40ms step:687/1480 train_time:103183ms step_avg:152.41ms step:688/1480 
train_time:103346ms step_avg:152.43ms step:689/1480 train_time:103509ms step_avg:152.44ms step:690/1480 train_time:103672ms step_avg:152.46ms step:691/1480 train_time:103834ms step_avg:152.47ms step:692/1480 train_time:103995ms step_avg:152.49ms step:693/1480 train_time:104158ms step_avg:152.50ms step:694/1480 train_time:104320ms step_avg:152.51ms step:695/1480 train_time:104481ms step_avg:152.53ms step:696/1480 train_time:104642ms step_avg:152.54ms step:697/1480 train_time:104804ms step_avg:152.55ms step:698/1480 train_time:104964ms step_avg:152.56ms step:699/1480 train_time:105126ms step_avg:152.58ms step:700/1480 train_time:105288ms step_avg:152.59ms step:701/1480 train_time:105447ms step_avg:152.60ms step:702/1480 train_time:105609ms step_avg:152.61ms step:703/1480 train_time:105770ms step_avg:152.63ms step:704/1480 train_time:105931ms step_avg:152.64ms step:705/1480 train_time:106094ms step_avg:152.65ms step:706/1480 train_time:106257ms step_avg:152.67ms step:707/1480 train_time:106418ms step_avg:152.68ms step:708/1480 train_time:106579ms step_avg:152.69ms step:709/1480 train_time:106742ms step_avg:152.71ms step:710/1480 train_time:106902ms step_avg:152.72ms step:711/1480 train_time:107062ms step_avg:152.73ms step:712/1480 train_time:107229ms step_avg:152.75ms step:713/1480 train_time:107394ms step_avg:152.76ms step:714/1480 train_time:107556ms step_avg:152.78ms step:715/1480 train_time:107717ms step_avg:152.79ms step:716/1480 train_time:107877ms step_avg:152.80ms step:717/1480 train_time:108041ms step_avg:152.82ms step:718/1480 train_time:108199ms step_avg:152.82ms step:719/1480 train_time:108359ms step_avg:152.83ms step:720/1480 train_time:108521ms step_avg:152.85ms step:721/1480 train_time:108683ms step_avg:152.86ms step:722/1480 train_time:108845ms step_avg:152.87ms step:723/1480 train_time:109005ms step_avg:152.88ms step:724/1480 train_time:109167ms step_avg:152.89ms step:725/1480 train_time:109330ms step_avg:152.91ms step:726/1480 train_time:109493ms 
step_avg:152.92ms step:727/1480 train_time:109656ms step_avg:152.94ms step:728/1480 train_time:109818ms step_avg:152.95ms step:729/1480 train_time:109979ms step_avg:152.96ms step:730/1480 train_time:110143ms step_avg:152.98ms step:731/1480 train_time:110303ms step_avg:152.99ms step:732/1480 train_time:110462ms step_avg:152.99ms step:733/1480 train_time:110625ms step_avg:153.01ms step:734/1480 train_time:110787ms step_avg:153.02ms step:735/1480 train_time:110951ms step_avg:153.04ms step:736/1480 train_time:111114ms step_avg:153.05ms step:737/1480 train_time:111275ms step_avg:153.06ms step:738/1480 train_time:111438ms step_avg:153.07ms step:739/1480 train_time:111597ms step_avg:153.08ms step:740/1480 train_time:111763ms step_avg:153.10ms step:741/1480 train_time:111925ms step_avg:153.11ms step:742/1480 train_time:112085ms step_avg:153.12ms step:743/1480 train_time:112244ms step_avg:153.13ms step:744/1480 train_time:112408ms step_avg:153.14ms step:745/1480 train_time:112572ms step_avg:153.16ms step:746/1480 train_time:112734ms step_avg:153.17ms step:747/1480 train_time:112895ms step_avg:153.18ms step:748/1480 train_time:113060ms step_avg:153.20ms step:749/1480 train_time:113223ms step_avg:153.21ms step:750/1480 train_time:113383ms step_avg:153.22ms step:750/1480 val_loss:3.5528 train_time:113447ms step_avg:153.31ms step:751/1480 train_time:113546ms step_avg:153.23ms step:752/1480 train_time:113708ms step_avg:153.24ms step:753/1480 train_time:113867ms step_avg:153.25ms step:754/1480 train_time:114028ms step_avg:153.26ms step:755/1480 train_time:114190ms step_avg:153.27ms step:756/1480 train_time:114350ms step_avg:153.28ms step:757/1480 train_time:114515ms step_avg:153.30ms step:758/1480 train_time:114677ms step_avg:153.31ms step:759/1480 train_time:114840ms step_avg:153.32ms step:760/1480 train_time:115002ms step_avg:153.34ms step:761/1480 train_time:115163ms step_avg:153.35ms step:762/1480 train_time:115324ms step_avg:153.36ms step:763/1480 train_time:115486ms 
step_avg:153.37ms step:764/1480 train_time:115648ms step_avg:153.38ms step:765/1480 train_time:115809ms step_avg:153.39ms step:766/1480 train_time:115971ms step_avg:153.40ms step:767/1480 train_time:116132ms step_avg:153.41ms step:768/1480 train_time:116295ms step_avg:153.42ms step:769/1480 train_time:116459ms step_avg:153.44ms step:770/1480 train_time:116623ms step_avg:153.45ms step:771/1480 train_time:116786ms step_avg:153.46ms step:772/1480 train_time:116948ms step_avg:153.48ms step:773/1480 train_time:117109ms step_avg:153.48ms step:774/1480 train_time:117271ms step_avg:153.50ms step:775/1480 train_time:117433ms step_avg:153.51ms step:776/1480 train_time:117600ms step_avg:153.52ms step:777/1480 train_time:117765ms step_avg:153.54ms step:778/1480 train_time:117928ms step_avg:153.55ms step:779/1480 train_time:118088ms step_avg:153.56ms step:780/1480 train_time:118252ms step_avg:153.57ms step:781/1480 train_time:118415ms step_avg:153.59ms step:782/1480 train_time:118581ms step_avg:153.60ms step:783/1480 train_time:118742ms step_avg:153.61ms step:784/1480 train_time:118904ms step_avg:153.62ms step:785/1480 train_time:119066ms step_avg:153.63ms step:786/1480 train_time:119231ms step_avg:153.65ms step:787/1480 train_time:119395ms step_avg:153.66ms step:788/1480 train_time:119560ms step_avg:153.68ms step:789/1480 train_time:119722ms step_avg:153.69ms step:790/1480 train_time:119886ms step_avg:153.70ms step:791/1480 train_time:120053ms step_avg:153.72ms step:792/1480 train_time:120218ms step_avg:153.73ms step:793/1480 train_time:120380ms step_avg:153.74ms step:794/1480 train_time:120545ms step_avg:153.76ms step:795/1480 train_time:120710ms step_avg:153.77ms step:796/1480 train_time:120877ms step_avg:153.79ms step:797/1480 train_time:121041ms step_avg:153.80ms step:798/1480 train_time:121206ms step_avg:153.81ms step:799/1480 train_time:121372ms step_avg:153.83ms step:800/1480 train_time:121535ms step_avg:153.84ms step:801/1480 train_time:121699ms step_avg:153.85ms 
step:802/1480 train_time:121867ms step_avg:153.87ms step:803/1480 train_time:122028ms step_avg:153.88ms step:804/1480 train_time:122190ms step_avg:153.89ms step:805/1480 train_time:122356ms step_avg:153.91ms step:806/1480 train_time:122518ms step_avg:153.92ms step:807/1480 train_time:122680ms step_avg:153.93ms step:808/1480 train_time:122844ms step_avg:153.94ms step:809/1480 train_time:123006ms step_avg:153.95ms step:810/1480 train_time:123167ms step_avg:153.96ms step:811/1480 train_time:123328ms step_avg:153.97ms step:812/1480 train_time:123492ms step_avg:153.98ms step:813/1480 train_time:123653ms step_avg:153.99ms step:814/1480 train_time:123817ms step_avg:154.00ms step:815/1480 train_time:123980ms step_avg:154.01ms step:816/1480 train_time:124144ms step_avg:154.02ms step:817/1480 train_time:124306ms step_avg:154.03ms step:818/1480 train_time:124467ms step_avg:154.04ms step:819/1480 train_time:124631ms step_avg:154.06ms step:820/1480 train_time:124796ms step_avg:154.07ms step:821/1480 train_time:124958ms step_avg:154.08ms step:822/1480 train_time:125122ms step_avg:154.09ms step:823/1480 train_time:125284ms step_avg:154.10ms step:824/1480 train_time:125446ms step_avg:154.11ms step:825/1480 train_time:125611ms step_avg:154.12ms step:826/1480 train_time:125777ms step_avg:154.14ms step:827/1480 train_time:125941ms step_avg:154.15ms step:828/1480 train_time:126104ms step_avg:154.16ms step:829/1480 train_time:126266ms step_avg:154.17ms step:830/1480 train_time:126430ms step_avg:154.18ms step:831/1480 train_time:126595ms step_avg:154.20ms step:832/1480 train_time:126759ms step_avg:154.21ms step:833/1480 train_time:126924ms step_avg:154.22ms step:834/1480 train_time:127088ms step_avg:154.23ms step:835/1480 train_time:127252ms step_avg:154.24ms step:836/1480 train_time:127418ms step_avg:154.26ms step:837/1480 train_time:127581ms step_avg:154.27ms step:838/1480 train_time:127744ms step_avg:154.28ms step:839/1480 train_time:127906ms step_avg:154.29ms step:840/1480 
train_time:128066ms step_avg:154.30ms step:841/1480 train_time:128226ms step_avg:154.30ms step:842/1480 train_time:128390ms step_avg:154.31ms step:843/1480 train_time:128553ms step_avg:154.32ms step:844/1480 train_time:128716ms step_avg:154.34ms step:845/1480 train_time:128879ms step_avg:154.35ms step:846/1480 train_time:129044ms step_avg:154.36ms step:847/1480 train_time:129208ms step_avg:154.37ms step:848/1480 train_time:129370ms step_avg:154.38ms step:849/1480 train_time:129533ms step_avg:154.39ms step:850/1480 train_time:129695ms step_avg:154.40ms step:851/1480 train_time:129860ms step_avg:154.41ms step:852/1480 train_time:130023ms step_avg:154.42ms step:853/1480 train_time:130186ms step_avg:154.43ms step:854/1480 train_time:130351ms step_avg:154.44ms step:855/1480 train_time:130514ms step_avg:154.45ms step:856/1480 train_time:130677ms step_avg:154.46ms step:857/1480 train_time:130841ms step_avg:154.48ms step:858/1480 train_time:131006ms step_avg:154.49ms step:859/1480 train_time:131169ms step_avg:154.50ms step:860/1480 train_time:131330ms step_avg:154.51ms step:861/1480 train_time:131496ms step_avg:154.52ms step:862/1480 train_time:131664ms step_avg:154.54ms step:863/1480 train_time:131832ms step_avg:154.55ms step:864/1480 train_time:131998ms step_avg:154.56ms step:865/1480 train_time:132160ms step_avg:154.57ms step:866/1480 train_time:132327ms step_avg:154.59ms step:867/1480 train_time:132492ms step_avg:154.60ms step:868/1480 train_time:132651ms step_avg:154.60ms step:869/1480 train_time:132814ms step_avg:154.61ms step:870/1480 train_time:132979ms step_avg:154.63ms step:871/1480 train_time:133143ms step_avg:154.64ms step:872/1480 train_time:133306ms step_avg:154.65ms step:873/1480 train_time:133468ms step_avg:154.66ms step:874/1480 train_time:133633ms step_avg:154.67ms step:875/1480 train_time:133799ms step_avg:154.68ms step:875/1480 val_loss:3.5073 train_time:133864ms step_avg:154.76ms step:876/1480 train_time:133963ms step_avg:154.69ms step:877/1480 
train_time:134128ms step_avg:154.70ms step:878/1480 train_time:134292ms step_avg:154.71ms step:879/1480 train_time:134457ms step_avg:154.73ms step:880/1480 train_time:134619ms step_avg:154.73ms step:881/1480 train_time:134781ms step_avg:154.74ms step:882/1480 train_time:134946ms step_avg:154.75ms step:883/1480 train_time:135112ms step_avg:154.77ms step:884/1480 train_time:135281ms step_avg:154.78ms step:885/1480 train_time:135444ms step_avg:154.79ms step:886/1480 train_time:135611ms step_avg:154.81ms step:887/1480 train_time:135779ms step_avg:154.82ms step:888/1480 train_time:135954ms step_avg:154.84ms step:889/1480 train_time:136122ms step_avg:154.86ms step:890/1480 train_time:136284ms step_avg:154.87ms step:891/1480 train_time:136449ms step_avg:154.88ms step:892/1480 train_time:136616ms step_avg:154.89ms step:893/1480 train_time:136778ms step_avg:154.90ms step:894/1480 train_time:136943ms step_avg:154.91ms step:895/1480 train_time:137111ms step_avg:154.93ms step:896/1480 train_time:137277ms step_avg:154.94ms step:897/1480 train_time:137443ms step_avg:154.95ms step:898/1480 train_time:137610ms step_avg:154.97ms step:899/1480 train_time:137775ms step_avg:154.98ms step:900/1480 train_time:137938ms step_avg:154.99ms step:901/1480 train_time:138102ms step_avg:155.00ms step:902/1480 train_time:138265ms step_avg:155.01ms step:903/1480 train_time:138436ms step_avg:155.02ms step:904/1480 train_time:138601ms step_avg:155.03ms step:905/1480 train_time:138762ms step_avg:155.04ms step:906/1480 train_time:138928ms step_avg:155.05ms step:907/1480 train_time:139096ms step_avg:155.07ms step:908/1480 train_time:139258ms step_avg:155.08ms step:909/1480 train_time:139422ms step_avg:155.09ms step:910/1480 train_time:139592ms step_avg:155.10ms step:911/1480 train_time:139757ms step_avg:155.11ms step:912/1480 train_time:139922ms step_avg:155.12ms step:913/1480 train_time:140090ms step_avg:155.14ms step:914/1480 train_time:140258ms step_avg:155.15ms step:915/1480 train_time:140426ms 
step_avg:155.17ms step:916/1480 train_time:140590ms step_avg:155.18ms step:917/1480 train_time:140754ms step_avg:155.19ms step:918/1480 train_time:140921ms step_avg:155.20ms step:919/1480 train_time:141091ms step_avg:155.22ms step:920/1480 train_time:141256ms step_avg:155.23ms step:921/1480 train_time:141420ms step_avg:155.24ms step:922/1480 train_time:141588ms step_avg:155.25ms step:923/1480 train_time:141753ms step_avg:155.26ms step:924/1480 train_time:141917ms step_avg:155.27ms step:925/1480 train_time:142082ms step_avg:155.28ms step:926/1480 train_time:142245ms step_avg:155.29ms step:927/1480 train_time:142410ms step_avg:155.30ms step:928/1480 train_time:142576ms step_avg:155.31ms step:929/1480 train_time:142740ms step_avg:155.32ms step:930/1480 train_time:142905ms step_avg:155.33ms step:931/1480 train_time:143067ms step_avg:155.34ms step:932/1480 train_time:143233ms step_avg:155.35ms step:933/1480 train_time:143401ms step_avg:155.36ms step:934/1480 train_time:143567ms step_avg:155.38ms step:935/1480 train_time:143738ms step_avg:155.39ms step:936/1480 train_time:143905ms step_avg:155.41ms step:937/1480 train_time:144076ms step_avg:155.42ms step:938/1480 train_time:144239ms step_avg:155.43ms step:939/1480 train_time:144409ms step_avg:155.45ms step:940/1480 train_time:144576ms step_avg:155.46ms step:941/1480 train_time:144740ms step_avg:155.47ms step:942/1480 train_time:144906ms step_avg:155.48ms step:943/1480 train_time:145078ms step_avg:155.50ms step:944/1480 train_time:145251ms step_avg:155.51ms step:945/1480 train_time:145414ms step_avg:155.52ms step:946/1480 train_time:145583ms step_avg:155.54ms step:947/1480 train_time:145751ms step_avg:155.55ms step:948/1480 train_time:145916ms step_avg:155.56ms step:949/1480 train_time:146081ms step_avg:155.57ms step:950/1480 train_time:146244ms step_avg:155.58ms step:951/1480 train_time:146414ms step_avg:155.59ms step:952/1480 train_time:146581ms step_avg:155.61ms step:953/1480 train_time:146750ms step_avg:155.62ms 
step:954/1480 train_time:146918ms step_avg:155.63ms step:955/1480 train_time:147082ms step_avg:155.64ms step:956/1480 train_time:147247ms step_avg:155.65ms step:957/1480 train_time:147416ms step_avg:155.67ms step:958/1480 train_time:147586ms step_avg:155.68ms step:959/1480 train_time:147749ms step_avg:155.69ms step:960/1480 train_time:147916ms step_avg:155.70ms step:961/1480 train_time:148081ms step_avg:155.71ms step:962/1480 train_time:148244ms step_avg:155.72ms step:963/1480 train_time:148410ms step_avg:155.73ms step:964/1480 train_time:148579ms step_avg:155.74ms step:965/1480 train_time:148742ms step_avg:155.75ms step:966/1480 train_time:148905ms step_avg:155.76ms step:967/1480 train_time:149068ms step_avg:155.77ms step:968/1480 train_time:149235ms step_avg:155.78ms step:969/1480 train_time:149400ms step_avg:155.79ms step:970/1480 train_time:149563ms step_avg:155.80ms step:971/1480 train_time:149726ms step_avg:155.80ms step:972/1480 train_time:149890ms step_avg:155.81ms step:973/1480 train_time:150055ms step_avg:155.82ms step:974/1480 train_time:150224ms step_avg:155.83ms step:975/1480 train_time:150388ms step_avg:155.84ms step:976/1480 train_time:150553ms step_avg:155.85ms step:977/1480 train_time:150717ms step_avg:155.86ms step:978/1480 train_time:150883ms step_avg:155.87ms step:979/1480 train_time:151048ms step_avg:155.88ms step:980/1480 train_time:151215ms step_avg:155.89ms step:981/1480 train_time:151382ms step_avg:155.90ms step:982/1480 train_time:151545ms step_avg:155.91ms step:983/1480 train_time:151711ms step_avg:155.92ms step:984/1480 train_time:151876ms step_avg:155.93ms step:985/1480 train_time:152043ms step_avg:155.94ms step:986/1480 train_time:152207ms step_avg:155.95ms step:987/1480 train_time:152371ms step_avg:155.96ms step:988/1480 train_time:152539ms step_avg:155.97ms step:989/1480 train_time:152704ms step_avg:155.98ms step:990/1480 train_time:152874ms step_avg:155.99ms step:991/1480 train_time:153042ms step_avg:156.01ms step:992/1480 
train_time:153217ms step_avg:156.03ms step:993/1480 train_time:153393ms step_avg:156.05ms step:994/1480 train_time:153559ms step_avg:156.06ms step:995/1480 train_time:153723ms step_avg:156.06ms step:996/1480 train_time:153885ms step_avg:156.07ms step:997/1480 train_time:154052ms step_avg:156.08ms step:998/1480 train_time:154215ms step_avg:156.09ms step:999/1480 train_time:154380ms step_avg:156.10ms step:1000/1480 train_time:154550ms step_avg:156.11ms step:1000/1480 val_loss:3.4425 train_time:154616ms step_avg:156.18ms step:1001/1480 train_time:154719ms step_avg:156.12ms step:1002/1480 train_time:154886ms step_avg:156.13ms step:1003/1480 train_time:155057ms step_avg:156.15ms step:1004/1480 train_time:155226ms step_avg:156.16ms step:1005/1480 train_time:155394ms step_avg:156.17ms step:1006/1480 train_time:155561ms step_avg:156.19ms step:1007/1480 train_time:155726ms step_avg:156.19ms step:1008/1480 train_time:155894ms step_avg:156.21ms step:1009/1480 train_time:156067ms step_avg:156.22ms step:1010/1480 train_time:156232ms step_avg:156.23ms step:1011/1480 train_time:156398ms step_avg:156.24ms step:1012/1480 train_time:156563ms step_avg:156.25ms step:1013/1480 train_time:156733ms step_avg:156.26ms step:1014/1480 train_time:156900ms step_avg:156.28ms step:1015/1480 train_time:157070ms step_avg:156.29ms step:1016/1480 train_time:157240ms step_avg:156.30ms step:1017/1480 train_time:157412ms step_avg:156.32ms step:1018/1480 train_time:157581ms step_avg:156.33ms step:1019/1480 train_time:157749ms step_avg:156.34ms step:1020/1480 train_time:157919ms step_avg:156.36ms step:1021/1480 train_time:158085ms step_avg:156.36ms step:1022/1480 train_time:158252ms step_avg:156.38ms step:1023/1480 train_time:158419ms step_avg:156.39ms step:1024/1480 train_time:158585ms step_avg:156.40ms step:1025/1480 train_time:158756ms step_avg:156.41ms step:1026/1480 train_time:158921ms step_avg:156.42ms step:1027/1480 train_time:159087ms step_avg:156.43ms step:1028/1480 train_time:159260ms 
step_avg:156.44ms step:1029/1480 train_time:159434ms step_avg:156.46ms step:1030/1480 train_time:159601ms step_avg:156.47ms step:1031/1480 train_time:159765ms step_avg:156.48ms step:1032/1480 train_time:159939ms step_avg:156.50ms step:1033/1480 train_time:160106ms step_avg:156.51ms step:1034/1480 train_time:160272ms step_avg:156.52ms step:1035/1480 train_time:160441ms step_avg:156.53ms step:1036/1480 train_time:160605ms step_avg:156.54ms step:1037/1480 train_time:160772ms step_avg:156.55ms step:1038/1480 train_time:160939ms step_avg:156.56ms step:1039/1480 train_time:161108ms step_avg:156.57ms step:1040/1480 train_time:161274ms step_avg:156.58ms step:1041/1480 train_time:161442ms step_avg:156.59ms step:1042/1480 train_time:161605ms step_avg:156.59ms step:1043/1480 train_time:161769ms step_avg:156.60ms step:1044/1480 train_time:161934ms step_avg:156.61ms step:1045/1480 train_time:162103ms step_avg:156.62ms step:1046/1480 train_time:162271ms step_avg:156.63ms step:1047/1480 train_time:162436ms step_avg:156.64ms step:1048/1480 train_time:162603ms step_avg:156.65ms step:1049/1480 train_time:162768ms step_avg:156.66ms step:1050/1480 train_time:162938ms step_avg:156.67ms step:1051/1480 train_time:163109ms step_avg:156.68ms step:1052/1480 train_time:163278ms step_avg:156.70ms step:1053/1480 train_time:163443ms step_avg:156.71ms step:1054/1480 train_time:163611ms step_avg:156.72ms step:1055/1480 train_time:163778ms step_avg:156.72ms step:1056/1480 train_time:163943ms step_avg:156.73ms step:1057/1480 train_time:164108ms step_avg:156.74ms step:1058/1480 train_time:164278ms step_avg:156.75ms step:1059/1480 train_time:164451ms step_avg:156.77ms step:1060/1480 train_time:164620ms step_avg:156.78ms step:1061/1480 train_time:164783ms step_avg:156.79ms step:1062/1480 train_time:164950ms step_avg:156.80ms step:1063/1480 train_time:165114ms step_avg:156.80ms step:1064/1480 train_time:165278ms step_avg:156.81ms step:1065/1480 train_time:165445ms step_avg:156.82ms step:1066/1480 
train_time:165612ms step_avg:156.83ms step:1067/1480 train_time:165781ms step_avg:156.84ms step:1068/1480 train_time:165946ms step_avg:156.85ms step:1069/1480 train_time:166119ms step_avg:156.86ms step:1070/1480 train_time:166284ms step_avg:156.87ms step:1071/1480 train_time:166457ms step_avg:156.89ms step:1072/1480 train_time:166624ms step_avg:156.90ms step:1073/1480 train_time:166787ms step_avg:156.90ms step:1074/1480 train_time:166956ms step_avg:156.91ms step:1075/1480 train_time:167126ms step_avg:156.93ms step:1076/1480 train_time:167292ms step_avg:156.93ms step:1077/1480 train_time:167457ms step_avg:156.94ms step:1078/1480 train_time:167630ms step_avg:156.96ms step:1079/1480 train_time:167803ms step_avg:156.97ms step:1080/1480 train_time:167971ms step_avg:156.98ms step:1081/1480 train_time:168138ms step_avg:156.99ms step:1082/1480 train_time:168305ms step_avg:157.00ms step:1083/1480 train_time:168471ms step_avg:157.01ms step:1084/1480 train_time:168639ms step_avg:157.02ms step:1085/1480 train_time:168807ms step_avg:157.03ms step:1086/1480 train_time:168973ms step_avg:157.04ms step:1087/1480 train_time:169140ms step_avg:157.05ms step:1088/1480 train_time:169310ms step_avg:157.06ms step:1089/1480 train_time:169483ms step_avg:157.07ms step:1090/1480 train_time:169656ms step_avg:157.09ms step:1091/1480 train_time:169825ms step_avg:157.10ms step:1092/1480 train_time:169993ms step_avg:157.11ms step:1093/1480 train_time:170162ms step_avg:157.12ms step:1094/1480 train_time:170328ms step_avg:157.13ms step:1095/1480 train_time:170495ms step_avg:157.14ms step:1096/1480 train_time:170663ms step_avg:157.15ms step:1097/1480 train_time:170832ms step_avg:157.16ms step:1098/1480 train_time:171003ms step_avg:157.17ms step:1099/1480 train_time:171175ms step_avg:157.19ms step:1100/1480 train_time:171345ms step_avg:157.20ms step:1101/1480 train_time:171516ms step_avg:157.21ms step:1102/1480 train_time:171687ms step_avg:157.22ms step:1103/1480 train_time:171862ms step_avg:157.24ms 
step:1104/1480 train_time:172030ms step_avg:157.25ms step:1105/1480 train_time:172203ms step_avg:157.26ms step:1106/1480 train_time:172371ms step_avg:157.27ms step:1107/1480 train_time:172540ms step_avg:157.28ms step:1108/1480 train_time:172705ms step_avg:157.29ms step:1109/1480 train_time:172869ms step_avg:157.30ms step:1110/1480 train_time:173034ms step_avg:157.30ms step:1111/1480 train_time:173202ms step_avg:157.31ms step:1112/1480 train_time:173370ms step_avg:157.32ms step:1113/1480 train_time:173551ms step_avg:157.34ms step:1114/1480 train_time:173723ms step_avg:157.36ms step:1115/1480 train_time:173896ms step_avg:157.37ms step:1116/1480 train_time:174062ms step_avg:157.38ms step:1117/1480 train_time:174235ms step_avg:157.39ms step:1118/1480 train_time:174410ms step_avg:157.41ms step:1119/1480 train_time:174577ms step_avg:157.42ms step:1120/1480 train_time:174745ms step_avg:157.43ms step:1121/1480 train_time:174915ms step_avg:157.44ms step:1122/1480 train_time:175081ms step_avg:157.45ms step:1123/1480 train_time:175247ms step_avg:157.45ms step:1124/1480 train_time:175416ms step_avg:157.46ms step:1125/1480 train_time:175583ms step_avg:157.47ms step:1125/1480 val_loss:3.3871 train_time:175651ms step_avg:157.53ms step:1126/1480 train_time:175753ms step_avg:157.48ms step:1127/1480 train_time:175921ms step_avg:157.49ms step:1128/1480 train_time:176092ms step_avg:157.51ms step:1129/1480 train_time:176266ms step_avg:157.52ms step:1130/1480 train_time:176435ms step_avg:157.53ms step:1131/1480 train_time:176613ms step_avg:157.55ms step:1132/1480 train_time:176778ms step_avg:157.56ms step:1133/1480 train_time:176950ms step_avg:157.57ms step:1134/1480 train_time:177120ms step_avg:157.58ms step:1135/1480 train_time:177289ms step_avg:157.59ms step:1136/1480 train_time:177459ms step_avg:157.60ms step:1137/1480 train_time:177627ms step_avg:157.61ms step:1138/1480 train_time:177798ms step_avg:157.62ms step:1139/1480 train_time:177967ms step_avg:157.63ms step:1140/1480 
train_time:178134ms step_avg:157.64ms step:1141/1480 train_time:178306ms step_avg:157.65ms step:1142/1480 train_time:178474ms step_avg:157.66ms step:1143/1480 train_time:178645ms step_avg:157.67ms step:1144/1480 train_time:178813ms step_avg:157.68ms step:1145/1480 train_time:178978ms step_avg:157.69ms step:1146/1480 train_time:179151ms step_avg:157.70ms step:1147/1480 train_time:179320ms step_avg:157.71ms step:1148/1480 train_time:179488ms step_avg:157.72ms step:1149/1480 train_time:179658ms step_avg:157.73ms step:1150/1480 train_time:179827ms step_avg:157.74ms step:1151/1480 train_time:179997ms step_avg:157.75ms step:1152/1480 train_time:180169ms step_avg:157.77ms step:1153/1480 train_time:180343ms step_avg:157.78ms step:1154/1480 train_time:180512ms step_avg:157.79ms step:1155/1480 train_time:180682ms step_avg:157.80ms step:1156/1480 train_time:180860ms step_avg:157.82ms step:1157/1480 train_time:181031ms step_avg:157.83ms step:1158/1480 train_time:181198ms step_avg:157.84ms step:1159/1480 train_time:181364ms step_avg:157.85ms step:1160/1480 train_time:181530ms step_avg:157.85ms step:1161/1480 train_time:181701ms step_avg:157.86ms step:1162/1480 train_time:181871ms step_avg:157.87ms step:1163/1480 train_time:182040ms step_avg:157.88ms step:1164/1480 train_time:182210ms step_avg:157.89ms step:1165/1480 train_time:182375ms step_avg:157.90ms step:1166/1480 train_time:182546ms step_avg:157.91ms step:1167/1480 train_time:182713ms step_avg:157.92ms step:1168/1480 train_time:182881ms step_avg:157.93ms step:1169/1480 train_time:183050ms step_avg:157.94ms step:1170/1480 train_time:183218ms step_avg:157.95ms step:1171/1480 train_time:183385ms step_avg:157.95ms step:1172/1480 train_time:183551ms step_avg:157.96ms step:1173/1480 train_time:183721ms step_avg:157.97ms step:1174/1480 train_time:183903ms step_avg:157.99ms step:1175/1480 train_time:184075ms step_avg:158.00ms step:1176/1480 train_time:184248ms step_avg:158.02ms step:1177/1480 train_time:184423ms step_avg:158.03ms 
step:1178/1480 train_time:184590ms step_avg:158.04ms step:1179/1480 train_time:184756ms step_avg:158.05ms step:1180/1480 train_time:184937ms step_avg:158.07ms step:1181/1480 train_time:185108ms step_avg:158.08ms step:1182/1480 train_time:185276ms step_avg:158.09ms step:1183/1480 train_time:185449ms step_avg:158.10ms step:1184/1480 train_time:185616ms step_avg:158.11ms step:1185/1480 train_time:185789ms step_avg:158.12ms step:1186/1480 train_time:185960ms step_avg:158.13ms step:1187/1480 train_time:186144ms step_avg:158.15ms step:1188/1480 train_time:186311ms step_avg:158.16ms step:1189/1480 train_time:186481ms step_avg:158.17ms step:1190/1480 train_time:186650ms step_avg:158.18ms step:1191/1480 train_time:186820ms step_avg:158.19ms step:1192/1480 train_time:186986ms step_avg:158.19ms step:1193/1480 train_time:187153ms step_avg:158.20ms step:1194/1480 train_time:187320ms step_avg:158.21ms step:1195/1480 train_time:187493ms step_avg:158.22ms step:1196/1480 train_time:187675ms step_avg:158.24ms step:1197/1480 train_time:187845ms step_avg:158.25ms step:1198/1480 train_time:188031ms step_avg:158.28ms step:1199/1480 train_time:188201ms step_avg:158.29ms step:1200/1480 train_time:188371ms step_avg:158.29ms step:1201/1480 train_time:188537ms step_avg:158.30ms step:1202/1480 train_time:188719ms step_avg:158.32ms step:1203/1480 train_time:188895ms step_avg:158.34ms step:1204/1480 train_time:189070ms step_avg:158.35ms step:1205/1480 train_time:189238ms step_avg:158.36ms step:1206/1480 train_time:189408ms step_avg:158.37ms step:1207/1480 train_time:189577ms step_avg:158.38ms step:1208/1480 train_time:189746ms step_avg:158.39ms step:1209/1480 train_time:189920ms step_avg:158.40ms step:1210/1480 train_time:190095ms step_avg:158.41ms step:1211/1480 train_time:190271ms step_avg:158.43ms step:1212/1480 train_time:190443ms step_avg:158.44ms step:1213/1480 train_time:190616ms step_avg:158.45ms step:1214/1480 train_time:190794ms step_avg:158.47ms step:1215/1480 train_time:190967ms 
step_avg:158.48ms step:1216/1480 train_time:191136ms step_avg:158.49ms step:1217/1480 train_time:191310ms step_avg:158.50ms step:1218/1480 train_time:191479ms step_avg:158.51ms step:1219/1480 train_time:191659ms step_avg:158.53ms step:1220/1480 train_time:191829ms step_avg:158.54ms step:1221/1480 train_time:191996ms step_avg:158.54ms step:1222/1480 train_time:192163ms step_avg:158.55ms step:1223/1480 train_time:192334ms step_avg:158.56ms step:1224/1480 train_time:192512ms step_avg:158.58ms step:1225/1480 train_time:192684ms step_avg:158.59ms step:1226/1480 train_time:192857ms step_avg:158.60ms step:1227/1480 train_time:193030ms step_avg:158.61ms step:1228/1480 train_time:193199ms step_avg:158.62ms step:1229/1480 train_time:193372ms step_avg:158.63ms step:1230/1480 train_time:193554ms step_avg:158.65ms step:1231/1480 train_time:193730ms step_avg:158.66ms step:1232/1480 train_time:193905ms step_avg:158.68ms step:1233/1480 train_time:194074ms step_avg:158.69ms step:1234/1480 train_time:194245ms step_avg:158.70ms step:1235/1480 train_time:194420ms step_avg:158.71ms step:1236/1480 train_time:194588ms step_avg:158.72ms step:1237/1480 train_time:194759ms step_avg:158.73ms step:1238/1480 train_time:194943ms step_avg:158.75ms step:1239/1480 train_time:195114ms step_avg:158.76ms step:1240/1480 train_time:195284ms step_avg:158.77ms step:1241/1480 train_time:195457ms step_avg:158.78ms step:1242/1480 train_time:195626ms step_avg:158.79ms step:1243/1480 train_time:195797ms step_avg:158.80ms step:1244/1480 train_time:195964ms step_avg:158.80ms step:1245/1480 train_time:196133ms step_avg:158.81ms step:1246/1480 train_time:196303ms step_avg:158.82ms step:1247/1480 train_time:196472ms step_avg:158.83ms step:1248/1480 train_time:196642ms step_avg:158.84ms step:1249/1480 train_time:196811ms step_avg:158.85ms step:1250/1480 train_time:196980ms step_avg:158.86ms step:1250/1480 val_loss:3.3372 train_time:197052ms step_avg:158.91ms step:1251/1480 train_time:197163ms step_avg:158.87ms 
step:1252/1480 train_time:197332ms step_avg:158.88ms step:1253/1480 train_time:197501ms step_avg:158.89ms step:1254/1480 train_time:197673ms step_avg:158.90ms step:1255/1480 train_time:197859ms step_avg:158.92ms step:1256/1480 train_time:198033ms step_avg:158.94ms step:1257/1480 train_time:198203ms step_avg:158.94ms step:1258/1480 train_time:198380ms step_avg:158.96ms step:1259/1480 train_time:198553ms step_avg:158.97ms step:1260/1480 train_time:198721ms step_avg:158.98ms step:1261/1480 train_time:198893ms step_avg:158.99ms step:1262/1480 train_time:199068ms step_avg:159.00ms step:1263/1480 train_time:199244ms step_avg:159.01ms step:1264/1480 train_time:199413ms step_avg:159.02ms step:1265/1480 train_time:199580ms step_avg:159.03ms step:1266/1480 train_time:199751ms step_avg:159.04ms step:1267/1480 train_time:199921ms step_avg:159.05ms step:1268/1480 train_time:200093ms step_avg:159.06ms step:1269/1480 train_time:200269ms step_avg:159.07ms step:1270/1480 train_time:200439ms step_avg:159.08ms step:1271/1480 train_time:200608ms step_avg:159.09ms step:1272/1480 train_time:200773ms step_avg:159.09ms step:1273/1480 train_time:200944ms step_avg:159.10ms step:1274/1480 train_time:201118ms step_avg:159.11ms step:1275/1480 train_time:201285ms step_avg:159.12ms step:1276/1480 train_time:201450ms step_avg:159.12ms step:1277/1480 train_time:201621ms step_avg:159.13ms step:1278/1480 train_time:201790ms step_avg:159.14ms step:1279/1480 train_time:201962ms step_avg:159.15ms step:1280/1480 train_time:202142ms step_avg:159.17ms step:1281/1480 train_time:202310ms step_avg:159.17ms step:1282/1480 train_time:202477ms step_avg:159.18ms step:1283/1480 train_time:202646ms step_avg:159.19ms step:1284/1480 train_time:202815ms step_avg:159.20ms step:1285/1480 train_time:202984ms step_avg:159.20ms step:1286/1480 train_time:203153ms step_avg:159.21ms step:1287/1480 train_time:203325ms step_avg:159.22ms step:1288/1480 train_time:203496ms step_avg:159.23ms step:1289/1480 train_time:203682ms 
step_avg:159.25ms step:1290/1480 train_time:203862ms step_avg:159.27ms step:1291/1480 train_time:204034ms step_avg:159.28ms step:1292/1480 train_time:204208ms step_avg:159.29ms step:1293/1480 train_time:204382ms step_avg:159.30ms step:1294/1480 train_time:204555ms step_avg:159.31ms step:1295/1480 train_time:204727ms step_avg:159.32ms step:1296/1480 train_time:204902ms step_avg:159.33ms step:1297/1480 train_time:205076ms step_avg:159.34ms step:1298/1480 train_time:205246ms step_avg:159.35ms step:1299/1480 train_time:205418ms step_avg:159.36ms step:1300/1480 train_time:205585ms step_avg:159.37ms step:1301/1480 train_time:205754ms step_avg:159.38ms step:1302/1480 train_time:205927ms step_avg:159.39ms step:1303/1480 train_time:206102ms step_avg:159.40ms step:1304/1480 train_time:206277ms step_avg:159.41ms step:1305/1480 train_time:206445ms step_avg:159.42ms step:1306/1480 train_time:206620ms step_avg:159.43ms step:1307/1480 train_time:206789ms step_avg:159.44ms step:1308/1480 train_time:206959ms step_avg:159.44ms step:1309/1480 train_time:207132ms step_avg:159.46ms step:1310/1480 train_time:207301ms step_avg:159.46ms step:1311/1480 train_time:207468ms step_avg:159.47ms step:1312/1480 train_time:207642ms step_avg:159.48ms step:1313/1480 train_time:207812ms step_avg:159.49ms step:1314/1480 train_time:207985ms step_avg:159.50ms step:1315/1480 train_time:208156ms step_avg:159.51ms step:1316/1480 train_time:208323ms step_avg:159.51ms step:1317/1480 train_time:208494ms step_avg:159.52ms step:1318/1480 train_time:208674ms step_avg:159.54ms step:1319/1480 train_time:208851ms step_avg:159.55ms step:1320/1480 train_time:209026ms step_avg:159.56ms step:1321/1480 train_time:209198ms step_avg:159.57ms step:1322/1480 train_time:209380ms step_avg:159.59ms step:1323/1480 train_time:209551ms step_avg:159.60ms step:1324/1480 train_time:209725ms step_avg:159.61ms step:1325/1480 train_time:209908ms step_avg:159.63ms step:1326/1480 train_time:210085ms step_avg:159.64ms step:1327/1480 
train_time:210255ms step_avg:159.65ms step:1328/1480 train_time:210424ms step_avg:159.65ms step:1329/1480 train_time:210620ms step_avg:159.68ms step:1330/1480 train_time:210798ms step_avg:159.70ms step:1331/1480 train_time:210968ms step_avg:159.70ms step:1332/1480 train_time:211142ms step_avg:159.71ms step:1333/1480 train_time:211318ms step_avg:159.73ms step:1334/1480 train_time:211487ms step_avg:159.73ms step:1335/1480 train_time:211656ms step_avg:159.74ms step:1336/1480 train_time:211840ms step_avg:159.76ms step:1337/1480 train_time:212016ms step_avg:159.77ms step:1338/1480 train_time:212187ms step_avg:159.78ms step:1339/1480 train_time:212361ms step_avg:159.79ms step:1340/1480 train_time:212534ms step_avg:159.80ms step:1341/1480 train_time:212701ms step_avg:159.81ms step:1342/1480 train_time:212875ms step_avg:159.82ms step:1343/1480 train_time:213044ms step_avg:159.82ms step:1344/1480 train_time:213216ms step_avg:159.83ms step:1345/1480 train_time:213394ms step_avg:159.85ms step:1346/1480 train_time:213563ms step_avg:159.85ms step:1347/1480 train_time:213732ms step_avg:159.86ms step:1348/1480 train_time:213902ms step_avg:159.87ms step:1349/1480 train_time:214072ms step_avg:159.87ms step:1350/1480 train_time:214247ms step_avg:159.89ms step:1351/1480 train_time:214418ms step_avg:159.89ms step:1352/1480 train_time:214589ms step_avg:159.90ms step:1353/1480 train_time:214767ms step_avg:159.92ms step:1354/1480 train_time:214939ms step_avg:159.92ms step:1355/1480 train_time:215106ms step_avg:159.93ms step:1356/1480 train_time:215280ms step_avg:159.94ms step:1357/1480 train_time:215453ms step_avg:159.95ms step:1358/1480 train_time:215624ms step_avg:159.96ms step:1359/1480 train_time:215797ms step_avg:159.97ms step:1360/1480 train_time:215972ms step_avg:159.98ms step:1361/1480 train_time:216149ms step_avg:159.99ms step:1362/1480 train_time:216324ms step_avg:160.00ms step:1363/1480 train_time:216505ms step_avg:160.02ms step:1364/1480 train_time:216673ms step_avg:160.02ms 
step:1365/1480 train_time:216841ms step_avg:160.03ms step:1366/1480 train_time:217014ms step_avg:160.04ms step:1367/1480 train_time:217185ms step_avg:160.05ms step:1368/1480 train_time:217358ms step_avg:160.06ms step:1369/1480 train_time:217539ms step_avg:160.07ms step:1370/1480 train_time:217714ms step_avg:160.08ms step:1371/1480 train_time:217885ms step_avg:160.09ms step:1372/1480 train_time:218064ms step_avg:160.11ms step:1373/1480 train_time:218233ms step_avg:160.11ms step:1374/1480 train_time:218407ms step_avg:160.12ms step:1375/1480 train_time:218579ms step_avg:160.13ms step:1375/1480 val_loss:3.2984 train_time:218647ms step_avg:160.18ms step:1376/1480 train_time:218753ms step_avg:160.14ms step:1377/1480 train_time:218926ms step_avg:160.15ms step:1378/1480 train_time:219093ms step_avg:160.16ms step:1379/1480 train_time:219270ms step_avg:160.17ms step:1380/1480 train_time:219444ms step_avg:160.18ms step:1381/1480 train_time:219627ms step_avg:160.19ms step:1382/1480 train_time:219798ms step_avg:160.20ms step:1383/1480 train_time:219970ms step_avg:160.21ms step:1384/1480 train_time:220147ms step_avg:160.22ms step:1385/1480 train_time:220311ms step_avg:160.23ms step:1386/1480 train_time:220484ms step_avg:160.24ms step:1387/1480 train_time:220655ms step_avg:160.24ms step:1388/1480 train_time:220824ms step_avg:160.25ms step:1389/1480 train_time:220998ms step_avg:160.26ms step:1390/1480 train_time:221166ms step_avg:160.27ms step:1391/1480 train_time:221336ms step_avg:160.27ms step:1392/1480 train_time:221510ms step_avg:160.28ms step:1393/1480 train_time:221681ms step_avg:160.29ms step:1394/1480 train_time:221850ms step_avg:160.30ms step:1395/1480 train_time:222019ms step_avg:160.30ms step:1396/1480 train_time:222188ms step_avg:160.31ms step:1397/1480 train_time:222354ms step_avg:160.31ms step:1398/1480 train_time:222522ms step_avg:160.32ms step:1399/1480 train_time:222691ms step_avg:160.32ms step:1400/1480 train_time:222868ms step_avg:160.34ms step:1401/1480 
train_time:223034ms step_avg:160.34ms step:1402/1480 train_time:223206ms step_avg:160.35ms step:1403/1480 train_time:223382ms step_avg:160.36ms step:1404/1480 train_time:223553ms step_avg:160.37ms step:1405/1480 train_time:223727ms step_avg:160.38ms step:1406/1480 train_time:223904ms step_avg:160.39ms step:1407/1480 train_time:224071ms step_avg:160.39ms step:1408/1480 train_time:224240ms step_avg:160.40ms step:1409/1480 train_time:224424ms step_avg:160.42ms step:1410/1480 train_time:224593ms step_avg:160.42ms step:1411/1480 train_time:224761ms step_avg:160.43ms step:1412/1480 train_time:224931ms step_avg:160.44ms step:1413/1480 train_time:225103ms step_avg:160.44ms step:1414/1480 train_time:225273ms step_avg:160.45ms step:1415/1480 train_time:225448ms step_avg:160.46ms step:1416/1480 train_time:225633ms step_avg:160.48ms step:1417/1480 train_time:225808ms step_avg:160.49ms step:1418/1480 train_time:225979ms step_avg:160.50ms step:1419/1480 train_time:226153ms step_avg:160.51ms step:1420/1480 train_time:226328ms step_avg:160.52ms step:1421/1480 train_time:226502ms step_avg:160.53ms step:1422/1480 train_time:226673ms step_avg:160.53ms step:1423/1480 train_time:226842ms step_avg:160.54ms step:1424/1480 train_time:227019ms step_avg:160.55ms step:1425/1480 train_time:227200ms step_avg:160.57ms step:1426/1480 train_time:227373ms step_avg:160.57ms step:1427/1480 train_time:227548ms step_avg:160.58ms step:1428/1480 train_time:227720ms step_avg:160.59ms step:1429/1480 train_time:227889ms step_avg:160.60ms step:1430/1480 train_time:228063ms step_avg:160.61ms step:1431/1480 train_time:228238ms step_avg:160.62ms step:1432/1480 train_time:228414ms step_avg:160.63ms step:1433/1480 train_time:228592ms step_avg:160.64ms step:1434/1480 train_time:228772ms step_avg:160.65ms step:1435/1480 train_time:228946ms step_avg:160.66ms step:1436/1480 train_time:229119ms step_avg:160.67ms step:1437/1480 train_time:229290ms step_avg:160.68ms step:1438/1480 train_time:229459ms step_avg:160.69ms 
step:1439/1480 train_time:229633ms step_avg:160.70ms step:1440/1480 train_time:229803ms step_avg:160.70ms step:1441/1480 train_time:229972ms step_avg:160.71ms step:1442/1480 train_time:230150ms step_avg:160.72ms step:1443/1480 train_time:230341ms step_avg:160.74ms step:1444/1480 train_time:230511ms step_avg:160.75ms step:1445/1480 train_time:230684ms step_avg:160.76ms step:1446/1480 train_time:230860ms step_avg:160.77ms step:1447/1480 train_time:231037ms step_avg:160.78ms step:1448/1480 train_time:231207ms step_avg:160.78ms step:1449/1480 train_time:231381ms step_avg:160.79ms step:1450/1480 train_time:231554ms step_avg:160.80ms step:1451/1480 train_time:231724ms step_avg:160.81ms step:1452/1480 train_time:231897ms step_avg:160.82ms step:1453/1480 train_time:232067ms step_avg:160.82ms step:1454/1480 train_time:232240ms step_avg:160.83ms step:1455/1480 train_time:232418ms step_avg:160.84ms step:1456/1480 train_time:232590ms step_avg:160.85ms step:1457/1480 train_time:232761ms step_avg:160.86ms step:1458/1480 train_time:232932ms step_avg:160.86ms step:1459/1480 train_time:233109ms step_avg:160.88ms step:1460/1480 train_time:233281ms step_avg:160.88ms step:1461/1480 train_time:233453ms step_avg:160.89ms step:1462/1480 train_time:233624ms step_avg:160.90ms step:1463/1480 train_time:233802ms step_avg:160.91ms step:1464/1480 train_time:233976ms step_avg:160.92ms step:1465/1480 train_time:234149ms step_avg:160.93ms step:1466/1480 train_time:234320ms step_avg:160.93ms step:1467/1480 train_time:234496ms step_avg:160.94ms step:1468/1480 train_time:234666ms step_avg:160.95ms step:1469/1480 train_time:234840ms step_avg:160.96ms step:1470/1480 train_time:235020ms step_avg:160.97ms step:1471/1480 train_time:235208ms step_avg:160.99ms step:1472/1480 train_time:235387ms step_avg:161.00ms step:1473/1480 train_time:235558ms step_avg:161.01ms step:1474/1480 train_time:235736ms step_avg:161.02ms step:1475/1480 train_time:235915ms step_avg:161.03ms step:1476/1480 train_time:236088ms 
step_avg:161.04ms step:1477/1480 train_time:236270ms step_avg:161.06ms step:1478/1480 train_time:236453ms step_avg:161.07ms step:1479/1480 train_time:236627ms step_avg:161.08ms step:1480/1480 train_time:236800ms step_avg:161.09ms step:1480/1480 val_loss:3.2791 train_time:236871ms step_avg:161.14ms