import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        # NOTE(review): as a registered buffer, inv_freq follows module-wide dtype casts
        # (the training script calls model.bfloat16()) -- confirm the reduced precision
        # of the angle table is intended.
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): dim 1 is the sequence, dim 3 is rotated
        seq_len = x.shape[1]
        # the tables are rebuilt only when the sequence length changes
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """Attend over x; `vi` is mixed into the values, `block_mask` restricts attention."""
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of x and x0, then attention and MLP residual branches."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initial mix is 1 * x + 0 * x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """GPT model with U-net style skip connections; forward returns the cross-entropy loss."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the training loss for one flat token sequence.

        Arguments:
            idx: 1D tensor of input token ids (asserted); its length is reshaped into
                blocks of 128, so it must be a multiple of 128.
            target: Target token ids, flattened before the cross-entropy.
            sliding_window: Tensor holding the attention window size, in tokens.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token id 50256 gives each position a document id
        docs = (idx == 50256).cumsum(0)
        # first / last document id inside every block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal, same document, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the mask above: a (q block, kv block) pair is kept when
            # it is causal, the document-id ranges of the two blocks overlap, and the blocks
            # are within the window (rounded up to whole blocks)
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort moves the kept kv blocks to the front, in index order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    # reads `ntok` uint16 tokens that follow the 256 * 4 byte header into pinned memory
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Serves each process its own T-token slice of the data shards matching a glob pattern."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # start over from the first shard
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the "cuda" device; targets are the inputs shifted by one."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # make sure the parent directory exists, otherwise open() below raises FileNotFoundError
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, also print it."""
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop does a single forward/backward per step, so accumulation is not supported
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# prefetch the first training batch; the loop fetches the next one after each backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 10:31:31 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 75W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 90W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 81W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 75W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 45C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 110W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type 
Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23756ms step_avg:nanms step:2/1480 train_time:23889ms step_avg:nanms step:3/1480 train_time:24028ms step_avg:nanms step:4/1480 train_time:24169ms step_avg:nanms step:5/1480 train_time:24310ms step_avg:nanms step:6/1480 train_time:24452ms step_avg:nanms step:7/1480 train_time:24593ms step_avg:nanms step:8/1480 train_time:24737ms step_avg:nanms step:9/1480 train_time:24882ms step_avg:nanms step:10/1480 train_time:25025ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:282ms step_avg:nanms step:13/1480 train_time:424ms step_avg:141.18ms step:14/1480 train_time:566ms step_avg:141.41ms step:15/1480 train_time:711ms step_avg:142.18ms step:16/1480 train_time:856ms step_avg:142.62ms step:17/1480 train_time:998ms step_avg:142.61ms step:18/1480 train_time:1142ms step_avg:142.70ms step:19/1480 train_time:1284ms step_avg:142.70ms step:20/1480 train_time:1429ms step_avg:142.94ms step:21/1480 train_time:1571ms step_avg:142.83ms step:22/1480 train_time:1715ms step_avg:142.89ms step:23/1480 train_time:1859ms step_avg:142.96ms step:24/1480 train_time:2002ms step_avg:143.02ms step:25/1480 train_time:2147ms step_avg:143.11ms step:26/1480 train_time:2290ms step_avg:143.14ms step:27/1480 train_time:2434ms step_avg:143.16ms step:28/1480 train_time:2576ms step_avg:143.14ms step:29/1480 train_time:2718ms 
step_avg:143.07ms step:30/1480 train_time:2860ms step_avg:142.98ms step:31/1480 train_time:3002ms step_avg:142.97ms step:32/1480 train_time:3146ms step_avg:143.01ms step:33/1480 train_time:3290ms step_avg:143.06ms step:34/1480 train_time:3435ms step_avg:143.11ms step:35/1480 train_time:3577ms step_avg:143.09ms step:36/1480 train_time:3721ms step_avg:143.10ms step:37/1480 train_time:3863ms step_avg:143.08ms step:38/1480 train_time:4007ms step_avg:143.11ms step:39/1480 train_time:4152ms step_avg:143.18ms step:40/1480 train_time:4296ms step_avg:143.21ms step:41/1480 train_time:4440ms step_avg:143.21ms step:42/1480 train_time:4583ms step_avg:143.23ms step:43/1480 train_time:4726ms step_avg:143.20ms step:44/1480 train_time:4866ms step_avg:143.13ms step:45/1480 train_time:5011ms step_avg:143.17ms step:46/1480 train_time:5155ms step_avg:143.19ms step:47/1480 train_time:5297ms step_avg:143.17ms step:48/1480 train_time:5441ms step_avg:143.18ms step:49/1480 train_time:5583ms step_avg:143.16ms step:50/1480 train_time:5727ms step_avg:143.17ms step:51/1480 train_time:5870ms step_avg:143.17ms step:52/1480 train_time:6013ms step_avg:143.17ms step:53/1480 train_time:6156ms step_avg:143.16ms step:54/1480 train_time:6298ms step_avg:143.14ms step:55/1480 train_time:6441ms step_avg:143.13ms step:56/1480 train_time:6584ms step_avg:143.13ms step:57/1480 train_time:6729ms step_avg:143.16ms step:58/1480 train_time:6873ms step_avg:143.19ms step:59/1480 train_time:7017ms step_avg:143.21ms step:60/1480 train_time:7160ms step_avg:143.20ms step:61/1480 train_time:7302ms step_avg:143.19ms step:62/1480 train_time:7446ms step_avg:143.20ms step:63/1480 train_time:7590ms step_avg:143.21ms step:64/1480 train_time:7734ms step_avg:143.23ms step:65/1480 train_time:7878ms step_avg:143.23ms step:66/1480 train_time:8020ms step_avg:143.22ms step:67/1480 train_time:8162ms step_avg:143.19ms step:68/1480 train_time:8306ms step_avg:143.21ms step:69/1480 train_time:8452ms step_avg:143.25ms step:70/1480 
train_time:8595ms step_avg:143.25ms step:71/1480 train_time:8739ms step_avg:143.26ms step:72/1480 train_time:8881ms step_avg:143.23ms step:73/1480 train_time:9023ms step_avg:143.23ms step:74/1480 train_time:9167ms step_avg:143.23ms step:75/1480 train_time:9312ms step_avg:143.26ms step:76/1480 train_time:9455ms step_avg:143.26ms step:77/1480 train_time:9598ms step_avg:143.26ms step:78/1480 train_time:9741ms step_avg:143.25ms step:79/1480 train_time:9884ms step_avg:143.25ms step:80/1480 train_time:10027ms step_avg:143.24ms step:81/1480 train_time:10169ms step_avg:143.23ms step:82/1480 train_time:10312ms step_avg:143.22ms step:83/1480 train_time:10456ms step_avg:143.23ms step:84/1480 train_time:10599ms step_avg:143.23ms step:85/1480 train_time:10742ms step_avg:143.23ms step:86/1480 train_time:10885ms step_avg:143.22ms step:87/1480 train_time:11027ms step_avg:143.21ms step:88/1480 train_time:11172ms step_avg:143.23ms step:89/1480 train_time:11316ms step_avg:143.23ms step:90/1480 train_time:11457ms step_avg:143.21ms step:91/1480 train_time:11598ms step_avg:143.19ms step:92/1480 train_time:11742ms step_avg:143.19ms step:93/1480 train_time:11885ms step_avg:143.19ms step:94/1480 train_time:12030ms step_avg:143.21ms step:95/1480 train_time:12174ms step_avg:143.22ms step:96/1480 train_time:12317ms step_avg:143.22ms step:97/1480 train_time:12459ms step_avg:143.21ms step:98/1480 train_time:12600ms step_avg:143.18ms step:99/1480 train_time:12742ms step_avg:143.17ms step:100/1480 train_time:12887ms step_avg:143.19ms step:101/1480 train_time:13032ms step_avg:143.20ms step:102/1480 train_time:13175ms step_avg:143.21ms step:103/1480 train_time:13318ms step_avg:143.20ms step:104/1480 train_time:13459ms step_avg:143.18ms step:105/1480 train_time:13600ms step_avg:143.16ms step:106/1480 train_time:13743ms step_avg:143.15ms step:107/1480 train_time:13885ms step_avg:143.14ms step:108/1480 train_time:14029ms step_avg:143.15ms step:109/1480 train_time:14173ms step_avg:143.16ms 
step:110/1480 train_time:14315ms step_avg:143.15ms step:111/1480 train_time:14459ms step_avg:143.16ms step:112/1480 train_time:14606ms step_avg:143.19ms step:113/1480 train_time:14754ms step_avg:143.24ms step:114/1480 train_time:14900ms step_avg:143.27ms step:115/1480 train_time:15045ms step_avg:143.29ms step:116/1480 train_time:15193ms step_avg:143.33ms step:117/1480 train_time:15342ms step_avg:143.38ms step:118/1480 train_time:15489ms step_avg:143.42ms step:119/1480 train_time:15637ms step_avg:143.46ms step:120/1480 train_time:15783ms step_avg:143.48ms step:121/1480 train_time:15929ms step_avg:143.50ms step:122/1480 train_time:16076ms step_avg:143.54ms step:123/1480 train_time:16222ms step_avg:143.56ms step:124/1480 train_time:16369ms step_avg:143.59ms step:125/1480 train_time:16517ms step_avg:143.63ms step:125/1480 val_loss:4.4116 train_time:16575ms step_avg:144.13ms step:126/1480 train_time:16669ms step_avg:143.70ms step:127/1480 train_time:16817ms step_avg:143.73ms step:128/1480 train_time:16964ms step_avg:143.76ms step:129/1480 train_time:17108ms step_avg:143.77ms step:130/1480 train_time:17255ms step_avg:143.79ms step:131/1480 train_time:17401ms step_avg:143.81ms step:132/1480 train_time:17547ms step_avg:143.83ms step:133/1480 train_time:17695ms step_avg:143.87ms step:134/1480 train_time:17844ms step_avg:143.91ms step:135/1480 train_time:17991ms step_avg:143.93ms step:136/1480 train_time:18139ms step_avg:143.96ms step:137/1480 train_time:18286ms step_avg:143.98ms step:138/1480 train_time:18433ms step_avg:144.01ms step:139/1480 train_time:18581ms step_avg:144.04ms step:140/1480 train_time:18728ms step_avg:144.06ms step:141/1480 train_time:18877ms step_avg:144.10ms step:142/1480 train_time:19024ms step_avg:144.12ms step:143/1480 train_time:19171ms step_avg:144.14ms step:144/1480 train_time:19318ms step_avg:144.16ms step:145/1480 train_time:19465ms step_avg:144.18ms step:146/1480 train_time:19613ms step_avg:144.21ms step:147/1480 train_time:19761ms 
step_avg:144.24ms step:148/1480 train_time:19908ms step_avg:144.26ms step:149/1480 train_time:20057ms step_avg:144.29ms step:150/1480 train_time:20204ms step_avg:144.31ms step:151/1480 train_time:20349ms step_avg:144.32ms step:152/1480 train_time:20496ms step_avg:144.33ms step:153/1480 train_time:20643ms step_avg:144.36ms step:154/1480 train_time:20788ms step_avg:144.36ms step:155/1480 train_time:20938ms step_avg:144.40ms step:156/1480 train_time:21085ms step_avg:144.42ms step:157/1480 train_time:21233ms step_avg:144.44ms step:158/1480 train_time:21381ms step_avg:144.46ms step:159/1480 train_time:21527ms step_avg:144.47ms step:160/1480 train_time:21675ms step_avg:144.50ms step:161/1480 train_time:21822ms step_avg:144.51ms step:162/1480 train_time:21970ms step_avg:144.54ms step:163/1480 train_time:22118ms step_avg:144.56ms step:164/1480 train_time:22265ms step_avg:144.58ms step:165/1480 train_time:22410ms step_avg:144.58ms step:166/1480 train_time:22559ms step_avg:144.61ms step:167/1480 train_time:22706ms step_avg:144.62ms step:168/1480 train_time:22853ms step_avg:144.64ms step:169/1480 train_time:23000ms step_avg:144.66ms step:170/1480 train_time:23147ms step_avg:144.67ms step:171/1480 train_time:23294ms step_avg:144.68ms step:172/1480 train_time:23442ms step_avg:144.70ms step:173/1480 train_time:23588ms step_avg:144.71ms step:174/1480 train_time:23736ms step_avg:144.73ms step:175/1480 train_time:23884ms step_avg:144.75ms step:176/1480 train_time:24030ms step_avg:144.76ms step:177/1480 train_time:24179ms step_avg:144.79ms step:178/1480 train_time:24325ms step_avg:144.79ms step:179/1480 train_time:24473ms step_avg:144.81ms step:180/1480 train_time:24620ms step_avg:144.82ms step:181/1480 train_time:24767ms step_avg:144.84ms step:182/1480 train_time:24917ms step_avg:144.86ms step:183/1480 train_time:25064ms step_avg:144.88ms step:184/1480 train_time:25208ms step_avg:144.88ms step:185/1480 train_time:25357ms step_avg:144.90ms step:186/1480 train_time:25504ms 
step_avg:144.91ms step:187/1480 train_time:25651ms step_avg:144.92ms step:188/1480 train_time:25798ms step_avg:144.93ms step:189/1480 train_time:25945ms step_avg:144.95ms step:190/1480 train_time:26091ms step_avg:144.95ms step:191/1480 train_time:26240ms step_avg:144.97ms step:192/1480 train_time:26386ms step_avg:144.98ms step:193/1480 train_time:26533ms step_avg:144.99ms step:194/1480 train_time:26681ms step_avg:145.01ms step:195/1480 train_time:26827ms step_avg:145.01ms step:196/1480 train_time:26975ms step_avg:145.03ms step:197/1480 train_time:27122ms step_avg:145.04ms step:198/1480 train_time:27270ms step_avg:145.05ms step:199/1480 train_time:27418ms step_avg:145.07ms step:200/1480 train_time:27565ms step_avg:145.08ms step:201/1480 train_time:27712ms step_avg:145.09ms step:202/1480 train_time:27860ms step_avg:145.10ms step:203/1480 train_time:28006ms step_avg:145.11ms step:204/1480 train_time:28154ms step_avg:145.12ms step:205/1480 train_time:28302ms step_avg:145.14ms step:206/1480 train_time:28449ms step_avg:145.15ms step:207/1480 train_time:28596ms step_avg:145.16ms step:208/1480 train_time:28744ms step_avg:145.17ms step:209/1480 train_time:28890ms step_avg:145.17ms step:210/1480 train_time:29037ms step_avg:145.19ms step:211/1480 train_time:29185ms step_avg:145.20ms step:212/1480 train_time:29332ms step_avg:145.21ms step:213/1480 train_time:29479ms step_avg:145.22ms step:214/1480 train_time:29626ms step_avg:145.22ms step:215/1480 train_time:29774ms step_avg:145.24ms step:216/1480 train_time:29921ms step_avg:145.25ms step:217/1480 train_time:30068ms step_avg:145.26ms step:218/1480 train_time:30216ms step_avg:145.27ms step:219/1480 train_time:30364ms step_avg:145.28ms step:220/1480 train_time:30509ms step_avg:145.28ms step:221/1480 train_time:30659ms step_avg:145.30ms step:222/1480 train_time:30808ms step_avg:145.32ms step:223/1480 train_time:30960ms step_avg:145.35ms step:224/1480 train_time:31110ms step_avg:145.37ms step:225/1480 train_time:31261ms 
step_avg:145.40ms step:226/1480 train_time:31411ms step_avg:145.42ms step:227/1480 train_time:31562ms step_avg:145.45ms step:228/1480 train_time:31712ms step_avg:145.47ms step:229/1480 train_time:31864ms step_avg:145.50ms step:230/1480 train_time:32014ms step_avg:145.52ms step:231/1480 train_time:32166ms step_avg:145.55ms step:232/1480 train_time:32318ms step_avg:145.57ms step:233/1480 train_time:32469ms step_avg:145.60ms step:234/1480 train_time:32619ms step_avg:145.62ms step:235/1480 train_time:32769ms step_avg:145.64ms step:236/1480 train_time:32920ms step_avg:145.66ms step:237/1480 train_time:33070ms step_avg:145.68ms step:238/1480 train_time:33220ms step_avg:145.70ms step:239/1480 train_time:33371ms step_avg:145.72ms step:240/1480 train_time:33522ms step_avg:145.75ms step:241/1480 train_time:33672ms step_avg:145.77ms step:242/1480 train_time:33822ms step_avg:145.78ms step:243/1480 train_time:33972ms step_avg:145.80ms step:244/1480 train_time:34123ms step_avg:145.82ms step:245/1480 train_time:34273ms step_avg:145.84ms step:246/1480 train_time:34423ms step_avg:145.86ms step:247/1480 train_time:34574ms step_avg:145.88ms step:248/1480 train_time:34725ms step_avg:145.90ms step:249/1480 train_time:34877ms step_avg:145.93ms step:250/1480 train_time:35027ms step_avg:145.94ms step:250/1480 val_loss:3.9994 train_time:35086ms step_avg:146.19ms step:251/1480 train_time:35184ms step_avg:145.99ms step:252/1480 train_time:35337ms step_avg:146.02ms step:253/1480 train_time:35489ms step_avg:146.04ms step:254/1480 train_time:35638ms step_avg:146.06ms step:255/1480 train_time:35788ms step_avg:146.07ms step:256/1480 train_time:35937ms step_avg:146.09ms step:257/1480 train_time:36087ms step_avg:146.10ms step:258/1480 train_time:36239ms step_avg:146.12ms step:259/1480 train_time:36392ms step_avg:146.15ms step:260/1480 train_time:36541ms step_avg:146.16ms step:261/1480 train_time:36692ms step_avg:146.18ms step:262/1480 train_time:36841ms step_avg:146.19ms step:263/1480 
train_time:36991ms step_avg:146.21ms step:264/1480 train_time:37141ms step_avg:146.22ms step:265/1480 train_time:37293ms step_avg:146.25ms step:266/1480 train_time:37444ms step_avg:146.27ms step:267/1480 train_time:37594ms step_avg:146.28ms step:268/1480 train_time:37744ms step_avg:146.30ms step:269/1480 train_time:37894ms step_avg:146.31ms step:270/1480 train_time:38044ms step_avg:146.32ms step:271/1480 train_time:38196ms step_avg:146.35ms step:272/1480 train_time:38347ms step_avg:146.36ms step:273/1480 train_time:38497ms step_avg:146.38ms step:274/1480 train_time:38650ms step_avg:146.40ms step:275/1480 train_time:38800ms step_avg:146.42ms step:276/1480 train_time:38951ms step_avg:146.43ms step:277/1480 train_time:39099ms step_avg:146.44ms step:278/1480 train_time:39252ms step_avg:146.46ms step:279/1480 train_time:39402ms step_avg:146.48ms step:280/1480 train_time:39554ms step_avg:146.49ms step:281/1480 train_time:39704ms step_avg:146.51ms step:282/1480 train_time:39855ms step_avg:146.53ms step:283/1480 train_time:40005ms step_avg:146.54ms step:284/1480 train_time:40156ms step_avg:146.55ms step:285/1480 train_time:40307ms step_avg:146.57ms step:286/1480 train_time:40458ms step_avg:146.59ms step:287/1480 train_time:40610ms step_avg:146.61ms step:288/1480 train_time:40760ms step_avg:146.62ms step:289/1480 train_time:40910ms step_avg:146.63ms step:290/1480 train_time:41059ms step_avg:146.64ms step:291/1480 train_time:41209ms step_avg:146.65ms step:292/1480 train_time:41359ms step_avg:146.66ms step:293/1480 train_time:41511ms step_avg:146.68ms step:294/1480 train_time:41662ms step_avg:146.70ms step:295/1480 train_time:41813ms step_avg:146.71ms step:296/1480 train_time:41965ms step_avg:146.73ms step:297/1480 train_time:42115ms step_avg:146.74ms step:298/1480 train_time:42264ms step_avg:146.75ms step:299/1480 train_time:42415ms step_avg:146.76ms step:300/1480 train_time:42567ms step_avg:146.78ms step:301/1480 train_time:42717ms step_avg:146.79ms step:302/1480 
train_time:42868ms step_avg:146.81ms step:303/1480 train_time:43018ms step_avg:146.82ms step:304/1480 train_time:43169ms step_avg:146.83ms step:305/1480 train_time:43319ms step_avg:146.84ms step:306/1480 train_time:43469ms step_avg:146.85ms step:307/1480 train_time:43618ms step_avg:146.86ms step:308/1480 train_time:43769ms step_avg:146.88ms step:309/1480 train_time:43919ms step_avg:146.89ms step:310/1480 train_time:44070ms step_avg:146.90ms step:311/1480 train_time:44220ms step_avg:146.91ms step:312/1480 train_time:44371ms step_avg:146.92ms step:313/1480 train_time:44521ms step_avg:146.93ms step:314/1480 train_time:44671ms step_avg:146.94ms step:315/1480 train_time:44820ms step_avg:146.95ms step:316/1480 train_time:44971ms step_avg:146.96ms step:317/1480 train_time:45123ms step_avg:146.98ms step:318/1480 train_time:45273ms step_avg:146.99ms step:319/1480 train_time:45422ms step_avg:147.00ms step:320/1480 train_time:45572ms step_avg:147.01ms step:321/1480 train_time:45723ms step_avg:147.02ms step:322/1480 train_time:45874ms step_avg:147.03ms step:323/1480 train_time:46025ms step_avg:147.04ms step:324/1480 train_time:46176ms step_avg:147.06ms step:325/1480 train_time:46326ms step_avg:147.07ms step:326/1480 train_time:46477ms step_avg:147.08ms step:327/1480 train_time:46626ms step_avg:147.09ms step:328/1480 train_time:46779ms step_avg:147.10ms step:329/1480 train_time:46930ms step_avg:147.12ms step:330/1480 train_time:47082ms step_avg:147.13ms step:331/1480 train_time:47235ms step_avg:147.15ms step:332/1480 train_time:47388ms step_avg:147.17ms step:333/1480 train_time:47542ms step_avg:147.19ms step:334/1480 train_time:47696ms step_avg:147.21ms step:335/1480 train_time:47850ms step_avg:147.23ms step:336/1480 train_time:48004ms step_avg:147.25ms step:337/1480 train_time:48159ms step_avg:147.27ms step:338/1480 train_time:48313ms step_avg:147.30ms step:339/1480 train_time:48466ms step_avg:147.31ms step:340/1480 train_time:48620ms step_avg:147.33ms step:341/1480 
train_time:48773ms step_avg:147.35ms step:342/1480 train_time:48928ms step_avg:147.37ms step:343/1480 train_time:49082ms step_avg:147.39ms step:344/1480 train_time:49236ms step_avg:147.41ms step:345/1480 train_time:49390ms step_avg:147.43ms step:346/1480 train_time:49545ms step_avg:147.45ms step:347/1480 train_time:49698ms step_avg:147.47ms step:348/1480 train_time:49852ms step_avg:147.49ms step:349/1480 train_time:50007ms step_avg:147.51ms step:350/1480 train_time:50162ms step_avg:147.54ms step:351/1480 train_time:50316ms step_avg:147.55ms step:352/1480 train_time:50470ms step_avg:147.57ms step:353/1480 train_time:50624ms step_avg:147.59ms step:354/1480 train_time:50777ms step_avg:147.61ms step:355/1480 train_time:50930ms step_avg:147.62ms step:356/1480 train_time:51086ms step_avg:147.65ms step:357/1480 train_time:51239ms step_avg:147.66ms step:358/1480 train_time:51394ms step_avg:147.68ms step:359/1480 train_time:51550ms step_avg:147.71ms step:360/1480 train_time:51705ms step_avg:147.73ms step:361/1480 train_time:51859ms step_avg:147.75ms step:362/1480 train_time:52014ms step_avg:147.77ms step:363/1480 train_time:52169ms step_avg:147.79ms step:364/1480 train_time:52322ms step_avg:147.80ms step:365/1480 train_time:52475ms step_avg:147.82ms step:366/1480 train_time:52629ms step_avg:147.83ms step:367/1480 train_time:52781ms step_avg:147.85ms step:368/1480 train_time:52935ms step_avg:147.86ms step:369/1480 train_time:53091ms step_avg:147.88ms step:370/1480 train_time:53246ms step_avg:147.91ms step:371/1480 train_time:53400ms step_avg:147.92ms step:372/1480 train_time:53555ms step_avg:147.94ms step:373/1480 train_time:53709ms step_avg:147.96ms step:374/1480 train_time:53862ms step_avg:147.97ms step:375/1480 train_time:54015ms step_avg:147.99ms step:375/1480 val_loss:3.8062 train_time:54074ms step_avg:148.15ms step:376/1480 train_time:54170ms step_avg:148.01ms step:377/1480 train_time:54325ms step_avg:148.03ms step:378/1480 train_time:54478ms step_avg:148.04ms 
step:379/1480 train_time:54631ms step_avg:148.05ms step:380/1480 train_time:54783ms step_avg:148.06ms step:381/1480 train_time:54935ms step_avg:148.07ms step:382/1480 train_time:55089ms step_avg:148.09ms step:383/1480 train_time:55244ms step_avg:148.11ms step:384/1480 train_time:55400ms step_avg:148.13ms step:385/1480 train_time:55554ms step_avg:148.14ms step:386/1480 train_time:55708ms step_avg:148.16ms step:387/1480 train_time:55862ms step_avg:148.18ms step:388/1480 train_time:56016ms step_avg:148.19ms step:389/1480 train_time:56169ms step_avg:148.20ms step:390/1480 train_time:56324ms step_avg:148.22ms step:391/1480 train_time:56478ms step_avg:148.24ms step:392/1480 train_time:56631ms step_avg:148.25ms step:393/1480 train_time:56786ms step_avg:148.27ms step:394/1480 train_time:56939ms step_avg:148.28ms step:395/1480 train_time:57092ms step_avg:148.29ms step:396/1480 train_time:57246ms step_avg:148.31ms step:397/1480 train_time:57401ms step_avg:148.32ms step:398/1480 train_time:57555ms step_avg:148.34ms step:399/1480 train_time:57709ms step_avg:148.35ms step:400/1480 train_time:57864ms step_avg:148.37ms step:401/1480 train_time:58020ms step_avg:148.39ms step:402/1480 train_time:58175ms step_avg:148.41ms step:403/1480 train_time:58329ms step_avg:148.42ms step:404/1480 train_time:58485ms step_avg:148.44ms step:405/1480 train_time:58640ms step_avg:148.46ms step:406/1480 train_time:58793ms step_avg:148.47ms step:407/1480 train_time:58947ms step_avg:148.48ms step:408/1480 train_time:59100ms step_avg:148.49ms step:409/1480 train_time:59254ms step_avg:148.51ms step:410/1480 train_time:59407ms step_avg:148.52ms step:411/1480 train_time:59562ms step_avg:148.53ms step:412/1480 train_time:59716ms step_avg:148.55ms step:413/1480 train_time:59869ms step_avg:148.56ms step:414/1480 train_time:60023ms step_avg:148.57ms step:415/1480 train_time:60178ms step_avg:148.59ms step:416/1480 train_time:60332ms step_avg:148.60ms step:417/1480 train_time:60485ms step_avg:148.61ms 
step:418/1480 train_time:60640ms step_avg:148.63ms step:419/1480 train_time:60794ms step_avg:148.64ms step:420/1480 train_time:60948ms step_avg:148.65ms step:421/1480 train_time:61101ms step_avg:148.66ms step:422/1480 train_time:61255ms step_avg:148.68ms step:423/1480 train_time:61409ms step_avg:148.69ms step:424/1480 train_time:61563ms step_avg:148.70ms step:425/1480 train_time:61717ms step_avg:148.72ms step:426/1480 train_time:61870ms step_avg:148.73ms step:427/1480 train_time:62024ms step_avg:148.74ms step:428/1480 train_time:62179ms step_avg:148.75ms step:429/1480 train_time:62333ms step_avg:148.77ms step:430/1480 train_time:62487ms step_avg:148.78ms step:431/1480 train_time:62640ms step_avg:148.79ms step:432/1480 train_time:62792ms step_avg:148.80ms step:433/1480 train_time:62947ms step_avg:148.81ms step:434/1480 train_time:63101ms step_avg:148.82ms step:435/1480 train_time:63255ms step_avg:148.84ms step:436/1480 train_time:63409ms step_avg:148.85ms step:437/1480 train_time:63563ms step_avg:148.86ms step:438/1480 train_time:63718ms step_avg:148.87ms step:439/1480 train_time:63872ms step_avg:148.88ms step:440/1480 train_time:64025ms step_avg:148.90ms step:441/1480 train_time:64183ms step_avg:148.92ms step:442/1480 train_time:64341ms step_avg:148.94ms step:443/1480 train_time:64497ms step_avg:148.95ms step:444/1480 train_time:64651ms step_avg:148.97ms step:445/1480 train_time:64808ms step_avg:148.98ms step:446/1480 train_time:64964ms step_avg:149.00ms step:447/1480 train_time:65122ms step_avg:149.02ms step:448/1480 train_time:65279ms step_avg:149.04ms step:449/1480 train_time:65437ms step_avg:149.06ms step:450/1480 train_time:65596ms step_avg:149.08ms step:451/1480 train_time:65753ms step_avg:149.10ms step:452/1480 train_time:65908ms step_avg:149.11ms step:453/1480 train_time:66066ms step_avg:149.13ms step:454/1480 train_time:66223ms step_avg:149.15ms step:455/1480 train_time:66379ms step_avg:149.17ms step:456/1480 train_time:66535ms step_avg:149.18ms 
step:457/1480 train_time:66692ms step_avg:149.20ms step:458/1480 train_time:66848ms step_avg:149.21ms step:459/1480 train_time:67006ms step_avg:149.23ms step:460/1480 train_time:67164ms step_avg:149.25ms step:461/1480 train_time:67323ms step_avg:149.27ms step:462/1480 train_time:67480ms step_avg:149.29ms step:463/1480 train_time:67638ms step_avg:149.31ms step:464/1480 train_time:67795ms step_avg:149.33ms step:465/1480 train_time:67950ms step_avg:149.34ms step:466/1480 train_time:68107ms step_avg:149.36ms step:467/1480 train_time:68265ms step_avg:149.38ms step:468/1480 train_time:68421ms step_avg:149.39ms step:469/1480 train_time:68577ms step_avg:149.40ms step:470/1480 train_time:68732ms step_avg:149.42ms step:471/1480 train_time:68889ms step_avg:149.43ms step:472/1480 train_time:69049ms step_avg:149.46ms step:473/1480 train_time:69205ms step_avg:149.47ms step:474/1480 train_time:69361ms step_avg:149.49ms step:475/1480 train_time:69517ms step_avg:149.50ms step:476/1480 train_time:69673ms step_avg:149.51ms step:477/1480 train_time:69829ms step_avg:149.53ms step:478/1480 train_time:69986ms step_avg:149.54ms step:479/1480 train_time:70143ms step_avg:149.56ms step:480/1480 train_time:70302ms step_avg:149.58ms step:481/1480 train_time:70459ms step_avg:149.60ms step:482/1480 train_time:70617ms step_avg:149.61ms step:483/1480 train_time:70772ms step_avg:149.62ms step:484/1480 train_time:70929ms step_avg:149.64ms step:485/1480 train_time:71086ms step_avg:149.66ms step:486/1480 train_time:71245ms step_avg:149.67ms step:487/1480 train_time:71404ms step_avg:149.69ms step:488/1480 train_time:71563ms step_avg:149.71ms step:489/1480 train_time:71722ms step_avg:149.73ms step:490/1480 train_time:71879ms step_avg:149.75ms step:491/1480 train_time:72036ms step_avg:149.76ms step:492/1480 train_time:72192ms step_avg:149.78ms step:493/1480 train_time:72348ms step_avg:149.79ms step:494/1480 train_time:72505ms step_avg:149.80ms step:495/1480 train_time:72663ms step_avg:149.82ms 
step:496/1480 train_time:72821ms step_avg:149.84ms step:497/1480 train_time:72978ms step_avg:149.85ms step:498/1480 train_time:73136ms step_avg:149.87ms step:499/1480 train_time:73292ms step_avg:149.88ms step:500/1480 train_time:73450ms step_avg:149.90ms step:500/1480 val_loss:3.6860 train_time:73511ms step_avg:150.02ms step:501/1480 train_time:73612ms step_avg:149.92ms step:502/1480 train_time:73772ms step_avg:149.94ms step:503/1480 train_time:73928ms step_avg:149.96ms step:504/1480 train_time:74083ms step_avg:149.96ms step:505/1480 train_time:74238ms step_avg:149.98ms step:506/1480 train_time:74395ms step_avg:149.99ms step:507/1480 train_time:74553ms step_avg:150.01ms step:508/1480 train_time:74712ms step_avg:150.02ms step:509/1480 train_time:74869ms step_avg:150.04ms step:510/1480 train_time:75026ms step_avg:150.05ms step:511/1480 train_time:75182ms step_avg:150.06ms step:512/1480 train_time:75340ms step_avg:150.08ms step:513/1480 train_time:75495ms step_avg:150.09ms step:514/1480 train_time:75653ms step_avg:150.10ms step:515/1480 train_time:75809ms step_avg:150.12ms step:516/1480 train_time:75969ms step_avg:150.14ms step:517/1480 train_time:76126ms step_avg:150.15ms step:518/1480 train_time:76282ms step_avg:150.16ms step:519/1480 train_time:76439ms step_avg:150.18ms step:520/1480 train_time:76597ms step_avg:150.19ms step:521/1480 train_time:76753ms step_avg:150.20ms step:522/1480 train_time:76912ms step_avg:150.22ms step:523/1480 train_time:77071ms step_avg:150.24ms step:524/1480 train_time:77229ms step_avg:150.25ms step:525/1480 train_time:77386ms step_avg:150.26ms step:526/1480 train_time:77544ms step_avg:150.28ms step:527/1480 train_time:77700ms step_avg:150.29ms step:528/1480 train_time:77856ms step_avg:150.30ms step:529/1480 train_time:78013ms step_avg:150.31ms step:530/1480 train_time:78171ms step_avg:150.33ms step:531/1480 train_time:78329ms step_avg:150.34ms step:532/1480 train_time:78486ms step_avg:150.36ms step:533/1480 train_time:78643ms 
step_avg:150.37ms step:534/1480 train_time:78799ms step_avg:150.38ms step:535/1480 train_time:78956ms step_avg:150.39ms step:536/1480 train_time:79113ms step_avg:150.41ms step:537/1480 train_time:79271ms step_avg:150.42ms step:538/1480 train_time:79428ms step_avg:150.43ms step:539/1480 train_time:79586ms step_avg:150.45ms step:540/1480 train_time:79741ms step_avg:150.45ms step:541/1480 train_time:79896ms step_avg:150.46ms step:542/1480 train_time:80054ms step_avg:150.48ms step:543/1480 train_time:80212ms step_avg:150.49ms step:544/1480 train_time:80370ms step_avg:150.50ms step:545/1480 train_time:80526ms step_avg:150.52ms step:546/1480 train_time:80683ms step_avg:150.53ms step:547/1480 train_time:80840ms step_avg:150.54ms step:548/1480 train_time:80997ms step_avg:150.55ms step:549/1480 train_time:81154ms step_avg:150.56ms step:550/1480 train_time:81312ms step_avg:150.58ms step:551/1480 train_time:81471ms step_avg:150.59ms step:552/1480 train_time:81629ms step_avg:150.61ms step:553/1480 train_time:81789ms step_avg:150.62ms step:554/1480 train_time:81949ms step_avg:150.64ms step:555/1480 train_time:82109ms step_avg:150.66ms step:556/1480 train_time:82269ms step_avg:150.68ms step:557/1480 train_time:82428ms step_avg:150.69ms step:558/1480 train_time:82588ms step_avg:150.71ms step:559/1480 train_time:82747ms step_avg:150.72ms step:560/1480 train_time:82906ms step_avg:150.74ms step:561/1480 train_time:83064ms step_avg:150.75ms step:562/1480 train_time:83222ms step_avg:150.77ms step:563/1480 train_time:83380ms step_avg:150.78ms step:564/1480 train_time:83538ms step_avg:150.79ms step:565/1480 train_time:83697ms step_avg:150.81ms step:566/1480 train_time:83856ms step_avg:150.82ms step:567/1480 train_time:84014ms step_avg:150.83ms step:568/1480 train_time:84175ms step_avg:150.85ms step:569/1480 train_time:84334ms step_avg:150.87ms step:570/1480 train_time:84494ms step_avg:150.88ms step:571/1480 train_time:84655ms step_avg:150.90ms step:572/1480 train_time:84814ms 
step_avg:150.91ms step:573/1480 train_time:84975ms step_avg:150.93ms step:574/1480 train_time:85135ms step_avg:150.95ms step:575/1480 train_time:85296ms step_avg:150.97ms step:576/1480 train_time:85455ms step_avg:150.98ms step:577/1480 train_time:85615ms step_avg:151.00ms step:578/1480 train_time:85774ms step_avg:151.01ms step:579/1480 train_time:85935ms step_avg:151.03ms step:580/1480 train_time:86094ms step_avg:151.04ms step:581/1480 train_time:86256ms step_avg:151.06ms step:582/1480 train_time:86416ms step_avg:151.08ms step:583/1480 train_time:86575ms step_avg:151.09ms step:584/1480 train_time:86736ms step_avg:151.11ms step:585/1480 train_time:86895ms step_avg:151.12ms step:586/1480 train_time:87055ms step_avg:151.14ms step:587/1480 train_time:87215ms step_avg:151.15ms step:588/1480 train_time:87376ms step_avg:151.17ms step:589/1480 train_time:87536ms step_avg:151.18ms step:590/1480 train_time:87697ms step_avg:151.20ms step:591/1480 train_time:87856ms step_avg:151.22ms step:592/1480 train_time:88015ms step_avg:151.23ms step:593/1480 train_time:88176ms step_avg:151.24ms step:594/1480 train_time:88336ms step_avg:151.26ms step:595/1480 train_time:88496ms step_avg:151.28ms step:596/1480 train_time:88657ms step_avg:151.29ms step:597/1480 train_time:88816ms step_avg:151.30ms step:598/1480 train_time:88975ms step_avg:151.32ms step:599/1480 train_time:89135ms step_avg:151.33ms step:600/1480 train_time:89294ms step_avg:151.35ms step:601/1480 train_time:89455ms step_avg:151.36ms step:602/1480 train_time:89615ms step_avg:151.38ms step:603/1480 train_time:89776ms step_avg:151.39ms step:604/1480 train_time:89936ms step_avg:151.41ms step:605/1480 train_time:90096ms step_avg:151.42ms step:606/1480 train_time:90257ms step_avg:151.44ms step:607/1480 train_time:90417ms step_avg:151.45ms step:608/1480 train_time:90577ms step_avg:151.47ms step:609/1480 train_time:90736ms step_avg:151.48ms step:610/1480 train_time:90895ms step_avg:151.49ms step:611/1480 train_time:91055ms 
step_avg:151.51ms step:612/1480 train_time:91216ms step_avg:151.52ms step:613/1480 train_time:91376ms step_avg:151.54ms step:614/1480 train_time:91536ms step_avg:151.55ms step:615/1480 train_time:91694ms step_avg:151.56ms step:616/1480 train_time:91854ms step_avg:151.57ms step:617/1480 train_time:92014ms step_avg:151.59ms step:618/1480 train_time:92174ms step_avg:151.60ms step:619/1480 train_time:92334ms step_avg:151.62ms step:620/1480 train_time:92494ms step_avg:151.63ms step:621/1480 train_time:92655ms step_avg:151.64ms step:622/1480 train_time:92814ms step_avg:151.66ms step:623/1480 train_time:92975ms step_avg:151.67ms step:624/1480 train_time:93135ms step_avg:151.69ms step:625/1480 train_time:93295ms step_avg:151.70ms step:625/1480 val_loss:3.6056 train_time:93359ms step_avg:151.80ms step:626/1480 train_time:93458ms step_avg:151.72ms step:627/1480 train_time:93619ms step_avg:151.73ms step:628/1480 train_time:93777ms step_avg:151.74ms step:629/1480 train_time:93936ms step_avg:151.75ms step:630/1480 train_time:94094ms step_avg:151.76ms step:631/1480 train_time:94251ms step_avg:151.77ms step:632/1480 train_time:94409ms step_avg:151.78ms step:633/1480 train_time:94568ms step_avg:151.79ms step:634/1480 train_time:94726ms step_avg:151.81ms step:635/1480 train_time:94886ms step_avg:151.82ms step:636/1480 train_time:95044ms step_avg:151.83ms step:637/1480 train_time:95203ms step_avg:151.84ms step:638/1480 train_time:95362ms step_avg:151.85ms step:639/1480 train_time:95521ms step_avg:151.86ms step:640/1480 train_time:95680ms step_avg:151.87ms step:641/1480 train_time:95841ms step_avg:151.89ms step:642/1480 train_time:95999ms step_avg:151.90ms step:643/1480 train_time:96159ms step_avg:151.91ms step:644/1480 train_time:96319ms step_avg:151.92ms step:645/1480 train_time:96478ms step_avg:151.93ms step:646/1480 train_time:96639ms step_avg:151.95ms step:647/1480 train_time:96799ms step_avg:151.96ms step:648/1480 train_time:96960ms step_avg:151.98ms step:649/1480 
train_time:97119ms step_avg:151.99ms step:650/1480 train_time:97279ms step_avg:152.00ms step:651/1480 train_time:97439ms step_avg:152.01ms step:652/1480 train_time:97599ms step_avg:152.02ms step:653/1480 train_time:97759ms step_avg:152.04ms step:654/1480 train_time:97920ms step_avg:152.05ms step:655/1480 train_time:98079ms step_avg:152.06ms step:656/1480 train_time:98240ms step_avg:152.07ms step:657/1480 train_time:98400ms step_avg:152.09ms step:658/1480 train_time:98560ms step_avg:152.10ms step:659/1480 train_time:98721ms step_avg:152.11ms step:660/1480 train_time:98883ms step_avg:152.13ms step:661/1480 train_time:99045ms step_avg:152.14ms step:662/1480 train_time:99205ms step_avg:152.16ms step:663/1480 train_time:99364ms step_avg:152.17ms step:664/1480 train_time:99526ms step_avg:152.18ms step:665/1480 train_time:99687ms step_avg:152.19ms step:666/1480 train_time:99848ms step_avg:152.21ms step:667/1480 train_time:100009ms step_avg:152.22ms step:668/1480 train_time:100171ms step_avg:152.24ms step:669/1480 train_time:100334ms step_avg:152.25ms step:670/1480 train_time:100493ms step_avg:152.26ms step:671/1480 train_time:100654ms step_avg:152.27ms step:672/1480 train_time:100817ms step_avg:152.29ms step:673/1480 train_time:100979ms step_avg:152.31ms step:674/1480 train_time:101142ms step_avg:152.32ms step:675/1480 train_time:101303ms step_avg:152.34ms step:676/1480 train_time:101465ms step_avg:152.35ms step:677/1480 train_time:101626ms step_avg:152.36ms step:678/1480 train_time:101785ms step_avg:152.37ms step:679/1480 train_time:101947ms step_avg:152.39ms step:680/1480 train_time:102110ms step_avg:152.40ms step:681/1480 train_time:102270ms step_avg:152.41ms step:682/1480 train_time:102433ms step_avg:152.43ms step:683/1480 train_time:102596ms step_avg:152.45ms step:684/1480 train_time:102758ms step_avg:152.46ms step:685/1480 train_time:102922ms step_avg:152.48ms step:686/1480 train_time:103082ms step_avg:152.49ms step:687/1480 train_time:103243ms step_avg:152.50ms 
step:688/1480 train_time:103406ms step_avg:152.52ms step:689/1480 train_time:103567ms step_avg:152.53ms step:690/1480 train_time:103732ms step_avg:152.55ms step:691/1480 train_time:103894ms step_avg:152.56ms step:692/1480 train_time:104057ms step_avg:152.58ms step:693/1480 train_time:104219ms step_avg:152.59ms step:694/1480 train_time:104381ms step_avg:152.60ms step:695/1480 train_time:104543ms step_avg:152.62ms step:696/1480 train_time:104703ms step_avg:152.63ms step:697/1480 train_time:104866ms step_avg:152.64ms step:698/1480 train_time:105027ms step_avg:152.66ms step:699/1480 train_time:105190ms step_avg:152.67ms step:700/1480 train_time:105351ms step_avg:152.68ms step:701/1480 train_time:105513ms step_avg:152.70ms step:702/1480 train_time:105673ms step_avg:152.71ms step:703/1480 train_time:105834ms step_avg:152.72ms step:704/1480 train_time:105996ms step_avg:152.73ms step:705/1480 train_time:106161ms step_avg:152.75ms step:706/1480 train_time:106325ms step_avg:152.77ms step:707/1480 train_time:106485ms step_avg:152.78ms step:708/1480 train_time:106647ms step_avg:152.79ms step:709/1480 train_time:106809ms step_avg:152.80ms step:710/1480 train_time:106968ms step_avg:152.81ms step:711/1480 train_time:107131ms step_avg:152.83ms step:712/1480 train_time:107300ms step_avg:152.85ms step:713/1480 train_time:107462ms step_avg:152.86ms step:714/1480 train_time:107622ms step_avg:152.87ms step:715/1480 train_time:107781ms step_avg:152.88ms step:716/1480 train_time:107941ms step_avg:152.89ms step:717/1480 train_time:108103ms step_avg:152.90ms step:718/1480 train_time:108263ms step_avg:152.91ms step:719/1480 train_time:108424ms step_avg:152.93ms step:720/1480 train_time:108587ms step_avg:152.94ms step:721/1480 train_time:108749ms step_avg:152.95ms step:722/1480 train_time:108910ms step_avg:152.96ms step:723/1480 train_time:109069ms step_avg:152.97ms step:724/1480 train_time:109229ms step_avg:152.98ms step:725/1480 train_time:109396ms step_avg:153.00ms step:726/1480 
train_time:109561ms step_avg:153.02ms step:727/1480 train_time:109724ms step_avg:153.03ms step:728/1480 train_time:109884ms step_avg:153.04ms step:729/1480 train_time:110045ms step_avg:153.05ms step:730/1480 train_time:110206ms step_avg:153.06ms step:731/1480 train_time:110367ms step_avg:153.07ms step:732/1480 train_time:110526ms step_avg:153.08ms step:733/1480 train_time:110687ms step_avg:153.09ms step:734/1480 train_time:110847ms step_avg:153.10ms step:735/1480 train_time:111007ms step_avg:153.11ms step:736/1480 train_time:111168ms step_avg:153.12ms step:737/1480 train_time:111328ms step_avg:153.13ms step:738/1480 train_time:111491ms step_avg:153.15ms step:739/1480 train_time:111653ms step_avg:153.16ms step:740/1480 train_time:111818ms step_avg:153.18ms step:741/1480 train_time:111982ms step_avg:153.19ms step:742/1480 train_time:112144ms step_avg:153.20ms step:743/1480 train_time:112305ms step_avg:153.21ms step:744/1480 train_time:112467ms step_avg:153.22ms step:745/1480 train_time:112631ms step_avg:153.24ms step:746/1480 train_time:112791ms step_avg:153.25ms step:747/1480 train_time:112953ms step_avg:153.26ms step:748/1480 train_time:113120ms step_avg:153.28ms step:749/1480 train_time:113283ms step_avg:153.29ms step:750/1480 train_time:113442ms step_avg:153.30ms step:750/1480 val_loss:3.5496 train_time:113506ms step_avg:153.39ms step:751/1480 train_time:113607ms step_avg:153.32ms step:752/1480 train_time:113768ms step_avg:153.33ms step:753/1480 train_time:113929ms step_avg:153.34ms step:754/1480 train_time:114090ms step_avg:153.35ms step:755/1480 train_time:114250ms step_avg:153.36ms step:756/1480 train_time:114411ms step_avg:153.37ms step:757/1480 train_time:114578ms step_avg:153.38ms step:758/1480 train_time:114738ms step_avg:153.39ms step:759/1480 train_time:114902ms step_avg:153.41ms step:760/1480 train_time:115064ms step_avg:153.42ms step:761/1480 train_time:115228ms step_avg:153.43ms step:762/1480 train_time:115388ms step_avg:153.44ms step:763/1480 
train_time:115550ms step_avg:153.45ms step:764/1480 train_time:115710ms step_avg:153.46ms step:765/1480 train_time:115872ms step_avg:153.47ms step:766/1480 train_time:116035ms step_avg:153.49ms step:767/1480 train_time:116202ms step_avg:153.50ms step:768/1480 train_time:116363ms step_avg:153.51ms step:769/1480 train_time:116527ms step_avg:153.53ms step:770/1480 train_time:116690ms step_avg:153.54ms step:771/1480 train_time:116854ms step_avg:153.55ms step:772/1480 train_time:117017ms step_avg:153.57ms step:773/1480 train_time:117180ms step_avg:153.58ms step:774/1480 train_time:117344ms step_avg:153.59ms step:775/1480 train_time:117506ms step_avg:153.60ms step:776/1480 train_time:117669ms step_avg:153.62ms step:777/1480 train_time:117835ms step_avg:153.63ms step:778/1480 train_time:118000ms step_avg:153.65ms step:779/1480 train_time:118164ms step_avg:153.66ms step:780/1480 train_time:118327ms step_avg:153.67ms step:781/1480 train_time:118490ms step_avg:153.68ms step:782/1480 train_time:118655ms step_avg:153.70ms step:783/1480 train_time:118816ms step_avg:153.71ms step:784/1480 train_time:118982ms step_avg:153.72ms step:785/1480 train_time:119145ms step_avg:153.74ms step:786/1480 train_time:119310ms step_avg:153.75ms step:787/1480 train_time:119473ms step_avg:153.76ms step:788/1480 train_time:119638ms step_avg:153.78ms step:789/1480 train_time:119800ms step_avg:153.79ms step:790/1480 train_time:119966ms step_avg:153.80ms step:791/1480 train_time:120132ms step_avg:153.82ms step:792/1480 train_time:120298ms step_avg:153.83ms step:793/1480 train_time:120461ms step_avg:153.85ms step:794/1480 train_time:120626ms step_avg:153.86ms step:795/1480 train_time:120791ms step_avg:153.87ms step:796/1480 train_time:120958ms step_avg:153.89ms step:797/1480 train_time:121123ms step_avg:153.90ms step:798/1480 train_time:121287ms step_avg:153.92ms step:799/1480 train_time:121454ms step_avg:153.93ms step:800/1480 train_time:121617ms step_avg:153.94ms step:801/1480 train_time:121780ms 
step_avg:153.96ms step:802/1480 train_time:121948ms step_avg:153.97ms step:803/1480 train_time:122109ms step_avg:153.98ms step:804/1480 train_time:122272ms step_avg:153.99ms step:805/1480 train_time:122437ms step_avg:154.01ms step:806/1480 train_time:122600ms step_avg:154.02ms step:807/1480 train_time:122763ms step_avg:154.03ms step:808/1480 train_time:122926ms step_avg:154.04ms step:809/1480 train_time:123088ms step_avg:154.05ms step:810/1480 train_time:123250ms step_avg:154.06ms step:811/1480 train_time:123413ms step_avg:154.07ms step:812/1480 train_time:123576ms step_avg:154.09ms step:813/1480 train_time:123737ms step_avg:154.09ms step:814/1480 train_time:123902ms step_avg:154.11ms step:815/1480 train_time:124064ms step_avg:154.12ms step:816/1480 train_time:124229ms step_avg:154.13ms step:817/1480 train_time:124391ms step_avg:154.14ms step:818/1480 train_time:124552ms step_avg:154.15ms step:819/1480 train_time:124714ms step_avg:154.16ms step:820/1480 train_time:124879ms step_avg:154.17ms step:821/1480 train_time:125041ms step_avg:154.18ms step:822/1480 train_time:125205ms step_avg:154.19ms step:823/1480 train_time:125367ms step_avg:154.20ms step:824/1480 train_time:125528ms step_avg:154.21ms step:825/1480 train_time:125694ms step_avg:154.23ms step:826/1480 train_time:125863ms step_avg:154.24ms step:827/1480 train_time:126029ms step_avg:154.26ms step:828/1480 train_time:126193ms step_avg:154.27ms step:829/1480 train_time:126358ms step_avg:154.28ms step:830/1480 train_time:126523ms step_avg:154.30ms step:831/1480 train_time:126687ms step_avg:154.31ms step:832/1480 train_time:126852ms step_avg:154.32ms step:833/1480 train_time:127016ms step_avg:154.33ms step:834/1480 train_time:127181ms step_avg:154.35ms step:835/1480 train_time:127343ms step_avg:154.36ms step:836/1480 train_time:127508ms step_avg:154.37ms step:837/1480 train_time:127670ms step_avg:154.38ms step:838/1480 train_time:127833ms step_avg:154.39ms step:839/1480 train_time:127995ms step_avg:154.40ms 
step:840/1480 train_time:128158ms step_avg:154.41ms step:841/1480 train_time:128320ms step_avg:154.42ms step:842/1480 train_time:128486ms step_avg:154.43ms step:843/1480 train_time:128647ms step_avg:154.44ms step:844/1480 train_time:128808ms step_avg:154.45ms step:845/1480 train_time:128971ms step_avg:154.46ms step:846/1480 train_time:129137ms step_avg:154.47ms step:847/1480 train_time:129302ms step_avg:154.48ms step:848/1480 train_time:129465ms step_avg:154.49ms step:849/1480 train_time:129629ms step_avg:154.50ms step:850/1480 train_time:129792ms step_avg:154.51ms step:851/1480 train_time:129958ms step_avg:154.53ms step:852/1480 train_time:130122ms step_avg:154.54ms step:853/1480 train_time:130285ms step_avg:154.55ms step:854/1480 train_time:130448ms step_avg:154.56ms step:855/1480 train_time:130611ms step_avg:154.57ms step:856/1480 train_time:130773ms step_avg:154.58ms step:857/1480 train_time:130939ms step_avg:154.59ms step:858/1480 train_time:131105ms step_avg:154.60ms step:859/1480 train_time:131269ms step_avg:154.62ms step:860/1480 train_time:131430ms step_avg:154.62ms step:861/1480 train_time:131599ms step_avg:154.64ms step:862/1480 train_time:131766ms step_avg:154.66ms step:863/1480 train_time:131932ms step_avg:154.67ms step:864/1480 train_time:132096ms step_avg:154.68ms step:865/1480 train_time:132257ms step_avg:154.69ms step:866/1480 train_time:132426ms step_avg:154.70ms step:867/1480 train_time:132589ms step_avg:154.71ms step:868/1480 train_time:132750ms step_avg:154.72ms step:869/1480 train_time:132911ms step_avg:154.73ms step:870/1480 train_time:133077ms step_avg:154.74ms step:871/1480 train_time:133240ms step_avg:154.75ms step:872/1480 train_time:133405ms step_avg:154.76ms step:873/1480 train_time:133567ms step_avg:154.77ms step:874/1480 train_time:133731ms step_avg:154.78ms step:875/1480 train_time:133897ms step_avg:154.79ms step:875/1480 val_loss:3.5039 train_time:133962ms step_avg:154.87ms step:876/1480 train_time:134063ms step_avg:154.81ms 
step:877/1480 train_time:134230ms step_avg:154.82ms step:878/1480 train_time:134392ms step_avg:154.83ms step:879/1480 train_time:134556ms step_avg:154.84ms step:880/1480 train_time:134719ms step_avg:154.85ms step:881/1480 train_time:134883ms step_avg:154.86ms step:882/1480 train_time:135048ms step_avg:154.87ms step:883/1480 train_time:135213ms step_avg:154.88ms step:884/1480 train_time:135380ms step_avg:154.90ms step:885/1480 train_time:135546ms step_avg:154.91ms step:886/1480 train_time:135712ms step_avg:154.92ms step:887/1480 train_time:135879ms step_avg:154.94ms step:888/1480 train_time:136052ms step_avg:154.96ms step:889/1480 train_time:136220ms step_avg:154.97ms step:890/1480 train_time:136384ms step_avg:154.98ms step:891/1480 train_time:136551ms step_avg:155.00ms step:892/1480 train_time:136715ms step_avg:155.01ms step:893/1480 train_time:136876ms step_avg:155.01ms step:894/1480 train_time:137043ms step_avg:155.03ms step:895/1480 train_time:137209ms step_avg:155.04ms step:896/1480 train_time:137375ms step_avg:155.05ms step:897/1480 train_time:137543ms step_avg:155.07ms step:898/1480 train_time:137710ms step_avg:155.08ms step:899/1480 train_time:137874ms step_avg:155.09ms step:900/1480 train_time:138038ms step_avg:155.10ms step:901/1480 train_time:138202ms step_avg:155.11ms step:902/1480 train_time:138368ms step_avg:155.12ms step:903/1480 train_time:138539ms step_avg:155.14ms step:904/1480 train_time:138705ms step_avg:155.15ms step:905/1480 train_time:138867ms step_avg:155.16ms step:906/1480 train_time:139034ms step_avg:155.17ms step:907/1480 train_time:139203ms step_avg:155.19ms step:908/1480 train_time:139366ms step_avg:155.20ms step:909/1480 train_time:139531ms step_avg:155.21ms step:910/1480 train_time:139703ms step_avg:155.23ms step:911/1480 train_time:139868ms step_avg:155.24ms step:912/1480 train_time:140034ms step_avg:155.25ms step:913/1480 train_time:140200ms step_avg:155.26ms step:914/1480 train_time:140368ms step_avg:155.27ms step:915/1480 
train_time:140537ms step_avg:155.29ms step:916/1480 train_time:140702ms step_avg:155.30ms step:917/1480 train_time:140865ms step_avg:155.31ms step:918/1480 train_time:141033ms step_avg:155.32ms step:919/1480 train_time:141202ms step_avg:155.34ms step:920/1480 train_time:141368ms step_avg:155.35ms step:921/1480 train_time:141533ms step_avg:155.36ms step:922/1480 train_time:141701ms step_avg:155.37ms step:923/1480 train_time:141865ms step_avg:155.38ms step:924/1480 train_time:142029ms step_avg:155.39ms step:925/1480 train_time:142194ms step_avg:155.40ms step:926/1480 train_time:142356ms step_avg:155.41ms step:927/1480 train_time:142521ms step_avg:155.42ms step:928/1480 train_time:142687ms step_avg:155.43ms step:929/1480 train_time:142852ms step_avg:155.44ms step:930/1480 train_time:143016ms step_avg:155.45ms step:931/1480 train_time:143180ms step_avg:155.46ms step:932/1480 train_time:143347ms step_avg:155.47ms step:933/1480 train_time:143514ms step_avg:155.49ms step:934/1480 train_time:143679ms step_avg:155.50ms step:935/1480 train_time:143850ms step_avg:155.51ms step:936/1480 train_time:144017ms step_avg:155.53ms step:937/1480 train_time:144187ms step_avg:155.54ms step:938/1480 train_time:144350ms step_avg:155.55ms step:939/1480 train_time:144520ms step_avg:155.57ms step:940/1480 train_time:144687ms step_avg:155.58ms step:941/1480 train_time:144852ms step_avg:155.59ms step:942/1480 train_time:145019ms step_avg:155.60ms step:943/1480 train_time:145189ms step_avg:155.61ms step:944/1480 train_time:145361ms step_avg:155.63ms step:945/1480 train_time:145526ms step_avg:155.64ms step:946/1480 train_time:145694ms step_avg:155.66ms step:947/1480 train_time:145861ms step_avg:155.67ms step:948/1480 train_time:146029ms step_avg:155.68ms step:949/1480 train_time:146193ms step_avg:155.69ms step:950/1480 train_time:146355ms step_avg:155.70ms step:951/1480 train_time:146525ms step_avg:155.71ms step:952/1480 train_time:146691ms step_avg:155.72ms step:953/1480 train_time:146859ms 
step_avg:155.74ms step:954/1480 train_time:147030ms step_avg:155.75ms step:955/1480 train_time:147194ms step_avg:155.76ms step:956/1480 train_time:147360ms step_avg:155.77ms step:957/1480 train_time:147529ms step_avg:155.79ms step:958/1480 train_time:147699ms step_avg:155.80ms step:959/1480 train_time:147864ms step_avg:155.81ms step:960/1480 train_time:148032ms step_avg:155.82ms step:961/1480 train_time:148198ms step_avg:155.83ms step:962/1480 train_time:148363ms step_avg:155.84ms step:963/1480 train_time:148530ms step_avg:155.85ms step:964/1480 train_time:148697ms step_avg:155.87ms step:965/1480 train_time:148861ms step_avg:155.88ms step:966/1480 train_time:149027ms step_avg:155.89ms step:967/1480 train_time:149190ms step_avg:155.89ms step:968/1480 train_time:149355ms step_avg:155.90ms step:969/1480 train_time:149525ms step_avg:155.92ms step:970/1480 train_time:149689ms step_avg:155.93ms step:971/1480 train_time:149853ms step_avg:155.93ms step:972/1480 train_time:150018ms step_avg:155.94ms step:973/1480 train_time:150182ms step_avg:155.95ms step:974/1480 train_time:150351ms step_avg:155.97ms step:975/1480 train_time:150517ms step_avg:155.98ms step:976/1480 train_time:150682ms step_avg:155.99ms step:977/1480 train_time:150848ms step_avg:156.00ms step:978/1480 train_time:151013ms step_avg:156.01ms step:979/1480 train_time:151178ms step_avg:156.01ms step:980/1480 train_time:151346ms step_avg:156.03ms step:981/1480 train_time:151513ms step_avg:156.04ms step:982/1480 train_time:151675ms step_avg:156.04ms step:983/1480 train_time:151840ms step_avg:156.05ms step:984/1480 train_time:152005ms step_avg:156.06ms step:985/1480 train_time:152173ms step_avg:156.07ms step:986/1480 train_time:152337ms step_avg:156.08ms step:987/1480 train_time:152502ms step_avg:156.09ms step:988/1480 train_time:152670ms step_avg:156.10ms step:989/1480 train_time:152834ms step_avg:156.11ms step:990/1480 train_time:153005ms step_avg:156.13ms step:991/1480 train_time:153172ms step_avg:156.14ms 
step:992/1480 train_time:153348ms step_avg:156.16ms step:993/1480 train_time:153527ms step_avg:156.18ms step:994/1480 train_time:153692ms step_avg:156.19ms step:995/1480 train_time:153855ms step_avg:156.20ms step:996/1480 train_time:154018ms step_avg:156.20ms step:997/1480 train_time:154184ms step_avg:156.21ms step:998/1480 train_time:154348ms step_avg:156.22ms step:999/1480 train_time:154513ms step_avg:156.23ms step:1000/1480 train_time:154683ms step_avg:156.25ms step:1000/1480 val_loss:3.4411 train_time:154749ms step_avg:156.31ms step:1001/1480 train_time:154852ms step_avg:156.26ms step:1002/1480 train_time:155019ms step_avg:156.27ms step:1003/1480 train_time:155188ms step_avg:156.28ms step:1004/1480 train_time:155357ms step_avg:156.30ms step:1005/1480 train_time:155524ms step_avg:156.31ms step:1006/1480 train_time:155691ms step_avg:156.32ms step:1007/1480 train_time:155858ms step_avg:156.33ms step:1008/1480 train_time:156026ms step_avg:156.34ms step:1009/1480 train_time:156199ms step_avg:156.36ms step:1010/1480 train_time:156364ms step_avg:156.36ms step:1011/1480 train_time:156530ms step_avg:156.37ms step:1012/1480 train_time:156696ms step_avg:156.38ms step:1013/1480 train_time:156868ms step_avg:156.40ms step:1014/1480 train_time:157035ms step_avg:156.41ms step:1015/1480 train_time:157205ms step_avg:156.42ms step:1016/1480 train_time:157374ms step_avg:156.44ms step:1017/1480 train_time:157545ms step_avg:156.45ms step:1018/1480 train_time:157712ms step_avg:156.46ms step:1019/1480 train_time:157881ms step_avg:156.47ms step:1020/1480 train_time:158052ms step_avg:156.49ms step:1021/1480 train_time:158218ms step_avg:156.50ms step:1022/1480 train_time:158386ms step_avg:156.51ms step:1023/1480 train_time:158555ms step_avg:156.52ms step:1024/1480 train_time:158722ms step_avg:156.53ms step:1025/1480 train_time:158893ms step_avg:156.54ms step:1026/1480 train_time:159059ms step_avg:156.55ms step:1027/1480 train_time:159226ms step_avg:156.56ms step:1028/1480 
train_time:159400ms step_avg:156.58ms step:1029/1480 train_time:159575ms step_avg:156.60ms step:1030/1480 train_time:159742ms step_avg:156.61ms step:1031/1480 train_time:159906ms step_avg:156.62ms step:1032/1480 train_time:160078ms step_avg:156.63ms step:1033/1480 train_time:160244ms step_avg:156.64ms step:1034/1480 train_time:160414ms step_avg:156.65ms step:1035/1480 train_time:160582ms step_avg:156.67ms step:1036/1480 train_time:160748ms step_avg:156.67ms step:1037/1480 train_time:160915ms step_avg:156.68ms step:1038/1480 train_time:161082ms step_avg:156.69ms step:1039/1480 train_time:161252ms step_avg:156.71ms step:1040/1480 train_time:161418ms step_avg:156.72ms step:1041/1480 train_time:161585ms step_avg:156.73ms step:1042/1480 train_time:161749ms step_avg:156.73ms step:1043/1480 train_time:161913ms step_avg:156.74ms step:1044/1480 train_time:162080ms step_avg:156.75ms step:1045/1480 train_time:162250ms step_avg:156.76ms step:1046/1480 train_time:162418ms step_avg:156.77ms step:1047/1480 train_time:162584ms step_avg:156.78ms step:1048/1480 train_time:162751ms step_avg:156.79ms step:1049/1480 train_time:162916ms step_avg:156.80ms step:1050/1480 train_time:163085ms step_avg:156.81ms step:1051/1480 train_time:163256ms step_avg:156.83ms step:1052/1480 train_time:163424ms step_avg:156.84ms step:1053/1480 train_time:163590ms step_avg:156.85ms step:1054/1480 train_time:163758ms step_avg:156.86ms step:1055/1480 train_time:163923ms step_avg:156.86ms step:1056/1480 train_time:164089ms step_avg:156.87ms step:1057/1480 train_time:164256ms step_avg:156.88ms step:1058/1480 train_time:164425ms step_avg:156.89ms step:1059/1480 train_time:164600ms step_avg:156.91ms step:1060/1480 train_time:164768ms step_avg:156.92ms step:1061/1480 train_time:164931ms step_avg:156.93ms step:1062/1480 train_time:165098ms step_avg:156.94ms step:1063/1480 train_time:165262ms step_avg:156.94ms step:1064/1480 train_time:165427ms step_avg:156.95ms step:1065/1480 train_time:165595ms step_avg:156.96ms 
step:1066/1480 train_time:165762ms step_avg:156.97ms step:1067/1480 train_time:165932ms step_avg:156.98ms step:1068/1480 train_time:166099ms step_avg:156.99ms step:1069/1480 train_time:166272ms step_avg:157.01ms step:1070/1480 train_time:166439ms step_avg:157.02ms step:1071/1480 train_time:166613ms step_avg:157.03ms step:1072/1480 train_time:166780ms step_avg:157.04ms step:1073/1480 train_time:166944ms step_avg:157.05ms step:1074/1480 train_time:167110ms step_avg:157.06ms step:1075/1480 train_time:167282ms step_avg:157.07ms step:1076/1480 train_time:167450ms step_avg:157.08ms step:1077/1480 train_time:167616ms step_avg:157.09ms step:1078/1480 train_time:167790ms step_avg:157.11ms step:1079/1480 train_time:167962ms step_avg:157.12ms step:1080/1480 train_time:168132ms step_avg:157.13ms step:1081/1480 train_time:168300ms step_avg:157.14ms step:1082/1480 train_time:168466ms step_avg:157.15ms step:1083/1480 train_time:168633ms step_avg:157.16ms step:1084/1480 train_time:168800ms step_avg:157.17ms step:1085/1480 train_time:168969ms step_avg:157.18ms step:1086/1480 train_time:169139ms step_avg:157.19ms step:1087/1480 train_time:169305ms step_avg:157.20ms step:1088/1480 train_time:169473ms step_avg:157.21ms step:1089/1480 train_time:169645ms step_avg:157.22ms step:1090/1480 train_time:169817ms step_avg:157.24ms step:1091/1480 train_time:169985ms step_avg:157.25ms step:1092/1480 train_time:170153ms step_avg:157.26ms step:1093/1480 train_time:170321ms step_avg:157.27ms step:1094/1480 train_time:170487ms step_avg:157.28ms step:1095/1480 train_time:170652ms step_avg:157.28ms step:1096/1480 train_time:170820ms step_avg:157.29ms step:1097/1480 train_time:170988ms step_avg:157.30ms step:1098/1480 train_time:171160ms step_avg:157.32ms step:1099/1480 train_time:171330ms step_avg:157.33ms step:1100/1480 train_time:171504ms step_avg:157.34ms step:1101/1480 train_time:171676ms step_avg:157.36ms step:1102/1480 train_time:171847ms step_avg:157.37ms step:1103/1480 train_time:172022ms 
step_avg:157.38ms step:1104/1480 train_time:172189ms step_avg:157.39ms step:1105/1480 train_time:172359ms step_avg:157.41ms step:1106/1480 train_time:172527ms step_avg:157.42ms step:1107/1480 train_time:172695ms step_avg:157.42ms step:1108/1480 train_time:172861ms step_avg:157.43ms step:1109/1480 train_time:173026ms step_avg:157.44ms step:1110/1480 train_time:173193ms step_avg:157.45ms step:1111/1480 train_time:173361ms step_avg:157.46ms step:1112/1480 train_time:173531ms step_avg:157.47ms step:1113/1480 train_time:173710ms step_avg:157.49ms step:1114/1480 train_time:173883ms step_avg:157.50ms step:1115/1480 train_time:174057ms step_avg:157.52ms step:1116/1480 train_time:174223ms step_avg:157.53ms step:1117/1480 train_time:174395ms step_avg:157.54ms step:1118/1480 train_time:174571ms step_avg:157.56ms step:1119/1480 train_time:174738ms step_avg:157.56ms step:1120/1480 train_time:174905ms step_avg:157.57ms step:1121/1480 train_time:175076ms step_avg:157.58ms step:1122/1480 train_time:175242ms step_avg:157.59ms step:1123/1480 train_time:175409ms step_avg:157.60ms step:1124/1480 train_time:175577ms step_avg:157.61ms step:1125/1480 train_time:175744ms step_avg:157.62ms step:1125/1480 val_loss:3.3853 train_time:175812ms step_avg:157.68ms step:1126/1480 train_time:175914ms step_avg:157.63ms step:1127/1480 train_time:176086ms step_avg:157.64ms step:1128/1480 train_time:176257ms step_avg:157.65ms step:1129/1480 train_time:176430ms step_avg:157.67ms step:1130/1480 train_time:176600ms step_avg:157.68ms step:1131/1480 train_time:176777ms step_avg:157.70ms step:1132/1480 train_time:176943ms step_avg:157.70ms step:1133/1480 train_time:177116ms step_avg:157.72ms step:1134/1480 train_time:177286ms step_avg:157.73ms step:1135/1480 train_time:177455ms step_avg:157.74ms step:1136/1480 train_time:177627ms step_avg:157.75ms step:1137/1480 train_time:177797ms step_avg:157.76ms step:1138/1480 train_time:177969ms step_avg:157.77ms step:1139/1480 train_time:178136ms step_avg:157.78ms 
step:1140/1480 train_time:178305ms step_avg:157.79ms step:1141/1480 train_time:178476ms step_avg:157.80ms step:1142/1480 train_time:178643ms step_avg:157.81ms step:1143/1480 train_time:178813ms step_avg:157.82ms step:1144/1480 train_time:178983ms step_avg:157.83ms step:1145/1480 train_time:179149ms step_avg:157.84ms step:1146/1480 train_time:179320ms step_avg:157.85ms step:1147/1480 train_time:179489ms step_avg:157.86ms step:1148/1480 train_time:179659ms step_avg:157.87ms step:1149/1480 train_time:179830ms step_avg:157.88ms step:1150/1480 train_time:179999ms step_avg:157.89ms step:1151/1480 train_time:180171ms step_avg:157.91ms step:1152/1480 train_time:180343ms step_avg:157.92ms step:1153/1480 train_time:180516ms step_avg:157.93ms step:1154/1480 train_time:180682ms step_avg:157.94ms step:1155/1480 train_time:180854ms step_avg:157.95ms step:1156/1480 train_time:181033ms step_avg:157.97ms step:1157/1480 train_time:181203ms step_avg:157.98ms step:1158/1480 train_time:181370ms step_avg:157.99ms step:1159/1480 train_time:181538ms step_avg:158.00ms step:1160/1480 train_time:181705ms step_avg:158.00ms step:1161/1480 train_time:181874ms step_avg:158.01ms step:1162/1480 train_time:182044ms step_avg:158.02ms step:1163/1480 train_time:182213ms step_avg:158.03ms step:1164/1480 train_time:182384ms step_avg:158.04ms step:1165/1480 train_time:182549ms step_avg:158.05ms step:1166/1480 train_time:182718ms step_avg:158.06ms step:1167/1480 train_time:182888ms step_avg:158.07ms step:1168/1480 train_time:183057ms step_avg:158.08ms step:1169/1480 train_time:183226ms step_avg:158.09ms step:1170/1480 train_time:183396ms step_avg:158.10ms step:1171/1480 train_time:183564ms step_avg:158.11ms step:1172/1480 train_time:183731ms step_avg:158.12ms step:1173/1480 train_time:183904ms step_avg:158.13ms step:1174/1480 train_time:184085ms step_avg:158.15ms step:1175/1480 train_time:184255ms step_avg:158.16ms step:1176/1480 train_time:184427ms step_avg:158.17ms step:1177/1480 train_time:184605ms 
step_avg:158.19ms step:1178/1480 train_time:184772ms step_avg:158.20ms step:1179/1480 train_time:184938ms step_avg:158.20ms step:1180/1480 train_time:185118ms step_avg:158.22ms step:1181/1480 train_time:185288ms step_avg:158.23ms step:1182/1480 train_time:185455ms step_avg:158.24ms step:1183/1480 train_time:185625ms step_avg:158.25ms step:1184/1480 train_time:185792ms step_avg:158.26ms step:1185/1480 train_time:185965ms step_avg:158.27ms step:1186/1480 train_time:186136ms step_avg:158.28ms step:1187/1480 train_time:186321ms step_avg:158.30ms step:1188/1480 train_time:186488ms step_avg:158.31ms step:1189/1480 train_time:186661ms step_avg:158.32ms step:1190/1480 train_time:186829ms step_avg:158.33ms step:1191/1480 train_time:187001ms step_avg:158.34ms step:1192/1480 train_time:187169ms step_avg:158.35ms step:1193/1480 train_time:187334ms step_avg:158.36ms step:1194/1480 train_time:187505ms step_avg:158.37ms step:1195/1480 train_time:187677ms step_avg:158.38ms step:1196/1480 train_time:187862ms step_avg:158.40ms step:1197/1480 train_time:188032ms step_avg:158.41ms step:1198/1480 train_time:188214ms step_avg:158.43ms step:1199/1480 train_time:188385ms step_avg:158.44ms step:1200/1480 train_time:188554ms step_avg:158.45ms step:1201/1480 train_time:188721ms step_avg:158.46ms step:1202/1480 train_time:188902ms step_avg:158.48ms step:1203/1480 train_time:189077ms step_avg:158.49ms step:1204/1480 train_time:189251ms step_avg:158.50ms step:1205/1480 train_time:189420ms step_avg:158.51ms step:1206/1480 train_time:189588ms step_avg:158.52ms step:1207/1480 train_time:189757ms step_avg:158.53ms step:1208/1480 train_time:189926ms step_avg:158.54ms step:1209/1480 train_time:190099ms step_avg:158.55ms step:1210/1480 train_time:190273ms step_avg:158.56ms step:1211/1480 train_time:190447ms step_avg:158.57ms step:1212/1480 train_time:190617ms step_avg:158.58ms step:1213/1480 train_time:190789ms step_avg:158.59ms step:1214/1480 train_time:190967ms step_avg:158.61ms step:1215/1480 
train_time:191142ms step_avg:158.62ms step:1216/1480 train_time:191312ms step_avg:158.63ms step:1217/1480 train_time:191486ms step_avg:158.65ms step:1218/1480 train_time:191656ms step_avg:158.66ms step:1219/1480 train_time:191836ms step_avg:158.67ms step:1220/1480 train_time:192007ms step_avg:158.68ms step:1221/1480 train_time:192174ms step_avg:158.69ms step:1222/1480 train_time:192342ms step_avg:158.70ms step:1223/1480 train_time:192512ms step_avg:158.71ms step:1224/1480 train_time:192690ms step_avg:158.72ms step:1225/1480 train_time:192861ms step_avg:158.73ms step:1226/1480 train_time:193035ms step_avg:158.75ms step:1227/1480 train_time:193209ms step_avg:158.76ms step:1228/1480 train_time:193378ms step_avg:158.77ms step:1229/1480 train_time:193551ms step_avg:158.78ms step:1230/1480 train_time:193732ms step_avg:158.80ms step:1231/1480 train_time:193908ms step_avg:158.81ms step:1232/1480 train_time:194083ms step_avg:158.82ms step:1233/1480 train_time:194251ms step_avg:158.83ms step:1234/1480 train_time:194422ms step_avg:158.84ms step:1235/1480 train_time:194597ms step_avg:158.85ms step:1236/1480 train_time:194767ms step_avg:158.86ms step:1237/1480 train_time:194938ms step_avg:158.87ms step:1238/1480 train_time:195124ms step_avg:158.90ms step:1239/1480 train_time:195295ms step_avg:158.91ms step:1240/1480 train_time:195466ms step_avg:158.92ms step:1241/1480 train_time:195640ms step_avg:158.93ms step:1242/1480 train_time:195809ms step_avg:158.94ms step:1243/1480 train_time:195985ms step_avg:158.95ms step:1244/1480 train_time:196150ms step_avg:158.95ms step:1245/1480 train_time:196320ms step_avg:158.96ms step:1246/1480 train_time:196490ms step_avg:158.97ms step:1247/1480 train_time:196660ms step_avg:158.98ms step:1248/1480 train_time:196830ms step_avg:158.99ms step:1249/1480 train_time:196998ms step_avg:159.00ms step:1250/1480 train_time:197168ms step_avg:159.01ms step:1250/1480 val_loss:3.3353 train_time:197240ms step_avg:159.06ms step:1251/1480 train_time:197348ms 
step_avg:159.02ms step:1252/1480 train_time:197517ms step_avg:159.03ms step:1253/1480 train_time:197688ms step_avg:159.04ms step:1254/1480 train_time:197860ms step_avg:159.05ms step:1255/1480 train_time:198046ms step_avg:159.07ms step:1256/1480 train_time:198219ms step_avg:159.08ms step:1257/1480 train_time:198390ms step_avg:159.09ms step:1258/1480 train_time:198564ms step_avg:159.11ms step:1259/1480 train_time:198735ms step_avg:159.11ms step:1260/1480 train_time:198901ms step_avg:159.12ms step:1261/1480 train_time:199074ms step_avg:159.13ms step:1262/1480 train_time:199251ms step_avg:159.15ms step:1263/1480 train_time:199425ms step_avg:159.16ms step:1264/1480 train_time:199591ms step_avg:159.16ms step:1265/1480 train_time:199758ms step_avg:159.17ms step:1266/1480 train_time:199930ms step_avg:159.18ms step:1267/1480 train_time:200101ms step_avg:159.19ms step:1268/1480 train_time:200272ms step_avg:159.20ms step:1269/1480 train_time:200448ms step_avg:159.21ms step:1270/1480 train_time:200617ms step_avg:159.22ms step:1271/1480 train_time:200788ms step_avg:159.23ms step:1272/1480 train_time:200955ms step_avg:159.24ms step:1273/1480 train_time:201126ms step_avg:159.24ms step:1274/1480 train_time:201297ms step_avg:159.25ms step:1275/1480 train_time:201465ms step_avg:159.26ms step:1276/1480 train_time:201631ms step_avg:159.27ms step:1277/1480 train_time:201802ms step_avg:159.28ms step:1278/1480 train_time:201970ms step_avg:159.28ms step:1279/1480 train_time:202142ms step_avg:159.29ms step:1280/1480 train_time:202323ms step_avg:159.31ms step:1281/1480 train_time:202491ms step_avg:159.32ms step:1282/1480 train_time:202656ms step_avg:159.32ms step:1283/1480 train_time:202829ms step_avg:159.33ms step:1284/1480 train_time:202998ms step_avg:159.34ms step:1285/1480 train_time:203169ms step_avg:159.35ms step:1286/1480 train_time:203337ms step_avg:159.36ms step:1287/1480 train_time:203510ms step_avg:159.37ms step:1288/1480 train_time:203681ms step_avg:159.38ms step:1289/1480 
train_time:203866ms step_avg:159.39ms step:1290/1480 train_time:204045ms step_avg:159.41ms step:1291/1480 train_time:204218ms step_avg:159.42ms step:1292/1480 train_time:204392ms step_avg:159.43ms step:1293/1480 train_time:204569ms step_avg:159.45ms step:1294/1480 train_time:204740ms step_avg:159.45ms step:1295/1480 train_time:204911ms step_avg:159.46ms step:1296/1480 train_time:205084ms step_avg:159.47ms step:1297/1480 train_time:205254ms step_avg:159.48ms step:1298/1480 train_time:205426ms step_avg:159.49ms step:1299/1480 train_time:205596ms step_avg:159.50ms step:1300/1480 train_time:205763ms step_avg:159.51ms step:1301/1480 train_time:205932ms step_avg:159.51ms step:1302/1480 train_time:206105ms step_avg:159.52ms step:1303/1480 train_time:206281ms step_avg:159.54ms step:1304/1480 train_time:206455ms step_avg:159.55ms step:1305/1480 train_time:206624ms step_avg:159.56ms step:1306/1480 train_time:206798ms step_avg:159.57ms step:1307/1480 train_time:206966ms step_avg:159.57ms step:1308/1480 train_time:207135ms step_avg:159.58ms step:1309/1480 train_time:207309ms step_avg:159.59ms step:1310/1480 train_time:207478ms step_avg:159.60ms step:1311/1480 train_time:207647ms step_avg:159.61ms step:1312/1480 train_time:207821ms step_avg:159.62ms step:1313/1480 train_time:207989ms step_avg:159.62ms step:1314/1480 train_time:208162ms step_avg:159.63ms step:1315/1480 train_time:208333ms step_avg:159.64ms step:1316/1480 train_time:208500ms step_avg:159.65ms step:1317/1480 train_time:208671ms step_avg:159.66ms step:1318/1480 train_time:208852ms step_avg:159.67ms step:1319/1480 train_time:209030ms step_avg:159.69ms step:1320/1480 train_time:209206ms step_avg:159.70ms step:1321/1480 train_time:209378ms step_avg:159.71ms step:1322/1480 train_time:209561ms step_avg:159.73ms step:1323/1480 train_time:209734ms step_avg:159.74ms step:1324/1480 train_time:209910ms step_avg:159.75ms step:1325/1480 train_time:210091ms step_avg:159.77ms step:1326/1480 train_time:210267ms step_avg:159.78ms 
step:1327/1480 train_time:210437ms step_avg:159.78ms step:1328/1480 train_time:210609ms step_avg:159.79ms step:1329/1480 train_time:210805ms step_avg:159.82ms step:1330/1480 train_time:210984ms step_avg:159.84ms step:1331/1480 train_time:211155ms step_avg:159.84ms step:1332/1480 train_time:211331ms step_avg:159.86ms step:1333/1480 train_time:211505ms step_avg:159.87ms step:1334/1480 train_time:211676ms step_avg:159.88ms step:1335/1480 train_time:211845ms step_avg:159.88ms step:1336/1480 train_time:212031ms step_avg:159.90ms step:1337/1480 train_time:212206ms step_avg:159.91ms step:1338/1480 train_time:212378ms step_avg:159.92ms step:1339/1480 train_time:212552ms step_avg:159.93ms step:1340/1480 train_time:212725ms step_avg:159.94ms step:1341/1480 train_time:212893ms step_avg:159.95ms step:1342/1480 train_time:213065ms step_avg:159.96ms step:1343/1480 train_time:213234ms step_avg:159.97ms step:1344/1480 train_time:213408ms step_avg:159.98ms step:1345/1480 train_time:213588ms step_avg:159.99ms step:1346/1480 train_time:213756ms step_avg:160.00ms step:1347/1480 train_time:213926ms step_avg:160.00ms step:1348/1480 train_time:214096ms step_avg:160.01ms step:1349/1480 train_time:214266ms step_avg:160.02ms step:1350/1480 train_time:214440ms step_avg:160.03ms step:1351/1480 train_time:214612ms step_avg:160.04ms step:1352/1480 train_time:214782ms step_avg:160.05ms step:1353/1480 train_time:214957ms step_avg:160.06ms step:1354/1480 train_time:215129ms step_avg:160.07ms step:1355/1480 train_time:215296ms step_avg:160.07ms step:1356/1480 train_time:215469ms step_avg:160.08ms step:1357/1480 train_time:215643ms step_avg:160.09ms step:1358/1480 train_time:215815ms step_avg:160.10ms step:1359/1480 train_time:215987ms step_avg:160.11ms step:1360/1480 train_time:216161ms step_avg:160.12ms step:1361/1480 train_time:216338ms step_avg:160.13ms step:1362/1480 train_time:216513ms step_avg:160.14ms step:1363/1480 train_time:216696ms step_avg:160.16ms step:1364/1480 train_time:216865ms 
step_avg:160.17ms step:1365/1480 train_time:217032ms step_avg:160.17ms step:1366/1480 train_time:217204ms step_avg:160.18ms step:1367/1480 train_time:217374ms step_avg:160.19ms step:1368/1480 train_time:217550ms step_avg:160.20ms step:1369/1480 train_time:217734ms step_avg:160.22ms step:1370/1480 train_time:217912ms step_avg:160.23ms step:1371/1480 train_time:218084ms step_avg:160.24ms step:1372/1480 train_time:218260ms step_avg:160.25ms step:1373/1480 train_time:218430ms step_avg:160.26ms step:1374/1480 train_time:218605ms step_avg:160.27ms step:1375/1480 train_time:218775ms step_avg:160.27ms step:1375/1480 val_loss:3.2968 train_time:218842ms step_avg:160.32ms step:1376/1480 train_time:218949ms step_avg:160.28ms step:1377/1480 train_time:219121ms step_avg:160.29ms step:1378/1480 train_time:219289ms step_avg:160.30ms step:1379/1480 train_time:219464ms step_avg:160.31ms step:1380/1480 train_time:219639ms step_avg:160.32ms step:1381/1480 train_time:219820ms step_avg:160.34ms step:1382/1480 train_time:219991ms step_avg:160.34ms step:1383/1480 train_time:220163ms step_avg:160.35ms step:1384/1480 train_time:220341ms step_avg:160.36ms step:1385/1480 train_time:220506ms step_avg:160.37ms step:1386/1480 train_time:220677ms step_avg:160.38ms step:1387/1480 train_time:220848ms step_avg:160.38ms step:1388/1480 train_time:221017ms step_avg:160.39ms step:1389/1480 train_time:221192ms step_avg:160.40ms step:1390/1480 train_time:221360ms step_avg:160.41ms step:1391/1480 train_time:221530ms step_avg:160.41ms step:1392/1480 train_time:221702ms step_avg:160.42ms step:1393/1480 train_time:221873ms step_avg:160.43ms step:1394/1480 train_time:222044ms step_avg:160.44ms step:1395/1480 train_time:222212ms step_avg:160.44ms step:1396/1480 train_time:222381ms step_avg:160.45ms step:1397/1480 train_time:222549ms step_avg:160.45ms step:1398/1480 train_time:222716ms step_avg:160.46ms step:1399/1480 train_time:222885ms step_avg:160.46ms step:1400/1480 train_time:223063ms step_avg:160.48ms 
step:1401/1480 train_time:223229ms step_avg:160.48ms step:1402/1480 train_time:223401ms step_avg:160.49ms step:1403/1480 train_time:223578ms step_avg:160.50ms step:1404/1480 train_time:223749ms step_avg:160.51ms step:1405/1480 train_time:223925ms step_avg:160.52ms step:1406/1480 train_time:224099ms step_avg:160.53ms step:1407/1480 train_time:224267ms step_avg:160.53ms step:1408/1480 train_time:224436ms step_avg:160.54ms step:1409/1480 train_time:224620ms step_avg:160.56ms step:1410/1480 train_time:224789ms step_avg:160.56ms step:1411/1480 train_time:224960ms step_avg:160.57ms step:1412/1480 train_time:225129ms step_avg:160.58ms step:1413/1480 train_time:225300ms step_avg:160.58ms step:1414/1480 train_time:225470ms step_avg:160.59ms step:1415/1480 train_time:225647ms step_avg:160.60ms step:1416/1480 train_time:225833ms step_avg:160.62ms step:1417/1480 train_time:226008ms step_avg:160.63ms step:1418/1480 train_time:226180ms step_avg:160.64ms step:1419/1480 train_time:226354ms step_avg:160.65ms step:1420/1480 train_time:226529ms step_avg:160.66ms step:1421/1480 train_time:226704ms step_avg:160.67ms step:1422/1480 train_time:226874ms step_avg:160.68ms step:1423/1480 train_time:227044ms step_avg:160.68ms step:1424/1480 train_time:227221ms step_avg:160.69ms step:1425/1480 train_time:227403ms step_avg:160.71ms step:1426/1480 train_time:227576ms step_avg:160.72ms step:1427/1480 train_time:227751ms step_avg:160.73ms step:1428/1480 train_time:227923ms step_avg:160.74ms step:1429/1480 train_time:228090ms step_avg:160.74ms step:1430/1480 train_time:228264ms step_avg:160.75ms step:1431/1480 train_time:228441ms step_avg:160.76ms step:1432/1480 train_time:228617ms step_avg:160.77ms step:1433/1480 train_time:228795ms step_avg:160.78ms step:1434/1480 train_time:228973ms step_avg:160.80ms step:1435/1480 train_time:229146ms step_avg:160.80ms step:1436/1480 train_time:229320ms step_avg:160.81ms step:1437/1480 train_time:229490ms step_avg:160.82ms step:1438/1480 train_time:229659ms 
step_avg:160.83ms step:1439/1480 train_time:229834ms step_avg:160.84ms step:1440/1480 train_time:230003ms step_avg:160.84ms step:1441/1480 train_time:230174ms step_avg:160.85ms step:1442/1480 train_time:230351ms step_avg:160.86ms step:1443/1480 train_time:230542ms step_avg:160.88ms step:1444/1480 train_time:230714ms step_avg:160.89ms step:1445/1480 train_time:230884ms step_avg:160.89ms step:1446/1480 train_time:231059ms step_avg:160.90ms step:1447/1480 train_time:231238ms step_avg:160.92ms step:1448/1480 train_time:231409ms step_avg:160.92ms step:1449/1480 train_time:231582ms step_avg:160.93ms step:1450/1480 train_time:231756ms step_avg:160.94ms step:1451/1480 train_time:231926ms step_avg:160.95ms step:1452/1480 train_time:232099ms step_avg:160.96ms step:1453/1480 train_time:232268ms step_avg:160.96ms step:1454/1480 train_time:232440ms step_avg:160.97ms step:1455/1480 train_time:232619ms step_avg:160.98ms step:1456/1480 train_time:232790ms step_avg:160.99ms step:1457/1480 train_time:232962ms step_avg:161.00ms step:1458/1480 train_time:233134ms step_avg:161.00ms step:1459/1480 train_time:233310ms step_avg:161.01ms step:1460/1480 train_time:233482ms step_avg:161.02ms step:1461/1480 train_time:233656ms step_avg:161.03ms step:1462/1480 train_time:233828ms step_avg:161.04ms step:1463/1480 train_time:234005ms step_avg:161.05ms step:1464/1480 train_time:234180ms step_avg:161.06ms step:1465/1480 train_time:234351ms step_avg:161.07ms step:1466/1480 train_time:234522ms step_avg:161.07ms step:1467/1480 train_time:234695ms step_avg:161.08ms step:1468/1480 train_time:234864ms step_avg:161.09ms step:1469/1480 train_time:235037ms step_avg:161.09ms step:1470/1480 train_time:235217ms step_avg:161.11ms step:1471/1480 train_time:235403ms step_avg:161.12ms step:1472/1480 train_time:235584ms step_avg:161.14ms step:1473/1480 train_time:235755ms step_avg:161.15ms step:1474/1480 train_time:235933ms step_avg:161.16ms step:1475/1480 train_time:236111ms step_avg:161.17ms step:1476/1480 
train_time:236283ms step_avg:161.18ms step:1477/1480 train_time:236466ms step_avg:161.19ms step:1478/1480 train_time:236648ms step_avg:161.20ms step:1479/1480 train_time:236822ms step_avg:161.21ms step:1480/1480 train_time:236993ms step_avg:161.22ms step:1480/1480 val_loss:3.2779 train_time:237064ms step_avg:161.27ms