import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. """ assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. """ def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, n_head): super().__init__() assert dim % n_head == 0 self.n_head = n_head self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) # value residual lambda self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977 # rotary embeddings self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim # output projection self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor: B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1) k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1) v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1) v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977 q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim: int): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.n_embd, config.n_head) self.mlp = MLP(config.n_embd) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor: x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 n_layer : int = 12 n_head : int = 6 # head dim 128 suggested by @Grad62304977 n_embd : int = 768 lm_head_softcap : int = 30 class GPT(nn.Module): def __init__(self, config: GPTConfig): super().__init__() self.n_layer = config.n_layer self.lm_head_softcap = config.lm_head_softcap # U-net design by @brendanh0gan self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection weights for decoder layers self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers)) self.transformer = nn.ModuleDict(dict( wte = nn.Embedding(config.vocab_size, config.n_embd), # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning # U-net structure on token value embeddings by @leloykun vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers), h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), )) self.lm_head = CastedLinear(config.n_embd, config.vocab_size) self.lm_head.weight.data.zero_() # @Grad62304977 def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor: BLOCK_SIZE = 128 assert idx.ndim == 1 docs = (idx == 50256).cumsum(0) docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous() docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous() def document_sliding_window_causal(b, h, q_idx, kv_idx): causal_mask = q_idx >= kv_idx document_mask = docs[q_idx] == docs[kv_idx] window_mask = q_idx - kv_idx < sliding_window return causal_mask & document_mask & window_mask S = len(idx) def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor): kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda") q_idx = block_idx[:, None] causal_mask = q_idx >= kv_idx document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx]) window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE) dense_mask = causal_mask & document_mask & window_mask dense_mask = dense_mask.to(torch.int32) num_blocks = dense_mask.sum(dim=-1).to(torch.int32) indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32) num_blocks = num_blocks[None, None, :].contiguous() indices = indices[None, None, :].contiguous() return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal) block_mask = create_sliding_window_causal_mask(S, sliding_window) # forward the GPT model itself x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd) x = norm(x) # @Grad62304977 x0 = x vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1) # Store outputs for U-Net skip connections skip_connections = [] # Encoder pass - process only the first half of the blocks for i in range(self.num_encoder_layers): x = self.transformer.h[i](x, vi[i], x0, block_mask) skip_connections.append(x) # Decoder pass - process the remaining blocks with weighted skip connections for i in range(self.num_decoder_layers): x = x + self.skip_weights[i] * skip_connections.pop() # U-net structure on token value embeddings by @leloykun x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask) x = norm(x) logits = self.lm_head(x) logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977 logits = logits.float() loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1)) return loss # ----------------------------------------------------------------------------- # Our own simple Distributed Data Loader def _peek_data_shard(file: Path): # only reads the header, returns header data # header is 256 int32 header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) assert header[0] == 20240520, "magic number mismatch in the data .bin file" assert header[1] == 1, "unsupported version" return int(header[2]) # number of tokens (claimed) def _load_data_shard(file: Path, ntok: int): with file.open("rb") as f: tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True) f.seek(256 * 4) nbytes = f.readinto(tokens.numpy()) assert nbytes == 2 * ntok, "number of tokens read does not match header?" return tokens class DistributedDataLoader: def __init__(self, filename_pattern, T, process_rank, num_processes): self.process_rank = process_rank self.num_processes = num_processes self.T = T # glob files that match the pattern self.files = sorted(Path.cwd().glob(filename_pattern)) assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" # load and validate all data shards, count number of tokens in total self.ntoks = [_peek_data_shard(file) for file in self.files] assert min(self.ntoks) >= num_processes * T + 1 self.ntok_total = sum(self.ntoks) self.reset() def reset(self): self.current_shard = -1 self.advance() def advance(self): # advance to next data shard self.current_shard = (self.current_shard + 1) % len(self.files) self.current_position = self.process_rank * self.T self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard]) def next_batch(self): batch_size = self.T * self.num_processes buf = self.tokens[self.current_position:self.current_position+self.T+1] # host side async is sufficient; # no performance improvement was observed when introducing a separate stream. x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets # advance current position and load next shard if necessary self.current_position += batch_size if self.current_position + batch_size + 1 >= len(self.tokens): self.advance() return x, y # ----------------------------------------------------------------------------- # int main @dataclass class Hyperparameters: # data hyperparams input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on # optimization hyperparams batch_size : int = 8 # batch size, in sequences, across all devices sequence_length : int = 64*1024 # sequence length, in tokens num_iterations : int = 1480 # number of iterations to run warmup_iters : int = 0 cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule weight_decay : float = 0 # evaluation and logging hyperparams val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end args = Hyperparameters() # set up DDP (distributed data parallel). torchrun sets this env variable assert torch.cuda.is_available() dist.init_process_group(backend='nccl') ddp_rank = int(os.environ['RANK']) ddp_local_rank = int(os.environ['LOCAL_RANK']) ddp_world_size = int(os.environ['WORLD_SIZE']) device = f'cuda:{ddp_local_rank}' torch.cuda.set_device(device) print(f"using device: {device}") master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc. # begin logging logfile = None if master_process: run_id = str(uuid.uuid4()) logdir = 'logs/%s/' % run_id # os.makedirs(logdir, exist_ok=True) logfile = 'logs/%s.txt' % run_id # create the log file with open(logfile, "w") as f: # begin the log by printing this file (the Python code) f.write(code) f.write('='*100 + '\n') def print0(s, logonly=False): if master_process: with open(logfile, "a") as f: if not logonly: print(s) f.write(s+'\n') # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # convenience variables T = args.sequence_length # calculate the number of steps to take in the val loop. assert args.val_tokens % (T * ddp_world_size) == 0 val_steps = args.val_tokens // (T * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size assert train_accumulation_steps == 1 # load tokens train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files") print0('='*100, logonly=True) x, y = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 10:20:45 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 96W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 75W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 38C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 82W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 44C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 45C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 95W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23392ms step_avg:nanms step:2/1480 train_time:23582ms step_avg:nanms step:3/1480 train_time:23722ms step_avg:nanms step:4/1480 train_time:23865ms step_avg:nanms step:5/1480 train_time:24006ms step_avg:nanms step:6/1480 train_time:24149ms step_avg:nanms step:7/1480 train_time:24291ms step_avg:nanms step:8/1480 train_time:24433ms step_avg:nanms step:9/1480 train_time:24577ms step_avg:nanms step:10/1480 train_time:24719ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:426ms step_avg:142.02ms step:14/1480 train_time:569ms step_avg:142.18ms step:15/1480 train_time:712ms step_avg:142.42ms step:16/1480 train_time:856ms step_avg:142.64ms step:17/1480 train_time:998ms step_avg:142.59ms step:18/1480 train_time:1140ms step_avg:142.54ms step:19/1480 train_time:1282ms step_avg:142.49ms step:20/1480 train_time:1426ms step_avg:142.57ms step:21/1480 train_time:1568ms step_avg:142.57ms step:22/1480 train_time:1712ms step_avg:142.69ms step:23/1480 train_time:1855ms step_avg:142.71ms step:24/1480 train_time:1997ms step_avg:142.67ms step:25/1480 train_time:2141ms step_avg:142.72ms step:26/1480 train_time:2283ms step_avg:142.69ms step:27/1480 train_time:2426ms step_avg:142.69ms step:28/1480 train_time:2569ms step_avg:142.70ms step:29/1480 train_time:2713ms step_avg:142.77ms step:30/1480 train_time:2855ms step_avg:142.76ms step:31/1480 train_time:2998ms step_avg:142.75ms step:32/1480 train_time:3140ms step_avg:142.71ms step:33/1480 train_time:3281ms step_avg:142.65ms step:34/1480 train_time:3423ms step_avg:142.62ms step:35/1480 train_time:3566ms step_avg:142.63ms step:36/1480 train_time:3709ms step_avg:142.65ms step:37/1480 train_time:3853ms step_avg:142.70ms step:38/1480 train_time:3995ms step_avg:142.68ms step:39/1480 train_time:4138ms step_avg:142.68ms step:40/1480 train_time:4279ms step_avg:142.65ms step:41/1480 train_time:4422ms step_avg:142.65ms step:42/1480 train_time:4566ms step_avg:142.67ms step:43/1480 train_time:4710ms step_avg:142.72ms step:44/1480 train_time:4853ms step_avg:142.73ms step:45/1480 train_time:4996ms step_avg:142.74ms step:46/1480 train_time:5139ms step_avg:142.74ms step:47/1480 train_time:5280ms step_avg:142.69ms step:48/1480 train_time:5421ms step_avg:142.65ms step:49/1480 train_time:5562ms step_avg:142.61ms step:50/1480 train_time:5706ms step_avg:142.66ms step:51/1480 train_time:5850ms step_avg:142.69ms step:52/1480 train_time:5994ms step_avg:142.71ms step:53/1480 train_time:6138ms step_avg:142.74ms step:54/1480 train_time:6279ms step_avg:142.71ms step:55/1480 train_time:6420ms step_avg:142.66ms step:56/1480 train_time:6561ms step_avg:142.63ms step:57/1480 train_time:6705ms step_avg:142.67ms step:58/1480 train_time:6850ms step_avg:142.71ms step:59/1480 train_time:6995ms step_avg:142.75ms step:60/1480 train_time:7138ms step_avg:142.76ms step:61/1480 train_time:7279ms step_avg:142.73ms step:62/1480 train_time:7421ms step_avg:142.71ms step:63/1480 train_time:7562ms step_avg:142.67ms step:64/1480 train_time:7704ms step_avg:142.67ms step:65/1480 train_time:7848ms step_avg:142.69ms step:66/1480 train_time:7993ms step_avg:142.73ms step:67/1480 train_time:8136ms step_avg:142.73ms step:68/1480 train_time:8278ms step_avg:142.73ms step:69/1480 train_time:8421ms step_avg:142.73ms step:70/1480 train_time:8562ms step_avg:142.70ms step:71/1480 train_time:8706ms step_avg:142.71ms step:72/1480 train_time:8849ms step_avg:142.73ms step:73/1480 train_time:8993ms step_avg:142.74ms step:74/1480 train_time:9136ms step_avg:142.74ms step:75/1480 train_time:9278ms step_avg:142.75ms step:76/1480 train_time:9421ms step_avg:142.73ms step:77/1480 train_time:9561ms step_avg:142.71ms step:78/1480 train_time:9704ms step_avg:142.70ms step:79/1480 train_time:9850ms step_avg:142.75ms step:80/1480 train_time:9991ms step_avg:142.73ms step:81/1480 train_time:10135ms step_avg:142.75ms step:82/1480 train_time:10277ms step_avg:142.73ms step:83/1480 train_time:10419ms step_avg:142.73ms step:84/1480 train_time:10561ms step_avg:142.72ms step:85/1480 train_time:10703ms step_avg:142.71ms step:86/1480 train_time:10848ms step_avg:142.73ms step:87/1480 train_time:10991ms step_avg:142.74ms step:88/1480 train_time:11133ms step_avg:142.73ms step:89/1480 train_time:11275ms step_avg:142.72ms step:90/1480 train_time:11418ms step_avg:142.72ms step:91/1480 train_time:11559ms step_avg:142.70ms step:92/1480 train_time:11699ms step_avg:142.68ms step:93/1480 train_time:11843ms step_avg:142.68ms step:94/1480 train_time:11986ms step_avg:142.68ms step:95/1480 train_time:12129ms step_avg:142.70ms step:96/1480 train_time:12272ms step_avg:142.70ms step:97/1480 train_time:12415ms step_avg:142.70ms step:98/1480 train_time:12556ms step_avg:142.69ms step:99/1480 train_time:12698ms step_avg:142.67ms step:100/1480 train_time:12841ms step_avg:142.67ms step:101/1480 train_time:12984ms step_avg:142.68ms step:102/1480 train_time:13128ms step_avg:142.70ms step:103/1480 train_time:13270ms step_avg:142.69ms step:104/1480 train_time:13412ms step_avg:142.68ms step:105/1480 train_time:13554ms step_avg:142.67ms step:106/1480 train_time:13695ms step_avg:142.65ms step:107/1480 train_time:13837ms step_avg:142.65ms step:108/1480 train_time:13979ms step_avg:142.64ms step:109/1480 train_time:14121ms step_avg:142.64ms step:110/1480 train_time:14264ms step_avg:142.64ms step:111/1480 train_time:14411ms step_avg:142.68ms step:112/1480 train_time:14558ms step_avg:142.72ms step:113/1480 train_time:14704ms step_avg:142.75ms step:114/1480 train_time:14851ms step_avg:142.80ms step:115/1480 train_time:14997ms step_avg:142.83ms step:116/1480 train_time:15143ms step_avg:142.85ms step:117/1480 train_time:15290ms step_avg:142.90ms step:118/1480 train_time:15437ms step_avg:142.94ms step:119/1480 train_time:15583ms step_avg:142.96ms step:120/1480 train_time:15731ms step_avg:143.01ms step:121/1480 train_time:15878ms step_avg:143.04ms step:122/1480 train_time:16024ms step_avg:143.08ms step:123/1480 train_time:16172ms step_avg:143.12ms step:124/1480 train_time:16318ms step_avg:143.14ms step:125/1480 train_time:16463ms step_avg:143.16ms step:125/1480 val_loss:4.4398 train_time:16520ms step_avg:143.65ms step:126/1480 train_time:16615ms step_avg:143.23ms step:127/1480 train_time:16764ms step_avg:143.28ms step:128/1480 train_time:16909ms step_avg:143.30ms step:129/1480 train_time:17055ms step_avg:143.32ms step:130/1480 train_time:17202ms step_avg:143.35ms step:131/1480 train_time:17347ms step_avg:143.36ms step:132/1480 train_time:17492ms step_avg:143.38ms step:133/1480 train_time:17642ms step_avg:143.43ms step:134/1480 train_time:17790ms step_avg:143.46ms step:135/1480 train_time:17936ms step_avg:143.49ms step:136/1480 train_time:18083ms step_avg:143.52ms step:137/1480 train_time:18229ms step_avg:143.54ms step:138/1480 train_time:18377ms step_avg:143.57ms step:139/1480 train_time:18523ms step_avg:143.59ms step:140/1480 train_time:18671ms step_avg:143.62ms step:141/1480 train_time:18818ms step_avg:143.65ms step:142/1480 train_time:18965ms step_avg:143.68ms step:143/1480 train_time:19110ms step_avg:143.68ms step:144/1480 train_time:19256ms step_avg:143.70ms step:145/1480 train_time:19403ms step_avg:143.73ms step:146/1480 train_time:19550ms step_avg:143.75ms step:147/1480 train_time:19696ms step_avg:143.77ms step:148/1480 train_time:19844ms step_avg:143.80ms step:149/1480 train_time:19990ms step_avg:143.81ms step:150/1480 train_time:20136ms step_avg:143.83ms step:151/1480 train_time:20283ms step_avg:143.85ms step:152/1480 train_time:20429ms step_avg:143.87ms step:153/1480 train_time:20577ms step_avg:143.90ms step:154/1480 train_time:20725ms step_avg:143.92ms step:155/1480 train_time:20871ms step_avg:143.94ms step:156/1480 train_time:21018ms step_avg:143.96ms step:157/1480 train_time:21165ms step_avg:143.98ms step:158/1480 train_time:21310ms step_avg:143.99ms step:159/1480 train_time:21458ms step_avg:144.01ms step:160/1480 train_time:21605ms step_avg:144.03ms step:161/1480 train_time:21752ms step_avg:144.05ms step:162/1480 train_time:21898ms step_avg:144.07ms step:163/1480 train_time:22045ms step_avg:144.09ms step:164/1480 train_time:22190ms step_avg:144.09ms step:165/1480 train_time:22337ms step_avg:144.11ms step:166/1480 train_time:22485ms step_avg:144.13ms step:167/1480 train_time:22631ms step_avg:144.14ms step:168/1480 train_time:22780ms step_avg:144.18ms step:169/1480 train_time:22925ms step_avg:144.18ms step:170/1480 train_time:23070ms step_avg:144.19ms step:171/1480 train_time:23218ms step_avg:144.21ms step:172/1480 train_time:23365ms step_avg:144.23ms step:173/1480 train_time:23510ms step_avg:144.23ms step:174/1480 train_time:23657ms step_avg:144.25ms step:175/1480 train_time:23805ms step_avg:144.27ms step:176/1480 train_time:23950ms step_avg:144.28ms step:177/1480 train_time:24096ms step_avg:144.29ms step:178/1480 train_time:24243ms step_avg:144.31ms step:179/1480 train_time:24388ms step_avg:144.31ms step:180/1480 train_time:24535ms step_avg:144.33ms step:181/1480 train_time:24683ms step_avg:144.34ms step:182/1480 train_time:24829ms step_avg:144.36ms step:183/1480 train_time:24975ms step_avg:144.37ms step:184/1480 train_time:25123ms step_avg:144.38ms step:185/1480 train_time:25269ms step_avg:144.39ms step:186/1480 train_time:25416ms step_avg:144.41ms step:187/1480 train_time:25563ms step_avg:144.42ms step:188/1480 train_time:25710ms step_avg:144.44ms step:189/1480 train_time:25857ms step_avg:144.45ms step:190/1480 train_time:26004ms step_avg:144.47ms step:191/1480 train_time:26150ms step_avg:144.47ms step:192/1480 train_time:26295ms step_avg:144.48ms step:193/1480 train_time:26443ms step_avg:144.49ms step:194/1480 train_time:26588ms step_avg:144.50ms step:195/1480 train_time:26735ms step_avg:144.52ms step:196/1480 train_time:26884ms step_avg:144.54ms step:197/1480 train_time:27029ms step_avg:144.54ms step:198/1480 train_time:27177ms step_avg:144.56ms step:199/1480 train_time:27323ms step_avg:144.57ms step:200/1480 train_time:27469ms step_avg:144.58ms step:201/1480 train_time:27616ms step_avg:144.59ms step:202/1480 train_time:27764ms step_avg:144.61ms step:203/1480 train_time:27910ms step_avg:144.61ms step:204/1480 train_time:28056ms step_avg:144.62ms step:205/1480 train_time:28203ms step_avg:144.63ms step:206/1480 train_time:28350ms step_avg:144.64ms step:207/1480 train_time:28495ms step_avg:144.65ms step:208/1480 train_time:28643ms step_avg:144.66ms step:209/1480 train_time:28789ms step_avg:144.67ms step:210/1480 train_time:28934ms step_avg:144.67ms step:211/1480 train_time:29083ms step_avg:144.69ms step:212/1480 train_time:29228ms step_avg:144.69ms step:213/1480 train_time:29375ms step_avg:144.71ms step:214/1480 train_time:29523ms step_avg:144.72ms step:215/1480 train_time:29669ms step_avg:144.73ms step:216/1480 train_time:29816ms step_avg:144.74ms step:217/1480 train_time:29963ms step_avg:144.75ms step:218/1480 train_time:30110ms step_avg:144.76ms step:219/1480 train_time:30257ms step_avg:144.77ms step:220/1480 train_time:30404ms step_avg:144.78ms step:221/1480 train_time:30552ms step_avg:144.80ms step:222/1480 train_time:30702ms step_avg:144.82ms step:223/1480 train_time:30852ms step_avg:144.85ms step:224/1480 train_time:31003ms step_avg:144.87ms step:225/1480 train_time:31153ms step_avg:144.90ms step:226/1480 train_time:31304ms step_avg:144.92ms step:227/1480 train_time:31453ms step_avg:144.95ms step:228/1480 train_time:31604ms step_avg:144.97ms step:229/1480 train_time:31754ms step_avg:144.99ms step:230/1480 train_time:31904ms step_avg:145.02ms step:231/1480 train_time:32053ms step_avg:145.04ms step:232/1480 train_time:32204ms step_avg:145.06ms step:233/1480 train_time:32354ms step_avg:145.08ms step:234/1480 train_time:32505ms step_avg:145.11ms step:235/1480 train_time:32656ms step_avg:145.14ms step:236/1480 train_time:32807ms step_avg:145.16ms step:237/1480 train_time:32956ms step_avg:145.18ms step:238/1480 train_time:33106ms step_avg:145.20ms step:239/1480 train_time:33256ms step_avg:145.22ms step:240/1480 train_time:33407ms step_avg:145.25ms step:241/1480 train_time:33558ms step_avg:145.27ms step:242/1480 train_time:33709ms step_avg:145.30ms step:243/1480 train_time:33860ms step_avg:145.32ms step:244/1480 train_time:34010ms step_avg:145.34ms step:245/1480 train_time:34160ms step_avg:145.36ms step:246/1480 train_time:34309ms step_avg:145.38ms step:247/1480 train_time:34460ms step_avg:145.40ms step:248/1480 train_time:34610ms step_avg:145.42ms step:249/1480 train_time:34760ms step_avg:145.44ms step:250/1480 train_time:34910ms step_avg:145.46ms step:250/1480 val_loss:3.9956 train_time:34968ms step_avg:145.70ms step:251/1480 train_time:35064ms step_avg:145.49ms step:252/1480 train_time:35216ms step_avg:145.52ms step:253/1480 train_time:35365ms step_avg:145.54ms step:254/1480 train_time:35514ms step_avg:145.55ms step:255/1480 train_time:35664ms step_avg:145.57ms step:256/1480 train_time:35813ms step_avg:145.58ms step:257/1480 train_time:35963ms step_avg:145.60ms step:258/1480 train_time:36114ms step_avg:145.62ms step:259/1480 train_time:36266ms step_avg:145.65ms step:260/1480 train_time:36417ms step_avg:145.67ms step:261/1480 train_time:36567ms step_avg:145.69ms step:262/1480 train_time:36717ms step_avg:145.70ms step:263/1480 train_time:36866ms step_avg:145.72ms step:264/1480 train_time:37017ms step_avg:145.74ms step:265/1480 train_time:37168ms step_avg:145.76ms step:266/1480 train_time:37319ms step_avg:145.78ms step:267/1480 train_time:37470ms step_avg:145.80ms step:268/1480 train_time:37620ms step_avg:145.81ms step:269/1480 train_time:37770ms step_avg:145.83ms step:270/1480 train_time:37920ms step_avg:145.85ms step:271/1480 train_time:38068ms step_avg:145.85ms step:272/1480 train_time:38219ms step_avg:145.87ms step:273/1480 train_time:38369ms step_avg:145.89ms step:274/1480 train_time:38519ms step_avg:145.91ms step:275/1480 train_time:38670ms step_avg:145.93ms step:276/1480 train_time:38821ms step_avg:145.94ms step:277/1480 train_time:38970ms step_avg:145.96ms step:278/1480 train_time:39121ms step_avg:145.97ms step:279/1480 train_time:39272ms step_avg:145.99ms step:280/1480 train_time:39423ms step_avg:146.01ms step:281/1480 train_time:39573ms step_avg:146.03ms step:282/1480 train_time:39724ms step_avg:146.04ms step:283/1480 train_time:39875ms step_avg:146.06ms step:284/1480 train_time:40024ms step_avg:146.07ms step:285/1480 train_time:40175ms step_avg:146.09ms step:286/1480 train_time:40326ms step_avg:146.11ms step:287/1480 train_time:40476ms step_avg:146.12ms step:288/1480 train_time:40626ms step_avg:146.14ms step:289/1480 train_time:40776ms step_avg:146.15ms step:290/1480 train_time:40925ms step_avg:146.16ms step:291/1480 train_time:41076ms step_avg:146.18ms step:292/1480 train_time:41225ms step_avg:146.19ms step:293/1480 train_time:41377ms step_avg:146.21ms step:294/1480 train_time:41527ms step_avg:146.22ms step:295/1480 train_time:41677ms step_avg:146.24ms step:296/1480 train_time:41827ms step_avg:146.25ms step:297/1480 train_time:41977ms step_avg:146.26ms step:298/1480 train_time:42126ms step_avg:146.27ms step:299/1480 train_time:42277ms step_avg:146.29ms step:300/1480 train_time:42427ms step_avg:146.30ms step:301/1480 train_time:42578ms step_avg:146.31ms step:302/1480 train_time:42727ms step_avg:146.33ms step:303/1480 train_time:42878ms step_avg:146.34ms step:304/1480 train_time:43028ms step_avg:146.35ms step:305/1480 train_time:43179ms step_avg:146.37ms step:306/1480 train_time:43328ms step_avg:146.38ms step:307/1480 train_time:43480ms step_avg:146.40ms step:308/1480 train_time:43629ms step_avg:146.41ms step:309/1480 train_time:43779ms step_avg:146.42ms step:310/1480 train_time:43929ms step_avg:146.43ms step:311/1480 train_time:44081ms step_avg:146.45ms step:312/1480 train_time:44230ms step_avg:146.46ms step:313/1480 train_time:44382ms step_avg:146.48ms step:314/1480 train_time:44532ms step_avg:146.49ms step:315/1480 train_time:44683ms step_avg:146.50ms step:316/1480 train_time:44832ms step_avg:146.51ms step:317/1480 train_time:44983ms step_avg:146.53ms step:318/1480 train_time:45133ms step_avg:146.53ms step:319/1480 train_time:45285ms step_avg:146.55ms step:320/1480 train_time:45436ms step_avg:146.57ms step:321/1480 train_time:45587ms step_avg:146.58ms step:322/1480 train_time:45738ms step_avg:146.60ms step:323/1480 train_time:45888ms step_avg:146.61ms step:324/1480 train_time:46038ms step_avg:146.62ms step:325/1480 train_time:46188ms step_avg:146.63ms step:326/1480 train_time:46339ms step_avg:146.64ms step:327/1480 train_time:46490ms step_avg:146.66ms step:328/1480 train_time:46641ms step_avg:146.67ms step:329/1480 train_time:46791ms step_avg:146.68ms step:330/1480 train_time:46943ms step_avg:146.70ms step:331/1480 train_time:47097ms step_avg:146.72ms step:332/1480 train_time:47250ms step_avg:146.74ms step:333/1480 train_time:47403ms step_avg:146.76ms step:334/1480 train_time:47557ms step_avg:146.78ms step:335/1480 train_time:47711ms step_avg:146.80ms step:336/1480 train_time:47865ms step_avg:146.82ms step:337/1480 train_time:48019ms step_avg:146.85ms step:338/1480 train_time:48175ms step_avg:146.87ms step:339/1480 train_time:48328ms step_avg:146.89ms step:340/1480 train_time:48481ms step_avg:146.91ms step:341/1480 train_time:48634ms step_avg:146.93ms step:342/1480 train_time:48790ms step_avg:146.96ms step:343/1480 train_time:48944ms step_avg:146.98ms step:344/1480 train_time:49098ms step_avg:147.00ms step:345/1480 train_time:49252ms step_avg:147.02ms step:346/1480 train_time:49406ms step_avg:147.04ms step:347/1480 train_time:49561ms step_avg:147.06ms step:348/1480 train_time:49713ms step_avg:147.08ms step:349/1480 train_time:49868ms step_avg:147.10ms step:350/1480 train_time:50022ms step_avg:147.12ms step:351/1480 train_time:50175ms step_avg:147.14ms step:352/1480 train_time:50329ms step_avg:147.16ms step:353/1480 train_time:50484ms step_avg:147.18ms step:354/1480 train_time:50638ms step_avg:147.20ms step:355/1480 train_time:50793ms step_avg:147.23ms step:356/1480 train_time:50945ms step_avg:147.24ms step:357/1480 train_time:51101ms step_avg:147.26ms step:358/1480 train_time:51254ms step_avg:147.28ms step:359/1480 train_time:51410ms step_avg:147.31ms step:360/1480 train_time:51566ms step_avg:147.33ms step:361/1480 train_time:51720ms step_avg:147.35ms step:362/1480 train_time:51874ms step_avg:147.37ms step:363/1480 train_time:52026ms step_avg:147.38ms step:364/1480 train_time:52179ms step_avg:147.40ms step:365/1480 train_time:52333ms step_avg:147.42ms step:366/1480 train_time:52487ms step_avg:147.44ms step:367/1480 train_time:52640ms step_avg:147.45ms step:368/1480 train_time:52795ms step_avg:147.47ms step:369/1480 train_time:52948ms step_avg:147.49ms step:370/1480 train_time:53102ms step_avg:147.51ms step:371/1480 train_time:53255ms step_avg:147.52ms step:372/1480 train_time:53410ms step_avg:147.54ms step:373/1480 train_time:53564ms step_avg:147.56ms step:374/1480 train_time:53717ms step_avg:147.58ms step:375/1480 train_time:53872ms step_avg:147.59ms step:375/1480 val_loss:3.8044 train_time:53932ms step_avg:147.76ms step:376/1480 train_time:54031ms step_avg:147.63ms step:377/1480 train_time:54188ms step_avg:147.65ms step:378/1480 train_time:54340ms step_avg:147.66ms step:379/1480 train_time:54495ms step_avg:147.68ms step:380/1480 train_time:54646ms step_avg:147.69ms step:381/1480 train_time:54799ms step_avg:147.71ms step:382/1480 train_time:54955ms step_avg:147.73ms step:383/1480 train_time:55110ms step_avg:147.75ms step:384/1480 train_time:55264ms step_avg:147.76ms step:385/1480 train_time:55417ms step_avg:147.78ms step:386/1480 train_time:55571ms step_avg:147.80ms step:387/1480 train_time:55724ms step_avg:147.81ms step:388/1480 train_time:55878ms step_avg:147.82ms step:389/1480 train_time:56032ms step_avg:147.84ms step:390/1480 train_time:56188ms step_avg:147.86ms step:391/1480 train_time:56343ms step_avg:147.88ms step:392/1480 train_time:56497ms step_avg:147.90ms step:393/1480 train_time:56651ms step_avg:147.91ms step:394/1480 train_time:56803ms step_avg:147.93ms step:395/1480 train_time:56957ms step_avg:147.94ms step:396/1480 train_time:57110ms step_avg:147.95ms step:397/1480 train_time:57263ms step_avg:147.97ms step:398/1480 train_time:57417ms step_avg:147.98ms step:399/1480 train_time:57571ms step_avg:148.00ms step:400/1480 train_time:57725ms step_avg:148.01ms step:401/1480 train_time:57878ms step_avg:148.03ms step:402/1480 train_time:58031ms step_avg:148.04ms step:403/1480 train_time:58184ms step_avg:148.05ms step:404/1480 train_time:58337ms step_avg:148.06ms step:405/1480 train_time:58491ms step_avg:148.08ms step:406/1480 train_time:58646ms step_avg:148.10ms step:407/1480 train_time:58800ms step_avg:148.11ms step:408/1480 train_time:58954ms step_avg:148.13ms step:409/1480 train_time:59107ms step_avg:148.14ms step:410/1480 train_time:59262ms step_avg:148.16ms step:411/1480 train_time:59416ms step_avg:148.17ms step:412/1480 train_time:59570ms step_avg:148.18ms step:413/1480 train_time:59723ms step_avg:148.20ms step:414/1480 train_time:59878ms step_avg:148.21ms step:415/1480 train_time:60032ms step_avg:148.23ms step:416/1480 train_time:60188ms step_avg:148.25ms step:417/1480 train_time:60339ms step_avg:148.25ms step:418/1480 train_time:60494ms step_avg:148.27ms step:419/1480 train_time:60649ms step_avg:148.29ms step:420/1480 train_time:60802ms step_avg:148.30ms step:421/1480 train_time:60956ms step_avg:148.31ms step:422/1480 train_time:61109ms step_avg:148.32ms step:423/1480 train_time:61262ms step_avg:148.33ms step:424/1480 train_time:61416ms step_avg:148.35ms step:425/1480 train_time:61573ms step_avg:148.37ms step:426/1480 train_time:61728ms step_avg:148.38ms step:427/1480 train_time:61881ms step_avg:148.40ms step:428/1480 train_time:62035ms step_avg:148.41ms step:429/1480 train_time:62189ms step_avg:148.42ms step:430/1480 train_time:62340ms step_avg:148.43ms step:431/1480 train_time:62494ms step_avg:148.44ms step:432/1480 train_time:62647ms step_avg:148.45ms step:433/1480 train_time:62801ms step_avg:148.47ms step:434/1480 train_time:62955ms step_avg:148.48ms step:435/1480 train_time:63109ms step_avg:148.49ms step:436/1480 train_time:63263ms step_avg:148.50ms step:437/1480 train_time:63416ms step_avg:148.52ms step:438/1480 train_time:63571ms step_avg:148.53ms step:439/1480 train_time:63724ms step_avg:148.54ms step:440/1480 train_time:63880ms step_avg:148.56ms step:441/1480 train_time:64035ms step_avg:148.57ms step:442/1480 train_time:64195ms step_avg:148.60ms step:443/1480 train_time:64350ms step_avg:148.62ms step:444/1480 train_time:64504ms step_avg:148.63ms step:445/1480 train_time:64658ms step_avg:148.64ms step:446/1480 train_time:64815ms step_avg:148.66ms step:447/1480 train_time:64970ms step_avg:148.67ms step:448/1480 train_time:65127ms step_avg:148.69ms step:449/1480 train_time:65285ms step_avg:148.71ms step:450/1480 train_time:65441ms step_avg:148.73ms step:451/1480 train_time:65598ms step_avg:148.75ms step:452/1480 train_time:65754ms step_avg:148.77ms step:453/1480 train_time:65909ms step_avg:148.78ms step:454/1480 train_time:66066ms step_avg:148.80ms step:455/1480 train_time:66221ms step_avg:148.81ms step:456/1480 train_time:66377ms step_avg:148.83ms step:457/1480 train_time:66533ms step_avg:148.84ms step:458/1480 train_time:66689ms step_avg:148.86ms step:459/1480 train_time:66847ms step_avg:148.88ms step:460/1480 train_time:67002ms step_avg:148.89ms step:461/1480 train_time:67161ms step_avg:148.92ms step:462/1480 train_time:67318ms step_avg:148.93ms step:463/1480 train_time:67476ms step_avg:148.95ms step:464/1480 train_time:67633ms step_avg:148.97ms step:465/1480 train_time:67790ms step_avg:148.99ms step:466/1480 train_time:67948ms step_avg:149.01ms step:467/1480 train_time:68104ms step_avg:149.02ms step:468/1480 train_time:68261ms step_avg:149.04ms step:469/1480 train_time:68417ms step_avg:149.06ms step:470/1480 train_time:68573ms step_avg:149.07ms step:471/1480 train_time:68728ms step_avg:149.09ms step:472/1480 train_time:68886ms step_avg:149.10ms step:473/1480 train_time:69041ms step_avg:149.12ms step:474/1480 train_time:69198ms step_avg:149.13ms step:475/1480 train_time:69356ms step_avg:149.15ms step:476/1480 train_time:69514ms step_avg:149.17ms step:477/1480 train_time:69673ms step_avg:149.19ms step:478/1480 train_time:69830ms step_avg:149.21ms step:479/1480 train_time:69988ms step_avg:149.23ms step:480/1480 train_time:70145ms step_avg:149.25ms step:481/1480 train_time:70301ms step_avg:149.26ms step:482/1480 train_time:70457ms step_avg:149.27ms step:483/1480 train_time:70614ms step_avg:149.29ms step:484/1480 train_time:70771ms step_avg:149.31ms step:485/1480 train_time:70929ms step_avg:149.32ms step:486/1480 train_time:71086ms step_avg:149.34ms step:487/1480 train_time:71242ms step_avg:149.35ms step:488/1480 train_time:71399ms step_avg:149.37ms step:489/1480 train_time:71556ms step_avg:149.39ms step:490/1480 train_time:71712ms step_avg:149.40ms step:491/1480 train_time:71870ms step_avg:149.42ms step:492/1480 train_time:72026ms step_avg:149.43ms step:493/1480 train_time:72183ms step_avg:149.45ms step:494/1480 train_time:72339ms step_avg:149.46ms step:495/1480 train_time:72498ms step_avg:149.48ms step:496/1480 train_time:72656ms step_avg:149.50ms step:497/1480 train_time:72814ms step_avg:149.52ms step:498/1480 train_time:72972ms step_avg:149.53ms step:499/1480 train_time:73131ms step_avg:149.55ms step:500/1480 train_time:73288ms step_avg:149.57ms step:500/1480 val_loss:3.6853 train_time:73349ms step_avg:149.69ms step:501/1480 train_time:73446ms step_avg:149.58ms step:502/1480 train_time:73604ms step_avg:149.60ms step:503/1480 train_time:73762ms step_avg:149.62ms step:504/1480 train_time:73918ms step_avg:149.63ms step:505/1480 train_time:74073ms step_avg:149.64ms step:506/1480 train_time:74229ms step_avg:149.65ms step:507/1480 train_time:74385ms step_avg:149.67ms step:508/1480 train_time:74542ms step_avg:149.68ms step:509/1480 train_time:74699ms step_avg:149.70ms step:510/1480 train_time:74856ms step_avg:149.71ms step:511/1480 train_time:75012ms step_avg:149.72ms step:512/1480 train_time:75169ms step_avg:149.74ms step:513/1480 train_time:75325ms step_avg:149.75ms step:514/1480 train_time:75483ms step_avg:149.77ms step:515/1480 train_time:75640ms step_avg:149.78ms step:516/1480 train_time:75799ms step_avg:149.80ms step:517/1480 train_time:75958ms step_avg:149.82ms step:518/1480 train_time:76115ms step_avg:149.83ms step:519/1480 train_time:76272ms step_avg:149.85ms step:520/1480 train_time:76429ms step_avg:149.86ms step:521/1480 train_time:76586ms step_avg:149.87ms step:522/1480 train_time:76742ms step_avg:149.89ms step:523/1480 train_time:76899ms step_avg:149.90ms step:524/1480 train_time:77057ms step_avg:149.92ms step:525/1480 train_time:77214ms step_avg:149.93ms step:526/1480 train_time:77375ms step_avg:149.95ms step:527/1480 train_time:77532ms step_avg:149.96ms step:528/1480 train_time:77687ms step_avg:149.98ms step:529/1480 train_time:77843ms step_avg:149.99ms step:530/1480 train_time:78000ms step_avg:150.00ms step:531/1480 train_time:78157ms step_avg:150.01ms step:532/1480 train_time:78313ms step_avg:150.03ms step:533/1480 train_time:78471ms step_avg:150.04ms step:534/1480 train_time:78627ms step_avg:150.05ms step:535/1480 train_time:78784ms step_avg:150.06ms step:536/1480 train_time:78943ms step_avg:150.08ms step:537/1480 train_time:79100ms step_avg:150.10ms step:538/1480 train_time:79258ms step_avg:150.11ms step:539/1480 train_time:79416ms step_avg:150.12ms step:540/1480 train_time:79573ms step_avg:150.14ms step:541/1480 train_time:79729ms step_avg:150.15ms step:542/1480 train_time:79886ms step_avg:150.16ms step:543/1480 train_time:80042ms step_avg:150.17ms step:544/1480 train_time:80199ms step_avg:150.19ms step:545/1480 train_time:80356ms step_avg:150.20ms step:546/1480 train_time:80513ms step_avg:150.21ms step:547/1480 train_time:80669ms step_avg:150.22ms step:548/1480 train_time:80826ms step_avg:150.23ms step:549/1480 train_time:80984ms step_avg:150.25ms step:550/1480 train_time:81142ms step_avg:150.26ms step:551/1480 train_time:81301ms step_avg:150.28ms step:552/1480 train_time:81461ms step_avg:150.30ms step:553/1480 train_time:81622ms step_avg:150.32ms step:554/1480 train_time:81783ms step_avg:150.34ms step:555/1480 train_time:81943ms step_avg:150.35ms step:556/1480 train_time:82102ms step_avg:150.37ms step:557/1480 train_time:82262ms step_avg:150.39ms step:558/1480 train_time:82422ms step_avg:150.40ms step:559/1480 train_time:82582ms step_avg:150.42ms step:560/1480 train_time:82742ms step_avg:150.44ms step:561/1480 train_time:82902ms step_avg:150.46ms step:562/1480 train_time:83063ms step_avg:150.48ms step:563/1480 train_time:83222ms step_avg:150.49ms step:564/1480 train_time:83382ms step_avg:150.51ms step:565/1480 train_time:83542ms step_avg:150.53ms step:566/1480 train_time:83701ms step_avg:150.54ms step:567/1480 train_time:83860ms step_avg:150.56ms step:568/1480 train_time:84020ms step_avg:150.57ms step:569/1480 train_time:84181ms step_avg:150.59ms step:570/1480 train_time:84342ms step_avg:150.61ms step:571/1480 train_time:84502ms step_avg:150.63ms step:572/1480 train_time:84661ms step_avg:150.64ms step:573/1480 train_time:84821ms step_avg:150.66ms step:574/1480 train_time:84982ms step_avg:150.68ms step:575/1480 train_time:85143ms step_avg:150.70ms step:576/1480 train_time:85302ms step_avg:150.71ms step:577/1480 train_time:85463ms step_avg:150.73ms step:578/1480 train_time:85622ms step_avg:150.74ms step:579/1480 train_time:85781ms step_avg:150.76ms step:580/1480 train_time:85941ms step_avg:150.77ms step:581/1480 train_time:86101ms step_avg:150.79ms step:582/1480 train_time:86261ms step_avg:150.81ms step:583/1480 train_time:86421ms step_avg:150.82ms step:584/1480 train_time:86582ms step_avg:150.84ms step:585/1480 train_time:86742ms step_avg:150.86ms step:586/1480 train_time:86902ms step_avg:150.87ms step:587/1480 train_time:87061ms step_avg:150.89ms step:588/1480 train_time:87221ms step_avg:150.90ms step:589/1480 train_time:87382ms step_avg:150.92ms step:590/1480 train_time:87542ms step_avg:150.94ms step:591/1480 train_time:87700ms step_avg:150.95ms step:592/1480 train_time:87861ms step_avg:150.96ms step:593/1480 train_time:88021ms step_avg:150.98ms step:594/1480 train_time:88183ms step_avg:151.00ms step:595/1480 train_time:88344ms step_avg:151.01ms step:596/1480 train_time:88505ms step_avg:151.03ms step:597/1480 train_time:88664ms step_avg:151.05ms step:598/1480 train_time:88822ms step_avg:151.06ms step:599/1480 train_time:88981ms step_avg:151.07ms step:600/1480 train_time:89142ms step_avg:151.09ms step:601/1480 train_time:89301ms step_avg:151.10ms step:602/1480 train_time:89461ms step_avg:151.12ms step:603/1480 train_time:89621ms step_avg:151.13ms step:604/1480 train_time:89782ms step_avg:151.15ms step:605/1480 train_time:89941ms step_avg:151.16ms step:606/1480 train_time:90102ms step_avg:151.18ms step:607/1480 train_time:90265ms step_avg:151.20ms step:608/1480 train_time:90424ms step_avg:151.21ms step:609/1480 train_time:90583ms step_avg:151.22ms step:610/1480 train_time:90742ms step_avg:151.24ms step:611/1480 train_time:90901ms step_avg:151.25ms step:612/1480 train_time:91063ms step_avg:151.27ms step:613/1480 train_time:91223ms step_avg:151.28ms step:614/1480 train_time:91383ms step_avg:151.30ms step:615/1480 train_time:91543ms step_avg:151.31ms step:616/1480 train_time:91700ms step_avg:151.32ms step:617/1480 train_time:91860ms step_avg:151.33ms step:618/1480 train_time:92018ms step_avg:151.35ms step:619/1480 train_time:92178ms step_avg:151.36ms step:620/1480 train_time:92339ms step_avg:151.38ms step:621/1480 train_time:92500ms step_avg:151.39ms step:622/1480 train_time:92661ms step_avg:151.41ms step:623/1480 train_time:92821ms step_avg:151.42ms step:624/1480 train_time:92981ms step_avg:151.44ms step:625/1480 train_time:93141ms step_avg:151.45ms step:625/1480 val_loss:3.6052 train_time:93205ms step_avg:151.55ms step:626/1480 train_time:93304ms step_avg:151.47ms step:627/1480 train_time:93463ms step_avg:151.48ms step:628/1480 train_time:93621ms step_avg:151.49ms step:629/1480 train_time:93779ms step_avg:151.50ms step:630/1480 train_time:93938ms step_avg:151.51ms step:631/1480 train_time:94095ms step_avg:151.52ms step:632/1480 train_time:94254ms step_avg:151.53ms step:633/1480 train_time:94414ms step_avg:151.55ms step:634/1480 train_time:94575ms step_avg:151.56ms step:635/1480 train_time:94734ms step_avg:151.57ms step:636/1480 train_time:94893ms step_avg:151.59ms step:637/1480 train_time:95054ms step_avg:151.60ms step:638/1480 train_time:95213ms step_avg:151.61ms step:639/1480 train_time:95372ms step_avg:151.62ms step:640/1480 train_time:95533ms step_avg:151.64ms step:641/1480 train_time:95693ms step_avg:151.65ms step:642/1480 train_time:95853ms step_avg:151.67ms step:643/1480 train_time:96013ms step_avg:151.68ms step:644/1480 train_time:96171ms step_avg:151.69ms step:645/1480 train_time:96331ms step_avg:151.70ms step:646/1480 train_time:96490ms step_avg:151.71ms step:647/1480 train_time:96650ms step_avg:151.73ms step:648/1480 train_time:96812ms step_avg:151.74ms step:649/1480 train_time:96972ms step_avg:151.76ms step:650/1480 train_time:97132ms step_avg:151.77ms step:651/1480 train_time:97292ms step_avg:151.78ms step:652/1480 train_time:97453ms step_avg:151.80ms step:653/1480 train_time:97613ms step_avg:151.81ms step:654/1480 train_time:97773ms step_avg:151.82ms step:655/1480 train_time:97933ms step_avg:151.83ms step:656/1480 train_time:98092ms step_avg:151.85ms step:657/1480 train_time:98253ms step_avg:151.86ms step:658/1480 train_time:98413ms step_avg:151.87ms step:659/1480 train_time:98574ms step_avg:151.89ms step:660/1480 train_time:98737ms step_avg:151.90ms step:661/1480 train_time:98898ms step_avg:151.92ms step:662/1480 train_time:99058ms step_avg:151.93ms step:663/1480 train_time:99218ms step_avg:151.94ms step:664/1480 train_time:99380ms step_avg:151.96ms step:665/1480 train_time:99541ms step_avg:151.97ms step:666/1480 train_time:99701ms step_avg:151.98ms step:667/1480 train_time:99864ms step_avg:152.00ms step:668/1480 train_time:100027ms step_avg:152.02ms step:669/1480 train_time:100189ms step_avg:152.03ms step:670/1480 train_time:100350ms step_avg:152.04ms step:671/1480 train_time:100512ms step_avg:152.06ms step:672/1480 train_time:100674ms step_avg:152.07ms step:673/1480 train_time:100836ms step_avg:152.09ms step:674/1480 train_time:100997ms step_avg:152.10ms step:675/1480 train_time:101158ms step_avg:152.12ms step:676/1480 train_time:101320ms step_avg:152.13ms step:677/1480 train_time:101480ms step_avg:152.14ms step:678/1480 train_time:101640ms step_avg:152.16ms step:679/1480 train_time:101801ms step_avg:152.17ms step:680/1480 train_time:101963ms step_avg:152.18ms step:681/1480 train_time:102123ms step_avg:152.20ms step:682/1480 train_time:102286ms step_avg:152.21ms step:683/1480 train_time:102448ms step_avg:152.23ms step:684/1480 train_time:102609ms step_avg:152.24ms step:685/1480 train_time:102774ms step_avg:152.26ms step:686/1480 train_time:102936ms step_avg:152.27ms step:687/1480 train_time:103096ms step_avg:152.28ms step:688/1480 train_time:103259ms step_avg:152.30ms step:689/1480 train_time:103422ms step_avg:152.32ms step:690/1480 train_time:103584ms step_avg:152.33ms step:691/1480 train_time:103745ms step_avg:152.34ms step:692/1480 train_time:103907ms step_avg:152.36ms step:693/1480 train_time:104069ms step_avg:152.37ms step:694/1480 train_time:104231ms step_avg:152.39ms step:695/1480 train_time:104393ms step_avg:152.40ms step:696/1480 train_time:104554ms step_avg:152.41ms step:697/1480 train_time:104716ms step_avg:152.42ms step:698/1480 train_time:104876ms step_avg:152.44ms step:699/1480 train_time:105040ms step_avg:152.45ms step:700/1480 train_time:105202ms step_avg:152.47ms step:701/1480 train_time:105362ms step_avg:152.48ms step:702/1480 train_time:105523ms step_avg:152.49ms step:703/1480 train_time:105683ms step_avg:152.50ms step:704/1480 train_time:105842ms step_avg:152.51ms step:705/1480 train_time:106004ms step_avg:152.52ms step:706/1480 train_time:106168ms step_avg:152.54ms step:707/1480 train_time:106331ms step_avg:152.56ms step:708/1480 train_time:106492ms step_avg:152.57ms step:709/1480 train_time:106655ms step_avg:152.58ms step:710/1480 train_time:106815ms step_avg:152.59ms step:711/1480 train_time:106977ms step_avg:152.61ms step:712/1480 train_time:107141ms step_avg:152.62ms step:713/1480 train_time:107302ms step_avg:152.63ms step:714/1480 train_time:107462ms step_avg:152.64ms step:715/1480 train_time:107622ms step_avg:152.65ms step:716/1480 train_time:107781ms step_avg:152.66ms step:717/1480 train_time:107943ms step_avg:152.68ms step:718/1480 train_time:108101ms step_avg:152.69ms step:719/1480 train_time:108261ms step_avg:152.70ms step:720/1480 train_time:108428ms step_avg:152.72ms step:721/1480 train_time:108591ms step_avg:152.73ms step:722/1480 train_time:108755ms step_avg:152.75ms step:723/1480 train_time:108916ms step_avg:152.76ms step:724/1480 train_time:109077ms step_avg:152.77ms step:725/1480 train_time:109240ms step_avg:152.78ms step:726/1480 train_time:109402ms step_avg:152.80ms step:727/1480 train_time:109564ms step_avg:152.81ms step:728/1480 train_time:109724ms step_avg:152.82ms step:729/1480 train_time:109885ms step_avg:152.83ms step:730/1480 train_time:110050ms step_avg:152.85ms step:731/1480 train_time:110212ms step_avg:152.86ms step:732/1480 train_time:110373ms step_avg:152.87ms step:733/1480 train_time:110535ms step_avg:152.88ms step:734/1480 train_time:110696ms step_avg:152.89ms step:735/1480 train_time:110857ms step_avg:152.91ms step:736/1480 train_time:111019ms step_avg:152.92ms step:737/1480 train_time:111181ms step_avg:152.93ms step:738/1480 train_time:111340ms step_avg:152.94ms step:739/1480 train_time:111499ms step_avg:152.95ms step:740/1480 train_time:111663ms step_avg:152.96ms step:741/1480 train_time:111825ms step_avg:152.98ms step:742/1480 train_time:111987ms step_avg:152.99ms step:743/1480 train_time:112150ms step_avg:153.00ms step:744/1480 train_time:112314ms step_avg:153.02ms step:745/1480 train_time:112479ms step_avg:153.03ms step:746/1480 train_time:112638ms step_avg:153.04ms step:747/1480 train_time:112798ms step_avg:153.05ms step:748/1480 train_time:112963ms step_avg:153.07ms step:749/1480 train_time:113127ms step_avg:153.08ms step:750/1480 train_time:113288ms step_avg:153.09ms step:750/1480 val_loss:3.5500 train_time:113352ms step_avg:153.18ms step:751/1480 train_time:113455ms step_avg:153.11ms step:752/1480 train_time:113617ms step_avg:153.12ms step:753/1480 train_time:113778ms step_avg:153.13ms step:754/1480 train_time:113938ms step_avg:153.14ms step:755/1480 train_time:114100ms step_avg:153.15ms step:756/1480 train_time:114260ms step_avg:153.16ms step:757/1480 train_time:114424ms step_avg:153.18ms step:758/1480 train_time:114584ms step_avg:153.19ms step:759/1480 train_time:114745ms step_avg:153.20ms step:760/1480 train_time:114905ms step_avg:153.21ms step:761/1480 train_time:115067ms step_avg:153.22ms step:762/1480 train_time:115228ms step_avg:153.23ms step:763/1480 train_time:115392ms step_avg:153.24ms step:764/1480 train_time:115555ms step_avg:153.26ms step:765/1480 train_time:115718ms step_avg:153.27ms step:766/1480 train_time:115881ms step_avg:153.28ms step:767/1480 train_time:116042ms step_avg:153.29ms step:768/1480 train_time:116203ms step_avg:153.30ms step:769/1480 train_time:116365ms step_avg:153.31ms step:770/1480 train_time:116528ms step_avg:153.33ms step:771/1480 train_time:116692ms step_avg:153.34ms step:772/1480 train_time:116856ms step_avg:153.35ms step:773/1480 train_time:117019ms step_avg:153.37ms step:774/1480 train_time:117181ms step_avg:153.38ms step:775/1480 train_time:117344ms step_avg:153.39ms step:776/1480 train_time:117507ms step_avg:153.40ms step:777/1480 train_time:117673ms step_avg:153.42ms step:778/1480 train_time:117837ms step_avg:153.43ms step:779/1480 train_time:117999ms step_avg:153.44ms step:780/1480 train_time:118161ms step_avg:153.46ms step:781/1480 train_time:118324ms step_avg:153.47ms step:782/1480 train_time:118486ms step_avg:153.48ms step:783/1480 train_time:118647ms step_avg:153.49ms step:784/1480 train_time:118810ms step_avg:153.50ms step:785/1480 train_time:118973ms step_avg:153.51ms step:786/1480 train_time:119140ms step_avg:153.53ms step:787/1480 train_time:119304ms step_avg:153.54ms step:788/1480 train_time:119466ms step_avg:153.56ms step:789/1480 train_time:119627ms step_avg:153.56ms step:790/1480 train_time:119793ms step_avg:153.58ms step:791/1480 train_time:119961ms step_avg:153.60ms step:792/1480 train_time:120126ms step_avg:153.61ms step:793/1480 train_time:120288ms step_avg:153.62ms step:794/1480 train_time:120452ms step_avg:153.64ms step:795/1480 train_time:120619ms step_avg:153.65ms step:796/1480 train_time:120785ms step_avg:153.67ms step:797/1480 train_time:120951ms step_avg:153.69ms step:798/1480 train_time:121116ms step_avg:153.70ms step:799/1480 train_time:121282ms step_avg:153.72ms step:800/1480 train_time:121445ms step_avg:153.73ms step:801/1480 train_time:121607ms step_avg:153.74ms step:802/1480 train_time:121776ms step_avg:153.76ms step:803/1480 train_time:121939ms step_avg:153.77ms step:804/1480 train_time:122102ms step_avg:153.78ms step:805/1480 train_time:122267ms step_avg:153.79ms step:806/1480 train_time:122427ms step_avg:153.80ms step:807/1480 train_time:122588ms step_avg:153.81ms step:808/1480 train_time:122753ms step_avg:153.83ms step:809/1480 train_time:122920ms step_avg:153.84ms step:810/1480 train_time:123082ms step_avg:153.85ms step:811/1480 train_time:123243ms step_avg:153.86ms step:812/1480 train_time:123406ms step_avg:153.87ms step:813/1480 train_time:123566ms step_avg:153.88ms step:814/1480 train_time:123729ms step_avg:153.89ms step:815/1480 train_time:123893ms step_avg:153.90ms step:816/1480 train_time:124057ms step_avg:153.92ms step:817/1480 train_time:124221ms step_avg:153.93ms step:818/1480 train_time:124381ms step_avg:153.94ms step:819/1480 train_time:124544ms step_avg:153.95ms step:820/1480 train_time:124708ms step_avg:153.96ms step:821/1480 train_time:124870ms step_avg:153.97ms step:822/1480 train_time:125035ms step_avg:153.98ms step:823/1480 train_time:125198ms step_avg:154.00ms step:824/1480 train_time:125360ms step_avg:154.00ms step:825/1480 train_time:125524ms step_avg:154.02ms step:826/1480 train_time:125693ms step_avg:154.04ms step:827/1480 train_time:125858ms step_avg:154.05ms step:828/1480 train_time:126021ms step_avg:154.06ms step:829/1480 train_time:126184ms step_avg:154.07ms step:830/1480 train_time:126348ms step_avg:154.08ms step:831/1480 train_time:126512ms step_avg:154.09ms step:832/1480 train_time:126677ms step_avg:154.11ms step:833/1480 train_time:126841ms step_avg:154.12ms step:834/1480 train_time:127005ms step_avg:154.13ms step:835/1480 train_time:127168ms step_avg:154.14ms step:836/1480 train_time:127333ms step_avg:154.16ms step:837/1480 train_time:127496ms step_avg:154.17ms step:838/1480 train_time:127658ms step_avg:154.18ms step:839/1480 train_time:127821ms step_avg:154.19ms step:840/1480 train_time:127982ms step_avg:154.19ms step:841/1480 train_time:128142ms step_avg:154.20ms step:842/1480 train_time:128305ms step_avg:154.21ms step:843/1480 train_time:128465ms step_avg:154.22ms step:844/1480 train_time:128628ms step_avg:154.23ms step:845/1480 train_time:128794ms step_avg:154.24ms step:846/1480 train_time:128958ms step_avg:154.26ms step:847/1480 train_time:129123ms step_avg:154.27ms step:848/1480 train_time:129284ms step_avg:154.28ms step:849/1480 train_time:129445ms step_avg:154.29ms step:850/1480 train_time:129608ms step_avg:154.30ms step:851/1480 train_time:129775ms step_avg:154.31ms step:852/1480 train_time:129939ms step_avg:154.32ms step:853/1480 train_time:130100ms step_avg:154.33ms step:854/1480 train_time:130263ms step_avg:154.34ms step:855/1480 train_time:130424ms step_avg:154.35ms step:856/1480 train_time:130587ms step_avg:154.36ms step:857/1480 train_time:130753ms step_avg:154.37ms step:858/1480 train_time:130919ms step_avg:154.39ms step:859/1480 train_time:131083ms step_avg:154.40ms step:860/1480 train_time:131245ms step_avg:154.41ms step:861/1480 train_time:131412ms step_avg:154.42ms step:862/1480 train_time:131580ms step_avg:154.44ms step:863/1480 train_time:131747ms step_avg:154.45ms step:864/1480 train_time:131911ms step_avg:154.46ms step:865/1480 train_time:132073ms step_avg:154.47ms step:866/1480 train_time:132240ms step_avg:154.49ms step:867/1480 train_time:132403ms step_avg:154.50ms step:868/1480 train_time:132563ms step_avg:154.50ms step:869/1480 train_time:132725ms step_avg:154.51ms step:870/1480 train_time:132890ms step_avg:154.52ms step:871/1480 train_time:133053ms step_avg:154.53ms step:872/1480 train_time:133218ms step_avg:154.55ms step:873/1480 train_time:133382ms step_avg:154.56ms step:874/1480 train_time:133547ms step_avg:154.57ms step:875/1480 train_time:133712ms step_avg:154.58ms step:875/1480 val_loss:3.5029 train_time:133777ms step_avg:154.66ms step:876/1480 train_time:133877ms step_avg:154.59ms step:877/1480 train_time:134045ms step_avg:154.61ms step:878/1480 train_time:134207ms step_avg:154.62ms step:879/1480 train_time:134371ms step_avg:154.63ms step:880/1480 train_time:134534ms step_avg:154.64ms step:881/1480 train_time:134695ms step_avg:154.64ms step:882/1480 train_time:134860ms step_avg:154.66ms step:883/1480 train_time:135027ms step_avg:154.67ms step:884/1480 train_time:135193ms step_avg:154.68ms step:885/1480 train_time:135358ms step_avg:154.70ms step:886/1480 train_time:135524ms step_avg:154.71ms step:887/1480 train_time:135691ms step_avg:154.72ms step:888/1480 train_time:135864ms step_avg:154.74ms step:889/1480 train_time:136032ms step_avg:154.76ms step:890/1480 train_time:136193ms step_avg:154.76ms step:891/1480 train_time:136360ms step_avg:154.78ms step:892/1480 train_time:136526ms step_avg:154.79ms step:893/1480 train_time:136688ms step_avg:154.80ms step:894/1480 train_time:136854ms step_avg:154.81ms step:895/1480 train_time:137021ms step_avg:154.83ms step:896/1480 train_time:137187ms step_avg:154.84ms step:897/1480 train_time:137353ms step_avg:154.85ms step:898/1480 train_time:137521ms step_avg:154.87ms step:899/1480 train_time:137685ms step_avg:154.88ms step:900/1480 train_time:137848ms step_avg:154.89ms step:901/1480 train_time:138012ms step_avg:154.90ms step:902/1480 train_time:138174ms step_avg:154.90ms step:903/1480 train_time:138347ms step_avg:154.92ms step:904/1480 train_time:138512ms step_avg:154.94ms step:905/1480 train_time:138673ms step_avg:154.94ms step:906/1480 train_time:138841ms step_avg:154.96ms step:907/1480 train_time:139008ms step_avg:154.97ms step:908/1480 train_time:139170ms step_avg:154.98ms step:909/1480 train_time:139336ms step_avg:154.99ms step:910/1480 train_time:139505ms step_avg:155.01ms step:911/1480 train_time:139670ms step_avg:155.02ms step:912/1480 train_time:139835ms step_avg:155.03ms step:913/1480 train_time:140004ms step_avg:155.04ms step:914/1480 train_time:140170ms step_avg:155.06ms step:915/1480 train_time:140340ms step_avg:155.07ms step:916/1480 train_time:140504ms step_avg:155.08ms step:917/1480 train_time:140669ms step_avg:155.09ms step:918/1480 train_time:140838ms step_avg:155.11ms step:919/1480 train_time:141007ms step_avg:155.12ms step:920/1480 train_time:141171ms step_avg:155.13ms step:921/1480 train_time:141339ms step_avg:155.15ms step:922/1480 train_time:141506ms step_avg:155.16ms step:923/1480 train_time:141669ms step_avg:155.17ms step:924/1480 train_time:141835ms step_avg:155.18ms step:925/1480 train_time:142001ms step_avg:155.19ms step:926/1480 train_time:142165ms step_avg:155.20ms step:927/1480 train_time:142329ms step_avg:155.21ms step:928/1480 train_time:142494ms step_avg:155.22ms step:929/1480 train_time:142661ms step_avg:155.24ms step:930/1480 train_time:142827ms step_avg:155.25ms step:931/1480 train_time:142990ms step_avg:155.26ms step:932/1480 train_time:143155ms step_avg:155.27ms step:933/1480 train_time:143324ms step_avg:155.28ms step:934/1480 train_time:143490ms step_avg:155.29ms step:935/1480 train_time:143660ms step_avg:155.31ms step:936/1480 train_time:143828ms step_avg:155.32ms step:937/1480 train_time:143999ms step_avg:155.34ms step:938/1480 train_time:144163ms step_avg:155.35ms step:939/1480 train_time:144331ms step_avg:155.36ms step:940/1480 train_time:144498ms step_avg:155.37ms step:941/1480 train_time:144662ms step_avg:155.38ms step:942/1480 train_time:144828ms step_avg:155.39ms step:943/1480 train_time:144998ms step_avg:155.41ms step:944/1480 train_time:145170ms step_avg:155.43ms step:945/1480 train_time:145333ms step_avg:155.44ms step:946/1480 train_time:145504ms step_avg:155.45ms step:947/1480 train_time:145671ms step_avg:155.47ms step:948/1480 train_time:145837ms step_avg:155.48ms step:949/1480 train_time:146004ms step_avg:155.49ms step:950/1480 train_time:146168ms step_avg:155.50ms step:951/1480 train_time:146335ms step_avg:155.51ms step:952/1480 train_time:146501ms step_avg:155.52ms step:953/1480 train_time:146670ms step_avg:155.54ms step:954/1480 train_time:146839ms step_avg:155.55ms step:955/1480 train_time:147002ms step_avg:155.56ms step:956/1480 train_time:147168ms step_avg:155.57ms step:957/1480 train_time:147336ms step_avg:155.58ms step:958/1480 train_time:147505ms step_avg:155.60ms step:959/1480 train_time:147671ms step_avg:155.61ms step:960/1480 train_time:147839ms step_avg:155.62ms step:961/1480 train_time:148004ms step_avg:155.63ms step:962/1480 train_time:148169ms step_avg:155.64ms step:963/1480 train_time:148334ms step_avg:155.65ms step:964/1480 train_time:148504ms step_avg:155.67ms step:965/1480 train_time:148669ms step_avg:155.67ms step:966/1480 train_time:148835ms step_avg:155.68ms step:967/1480 train_time:148999ms step_avg:155.69ms step:968/1480 train_time:149165ms step_avg:155.70ms step:969/1480 train_time:149330ms step_avg:155.71ms step:970/1480 train_time:149493ms step_avg:155.72ms step:971/1480 train_time:149658ms step_avg:155.73ms step:972/1480 train_time:149824ms step_avg:155.74ms step:973/1480 train_time:149988ms step_avg:155.75ms step:974/1480 train_time:150154ms step_avg:155.76ms step:975/1480 train_time:150321ms step_avg:155.77ms step:976/1480 train_time:150486ms step_avg:155.78ms step:977/1480 train_time:150650ms step_avg:155.79ms step:978/1480 train_time:150815ms step_avg:155.80ms step:979/1480 train_time:150980ms step_avg:155.81ms step:980/1480 train_time:151146ms step_avg:155.82ms step:981/1480 train_time:151313ms step_avg:155.83ms step:982/1480 train_time:151476ms step_avg:155.84ms step:983/1480 train_time:151644ms step_avg:155.85ms step:984/1480 train_time:151808ms step_avg:155.86ms step:985/1480 train_time:151973ms step_avg:155.87ms step:986/1480 train_time:152139ms step_avg:155.88ms step:987/1480 train_time:152305ms step_avg:155.89ms step:988/1480 train_time:152471ms step_avg:155.90ms step:989/1480 train_time:152635ms step_avg:155.91ms step:990/1480 train_time:152805ms step_avg:155.92ms step:991/1480 train_time:152972ms step_avg:155.93ms step:992/1480 train_time:153148ms step_avg:155.96ms step:993/1480 train_time:153326ms step_avg:155.98ms step:994/1480 train_time:153491ms step_avg:155.99ms step:995/1480 train_time:153654ms step_avg:155.99ms step:996/1480 train_time:153819ms step_avg:156.00ms step:997/1480 train_time:153984ms step_avg:156.01ms step:998/1480 train_time:154148ms step_avg:156.02ms step:999/1480 train_time:154312ms step_avg:156.03ms step:1000/1480 train_time:154482ms step_avg:156.04ms step:1000/1480 val_loss:3.4392 train_time:154550ms step_avg:156.11ms step:1001/1480 train_time:154651ms step_avg:156.06ms step:1002/1480 train_time:154817ms step_avg:156.07ms step:1003/1480 train_time:154988ms step_avg:156.08ms step:1004/1480 train_time:155158ms step_avg:156.09ms step:1005/1480 train_time:155326ms step_avg:156.11ms step:1006/1480 train_time:155494ms step_avg:156.12ms step:1007/1480 train_time:155660ms step_avg:156.13ms step:1008/1480 train_time:155829ms step_avg:156.14ms step:1009/1480 train_time:156002ms step_avg:156.16ms step:1010/1480 train_time:156168ms step_avg:156.17ms step:1011/1480 train_time:156332ms step_avg:156.18ms step:1012/1480 train_time:156499ms step_avg:156.19ms step:1013/1480 train_time:156668ms step_avg:156.20ms step:1014/1480 train_time:156835ms step_avg:156.21ms step:1015/1480 train_time:157005ms step_avg:156.22ms step:1016/1480 train_time:157172ms step_avg:156.23ms step:1017/1480 train_time:157342ms step_avg:156.25ms step:1018/1480 train_time:157510ms step_avg:156.26ms step:1019/1480 train_time:157680ms step_avg:156.27ms step:1020/1480 train_time:157849ms step_avg:156.29ms step:1021/1480 train_time:158013ms step_avg:156.29ms step:1022/1480 train_time:158181ms step_avg:156.31ms step:1023/1480 train_time:158346ms step_avg:156.31ms step:1024/1480 train_time:158512ms step_avg:156.32ms step:1025/1480 train_time:158683ms step_avg:156.34ms step:1026/1480 train_time:158849ms step_avg:156.35ms step:1027/1480 train_time:159015ms step_avg:156.36ms step:1028/1480 train_time:159188ms step_avg:156.37ms step:1029/1480 train_time:159362ms step_avg:156.39ms step:1030/1480 train_time:159528ms step_avg:156.40ms step:1031/1480 train_time:159692ms step_avg:156.41ms step:1032/1480 train_time:159864ms step_avg:156.42ms step:1033/1480 train_time:160030ms step_avg:156.43ms step:1034/1480 train_time:160198ms step_avg:156.44ms step:1035/1480 train_time:160365ms step_avg:156.45ms step:1036/1480 train_time:160530ms step_avg:156.46ms step:1037/1480 train_time:160698ms step_avg:156.47ms step:1038/1480 train_time:160865ms step_avg:156.48ms step:1039/1480 train_time:161035ms step_avg:156.50ms step:1040/1480 train_time:161203ms step_avg:156.51ms step:1041/1480 train_time:161369ms step_avg:156.52ms step:1042/1480 train_time:161532ms step_avg:156.52ms step:1043/1480 train_time:161699ms step_avg:156.53ms step:1044/1480 train_time:161864ms step_avg:156.54ms step:1045/1480 train_time:162034ms step_avg:156.55ms step:1046/1480 train_time:162203ms step_avg:156.57ms step:1047/1480 train_time:162368ms step_avg:156.57ms step:1048/1480 train_time:162533ms step_avg:156.58ms step:1049/1480 train_time:162700ms step_avg:156.59ms step:1050/1480 train_time:162867ms step_avg:156.60ms step:1051/1480 train_time:163036ms step_avg:156.61ms step:1052/1480 train_time:163205ms step_avg:156.63ms step:1053/1480 train_time:163370ms step_avg:156.63ms step:1054/1480 train_time:163540ms step_avg:156.65ms step:1055/1480 train_time:163706ms step_avg:156.66ms step:1056/1480 train_time:163870ms step_avg:156.66ms step:1057/1480 train_time:164038ms step_avg:156.67ms step:1058/1480 train_time:164207ms step_avg:156.69ms step:1059/1480 train_time:164380ms step_avg:156.70ms step:1060/1480 train_time:164549ms step_avg:156.71ms step:1061/1480 train_time:164712ms step_avg:156.72ms step:1062/1480 train_time:164879ms step_avg:156.73ms step:1063/1480 train_time:165042ms step_avg:156.74ms step:1064/1480 train_time:165205ms step_avg:156.74ms step:1065/1480 train_time:165371ms step_avg:156.75ms step:1066/1480 train_time:165540ms step_avg:156.76ms step:1067/1480 train_time:165709ms step_avg:156.77ms step:1068/1480 train_time:165874ms step_avg:156.78ms step:1069/1480 train_time:166045ms step_avg:156.79ms step:1070/1480 train_time:166210ms step_avg:156.80ms step:1071/1480 train_time:166384ms step_avg:156.82ms step:1072/1480 train_time:166550ms step_avg:156.83ms step:1073/1480 train_time:166712ms step_avg:156.83ms step:1074/1480 train_time:166880ms step_avg:156.84ms step:1075/1480 train_time:167050ms step_avg:156.85ms step:1076/1480 train_time:167217ms step_avg:156.86ms step:1077/1480 train_time:167384ms step_avg:156.87ms step:1078/1480 train_time:167560ms step_avg:156.89ms step:1079/1480 train_time:167733ms step_avg:156.91ms step:1080/1480 train_time:167904ms step_avg:156.92ms step:1081/1480 train_time:168069ms step_avg:156.93ms step:1082/1480 train_time:168235ms step_avg:156.94ms step:1083/1480 train_time:168402ms step_avg:156.95ms step:1084/1480 train_time:168569ms step_avg:156.95ms step:1085/1480 train_time:168737ms step_avg:156.97ms step:1086/1480 train_time:168906ms step_avg:156.98ms step:1087/1480 train_time:169073ms step_avg:156.98ms step:1088/1480 train_time:169243ms step_avg:157.00ms step:1089/1480 train_time:169415ms step_avg:157.01ms step:1090/1480 train_time:169588ms step_avg:157.03ms step:1091/1480 train_time:169755ms step_avg:157.03ms step:1092/1480 train_time:169924ms step_avg:157.05ms step:1093/1480 train_time:170092ms step_avg:157.06ms step:1094/1480 train_time:170260ms step_avg:157.07ms step:1095/1480 train_time:170424ms step_avg:157.07ms step:1096/1480 train_time:170593ms step_avg:157.08ms step:1097/1480 train_time:170761ms step_avg:157.09ms step:1098/1480 train_time:170931ms step_avg:157.11ms step:1099/1480 train_time:171103ms step_avg:157.12ms step:1100/1480 train_time:171273ms step_avg:157.13ms step:1101/1480 train_time:171445ms step_avg:157.14ms step:1102/1480 train_time:171618ms step_avg:157.16ms step:1103/1480 train_time:171796ms step_avg:157.18ms step:1104/1480 train_time:171965ms step_avg:157.19ms step:1105/1480 train_time:172134ms step_avg:157.20ms step:1106/1480 train_time:172303ms step_avg:157.21ms step:1107/1480 train_time:172470ms step_avg:157.22ms step:1108/1480 train_time:172635ms step_avg:157.23ms step:1109/1480 train_time:172803ms step_avg:157.24ms step:1110/1480 train_time:172968ms step_avg:157.24ms step:1111/1480 train_time:173134ms step_avg:157.25ms step:1112/1480 train_time:173306ms step_avg:157.26ms step:1113/1480 train_time:173485ms step_avg:157.29ms step:1114/1480 train_time:173660ms step_avg:157.30ms step:1115/1480 train_time:173831ms step_avg:157.31ms step:1116/1480 train_time:173999ms step_avg:157.32ms step:1117/1480 train_time:174172ms step_avg:157.34ms step:1118/1480 train_time:174344ms step_avg:157.35ms step:1119/1480 train_time:174510ms step_avg:157.36ms step:1120/1480 train_time:174680ms step_avg:157.37ms step:1121/1480 train_time:174850ms step_avg:157.38ms step:1122/1480 train_time:175017ms step_avg:157.39ms step:1123/1480 train_time:175184ms step_avg:157.40ms step:1124/1480 train_time:175354ms step_avg:157.41ms step:1125/1480 train_time:175522ms step_avg:157.42ms step:1125/1480 val_loss:3.3832 train_time:175591ms step_avg:157.48ms step:1126/1480 train_time:175692ms step_avg:157.43ms step:1127/1480 train_time:175861ms step_avg:157.44ms step:1128/1480 train_time:176032ms step_avg:157.45ms step:1129/1480 train_time:176208ms step_avg:157.47ms step:1130/1480 train_time:176377ms step_avg:157.48ms step:1131/1480 train_time:176555ms step_avg:157.50ms step:1132/1480 train_time:176720ms step_avg:157.50ms step:1133/1480 train_time:176892ms step_avg:157.52ms step:1134/1480 train_time:177064ms step_avg:157.53ms step:1135/1480 train_time:177232ms step_avg:157.54ms step:1136/1480 train_time:177402ms step_avg:157.55ms step:1137/1480 train_time:177572ms step_avg:157.56ms step:1138/1480 train_time:177746ms step_avg:157.58ms step:1139/1480 train_time:177913ms step_avg:157.58ms step:1140/1480 train_time:178081ms step_avg:157.59ms step:1141/1480 train_time:178254ms step_avg:157.61ms step:1142/1480 train_time:178423ms step_avg:157.62ms step:1143/1480 train_time:178593ms step_avg:157.63ms step:1144/1480 train_time:178761ms step_avg:157.64ms step:1145/1480 train_time:178928ms step_avg:157.65ms step:1146/1480 train_time:179098ms step_avg:157.66ms step:1147/1480 train_time:179267ms step_avg:157.67ms step:1148/1480 train_time:179435ms step_avg:157.68ms step:1149/1480 train_time:179608ms step_avg:157.69ms step:1150/1480 train_time:179776ms step_avg:157.70ms step:1151/1480 train_time:179950ms step_avg:157.71ms step:1152/1480 train_time:180121ms step_avg:157.72ms step:1153/1480 train_time:180294ms step_avg:157.74ms step:1154/1480 train_time:180460ms step_avg:157.74ms step:1155/1480 train_time:180633ms step_avg:157.76ms step:1156/1480 train_time:180811ms step_avg:157.78ms step:1157/1480 train_time:180981ms step_avg:157.79ms step:1158/1480 train_time:181148ms step_avg:157.79ms step:1159/1480 train_time:181315ms step_avg:157.80ms step:1160/1480 train_time:181480ms step_avg:157.81ms step:1161/1480 train_time:181652ms step_avg:157.82ms step:1162/1480 train_time:181821ms step_avg:157.83ms step:1163/1480 train_time:181991ms step_avg:157.84ms step:1164/1480 train_time:182159ms step_avg:157.85ms step:1165/1480 train_time:182325ms step_avg:157.86ms step:1166/1480 train_time:182494ms step_avg:157.87ms step:1167/1480 train_time:182663ms step_avg:157.88ms step:1168/1480 train_time:182830ms step_avg:157.88ms step:1169/1480 train_time:182998ms step_avg:157.89ms step:1170/1480 train_time:183167ms step_avg:157.90ms step:1171/1480 train_time:183333ms step_avg:157.91ms step:1172/1480 train_time:183498ms step_avg:157.92ms step:1173/1480 train_time:183669ms step_avg:157.93ms step:1174/1480 train_time:183852ms step_avg:157.95ms step:1175/1480 train_time:184024ms step_avg:157.96ms step:1176/1480 train_time:184195ms step_avg:157.97ms step:1177/1480 train_time:184370ms step_avg:157.99ms step:1178/1480 train_time:184537ms step_avg:157.99ms step:1179/1480 train_time:184703ms step_avg:158.00ms step:1180/1480 train_time:184883ms step_avg:158.02ms step:1181/1480 train_time:185053ms step_avg:158.03ms step:1182/1480 train_time:185220ms step_avg:158.04ms step:1183/1480 train_time:185391ms step_avg:158.05ms step:1184/1480 train_time:185559ms step_avg:158.06ms step:1185/1480 train_time:185733ms step_avg:158.07ms step:1186/1480 train_time:185903ms step_avg:158.08ms step:1187/1480 train_time:186085ms step_avg:158.10ms step:1188/1480 train_time:186252ms step_avg:158.11ms step:1189/1480 train_time:186425ms step_avg:158.12ms step:1190/1480 train_time:186592ms step_avg:158.13ms step:1191/1480 train_time:186762ms step_avg:158.14ms step:1192/1480 train_time:186929ms step_avg:158.15ms step:1193/1480 train_time:187095ms step_avg:158.15ms step:1194/1480 train_time:187264ms step_avg:158.16ms step:1195/1480 train_time:187439ms step_avg:158.18ms step:1196/1480 train_time:187623ms step_avg:158.20ms step:1197/1480 train_time:187794ms step_avg:158.21ms step:1198/1480 train_time:187974ms step_avg:158.23ms step:1199/1480 train_time:188144ms step_avg:158.24ms step:1200/1480 train_time:188314ms step_avg:158.25ms step:1201/1480 train_time:188481ms step_avg:158.25ms step:1202/1480 train_time:188663ms step_avg:158.27ms step:1203/1480 train_time:188838ms step_avg:158.29ms step:1204/1480 train_time:189013ms step_avg:158.30ms step:1205/1480 train_time:189180ms step_avg:158.31ms step:1206/1480 train_time:189348ms step_avg:158.32ms step:1207/1480 train_time:189518ms step_avg:158.33ms step:1208/1480 train_time:189687ms step_avg:158.34ms step:1209/1480 train_time:189860ms step_avg:158.35ms step:1210/1480 train_time:190034ms step_avg:158.36ms step:1211/1480 train_time:190208ms step_avg:158.37ms step:1212/1480 train_time:190378ms step_avg:158.38ms step:1213/1480 train_time:190551ms step_avg:158.40ms step:1214/1480 train_time:190728ms step_avg:158.41ms step:1215/1480 train_time:190900ms step_avg:158.42ms step:1216/1480 train_time:191071ms step_avg:158.43ms step:1217/1480 train_time:191246ms step_avg:158.45ms step:1218/1480 train_time:191416ms step_avg:158.46ms step:1219/1480 train_time:191594ms step_avg:158.47ms step:1220/1480 train_time:191764ms step_avg:158.48ms step:1221/1480 train_time:191934ms step_avg:158.49ms step:1222/1480 train_time:192102ms step_avg:158.50ms step:1223/1480 train_time:192271ms step_avg:158.51ms step:1224/1480 train_time:192450ms step_avg:158.53ms step:1225/1480 train_time:192622ms step_avg:158.54ms step:1226/1480 train_time:192795ms step_avg:158.55ms step:1227/1480 train_time:192969ms step_avg:158.56ms step:1228/1480 train_time:193139ms step_avg:158.57ms step:1229/1480 train_time:193312ms step_avg:158.58ms step:1230/1480 train_time:193492ms step_avg:158.60ms step:1231/1480 train_time:193668ms step_avg:158.61ms step:1232/1480 train_time:193844ms step_avg:158.63ms step:1233/1480 train_time:194013ms step_avg:158.64ms step:1234/1480 train_time:194184ms step_avg:158.65ms step:1235/1480 train_time:194359ms step_avg:158.66ms step:1236/1480 train_time:194528ms step_avg:158.67ms step:1237/1480 train_time:194698ms step_avg:158.68ms step:1238/1480 train_time:194882ms step_avg:158.70ms step:1239/1480 train_time:195052ms step_avg:158.71ms step:1240/1480 train_time:195227ms step_avg:158.72ms step:1241/1480 train_time:195399ms step_avg:158.73ms step:1242/1480 train_time:195568ms step_avg:158.74ms step:1243/1480 train_time:195741ms step_avg:158.75ms step:1244/1480 train_time:195908ms step_avg:158.76ms step:1245/1480 train_time:196075ms step_avg:158.77ms step:1246/1480 train_time:196244ms step_avg:158.77ms step:1247/1480 train_time:196412ms step_avg:158.78ms step:1248/1480 train_time:196582ms step_avg:158.79ms step:1249/1480 train_time:196751ms step_avg:158.80ms step:1250/1480 train_time:196920ms step_avg:158.81ms step:1250/1480 val_loss:3.3331 train_time:196991ms step_avg:158.86ms step:1251/1480 train_time:197098ms step_avg:158.82ms step:1252/1480 train_time:197268ms step_avg:158.83ms step:1253/1480 train_time:197436ms step_avg:158.84ms step:1254/1480 train_time:197608ms step_avg:158.85ms step:1255/1480 train_time:197795ms step_avg:158.87ms step:1256/1480 train_time:197970ms step_avg:158.88ms step:1257/1480 train_time:198141ms step_avg:158.89ms step:1258/1480 train_time:198316ms step_avg:158.91ms step:1259/1480 train_time:198487ms step_avg:158.92ms step:1260/1480 train_time:198655ms step_avg:158.92ms step:1261/1480 train_time:198826ms step_avg:158.93ms step:1262/1480 train_time:199000ms step_avg:158.95ms step:1263/1480 train_time:199173ms step_avg:158.96ms step:1264/1480 train_time:199341ms step_avg:158.96ms step:1265/1480 train_time:199510ms step_avg:158.97ms step:1266/1480 train_time:199680ms step_avg:158.98ms step:1267/1480 train_time:199852ms step_avg:158.99ms step:1268/1480 train_time:200023ms step_avg:159.00ms step:1269/1480 train_time:200198ms step_avg:159.01ms step:1270/1480 train_time:200369ms step_avg:159.02ms step:1271/1480 train_time:200539ms step_avg:159.03ms step:1272/1480 train_time:200706ms step_avg:159.04ms step:1273/1480 train_time:200877ms step_avg:159.05ms step:1274/1480 train_time:201051ms step_avg:159.06ms step:1275/1480 train_time:201217ms step_avg:159.06ms step:1276/1480 train_time:201382ms step_avg:159.07ms step:1277/1480 train_time:201556ms step_avg:159.08ms step:1278/1480 train_time:201724ms step_avg:159.09ms step:1279/1480 train_time:201895ms step_avg:159.10ms step:1280/1480 train_time:202075ms step_avg:159.11ms step:1281/1480 train_time:202245ms step_avg:159.12ms step:1282/1480 train_time:202412ms step_avg:159.13ms step:1283/1480 train_time:202581ms step_avg:159.14ms step:1284/1480 train_time:202752ms step_avg:159.15ms step:1285/1480 train_time:202921ms step_avg:159.15ms step:1286/1480 train_time:203092ms step_avg:159.16ms step:1287/1480 train_time:203263ms step_avg:159.17ms step:1288/1480 train_time:203435ms step_avg:159.18ms step:1289/1480 train_time:203616ms step_avg:159.20ms step:1290/1480 train_time:203797ms step_avg:159.22ms step:1291/1480 train_time:203972ms step_avg:159.23ms step:1292/1480 train_time:204147ms step_avg:159.24ms step:1293/1480 train_time:204321ms step_avg:159.25ms step:1294/1480 train_time:204491ms step_avg:159.26ms step:1295/1480 train_time:204662ms step_avg:159.27ms step:1296/1480 train_time:204836ms step_avg:159.28ms step:1297/1480 train_time:205008ms step_avg:159.29ms step:1298/1480 train_time:205179ms step_avg:159.30ms step:1299/1480 train_time:205348ms step_avg:159.31ms step:1300/1480 train_time:205516ms step_avg:159.31ms step:1301/1480 train_time:205686ms step_avg:159.32ms step:1302/1480 train_time:205860ms step_avg:159.33ms step:1303/1480 train_time:206037ms step_avg:159.35ms step:1304/1480 train_time:206211ms step_avg:159.36ms step:1305/1480 train_time:206379ms step_avg:159.37ms step:1306/1480 train_time:206555ms step_avg:159.38ms step:1307/1480 train_time:206722ms step_avg:159.38ms step:1308/1480 train_time:206891ms step_avg:159.39ms step:1309/1480 train_time:207063ms step_avg:159.40ms step:1310/1480 train_time:207232ms step_avg:159.41ms step:1311/1480 train_time:207400ms step_avg:159.42ms step:1312/1480 train_time:207574ms step_avg:159.43ms step:1313/1480 train_time:207743ms step_avg:159.43ms step:1314/1480 train_time:207917ms step_avg:159.45ms step:1315/1480 train_time:208088ms step_avg:159.45ms step:1316/1480 train_time:208254ms step_avg:159.46ms step:1317/1480 train_time:208424ms step_avg:159.47ms step:1318/1480 train_time:208603ms step_avg:159.48ms step:1319/1480 train_time:208778ms step_avg:159.49ms step:1320/1480 train_time:208955ms step_avg:159.51ms step:1321/1480 train_time:209127ms step_avg:159.52ms step:1322/1480 train_time:209309ms step_avg:159.53ms step:1323/1480 train_time:209480ms step_avg:159.54ms step:1324/1480 train_time:209656ms step_avg:159.56ms step:1325/1480 train_time:209836ms step_avg:159.57ms step:1326/1480 train_time:210011ms step_avg:159.58ms step:1327/1480 train_time:210181ms step_avg:159.59ms step:1328/1480 train_time:210353ms step_avg:159.60ms step:1329/1480 train_time:210548ms step_avg:159.63ms step:1330/1480 train_time:210727ms step_avg:159.64ms step:1331/1480 train_time:210897ms step_avg:159.65ms step:1332/1480 train_time:211072ms step_avg:159.66ms step:1333/1480 train_time:211249ms step_avg:159.67ms step:1334/1480 train_time:211419ms step_avg:159.68ms step:1335/1480 train_time:211586ms step_avg:159.69ms step:1336/1480 train_time:211771ms step_avg:159.71ms step:1337/1480 train_time:211946ms step_avg:159.72ms step:1338/1480 train_time:212117ms step_avg:159.73ms step:1339/1480 train_time:212291ms step_avg:159.74ms step:1340/1480 train_time:212464ms step_avg:159.75ms step:1341/1480 train_time:212633ms step_avg:159.75ms step:1342/1480 train_time:212807ms step_avg:159.77ms step:1343/1480 train_time:212976ms step_avg:159.77ms step:1344/1480 train_time:213150ms step_avg:159.78ms step:1345/1480 train_time:213328ms step_avg:159.80ms step:1346/1480 train_time:213497ms step_avg:159.80ms step:1347/1480 train_time:213668ms step_avg:159.81ms step:1348/1480 train_time:213838ms step_avg:159.82ms step:1349/1480 train_time:214007ms step_avg:159.83ms step:1350/1480 train_time:214182ms step_avg:159.84ms step:1351/1480 train_time:214354ms step_avg:159.85ms step:1352/1480 train_time:214523ms step_avg:159.85ms step:1353/1480 train_time:214699ms step_avg:159.86ms step:1354/1480 train_time:214869ms step_avg:159.87ms step:1355/1480 train_time:215039ms step_avg:159.88ms step:1356/1480 train_time:215212ms step_avg:159.89ms step:1357/1480 train_time:215386ms step_avg:159.90ms step:1358/1480 train_time:215557ms step_avg:159.91ms step:1359/1480 train_time:215730ms step_avg:159.92ms step:1360/1480 train_time:215906ms step_avg:159.93ms step:1361/1480 train_time:216082ms step_avg:159.94ms step:1362/1480 train_time:216257ms step_avg:159.95ms step:1363/1480 train_time:216438ms step_avg:159.97ms step:1364/1480 train_time:216608ms step_avg:159.98ms step:1365/1480 train_time:216775ms step_avg:159.98ms step:1366/1480 train_time:216948ms step_avg:159.99ms step:1367/1480 train_time:217119ms step_avg:160.00ms step:1368/1480 train_time:217294ms step_avg:160.01ms step:1369/1480 train_time:217476ms step_avg:160.03ms step:1370/1480 train_time:217653ms step_avg:160.04ms step:1371/1480 train_time:217824ms step_avg:160.05ms step:1372/1480 train_time:218001ms step_avg:160.06ms step:1373/1480 train_time:218171ms step_avg:160.07ms step:1374/1480 train_time:218347ms step_avg:160.08ms step:1375/1480 train_time:218518ms step_avg:160.09ms step:1375/1480 val_loss:3.2946 train_time:218585ms step_avg:160.14ms step:1376/1480 train_time:218690ms step_avg:160.10ms step:1377/1480 train_time:218863ms step_avg:160.10ms step:1378/1480 train_time:219030ms step_avg:160.11ms step:1379/1480 train_time:219206ms step_avg:160.12ms step:1380/1480 train_time:219380ms step_avg:160.13ms step:1381/1480 train_time:219563ms step_avg:160.15ms step:1382/1480 train_time:219733ms step_avg:160.16ms step:1383/1480 train_time:219906ms step_avg:160.16ms step:1384/1480 train_time:220083ms step_avg:160.18ms step:1385/1480 train_time:220248ms step_avg:160.18ms step:1386/1480 train_time:220420ms step_avg:160.19ms step:1387/1480 train_time:220591ms step_avg:160.20ms step:1388/1480 train_time:220761ms step_avg:160.20ms step:1389/1480 train_time:220933ms step_avg:160.21ms step:1390/1480 train_time:221102ms step_avg:160.22ms step:1391/1480 train_time:221273ms step_avg:160.23ms step:1392/1480 train_time:221445ms step_avg:160.24ms step:1393/1480 train_time:221617ms step_avg:160.24ms step:1394/1480 train_time:221788ms step_avg:160.25ms step:1395/1480 train_time:221956ms step_avg:160.26ms step:1396/1480 train_time:222124ms step_avg:160.26ms step:1397/1480 train_time:222291ms step_avg:160.27ms step:1398/1480 train_time:222459ms step_avg:160.27ms step:1399/1480 train_time:222627ms step_avg:160.28ms step:1400/1480 train_time:222804ms step_avg:160.29ms step:1401/1480 train_time:222970ms step_avg:160.29ms step:1402/1480 train_time:223143ms step_avg:160.30ms step:1403/1480 train_time:223319ms step_avg:160.32ms step:1404/1480 train_time:223490ms step_avg:160.32ms step:1405/1480 train_time:223666ms step_avg:160.33ms step:1406/1480 train_time:223840ms step_avg:160.34ms step:1407/1480 train_time:224007ms step_avg:160.35ms step:1408/1480 train_time:224175ms step_avg:160.35ms step:1409/1480 train_time:224357ms step_avg:160.37ms step:1410/1480 train_time:224527ms step_avg:160.38ms step:1411/1480 train_time:224697ms step_avg:160.38ms step:1412/1480 train_time:224868ms step_avg:160.39ms step:1413/1480 train_time:225038ms step_avg:160.40ms step:1414/1480 train_time:225209ms step_avg:160.41ms step:1415/1480 train_time:225384ms step_avg:160.42ms step:1416/1480 train_time:225571ms step_avg:160.43ms step:1417/1480 train_time:225746ms step_avg:160.44ms step:1418/1480 train_time:225917ms step_avg:160.45ms step:1419/1480 train_time:226092ms step_avg:160.46ms step:1420/1480 train_time:226268ms step_avg:160.47ms step:1421/1480 train_time:226442ms step_avg:160.48ms step:1422/1480 train_time:226612ms step_avg:160.49ms step:1423/1480 train_time:226781ms step_avg:160.50ms step:1424/1480 train_time:226958ms step_avg:160.51ms step:1425/1480 train_time:227138ms step_avg:160.52ms step:1426/1480 train_time:227308ms step_avg:160.53ms step:1427/1480 train_time:227484ms step_avg:160.54ms step:1428/1480 train_time:227654ms step_avg:160.55ms step:1429/1480 train_time:227822ms step_avg:160.55ms step:1430/1480 train_time:227995ms step_avg:160.56ms step:1431/1480 train_time:228171ms step_avg:160.57ms step:1432/1480 train_time:228347ms step_avg:160.58ms step:1433/1480 train_time:228528ms step_avg:160.60ms step:1434/1480 train_time:228708ms step_avg:160.61ms step:1435/1480 train_time:228883ms step_avg:160.62ms step:1436/1480 train_time:229058ms step_avg:160.63ms step:1437/1480 train_time:229228ms step_avg:160.64ms step:1438/1480 train_time:229396ms step_avg:160.64ms step:1439/1480 train_time:229570ms step_avg:160.65ms step:1440/1480 train_time:229741ms step_avg:160.66ms step:1441/1480 train_time:229911ms step_avg:160.66ms step:1442/1480 train_time:230088ms step_avg:160.68ms step:1443/1480 train_time:230275ms step_avg:160.69ms step:1444/1480 train_time:230445ms step_avg:160.70ms step:1445/1480 train_time:230616ms step_avg:160.71ms step:1446/1480 train_time:230792ms step_avg:160.72ms step:1447/1480 train_time:230970ms step_avg:160.73ms step:1448/1480 train_time:231141ms step_avg:160.74ms step:1449/1480 train_time:231316ms step_avg:160.75ms step:1450/1480 train_time:231488ms step_avg:160.76ms step:1451/1480 train_time:231659ms step_avg:160.76ms step:1452/1480 train_time:231832ms step_avg:160.77ms step:1453/1480 train_time:232002ms step_avg:160.78ms step:1454/1480 train_time:232174ms step_avg:160.79ms step:1455/1480 train_time:232353ms step_avg:160.80ms step:1456/1480 train_time:232525ms step_avg:160.81ms step:1457/1480 train_time:232696ms step_avg:160.81ms step:1458/1480 train_time:232866ms step_avg:160.82ms step:1459/1480 train_time:233044ms step_avg:160.83ms step:1460/1480 train_time:233216ms step_avg:160.84ms step:1461/1480 train_time:233390ms step_avg:160.85ms step:1462/1480 train_time:233563ms step_avg:160.86ms step:1463/1480 train_time:233738ms step_avg:160.87ms step:1464/1480 train_time:233912ms step_avg:160.87ms step:1465/1480 train_time:234085ms step_avg:160.88ms step:1466/1480 train_time:234255ms step_avg:160.89ms step:1467/1480 train_time:234430ms step_avg:160.90ms step:1468/1480 train_time:234601ms step_avg:160.91ms step:1469/1480 train_time:234774ms step_avg:160.91ms step:1470/1480 train_time:234954ms step_avg:160.93ms step:1471/1480 train_time:235140ms step_avg:160.94ms step:1472/1480 train_time:235321ms step_avg:160.96ms step:1473/1480 train_time:235493ms step_avg:160.97ms step:1474/1480 train_time:235671ms step_avg:160.98ms step:1475/1480 train_time:235850ms step_avg:160.99ms step:1476/1480 train_time:236023ms step_avg:161.00ms step:1477/1480 train_time:236206ms step_avg:161.01ms step:1478/1480 train_time:236388ms step_avg:161.03ms step:1479/1480 train_time:236562ms step_avg:161.04ms step:1480/1480 train_time:236734ms step_avg:161.04ms step:1480/1480 val_loss:3.2757 train_time:236805ms step_avg:161.09ms