import os
import sys
# Snapshot this script's own source before anything else runs; the text is
# written to the run's log file (and placed in the checkpoint dict) later on.
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G exactly via its SVD and return U @ V.T.

    `steps` is ignored; it is accepted only so this function can be called
    with the same keyword as the iterative backend (see `zeropower_backends`).
    """
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: a 2D tensor (asserted).
        steps: number of Newton-Schulz iterations to run.
        eps: added to the norm before dividing, so an all-zero G does not divide by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # For a tall matrix, iterate on the transpose so that A = X @ X.T below is
    # the smaller (cols x cols) Gram matrix; the result is transposed back at the end.
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

# Name -> callable lookup used by Muon.step; both callables accept (G, steps=...).
zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.

    This implementation is distributed: it reads WORLD_SIZE and RANK from the environment at
    construction time and uses torch.distributed collectives in step().
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        # Both variables must be present in the environment (a missing one raises KeyError).
        self.num_process = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ["RANK"])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        params: "list[torch.Tensor]" = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One param group per distinct element count, so every parameter in a group
        # fits the same flat gather buffers. Each group owns `num_process` bfloat16
        # CUDA buffers of `size` elements (one destination slot per rank).
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                "params": [p for p in params if p.numel() == size],
                "update_buffer": [
                    torch.empty(size, device="cuda", dtype=torch.bfloat16)
                    for _ in range(self.num_process)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are processed in chunks of `num_process`:
        rank r computes the orthogonalized update for the r-th parameter of the chunk,
        an all_gather distributes every rank's flattened update, and each rank then
        applies all updates of the chunk to its own copy of the parameters.
        """
        for group in self.param_groups:
            lr: float = group["lr"]
            momentum: float = group["momentum"]
            nesterov: bool = group["nesterov"]
            zeropower_backend = zeropower_backends[group["backend"]]
            backend_steps: int = group["backend_steps"]
            update_buffers: "list[torch.Tensor]" = group["update_buffer"]
            # generate weight updates in distributed fashion
            params: "list[torch.Tensor]" = group["params"]
            # every chunk must be full: each rank indexes params[base_i + rank] below
            assert len(params) % self.num_process == 0
            handle = None
            params_world = None
            def update_prev():
                # Apply the updates gathered for the previous chunk. `handle` and
                # `params_world` are read from the enclosing scope at call time,
                # so this sees whatever the loop below last assigned.
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    # step size is lr * sqrt(max(1, rows / cols)); `**` binds tighter than `*`
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.num_process]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if "momentum_buffer" not in state:
                    state["momentum_buffer"] = torch.zeros_like(g)
                buf: torch.Tensor = state["momentum_buffer"]
                buf.lerp_(g, 1 - momentum)
                # with nesterov, p.grad itself is overwritten in place by lerp_
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_backend(g, steps=backend_steps).flatten()
                # Wait for and apply the previous chunk only now, after this chunk's
                # update has been computed, so the earlier async gather overlaps with
                # that computation. It must complete before update_buffers is reused.
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.num_process]
            # flush the last chunk of this group
            update_prev()
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no weight is passed to rms_norm)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied over dim 3 of the input.

    cos/sin tables are cached and recomputed only when the sequence length
    (x.shape[1]) differs from the cached one.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # inverse frequencies base^(-j/dim) for j = 0, 2, 4, ... < dim
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as 4D with the sequence on dim 1 and the rotated features on
        # dim 3; the caller in this file passes (B, T, n_head, head_dim).
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # the two halves of the last dim are rotated against each other
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        x: input activations, viewed here as (B, T, dim) with B == 1.
        vi: extra value tensor mixed into v; it must be viewable as v's shape.
        block_mask: FlexAttention block mask restricting which keys each query sees.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # learned mix of this layer's values with the supplied value embedding
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim), hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: learned mix with x0, then pre-norm attention and pre-norm MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initial mix is 1*x + 0*x0, i.e. the layer starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        # x0 is the tensor GPT.forward saves right after normalizing the token embeddings
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Model shape settings consumed by GPT and Block."""
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) via tanh in GPT.forward

class GPT(nn.Module):
    """
    Decoder-only transformer whose layers are arranged as a U-net: the first half of the
    blocks ("encoder") push their outputs on a stack, and the second half ("decoder") pop
    them back as weighted skip connections. forward() returns the cross-entropy loss.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            # (one n_embd-wide slice per encoder layer; forward() splits it with chunk())
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        idx: 1D tensor of token ids (asserted); its length must be a multiple of BLOCK_SIZE,
             because `docs` is reshaped to (-1, BLOCK_SIZE) below.
        target: token ids the logits are scored against; flattened before the loss.
        sliding_window: tensor holding the attention window length in tokens.
        Returns the cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running count of occurrences of token 50256, used as a per-position document id.
        # NOTE(review): 50256 is presumably GPT-2's end-of-text id — confirm against the tokenizer.
        docs = (idx == 50256).cumsum(0)
        # document id at the first / last position of every BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Build the block-level structure by hand: a (q_block, kv_block) pair is kept
            # when it could contain any allowed token pair; the token-level mask_mod above
            # is attached to the resulting BlockMask.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # the two blocks' document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window length rounded up to whole blocks
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort of a 0/1 row lists the kept kv-block indices first, in ascending order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton dims
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding slice per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() returns the most recent encoder output first
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            # (value embeddings are reused in reverse order)
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate a shard's header and return the token count it claims (header[2])."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header: 256 int32 values
        # readinto fills the tensor's memory directly through its numpy view
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (input, target) windows of T tokens from the shard files matching a glob pattern.

    Each rank starts at offset process_rank * T and strides by T * num_processes per
    batch. After the last shard it wraps around to the first.
    """
    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        # (the pattern is resolved relative to the current working directory)
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        # (only the headers are read here; token data is loaded in advance())
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full global batch plus one token
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (x, y): T input ids as int32 and the same window shifted by one as int64, both on CUDA."""
        batch_size = self.T * self.num_processes
        # T + 1 tokens, so inputs and targets can both be sliced from one buffer
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Run configuration; instantiated once below as `args` with these defaults."""
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0 # NOTE(review): not referenced by the code visible in this file
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
# (RANK, LOCAL_RANK and WORLD_SIZE are all read below; a missing one raises KeyError)
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}") # printed by every rank, not only the master
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
# Only the master rank owns a log file; on every other rank `logfile` stays None
# and print0 below is a no-op.
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id # per-run directory path (not created or written to in this section)
    logfile = 'logs/%s.txt' % run_id
    # open(..., "w") does not create missing parent directories, so make sure
    # `logs/` exists; otherwise a fresh checkout fails with FileNotFoundError.
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append `s` plus a newline to the log file, on the master rank only.

    Unless `logonly` is true, `s` is also printed to stdout. Does nothing on
    non-master ranks.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop performs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches each following batch
# right after the backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 13:12:46 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 44C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 75W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 38C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 38C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 44C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 86W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 106W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type 
Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23758ms step_avg:nanms step:2/1480 train_time:23847ms step_avg:nanms step:3/1480 train_time:23984ms step_avg:nanms step:4/1480 train_time:24124ms step_avg:nanms step:5/1480 train_time:24264ms step_avg:nanms step:6/1480 train_time:24404ms step_avg:nanms step:7/1480 train_time:24545ms step_avg:nanms step:8/1480 train_time:24688ms step_avg:nanms step:9/1480 train_time:24836ms step_avg:nanms step:10/1480 train_time:24980ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:424ms step_avg:141.47ms step:14/1480 train_time:565ms step_avg:141.35ms step:15/1480 train_time:708ms step_avg:141.60ms step:16/1480 train_time:853ms step_avg:142.24ms step:17/1480 train_time:998ms step_avg:142.61ms step:18/1480 train_time:1141ms step_avg:142.63ms step:19/1480 train_time:1283ms step_avg:142.51ms step:20/1480 train_time:1423ms step_avg:142.35ms step:21/1480 train_time:1564ms step_avg:142.21ms step:22/1480 train_time:1708ms step_avg:142.31ms step:23/1480 train_time:1851ms step_avg:142.42ms step:24/1480 train_time:1996ms step_avg:142.58ms step:25/1480 train_time:2140ms step_avg:142.69ms step:26/1480 train_time:2282ms step_avg:142.64ms step:27/1480 train_time:2424ms step_avg:142.61ms step:28/1480 train_time:2565ms step_avg:142.47ms step:29/1480 train_time:2706ms 
step_avg:142.41ms step:30/1480 train_time:2849ms step_avg:142.45ms step:31/1480 train_time:2993ms step_avg:142.52ms step:32/1480 train_time:3138ms step_avg:142.62ms step:33/1480 train_time:3280ms step_avg:142.60ms step:34/1480 train_time:3422ms step_avg:142.58ms step:35/1480 train_time:3563ms step_avg:142.51ms step:36/1480 train_time:3703ms step_avg:142.44ms step:37/1480 train_time:3846ms step_avg:142.45ms step:38/1480 train_time:3990ms step_avg:142.50ms step:39/1480 train_time:4134ms step_avg:142.56ms step:40/1480 train_time:4278ms step_avg:142.61ms step:41/1480 train_time:4421ms step_avg:142.62ms step:42/1480 train_time:4563ms step_avg:142.58ms step:43/1480 train_time:4704ms step_avg:142.53ms step:44/1480 train_time:4846ms step_avg:142.53ms step:45/1480 train_time:4989ms step_avg:142.54ms step:46/1480 train_time:5132ms step_avg:142.57ms step:47/1480 train_time:5275ms step_avg:142.57ms step:48/1480 train_time:5418ms step_avg:142.58ms step:49/1480 train_time:5560ms step_avg:142.56ms step:50/1480 train_time:5701ms step_avg:142.53ms step:51/1480 train_time:5843ms step_avg:142.52ms step:52/1480 train_time:5985ms step_avg:142.50ms step:53/1480 train_time:6129ms step_avg:142.52ms step:54/1480 train_time:6271ms step_avg:142.52ms step:55/1480 train_time:6416ms step_avg:142.59ms step:56/1480 train_time:6560ms step_avg:142.60ms step:57/1480 train_time:6701ms step_avg:142.58ms step:58/1480 train_time:6843ms step_avg:142.57ms step:59/1480 train_time:6985ms step_avg:142.55ms step:60/1480 train_time:7126ms step_avg:142.52ms step:61/1480 train_time:7269ms step_avg:142.53ms step:62/1480 train_time:7414ms step_avg:142.57ms step:63/1480 train_time:7558ms step_avg:142.60ms step:64/1480 train_time:7701ms step_avg:142.61ms step:65/1480 train_time:7844ms step_avg:142.62ms step:66/1480 train_time:7986ms step_avg:142.60ms step:67/1480 train_time:8129ms step_avg:142.62ms step:68/1480 train_time:8273ms step_avg:142.63ms step:69/1480 train_time:8418ms step_avg:142.68ms step:70/1480 
train_time:8562ms step_avg:142.69ms step:71/1480 train_time:8703ms step_avg:142.68ms step:72/1480 train_time:8845ms step_avg:142.66ms step:73/1480 train_time:8987ms step_avg:142.65ms step:74/1480 train_time:9128ms step_avg:142.63ms step:75/1480 train_time:9270ms step_avg:142.61ms step:76/1480 train_time:9413ms step_avg:142.63ms step:77/1480 train_time:9558ms step_avg:142.66ms step:78/1480 train_time:9701ms step_avg:142.67ms step:79/1480 train_time:9844ms step_avg:142.67ms step:80/1480 train_time:9986ms step_avg:142.66ms step:81/1480 train_time:10127ms step_avg:142.64ms step:82/1480 train_time:10270ms step_avg:142.64ms step:83/1480 train_time:10414ms step_avg:142.66ms step:84/1480 train_time:10559ms step_avg:142.69ms step:85/1480 train_time:10701ms step_avg:142.68ms step:86/1480 train_time:10843ms step_avg:142.67ms step:87/1480 train_time:10985ms step_avg:142.67ms step:88/1480 train_time:11128ms step_avg:142.66ms step:89/1480 train_time:11269ms step_avg:142.65ms step:90/1480 train_time:11415ms step_avg:142.68ms step:91/1480 train_time:11560ms step_avg:142.71ms step:92/1480 train_time:11703ms step_avg:142.71ms step:93/1480 train_time:11845ms step_avg:142.71ms step:94/1480 train_time:11986ms step_avg:142.69ms step:95/1480 train_time:12128ms step_avg:142.68ms step:96/1480 train_time:12270ms step_avg:142.67ms step:97/1480 train_time:12415ms step_avg:142.70ms step:98/1480 train_time:12558ms step_avg:142.70ms step:99/1480 train_time:12700ms step_avg:142.70ms step:100/1480 train_time:12843ms step_avg:142.70ms step:101/1480 train_time:12984ms step_avg:142.69ms step:102/1480 train_time:13125ms step_avg:142.67ms step:103/1480 train_time:13267ms step_avg:142.65ms step:104/1480 train_time:13410ms step_avg:142.66ms step:105/1480 train_time:13555ms step_avg:142.68ms step:106/1480 train_time:13699ms step_avg:142.70ms step:107/1480 train_time:13841ms step_avg:142.69ms step:108/1480 train_time:13984ms step_avg:142.69ms step:109/1480 train_time:14126ms step_avg:142.69ms step:110/1480 
train_time:14267ms step_avg:142.67ms step:111/1480 train_time:14414ms step_avg:142.71ms step:112/1480 train_time:14563ms step_avg:142.77ms step:113/1480 train_time:14709ms step_avg:142.80ms step:114/1480 train_time:14857ms step_avg:142.85ms step:115/1480 train_time:15003ms step_avg:142.89ms step:116/1480 train_time:15150ms step_avg:142.92ms step:117/1480 train_time:15298ms step_avg:142.97ms step:118/1480 train_time:15445ms step_avg:143.01ms step:119/1480 train_time:15593ms step_avg:143.05ms step:120/1480 train_time:15741ms step_avg:143.10ms step:121/1480 train_time:15888ms step_avg:143.13ms step:122/1480 train_time:16037ms step_avg:143.19ms step:123/1480 train_time:16183ms step_avg:143.21ms step:124/1480 train_time:16330ms step_avg:143.25ms step:125/1480 train_time:16477ms step_avg:143.28ms step:125/1480 val_loss:4.4125 train_time:16535ms step_avg:143.78ms step:126/1480 train_time:16634ms step_avg:143.40ms step:127/1480 train_time:16783ms step_avg:143.45ms step:128/1480 train_time:16930ms step_avg:143.47ms step:129/1480 train_time:17077ms step_avg:143.50ms step:130/1480 train_time:17222ms step_avg:143.51ms step:131/1480 train_time:17368ms step_avg:143.54ms step:132/1480 train_time:17515ms step_avg:143.56ms step:133/1480 train_time:17664ms step_avg:143.61ms step:134/1480 train_time:17812ms step_avg:143.65ms step:135/1480 train_time:17960ms step_avg:143.68ms step:136/1480 train_time:18108ms step_avg:143.71ms step:137/1480 train_time:18255ms step_avg:143.74ms step:138/1480 train_time:18401ms step_avg:143.76ms step:139/1480 train_time:18547ms step_avg:143.77ms step:140/1480 train_time:18695ms step_avg:143.81ms step:141/1480 train_time:18842ms step_avg:143.83ms step:142/1480 train_time:18991ms step_avg:143.87ms step:143/1480 train_time:19138ms step_avg:143.90ms step:144/1480 train_time:19286ms step_avg:143.92ms step:145/1480 train_time:19432ms step_avg:143.94ms step:146/1480 train_time:19579ms step_avg:143.96ms step:147/1480 train_time:19726ms step_avg:143.99ms 
step:148/1480 train_time:19874ms step_avg:144.01ms step:149/1480 train_time:20020ms step_avg:144.03ms step:150/1480 train_time:20168ms step_avg:144.06ms step:151/1480 train_time:20315ms step_avg:144.08ms step:152/1480 train_time:20462ms step_avg:144.10ms step:153/1480 train_time:20609ms step_avg:144.12ms step:154/1480 train_time:20756ms step_avg:144.14ms step:155/1480 train_time:20902ms step_avg:144.15ms step:156/1480 train_time:21049ms step_avg:144.17ms step:157/1480 train_time:21196ms step_avg:144.19ms step:158/1480 train_time:21342ms step_avg:144.20ms step:159/1480 train_time:21489ms step_avg:144.22ms step:160/1480 train_time:21636ms step_avg:144.24ms step:161/1480 train_time:21782ms step_avg:144.25ms step:162/1480 train_time:21930ms step_avg:144.27ms step:163/1480 train_time:22077ms step_avg:144.29ms step:164/1480 train_time:22222ms step_avg:144.30ms step:165/1480 train_time:22369ms step_avg:144.32ms step:166/1480 train_time:22515ms step_avg:144.33ms step:167/1480 train_time:22661ms step_avg:144.34ms step:168/1480 train_time:22808ms step_avg:144.36ms step:169/1480 train_time:22955ms step_avg:144.37ms step:170/1480 train_time:23100ms step_avg:144.38ms step:171/1480 train_time:23248ms step_avg:144.40ms step:172/1480 train_time:23395ms step_avg:144.41ms step:173/1480 train_time:23540ms step_avg:144.42ms step:174/1480 train_time:23687ms step_avg:144.44ms step:175/1480 train_time:23834ms step_avg:144.45ms step:176/1480 train_time:23982ms step_avg:144.47ms step:177/1480 train_time:24129ms step_avg:144.49ms step:178/1480 train_time:24276ms step_avg:144.50ms step:179/1480 train_time:24422ms step_avg:144.51ms step:180/1480 train_time:24569ms step_avg:144.52ms step:181/1480 train_time:24716ms step_avg:144.54ms step:182/1480 train_time:24863ms step_avg:144.55ms step:183/1480 train_time:25010ms step_avg:144.57ms step:184/1480 train_time:25158ms step_avg:144.59ms step:185/1480 train_time:25305ms step_avg:144.60ms step:186/1480 train_time:25452ms step_avg:144.62ms 
step:187/1480 train_time:25598ms step_avg:144.62ms step:188/1480 train_time:25744ms step_avg:144.63ms step:189/1480 train_time:25892ms step_avg:144.65ms step:190/1480 train_time:26038ms step_avg:144.65ms step:191/1480 train_time:26184ms step_avg:144.66ms step:192/1480 train_time:26331ms step_avg:144.68ms step:193/1480 train_time:26478ms step_avg:144.69ms step:194/1480 train_time:26626ms step_avg:144.70ms step:195/1480 train_time:26773ms step_avg:144.72ms step:196/1480 train_time:26919ms step_avg:144.73ms step:197/1480 train_time:27066ms step_avg:144.74ms step:198/1480 train_time:27213ms step_avg:144.75ms step:199/1480 train_time:27358ms step_avg:144.75ms step:200/1480 train_time:27504ms step_avg:144.76ms step:201/1480 train_time:27651ms step_avg:144.77ms step:202/1480 train_time:27798ms step_avg:144.78ms step:203/1480 train_time:27945ms step_avg:144.80ms step:204/1480 train_time:28093ms step_avg:144.81ms step:205/1480 train_time:28238ms step_avg:144.81ms step:206/1480 train_time:28386ms step_avg:144.82ms step:207/1480 train_time:28533ms step_avg:144.84ms step:208/1480 train_time:28679ms step_avg:144.84ms step:209/1480 train_time:28826ms step_avg:144.85ms step:210/1480 train_time:28973ms step_avg:144.87ms step:211/1480 train_time:29119ms step_avg:144.87ms step:212/1480 train_time:29264ms step_avg:144.87ms step:213/1480 train_time:29410ms step_avg:144.88ms step:214/1480 train_time:29557ms step_avg:144.89ms step:215/1480 train_time:29704ms step_avg:144.90ms step:216/1480 train_time:29852ms step_avg:144.91ms step:217/1480 train_time:29998ms step_avg:144.92ms step:218/1480 train_time:30145ms step_avg:144.93ms step:219/1480 train_time:30293ms step_avg:144.94ms step:220/1480 train_time:30440ms step_avg:144.95ms step:221/1480 train_time:30590ms step_avg:144.98ms step:222/1480 train_time:30739ms step_avg:145.00ms step:223/1480 train_time:30890ms step_avg:145.02ms step:224/1480 train_time:31040ms step_avg:145.05ms step:225/1480 train_time:31191ms step_avg:145.08ms 
step:226/1480 train_time:31340ms step_avg:145.09ms step:227/1480 train_time:31490ms step_avg:145.12ms step:228/1480 train_time:31640ms step_avg:145.14ms step:229/1480 train_time:31791ms step_avg:145.17ms step:230/1480 train_time:31941ms step_avg:145.19ms step:231/1480 train_time:32093ms step_avg:145.22ms step:232/1480 train_time:32243ms step_avg:145.24ms step:233/1480 train_time:32393ms step_avg:145.26ms step:234/1480 train_time:32544ms step_avg:145.29ms step:235/1480 train_time:32696ms step_avg:145.31ms step:236/1480 train_time:32846ms step_avg:145.34ms step:237/1480 train_time:32999ms step_avg:145.37ms step:238/1480 train_time:33148ms step_avg:145.38ms step:239/1480 train_time:33298ms step_avg:145.41ms step:240/1480 train_time:33448ms step_avg:145.43ms step:241/1480 train_time:33598ms step_avg:145.45ms step:242/1480 train_time:33749ms step_avg:145.47ms step:243/1480 train_time:33899ms step_avg:145.49ms step:244/1480 train_time:34051ms step_avg:145.52ms step:245/1480 train_time:34200ms step_avg:145.53ms step:246/1480 train_time:34352ms step_avg:145.56ms step:247/1480 train_time:34502ms step_avg:145.58ms step:248/1480 train_time:34654ms step_avg:145.60ms step:249/1480 train_time:34803ms step_avg:145.62ms step:250/1480 train_time:34954ms step_avg:145.64ms step:250/1480 val_loss:3.9997 train_time:35013ms step_avg:145.89ms step:251/1480 train_time:35112ms step_avg:145.69ms step:252/1480 train_time:35266ms step_avg:145.73ms step:253/1480 train_time:35415ms step_avg:145.74ms step:254/1480 train_time:35563ms step_avg:145.75ms step:255/1480 train_time:35712ms step_avg:145.76ms step:256/1480 train_time:35860ms step_avg:145.77ms step:257/1480 train_time:36010ms step_avg:145.79ms step:258/1480 train_time:36162ms step_avg:145.81ms step:259/1480 train_time:36316ms step_avg:145.85ms step:260/1480 train_time:36465ms step_avg:145.86ms step:261/1480 train_time:36615ms step_avg:145.87ms step:262/1480 train_time:36764ms step_avg:145.89ms step:263/1480 train_time:36913ms 
step_avg:145.90ms step:264/1480 train_time:37064ms step_avg:145.92ms step:265/1480 train_time:37216ms step_avg:145.94ms step:266/1480 train_time:37366ms step_avg:145.96ms step:267/1480 train_time:37516ms step_avg:145.98ms step:268/1480 train_time:37667ms step_avg:145.99ms step:269/1480 train_time:37817ms step_avg:146.01ms step:270/1480 train_time:37967ms step_avg:146.03ms step:271/1480 train_time:38117ms step_avg:146.04ms step:272/1480 train_time:38269ms step_avg:146.06ms step:273/1480 train_time:38419ms step_avg:146.08ms step:274/1480 train_time:38570ms step_avg:146.10ms step:275/1480 train_time:38720ms step_avg:146.11ms step:276/1480 train_time:38872ms step_avg:146.13ms step:277/1480 train_time:39021ms step_avg:146.15ms step:278/1480 train_time:39172ms step_avg:146.16ms step:279/1480 train_time:39323ms step_avg:146.18ms step:280/1480 train_time:39473ms step_avg:146.20ms step:281/1480 train_time:39623ms step_avg:146.21ms step:282/1480 train_time:39774ms step_avg:146.23ms step:283/1480 train_time:39924ms step_avg:146.24ms step:284/1480 train_time:40075ms step_avg:146.26ms step:285/1480 train_time:40225ms step_avg:146.27ms step:286/1480 train_time:40377ms step_avg:146.29ms step:287/1480 train_time:40528ms step_avg:146.31ms step:288/1480 train_time:40678ms step_avg:146.33ms step:289/1480 train_time:40829ms step_avg:146.34ms step:290/1480 train_time:40979ms step_avg:146.35ms step:291/1480 train_time:41130ms step_avg:146.37ms step:292/1480 train_time:41281ms step_avg:146.39ms step:293/1480 train_time:41431ms step_avg:146.40ms step:294/1480 train_time:41582ms step_avg:146.42ms step:295/1480 train_time:41733ms step_avg:146.43ms step:296/1480 train_time:41885ms step_avg:146.45ms step:297/1480 train_time:42036ms step_avg:146.47ms step:298/1480 train_time:42186ms step_avg:146.48ms step:299/1480 train_time:42336ms step_avg:146.49ms step:300/1480 train_time:42488ms step_avg:146.51ms step:301/1480 train_time:42637ms step_avg:146.52ms step:302/1480 train_time:42788ms 
step_avg:146.53ms step:303/1480 train_time:42938ms step_avg:146.55ms step:304/1480 train_time:43089ms step_avg:146.56ms step:305/1480 train_time:43239ms step_avg:146.57ms step:306/1480 train_time:43389ms step_avg:146.59ms step:307/1480 train_time:43539ms step_avg:146.60ms step:308/1480 train_time:43689ms step_avg:146.61ms step:309/1480 train_time:43840ms step_avg:146.62ms step:310/1480 train_time:43991ms step_avg:146.64ms step:311/1480 train_time:44140ms step_avg:146.65ms step:312/1480 train_time:44291ms step_avg:146.66ms step:313/1480 train_time:44441ms step_avg:146.67ms step:314/1480 train_time:44591ms step_avg:146.68ms step:315/1480 train_time:44741ms step_avg:146.69ms step:316/1480 train_time:44892ms step_avg:146.71ms step:317/1480 train_time:45043ms step_avg:146.72ms step:318/1480 train_time:45193ms step_avg:146.73ms step:319/1480 train_time:45343ms step_avg:146.74ms step:320/1480 train_time:45493ms step_avg:146.75ms step:321/1480 train_time:45642ms step_avg:146.76ms step:322/1480 train_time:45793ms step_avg:146.77ms step:323/1480 train_time:45944ms step_avg:146.78ms step:324/1480 train_time:46094ms step_avg:146.80ms step:325/1480 train_time:46244ms step_avg:146.81ms step:326/1480 train_time:46394ms step_avg:146.82ms step:327/1480 train_time:46544ms step_avg:146.83ms step:328/1480 train_time:46694ms step_avg:146.84ms step:329/1480 train_time:46845ms step_avg:146.85ms step:330/1480 train_time:46998ms step_avg:146.87ms step:331/1480 train_time:47151ms step_avg:146.89ms step:332/1480 train_time:47307ms step_avg:146.92ms step:333/1480 train_time:47460ms step_avg:146.94ms step:334/1480 train_time:47614ms step_avg:146.96ms step:335/1480 train_time:47768ms step_avg:146.98ms step:336/1480 train_time:47920ms step_avg:146.99ms step:337/1480 train_time:48074ms step_avg:147.02ms step:338/1480 train_time:48228ms step_avg:147.04ms step:339/1480 train_time:48382ms step_avg:147.06ms step:340/1480 train_time:48536ms step_avg:147.08ms step:341/1480 train_time:48689ms 
step_avg:147.10ms step:342/1480 train_time:48842ms step_avg:147.12ms step:343/1480 train_time:48996ms step_avg:147.14ms step:344/1480 train_time:49150ms step_avg:147.16ms step:345/1480 train_time:49306ms step_avg:147.18ms step:346/1480 train_time:49460ms step_avg:147.20ms step:347/1480 train_time:49614ms step_avg:147.22ms step:348/1480 train_time:49768ms step_avg:147.24ms step:349/1480 train_time:49921ms step_avg:147.26ms step:350/1480 train_time:50075ms step_avg:147.28ms step:351/1480 train_time:50229ms step_avg:147.30ms step:352/1480 train_time:50384ms step_avg:147.32ms step:353/1480 train_time:50537ms step_avg:147.34ms step:354/1480 train_time:50690ms step_avg:147.35ms step:355/1480 train_time:50842ms step_avg:147.37ms step:356/1480 train_time:50997ms step_avg:147.39ms step:357/1480 train_time:51151ms step_avg:147.41ms step:358/1480 train_time:51306ms step_avg:147.43ms step:359/1480 train_time:51461ms step_avg:147.45ms step:360/1480 train_time:51616ms step_avg:147.47ms step:361/1480 train_time:51771ms step_avg:147.50ms step:362/1480 train_time:51926ms step_avg:147.52ms step:363/1480 train_time:52079ms step_avg:147.53ms step:364/1480 train_time:52233ms step_avg:147.55ms step:365/1480 train_time:52389ms step_avg:147.57ms step:366/1480 train_time:52542ms step_avg:147.59ms step:367/1480 train_time:52696ms step_avg:147.61ms step:368/1480 train_time:52849ms step_avg:147.62ms step:369/1480 train_time:53001ms step_avg:147.63ms step:370/1480 train_time:53154ms step_avg:147.65ms step:371/1480 train_time:53309ms step_avg:147.67ms step:372/1480 train_time:53463ms step_avg:147.69ms step:373/1480 train_time:53616ms step_avg:147.70ms step:374/1480 train_time:53769ms step_avg:147.72ms step:375/1480 train_time:53922ms step_avg:147.73ms step:375/1480 val_loss:3.8118 train_time:53983ms step_avg:147.90ms step:376/1480 train_time:54081ms step_avg:147.76ms step:377/1480 train_time:54235ms step_avg:147.78ms step:378/1480 train_time:54388ms step_avg:147.79ms step:379/1480 
train_time:54541ms step_avg:147.81ms step:380/1480 train_time:54693ms step_avg:147.82ms step:381/1480 train_time:54846ms step_avg:147.83ms step:382/1480 train_time:55000ms step_avg:147.85ms step:383/1480 train_time:55154ms step_avg:147.87ms step:384/1480 train_time:55308ms step_avg:147.88ms step:385/1480 train_time:55462ms step_avg:147.90ms step:386/1480 train_time:55615ms step_avg:147.91ms step:387/1480 train_time:55769ms step_avg:147.93ms step:388/1480 train_time:55923ms step_avg:147.94ms step:389/1480 train_time:56076ms step_avg:147.96ms step:390/1480 train_time:56230ms step_avg:147.97ms step:391/1480 train_time:56384ms step_avg:147.99ms step:392/1480 train_time:56537ms step_avg:148.00ms step:393/1480 train_time:56690ms step_avg:148.02ms step:394/1480 train_time:56843ms step_avg:148.03ms step:395/1480 train_time:56997ms step_avg:148.04ms step:396/1480 train_time:57152ms step_avg:148.06ms step:397/1480 train_time:57306ms step_avg:148.08ms step:398/1480 train_time:57460ms step_avg:148.09ms step:399/1480 train_time:57613ms step_avg:148.11ms step:400/1480 train_time:57768ms step_avg:148.12ms step:401/1480 train_time:57921ms step_avg:148.13ms step:402/1480 train_time:58074ms step_avg:148.15ms step:403/1480 train_time:58228ms step_avg:148.16ms step:404/1480 train_time:58383ms step_avg:148.18ms step:405/1480 train_time:58537ms step_avg:148.19ms step:406/1480 train_time:58690ms step_avg:148.21ms step:407/1480 train_time:58843ms step_avg:148.22ms step:408/1480 train_time:58997ms step_avg:148.23ms step:409/1480 train_time:59149ms step_avg:148.24ms step:410/1480 train_time:59303ms step_avg:148.26ms step:411/1480 train_time:59457ms step_avg:148.27ms step:412/1480 train_time:59611ms step_avg:148.29ms step:413/1480 train_time:59764ms step_avg:148.30ms step:414/1480 train_time:59919ms step_avg:148.31ms step:415/1480 train_time:60072ms step_avg:148.33ms step:416/1480 train_time:60225ms step_avg:148.34ms step:417/1480 train_time:60379ms step_avg:148.35ms step:418/1480 
train_time:60533ms step_avg:148.36ms step:419/1480 train_time:60686ms step_avg:148.38ms step:420/1480 train_time:60840ms step_avg:148.39ms step:421/1480 train_time:60993ms step_avg:148.40ms step:422/1480 train_time:61146ms step_avg:148.41ms step:423/1480 train_time:61301ms step_avg:148.43ms step:424/1480 train_time:61454ms step_avg:148.44ms step:425/1480 train_time:61609ms step_avg:148.46ms step:426/1480 train_time:61763ms step_avg:148.47ms step:427/1480 train_time:61916ms step_avg:148.48ms step:428/1480 train_time:62071ms step_avg:148.49ms step:429/1480 train_time:62224ms step_avg:148.51ms step:430/1480 train_time:62378ms step_avg:148.52ms step:431/1480 train_time:62531ms step_avg:148.53ms step:432/1480 train_time:62685ms step_avg:148.54ms step:433/1480 train_time:62839ms step_avg:148.56ms step:434/1480 train_time:62992ms step_avg:148.57ms step:435/1480 train_time:63145ms step_avg:148.58ms step:436/1480 train_time:63301ms step_avg:148.59ms step:437/1480 train_time:63454ms step_avg:148.60ms step:438/1480 train_time:63607ms step_avg:148.62ms step:439/1480 train_time:63762ms step_avg:148.63ms step:440/1480 train_time:63916ms step_avg:148.64ms step:441/1480 train_time:64073ms step_avg:148.66ms step:442/1480 train_time:64229ms step_avg:148.68ms step:443/1480 train_time:64388ms step_avg:148.70ms step:444/1480 train_time:64545ms step_avg:148.72ms step:445/1480 train_time:64701ms step_avg:148.74ms step:446/1480 train_time:64857ms step_avg:148.75ms step:447/1480 train_time:65012ms step_avg:148.77ms step:448/1480 train_time:65168ms step_avg:148.79ms step:449/1480 train_time:65328ms step_avg:148.81ms step:450/1480 train_time:65487ms step_avg:148.83ms step:451/1480 train_time:65646ms step_avg:148.86ms step:452/1480 train_time:65803ms step_avg:148.88ms step:453/1480 train_time:65959ms step_avg:148.89ms step:454/1480 train_time:66114ms step_avg:148.91ms step:455/1480 train_time:66270ms step_avg:148.92ms step:456/1480 train_time:66426ms step_avg:148.94ms step:457/1480 
train_time:66585ms step_avg:148.96ms step:458/1480 train_time:66743ms step_avg:148.98ms step:459/1480 train_time:66900ms step_avg:149.00ms step:460/1480 train_time:67057ms step_avg:149.01ms step:461/1480 train_time:67214ms step_avg:149.03ms step:462/1480 train_time:67371ms step_avg:149.05ms step:463/1480 train_time:67528ms step_avg:149.07ms step:464/1480 train_time:67686ms step_avg:149.09ms step:465/1480 train_time:67843ms step_avg:149.10ms step:466/1480 train_time:68000ms step_avg:149.12ms step:467/1480 train_time:68156ms step_avg:149.14ms step:468/1480 train_time:68312ms step_avg:149.15ms step:469/1480 train_time:68468ms step_avg:149.17ms step:470/1480 train_time:68626ms step_avg:149.19ms step:471/1480 train_time:68782ms step_avg:149.20ms step:472/1480 train_time:68940ms step_avg:149.22ms step:473/1480 train_time:69095ms step_avg:149.23ms step:474/1480 train_time:69251ms step_avg:149.25ms step:475/1480 train_time:69408ms step_avg:149.26ms step:476/1480 train_time:69565ms step_avg:149.28ms step:477/1480 train_time:69722ms step_avg:149.30ms step:478/1480 train_time:69877ms step_avg:149.31ms step:479/1480 train_time:70033ms step_avg:149.32ms step:480/1480 train_time:70191ms step_avg:149.34ms step:481/1480 train_time:70348ms step_avg:149.36ms step:482/1480 train_time:70507ms step_avg:149.38ms step:483/1480 train_time:70664ms step_avg:149.40ms step:484/1480 train_time:70821ms step_avg:149.41ms step:485/1480 train_time:70979ms step_avg:149.43ms step:486/1480 train_time:71135ms step_avg:149.44ms step:487/1480 train_time:71292ms step_avg:149.46ms step:488/1480 train_time:71448ms step_avg:149.47ms step:489/1480 train_time:71606ms step_avg:149.49ms step:490/1480 train_time:71762ms step_avg:149.50ms step:491/1480 train_time:71919ms step_avg:149.52ms step:492/1480 train_time:72076ms step_avg:149.53ms step:493/1480 train_time:72232ms step_avg:149.55ms step:494/1480 train_time:72390ms step_avg:149.57ms step:495/1480 train_time:72548ms step_avg:149.58ms step:496/1480 
train_time:72706ms step_avg:149.60ms step:497/1480 train_time:72863ms step_avg:149.62ms step:498/1480 train_time:73020ms step_avg:149.63ms step:499/1480 train_time:73176ms step_avg:149.64ms step:500/1480 train_time:73333ms step_avg:149.66ms step:500/1480 val_loss:3.6901 train_time:73396ms step_avg:149.79ms step:501/1480 train_time:73493ms step_avg:149.68ms step:502/1480 train_time:73651ms step_avg:149.70ms step:503/1480 train_time:73808ms step_avg:149.71ms step:504/1480 train_time:73965ms step_avg:149.73ms step:505/1480 train_time:74120ms step_avg:149.74ms step:506/1480 train_time:74276ms step_avg:149.75ms step:507/1480 train_time:74432ms step_avg:149.76ms step:508/1480 train_time:74591ms step_avg:149.78ms step:509/1480 train_time:74748ms step_avg:149.80ms step:510/1480 train_time:74905ms step_avg:149.81ms step:511/1480 train_time:75062ms step_avg:149.82ms step:512/1480 train_time:75218ms step_avg:149.84ms step:513/1480 train_time:75374ms step_avg:149.85ms step:514/1480 train_time:75530ms step_avg:149.86ms step:515/1480 train_time:75689ms step_avg:149.88ms step:516/1480 train_time:75849ms step_avg:149.90ms step:517/1480 train_time:76007ms step_avg:149.92ms step:518/1480 train_time:76165ms step_avg:149.93ms step:519/1480 train_time:76322ms step_avg:149.95ms step:520/1480 train_time:76480ms step_avg:149.96ms step:521/1480 train_time:76635ms step_avg:149.97ms step:522/1480 train_time:76792ms step_avg:149.98ms step:523/1480 train_time:76949ms step_avg:150.00ms step:524/1480 train_time:77107ms step_avg:150.01ms step:525/1480 train_time:77265ms step_avg:150.03ms step:526/1480 train_time:77421ms step_avg:150.04ms step:527/1480 train_time:77576ms step_avg:150.05ms step:528/1480 train_time:77732ms step_avg:150.06ms step:529/1480 train_time:77889ms step_avg:150.08ms step:530/1480 train_time:78048ms step_avg:150.09ms step:531/1480 train_time:78204ms step_avg:150.10ms step:532/1480 train_time:78361ms step_avg:150.12ms step:533/1480 train_time:78516ms step_avg:150.13ms 
step:534/1480 train_time:78673ms step_avg:150.14ms step:535/1480 train_time:78829ms step_avg:150.15ms step:536/1480 train_time:78988ms step_avg:150.17ms step:537/1480 train_time:79144ms step_avg:150.18ms step:538/1480 train_time:79300ms step_avg:150.19ms step:539/1480 train_time:79458ms step_avg:150.20ms step:540/1480 train_time:79615ms step_avg:150.22ms step:541/1480 train_time:79770ms step_avg:150.23ms step:542/1480 train_time:79927ms step_avg:150.24ms step:543/1480 train_time:80085ms step_avg:150.25ms step:544/1480 train_time:80240ms step_avg:150.26ms step:545/1480 train_time:80394ms step_avg:150.27ms step:546/1480 train_time:80553ms step_avg:150.29ms step:547/1480 train_time:80709ms step_avg:150.30ms step:548/1480 train_time:80868ms step_avg:150.31ms step:549/1480 train_time:81025ms step_avg:150.32ms step:550/1480 train_time:81184ms step_avg:150.34ms step:551/1480 train_time:81342ms step_avg:150.35ms step:552/1480 train_time:81500ms step_avg:150.37ms step:553/1480 train_time:81661ms step_avg:150.39ms step:554/1480 train_time:81820ms step_avg:150.41ms step:555/1480 train_time:81979ms step_avg:150.42ms step:556/1480 train_time:82136ms step_avg:150.43ms step:557/1480 train_time:82294ms step_avg:150.45ms step:558/1480 train_time:82453ms step_avg:150.46ms step:559/1480 train_time:82611ms step_avg:150.48ms step:560/1480 train_time:82771ms step_avg:150.49ms step:561/1480 train_time:82930ms step_avg:150.51ms step:562/1480 train_time:83091ms step_avg:150.53ms step:563/1480 train_time:83249ms step_avg:150.54ms step:564/1480 train_time:83409ms step_avg:150.56ms step:565/1480 train_time:83569ms step_avg:150.57ms step:566/1480 train_time:83729ms step_avg:150.59ms step:567/1480 train_time:83890ms step_avg:150.61ms step:568/1480 train_time:84048ms step_avg:150.62ms step:569/1480 train_time:84207ms step_avg:150.64ms step:570/1480 train_time:84366ms step_avg:150.65ms step:571/1480 train_time:84526ms step_avg:150.67ms step:572/1480 train_time:84686ms step_avg:150.69ms 
step:573/1480 train_time:84847ms step_avg:150.71ms step:574/1480 train_time:85009ms step_avg:150.72ms step:575/1480 train_time:85171ms step_avg:150.74ms step:576/1480 train_time:85330ms step_avg:150.76ms step:577/1480 train_time:85491ms step_avg:150.78ms step:578/1480 train_time:85650ms step_avg:150.79ms step:579/1480 train_time:85809ms step_avg:150.81ms step:580/1480 train_time:85969ms step_avg:150.82ms step:581/1480 train_time:86130ms step_avg:150.84ms step:582/1480 train_time:86290ms step_avg:150.86ms step:583/1480 train_time:86450ms step_avg:150.87ms step:584/1480 train_time:86609ms step_avg:150.89ms step:585/1480 train_time:86769ms step_avg:150.90ms step:586/1480 train_time:86928ms step_avg:150.92ms step:587/1480 train_time:87088ms step_avg:150.93ms step:588/1480 train_time:87247ms step_avg:150.95ms step:589/1480 train_time:87408ms step_avg:150.96ms step:590/1480 train_time:87569ms step_avg:150.98ms step:591/1480 train_time:87728ms step_avg:150.99ms step:592/1480 train_time:87888ms step_avg:151.01ms step:593/1480 train_time:88048ms step_avg:151.03ms step:594/1480 train_time:88209ms step_avg:151.04ms step:595/1480 train_time:88371ms step_avg:151.06ms step:596/1480 train_time:88532ms step_avg:151.08ms step:597/1480 train_time:88692ms step_avg:151.09ms step:598/1480 train_time:88850ms step_avg:151.11ms step:599/1480 train_time:89008ms step_avg:151.12ms step:600/1480 train_time:89168ms step_avg:151.13ms step:601/1480 train_time:89327ms step_avg:151.15ms step:602/1480 train_time:89486ms step_avg:151.16ms step:603/1480 train_time:89647ms step_avg:151.18ms step:604/1480 train_time:89806ms step_avg:151.19ms step:605/1480 train_time:89964ms step_avg:151.20ms step:606/1480 train_time:90125ms step_avg:151.22ms step:607/1480 train_time:90288ms step_avg:151.24ms step:608/1480 train_time:90447ms step_avg:151.25ms step:609/1480 train_time:90606ms step_avg:151.26ms step:610/1480 train_time:90765ms step_avg:151.27ms step:611/1480 train_time:90925ms step_avg:151.29ms 
step:612/1480 train_time:91085ms step_avg:151.30ms step:613/1480 train_time:91246ms step_avg:151.32ms step:614/1480 train_time:91406ms step_avg:151.33ms step:615/1480 train_time:91564ms step_avg:151.35ms step:616/1480 train_time:91721ms step_avg:151.36ms step:617/1480 train_time:91881ms step_avg:151.37ms step:618/1480 train_time:92039ms step_avg:151.38ms step:619/1480 train_time:92196ms step_avg:151.39ms step:620/1480 train_time:92356ms step_avg:151.40ms step:621/1480 train_time:92515ms step_avg:151.42ms step:622/1480 train_time:92675ms step_avg:151.43ms step:623/1480 train_time:92834ms step_avg:151.44ms step:624/1480 train_time:92994ms step_avg:151.46ms step:625/1480 train_time:93153ms step_avg:151.47ms step:625/1480 val_loss:3.6104 train_time:93217ms step_avg:151.57ms step:626/1480 train_time:93317ms step_avg:151.49ms step:627/1480 train_time:93477ms step_avg:151.50ms step:628/1480 train_time:93637ms step_avg:151.52ms step:629/1480 train_time:93795ms step_avg:151.53ms step:630/1480 train_time:93953ms step_avg:151.54ms step:631/1480 train_time:94110ms step_avg:151.55ms step:632/1480 train_time:94267ms step_avg:151.56ms step:633/1480 train_time:94427ms step_avg:151.57ms step:634/1480 train_time:94587ms step_avg:151.58ms step:635/1480 train_time:94745ms step_avg:151.59ms step:636/1480 train_time:94902ms step_avg:151.60ms step:637/1480 train_time:95063ms step_avg:151.62ms step:638/1480 train_time:95221ms step_avg:151.63ms step:639/1480 train_time:95380ms step_avg:151.64ms step:640/1480 train_time:95540ms step_avg:151.65ms step:641/1480 train_time:95699ms step_avg:151.66ms step:642/1480 train_time:95860ms step_avg:151.68ms step:643/1480 train_time:96020ms step_avg:151.69ms step:644/1480 train_time:96179ms step_avg:151.70ms step:645/1480 train_time:96338ms step_avg:151.71ms step:646/1480 train_time:96498ms step_avg:151.73ms step:647/1480 train_time:96659ms step_avg:151.74ms step:648/1480 train_time:96820ms step_avg:151.76ms step:649/1480 train_time:96980ms 
step_avg:151.77ms step:650/1480 train_time:97140ms step_avg:151.78ms step:651/1480 train_time:97300ms step_avg:151.79ms step:652/1480 train_time:97461ms step_avg:151.81ms step:653/1480 train_time:97620ms step_avg:151.82ms step:654/1480 train_time:97781ms step_avg:151.83ms step:655/1480 train_time:97941ms step_avg:151.85ms step:656/1480 train_time:98100ms step_avg:151.86ms step:657/1480 train_time:98261ms step_avg:151.87ms step:658/1480 train_time:98421ms step_avg:151.88ms step:659/1480 train_time:98583ms step_avg:151.90ms step:660/1480 train_time:98745ms step_avg:151.92ms step:661/1480 train_time:98907ms step_avg:151.93ms step:662/1480 train_time:99067ms step_avg:151.94ms step:663/1480 train_time:99228ms step_avg:151.96ms step:664/1480 train_time:99390ms step_avg:151.97ms step:665/1480 train_time:99551ms step_avg:151.99ms step:666/1480 train_time:99711ms step_avg:152.00ms step:667/1480 train_time:99872ms step_avg:152.01ms step:668/1480 train_time:100034ms step_avg:152.03ms step:669/1480 train_time:100195ms step_avg:152.04ms step:670/1480 train_time:100356ms step_avg:152.06ms step:671/1480 train_time:100519ms step_avg:152.07ms step:672/1480 train_time:100681ms step_avg:152.09ms step:673/1480 train_time:100844ms step_avg:152.10ms step:674/1480 train_time:101004ms step_avg:152.11ms step:675/1480 train_time:101166ms step_avg:152.13ms step:676/1480 train_time:101328ms step_avg:152.14ms step:677/1480 train_time:101488ms step_avg:152.16ms step:678/1480 train_time:101649ms step_avg:152.17ms step:679/1480 train_time:101811ms step_avg:152.18ms step:680/1480 train_time:101972ms step_avg:152.20ms step:681/1480 train_time:102132ms step_avg:152.21ms step:682/1480 train_time:102296ms step_avg:152.23ms step:683/1480 train_time:102459ms step_avg:152.24ms step:684/1480 train_time:102621ms step_avg:152.26ms step:685/1480 train_time:102785ms step_avg:152.27ms step:686/1480 train_time:102946ms step_avg:152.29ms step:687/1480 train_time:103105ms step_avg:152.30ms step:688/1480 
train_time:103267ms step_avg:152.31ms step:689/1480 train_time:103429ms step_avg:152.33ms step:690/1480 train_time:103595ms step_avg:152.35ms step:691/1480 train_time:103757ms step_avg:152.36ms step:692/1480 train_time:103919ms step_avg:152.37ms step:693/1480 train_time:104081ms step_avg:152.39ms step:694/1480 train_time:104242ms step_avg:152.40ms step:695/1480 train_time:104402ms step_avg:152.41ms step:696/1480 train_time:104563ms step_avg:152.42ms step:697/1480 train_time:104727ms step_avg:152.44ms step:698/1480 train_time:104886ms step_avg:152.45ms step:699/1480 train_time:105048ms step_avg:152.46ms step:700/1480 train_time:105209ms step_avg:152.48ms step:701/1480 train_time:105369ms step_avg:152.49ms step:702/1480 train_time:105530ms step_avg:152.50ms step:703/1480 train_time:105691ms step_avg:152.51ms step:704/1480 train_time:105853ms step_avg:152.53ms step:705/1480 train_time:106016ms step_avg:152.54ms step:706/1480 train_time:106181ms step_avg:152.56ms step:707/1480 train_time:106342ms step_avg:152.57ms step:708/1480 train_time:106503ms step_avg:152.58ms step:709/1480 train_time:106665ms step_avg:152.60ms step:710/1480 train_time:106826ms step_avg:152.61ms step:711/1480 train_time:106987ms step_avg:152.62ms step:712/1480 train_time:107154ms step_avg:152.64ms step:713/1480 train_time:107318ms step_avg:152.66ms step:714/1480 train_time:107479ms step_avg:152.67ms step:715/1480 train_time:107641ms step_avg:152.68ms step:716/1480 train_time:107801ms step_avg:152.69ms step:717/1480 train_time:107964ms step_avg:152.71ms step:718/1480 train_time:108123ms step_avg:152.72ms step:719/1480 train_time:108283ms step_avg:152.73ms step:720/1480 train_time:108447ms step_avg:152.74ms step:721/1480 train_time:108607ms step_avg:152.75ms step:722/1480 train_time:108768ms step_avg:152.76ms step:723/1480 train_time:108928ms step_avg:152.77ms step:724/1480 train_time:109089ms step_avg:152.79ms step:725/1480 train_time:109251ms step_avg:152.80ms step:726/1480 train_time:109414ms 
step_avg:152.81ms step:727/1480 train_time:109578ms step_avg:152.83ms step:728/1480 train_time:109739ms step_avg:152.84ms step:729/1480 train_time:109901ms step_avg:152.85ms step:730/1480 train_time:110065ms step_avg:152.87ms step:731/1480 train_time:110225ms step_avg:152.88ms step:732/1480 train_time:110384ms step_avg:152.89ms step:733/1480 train_time:110545ms step_avg:152.90ms step:734/1480 train_time:110707ms step_avg:152.91ms step:735/1480 train_time:110867ms step_avg:152.92ms step:736/1480 train_time:111029ms step_avg:152.93ms step:737/1480 train_time:111189ms step_avg:152.94ms step:738/1480 train_time:111349ms step_avg:152.95ms step:739/1480 train_time:111508ms step_avg:152.96ms step:740/1480 train_time:111672ms step_avg:152.98ms step:741/1480 train_time:111836ms step_avg:152.99ms step:742/1480 train_time:111999ms step_avg:153.00ms step:743/1480 train_time:112162ms step_avg:153.02ms step:744/1480 train_time:112325ms step_avg:153.03ms step:745/1480 train_time:112488ms step_avg:153.04ms step:746/1480 train_time:112647ms step_avg:153.05ms step:747/1480 train_time:112810ms step_avg:153.07ms step:748/1480 train_time:112975ms step_avg:153.08ms step:749/1480 train_time:113139ms step_avg:153.10ms step:750/1480 train_time:113299ms step_avg:153.11ms step:750/1480 val_loss:3.5540 train_time:113365ms step_avg:153.20ms step:751/1480 train_time:113465ms step_avg:153.12ms step:752/1480 train_time:113626ms step_avg:153.13ms step:753/1480 train_time:113786ms step_avg:153.14ms step:754/1480 train_time:113947ms step_avg:153.15ms step:755/1480 train_time:114109ms step_avg:153.17ms step:756/1480 train_time:114270ms step_avg:153.18ms step:757/1480 train_time:114432ms step_avg:153.19ms step:758/1480 train_time:114594ms step_avg:153.20ms step:759/1480 train_time:114757ms step_avg:153.21ms step:760/1480 train_time:114919ms step_avg:153.22ms step:761/1480 train_time:115082ms step_avg:153.24ms step:762/1480 train_time:115242ms step_avg:153.25ms step:763/1480 train_time:115402ms 
step_avg:153.26ms step:764/1480 train_time:115563ms step_avg:153.27ms step:765/1480 train_time:115723ms step_avg:153.28ms step:766/1480 train_time:115886ms step_avg:153.29ms step:767/1480 train_time:116048ms step_avg:153.30ms step:768/1480 train_time:116210ms step_avg:153.31ms step:769/1480 train_time:116375ms step_avg:153.33ms step:770/1480 train_time:116538ms step_avg:153.34ms step:771/1480 train_time:116701ms step_avg:153.35ms step:772/1480 train_time:116862ms step_avg:153.36ms step:773/1480 train_time:117024ms step_avg:153.37ms step:774/1480 train_time:117185ms step_avg:153.38ms step:775/1480 train_time:117347ms step_avg:153.39ms step:776/1480 train_time:117513ms step_avg:153.41ms step:777/1480 train_time:117680ms step_avg:153.43ms step:778/1480 train_time:117842ms step_avg:153.44ms step:779/1480 train_time:118004ms step_avg:153.45ms step:780/1480 train_time:118169ms step_avg:153.47ms step:781/1480 train_time:118332ms step_avg:153.48ms step:782/1480 train_time:118497ms step_avg:153.49ms step:783/1480 train_time:118659ms step_avg:153.50ms step:784/1480 train_time:118822ms step_avg:153.52ms step:785/1480 train_time:118983ms step_avg:153.53ms step:786/1480 train_time:119148ms step_avg:153.54ms step:787/1480 train_time:119312ms step_avg:153.55ms step:788/1480 train_time:119477ms step_avg:153.57ms step:789/1480 train_time:119641ms step_avg:153.58ms step:790/1480 train_time:119806ms step_avg:153.60ms step:791/1480 train_time:119975ms step_avg:153.62ms step:792/1480 train_time:120139ms step_avg:153.63ms step:793/1480 train_time:120300ms step_avg:153.64ms step:794/1480 train_time:120464ms step_avg:153.65ms step:795/1480 train_time:120628ms step_avg:153.67ms step:796/1480 train_time:120796ms step_avg:153.68ms step:797/1480 train_time:120960ms step_avg:153.70ms step:798/1480 train_time:121123ms step_avg:153.71ms step:799/1480 train_time:121290ms step_avg:153.73ms step:800/1480 train_time:121453ms step_avg:153.74ms step:801/1480 train_time:121618ms step_avg:153.75ms 
step:802/1480 train_time:121784ms step_avg:153.77ms step:803/1480 train_time:121946ms step_avg:153.78ms step:804/1480 train_time:122108ms step_avg:153.79ms step:805/1480 train_time:122276ms step_avg:153.81ms step:806/1480 train_time:122437ms step_avg:153.82ms step:807/1480 train_time:122599ms step_avg:153.83ms step:808/1480 train_time:122762ms step_avg:153.84ms step:809/1480 train_time:122924ms step_avg:153.85ms step:810/1480 train_time:123086ms step_avg:153.86ms step:811/1480 train_time:123249ms step_avg:153.87ms step:812/1480 train_time:123412ms step_avg:153.88ms step:813/1480 train_time:123574ms step_avg:153.89ms step:814/1480 train_time:123738ms step_avg:153.90ms step:815/1480 train_time:123900ms step_avg:153.91ms step:816/1480 train_time:124065ms step_avg:153.93ms step:817/1480 train_time:124226ms step_avg:153.94ms step:818/1480 train_time:124388ms step_avg:153.95ms step:819/1480 train_time:124553ms step_avg:153.96ms step:820/1480 train_time:124719ms step_avg:153.97ms step:821/1480 train_time:124880ms step_avg:153.98ms step:822/1480 train_time:125044ms step_avg:153.99ms step:823/1480 train_time:125205ms step_avg:154.00ms step:824/1480 train_time:125366ms step_avg:154.01ms step:825/1480 train_time:125530ms step_avg:154.03ms step:826/1480 train_time:125698ms step_avg:154.04ms step:827/1480 train_time:125862ms step_avg:154.05ms step:828/1480 train_time:126023ms step_avg:154.06ms step:829/1480 train_time:126187ms step_avg:154.07ms step:830/1480 train_time:126353ms step_avg:154.09ms step:831/1480 train_time:126518ms step_avg:154.10ms step:832/1480 train_time:126682ms step_avg:154.11ms step:833/1480 train_time:126846ms step_avg:154.13ms step:834/1480 train_time:127010ms step_avg:154.14ms step:835/1480 train_time:127174ms step_avg:154.15ms step:836/1480 train_time:127338ms step_avg:154.16ms step:837/1480 train_time:127499ms step_avg:154.17ms step:838/1480 train_time:127662ms step_avg:154.18ms step:839/1480 train_time:127825ms step_avg:154.19ms step:840/1480 
train_time:127987ms step_avg:154.20ms step:841/1480 train_time:128147ms step_avg:154.21ms step:842/1480 train_time:128311ms step_avg:154.22ms step:843/1480 train_time:128474ms step_avg:154.23ms step:844/1480 train_time:128635ms step_avg:154.24ms step:845/1480 train_time:128800ms step_avg:154.25ms step:846/1480 train_time:128964ms step_avg:154.26ms step:847/1480 train_time:129126ms step_avg:154.27ms step:848/1480 train_time:129289ms step_avg:154.28ms step:849/1480 train_time:129452ms step_avg:154.29ms step:850/1480 train_time:129617ms step_avg:154.31ms step:851/1480 train_time:129781ms step_avg:154.32ms step:852/1480 train_time:129942ms step_avg:154.33ms step:853/1480 train_time:130105ms step_avg:154.34ms step:854/1480 train_time:130269ms step_avg:154.35ms step:855/1480 train_time:130433ms step_avg:154.36ms step:856/1480 train_time:130595ms step_avg:154.37ms step:857/1480 train_time:130759ms step_avg:154.38ms step:858/1480 train_time:130924ms step_avg:154.39ms step:859/1480 train_time:131086ms step_avg:154.40ms step:860/1480 train_time:131248ms step_avg:154.41ms step:861/1480 train_time:131414ms step_avg:154.42ms step:862/1480 train_time:131583ms step_avg:154.44ms step:863/1480 train_time:131752ms step_avg:154.46ms step:864/1480 train_time:131918ms step_avg:154.47ms step:865/1480 train_time:132079ms step_avg:154.48ms step:866/1480 train_time:132245ms step_avg:154.49ms step:867/1480 train_time:132407ms step_avg:154.50ms step:868/1480 train_time:132569ms step_avg:154.51ms step:869/1480 train_time:132730ms step_avg:154.52ms step:870/1480 train_time:132896ms step_avg:154.53ms step:871/1480 train_time:133058ms step_avg:154.54ms step:872/1480 train_time:133222ms step_avg:154.55ms step:873/1480 train_time:133384ms step_avg:154.56ms step:874/1480 train_time:133551ms step_avg:154.57ms step:875/1480 train_time:133716ms step_avg:154.59ms step:875/1480 val_loss:3.5089 train_time:133780ms step_avg:154.66ms step:876/1480 train_time:133882ms step_avg:154.60ms step:877/1480 
train_time:134049ms step_avg:154.61ms step:878/1480 train_time:134210ms step_avg:154.62ms step:879/1480 train_time:134374ms step_avg:154.63ms step:880/1480 train_time:134536ms step_avg:154.64ms step:881/1480 train_time:134699ms step_avg:154.65ms step:882/1480 train_time:134864ms step_avg:154.66ms step:883/1480 train_time:135031ms step_avg:154.67ms step:884/1480 train_time:135198ms step_avg:154.69ms step:885/1480 train_time:135364ms step_avg:154.70ms step:886/1480 train_time:135531ms step_avg:154.72ms step:887/1480 train_time:135697ms step_avg:154.73ms step:888/1480 train_time:135871ms step_avg:154.75ms step:889/1480 train_time:136039ms step_avg:154.77ms step:890/1480 train_time:136201ms step_avg:154.77ms step:891/1480 train_time:136368ms step_avg:154.79ms step:892/1480 train_time:136533ms step_avg:154.80ms step:893/1480 train_time:136694ms step_avg:154.81ms step:894/1480 train_time:136861ms step_avg:154.82ms step:895/1480 train_time:137026ms step_avg:154.83ms step:896/1480 train_time:137190ms step_avg:154.84ms step:897/1480 train_time:137360ms step_avg:154.86ms step:898/1480 train_time:137528ms step_avg:154.87ms step:899/1480 train_time:137691ms step_avg:154.88ms step:900/1480 train_time:137854ms step_avg:154.89ms step:901/1480 train_time:138017ms step_avg:154.90ms step:902/1480 train_time:138181ms step_avg:154.91ms step:903/1480 train_time:138353ms step_avg:154.93ms step:904/1480 train_time:138518ms step_avg:154.94ms step:905/1480 train_time:138681ms step_avg:154.95ms step:906/1480 train_time:138848ms step_avg:154.96ms step:907/1480 train_time:139016ms step_avg:154.98ms step:908/1480 train_time:139179ms step_avg:154.99ms step:909/1480 train_time:139345ms step_avg:155.00ms step:910/1480 train_time:139515ms step_avg:155.02ms step:911/1480 train_time:139680ms step_avg:155.03ms step:912/1480 train_time:139846ms step_avg:155.04ms step:913/1480 train_time:140012ms step_avg:155.05ms step:914/1480 train_time:140180ms step_avg:155.07ms step:915/1480 train_time:140350ms 
step_avg:155.08ms step:916/1480 train_time:140513ms step_avg:155.09ms step:917/1480 train_time:140675ms step_avg:155.10ms step:918/1480 train_time:140845ms step_avg:155.12ms step:919/1480 train_time:141014ms step_avg:155.13ms step:920/1480 train_time:141177ms step_avg:155.14ms step:921/1480 train_time:141345ms step_avg:155.15ms step:922/1480 train_time:141510ms step_avg:155.16ms step:923/1480 train_time:141673ms step_avg:155.17ms step:924/1480 train_time:141839ms step_avg:155.18ms step:925/1480 train_time:142004ms step_avg:155.20ms step:926/1480 train_time:142168ms step_avg:155.20ms step:927/1480 train_time:142332ms step_avg:155.21ms step:928/1480 train_time:142498ms step_avg:155.23ms step:929/1480 train_time:142664ms step_avg:155.24ms step:930/1480 train_time:142830ms step_avg:155.25ms step:931/1480 train_time:142992ms step_avg:155.26ms step:932/1480 train_time:143158ms step_avg:155.27ms step:933/1480 train_time:143324ms step_avg:155.28ms step:934/1480 train_time:143490ms step_avg:155.29ms step:935/1480 train_time:143661ms step_avg:155.31ms step:936/1480 train_time:143829ms step_avg:155.32ms step:937/1480 train_time:143999ms step_avg:155.34ms step:938/1480 train_time:144163ms step_avg:155.35ms step:939/1480 train_time:144331ms step_avg:155.36ms step:940/1480 train_time:144499ms step_avg:155.38ms step:941/1480 train_time:144664ms step_avg:155.39ms step:942/1480 train_time:144828ms step_avg:155.40ms step:943/1480 train_time:144998ms step_avg:155.41ms step:944/1480 train_time:145170ms step_avg:155.43ms step:945/1480 train_time:145335ms step_avg:155.44ms step:946/1480 train_time:145505ms step_avg:155.45ms step:947/1480 train_time:145673ms step_avg:155.47ms step:948/1480 train_time:145839ms step_avg:155.48ms step:949/1480 train_time:146005ms step_avg:155.49ms step:950/1480 train_time:146169ms step_avg:155.50ms step:951/1480 train_time:146336ms step_avg:155.51ms step:952/1480 train_time:146501ms step_avg:155.52ms step:953/1480 train_time:146671ms step_avg:155.54ms 
step:954/1480 train_time:146840ms step_avg:155.55ms step:955/1480 train_time:147004ms step_avg:155.56ms step:956/1480 train_time:147169ms step_avg:155.57ms step:957/1480 train_time:147337ms step_avg:155.58ms step:958/1480 train_time:147506ms step_avg:155.60ms step:959/1480 train_time:147671ms step_avg:155.61ms step:960/1480 train_time:147837ms step_avg:155.62ms step:961/1480 train_time:148002ms step_avg:155.63ms step:962/1480 train_time:148168ms step_avg:155.64ms step:963/1480 train_time:148334ms step_avg:155.65ms step:964/1480 train_time:148503ms step_avg:155.66ms step:965/1480 train_time:148667ms step_avg:155.67ms step:966/1480 train_time:148832ms step_avg:155.68ms step:967/1480 train_time:148997ms step_avg:155.69ms step:968/1480 train_time:149162ms step_avg:155.70ms step:969/1480 train_time:149328ms step_avg:155.71ms step:970/1480 train_time:149492ms step_avg:155.72ms step:971/1480 train_time:149655ms step_avg:155.73ms step:972/1480 train_time:149820ms step_avg:155.74ms step:973/1480 train_time:149985ms step_avg:155.75ms step:974/1480 train_time:150154ms step_avg:155.76ms step:975/1480 train_time:150320ms step_avg:155.77ms step:976/1480 train_time:150487ms step_avg:155.78ms step:977/1480 train_time:150651ms step_avg:155.79ms step:978/1480 train_time:150814ms step_avg:155.80ms step:979/1480 train_time:150980ms step_avg:155.81ms step:980/1480 train_time:151146ms step_avg:155.82ms step:981/1480 train_time:151312ms step_avg:155.83ms step:982/1480 train_time:151474ms step_avg:155.84ms step:983/1480 train_time:151641ms step_avg:155.85ms step:984/1480 train_time:151806ms step_avg:155.86ms step:985/1480 train_time:151973ms step_avg:155.87ms step:986/1480 train_time:152138ms step_avg:155.88ms step:987/1480 train_time:152303ms step_avg:155.89ms step:988/1480 train_time:152471ms step_avg:155.90ms step:989/1480 train_time:152635ms step_avg:155.91ms step:990/1480 train_time:152804ms step_avg:155.92ms step:991/1480 train_time:152970ms step_avg:155.93ms step:992/1480 
train_time:153145ms step_avg:155.95ms step:993/1480 train_time:153323ms step_avg:155.98ms step:994/1480 train_time:153489ms step_avg:155.98ms step:995/1480 train_time:153653ms step_avg:155.99ms step:996/1480 train_time:153815ms step_avg:156.00ms step:997/1480 train_time:153979ms step_avg:156.01ms step:998/1480 train_time:154143ms step_avg:156.01ms step:999/1480 train_time:154307ms step_avg:156.02ms step:1000/1480 train_time:154476ms step_avg:156.04ms step:1000/1480 val_loss:3.4445 train_time:154545ms step_avg:156.11ms step:1001/1480 train_time:154647ms step_avg:156.05ms step:1002/1480 train_time:154813ms step_avg:156.06ms step:1003/1480 train_time:154985ms step_avg:156.08ms step:1004/1480 train_time:155154ms step_avg:156.09ms step:1005/1480 train_time:155324ms step_avg:156.10ms step:1006/1480 train_time:155490ms step_avg:156.11ms step:1007/1480 train_time:155654ms step_avg:156.12ms step:1008/1480 train_time:155821ms step_avg:156.13ms step:1009/1480 train_time:155995ms step_avg:156.15ms step:1010/1480 train_time:156162ms step_avg:156.16ms step:1011/1480 train_time:156326ms step_avg:156.17ms step:1012/1480 train_time:156491ms step_avg:156.18ms step:1013/1480 train_time:156660ms step_avg:156.19ms step:1014/1480 train_time:156826ms step_avg:156.20ms step:1015/1480 train_time:156997ms step_avg:156.22ms step:1016/1480 train_time:157165ms step_avg:156.23ms step:1017/1480 train_time:157337ms step_avg:156.24ms step:1018/1480 train_time:157506ms step_avg:156.26ms step:1019/1480 train_time:157673ms step_avg:156.27ms step:1020/1480 train_time:157841ms step_avg:156.28ms step:1021/1480 train_time:158006ms step_avg:156.29ms step:1022/1480 train_time:158173ms step_avg:156.30ms step:1023/1480 train_time:158340ms step_avg:156.31ms step:1024/1480 train_time:158508ms step_avg:156.32ms step:1025/1480 train_time:158680ms step_avg:156.33ms step:1026/1480 train_time:158846ms step_avg:156.34ms step:1027/1480 train_time:159012ms step_avg:156.35ms step:1028/1480 train_time:159186ms 
step_avg:156.37ms step:1029/1480 train_time:159360ms step_avg:156.39ms step:1030/1480 train_time:159527ms step_avg:156.40ms step:1031/1480 train_time:159692ms step_avg:156.41ms step:1032/1480 train_time:159867ms step_avg:156.43ms step:1033/1480 train_time:160033ms step_avg:156.43ms step:1034/1480 train_time:160201ms step_avg:156.45ms step:1035/1480 train_time:160369ms step_avg:156.46ms step:1036/1480 train_time:160535ms step_avg:156.47ms step:1037/1480 train_time:160704ms step_avg:156.48ms step:1038/1480 train_time:160870ms step_avg:156.49ms step:1039/1480 train_time:161043ms step_avg:156.50ms step:1040/1480 train_time:161209ms step_avg:156.51ms step:1041/1480 train_time:161376ms step_avg:156.52ms step:1042/1480 train_time:161542ms step_avg:156.53ms step:1043/1480 train_time:161708ms step_avg:156.54ms step:1044/1480 train_time:161872ms step_avg:156.55ms step:1045/1480 train_time:162043ms step_avg:156.56ms step:1046/1480 train_time:162211ms step_avg:156.57ms step:1047/1480 train_time:162377ms step_avg:156.58ms step:1048/1480 train_time:162544ms step_avg:156.59ms step:1049/1480 train_time:162710ms step_avg:156.60ms step:1050/1480 train_time:162879ms step_avg:156.61ms step:1051/1480 train_time:163049ms step_avg:156.63ms step:1052/1480 train_time:163216ms step_avg:156.64ms step:1053/1480 train_time:163385ms step_avg:156.65ms step:1054/1480 train_time:163552ms step_avg:156.66ms step:1055/1480 train_time:163718ms step_avg:156.67ms step:1056/1480 train_time:163883ms step_avg:156.68ms step:1057/1480 train_time:164049ms step_avg:156.68ms step:1058/1480 train_time:164219ms step_avg:156.70ms step:1059/1480 train_time:164392ms step_avg:156.71ms step:1060/1480 train_time:164561ms step_avg:156.72ms step:1061/1480 train_time:164725ms step_avg:156.73ms step:1062/1480 train_time:164891ms step_avg:156.74ms step:1063/1480 train_time:165055ms step_avg:156.75ms step:1064/1480 train_time:165218ms step_avg:156.75ms step:1065/1480 train_time:165387ms step_avg:156.76ms step:1066/1480 
train_time:165553ms step_avg:156.77ms step:1067/1480 train_time:165724ms step_avg:156.79ms step:1068/1480 train_time:165890ms step_avg:156.80ms step:1069/1480 train_time:166062ms step_avg:156.81ms step:1070/1480 train_time:166229ms step_avg:156.82ms step:1071/1480 train_time:166403ms step_avg:156.84ms step:1072/1480 train_time:166569ms step_avg:156.84ms step:1073/1480 train_time:166731ms step_avg:156.85ms step:1074/1480 train_time:166897ms step_avg:156.86ms step:1075/1480 train_time:167068ms step_avg:156.87ms step:1076/1480 train_time:167235ms step_avg:156.88ms step:1077/1480 train_time:167400ms step_avg:156.89ms step:1078/1480 train_time:167574ms step_avg:156.90ms step:1079/1480 train_time:167747ms step_avg:156.92ms step:1080/1480 train_time:167917ms step_avg:156.93ms step:1081/1480 train_time:168084ms step_avg:156.94ms step:1082/1480 train_time:168249ms step_avg:156.95ms step:1083/1480 train_time:168416ms step_avg:156.96ms step:1084/1480 train_time:168586ms step_avg:156.97ms step:1085/1480 train_time:168752ms step_avg:156.98ms step:1086/1480 train_time:168921ms step_avg:156.99ms step:1087/1480 train_time:169087ms step_avg:157.00ms step:1088/1480 train_time:169256ms step_avg:157.01ms step:1089/1480 train_time:169428ms step_avg:157.02ms step:1090/1480 train_time:169600ms step_avg:157.04ms step:1091/1480 train_time:169768ms step_avg:157.05ms step:1092/1480 train_time:169934ms step_avg:157.06ms step:1093/1480 train_time:170106ms step_avg:157.07ms step:1094/1480 train_time:170272ms step_avg:157.08ms step:1095/1480 train_time:170437ms step_avg:157.08ms step:1096/1480 train_time:170606ms step_avg:157.10ms step:1097/1480 train_time:170773ms step_avg:157.11ms step:1098/1480 train_time:170945ms step_avg:157.12ms step:1099/1480 train_time:171117ms step_avg:157.13ms step:1100/1480 train_time:171290ms step_avg:157.15ms step:1101/1480 train_time:171462ms step_avg:157.16ms step:1102/1480 train_time:171634ms step_avg:157.17ms step:1103/1480 train_time:171811ms step_avg:157.19ms 
step:1104/1480 train_time:171978ms step_avg:157.20ms step:1105/1480 train_time:172149ms step_avg:157.21ms step:1106/1480 train_time:172315ms step_avg:157.22ms step:1107/1480 train_time:172485ms step_avg:157.23ms step:1108/1480 train_time:172651ms step_avg:157.24ms step:1109/1480 train_time:172816ms step_avg:157.25ms step:1110/1480 train_time:172982ms step_avg:157.26ms step:1111/1480 train_time:173149ms step_avg:157.27ms step:1112/1480 train_time:173319ms step_avg:157.28ms step:1113/1480 train_time:173499ms step_avg:157.30ms step:1114/1480 train_time:173672ms step_avg:157.31ms step:1115/1480 train_time:173844ms step_avg:157.33ms step:1116/1480 train_time:174012ms step_avg:157.33ms step:1117/1480 train_time:174185ms step_avg:157.35ms step:1118/1480 train_time:174360ms step_avg:157.36ms step:1119/1480 train_time:174526ms step_avg:157.37ms step:1120/1480 train_time:174694ms step_avg:157.38ms step:1121/1480 train_time:174866ms step_avg:157.40ms step:1122/1480 train_time:175032ms step_avg:157.40ms step:1123/1480 train_time:175199ms step_avg:157.41ms step:1124/1480 train_time:175367ms step_avg:157.42ms step:1125/1480 train_time:175534ms step_avg:157.43ms step:1125/1480 val_loss:3.3883 train_time:175601ms step_avg:157.49ms step:1126/1480 train_time:175703ms step_avg:157.44ms step:1127/1480 train_time:175874ms step_avg:157.45ms step:1128/1480 train_time:176043ms step_avg:157.46ms step:1129/1480 train_time:176219ms step_avg:157.48ms step:1130/1480 train_time:176389ms step_avg:157.49ms step:1131/1480 train_time:176566ms step_avg:157.51ms step:1132/1480 train_time:176732ms step_avg:157.52ms step:1133/1480 train_time:176903ms step_avg:157.53ms step:1134/1480 train_time:177075ms step_avg:157.54ms step:1135/1480 train_time:177242ms step_avg:157.55ms step:1136/1480 train_time:177414ms step_avg:157.56ms step:1137/1480 train_time:177582ms step_avg:157.57ms step:1138/1480 train_time:177754ms step_avg:157.58ms step:1139/1480 train_time:177921ms step_avg:157.59ms step:1140/1480 
train_time:178089ms step_avg:157.60ms step:1141/1480 train_time:178260ms step_avg:157.61ms step:1142/1480 train_time:178429ms step_avg:157.62ms step:1143/1480 train_time:178600ms step_avg:157.63ms step:1144/1480 train_time:178769ms step_avg:157.64ms step:1145/1480 train_time:178935ms step_avg:157.65ms step:1146/1480 train_time:179106ms step_avg:157.66ms step:1147/1480 train_time:179275ms step_avg:157.67ms step:1148/1480 train_time:179443ms step_avg:157.68ms step:1149/1480 train_time:179614ms step_avg:157.69ms step:1150/1480 train_time:179782ms step_avg:157.70ms step:1151/1480 train_time:179954ms step_avg:157.72ms step:1152/1480 train_time:180126ms step_avg:157.73ms step:1153/1480 train_time:180300ms step_avg:157.74ms step:1154/1480 train_time:180468ms step_avg:157.75ms step:1155/1480 train_time:180639ms step_avg:157.76ms step:1156/1480 train_time:180816ms step_avg:157.78ms step:1157/1480 train_time:180986ms step_avg:157.79ms step:1158/1480 train_time:181154ms step_avg:157.80ms step:1159/1480 train_time:181320ms step_avg:157.81ms step:1160/1480 train_time:181487ms step_avg:157.81ms step:1161/1480 train_time:181659ms step_avg:157.83ms step:1162/1480 train_time:181829ms step_avg:157.84ms step:1163/1480 train_time:181998ms step_avg:157.85ms step:1164/1480 train_time:182168ms step_avg:157.86ms step:1165/1480 train_time:182333ms step_avg:157.86ms step:1166/1480 train_time:182502ms step_avg:157.87ms step:1167/1480 train_time:182670ms step_avg:157.88ms step:1168/1480 train_time:182839ms step_avg:157.89ms step:1169/1480 train_time:183009ms step_avg:157.90ms step:1170/1480 train_time:183176ms step_avg:157.91ms step:1171/1480 train_time:183342ms step_avg:157.92ms step:1172/1480 train_time:183510ms step_avg:157.93ms step:1173/1480 train_time:183680ms step_avg:157.94ms step:1174/1480 train_time:183862ms step_avg:157.96ms step:1175/1480 train_time:184034ms step_avg:157.97ms step:1176/1480 train_time:184206ms step_avg:157.98ms step:1177/1480 train_time:184382ms step_avg:158.00ms 
step:1178/1480 train_time:184550ms step_avg:158.01ms step:1179/1480 train_time:184715ms step_avg:158.01ms step:1180/1480 train_time:184897ms step_avg:158.03ms step:1181/1480 train_time:185067ms step_avg:158.04ms step:1182/1480 train_time:185235ms step_avg:158.05ms step:1183/1480 train_time:185407ms step_avg:158.06ms step:1184/1480 train_time:185574ms step_avg:158.07ms step:1185/1480 train_time:185745ms step_avg:158.08ms step:1186/1480 train_time:185916ms step_avg:158.09ms step:1187/1480 train_time:186099ms step_avg:158.11ms step:1188/1480 train_time:186265ms step_avg:158.12ms step:1189/1480 train_time:186439ms step_avg:158.13ms step:1190/1480 train_time:186607ms step_avg:158.14ms step:1191/1480 train_time:186777ms step_avg:158.15ms step:1192/1480 train_time:186944ms step_avg:158.16ms step:1193/1480 train_time:187113ms step_avg:158.17ms step:1194/1480 train_time:187281ms step_avg:158.18ms step:1195/1480 train_time:187455ms step_avg:158.19ms step:1196/1480 train_time:187637ms step_avg:158.21ms step:1197/1480 train_time:187808ms step_avg:158.22ms step:1198/1480 train_time:187990ms step_avg:158.24ms step:1199/1480 train_time:188160ms step_avg:158.25ms step:1200/1480 train_time:188329ms step_avg:158.26ms step:1201/1480 train_time:188496ms step_avg:158.27ms step:1202/1480 train_time:188679ms step_avg:158.29ms step:1203/1480 train_time:188855ms step_avg:158.30ms step:1204/1480 train_time:189030ms step_avg:158.32ms step:1205/1480 train_time:189197ms step_avg:158.32ms step:1206/1480 train_time:189364ms step_avg:158.33ms step:1207/1480 train_time:189535ms step_avg:158.34ms step:1208/1480 train_time:189702ms step_avg:158.35ms step:1209/1480 train_time:189875ms step_avg:158.36ms step:1210/1480 train_time:190050ms step_avg:158.37ms step:1211/1480 train_time:190223ms step_avg:158.39ms step:1212/1480 train_time:190395ms step_avg:158.40ms step:1213/1480 train_time:190567ms step_avg:158.41ms step:1214/1480 train_time:190742ms step_avg:158.42ms step:1215/1480 train_time:190915ms 
step_avg:158.44ms step:1216/1480 train_time:191083ms step_avg:158.44ms step:1217/1480 train_time:191256ms step_avg:158.46ms step:1218/1480 train_time:191427ms step_avg:158.47ms step:1219/1480 train_time:191605ms step_avg:158.48ms step:1220/1480 train_time:191774ms step_avg:158.49ms step:1221/1480 train_time:191944ms step_avg:158.50ms step:1222/1480 train_time:192113ms step_avg:158.51ms step:1223/1480 train_time:192282ms step_avg:158.52ms step:1224/1480 train_time:192460ms step_avg:158.53ms step:1225/1480 train_time:192632ms step_avg:158.54ms step:1226/1480 train_time:192804ms step_avg:158.56ms step:1227/1480 train_time:192977ms step_avg:158.57ms step:1228/1480 train_time:193145ms step_avg:158.58ms step:1229/1480 train_time:193319ms step_avg:158.59ms step:1230/1480 train_time:193499ms step_avg:158.61ms step:1231/1480 train_time:193675ms step_avg:158.62ms step:1232/1480 train_time:193849ms step_avg:158.63ms step:1233/1480 train_time:194020ms step_avg:158.64ms step:1234/1480 train_time:194191ms step_avg:158.65ms step:1235/1480 train_time:194364ms step_avg:158.66ms step:1236/1480 train_time:194533ms step_avg:158.67ms step:1237/1480 train_time:194703ms step_avg:158.68ms step:1238/1480 train_time:194888ms step_avg:158.70ms step:1239/1480 train_time:195058ms step_avg:158.71ms step:1240/1480 train_time:195229ms step_avg:158.72ms step:1241/1480 train_time:195401ms step_avg:158.73ms step:1242/1480 train_time:195570ms step_avg:158.74ms step:1243/1480 train_time:195744ms step_avg:158.75ms step:1244/1480 train_time:195911ms step_avg:158.76ms step:1245/1480 train_time:196079ms step_avg:158.77ms step:1246/1480 train_time:196250ms step_avg:158.78ms step:1247/1480 train_time:196420ms step_avg:158.79ms step:1248/1480 train_time:196589ms step_avg:158.80ms step:1249/1480 train_time:196757ms step_avg:158.80ms step:1250/1480 train_time:196927ms step_avg:158.81ms step:1250/1480 val_loss:3.3394 train_time:196999ms step_avg:158.87ms step:1251/1480 train_time:197109ms step_avg:158.83ms 
step:1252/1480 train_time:197278ms step_avg:158.84ms step:1253/1480 train_time:197446ms step_avg:158.85ms step:1254/1480 train_time:197617ms step_avg:158.86ms step:1255/1480 train_time:197802ms step_avg:158.88ms step:1256/1480 train_time:197975ms step_avg:158.89ms step:1257/1480 train_time:198145ms step_avg:158.90ms step:1258/1480 train_time:198319ms step_avg:158.91ms step:1259/1480 train_time:198491ms step_avg:158.92ms step:1260/1480 train_time:198657ms step_avg:158.93ms step:1261/1480 train_time:198831ms step_avg:158.94ms step:1262/1480 train_time:199005ms step_avg:158.95ms step:1263/1480 train_time:199178ms step_avg:158.96ms step:1264/1480 train_time:199346ms step_avg:158.97ms step:1265/1480 train_time:199513ms step_avg:158.97ms step:1266/1480 train_time:199685ms step_avg:158.98ms step:1267/1480 train_time:199856ms step_avg:158.99ms step:1268/1480 train_time:200027ms step_avg:159.00ms step:1269/1480 train_time:200202ms step_avg:159.02ms step:1270/1480 train_time:200370ms step_avg:159.02ms step:1271/1480 train_time:200541ms step_avg:159.03ms step:1272/1480 train_time:200708ms step_avg:159.04ms step:1273/1480 train_time:200878ms step_avg:159.05ms step:1274/1480 train_time:201051ms step_avg:159.06ms step:1275/1480 train_time:201219ms step_avg:159.07ms step:1276/1480 train_time:201384ms step_avg:159.07ms step:1277/1480 train_time:201557ms step_avg:159.08ms step:1278/1480 train_time:201724ms step_avg:159.09ms step:1279/1480 train_time:201896ms step_avg:159.10ms step:1280/1480 train_time:202075ms step_avg:159.11ms step:1281/1480 train_time:202245ms step_avg:159.12ms step:1282/1480 train_time:202411ms step_avg:159.13ms step:1283/1480 train_time:202580ms step_avg:159.14ms step:1284/1480 train_time:202751ms step_avg:159.14ms step:1285/1480 train_time:202919ms step_avg:159.15ms step:1286/1480 train_time:203089ms step_avg:159.16ms step:1287/1480 train_time:203263ms step_avg:159.17ms step:1288/1480 train_time:203436ms step_avg:159.18ms step:1289/1480 train_time:203618ms 
step_avg:159.20ms step:1290/1480 train_time:203797ms step_avg:159.22ms step:1291/1480 train_time:203971ms step_avg:159.23ms step:1292/1480 train_time:204144ms step_avg:159.24ms step:1293/1480 train_time:204319ms step_avg:159.25ms step:1294/1480 train_time:204491ms step_avg:159.26ms step:1295/1480 train_time:204660ms step_avg:159.27ms step:1296/1480 train_time:204835ms step_avg:159.28ms step:1297/1480 train_time:205006ms step_avg:159.29ms step:1298/1480 train_time:205176ms step_avg:159.30ms step:1299/1480 train_time:205346ms step_avg:159.31ms step:1300/1480 train_time:205514ms step_avg:159.31ms step:1301/1480 train_time:205682ms step_avg:159.32ms step:1302/1480 train_time:205857ms step_avg:159.33ms step:1303/1480 train_time:206034ms step_avg:159.35ms step:1304/1480 train_time:206208ms step_avg:159.36ms step:1305/1480 train_time:206377ms step_avg:159.36ms step:1306/1480 train_time:206553ms step_avg:159.38ms step:1307/1480 train_time:206720ms step_avg:159.38ms step:1308/1480 train_time:206889ms step_avg:159.39ms step:1309/1480 train_time:207061ms step_avg:159.40ms step:1310/1480 train_time:207232ms step_avg:159.41ms step:1311/1480 train_time:207400ms step_avg:159.42ms step:1312/1480 train_time:207573ms step_avg:159.43ms step:1313/1480 train_time:207743ms step_avg:159.43ms step:1314/1480 train_time:207915ms step_avg:159.44ms step:1315/1480 train_time:208086ms step_avg:159.45ms step:1316/1480 train_time:208253ms step_avg:159.46ms step:1317/1480 train_time:208423ms step_avg:159.47ms step:1318/1480 train_time:208603ms step_avg:159.48ms step:1319/1480 train_time:208778ms step_avg:159.49ms step:1320/1480 train_time:208956ms step_avg:159.51ms step:1321/1480 train_time:209127ms step_avg:159.52ms step:1322/1480 train_time:209310ms step_avg:159.54ms step:1323/1480 train_time:209481ms step_avg:159.54ms step:1324/1480 train_time:209657ms step_avg:159.56ms step:1325/1480 train_time:209840ms step_avg:159.57ms step:1326/1480 train_time:210016ms step_avg:159.59ms step:1327/1480 
train_time:210187ms step_avg:159.60ms step:1328/1480 train_time:210358ms step_avg:159.60ms step:1329/1480 train_time:210555ms step_avg:159.63ms step:1330/1480 train_time:210735ms step_avg:159.65ms step:1331/1480 train_time:210905ms step_avg:159.66ms step:1332/1480 train_time:211080ms step_avg:159.67ms step:1333/1480 train_time:211256ms step_avg:159.68ms step:1334/1480 train_time:211427ms step_avg:159.69ms step:1335/1480 train_time:211595ms step_avg:159.69ms step:1336/1480 train_time:211780ms step_avg:159.71ms step:1337/1480 train_time:211955ms step_avg:159.73ms step:1338/1480 train_time:212129ms step_avg:159.74ms step:1339/1480 train_time:212302ms step_avg:159.75ms step:1340/1480 train_time:212474ms step_avg:159.75ms step:1341/1480 train_time:212642ms step_avg:159.76ms step:1342/1480 train_time:212816ms step_avg:159.77ms step:1343/1480 train_time:212986ms step_avg:159.78ms step:1344/1480 train_time:213159ms step_avg:159.79ms step:1345/1480 train_time:213338ms step_avg:159.80ms step:1346/1480 train_time:213506ms step_avg:159.81ms step:1347/1480 train_time:213676ms step_avg:159.82ms step:1348/1480 train_time:213846ms step_avg:159.82ms step:1349/1480 train_time:214014ms step_avg:159.83ms step:1350/1480 train_time:214189ms step_avg:159.84ms step:1351/1480 train_time:214360ms step_avg:159.85ms step:1352/1480 train_time:214530ms step_avg:159.86ms step:1353/1480 train_time:214705ms step_avg:159.87ms step:1354/1480 train_time:214877ms step_avg:159.88ms step:1355/1480 train_time:215044ms step_avg:159.88ms step:1356/1480 train_time:215218ms step_avg:159.89ms step:1357/1480 train_time:215392ms step_avg:159.90ms step:1358/1480 train_time:215563ms step_avg:159.91ms step:1359/1480 train_time:215735ms step_avg:159.92ms step:1360/1480 train_time:215910ms step_avg:159.93ms step:1361/1480 train_time:216088ms step_avg:159.95ms step:1362/1480 train_time:216263ms step_avg:159.96ms step:1363/1480 train_time:216444ms step_avg:159.97ms step:1364/1480 train_time:216614ms step_avg:159.98ms 
step:1365/1480 train_time:216781ms step_avg:159.99ms step:1366/1480 train_time:216955ms step_avg:160.00ms step:1367/1480 train_time:217125ms step_avg:160.00ms step:1368/1480 train_time:217297ms step_avg:160.01ms step:1369/1480 train_time:217479ms step_avg:160.03ms step:1370/1480 train_time:217656ms step_avg:160.04ms step:1371/1480 train_time:217829ms step_avg:160.05ms step:1372/1480 train_time:218005ms step_avg:160.06ms step:1373/1480 train_time:218175ms step_avg:160.07ms step:1374/1480 train_time:218351ms step_avg:160.08ms step:1375/1480 train_time:218521ms step_avg:160.09ms step:1375/1480 val_loss:3.3003 train_time:218589ms step_avg:160.14ms step:1376/1480 train_time:218695ms step_avg:160.10ms step:1377/1480 train_time:218868ms step_avg:160.11ms step:1378/1480 train_time:219039ms step_avg:160.12ms step:1379/1480 train_time:219212ms step_avg:160.13ms step:1380/1480 train_time:219386ms step_avg:160.14ms step:1381/1480 train_time:219566ms step_avg:160.15ms step:1382/1480 train_time:219737ms step_avg:160.16ms step:1383/1480 train_time:219908ms step_avg:160.17ms step:1384/1480 train_time:220086ms step_avg:160.18ms step:1385/1480 train_time:220250ms step_avg:160.18ms step:1386/1480 train_time:220421ms step_avg:160.19ms step:1387/1480 train_time:220592ms step_avg:160.20ms step:1388/1480 train_time:220760ms step_avg:160.20ms step:1389/1480 train_time:220933ms step_avg:160.21ms step:1390/1480 train_time:221101ms step_avg:160.22ms step:1391/1480 train_time:221271ms step_avg:160.23ms step:1392/1480 train_time:221443ms step_avg:160.23ms step:1393/1480 train_time:221613ms step_avg:160.24ms step:1394/1480 train_time:221784ms step_avg:160.25ms step:1395/1480 train_time:221952ms step_avg:160.25ms step:1396/1480 train_time:222121ms step_avg:160.26ms step:1397/1480 train_time:222288ms step_avg:160.27ms step:1398/1480 train_time:222458ms step_avg:160.27ms step:1399/1480 train_time:222626ms step_avg:160.28ms step:1400/1480 train_time:222802ms step_avg:160.29ms step:1401/1480 
train_time:222968ms step_avg:160.29ms step:1402/1480 train_time:223141ms step_avg:160.30ms step:1403/1480 train_time:223316ms step_avg:160.31ms step:1404/1480 train_time:223487ms step_avg:160.32ms step:1405/1480 train_time:223662ms step_avg:160.33ms step:1406/1480 train_time:223837ms step_avg:160.34ms step:1407/1480 train_time:224004ms step_avg:160.35ms step:1408/1480 train_time:224171ms step_avg:160.35ms step:1409/1480 train_time:224356ms step_avg:160.37ms step:1410/1480 train_time:224525ms step_avg:160.37ms step:1411/1480 train_time:224693ms step_avg:160.38ms step:1412/1480 train_time:224864ms step_avg:160.39ms step:1413/1480 train_time:225033ms step_avg:160.39ms step:1414/1480 train_time:225204ms step_avg:160.40ms step:1415/1480 train_time:225379ms step_avg:160.41ms step:1416/1480 train_time:225567ms step_avg:160.43ms step:1417/1480 train_time:225742ms step_avg:160.44ms step:1418/1480 train_time:225914ms step_avg:160.45ms step:1419/1480 train_time:226089ms step_avg:160.46ms step:1420/1480 train_time:226264ms step_avg:160.47ms step:1421/1480 train_time:226436ms step_avg:160.48ms step:1422/1480 train_time:226606ms step_avg:160.49ms step:1423/1480 train_time:226774ms step_avg:160.49ms step:1424/1480 train_time:226952ms step_avg:160.50ms step:1425/1480 train_time:227131ms step_avg:160.52ms step:1426/1480 train_time:227303ms step_avg:160.52ms step:1427/1480 train_time:227479ms step_avg:160.54ms step:1428/1480 train_time:227652ms step_avg:160.54ms step:1429/1480 train_time:227820ms step_avg:160.55ms step:1430/1480 train_time:227994ms step_avg:160.56ms step:1431/1480 train_time:228169ms step_avg:160.57ms step:1432/1480 train_time:228345ms step_avg:160.58ms step:1433/1480 train_time:228524ms step_avg:160.59ms step:1434/1480 train_time:228704ms step_avg:160.61ms step:1435/1480 train_time:228879ms step_avg:160.62ms step:1436/1480 train_time:229052ms step_avg:160.63ms step:1437/1480 train_time:229222ms step_avg:160.63ms step:1438/1480 train_time:229391ms step_avg:160.64ms 
step:1439/1480 train_time:229565ms step_avg:160.65ms step:1440/1480 train_time:229735ms step_avg:160.65ms step:1441/1480 train_time:229906ms step_avg:160.66ms step:1442/1480 train_time:230085ms step_avg:160.67ms step:1443/1480 train_time:230274ms step_avg:160.69ms step:1444/1480 train_time:230445ms step_avg:160.70ms step:1445/1480 train_time:230616ms step_avg:160.71ms step:1446/1480 train_time:230792ms step_avg:160.72ms step:1447/1480 train_time:230972ms step_avg:160.73ms step:1448/1480 train_time:231145ms step_avg:160.74ms step:1449/1480 train_time:231317ms step_avg:160.75ms step:1450/1480 train_time:231489ms step_avg:160.76ms step:1451/1480 train_time:231661ms step_avg:160.76ms step:1452/1480 train_time:231834ms step_avg:160.77ms step:1453/1480 train_time:232004ms step_avg:160.78ms step:1454/1480 train_time:232177ms step_avg:160.79ms step:1455/1480 train_time:232357ms step_avg:160.80ms step:1456/1480 train_time:232528ms step_avg:160.81ms step:1457/1480 train_time:232698ms step_avg:160.81ms step:1458/1480 train_time:232869ms step_avg:160.82ms step:1459/1480 train_time:233047ms step_avg:160.83ms step:1460/1480 train_time:233220ms step_avg:160.84ms step:1461/1480 train_time:233395ms step_avg:160.85ms step:1462/1480 train_time:233565ms step_avg:160.86ms step:1463/1480 train_time:233742ms step_avg:160.87ms step:1464/1480 train_time:233916ms step_avg:160.88ms step:1465/1480 train_time:234087ms step_avg:160.88ms step:1466/1480 train_time:234258ms step_avg:160.89ms step:1467/1480 train_time:234432ms step_avg:160.90ms step:1468/1480 train_time:234601ms step_avg:160.91ms step:1469/1480 train_time:234774ms step_avg:160.91ms step:1470/1480 train_time:234957ms step_avg:160.93ms step:1471/1480 train_time:235145ms step_avg:160.95ms step:1472/1480 train_time:235326ms step_avg:160.96ms step:1473/1480 train_time:235497ms step_avg:160.97ms step:1474/1480 train_time:235673ms step_avg:160.98ms step:1475/1480 train_time:235852ms step_avg:160.99ms step:1476/1480 train_time:236025ms 
step_avg:161.00ms step:1477/1480 train_time:236207ms step_avg:161.01ms step:1478/1480 train_time:236389ms step_avg:161.03ms step:1479/1480 train_time:236564ms step_avg:161.04ms step:1480/1480 train_time:236735ms step_avg:161.04ms step:1480/1480 val_loss:3.2814 train_time:236806ms step_avg:161.09ms