import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of channels: base ** (-2i / dim)
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as 4-D here (dim 1 = sequence, dim 3 = channels);
        # CausalSelfAttention passes (B, T, n_head, head_dim).
        seq_len = x.shape[1]
        # the tables are rebuilt only when the sequence length changes
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # the two halves of the channel dimension are rotated against each other
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """Attend over x; `vi` is mixed into the values and `block_mask` restricts attention."""
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # learned mix of this layer's values with the token value embedding
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head dimension moved in front of the sequence dimension
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix with x0, then pre-norm attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # mixing weights for (x, x0); initialised so that x0 contributes nothing
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Architecture hyperparameters consumed by GPT and Block."""
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) with tanh

class GPT(nn.Module):
    """GPT model with U-net style skip connections that returns the training loss directly."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            # (one n_embd-wide slice per encoder layer; forward() splits it with chunk())
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss of `target` given the 1-D token sequence `idx`.

        Attention is causal, confined to one document, and limited to the last
        `sliding_window` positions. len(idx) must be a multiple of BLOCK_SIZE (the reshape
        below requires it).
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running document id per token; token 50256 is treated as the document delimiter
        # (presumably GPT-2's end-of-text id -- confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        # lowest / highest document id found in each block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document, and inside the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the rule above, computed on an (S/BLOCK_SIZE)^2 grid
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can interact only if their document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window size rounded up to whole blocks
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort of the 0/1 mask: allowed kv-block indices come first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton dimensions (indexed as b and h by the mask_mod signature)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding slice per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() pairs decoder layer i with the encoder outputs in reverse order
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the header of `file` into a pinned tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header (256 int32 values)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Iterates over token shards; each process reads its own slice of T+1 tokens per step."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full step for all processes
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # wraps around to the first shard after the last one
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU: T tokens each, targets shifted by one position."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Run configuration for data, optimization, evaluation and logging."""
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # the log file lives directly under logs/, so that directory has to exist before
    # open() is called; without this a fresh checkout fails with FileNotFoundError
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append `s` to the log file and, unless `logonly` is set, also print it to stdout.

    Does nothing on processes other than the master process.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not implemented: exactly one sequence per device per step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 10:47:41 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 130W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 123W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 97W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 108W / 700W | 37MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 116W / 700W | 117MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI 
CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22752ms step_avg:nanms step:2/1480 train_time:22838ms step_avg:nanms step:3/1480 train_time:22977ms step_avg:nanms step:4/1480 train_time:23117ms step_avg:nanms step:5/1480 train_time:23259ms step_avg:nanms step:6/1480 train_time:23399ms step_avg:nanms step:7/1480 train_time:23540ms step_avg:nanms step:8/1480 train_time:23683ms step_avg:nanms step:9/1480 train_time:23826ms step_avg:nanms step:10/1480 train_time:23971ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:286ms step_avg:nanms step:13/1480 train_time:427ms step_avg:142.29ms step:14/1480 train_time:568ms step_avg:141.95ms step:15/1480 train_time:711ms step_avg:142.13ms step:16/1480 train_time:853ms step_avg:142.15ms step:17/1480 train_time:998ms step_avg:142.61ms step:18/1480 train_time:1142ms step_avg:142.78ms step:19/1480 train_time:1286ms step_avg:142.89ms step:20/1480 train_time:1428ms step_avg:142.79ms step:21/1480 train_time:1569ms step_avg:142.67ms step:22/1480 train_time:1713ms step_avg:142.75ms step:23/1480 train_time:1856ms step_avg:142.78ms step:24/1480 train_time:2002ms step_avg:143.00ms step:25/1480 train_time:2146ms step_avg:143.09ms step:26/1480 train_time:2290ms step_avg:143.12ms step:27/1480 train_time:2431ms step_avg:142.98ms step:28/1480 train_time:2573ms step_avg:142.93ms step:29/1480 train_time:2715ms 
step_avg:142.89ms step:30/1480 train_time:2858ms step_avg:142.89ms step:31/1480 train_time:3002ms step_avg:142.97ms step:32/1480 train_time:3146ms step_avg:143.01ms step:33/1480 train_time:3290ms step_avg:143.04ms step:34/1480 train_time:3430ms step_avg:142.93ms step:35/1480 train_time:3572ms step_avg:142.87ms step:36/1480 train_time:3715ms step_avg:142.88ms step:37/1480 train_time:3858ms step_avg:142.88ms step:38/1480 train_time:4002ms step_avg:142.94ms step:39/1480 train_time:4146ms step_avg:142.97ms step:40/1480 train_time:4289ms step_avg:142.98ms step:41/1480 train_time:4430ms step_avg:142.92ms step:42/1480 train_time:4572ms step_avg:142.86ms step:43/1480 train_time:4713ms step_avg:142.81ms step:44/1480 train_time:4854ms step_avg:142.76ms step:45/1480 train_time:4997ms step_avg:142.77ms step:46/1480 train_time:5140ms step_avg:142.79ms step:47/1480 train_time:5285ms step_avg:142.84ms step:48/1480 train_time:5428ms step_avg:142.84ms step:49/1480 train_time:5569ms step_avg:142.80ms step:50/1480 train_time:5711ms step_avg:142.79ms step:51/1480 train_time:5853ms step_avg:142.76ms step:52/1480 train_time:5997ms step_avg:142.79ms step:53/1480 train_time:6141ms step_avg:142.81ms step:54/1480 train_time:6286ms step_avg:142.87ms step:55/1480 train_time:6429ms step_avg:142.87ms step:56/1480 train_time:6571ms step_avg:142.84ms step:57/1480 train_time:6712ms step_avg:142.81ms step:58/1480 train_time:6853ms step_avg:142.77ms step:59/1480 train_time:6995ms step_avg:142.76ms step:60/1480 train_time:7137ms step_avg:142.75ms step:61/1480 train_time:7284ms step_avg:142.81ms step:62/1480 train_time:7428ms step_avg:142.85ms step:63/1480 train_time:7570ms step_avg:142.83ms step:64/1480 train_time:7712ms step_avg:142.81ms step:65/1480 train_time:7853ms step_avg:142.78ms step:66/1480 train_time:7996ms step_avg:142.79ms step:67/1480 train_time:8138ms step_avg:142.77ms step:68/1480 train_time:8281ms step_avg:142.77ms step:69/1480 train_time:8425ms step_avg:142.79ms step:70/1480 
train_time:8567ms step_avg:142.78ms step:71/1480 train_time:8709ms step_avg:142.78ms step:72/1480 train_time:8851ms step_avg:142.76ms step:73/1480 train_time:8993ms step_avg:142.75ms step:74/1480 train_time:9134ms step_avg:142.72ms step:75/1480 train_time:9278ms step_avg:142.73ms step:76/1480 train_time:9422ms step_avg:142.76ms step:77/1480 train_time:9566ms step_avg:142.77ms step:78/1480 train_time:9708ms step_avg:142.77ms step:79/1480 train_time:9850ms step_avg:142.75ms step:80/1480 train_time:9991ms step_avg:142.73ms step:81/1480 train_time:10133ms step_avg:142.72ms step:82/1480 train_time:10277ms step_avg:142.73ms step:83/1480 train_time:10421ms step_avg:142.75ms step:84/1480 train_time:10565ms step_avg:142.77ms step:85/1480 train_time:10707ms step_avg:142.76ms step:86/1480 train_time:10849ms step_avg:142.75ms step:87/1480 train_time:10991ms step_avg:142.74ms step:88/1480 train_time:11131ms step_avg:142.71ms step:89/1480 train_time:11275ms step_avg:142.73ms step:90/1480 train_time:11421ms step_avg:142.77ms step:91/1480 train_time:11564ms step_avg:142.77ms step:92/1480 train_time:11707ms step_avg:142.77ms step:93/1480 train_time:11848ms step_avg:142.75ms step:94/1480 train_time:11990ms step_avg:142.74ms step:95/1480 train_time:12131ms step_avg:142.71ms step:96/1480 train_time:12272ms step_avg:142.69ms step:97/1480 train_time:12415ms step_avg:142.70ms step:98/1480 train_time:12558ms step_avg:142.70ms step:99/1480 train_time:12702ms step_avg:142.72ms step:100/1480 train_time:12845ms step_avg:142.72ms step:101/1480 train_time:12987ms step_avg:142.71ms step:102/1480 train_time:13128ms step_avg:142.70ms step:103/1480 train_time:13270ms step_avg:142.69ms step:104/1480 train_time:13413ms step_avg:142.69ms step:105/1480 train_time:13556ms step_avg:142.70ms step:106/1480 train_time:13700ms step_avg:142.71ms step:107/1480 train_time:13844ms step_avg:142.72ms step:108/1480 train_time:13988ms step_avg:142.73ms step:109/1480 train_time:14129ms step_avg:142.71ms step:110/1480 
train_time:14271ms step_avg:142.71ms step:111/1480 train_time:14415ms step_avg:142.73ms step:112/1480 train_time:14564ms step_avg:142.79ms step:113/1480 train_time:14712ms step_avg:142.83ms step:114/1480 train_time:14859ms step_avg:142.87ms step:115/1480 train_time:15006ms step_avg:142.92ms step:116/1480 train_time:15152ms step_avg:142.94ms step:117/1480 train_time:15298ms step_avg:142.97ms step:118/1480 train_time:15445ms step_avg:143.01ms step:119/1480 train_time:15592ms step_avg:143.05ms step:120/1480 train_time:15739ms step_avg:143.08ms step:121/1480 train_time:15887ms step_avg:143.12ms step:122/1480 train_time:16033ms step_avg:143.15ms step:123/1480 train_time:16181ms step_avg:143.20ms step:124/1480 train_time:16329ms step_avg:143.23ms step:125/1480 train_time:16475ms step_avg:143.26ms step:125/1480 val_loss:4.4345 train_time:16532ms step_avg:143.76ms step:126/1480 train_time:16629ms step_avg:143.35ms step:127/1480 train_time:16779ms step_avg:143.41ms step:128/1480 train_time:16924ms step_avg:143.43ms step:129/1480 train_time:17069ms step_avg:143.44ms step:130/1480 train_time:17216ms step_avg:143.47ms step:131/1480 train_time:17362ms step_avg:143.49ms step:132/1480 train_time:17507ms step_avg:143.50ms step:133/1480 train_time:17656ms step_avg:143.55ms step:134/1480 train_time:17806ms step_avg:143.59ms step:135/1480 train_time:17954ms step_avg:143.63ms step:136/1480 train_time:18100ms step_avg:143.65ms step:137/1480 train_time:18246ms step_avg:143.67ms step:138/1480 train_time:18392ms step_avg:143.69ms step:139/1480 train_time:18539ms step_avg:143.72ms step:140/1480 train_time:18686ms step_avg:143.74ms step:141/1480 train_time:18835ms step_avg:143.78ms step:142/1480 train_time:18982ms step_avg:143.81ms step:143/1480 train_time:19128ms step_avg:143.82ms step:144/1480 train_time:19275ms step_avg:143.84ms step:145/1480 train_time:19423ms step_avg:143.87ms step:146/1480 train_time:19568ms step_avg:143.88ms step:147/1480 train_time:19715ms step_avg:143.90ms 
step:148/1480 train_time:19863ms step_avg:143.93ms step:149/1480 train_time:20010ms step_avg:143.96ms step:150/1480 train_time:20158ms step_avg:143.99ms step:151/1480 train_time:20304ms step_avg:144.00ms step:152/1480 train_time:20451ms step_avg:144.02ms step:153/1480 train_time:20598ms step_avg:144.04ms step:154/1480 train_time:20744ms step_avg:144.06ms step:155/1480 train_time:20891ms step_avg:144.08ms step:156/1480 train_time:21039ms step_avg:144.10ms step:157/1480 train_time:21185ms step_avg:144.11ms step:158/1480 train_time:21333ms step_avg:144.14ms step:159/1480 train_time:21480ms step_avg:144.16ms step:160/1480 train_time:21625ms step_avg:144.17ms step:161/1480 train_time:21771ms step_avg:144.18ms step:162/1480 train_time:21920ms step_avg:144.21ms step:163/1480 train_time:22066ms step_avg:144.22ms step:164/1480 train_time:22214ms step_avg:144.25ms step:165/1480 train_time:22362ms step_avg:144.27ms step:166/1480 train_time:22509ms step_avg:144.29ms step:167/1480 train_time:22657ms step_avg:144.31ms step:168/1480 train_time:22803ms step_avg:144.32ms step:169/1480 train_time:22950ms step_avg:144.34ms step:170/1480 train_time:23098ms step_avg:144.36ms step:171/1480 train_time:23245ms step_avg:144.38ms step:172/1480 train_time:23392ms step_avg:144.39ms step:173/1480 train_time:23540ms step_avg:144.42ms step:174/1480 train_time:23687ms step_avg:144.43ms step:175/1480 train_time:23835ms step_avg:144.45ms step:176/1480 train_time:23982ms step_avg:144.47ms step:177/1480 train_time:24129ms step_avg:144.49ms step:178/1480 train_time:24276ms step_avg:144.50ms step:179/1480 train_time:24423ms step_avg:144.51ms step:180/1480 train_time:24568ms step_avg:144.52ms step:181/1480 train_time:24716ms step_avg:144.54ms step:182/1480 train_time:24864ms step_avg:144.56ms step:183/1480 train_time:25011ms step_avg:144.57ms step:184/1480 train_time:25160ms step_avg:144.60ms step:185/1480 train_time:25306ms step_avg:144.60ms step:186/1480 train_time:25454ms step_avg:144.62ms 
step:187/1480 train_time:25601ms step_avg:144.64ms step:188/1480 train_time:25746ms step_avg:144.64ms step:189/1480 train_time:25893ms step_avg:144.66ms step:190/1480 train_time:26041ms step_avg:144.67ms step:191/1480 train_time:26188ms step_avg:144.68ms step:192/1480 train_time:26335ms step_avg:144.70ms step:193/1480 train_time:26482ms step_avg:144.71ms step:194/1480 train_time:26629ms step_avg:144.72ms step:195/1480 train_time:26777ms step_avg:144.74ms step:196/1480 train_time:26923ms step_avg:144.75ms step:197/1480 train_time:27070ms step_avg:144.76ms step:198/1480 train_time:27218ms step_avg:144.78ms step:199/1480 train_time:27364ms step_avg:144.78ms step:200/1480 train_time:27512ms step_avg:144.80ms step:201/1480 train_time:27659ms step_avg:144.81ms step:202/1480 train_time:27805ms step_avg:144.82ms step:203/1480 train_time:27954ms step_avg:144.84ms step:204/1480 train_time:28101ms step_avg:144.85ms step:205/1480 train_time:28248ms step_avg:144.86ms step:206/1480 train_time:28396ms step_avg:144.88ms step:207/1480 train_time:28543ms step_avg:144.89ms step:208/1480 train_time:28689ms step_avg:144.89ms step:209/1480 train_time:28837ms step_avg:144.91ms step:210/1480 train_time:28984ms step_avg:144.92ms step:211/1480 train_time:29131ms step_avg:144.93ms step:212/1480 train_time:29279ms step_avg:144.94ms step:213/1480 train_time:29425ms step_avg:144.95ms step:214/1480 train_time:29572ms step_avg:144.96ms step:215/1480 train_time:29720ms step_avg:144.97ms step:216/1480 train_time:29865ms step_avg:144.98ms step:217/1480 train_time:30013ms step_avg:144.99ms step:218/1480 train_time:30161ms step_avg:145.00ms step:219/1480 train_time:30307ms step_avg:145.01ms step:220/1480 train_time:30456ms step_avg:145.03ms step:221/1480 train_time:30604ms step_avg:145.04ms step:222/1480 train_time:30756ms step_avg:145.08ms step:223/1480 train_time:30906ms step_avg:145.10ms step:224/1480 train_time:31058ms step_avg:145.13ms step:225/1480 train_time:31208ms step_avg:145.15ms 
step:226/1480 train_time:31359ms step_avg:145.18ms step:227/1480 train_time:31509ms step_avg:145.20ms step:228/1480 train_time:31659ms step_avg:145.23ms step:229/1480 train_time:31810ms step_avg:145.25ms step:230/1480 train_time:31962ms step_avg:145.28ms step:231/1480 train_time:32114ms step_avg:145.31ms step:232/1480 train_time:32264ms step_avg:145.33ms step:233/1480 train_time:32415ms step_avg:145.36ms step:234/1480 train_time:32565ms step_avg:145.38ms step:235/1480 train_time:32718ms step_avg:145.41ms step:236/1480 train_time:32868ms step_avg:145.43ms step:237/1480 train_time:33018ms step_avg:145.45ms step:238/1480 train_time:33168ms step_avg:145.47ms step:239/1480 train_time:33319ms step_avg:145.50ms step:240/1480 train_time:33469ms step_avg:145.52ms step:241/1480 train_time:33619ms step_avg:145.54ms step:242/1480 train_time:33770ms step_avg:145.56ms step:243/1480 train_time:33921ms step_avg:145.58ms step:244/1480 train_time:34071ms step_avg:145.60ms step:245/1480 train_time:34222ms step_avg:145.63ms step:246/1480 train_time:34372ms step_avg:145.65ms step:247/1480 train_time:34523ms step_avg:145.67ms step:248/1480 train_time:34673ms step_avg:145.68ms step:249/1480 train_time:34824ms step_avg:145.71ms step:250/1480 train_time:34973ms step_avg:145.72ms step:250/1480 val_loss:3.9865 train_time:35034ms step_avg:145.97ms step:251/1480 train_time:35131ms step_avg:145.77ms step:252/1480 train_time:35282ms step_avg:145.79ms step:253/1480 train_time:35433ms step_avg:145.81ms step:254/1480 train_time:35582ms step_avg:145.83ms step:255/1480 train_time:35731ms step_avg:145.84ms step:256/1480 train_time:35880ms step_avg:145.85ms step:257/1480 train_time:36030ms step_avg:145.87ms step:258/1480 train_time:36182ms step_avg:145.89ms step:259/1480 train_time:36334ms step_avg:145.92ms step:260/1480 train_time:36484ms step_avg:145.94ms step:261/1480 train_time:36635ms step_avg:145.96ms step:262/1480 train_time:36785ms step_avg:145.97ms step:263/1480 train_time:36935ms 
step_avg:145.99ms step:264/1480 train_time:37084ms step_avg:146.00ms step:265/1480 train_time:37236ms step_avg:146.02ms step:266/1480 train_time:37387ms step_avg:146.04ms step:267/1480 train_time:37539ms step_avg:146.07ms step:268/1480 train_time:37689ms step_avg:146.08ms step:269/1480 train_time:37839ms step_avg:146.10ms step:270/1480 train_time:37988ms step_avg:146.11ms step:271/1480 train_time:38138ms step_avg:146.12ms step:272/1480 train_time:38287ms step_avg:146.13ms step:273/1480 train_time:38438ms step_avg:146.15ms step:274/1480 train_time:38587ms step_avg:146.16ms step:275/1480 train_time:38738ms step_avg:146.18ms step:276/1480 train_time:38887ms step_avg:146.19ms step:277/1480 train_time:39037ms step_avg:146.21ms step:278/1480 train_time:39186ms step_avg:146.22ms step:279/1480 train_time:39337ms step_avg:146.23ms step:280/1480 train_time:39488ms step_avg:146.25ms step:281/1480 train_time:39638ms step_avg:146.27ms step:282/1480 train_time:39789ms step_avg:146.28ms step:283/1480 train_time:39940ms step_avg:146.30ms step:284/1480 train_time:40091ms step_avg:146.32ms step:285/1480 train_time:40242ms step_avg:146.33ms step:286/1480 train_time:40393ms step_avg:146.35ms step:287/1480 train_time:40544ms step_avg:146.37ms step:288/1480 train_time:40695ms step_avg:146.39ms step:289/1480 train_time:40845ms step_avg:146.40ms step:290/1480 train_time:40996ms step_avg:146.41ms step:291/1480 train_time:41145ms step_avg:146.42ms step:292/1480 train_time:41296ms step_avg:146.44ms step:293/1480 train_time:41446ms step_avg:146.45ms step:294/1480 train_time:41597ms step_avg:146.47ms step:295/1480 train_time:41747ms step_avg:146.48ms step:296/1480 train_time:41898ms step_avg:146.50ms step:297/1480 train_time:42049ms step_avg:146.51ms step:298/1480 train_time:42198ms step_avg:146.52ms step:299/1480 train_time:42348ms step_avg:146.53ms step:300/1480 train_time:42499ms step_avg:146.55ms step:301/1480 train_time:42649ms step_avg:146.56ms step:302/1480 train_time:42799ms 
step_avg:146.57ms step:303/1480 train_time:42950ms step_avg:146.59ms step:304/1480 train_time:43100ms step_avg:146.60ms step:305/1480 train_time:43252ms step_avg:146.62ms step:306/1480 train_time:43401ms step_avg:146.62ms step:307/1480 train_time:43552ms step_avg:146.64ms step:308/1480 train_time:43701ms step_avg:146.65ms step:309/1480 train_time:43852ms step_avg:146.66ms step:310/1480 train_time:44002ms step_avg:146.67ms step:311/1480 train_time:44153ms step_avg:146.69ms step:312/1480 train_time:44302ms step_avg:146.70ms step:313/1480 train_time:44453ms step_avg:146.71ms step:314/1480 train_time:44603ms step_avg:146.72ms step:315/1480 train_time:44753ms step_avg:146.73ms step:316/1480 train_time:44902ms step_avg:146.74ms step:317/1480 train_time:45054ms step_avg:146.75ms step:318/1480 train_time:45203ms step_avg:146.76ms step:319/1480 train_time:45354ms step_avg:146.78ms step:320/1480 train_time:45504ms step_avg:146.79ms step:321/1480 train_time:45655ms step_avg:146.80ms step:322/1480 train_time:45805ms step_avg:146.81ms step:323/1480 train_time:45955ms step_avg:146.82ms step:324/1480 train_time:46105ms step_avg:146.83ms step:325/1480 train_time:46256ms step_avg:146.84ms step:326/1480 train_time:46406ms step_avg:146.85ms step:327/1480 train_time:46557ms step_avg:146.87ms step:328/1480 train_time:46707ms step_avg:146.88ms step:329/1480 train_time:46858ms step_avg:146.89ms step:330/1480 train_time:47011ms step_avg:146.91ms step:331/1480 train_time:47167ms step_avg:146.94ms step:332/1480 train_time:47319ms step_avg:146.95ms step:333/1480 train_time:47475ms step_avg:146.98ms step:334/1480 train_time:47628ms step_avg:147.00ms step:335/1480 train_time:47781ms step_avg:147.02ms step:336/1480 train_time:47935ms step_avg:147.04ms step:337/1480 train_time:48088ms step_avg:147.06ms step:338/1480 train_time:48242ms step_avg:147.08ms step:339/1480 train_time:48395ms step_avg:147.10ms step:340/1480 train_time:48549ms step_avg:147.12ms step:341/1480 train_time:48701ms 
step_avg:147.13ms step:342/1480 train_time:48855ms step_avg:147.15ms step:343/1480 train_time:49010ms step_avg:147.18ms step:344/1480 train_time:49164ms step_avg:147.20ms step:345/1480 train_time:49318ms step_avg:147.22ms step:346/1480 train_time:49473ms step_avg:147.24ms step:347/1480 train_time:49628ms step_avg:147.26ms step:348/1480 train_time:49782ms step_avg:147.28ms step:349/1480 train_time:49936ms step_avg:147.30ms step:350/1480 train_time:50090ms step_avg:147.32ms step:351/1480 train_time:50245ms step_avg:147.35ms step:352/1480 train_time:50399ms step_avg:147.37ms step:353/1480 train_time:50553ms step_avg:147.38ms step:354/1480 train_time:50707ms step_avg:147.41ms step:355/1480 train_time:50863ms step_avg:147.43ms step:356/1480 train_time:51015ms step_avg:147.44ms step:357/1480 train_time:51173ms step_avg:147.47ms step:358/1480 train_time:51327ms step_avg:147.49ms step:359/1480 train_time:51480ms step_avg:147.51ms step:360/1480 train_time:51635ms step_avg:147.53ms step:361/1480 train_time:51790ms step_avg:147.55ms step:362/1480 train_time:51942ms step_avg:147.56ms step:363/1480 train_time:52095ms step_avg:147.58ms step:364/1480 train_time:52250ms step_avg:147.60ms step:365/1480 train_time:52404ms step_avg:147.62ms step:366/1480 train_time:52558ms step_avg:147.63ms step:367/1480 train_time:52711ms step_avg:147.65ms step:368/1480 train_time:52865ms step_avg:147.67ms step:369/1480 train_time:53017ms step_avg:147.68ms step:370/1480 train_time:53172ms step_avg:147.70ms step:371/1480 train_time:53326ms step_avg:147.72ms step:372/1480 train_time:53479ms step_avg:147.73ms step:373/1480 train_time:53633ms step_avg:147.75ms step:374/1480 train_time:53788ms step_avg:147.77ms step:375/1480 train_time:53941ms step_avg:147.78ms step:375/1480 val_loss:3.8009 train_time:54001ms step_avg:147.95ms step:376/1480 train_time:54098ms step_avg:147.81ms step:377/1480 train_time:54252ms step_avg:147.83ms step:378/1480 train_time:54406ms step_avg:147.84ms step:379/1480 
train_time:54558ms step_avg:147.85ms step:380/1480 train_time:54711ms step_avg:147.87ms step:381/1480 train_time:54865ms step_avg:147.88ms step:382/1480 train_time:55019ms step_avg:147.90ms step:383/1480 train_time:55174ms step_avg:147.92ms step:384/1480 train_time:55329ms step_avg:147.94ms step:385/1480 train_time:55482ms step_avg:147.95ms step:386/1480 train_time:55636ms step_avg:147.97ms step:387/1480 train_time:55788ms step_avg:147.98ms step:388/1480 train_time:55941ms step_avg:147.99ms step:389/1480 train_time:56094ms step_avg:148.01ms step:390/1480 train_time:56250ms step_avg:148.03ms step:391/1480 train_time:56403ms step_avg:148.04ms step:392/1480 train_time:56557ms step_avg:148.05ms step:393/1480 train_time:56711ms step_avg:148.07ms step:394/1480 train_time:56864ms step_avg:148.08ms step:395/1480 train_time:57016ms step_avg:148.09ms step:396/1480 train_time:57171ms step_avg:148.11ms step:397/1480 train_time:57326ms step_avg:148.13ms step:398/1480 train_time:57478ms step_avg:148.14ms step:399/1480 train_time:57633ms step_avg:148.16ms step:400/1480 train_time:57788ms step_avg:148.17ms step:401/1480 train_time:57941ms step_avg:148.19ms step:402/1480 train_time:58094ms step_avg:148.20ms step:403/1480 train_time:58250ms step_avg:148.22ms step:404/1480 train_time:58405ms step_avg:148.24ms step:405/1480 train_time:58560ms step_avg:148.25ms step:406/1480 train_time:58713ms step_avg:148.27ms step:407/1480 train_time:58867ms step_avg:148.28ms step:408/1480 train_time:59020ms step_avg:148.29ms step:409/1480 train_time:59173ms step_avg:148.30ms step:410/1480 train_time:59326ms step_avg:148.32ms step:411/1480 train_time:59480ms step_avg:148.33ms step:412/1480 train_time:59634ms step_avg:148.34ms step:413/1480 train_time:59788ms step_avg:148.36ms step:414/1480 train_time:59943ms step_avg:148.37ms step:415/1480 train_time:60097ms step_avg:148.39ms step:416/1480 train_time:60250ms step_avg:148.40ms step:417/1480 train_time:60405ms step_avg:148.41ms step:418/1480 
train_time:60558ms step_avg:148.43ms step:419/1480 train_time:60711ms step_avg:148.44ms step:420/1480 train_time:60866ms step_avg:148.45ms step:421/1480 train_time:61018ms step_avg:148.46ms step:422/1480 train_time:61172ms step_avg:148.48ms step:423/1480 train_time:61324ms step_avg:148.48ms step:424/1480 train_time:61477ms step_avg:148.50ms step:425/1480 train_time:61631ms step_avg:148.51ms step:426/1480 train_time:61785ms step_avg:148.52ms step:427/1480 train_time:61939ms step_avg:148.53ms step:428/1480 train_time:62092ms step_avg:148.54ms step:429/1480 train_time:62245ms step_avg:148.56ms step:430/1480 train_time:62399ms step_avg:148.57ms step:431/1480 train_time:62553ms step_avg:148.58ms step:432/1480 train_time:62707ms step_avg:148.59ms step:433/1480 train_time:62861ms step_avg:148.61ms step:434/1480 train_time:63014ms step_avg:148.62ms step:435/1480 train_time:63168ms step_avg:148.63ms step:436/1480 train_time:63322ms step_avg:148.64ms step:437/1480 train_time:63474ms step_avg:148.65ms step:438/1480 train_time:63627ms step_avg:148.66ms step:439/1480 train_time:63781ms step_avg:148.67ms step:440/1480 train_time:63936ms step_avg:148.69ms step:441/1480 train_time:64092ms step_avg:148.71ms step:442/1480 train_time:64252ms step_avg:148.73ms step:443/1480 train_time:64409ms step_avg:148.75ms step:444/1480 train_time:64565ms step_avg:148.77ms step:445/1480 train_time:64720ms step_avg:148.78ms step:446/1480 train_time:64876ms step_avg:148.80ms step:447/1480 train_time:65031ms step_avg:148.81ms step:448/1480 train_time:65188ms step_avg:148.83ms step:449/1480 train_time:65347ms step_avg:148.85ms step:450/1480 train_time:65504ms step_avg:148.87ms step:451/1480 train_time:65661ms step_avg:148.89ms step:452/1480 train_time:65817ms step_avg:148.91ms step:453/1480 train_time:65973ms step_avg:148.92ms step:454/1480 train_time:66130ms step_avg:148.94ms step:455/1480 train_time:66288ms step_avg:148.96ms step:456/1480 train_time:66446ms step_avg:148.98ms step:457/1480 
train_time:66602ms step_avg:149.00ms step:458/1480 train_time:66757ms step_avg:149.01ms step:459/1480 train_time:66915ms step_avg:149.03ms step:460/1480 train_time:67071ms step_avg:149.05ms step:461/1480 train_time:67229ms step_avg:149.07ms step:462/1480 train_time:67386ms step_avg:149.08ms step:463/1480 train_time:67544ms step_avg:149.10ms step:464/1480 train_time:67701ms step_avg:149.12ms step:465/1480 train_time:67856ms step_avg:149.13ms step:466/1480 train_time:68011ms step_avg:149.15ms step:467/1480 train_time:68170ms step_avg:149.17ms step:468/1480 train_time:68327ms step_avg:149.19ms step:469/1480 train_time:68483ms step_avg:149.20ms step:470/1480 train_time:68641ms step_avg:149.22ms step:471/1480 train_time:68798ms step_avg:149.24ms step:472/1480 train_time:68955ms step_avg:149.25ms step:473/1480 train_time:69111ms step_avg:149.27ms step:474/1480 train_time:69268ms step_avg:149.28ms step:475/1480 train_time:69424ms step_avg:149.30ms step:476/1480 train_time:69581ms step_avg:149.31ms step:477/1480 train_time:69738ms step_avg:149.33ms step:478/1480 train_time:69895ms step_avg:149.35ms step:479/1480 train_time:70051ms step_avg:149.36ms step:480/1480 train_time:70209ms step_avg:149.38ms step:481/1480 train_time:70365ms step_avg:149.39ms step:482/1480 train_time:70519ms step_avg:149.41ms step:483/1480 train_time:70675ms step_avg:149.42ms step:484/1480 train_time:70832ms step_avg:149.43ms step:485/1480 train_time:70990ms step_avg:149.45ms step:486/1480 train_time:71148ms step_avg:149.47ms step:487/1480 train_time:71305ms step_avg:149.49ms step:488/1480 train_time:71462ms step_avg:149.50ms step:489/1480 train_time:71616ms step_avg:149.51ms step:490/1480 train_time:71773ms step_avg:149.53ms step:491/1480 train_time:71930ms step_avg:149.54ms step:492/1480 train_time:72090ms step_avg:149.56ms step:493/1480 train_time:72248ms step_avg:149.58ms step:494/1480 train_time:72405ms step_avg:149.60ms step:495/1480 train_time:72563ms step_avg:149.61ms step:496/1480 
train_time:72720ms step_avg:149.63ms step:497/1480 train_time:72876ms step_avg:149.64ms step:498/1480 train_time:73034ms step_avg:149.66ms step:499/1480 train_time:73191ms step_avg:149.68ms step:500/1480 train_time:73350ms step_avg:149.69ms step:500/1480 val_loss:3.6815 train_time:73411ms step_avg:149.82ms step:501/1480 train_time:73508ms step_avg:149.71ms step:502/1480 train_time:73665ms step_avg:149.73ms step:503/1480 train_time:73821ms step_avg:149.74ms step:504/1480 train_time:73977ms step_avg:149.75ms step:505/1480 train_time:74132ms step_avg:149.76ms step:506/1480 train_time:74287ms step_avg:149.77ms step:507/1480 train_time:74443ms step_avg:149.78ms step:508/1480 train_time:74601ms step_avg:149.80ms step:509/1480 train_time:74759ms step_avg:149.82ms step:510/1480 train_time:74915ms step_avg:149.83ms step:511/1480 train_time:75071ms step_avg:149.84ms step:512/1480 train_time:75228ms step_avg:149.86ms step:513/1480 train_time:75384ms step_avg:149.87ms step:514/1480 train_time:75541ms step_avg:149.88ms step:515/1480 train_time:75699ms step_avg:149.90ms step:516/1480 train_time:75859ms step_avg:149.92ms step:517/1480 train_time:76018ms step_avg:149.94ms step:518/1480 train_time:76174ms step_avg:149.95ms step:519/1480 train_time:76331ms step_avg:149.96ms step:520/1480 train_time:76488ms step_avg:149.98ms step:521/1480 train_time:76643ms step_avg:149.99ms step:522/1480 train_time:76800ms step_avg:150.00ms step:523/1480 train_time:76960ms step_avg:150.02ms step:524/1480 train_time:77118ms step_avg:150.03ms step:525/1480 train_time:77275ms step_avg:150.05ms step:526/1480 train_time:77433ms step_avg:150.06ms step:527/1480 train_time:77589ms step_avg:150.08ms step:528/1480 train_time:77744ms step_avg:150.09ms step:529/1480 train_time:77901ms step_avg:150.10ms step:530/1480 train_time:78058ms step_avg:150.11ms step:531/1480 train_time:78215ms step_avg:150.13ms step:532/1480 train_time:78371ms step_avg:150.14ms step:533/1480 train_time:78526ms step_avg:150.15ms 
step:534/1480 train_time:78683ms step_avg:150.16ms step:535/1480 train_time:78841ms step_avg:150.17ms step:536/1480 train_time:78999ms step_avg:150.19ms step:537/1480 train_time:79156ms step_avg:150.20ms step:538/1480 train_time:79314ms step_avg:150.22ms step:539/1480 train_time:79471ms step_avg:150.23ms step:540/1480 train_time:79627ms step_avg:150.24ms step:541/1480 train_time:79784ms step_avg:150.25ms step:542/1480 train_time:79940ms step_avg:150.26ms step:543/1480 train_time:80098ms step_avg:150.28ms step:544/1480 train_time:80255ms step_avg:150.29ms step:545/1480 train_time:80411ms step_avg:150.30ms step:546/1480 train_time:80566ms step_avg:150.31ms step:547/1480 train_time:80722ms step_avg:150.32ms step:548/1480 train_time:80879ms step_avg:150.33ms step:549/1480 train_time:81036ms step_avg:150.35ms step:550/1480 train_time:81193ms step_avg:150.36ms step:551/1480 train_time:81352ms step_avg:150.37ms step:552/1480 train_time:81510ms step_avg:150.39ms step:553/1480 train_time:81669ms step_avg:150.40ms step:554/1480 train_time:81827ms step_avg:150.42ms step:555/1480 train_time:81986ms step_avg:150.43ms step:556/1480 train_time:82143ms step_avg:150.44ms step:557/1480 train_time:82301ms step_avg:150.46ms step:558/1480 train_time:82461ms step_avg:150.48ms step:559/1480 train_time:82621ms step_avg:150.49ms step:560/1480 train_time:82781ms step_avg:150.51ms step:561/1480 train_time:82940ms step_avg:150.53ms step:562/1480 train_time:83101ms step_avg:150.54ms step:563/1480 train_time:83259ms step_avg:150.56ms step:564/1480 train_time:83419ms step_avg:150.58ms step:565/1480 train_time:83578ms step_avg:150.59ms step:566/1480 train_time:83739ms step_avg:150.61ms step:567/1480 train_time:83899ms step_avg:150.63ms step:568/1480 train_time:84058ms step_avg:150.64ms step:569/1480 train_time:84218ms step_avg:150.66ms step:570/1480 train_time:84377ms step_avg:150.67ms step:571/1480 train_time:84537ms step_avg:150.69ms step:572/1480 train_time:84694ms step_avg:150.70ms 
step:573/1480 train_time:84855ms step_avg:150.72ms step:574/1480 train_time:85015ms step_avg:150.74ms step:575/1480 train_time:85176ms step_avg:150.75ms step:576/1480 train_time:85336ms step_avg:150.77ms step:577/1480 train_time:85494ms step_avg:150.78ms step:578/1480 train_time:85654ms step_avg:150.80ms step:579/1480 train_time:85812ms step_avg:150.81ms step:580/1480 train_time:85969ms step_avg:150.82ms step:581/1480 train_time:86128ms step_avg:150.84ms step:582/1480 train_time:86286ms step_avg:150.85ms step:583/1480 train_time:86444ms step_avg:150.86ms step:584/1480 train_time:86603ms step_avg:150.88ms step:585/1480 train_time:86762ms step_avg:150.89ms step:586/1480 train_time:86922ms step_avg:150.91ms step:587/1480 train_time:87082ms step_avg:150.92ms step:588/1480 train_time:87242ms step_avg:150.94ms step:589/1480 train_time:87400ms step_avg:150.95ms step:590/1480 train_time:87560ms step_avg:150.97ms step:591/1480 train_time:87719ms step_avg:150.98ms step:592/1480 train_time:87880ms step_avg:151.00ms step:593/1480 train_time:88042ms step_avg:151.02ms step:594/1480 train_time:88203ms step_avg:151.03ms step:595/1480 train_time:88364ms step_avg:151.05ms step:596/1480 train_time:88524ms step_avg:151.06ms step:597/1480 train_time:88682ms step_avg:151.08ms step:598/1480 train_time:88841ms step_avg:151.09ms step:599/1480 train_time:89000ms step_avg:151.10ms step:600/1480 train_time:89161ms step_avg:151.12ms step:601/1480 train_time:89320ms step_avg:151.13ms step:602/1480 train_time:89479ms step_avg:151.15ms step:603/1480 train_time:89640ms step_avg:151.16ms step:604/1480 train_time:89798ms step_avg:151.18ms step:605/1480 train_time:89960ms step_avg:151.19ms step:606/1480 train_time:90122ms step_avg:151.21ms step:607/1480 train_time:90284ms step_avg:151.23ms step:608/1480 train_time:90442ms step_avg:151.24ms step:609/1480 train_time:90601ms step_avg:151.25ms step:610/1480 train_time:90760ms step_avg:151.27ms step:611/1480 train_time:90921ms step_avg:151.28ms 
step:612/1480 train_time:91081ms step_avg:151.30ms step:613/1480 train_time:91241ms step_avg:151.31ms step:614/1480 train_time:91401ms step_avg:151.33ms step:615/1480 train_time:91561ms step_avg:151.34ms step:616/1480 train_time:91720ms step_avg:151.35ms step:617/1480 train_time:91880ms step_avg:151.37ms step:618/1480 train_time:92040ms step_avg:151.38ms step:619/1480 train_time:92200ms step_avg:151.40ms step:620/1480 train_time:92361ms step_avg:151.41ms step:621/1480 train_time:92521ms step_avg:151.43ms step:622/1480 train_time:92681ms step_avg:151.44ms step:623/1480 train_time:92842ms step_avg:151.45ms step:624/1480 train_time:93001ms step_avg:151.47ms step:625/1480 train_time:93161ms step_avg:151.48ms step:625/1480 val_loss:3.6001 train_time:93225ms step_avg:151.59ms step:626/1480 train_time:93326ms step_avg:151.50ms step:627/1480 train_time:93486ms step_avg:151.52ms step:628/1480 train_time:93645ms step_avg:151.53ms step:629/1480 train_time:93804ms step_avg:151.54ms step:630/1480 train_time:93962ms step_avg:151.55ms step:631/1480 train_time:94119ms step_avg:151.56ms step:632/1480 train_time:94278ms step_avg:151.57ms step:633/1480 train_time:94436ms step_avg:151.58ms step:634/1480 train_time:94599ms step_avg:151.60ms step:635/1480 train_time:94757ms step_avg:151.61ms step:636/1480 train_time:94915ms step_avg:151.62ms step:637/1480 train_time:95072ms step_avg:151.63ms step:638/1480 train_time:95231ms step_avg:151.64ms step:639/1480 train_time:95389ms step_avg:151.65ms step:640/1480 train_time:95549ms step_avg:151.66ms step:641/1480 train_time:95708ms step_avg:151.68ms step:642/1480 train_time:95867ms step_avg:151.69ms step:643/1480 train_time:96028ms step_avg:151.70ms step:644/1480 train_time:96188ms step_avg:151.72ms step:645/1480 train_time:96348ms step_avg:151.73ms step:646/1480 train_time:96508ms step_avg:151.74ms step:647/1480 train_time:96666ms step_avg:151.75ms step:648/1480 train_time:96829ms step_avg:151.77ms step:649/1480 train_time:96989ms 
step_avg:151.78ms step:650/1480 train_time:97149ms step_avg:151.80ms step:651/1480 train_time:97309ms step_avg:151.81ms step:652/1480 train_time:97468ms step_avg:151.82ms step:653/1480 train_time:97628ms step_avg:151.83ms step:654/1480 train_time:97788ms step_avg:151.84ms step:655/1480 train_time:97949ms step_avg:151.86ms step:656/1480 train_time:98108ms step_avg:151.87ms step:657/1480 train_time:98267ms step_avg:151.88ms step:658/1480 train_time:98428ms step_avg:151.90ms step:659/1480 train_time:98589ms step_avg:151.91ms step:660/1480 train_time:98752ms step_avg:151.93ms step:661/1480 train_time:98913ms step_avg:151.94ms step:662/1480 train_time:99073ms step_avg:151.95ms step:663/1480 train_time:99232ms step_avg:151.96ms step:664/1480 train_time:99394ms step_avg:151.98ms step:665/1480 train_time:99556ms step_avg:151.99ms step:666/1480 train_time:99716ms step_avg:152.01ms step:667/1480 train_time:99878ms step_avg:152.02ms step:668/1480 train_time:100041ms step_avg:152.04ms step:669/1480 train_time:100204ms step_avg:152.05ms step:670/1480 train_time:100364ms step_avg:152.07ms step:671/1480 train_time:100526ms step_avg:152.08ms step:672/1480 train_time:100688ms step_avg:152.10ms step:673/1480 train_time:100851ms step_avg:152.11ms step:674/1480 train_time:101013ms step_avg:152.13ms step:675/1480 train_time:101174ms step_avg:152.14ms step:676/1480 train_time:101335ms step_avg:152.15ms step:677/1480 train_time:101495ms step_avg:152.17ms step:678/1480 train_time:101656ms step_avg:152.18ms step:679/1480 train_time:101818ms step_avg:152.19ms step:680/1480 train_time:101982ms step_avg:152.21ms step:681/1480 train_time:102142ms step_avg:152.22ms step:682/1480 train_time:102306ms step_avg:152.24ms step:683/1480 train_time:102468ms step_avg:152.25ms step:684/1480 train_time:102629ms step_avg:152.27ms step:685/1480 train_time:102793ms step_avg:152.29ms step:686/1480 train_time:102954ms step_avg:152.30ms step:687/1480 train_time:103115ms step_avg:152.31ms step:688/1480 
train_time:103277ms step_avg:152.33ms step:689/1480 train_time:103438ms step_avg:152.34ms step:690/1480 train_time:103601ms step_avg:152.36ms step:691/1480 train_time:103763ms step_avg:152.37ms step:692/1480 train_time:103926ms step_avg:152.38ms step:693/1480 train_time:104088ms step_avg:152.40ms step:694/1480 train_time:104250ms step_avg:152.41ms step:695/1480 train_time:104410ms step_avg:152.42ms step:696/1480 train_time:104571ms step_avg:152.44ms step:697/1480 train_time:104734ms step_avg:152.45ms step:698/1480 train_time:104895ms step_avg:152.46ms step:699/1480 train_time:105057ms step_avg:152.48ms step:700/1480 train_time:105220ms step_avg:152.49ms step:701/1480 train_time:105379ms step_avg:152.50ms step:702/1480 train_time:105538ms step_avg:152.51ms step:703/1480 train_time:105698ms step_avg:152.52ms step:704/1480 train_time:105857ms step_avg:152.53ms step:705/1480 train_time:106020ms step_avg:152.55ms step:706/1480 train_time:106184ms step_avg:152.56ms step:707/1480 train_time:106346ms step_avg:152.58ms step:708/1480 train_time:106508ms step_avg:152.59ms step:709/1480 train_time:106669ms step_avg:152.60ms step:710/1480 train_time:106831ms step_avg:152.62ms step:711/1480 train_time:106993ms step_avg:152.63ms step:712/1480 train_time:107157ms step_avg:152.64ms step:713/1480 train_time:107320ms step_avg:152.66ms step:714/1480 train_time:107481ms step_avg:152.67ms step:715/1480 train_time:107641ms step_avg:152.68ms step:716/1480 train_time:107803ms step_avg:152.70ms step:717/1480 train_time:107965ms step_avg:152.71ms step:718/1480 train_time:108127ms step_avg:152.72ms step:719/1480 train_time:108288ms step_avg:152.73ms step:720/1480 train_time:108450ms step_avg:152.75ms step:721/1480 train_time:108611ms step_avg:152.76ms step:722/1480 train_time:108772ms step_avg:152.77ms step:723/1480 train_time:108932ms step_avg:152.78ms step:724/1480 train_time:109093ms step_avg:152.79ms step:725/1480 train_time:109256ms step_avg:152.81ms step:726/1480 train_time:109419ms 
step_avg:152.82ms step:727/1480 train_time:109582ms step_avg:152.83ms step:728/1480 train_time:109743ms step_avg:152.85ms step:729/1480 train_time:109904ms step_avg:152.86ms step:730/1480 train_time:110068ms step_avg:152.87ms step:731/1480 train_time:110231ms step_avg:152.89ms step:732/1480 train_time:110391ms step_avg:152.90ms step:733/1480 train_time:110553ms step_avg:152.91ms step:734/1480 train_time:110713ms step_avg:152.92ms step:735/1480 train_time:110874ms step_avg:152.93ms step:736/1480 train_time:111035ms step_avg:152.94ms step:737/1480 train_time:111198ms step_avg:152.95ms step:738/1480 train_time:111358ms step_avg:152.96ms step:739/1480 train_time:111518ms step_avg:152.97ms step:740/1480 train_time:111683ms step_avg:152.99ms step:741/1480 train_time:111846ms step_avg:153.00ms step:742/1480 train_time:112009ms step_avg:153.02ms step:743/1480 train_time:112170ms step_avg:153.03ms step:744/1480 train_time:112333ms step_avg:153.04ms step:745/1480 train_time:112497ms step_avg:153.06ms step:746/1480 train_time:112657ms step_avg:153.07ms step:747/1480 train_time:112818ms step_avg:153.08ms step:748/1480 train_time:112983ms step_avg:153.09ms step:749/1480 train_time:113148ms step_avg:153.11ms step:750/1480 train_time:113308ms step_avg:153.12ms step:750/1480 val_loss:3.5452 train_time:113373ms step_avg:153.21ms step:751/1480 train_time:113476ms step_avg:153.14ms step:752/1480 train_time:113637ms step_avg:153.15ms step:753/1480 train_time:113797ms step_avg:153.16ms step:754/1480 train_time:113958ms step_avg:153.17ms step:755/1480 train_time:114119ms step_avg:153.18ms step:756/1480 train_time:114280ms step_avg:153.19ms step:757/1480 train_time:114444ms step_avg:153.20ms step:758/1480 train_time:114605ms step_avg:153.22ms step:759/1480 train_time:114768ms step_avg:153.23ms step:760/1480 train_time:114929ms step_avg:153.24ms step:761/1480 train_time:115094ms step_avg:153.25ms step:762/1480 train_time:115256ms step_avg:153.27ms step:763/1480 train_time:115418ms 
step_avg:153.28ms step:764/1480 train_time:115579ms step_avg:153.29ms step:765/1480 train_time:115741ms step_avg:153.30ms step:766/1480 train_time:115903ms step_avg:153.31ms step:767/1480 train_time:116064ms step_avg:153.32ms step:768/1480 train_time:116228ms step_avg:153.33ms step:769/1480 train_time:116391ms step_avg:153.35ms step:770/1480 train_time:116555ms step_avg:153.36ms step:771/1480 train_time:116719ms step_avg:153.38ms step:772/1480 train_time:116882ms step_avg:153.39ms step:773/1480 train_time:117043ms step_avg:153.40ms step:774/1480 train_time:117206ms step_avg:153.41ms step:775/1480 train_time:117370ms step_avg:153.42ms step:776/1480 train_time:117535ms step_avg:153.44ms step:777/1480 train_time:117701ms step_avg:153.46ms step:778/1480 train_time:117863ms step_avg:153.47ms step:779/1480 train_time:118025ms step_avg:153.48ms step:780/1480 train_time:118189ms step_avg:153.49ms step:781/1480 train_time:118353ms step_avg:153.51ms step:782/1480 train_time:118517ms step_avg:153.52ms step:783/1480 train_time:118678ms step_avg:153.53ms step:784/1480 train_time:118842ms step_avg:153.54ms step:785/1480 train_time:119003ms step_avg:153.55ms step:786/1480 train_time:119170ms step_avg:153.57ms step:787/1480 train_time:119335ms step_avg:153.58ms step:788/1480 train_time:119499ms step_avg:153.60ms step:789/1480 train_time:119659ms step_avg:153.61ms step:790/1480 train_time:119824ms step_avg:153.62ms step:791/1480 train_time:119991ms step_avg:153.64ms step:792/1480 train_time:120156ms step_avg:153.65ms step:793/1480 train_time:120318ms step_avg:153.66ms step:794/1480 train_time:120481ms step_avg:153.68ms step:795/1480 train_time:120646ms step_avg:153.69ms step:796/1480 train_time:120812ms step_avg:153.70ms step:797/1480 train_time:120976ms step_avg:153.72ms step:798/1480 train_time:121139ms step_avg:153.73ms step:799/1480 train_time:121305ms step_avg:153.75ms step:800/1480 train_time:121469ms step_avg:153.76ms step:801/1480 train_time:121632ms step_avg:153.77ms 
step:802/1480 train_time:121801ms step_avg:153.79ms step:803/1480 train_time:121962ms step_avg:153.80ms step:804/1480 train_time:122125ms step_avg:153.81ms step:805/1480 train_time:122291ms step_avg:153.82ms step:806/1480 train_time:122453ms step_avg:153.84ms step:807/1480 train_time:122614ms step_avg:153.84ms step:808/1480 train_time:122779ms step_avg:153.86ms step:809/1480 train_time:122940ms step_avg:153.87ms step:810/1480 train_time:123101ms step_avg:153.88ms step:811/1480 train_time:123264ms step_avg:153.89ms step:812/1480 train_time:123427ms step_avg:153.90ms step:813/1480 train_time:123588ms step_avg:153.91ms step:814/1480 train_time:123751ms step_avg:153.92ms step:815/1480 train_time:123915ms step_avg:153.93ms step:816/1480 train_time:124082ms step_avg:153.95ms step:817/1480 train_time:124242ms step_avg:153.96ms step:818/1480 train_time:124403ms step_avg:153.96ms step:819/1480 train_time:124567ms step_avg:153.98ms step:820/1480 train_time:124730ms step_avg:153.99ms step:821/1480 train_time:124893ms step_avg:154.00ms step:822/1480 train_time:125056ms step_avg:154.01ms step:823/1480 train_time:125219ms step_avg:154.02ms step:824/1480 train_time:125381ms step_avg:154.03ms step:825/1480 train_time:125545ms step_avg:154.04ms step:826/1480 train_time:125712ms step_avg:154.06ms step:827/1480 train_time:125877ms step_avg:154.07ms step:828/1480 train_time:126039ms step_avg:154.08ms step:829/1480 train_time:126203ms step_avg:154.09ms step:830/1480 train_time:126367ms step_avg:154.11ms step:831/1480 train_time:126531ms step_avg:154.12ms step:832/1480 train_time:126696ms step_avg:154.13ms step:833/1480 train_time:126859ms step_avg:154.14ms step:834/1480 train_time:127023ms step_avg:154.15ms step:835/1480 train_time:127186ms step_avg:154.16ms step:836/1480 train_time:127351ms step_avg:154.18ms step:837/1480 train_time:127513ms step_avg:154.19ms step:838/1480 train_time:127678ms step_avg:154.20ms step:839/1480 train_time:127840ms step_avg:154.21ms step:840/1480 
train_time:128001ms step_avg:154.22ms step:841/1480 train_time:128162ms step_avg:154.23ms step:842/1480 train_time:128324ms step_avg:154.24ms step:843/1480 train_time:128485ms step_avg:154.24ms step:844/1480 train_time:128647ms step_avg:154.25ms step:845/1480 train_time:128810ms step_avg:154.26ms step:846/1480 train_time:128975ms step_avg:154.28ms step:847/1480 train_time:129139ms step_avg:154.29ms step:848/1480 train_time:129302ms step_avg:154.30ms step:849/1480 train_time:129465ms step_avg:154.31ms step:850/1480 train_time:129628ms step_avg:154.32ms step:851/1480 train_time:129795ms step_avg:154.33ms step:852/1480 train_time:129958ms step_avg:154.34ms step:853/1480 train_time:130119ms step_avg:154.35ms step:854/1480 train_time:130282ms step_avg:154.36ms step:855/1480 train_time:130445ms step_avg:154.37ms step:856/1480 train_time:130607ms step_avg:154.38ms step:857/1480 train_time:130773ms step_avg:154.40ms step:858/1480 train_time:130938ms step_avg:154.41ms step:859/1480 train_time:131101ms step_avg:154.42ms step:860/1480 train_time:131261ms step_avg:154.42ms step:861/1480 train_time:131426ms step_avg:154.44ms step:862/1480 train_time:131595ms step_avg:154.45ms step:863/1480 train_time:131762ms step_avg:154.47ms step:864/1480 train_time:131924ms step_avg:154.48ms step:865/1480 train_time:132084ms step_avg:154.48ms step:866/1480 train_time:132252ms step_avg:154.50ms step:867/1480 train_time:132415ms step_avg:154.51ms step:868/1480 train_time:132577ms step_avg:154.52ms step:869/1480 train_time:132739ms step_avg:154.53ms step:870/1480 train_time:132903ms step_avg:154.54ms step:871/1480 train_time:133069ms step_avg:154.55ms step:872/1480 train_time:133233ms step_avg:154.56ms step:873/1480 train_time:133396ms step_avg:154.57ms step:874/1480 train_time:133561ms step_avg:154.58ms step:875/1480 train_time:133725ms step_avg:154.60ms step:875/1480 val_loss:3.4980 train_time:133790ms step_avg:154.67ms step:876/1480 train_time:133889ms step_avg:154.61ms step:877/1480 
train_time:134052ms step_avg:154.62ms step:878/1480 train_time:134216ms step_avg:154.63ms step:879/1480 train_time:134380ms step_avg:154.64ms step:880/1480 train_time:134543ms step_avg:154.65ms step:881/1480 train_time:134706ms step_avg:154.66ms step:882/1480 train_time:134872ms step_avg:154.67ms step:883/1480 train_time:135038ms step_avg:154.68ms step:884/1480 train_time:135206ms step_avg:154.70ms step:885/1480 train_time:135371ms step_avg:154.71ms step:886/1480 train_time:135537ms step_avg:154.72ms step:887/1480 train_time:135706ms step_avg:154.74ms step:888/1480 train_time:135878ms step_avg:154.76ms step:889/1480 train_time:136046ms step_avg:154.77ms step:890/1480 train_time:136208ms step_avg:154.78ms step:891/1480 train_time:136373ms step_avg:154.79ms step:892/1480 train_time:136538ms step_avg:154.81ms step:893/1480 train_time:136702ms step_avg:154.82ms step:894/1480 train_time:136869ms step_avg:154.83ms step:895/1480 train_time:137036ms step_avg:154.84ms step:896/1480 train_time:137201ms step_avg:154.85ms step:897/1480 train_time:137367ms step_avg:154.87ms step:898/1480 train_time:137535ms step_avg:154.88ms step:899/1480 train_time:137701ms step_avg:154.89ms step:900/1480 train_time:137866ms step_avg:154.91ms step:901/1480 train_time:138031ms step_avg:154.92ms step:902/1480 train_time:138193ms step_avg:154.93ms step:903/1480 train_time:138366ms step_avg:154.95ms step:904/1480 train_time:138531ms step_avg:154.96ms step:905/1480 train_time:138692ms step_avg:154.96ms step:906/1480 train_time:138860ms step_avg:154.98ms step:907/1480 train_time:139028ms step_avg:154.99ms step:908/1480 train_time:139189ms step_avg:155.00ms step:909/1480 train_time:139353ms step_avg:155.01ms step:910/1480 train_time:139524ms step_avg:155.03ms step:911/1480 train_time:139689ms step_avg:155.04ms step:912/1480 train_time:139855ms step_avg:155.05ms step:913/1480 train_time:140023ms step_avg:155.06ms step:914/1480 train_time:140192ms step_avg:155.08ms step:915/1480 train_time:140363ms 
step_avg:155.10ms step:916/1480 train_time:140527ms step_avg:155.11ms step:917/1480 train_time:140689ms step_avg:155.11ms step:918/1480 train_time:140856ms step_avg:155.13ms step:919/1480 train_time:141026ms step_avg:155.14ms step:920/1480 train_time:141190ms step_avg:155.15ms step:921/1480 train_time:141356ms step_avg:155.17ms step:922/1480 train_time:141524ms step_avg:155.18ms step:923/1480 train_time:141687ms step_avg:155.19ms step:924/1480 train_time:141852ms step_avg:155.20ms step:925/1480 train_time:142018ms step_avg:155.21ms step:926/1480 train_time:142183ms step_avg:155.22ms step:927/1480 train_time:142346ms step_avg:155.23ms step:928/1480 train_time:142511ms step_avg:155.24ms step:929/1480 train_time:142676ms step_avg:155.25ms step:930/1480 train_time:142843ms step_avg:155.26ms step:931/1480 train_time:143007ms step_avg:155.27ms step:932/1480 train_time:143173ms step_avg:155.28ms step:933/1480 train_time:143339ms step_avg:155.30ms step:934/1480 train_time:143506ms step_avg:155.31ms step:935/1480 train_time:143677ms step_avg:155.33ms step:936/1480 train_time:143845ms step_avg:155.34ms step:937/1480 train_time:144014ms step_avg:155.36ms step:938/1480 train_time:144176ms step_avg:155.36ms step:939/1480 train_time:144348ms step_avg:155.38ms step:940/1480 train_time:144515ms step_avg:155.39ms step:941/1480 train_time:144678ms step_avg:155.40ms step:942/1480 train_time:144844ms step_avg:155.41ms step:943/1480 train_time:145012ms step_avg:155.43ms step:944/1480 train_time:145185ms step_avg:155.44ms step:945/1480 train_time:145348ms step_avg:155.45ms step:946/1480 train_time:145517ms step_avg:155.47ms step:947/1480 train_time:145685ms step_avg:155.48ms step:948/1480 train_time:145850ms step_avg:155.49ms step:949/1480 train_time:146016ms step_avg:155.50ms step:950/1480 train_time:146180ms step_avg:155.51ms step:951/1480 train_time:146348ms step_avg:155.52ms step:952/1480 train_time:146512ms step_avg:155.53ms step:953/1480 train_time:146680ms step_avg:155.55ms 
step:954/1480 train_time:146849ms step_avg:155.56ms step:955/1480 train_time:147012ms step_avg:155.57ms step:956/1480 train_time:147178ms step_avg:155.58ms step:957/1480 train_time:147347ms step_avg:155.59ms step:958/1480 train_time:147517ms step_avg:155.61ms step:959/1480 train_time:147683ms step_avg:155.62ms step:960/1480 train_time:147849ms step_avg:155.63ms step:961/1480 train_time:148013ms step_avg:155.64ms step:962/1480 train_time:148177ms step_avg:155.65ms step:963/1480 train_time:148345ms step_avg:155.66ms step:964/1480 train_time:148513ms step_avg:155.67ms step:965/1480 train_time:148678ms step_avg:155.68ms step:966/1480 train_time:148843ms step_avg:155.69ms step:967/1480 train_time:149006ms step_avg:155.70ms step:968/1480 train_time:149171ms step_avg:155.71ms step:969/1480 train_time:149341ms step_avg:155.73ms step:970/1480 train_time:149505ms step_avg:155.73ms step:971/1480 train_time:149669ms step_avg:155.74ms step:972/1480 train_time:149833ms step_avg:155.75ms step:973/1480 train_time:149998ms step_avg:155.76ms step:974/1480 train_time:150168ms step_avg:155.78ms step:975/1480 train_time:150332ms step_avg:155.78ms step:976/1480 train_time:150498ms step_avg:155.79ms step:977/1480 train_time:150663ms step_avg:155.80ms step:978/1480 train_time:150830ms step_avg:155.82ms step:979/1480 train_time:150995ms step_avg:155.83ms step:980/1480 train_time:151162ms step_avg:155.84ms step:981/1480 train_time:151331ms step_avg:155.85ms step:982/1480 train_time:151493ms step_avg:155.86ms step:983/1480 train_time:151657ms step_avg:155.87ms step:984/1480 train_time:151822ms step_avg:155.88ms step:985/1480 train_time:151989ms step_avg:155.89ms step:986/1480 train_time:152153ms step_avg:155.89ms step:987/1480 train_time:152319ms step_avg:155.90ms step:988/1480 train_time:152487ms step_avg:155.92ms step:989/1480 train_time:152654ms step_avg:155.93ms step:990/1480 train_time:152825ms step_avg:155.94ms step:991/1480 train_time:152993ms step_avg:155.96ms step:992/1480 
train_time:153168ms step_avg:155.98ms step:993/1480 train_time:153346ms step_avg:156.00ms step:994/1480 train_time:153511ms step_avg:156.01ms step:995/1480 train_time:153674ms step_avg:156.01ms step:996/1480 train_time:153837ms step_avg:156.02ms step:997/1480 train_time:154003ms step_avg:156.03ms step:998/1480 train_time:154166ms step_avg:156.04ms step:999/1480 train_time:154332ms step_avg:156.05ms step:1000/1480 train_time:154501ms step_avg:156.06ms step:1000/1480 val_loss:3.4349 train_time:154567ms step_avg:156.13ms step:1001/1480 train_time:154669ms step_avg:156.07ms step:1002/1480 train_time:154835ms step_avg:156.08ms step:1003/1480 train_time:155006ms step_avg:156.10ms step:1004/1480 train_time:155175ms step_avg:156.11ms step:1005/1480 train_time:155342ms step_avg:156.12ms step:1006/1480 train_time:155511ms step_avg:156.14ms step:1007/1480 train_time:155677ms step_avg:156.15ms step:1008/1480 train_time:155843ms step_avg:156.16ms step:1009/1480 train_time:156017ms step_avg:156.17ms step:1010/1480 train_time:156183ms step_avg:156.18ms step:1011/1480 train_time:156348ms step_avg:156.19ms step:1012/1480 train_time:156513ms step_avg:156.20ms step:1013/1480 train_time:156684ms step_avg:156.22ms step:1014/1480 train_time:156850ms step_avg:156.22ms step:1015/1480 train_time:157020ms step_avg:156.24ms step:1016/1480 train_time:157188ms step_avg:156.25ms step:1017/1480 train_time:157360ms step_avg:156.27ms step:1018/1480 train_time:157528ms step_avg:156.28ms step:1019/1480 train_time:157697ms step_avg:156.29ms step:1020/1480 train_time:157866ms step_avg:156.30ms step:1021/1480 train_time:158030ms step_avg:156.31ms step:1022/1480 train_time:158197ms step_avg:156.32ms step:1023/1480 train_time:158365ms step_avg:156.33ms step:1024/1480 train_time:158530ms step_avg:156.34ms step:1025/1480 train_time:158702ms step_avg:156.36ms step:1026/1480 train_time:158866ms step_avg:156.36ms step:1027/1480 train_time:159033ms step_avg:156.37ms step:1028/1480 train_time:159205ms 
step_avg:156.39ms step:1029/1480 train_time:159380ms step_avg:156.41ms step:1030/1480 train_time:159546ms step_avg:156.42ms step:1031/1480 train_time:159710ms step_avg:156.43ms step:1032/1480 train_time:159883ms step_avg:156.44ms step:1033/1480 train_time:160048ms step_avg:156.45ms step:1034/1480 train_time:160218ms step_avg:156.46ms step:1035/1480 train_time:160385ms step_avg:156.47ms step:1036/1480 train_time:160550ms step_avg:156.48ms step:1037/1480 train_time:160718ms step_avg:156.49ms step:1038/1480 train_time:160884ms step_avg:156.50ms step:1039/1480 train_time:161053ms step_avg:156.51ms step:1040/1480 train_time:161219ms step_avg:156.52ms step:1041/1480 train_time:161386ms step_avg:156.53ms step:1042/1480 train_time:161550ms step_avg:156.54ms step:1043/1480 train_time:161717ms step_avg:156.55ms step:1044/1480 train_time:161882ms step_avg:156.56ms step:1045/1480 train_time:162053ms step_avg:156.57ms step:1046/1480 train_time:162220ms step_avg:156.58ms step:1047/1480 train_time:162386ms step_avg:156.59ms step:1048/1480 train_time:162552ms step_avg:156.60ms step:1049/1480 train_time:162719ms step_avg:156.61ms step:1050/1480 train_time:162888ms step_avg:156.62ms step:1051/1480 train_time:163058ms step_avg:156.64ms step:1052/1480 train_time:163224ms step_avg:156.65ms step:1053/1480 train_time:163390ms step_avg:156.65ms step:1054/1480 train_time:163558ms step_avg:156.67ms step:1055/1480 train_time:163724ms step_avg:156.67ms step:1056/1480 train_time:163889ms step_avg:156.68ms step:1057/1480 train_time:164057ms step_avg:156.69ms step:1058/1480 train_time:164224ms step_avg:156.70ms step:1059/1480 train_time:164399ms step_avg:156.72ms step:1060/1480 train_time:164568ms step_avg:156.73ms step:1061/1480 train_time:164731ms step_avg:156.74ms step:1062/1480 train_time:164898ms step_avg:156.75ms step:1063/1480 train_time:165064ms step_avg:156.76ms step:1064/1480 train_time:165227ms step_avg:156.76ms step:1065/1480 train_time:165395ms step_avg:156.77ms step:1066/1480 
train_time:165562ms step_avg:156.78ms step:1067/1480 train_time:165733ms step_avg:156.80ms step:1068/1480 train_time:165900ms step_avg:156.81ms step:1069/1480 train_time:166073ms step_avg:156.82ms step:1070/1480 train_time:166240ms step_avg:156.83ms step:1071/1480 train_time:166412ms step_avg:156.84ms step:1072/1480 train_time:166580ms step_avg:156.86ms step:1073/1480 train_time:166743ms step_avg:156.86ms step:1074/1480 train_time:166910ms step_avg:156.87ms step:1075/1480 train_time:167080ms step_avg:156.88ms step:1076/1480 train_time:167247ms step_avg:156.89ms step:1077/1480 train_time:167412ms step_avg:156.90ms step:1078/1480 train_time:167587ms step_avg:156.92ms step:1079/1480 train_time:167759ms step_avg:156.93ms step:1080/1480 train_time:167928ms step_avg:156.94ms step:1081/1480 train_time:168095ms step_avg:156.95ms step:1082/1480 train_time:168262ms step_avg:156.96ms step:1083/1480 train_time:168428ms step_avg:156.97ms step:1084/1480 train_time:168595ms step_avg:156.98ms step:1085/1480 train_time:168764ms step_avg:156.99ms step:1086/1480 train_time:168932ms step_avg:157.00ms step:1087/1480 train_time:169099ms step_avg:157.01ms step:1088/1480 train_time:169268ms step_avg:157.02ms step:1089/1480 train_time:169439ms step_avg:157.03ms step:1090/1480 train_time:169611ms step_avg:157.05ms step:1091/1480 train_time:169781ms step_avg:157.06ms step:1092/1480 train_time:169948ms step_avg:157.07ms step:1093/1480 train_time:170115ms step_avg:157.08ms step:1094/1480 train_time:170282ms step_avg:157.09ms step:1095/1480 train_time:170447ms step_avg:157.09ms step:1096/1480 train_time:170617ms step_avg:157.11ms step:1097/1480 train_time:170784ms step_avg:157.12ms step:1098/1480 train_time:170957ms step_avg:157.13ms step:1099/1480 train_time:171127ms step_avg:157.14ms step:1100/1480 train_time:171299ms step_avg:157.16ms step:1101/1480 train_time:171470ms step_avg:157.17ms step:1102/1480 train_time:171641ms step_avg:157.18ms step:1103/1480 train_time:171819ms step_avg:157.20ms 
step:1104/1480 train_time:171986ms step_avg:157.21ms step:1105/1480 train_time:172157ms step_avg:157.22ms step:1106/1480 train_time:172325ms step_avg:157.23ms step:1107/1480 train_time:172492ms step_avg:157.24ms step:1108/1480 train_time:172659ms step_avg:157.25ms step:1109/1480 train_time:172823ms step_avg:157.26ms step:1110/1480 train_time:172989ms step_avg:157.26ms step:1111/1480 train_time:173156ms step_avg:157.27ms step:1112/1480 train_time:173326ms step_avg:157.28ms step:1113/1480 train_time:173505ms step_avg:157.30ms step:1114/1480 train_time:173680ms step_avg:157.32ms step:1115/1480 train_time:173850ms step_avg:157.33ms step:1116/1480 train_time:174016ms step_avg:157.34ms step:1117/1480 train_time:174188ms step_avg:157.35ms step:1118/1480 train_time:174363ms step_avg:157.37ms step:1119/1480 train_time:174530ms step_avg:157.38ms step:1120/1480 train_time:174698ms step_avg:157.39ms step:1121/1480 train_time:174867ms step_avg:157.40ms step:1122/1480 train_time:175033ms step_avg:157.40ms step:1123/1480 train_time:175200ms step_avg:157.41ms step:1124/1480 train_time:175369ms step_avg:157.42ms step:1125/1480 train_time:175537ms step_avg:157.43ms step:1125/1480 val_loss:3.3798 train_time:175606ms step_avg:157.49ms step:1126/1480 train_time:175708ms step_avg:157.44ms step:1127/1480 train_time:175878ms step_avg:157.46ms step:1128/1480 train_time:176048ms step_avg:157.47ms step:1129/1480 train_time:176222ms step_avg:157.48ms step:1130/1480 train_time:176392ms step_avg:157.49ms step:1131/1480 train_time:176570ms step_avg:157.51ms step:1132/1480 train_time:176735ms step_avg:157.52ms step:1133/1480 train_time:176909ms step_avg:157.53ms step:1134/1480 train_time:177080ms step_avg:157.54ms step:1135/1480 train_time:177247ms step_avg:157.55ms step:1136/1480 train_time:177418ms step_avg:157.57ms step:1137/1480 train_time:177589ms step_avg:157.58ms step:1138/1480 train_time:177761ms step_avg:157.59ms step:1139/1480 train_time:177930ms step_avg:157.60ms step:1140/1480 
train_time:178098ms step_avg:157.61ms step:1141/1480 train_time:178269ms step_avg:157.62ms step:1142/1480 train_time:178435ms step_avg:157.63ms step:1143/1480 train_time:178607ms step_avg:157.64ms step:1144/1480 train_time:178774ms step_avg:157.65ms step:1145/1480 train_time:178938ms step_avg:157.65ms step:1146/1480 train_time:179108ms step_avg:157.67ms step:1147/1480 train_time:179277ms step_avg:157.68ms step:1148/1480 train_time:179445ms step_avg:157.68ms step:1149/1480 train_time:179615ms step_avg:157.69ms step:1150/1480 train_time:179785ms step_avg:157.71ms step:1151/1480 train_time:179956ms step_avg:157.72ms step:1152/1480 train_time:180128ms step_avg:157.73ms step:1153/1480 train_time:180301ms step_avg:157.74ms step:1154/1480 train_time:180468ms step_avg:157.75ms step:1155/1480 train_time:180641ms step_avg:157.76ms step:1156/1480 train_time:180820ms step_avg:157.78ms step:1157/1480 train_time:180989ms step_avg:157.79ms step:1158/1480 train_time:181155ms step_avg:157.80ms step:1159/1480 train_time:181321ms step_avg:157.81ms step:1160/1480 train_time:181487ms step_avg:157.81ms step:1161/1480 train_time:181656ms step_avg:157.82ms step:1162/1480 train_time:181826ms step_avg:157.84ms step:1163/1480 train_time:181995ms step_avg:157.84ms step:1164/1480 train_time:182166ms step_avg:157.86ms step:1165/1480 train_time:182331ms step_avg:157.86ms step:1166/1480 train_time:182500ms step_avg:157.87ms step:1167/1480 train_time:182669ms step_avg:157.88ms step:1168/1480 train_time:182838ms step_avg:157.89ms step:1169/1480 train_time:183007ms step_avg:157.90ms step:1170/1480 train_time:183175ms step_avg:157.91ms step:1171/1480 train_time:183343ms step_avg:157.92ms step:1172/1480 train_time:183510ms step_avg:157.93ms step:1173/1480 train_time:183682ms step_avg:157.94ms step:1174/1480 train_time:183864ms step_avg:157.96ms step:1175/1480 train_time:184036ms step_avg:157.97ms step:1176/1480 train_time:184208ms step_avg:157.98ms step:1177/1480 train_time:184385ms step_avg:158.00ms 
step:1178/1480 train_time:184551ms step_avg:158.01ms step:1179/1480 train_time:184716ms step_avg:158.01ms step:1180/1480 train_time:184900ms step_avg:158.03ms step:1181/1480 train_time:185071ms step_avg:158.04ms step:1182/1480 train_time:185238ms step_avg:158.05ms step:1183/1480 train_time:185409ms step_avg:158.06ms step:1184/1480 train_time:185576ms step_avg:158.07ms step:1185/1480 train_time:185748ms step_avg:158.08ms step:1186/1480 train_time:185919ms step_avg:158.09ms step:1187/1480 train_time:186103ms step_avg:158.12ms step:1188/1480 train_time:186269ms step_avg:158.12ms step:1189/1480 train_time:186442ms step_avg:158.14ms step:1190/1480 train_time:186610ms step_avg:158.14ms step:1191/1480 train_time:186781ms step_avg:158.15ms step:1192/1480 train_time:186947ms step_avg:158.16ms step:1193/1480 train_time:187114ms step_avg:158.17ms step:1194/1480 train_time:187283ms step_avg:158.18ms step:1195/1480 train_time:187456ms step_avg:158.19ms step:1196/1480 train_time:187639ms step_avg:158.21ms step:1197/1480 train_time:187811ms step_avg:158.22ms step:1198/1480 train_time:187991ms step_avg:158.24ms step:1199/1480 train_time:188162ms step_avg:158.25ms step:1200/1480 train_time:188331ms step_avg:158.26ms step:1201/1480 train_time:188499ms step_avg:158.27ms step:1202/1480 train_time:188679ms step_avg:158.29ms step:1203/1480 train_time:188854ms step_avg:158.30ms step:1204/1480 train_time:189028ms step_avg:158.32ms step:1205/1480 train_time:189195ms step_avg:158.32ms step:1206/1480 train_time:189364ms step_avg:158.33ms step:1207/1480 train_time:189532ms step_avg:158.34ms step:1208/1480 train_time:189699ms step_avg:158.35ms step:1209/1480 train_time:189873ms step_avg:158.36ms step:1210/1480 train_time:190048ms step_avg:158.37ms step:1211/1480 train_time:190222ms step_avg:158.39ms step:1212/1480 train_time:190394ms step_avg:158.40ms step:1213/1480 train_time:190568ms step_avg:158.41ms step:1214/1480 train_time:190746ms step_avg:158.43ms step:1215/1480 train_time:190920ms 
step_avg:158.44ms step:1216/1480 train_time:191089ms step_avg:158.45ms step:1217/1480 train_time:191262ms step_avg:158.46ms step:1218/1480 train_time:191432ms step_avg:158.47ms step:1219/1480 train_time:191611ms step_avg:158.49ms step:1220/1480 train_time:191780ms step_avg:158.50ms step:1221/1480 train_time:191948ms step_avg:158.50ms step:1222/1480 train_time:192115ms step_avg:158.51ms step:1223/1480 train_time:192286ms step_avg:158.52ms step:1224/1480 train_time:192463ms step_avg:158.54ms step:1225/1480 train_time:192635ms step_avg:158.55ms step:1226/1480 train_time:192808ms step_avg:158.56ms step:1227/1480 train_time:192980ms step_avg:158.57ms step:1228/1480 train_time:193149ms step_avg:158.58ms step:1229/1480 train_time:193322ms step_avg:158.59ms step:1230/1480 train_time:193503ms step_avg:158.61ms step:1231/1480 train_time:193678ms step_avg:158.62ms step:1232/1480 train_time:193852ms step_avg:158.64ms step:1233/1480 train_time:194022ms step_avg:158.64ms step:1234/1480 train_time:194191ms step_avg:158.65ms step:1235/1480 train_time:194366ms step_avg:158.67ms step:1236/1480 train_time:194534ms step_avg:158.67ms step:1237/1480 train_time:194706ms step_avg:158.68ms step:1238/1480 train_time:194890ms step_avg:158.70ms step:1239/1480 train_time:195062ms step_avg:158.72ms step:1240/1480 train_time:195232ms step_avg:158.72ms step:1241/1480 train_time:195406ms step_avg:158.74ms step:1242/1480 train_time:195575ms step_avg:158.75ms step:1243/1480 train_time:195749ms step_avg:158.76ms step:1244/1480 train_time:195916ms step_avg:158.76ms step:1245/1480 train_time:196086ms step_avg:158.77ms step:1246/1480 train_time:196255ms step_avg:158.78ms step:1247/1480 train_time:196425ms step_avg:158.79ms step:1248/1480 train_time:196594ms step_avg:158.80ms step:1249/1480 train_time:196762ms step_avg:158.81ms step:1250/1480 train_time:196932ms step_avg:158.82ms step:1250/1480 val_loss:3.3302 train_time:197004ms step_avg:158.87ms step:1251/1480 train_time:197114ms step_avg:158.83ms 
step:1252/1480 train_time:197283ms step_avg:158.84ms step:1253/1480 train_time:197451ms step_avg:158.85ms step:1254/1480 train_time:197623ms step_avg:158.86ms step:1255/1480 train_time:197810ms step_avg:158.88ms step:1256/1480 train_time:197983ms step_avg:158.89ms step:1257/1480 train_time:198152ms step_avg:158.90ms step:1258/1480 train_time:198326ms step_avg:158.92ms step:1259/1480 train_time:198498ms step_avg:158.93ms step:1260/1480 train_time:198665ms step_avg:158.93ms step:1261/1480 train_time:198839ms step_avg:158.94ms step:1262/1480 train_time:199015ms step_avg:158.96ms step:1263/1480 train_time:199188ms step_avg:158.97ms step:1264/1480 train_time:199355ms step_avg:158.98ms step:1265/1480 train_time:199523ms step_avg:158.98ms step:1266/1480 train_time:199696ms step_avg:158.99ms step:1267/1480 train_time:199865ms step_avg:159.00ms step:1268/1480 train_time:200037ms step_avg:159.01ms step:1269/1480 train_time:200212ms step_avg:159.02ms step:1270/1480 train_time:200382ms step_avg:159.03ms step:1271/1480 train_time:200552ms step_avg:159.04ms step:1272/1480 train_time:200719ms step_avg:159.05ms step:1273/1480 train_time:200890ms step_avg:159.06ms step:1274/1480 train_time:201062ms step_avg:159.07ms step:1275/1480 train_time:201228ms step_avg:159.07ms step:1276/1480 train_time:201395ms step_avg:159.08ms step:1277/1480 train_time:201568ms step_avg:159.09ms step:1278/1480 train_time:201736ms step_avg:159.10ms step:1279/1480 train_time:201906ms step_avg:159.11ms step:1280/1480 train_time:202086ms step_avg:159.12ms step:1281/1480 train_time:202255ms step_avg:159.13ms step:1282/1480 train_time:202422ms step_avg:159.14ms step:1283/1480 train_time:202593ms step_avg:159.15ms step:1284/1480 train_time:202763ms step_avg:159.15ms step:1285/1480 train_time:202931ms step_avg:159.16ms step:1286/1480 train_time:203101ms step_avg:159.17ms step:1287/1480 train_time:203274ms step_avg:159.18ms step:1288/1480 train_time:203445ms step_avg:159.19ms step:1289/1480 train_time:203629ms 
step_avg:159.21ms step:1290/1480 train_time:203808ms step_avg:159.22ms step:1291/1480 train_time:203979ms step_avg:159.23ms step:1292/1480 train_time:204152ms step_avg:159.24ms step:1293/1480 train_time:204327ms step_avg:159.26ms step:1294/1480 train_time:204498ms step_avg:159.27ms step:1295/1480 train_time:204670ms step_avg:159.28ms step:1296/1480 train_time:204843ms step_avg:159.29ms step:1297/1480 train_time:205016ms step_avg:159.30ms step:1298/1480 train_time:205188ms step_avg:159.31ms step:1299/1480 train_time:205359ms step_avg:159.32ms step:1300/1480 train_time:205525ms step_avg:159.32ms step:1301/1480 train_time:205694ms step_avg:159.33ms step:1302/1480 train_time:205868ms step_avg:159.34ms step:1303/1480 train_time:206045ms step_avg:159.35ms step:1304/1480 train_time:206219ms step_avg:159.37ms step:1305/1480 train_time:206388ms step_avg:159.37ms step:1306/1480 train_time:206562ms step_avg:159.38ms step:1307/1480 train_time:206731ms step_avg:159.39ms step:1308/1480 train_time:206900ms step_avg:159.40ms step:1309/1480 train_time:207073ms step_avg:159.41ms step:1310/1480 train_time:207242ms step_avg:159.42ms step:1311/1480 train_time:207409ms step_avg:159.42ms step:1312/1480 train_time:207582ms step_avg:159.43ms step:1313/1480 train_time:207751ms step_avg:159.44ms step:1314/1480 train_time:207926ms step_avg:159.45ms step:1315/1480 train_time:208097ms step_avg:159.46ms step:1316/1480 train_time:208263ms step_avg:159.47ms step:1317/1480 train_time:208436ms step_avg:159.48ms step:1318/1480 train_time:208615ms step_avg:159.49ms step:1319/1480 train_time:208792ms step_avg:159.51ms step:1320/1480 train_time:208968ms step_avg:159.52ms step:1321/1480 train_time:209141ms step_avg:159.53ms step:1322/1480 train_time:209322ms step_avg:159.54ms step:1323/1480 train_time:209494ms step_avg:159.55ms step:1324/1480 train_time:209670ms step_avg:159.57ms step:1325/1480 train_time:209852ms step_avg:159.58ms step:1326/1480 train_time:210027ms step_avg:159.59ms step:1327/1480 
train_time:210198ms step_avg:159.60ms step:1328/1480 train_time:210368ms step_avg:159.61ms step:1329/1480 train_time:210563ms step_avg:159.64ms step:1330/1480 train_time:210741ms step_avg:159.65ms step:1331/1480 train_time:210911ms step_avg:159.66ms step:1332/1480 train_time:211084ms step_avg:159.67ms step:1333/1480 train_time:211259ms step_avg:159.68ms step:1334/1480 train_time:211431ms step_avg:159.69ms step:1335/1480 train_time:211600ms step_avg:159.70ms step:1336/1480 train_time:211783ms step_avg:159.72ms step:1337/1480 train_time:211961ms step_avg:159.73ms step:1338/1480 train_time:212133ms step_avg:159.74ms step:1339/1480 train_time:212305ms step_avg:159.75ms step:1340/1480 train_time:212477ms step_avg:159.76ms step:1341/1480 train_time:212645ms step_avg:159.76ms step:1342/1480 train_time:212819ms step_avg:159.77ms step:1343/1480 train_time:212988ms step_avg:159.78ms step:1344/1480 train_time:213160ms step_avg:159.79ms step:1345/1480 train_time:213339ms step_avg:159.80ms step:1346/1480 train_time:213508ms step_avg:159.81ms step:1347/1480 train_time:213677ms step_avg:159.82ms step:1348/1480 train_time:213847ms step_avg:159.83ms step:1349/1480 train_time:214017ms step_avg:159.83ms step:1350/1480 train_time:214193ms step_avg:159.85ms step:1351/1480 train_time:214365ms step_avg:159.85ms step:1352/1480 train_time:214536ms step_avg:159.86ms step:1353/1480 train_time:214712ms step_avg:159.88ms step:1354/1480 train_time:214882ms step_avg:159.88ms step:1355/1480 train_time:215049ms step_avg:159.89ms step:1356/1480 train_time:215222ms step_avg:159.90ms step:1357/1480 train_time:215398ms step_avg:159.91ms step:1358/1480 train_time:215570ms step_avg:159.92ms step:1359/1480 train_time:215743ms step_avg:159.93ms step:1360/1480 train_time:215918ms step_avg:159.94ms step:1361/1480 train_time:216096ms step_avg:159.95ms step:1362/1480 train_time:216272ms step_avg:159.96ms step:1363/1480 train_time:216451ms step_avg:159.98ms step:1364/1480 train_time:216620ms step_avg:159.99ms 
step:1365/1480 train_time:216788ms step_avg:159.99ms step:1366/1480 train_time:216960ms step_avg:160.00ms step:1367/1480 train_time:217131ms step_avg:160.01ms step:1368/1480 train_time:217303ms step_avg:160.02ms step:1369/1480 train_time:217485ms step_avg:160.03ms step:1370/1480 train_time:217662ms step_avg:160.05ms step:1371/1480 train_time:217834ms step_avg:160.05ms step:1372/1480 train_time:218010ms step_avg:160.07ms step:1373/1480 train_time:218180ms step_avg:160.07ms step:1374/1480 train_time:218358ms step_avg:160.09ms step:1375/1480 train_time:218528ms step_avg:160.09ms step:1375/1480 val_loss:3.2920 train_time:218596ms step_avg:160.14ms step:1376/1480 train_time:218702ms step_avg:160.10ms step:1377/1480 train_time:218874ms step_avg:160.11ms step:1378/1480 train_time:219044ms step_avg:160.12ms step:1379/1480 train_time:219218ms step_avg:160.13ms step:1380/1480 train_time:219392ms step_avg:160.14ms step:1381/1480 train_time:219572ms step_avg:160.15ms step:1382/1480 train_time:219743ms step_avg:160.16ms step:1383/1480 train_time:219914ms step_avg:160.17ms step:1384/1480 train_time:220092ms step_avg:160.18ms step:1385/1480 train_time:220257ms step_avg:160.19ms step:1386/1480 train_time:220428ms step_avg:160.20ms step:1387/1480 train_time:220599ms step_avg:160.20ms step:1388/1480 train_time:220769ms step_avg:160.21ms step:1389/1480 train_time:220943ms step_avg:160.22ms step:1390/1480 train_time:221111ms step_avg:160.23ms step:1391/1480 train_time:221281ms step_avg:160.23ms step:1392/1480 train_time:221453ms step_avg:160.24ms step:1393/1480 train_time:221623ms step_avg:160.25ms step:1394/1480 train_time:221793ms step_avg:160.26ms step:1395/1480 train_time:221961ms step_avg:160.26ms step:1396/1480 train_time:222131ms step_avg:160.27ms step:1397/1480 train_time:222298ms step_avg:160.27ms step:1398/1480 train_time:222463ms step_avg:160.28ms step:1399/1480 train_time:222633ms step_avg:160.28ms step:1400/1480 train_time:222810ms step_avg:160.29ms step:1401/1480 
train_time:222977ms step_avg:160.30ms step:1402/1480 train_time:223150ms step_avg:160.31ms step:1403/1480 train_time:223326ms step_avg:160.32ms step:1404/1480 train_time:223498ms step_avg:160.33ms step:1405/1480 train_time:223674ms step_avg:160.34ms step:1406/1480 train_time:223850ms step_avg:160.35ms step:1407/1480 train_time:224017ms step_avg:160.36ms step:1408/1480 train_time:224185ms step_avg:160.36ms step:1409/1480 train_time:224368ms step_avg:160.38ms step:1410/1480 train_time:224537ms step_avg:160.38ms step:1411/1480 train_time:224706ms step_avg:160.39ms step:1412/1480 train_time:224877ms step_avg:160.40ms step:1413/1480 train_time:225047ms step_avg:160.40ms step:1414/1480 train_time:225219ms step_avg:160.41ms step:1415/1480 train_time:225395ms step_avg:160.42ms step:1416/1480 train_time:225580ms step_avg:160.44ms step:1417/1480 train_time:225754ms step_avg:160.45ms step:1418/1480 train_time:225925ms step_avg:160.46ms step:1419/1480 train_time:226097ms step_avg:160.47ms step:1420/1480 train_time:226272ms step_avg:160.48ms step:1421/1480 train_time:226447ms step_avg:160.49ms step:1422/1480 train_time:226618ms step_avg:160.49ms step:1423/1480 train_time:226788ms step_avg:160.50ms step:1424/1480 train_time:226965ms step_avg:160.51ms step:1425/1480 train_time:227146ms step_avg:160.53ms step:1426/1480 train_time:227319ms step_avg:160.54ms step:1427/1480 train_time:227493ms step_avg:160.55ms step:1428/1480 train_time:227662ms step_avg:160.55ms step:1429/1480 train_time:227830ms step_avg:160.56ms step:1430/1480 train_time:228004ms step_avg:160.57ms step:1431/1480 train_time:228179ms step_avg:160.58ms step:1432/1480 train_time:228356ms step_avg:160.59ms step:1433/1480 train_time:228536ms step_avg:160.60ms step:1434/1480 train_time:228718ms step_avg:160.62ms step:1435/1480 train_time:228894ms step_avg:160.63ms step:1436/1480 train_time:229069ms step_avg:160.64ms step:1437/1480 train_time:229239ms step_avg:160.64ms step:1438/1480 train_time:229408ms step_avg:160.65ms 
step:1439/1480 train_time:229582ms step_avg:160.66ms step:1440/1480 train_time:229753ms step_avg:160.67ms step:1441/1480 train_time:229924ms step_avg:160.67ms step:1442/1480 train_time:230101ms step_avg:160.68ms step:1443/1480 train_time:230291ms step_avg:160.71ms step:1444/1480 train_time:230463ms step_avg:160.71ms step:1445/1480 train_time:230635ms step_avg:160.72ms step:1446/1480 train_time:230812ms step_avg:160.73ms step:1447/1480 train_time:230989ms step_avg:160.74ms step:1448/1480 train_time:231159ms step_avg:160.75ms step:1449/1480 train_time:231333ms step_avg:160.76ms step:1450/1480 train_time:231507ms step_avg:160.77ms step:1451/1480 train_time:231677ms step_avg:160.78ms step:1452/1480 train_time:231853ms step_avg:160.79ms step:1453/1480 train_time:232021ms step_avg:160.79ms step:1454/1480 train_time:232193ms step_avg:160.80ms step:1455/1480 train_time:232373ms step_avg:160.81ms step:1456/1480 train_time:232548ms step_avg:160.82ms step:1457/1480 train_time:232721ms step_avg:160.83ms step:1458/1480 train_time:232892ms step_avg:160.84ms step:1459/1480 train_time:233068ms step_avg:160.85ms step:1460/1480 train_time:233239ms step_avg:160.85ms step:1461/1480 train_time:233414ms step_avg:160.86ms step:1462/1480 train_time:233584ms step_avg:160.87ms step:1463/1480 train_time:233760ms step_avg:160.88ms step:1464/1480 train_time:233935ms step_avg:160.89ms step:1465/1480 train_time:234109ms step_avg:160.90ms step:1466/1480 train_time:234279ms step_avg:160.91ms step:1467/1480 train_time:234453ms step_avg:160.92ms step:1468/1480 train_time:234624ms step_avg:160.92ms step:1469/1480 train_time:234797ms step_avg:160.93ms step:1470/1480 train_time:234977ms step_avg:160.94ms step:1471/1480 train_time:235161ms step_avg:160.96ms step:1472/1480 train_time:235342ms step_avg:160.97ms step:1473/1480 train_time:235514ms step_avg:160.98ms step:1474/1480 train_time:235691ms step_avg:160.99ms step:1475/1480 train_time:235873ms step_avg:161.01ms step:1476/1480 train_time:236045ms 
step_avg:161.01ms step:1477/1480 train_time:236230ms step_avg:161.03ms step:1478/1480 train_time:236413ms step_avg:161.04ms step:1479/1480 train_time:236586ms step_avg:161.05ms step:1480/1480 train_time:236760ms step_avg:161.06ms step:1480/1480 val_loss:3.2731 train_time:236832ms step_avg:161.11ms