import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G exactly through its SVD, returning U @ V.T.

    `steps` is accepted only so that both backends share one call signature; it is ignored here.
    """
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: a 2D tensor (asserted).
        steps: number of quintic iterations to run.
        eps: added to the norm before dividing, so an all-zero G does not divide by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # iterate on the wide orientation so that the Gram matrix X @ X.T is the smaller of the two
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose so the result matches G's orientation
    if G.size(0) > G.size(1):
        X = X.T
    return X

# name -> orthogonalization routine; selected per param group via the `backend` option
zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        # world size and rank are read from the environment (WORLD_SIZE / RANK must be set)
        self.num_process = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ["RANK"])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        params: "list[torch.Tensor]" = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct element count, so every parameter in a group fits the
        # same flat bfloat16 gather buffers (one buffer per process)
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                "params": [p for p in params if p.numel() == size],
                "update_buffer": [
                    torch.empty(size, device="cuda", dtype=torch.bfloat16)
                    for _ in range(self.num_process)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are walked in chunks of `num_process`. This rank computes
        the orthogonalized momentum update only for the chunk member at offset `self.rank`; an
        asynchronous all_gather then collects every rank's update into the group's buffers. The
        gathered updates of a chunk are applied just before the next chunk's gather is launched
        (and once more after the loop for the final chunk).
        """
        for group in self.param_groups:
            lr: float = group["lr"]
            momentum: float = group["momentum"]
            nesterov: bool = group["nesterov"]
            zeropower_backend = zeropower_backends[group["backend"]]
            backend_steps: int = group["backend_steps"]
            update_buffers: "list[torch.Tensor]" = group["update_buffer"]
            # generate weight updates in distributed fashion
            params: "list[torch.Tensor]" = group["params"]
            # each chunk must contain exactly one parameter per process
            assert len(params) % self.num_process == 0
            handle = None
            params_world = None
            def update_prev():
                # apply the updates gathered for the previous chunk, if there was one
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    # step size is lr * sqrt(max(1, rows / cols))
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.num_process]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if "momentum_buffer" not in state:
                    state["momentum_buffer"] = torch.zeros_like(g)
                buf: torch.Tensor = state["momentum_buffer"]
                buf.lerp_(g, 1 - momentum)
                # note: the Nesterov branch overwrites p.grad in place
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_backend(g, steps=backend_steps).flatten()
                # the buffers are about to be overwritten, so consume the previous chunk first
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.num_process]
            update_prev()

#
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; the cos/sin tables are cached for the last seen sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): seq is dim 1 and the rotation splits dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # rebuild the tables only when the sequence length changes
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK-norm, rotary embeddings and a learned mix of token value embeddings."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # blend the projected values with the token value embedding vi using the learned lambdas
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head and sequence dims swapped: (B, n_head, T, head_dim)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc   = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of x with the initial embedding x0, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to [1, 0], i.e. the block input starts out as x alone
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Model-shape settings consumed by GPT and Block."""
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """GPT model with U-net style skip connections; forward() returns the cross-entropy loss."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the mean cross-entropy loss of `target` given the 1D token sequence `idx`.

        Attention is restricted to be causal, within the same document, and within
        `sliding_window` tokens. len(idx) must be a multiple of BLOCK_SIZE (required by the reshape).
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # token id 50256 is treated as a document delimiter (presumably GPT-2's end-of-text token);
        # the running count of delimiters gives every position a document index
        docs = (idx == 50256).cumsum(0)
        # first / last document index inside each BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask_mod handed to the BlockMask
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # build the block-level sparsity pattern by hand, one entry per (q block, kv block) pair
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can interact only if their document-index ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window measured in blocks, rounded up
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort of the 0/1 mask lists the kept kv blocks first, in ascending order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one n_embd-wide value embedding per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() pairs decoder layer i with the most recent unused encoder output
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        # tanh soft-cap keeps the logits within (-lm_head_softcap, lm_head_softcap)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate a shard's header and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256 * 4-byte header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (input, target) token windows of length T from a set of shard files.

    Each process reads the window at its own offset (process_rank * T) and all processes advance
    by T * num_processes per batch, moving on to the next shard (cyclically) when too few tokens remain.
    """
    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full global batch plus the one-token target shift
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # -1 so that advance() lands on shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        batch_size = self.T * self.num_processes
        # T + 1 tokens: inputs are buf[:-1], targets are the same window shifted by one
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Run configuration: data locations, schedule lengths, and evaluation/checkpoint cadence."""
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
# Only the master process owns a log file; on every other rank `logfile` stays None
# and print0() is a no-op.
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    # per-run directory name; it is not created here
    logdir = 'logs/%s/' % run_id
    logfile = 'logs/%s.txt' % run_id
    # make sure the parent `logs/` directory exists: open(..., "w") does not create missing
    # directories and would raise FileNotFoundError on a fresh checkout
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append `s` plus a newline to the log file on the master process.

    Arguments:
        s: the text to log.
        logonly: when False (default) the text is also printed to stdout.

    Does nothing on non-master processes.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop runs a single forward/backward per step, so accumulation is not supported
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches the next one after each backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# the whole model is cast to bfloat16, then the CastedLinear layers are put back in float32
# (they cast their weight to the input dtype at forward time)
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# Adam for both embedding tables, Adam for the output head, Muon for the 2D matrices inside the
# blocks, and Adam for the remaining (<2D) block parameters plus the skip weights
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the learning-rate multiplier for iteration `it` (warmup, then constant, then linear cooldown)."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        # reaches 0 at it == num_iterations
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size lives in a CUDA tensor that is updated in place with copy_()
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    # nan makes the step_avg fields print as "nan" until enough timed steps have run
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # (grows linearly from 64 to 1792 over the run, rounded down to a multiple of 64)
    sw_size =  64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                # val_loss becomes a tensor after the first addition
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average across ranks, then across validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE: the torch.save call below is commented out, so `log` is built but never written to disk
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # (linear from 0.85 to 0.95 over the first 300 steps)
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # approximate: no CUDA synchronize is issued before reading the clock here
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
====================================================================================================
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
nvidia-smi:
Sun Dec 8 07:44:49 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 131W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 119W / 700W | 533MiB / 81559MiB | 2% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 122W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23346ms step_avg:nanms step:2/1480 train_time:23472ms step_avg:nanms step:3/1480 train_time:23611ms step_avg:nanms step:4/1480 train_time:23752ms step_avg:nanms step:5/1480 train_time:23894ms step_avg:nanms step:6/1480 train_time:24037ms step_avg:nanms step:7/1480 train_time:24180ms step_avg:nanms step:8/1480 train_time:24321ms step_avg:nanms step:9/1480 train_time:24465ms step_avg:nanms step:10/1480 train_time:24608ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:281ms step_avg:nanms step:13/1480 train_time:422ms step_avg:140.74ms step:14/1480 train_time:565ms step_avg:141.16ms step:15/1480 train_time:707ms step_avg:141.46ms step:16/1480 train_time:851ms step_avg:141.84ms step:17/1480 train_time:994ms step_avg:141.98ms step:18/1480 train_time:1137ms step_avg:142.07ms step:19/1480 train_time:1278ms step_avg:141.97ms step:20/1480 train_time:1418ms step_avg:141.85ms step:21/1480 train_time:1562ms step_avg:141.98ms step:22/1480 train_time:1704ms step_avg:142.04ms step:23/1480 train_time:1848ms step_avg:142.19ms step:24/1480 train_time:1991ms step_avg:142.22ms step:25/1480 train_time:2134ms step_avg:142.27ms step:26/1480 train_time:2275ms step_avg:142.21ms step:27/1480 train_time:2416ms step_avg:142.13ms step:28/1480 train_time:2558ms step_avg:142.13ms step:29/1480 
train_time:2701ms step_avg:142.16ms step:30/1480 train_time:2844ms step_avg:142.21ms step:31/1480 train_time:2988ms step_avg:142.28ms step:32/1480 train_time:3132ms step_avg:142.35ms step:33/1480 train_time:3274ms step_avg:142.34ms step:34/1480 train_time:3415ms step_avg:142.31ms step:35/1480 train_time:3560ms step_avg:142.39ms step:36/1480 train_time:3702ms step_avg:142.39ms step:37/1480 train_time:3847ms step_avg:142.47ms step:38/1480 train_time:3991ms step_avg:142.53ms step:39/1480 train_time:4134ms step_avg:142.54ms step:40/1480 train_time:4276ms step_avg:142.52ms step:41/1480 train_time:4416ms step_avg:142.45ms step:42/1480 train_time:4558ms step_avg:142.44ms step:43/1480 train_time:4700ms step_avg:142.43ms step:44/1480 train_time:4844ms step_avg:142.47ms step:45/1480 train_time:4987ms step_avg:142.48ms step:46/1480 train_time:5132ms step_avg:142.57ms step:47/1480 train_time:5276ms step_avg:142.59ms step:48/1480 train_time:5416ms step_avg:142.54ms step:49/1480 train_time:5558ms step_avg:142.53ms step:50/1480 train_time:5700ms step_avg:142.49ms step:51/1480 train_time:5843ms step_avg:142.51ms step:52/1480 train_time:5986ms step_avg:142.52ms step:53/1480 train_time:6130ms step_avg:142.56ms step:54/1480 train_time:6273ms step_avg:142.57ms step:55/1480 train_time:6414ms step_avg:142.54ms step:56/1480 train_time:6557ms step_avg:142.53ms step:57/1480 train_time:6698ms step_avg:142.50ms step:58/1480 train_time:6842ms step_avg:142.53ms step:59/1480 train_time:6984ms step_avg:142.53ms step:60/1480 train_time:7127ms step_avg:142.54ms step:61/1480 train_time:7270ms step_avg:142.56ms step:62/1480 train_time:7414ms step_avg:142.57ms step:63/1480 train_time:7556ms step_avg:142.58ms step:64/1480 train_time:7698ms step_avg:142.55ms step:65/1480 train_time:7840ms step_avg:142.55ms step:66/1480 train_time:7984ms step_avg:142.57ms step:67/1480 train_time:8130ms step_avg:142.63ms step:68/1480 train_time:8274ms step_avg:142.66ms step:69/1480 train_time:8416ms step_avg:142.65ms 
step:70/1480 train_time:8560ms step_avg:142.66ms step:71/1480 train_time:8702ms step_avg:142.65ms step:72/1480 train_time:8844ms step_avg:142.65ms step:73/1480 train_time:8987ms step_avg:142.65ms step:74/1480 train_time:9131ms step_avg:142.67ms step:75/1480 train_time:9274ms step_avg:142.68ms step:76/1480 train_time:9417ms step_avg:142.68ms step:77/1480 train_time:9559ms step_avg:142.68ms step:78/1480 train_time:9703ms step_avg:142.69ms step:79/1480 train_time:9846ms step_avg:142.70ms step:80/1480 train_time:9990ms step_avg:142.71ms step:81/1480 train_time:10133ms step_avg:142.72ms step:82/1480 train_time:10275ms step_avg:142.71ms step:83/1480 train_time:10418ms step_avg:142.71ms step:84/1480 train_time:10561ms step_avg:142.71ms step:85/1480 train_time:10703ms step_avg:142.71ms step:86/1480 train_time:10847ms step_avg:142.73ms step:87/1480 train_time:10990ms step_avg:142.72ms step:88/1480 train_time:11133ms step_avg:142.73ms step:89/1480 train_time:11276ms step_avg:142.73ms step:90/1480 train_time:11417ms step_avg:142.71ms step:91/1480 train_time:11561ms step_avg:142.72ms step:92/1480 train_time:11704ms step_avg:142.73ms step:93/1480 train_time:11848ms step_avg:142.75ms step:94/1480 train_time:11992ms step_avg:142.76ms step:95/1480 train_time:12135ms step_avg:142.76ms step:96/1480 train_time:12277ms step_avg:142.76ms step:97/1480 train_time:12419ms step_avg:142.74ms step:98/1480 train_time:12562ms step_avg:142.75ms step:99/1480 train_time:12706ms step_avg:142.77ms step:100/1480 train_time:12850ms step_avg:142.78ms step:101/1480 train_time:12992ms step_avg:142.76ms step:102/1480 train_time:13134ms step_avg:142.76ms step:103/1480 train_time:13276ms step_avg:142.75ms step:104/1480 train_time:13417ms step_avg:142.74ms step:105/1480 train_time:13560ms step_avg:142.74ms step:106/1480 train_time:13704ms step_avg:142.75ms step:107/1480 train_time:13849ms step_avg:142.78ms step:108/1480 train_time:13992ms step_avg:142.78ms step:109/1480 train_time:14134ms step_avg:142.76ms 
step:110/1480 train_time:14275ms step_avg:142.75ms step:111/1480 train_time:14418ms step_avg:142.76ms step:112/1480 train_time:14565ms step_avg:142.79ms step:113/1480 train_time:14713ms step_avg:142.84ms step:114/1480 train_time:14860ms step_avg:142.89ms step:115/1480 train_time:15007ms step_avg:142.93ms step:116/1480 train_time:15155ms step_avg:142.97ms step:117/1480 train_time:15300ms step_avg:142.99ms step:118/1480 train_time:15449ms step_avg:143.04ms step:119/1480 train_time:15596ms step_avg:143.08ms step:120/1480 train_time:15742ms step_avg:143.11ms step:121/1480 train_time:15891ms step_avg:143.16ms step:122/1480 train_time:16039ms step_avg:143.21ms step:123/1480 train_time:16187ms step_avg:143.25ms step:124/1480 train_time:16335ms step_avg:143.29ms step:125/1480 train_time:16480ms step_avg:143.31ms step:125/1480 val_loss:4.4195 train_time:16537ms step_avg:143.80ms step:126/1480 train_time:16632ms step_avg:143.38ms step:127/1480 train_time:16782ms step_avg:143.44ms step:128/1480 train_time:16928ms step_avg:143.46ms step:129/1480 train_time:17074ms step_avg:143.48ms step:130/1480 train_time:17220ms step_avg:143.50ms step:131/1480 train_time:17366ms step_avg:143.52ms step:132/1480 train_time:17512ms step_avg:143.54ms step:133/1480 train_time:17663ms step_avg:143.60ms step:134/1480 train_time:17810ms step_avg:143.63ms step:135/1480 train_time:17958ms step_avg:143.66ms step:136/1480 train_time:18105ms step_avg:143.69ms step:137/1480 train_time:18250ms step_avg:143.70ms step:138/1480 train_time:18397ms step_avg:143.73ms step:139/1480 train_time:18545ms step_avg:143.76ms step:140/1480 train_time:18693ms step_avg:143.79ms step:141/1480 train_time:18842ms step_avg:143.83ms step:142/1480 train_time:18989ms step_avg:143.85ms step:143/1480 train_time:19138ms step_avg:143.89ms step:144/1480 train_time:19284ms step_avg:143.91ms step:145/1480 train_time:19431ms step_avg:143.93ms step:146/1480 train_time:19579ms step_avg:143.96ms step:147/1480 train_time:19726ms 
step_avg:143.98ms step:148/1480 train_time:19871ms step_avg:144.00ms step:149/1480 train_time:20019ms step_avg:144.02ms step:150/1480 train_time:20166ms step_avg:144.04ms step:151/1480 train_time:20312ms step_avg:144.06ms step:152/1480 train_time:20461ms step_avg:144.09ms step:153/1480 train_time:20607ms step_avg:144.11ms step:154/1480 train_time:20756ms step_avg:144.14ms step:155/1480 train_time:20904ms step_avg:144.17ms step:156/1480 train_time:21050ms step_avg:144.18ms step:157/1480 train_time:21199ms step_avg:144.21ms step:158/1480 train_time:21345ms step_avg:144.23ms step:159/1480 train_time:21493ms step_avg:144.25ms step:160/1480 train_time:21640ms step_avg:144.27ms step:161/1480 train_time:21786ms step_avg:144.28ms step:162/1480 train_time:21933ms step_avg:144.30ms step:163/1480 train_time:22082ms step_avg:144.32ms step:164/1480 train_time:22228ms step_avg:144.33ms step:165/1480 train_time:22376ms step_avg:144.36ms step:166/1480 train_time:22524ms step_avg:144.38ms step:167/1480 train_time:22671ms step_avg:144.40ms step:168/1480 train_time:22819ms step_avg:144.42ms step:169/1480 train_time:22966ms step_avg:144.44ms step:170/1480 train_time:23112ms step_avg:144.45ms step:171/1480 train_time:23260ms step_avg:144.47ms step:172/1480 train_time:23406ms step_avg:144.48ms step:173/1480 train_time:23552ms step_avg:144.49ms step:174/1480 train_time:23700ms step_avg:144.51ms step:175/1480 train_time:23846ms step_avg:144.52ms step:176/1480 train_time:23993ms step_avg:144.53ms step:177/1480 train_time:24139ms step_avg:144.55ms step:178/1480 train_time:24287ms step_avg:144.56ms step:179/1480 train_time:24435ms step_avg:144.58ms step:180/1480 train_time:24583ms step_avg:144.60ms step:181/1480 train_time:24728ms step_avg:144.61ms step:182/1480 train_time:24875ms step_avg:144.62ms step:183/1480 train_time:25022ms step_avg:144.64ms step:184/1480 train_time:25169ms step_avg:144.65ms step:185/1480 train_time:25317ms step_avg:144.67ms step:186/1480 train_time:25465ms 
step_avg:144.69ms step:187/1480 train_time:25609ms step_avg:144.69ms step:188/1480 train_time:25757ms step_avg:144.70ms step:189/1480 train_time:25904ms step_avg:144.72ms step:190/1480 train_time:26048ms step_avg:144.71ms step:191/1480 train_time:26194ms step_avg:144.72ms step:192/1480 train_time:26343ms step_avg:144.74ms step:193/1480 train_time:26490ms step_avg:144.75ms step:194/1480 train_time:26638ms step_avg:144.77ms step:195/1480 train_time:26785ms step_avg:144.78ms step:196/1480 train_time:26931ms step_avg:144.79ms step:197/1480 train_time:27078ms step_avg:144.80ms step:198/1480 train_time:27225ms step_avg:144.81ms step:199/1480 train_time:27371ms step_avg:144.82ms step:200/1480 train_time:27519ms step_avg:144.84ms step:201/1480 train_time:27666ms step_avg:144.85ms step:202/1480 train_time:27810ms step_avg:144.85ms step:203/1480 train_time:27959ms step_avg:144.87ms step:204/1480 train_time:28106ms step_avg:144.88ms step:205/1480 train_time:28253ms step_avg:144.89ms step:206/1480 train_time:28400ms step_avg:144.90ms step:207/1480 train_time:28546ms step_avg:144.90ms step:208/1480 train_time:28693ms step_avg:144.91ms step:209/1480 train_time:28841ms step_avg:144.93ms step:210/1480 train_time:28987ms step_avg:144.94ms step:211/1480 train_time:29136ms step_avg:144.95ms step:212/1480 train_time:29284ms step_avg:144.97ms step:213/1480 train_time:29429ms step_avg:144.97ms step:214/1480 train_time:29578ms step_avg:144.99ms step:215/1480 train_time:29725ms step_avg:145.00ms step:216/1480 train_time:29872ms step_avg:145.01ms step:217/1480 train_time:30020ms step_avg:145.02ms step:218/1480 train_time:30166ms step_avg:145.03ms step:219/1480 train_time:30313ms step_avg:145.04ms step:220/1480 train_time:30461ms step_avg:145.05ms step:221/1480 train_time:30608ms step_avg:145.06ms step:222/1480 train_time:30760ms step_avg:145.10ms step:223/1480 train_time:30911ms step_avg:145.12ms step:224/1480 train_time:31062ms step_avg:145.15ms step:225/1480 train_time:31212ms 
step_avg:145.17ms step:226/1480 train_time:31365ms step_avg:145.21ms step:227/1480 train_time:31514ms step_avg:145.23ms step:228/1480 train_time:31665ms step_avg:145.25ms step:229/1480 train_time:31814ms step_avg:145.27ms step:230/1480 train_time:31965ms step_avg:145.30ms step:231/1480 train_time:32115ms step_avg:145.32ms step:232/1480 train_time:32266ms step_avg:145.34ms step:233/1480 train_time:32417ms step_avg:145.37ms step:234/1480 train_time:32569ms step_avg:145.40ms step:235/1480 train_time:32719ms step_avg:145.42ms step:236/1480 train_time:32869ms step_avg:145.44ms step:237/1480 train_time:33020ms step_avg:145.46ms step:238/1480 train_time:33170ms step_avg:145.48ms step:239/1480 train_time:33321ms step_avg:145.51ms step:240/1480 train_time:33471ms step_avg:145.52ms step:241/1480 train_time:33621ms step_avg:145.54ms step:242/1480 train_time:33771ms step_avg:145.56ms step:243/1480 train_time:33922ms step_avg:145.59ms step:244/1480 train_time:34074ms step_avg:145.61ms step:245/1480 train_time:34225ms step_avg:145.64ms step:246/1480 train_time:34375ms step_avg:145.66ms step:247/1480 train_time:34526ms step_avg:145.68ms step:248/1480 train_time:34676ms step_avg:145.70ms step:249/1480 train_time:34827ms step_avg:145.72ms step:250/1480 train_time:34979ms step_avg:145.75ms step:250/1480 val_loss:3.9955 train_time:35038ms step_avg:145.99ms step:251/1480 train_time:35137ms step_avg:145.80ms step:252/1480 train_time:35286ms step_avg:145.81ms step:253/1480 train_time:35436ms step_avg:145.83ms step:254/1480 train_time:35584ms step_avg:145.84ms step:255/1480 train_time:35733ms step_avg:145.85ms step:256/1480 train_time:35883ms step_avg:145.86ms step:257/1480 train_time:36033ms step_avg:145.88ms step:258/1480 train_time:36187ms step_avg:145.91ms step:259/1480 train_time:36339ms step_avg:145.94ms step:260/1480 train_time:36489ms step_avg:145.96ms step:261/1480 train_time:36639ms step_avg:145.97ms step:262/1480 train_time:36789ms step_avg:145.99ms step:263/1480 
train_time:36938ms step_avg:146.00ms step:264/1480 train_time:37089ms step_avg:146.02ms step:265/1480 train_time:37241ms step_avg:146.04ms step:266/1480 train_time:37393ms step_avg:146.07ms step:267/1480 train_time:37543ms step_avg:146.08ms step:268/1480 train_time:37694ms step_avg:146.10ms step:269/1480 train_time:37843ms step_avg:146.11ms step:270/1480 train_time:37994ms step_avg:146.13ms step:271/1480 train_time:38144ms step_avg:146.15ms step:272/1480 train_time:38296ms step_avg:146.17ms step:273/1480 train_time:38445ms step_avg:146.18ms step:274/1480 train_time:38596ms step_avg:146.20ms step:275/1480 train_time:38745ms step_avg:146.21ms step:276/1480 train_time:38897ms step_avg:146.23ms step:277/1480 train_time:39046ms step_avg:146.24ms step:278/1480 train_time:39197ms step_avg:146.26ms step:279/1480 train_time:39347ms step_avg:146.27ms step:280/1480 train_time:39499ms step_avg:146.29ms step:281/1480 train_time:39648ms step_avg:146.30ms step:282/1480 train_time:39799ms step_avg:146.32ms step:283/1480 train_time:39949ms step_avg:146.33ms step:284/1480 train_time:40100ms step_avg:146.35ms step:285/1480 train_time:40250ms step_avg:146.36ms step:286/1480 train_time:40402ms step_avg:146.38ms step:287/1480 train_time:40552ms step_avg:146.40ms step:288/1480 train_time:40703ms step_avg:146.41ms step:289/1480 train_time:40853ms step_avg:146.43ms step:290/1480 train_time:41004ms step_avg:146.44ms step:291/1480 train_time:41155ms step_avg:146.46ms step:292/1480 train_time:41305ms step_avg:146.47ms step:293/1480 train_time:41455ms step_avg:146.48ms step:294/1480 train_time:41607ms step_avg:146.50ms step:295/1480 train_time:41758ms step_avg:146.52ms step:296/1480 train_time:41907ms step_avg:146.53ms step:297/1480 train_time:42058ms step_avg:146.54ms step:298/1480 train_time:42209ms step_avg:146.56ms step:299/1480 train_time:42360ms step_avg:146.57ms step:300/1480 train_time:42512ms step_avg:146.59ms step:301/1480 train_time:42662ms step_avg:146.61ms step:302/1480 
train_time:42814ms step_avg:146.62ms step:303/1480 train_time:42965ms step_avg:146.64ms step:304/1480 train_time:43116ms step_avg:146.65ms step:305/1480 train_time:43265ms step_avg:146.66ms step:306/1480 train_time:43417ms step_avg:146.68ms step:307/1480 train_time:43567ms step_avg:146.69ms step:308/1480 train_time:43717ms step_avg:146.70ms step:309/1480 train_time:43867ms step_avg:146.71ms step:310/1480 train_time:44020ms step_avg:146.73ms step:311/1480 train_time:44169ms step_avg:146.74ms step:312/1480 train_time:44320ms step_avg:146.76ms step:313/1480 train_time:44470ms step_avg:146.77ms step:314/1480 train_time:44621ms step_avg:146.78ms step:315/1480 train_time:44772ms step_avg:146.79ms step:316/1480 train_time:44922ms step_avg:146.80ms step:317/1480 train_time:45074ms step_avg:146.82ms step:318/1480 train_time:45224ms step_avg:146.83ms step:319/1480 train_time:45376ms step_avg:146.85ms step:320/1480 train_time:45525ms step_avg:146.85ms step:321/1480 train_time:45676ms step_avg:146.87ms step:322/1480 train_time:45826ms step_avg:146.88ms step:323/1480 train_time:45977ms step_avg:146.89ms step:324/1480 train_time:46126ms step_avg:146.90ms step:325/1480 train_time:46277ms step_avg:146.91ms step:326/1480 train_time:46426ms step_avg:146.92ms step:327/1480 train_time:46577ms step_avg:146.93ms step:328/1480 train_time:46726ms step_avg:146.94ms step:329/1480 train_time:46877ms step_avg:146.95ms step:330/1480 train_time:47028ms step_avg:146.96ms step:331/1480 train_time:47184ms step_avg:146.99ms step:332/1480 train_time:47338ms step_avg:147.01ms step:333/1480 train_time:47492ms step_avg:147.04ms step:334/1480 train_time:47647ms step_avg:147.06ms step:335/1480 train_time:47801ms step_avg:147.08ms step:336/1480 train_time:47954ms step_avg:147.10ms step:337/1480 train_time:48109ms step_avg:147.12ms step:338/1480 train_time:48262ms step_avg:147.14ms step:339/1480 train_time:48417ms step_avg:147.16ms step:340/1480 train_time:48572ms step_avg:147.19ms step:341/1480 
train_time:48725ms step_avg:147.21ms step:342/1480 train_time:48881ms step_avg:147.23ms step:343/1480 train_time:49036ms step_avg:147.25ms step:344/1480 train_time:49191ms step_avg:147.28ms step:345/1480 train_time:49345ms step_avg:147.30ms step:346/1480 train_time:49499ms step_avg:147.32ms step:347/1480 train_time:49652ms step_avg:147.34ms step:348/1480 train_time:49805ms step_avg:147.35ms step:349/1480 train_time:49958ms step_avg:147.37ms step:350/1480 train_time:50113ms step_avg:147.39ms step:351/1480 train_time:50266ms step_avg:147.41ms step:352/1480 train_time:50421ms step_avg:147.43ms step:353/1480 train_time:50575ms step_avg:147.45ms step:354/1480 train_time:50730ms step_avg:147.47ms step:355/1480 train_time:50885ms step_avg:147.49ms step:356/1480 train_time:51039ms step_avg:147.51ms step:357/1480 train_time:51192ms step_avg:147.53ms step:358/1480 train_time:51345ms step_avg:147.54ms step:359/1480 train_time:51499ms step_avg:147.56ms step:360/1480 train_time:51655ms step_avg:147.59ms step:361/1480 train_time:51810ms step_avg:147.61ms step:362/1480 train_time:51964ms step_avg:147.63ms step:363/1480 train_time:52118ms step_avg:147.64ms step:364/1480 train_time:52272ms step_avg:147.66ms step:365/1480 train_time:52425ms step_avg:147.68ms step:366/1480 train_time:52579ms step_avg:147.69ms step:367/1480 train_time:52731ms step_avg:147.71ms step:368/1480 train_time:52885ms step_avg:147.72ms step:369/1480 train_time:53038ms step_avg:147.74ms step:370/1480 train_time:53191ms step_avg:147.75ms step:371/1480 train_time:53344ms step_avg:147.77ms step:372/1480 train_time:53498ms step_avg:147.79ms step:373/1480 train_time:53652ms step_avg:147.80ms step:374/1480 train_time:53806ms step_avg:147.82ms step:375/1480 train_time:53960ms step_avg:147.84ms step:375/1480 val_loss:3.8073 train_time:54021ms step_avg:148.00ms step:376/1480 train_time:54120ms step_avg:147.87ms step:377/1480 train_time:54275ms step_avg:147.89ms step:378/1480 train_time:54429ms step_avg:147.90ms 
step:379/1480 train_time:54582ms step_avg:147.92ms step:380/1480 train_time:54735ms step_avg:147.93ms step:381/1480 train_time:54886ms step_avg:147.94ms step:382/1480 train_time:55042ms step_avg:147.96ms step:383/1480 train_time:55198ms step_avg:147.98ms step:384/1480 train_time:55352ms step_avg:148.00ms step:385/1480 train_time:55508ms step_avg:148.02ms step:386/1480 train_time:55661ms step_avg:148.04ms step:387/1480 train_time:55816ms step_avg:148.05ms step:388/1480 train_time:55971ms step_avg:148.07ms step:389/1480 train_time:56124ms step_avg:148.09ms step:390/1480 train_time:56279ms step_avg:148.10ms step:391/1480 train_time:56432ms step_avg:148.11ms step:392/1480 train_time:56586ms step_avg:148.13ms step:393/1480 train_time:56740ms step_avg:148.15ms step:394/1480 train_time:56893ms step_avg:148.16ms step:395/1480 train_time:57047ms step_avg:148.17ms step:396/1480 train_time:57201ms step_avg:148.19ms step:397/1480 train_time:57356ms step_avg:148.21ms step:398/1480 train_time:57510ms step_avg:148.22ms step:399/1480 train_time:57663ms step_avg:148.23ms step:400/1480 train_time:57817ms step_avg:148.25ms step:401/1480 train_time:57971ms step_avg:148.26ms step:402/1480 train_time:58125ms step_avg:148.28ms step:403/1480 train_time:58280ms step_avg:148.30ms step:404/1480 train_time:58435ms step_avg:148.31ms step:405/1480 train_time:58590ms step_avg:148.33ms step:406/1480 train_time:58743ms step_avg:148.34ms step:407/1480 train_time:58898ms step_avg:148.36ms step:408/1480 train_time:59050ms step_avg:148.37ms step:409/1480 train_time:59202ms step_avg:148.38ms step:410/1480 train_time:59355ms step_avg:148.39ms step:411/1480 train_time:59509ms step_avg:148.40ms step:412/1480 train_time:59663ms step_avg:148.41ms step:413/1480 train_time:59817ms step_avg:148.43ms step:414/1480 train_time:59971ms step_avg:148.44ms step:415/1480 train_time:60125ms step_avg:148.46ms step:416/1480 train_time:60279ms step_avg:148.47ms step:417/1480 train_time:60433ms step_avg:148.48ms 
step:418/1480 train_time:60587ms step_avg:148.50ms step:419/1480 train_time:60741ms step_avg:148.51ms step:420/1480 train_time:60894ms step_avg:148.52ms step:421/1480 train_time:61048ms step_avg:148.53ms step:422/1480 train_time:61201ms step_avg:148.55ms step:423/1480 train_time:61354ms step_avg:148.56ms step:424/1480 train_time:61509ms step_avg:148.57ms step:425/1480 train_time:61662ms step_avg:148.58ms step:426/1480 train_time:61815ms step_avg:148.59ms step:427/1480 train_time:61969ms step_avg:148.61ms step:428/1480 train_time:62122ms step_avg:148.62ms step:429/1480 train_time:62275ms step_avg:148.63ms step:430/1480 train_time:62428ms step_avg:148.64ms step:431/1480 train_time:62583ms step_avg:148.65ms step:432/1480 train_time:62736ms step_avg:148.66ms step:433/1480 train_time:62890ms step_avg:148.68ms step:434/1480 train_time:63044ms step_avg:148.69ms step:435/1480 train_time:63198ms step_avg:148.70ms step:436/1480 train_time:63352ms step_avg:148.71ms step:437/1480 train_time:63506ms step_avg:148.72ms step:438/1480 train_time:63660ms step_avg:148.74ms step:439/1480 train_time:63814ms step_avg:148.75ms step:440/1480 train_time:63968ms step_avg:148.76ms step:441/1480 train_time:64127ms step_avg:148.79ms step:442/1480 train_time:64284ms step_avg:148.81ms step:443/1480 train_time:64440ms step_avg:148.82ms step:444/1480 train_time:64597ms step_avg:148.84ms step:445/1480 train_time:64753ms step_avg:148.86ms step:446/1480 train_time:64909ms step_avg:148.87ms step:447/1480 train_time:65064ms step_avg:148.89ms step:448/1480 train_time:65222ms step_avg:148.91ms step:449/1480 train_time:65380ms step_avg:148.93ms step:450/1480 train_time:65538ms step_avg:148.95ms step:451/1480 train_time:65696ms step_avg:148.97ms step:452/1480 train_time:65853ms step_avg:148.99ms step:453/1480 train_time:66009ms step_avg:149.01ms step:454/1480 train_time:66166ms step_avg:149.02ms step:455/1480 train_time:66322ms step_avg:149.04ms step:456/1480 train_time:66479ms step_avg:149.06ms 
step:457/1480 train_time:66634ms step_avg:149.07ms step:458/1480 train_time:66789ms step_avg:149.08ms step:459/1480 train_time:66946ms step_avg:149.10ms step:460/1480 train_time:67103ms step_avg:149.12ms step:461/1480 train_time:67261ms step_avg:149.14ms step:462/1480 train_time:67419ms step_avg:149.16ms step:463/1480 train_time:67577ms step_avg:149.18ms step:464/1480 train_time:67734ms step_avg:149.19ms step:465/1480 train_time:67888ms step_avg:149.21ms step:466/1480 train_time:68045ms step_avg:149.22ms step:467/1480 train_time:68203ms step_avg:149.24ms step:468/1480 train_time:68360ms step_avg:149.26ms step:469/1480 train_time:68518ms step_avg:149.28ms step:470/1480 train_time:68676ms step_avg:149.30ms step:471/1480 train_time:68833ms step_avg:149.31ms step:472/1480 train_time:68991ms step_avg:149.33ms step:473/1480 train_time:69148ms step_avg:149.35ms step:474/1480 train_time:69304ms step_avg:149.36ms step:475/1480 train_time:69459ms step_avg:149.38ms step:476/1480 train_time:69618ms step_avg:149.39ms step:477/1480 train_time:69776ms step_avg:149.41ms step:478/1480 train_time:69933ms step_avg:149.43ms step:479/1480 train_time:70091ms step_avg:149.45ms step:480/1480 train_time:70247ms step_avg:149.46ms step:481/1480 train_time:70403ms step_avg:149.48ms step:482/1480 train_time:70559ms step_avg:149.49ms step:483/1480 train_time:70716ms step_avg:149.51ms step:484/1480 train_time:70875ms step_avg:149.53ms step:485/1480 train_time:71033ms step_avg:149.54ms step:486/1480 train_time:71191ms step_avg:149.56ms step:487/1480 train_time:71347ms step_avg:149.57ms step:488/1480 train_time:71503ms step_avg:149.59ms step:489/1480 train_time:71658ms step_avg:149.60ms step:490/1480 train_time:71814ms step_avg:149.61ms step:491/1480 train_time:71971ms step_avg:149.63ms step:492/1480 train_time:72128ms step_avg:149.64ms step:493/1480 train_time:72285ms step_avg:149.66ms step:494/1480 train_time:72442ms step_avg:149.67ms step:495/1480 train_time:72600ms step_avg:149.69ms 
step:496/1480 train_time:72757ms step_avg:149.71ms step:497/1480 train_time:72914ms step_avg:149.72ms step:498/1480 train_time:73070ms step_avg:149.73ms step:499/1480 train_time:73228ms step_avg:149.75ms step:500/1480 train_time:73385ms step_avg:149.76ms step:500/1480 val_loss:3.6853 train_time:73446ms step_avg:149.89ms step:501/1480 train_time:73546ms step_avg:149.79ms step:502/1480 train_time:73706ms step_avg:149.81ms step:503/1480 train_time:73863ms step_avg:149.82ms step:504/1480 train_time:74019ms step_avg:149.84ms step:505/1480 train_time:74175ms step_avg:149.85ms step:506/1480 train_time:74331ms step_avg:149.86ms step:507/1480 train_time:74487ms step_avg:149.87ms step:508/1480 train_time:74647ms step_avg:149.89ms step:509/1480 train_time:74805ms step_avg:149.91ms step:510/1480 train_time:74962ms step_avg:149.92ms step:511/1480 train_time:75119ms step_avg:149.94ms step:512/1480 train_time:75277ms step_avg:149.95ms step:513/1480 train_time:75433ms step_avg:149.97ms step:514/1480 train_time:75590ms step_avg:149.98ms step:515/1480 train_time:75748ms step_avg:150.00ms step:516/1480 train_time:75908ms step_avg:150.02ms step:517/1480 train_time:76067ms step_avg:150.03ms step:518/1480 train_time:76223ms step_avg:150.05ms step:519/1480 train_time:76380ms step_avg:150.06ms step:520/1480 train_time:76537ms step_avg:150.07ms step:521/1480 train_time:76693ms step_avg:150.08ms step:522/1480 train_time:76851ms step_avg:150.10ms step:523/1480 train_time:77011ms step_avg:150.12ms step:524/1480 train_time:77169ms step_avg:150.13ms step:525/1480 train_time:77326ms step_avg:150.15ms step:526/1480 train_time:77485ms step_avg:150.16ms step:527/1480 train_time:77641ms step_avg:150.18ms step:528/1480 train_time:77795ms step_avg:150.18ms step:529/1480 train_time:77952ms step_avg:150.20ms step:530/1480 train_time:78109ms step_avg:150.21ms step:531/1480 train_time:78267ms step_avg:150.22ms step:532/1480 train_time:78424ms step_avg:150.24ms step:533/1480 train_time:78580ms 
step_avg:150.25ms step:534/1480 train_time:78736ms step_avg:150.26ms step:535/1480 train_time:78892ms step_avg:150.27ms step:536/1480 train_time:79051ms step_avg:150.29ms step:537/1480 train_time:79207ms step_avg:150.30ms step:538/1480 train_time:79366ms step_avg:150.31ms step:539/1480 train_time:79524ms step_avg:150.33ms step:540/1480 train_time:79682ms step_avg:150.34ms step:541/1480 train_time:79838ms step_avg:150.35ms step:542/1480 train_time:79994ms step_avg:150.37ms step:543/1480 train_time:80149ms step_avg:150.37ms step:544/1480 train_time:80307ms step_avg:150.39ms step:545/1480 train_time:80464ms step_avg:150.40ms step:546/1480 train_time:80619ms step_avg:150.41ms step:547/1480 train_time:80775ms step_avg:150.42ms step:548/1480 train_time:80932ms step_avg:150.43ms step:549/1480 train_time:81089ms step_avg:150.44ms step:550/1480 train_time:81247ms step_avg:150.46ms step:551/1480 train_time:81405ms step_avg:150.47ms step:552/1480 train_time:81565ms step_avg:150.49ms step:553/1480 train_time:81726ms step_avg:150.51ms step:554/1480 train_time:81887ms step_avg:150.53ms step:555/1480 train_time:82047ms step_avg:150.55ms step:556/1480 train_time:82206ms step_avg:150.56ms step:557/1480 train_time:82368ms step_avg:150.58ms step:558/1480 train_time:82528ms step_avg:150.60ms step:559/1480 train_time:82688ms step_avg:150.62ms step:560/1480 train_time:82848ms step_avg:150.63ms step:561/1480 train_time:83008ms step_avg:150.65ms step:562/1480 train_time:83168ms step_avg:150.67ms step:563/1480 train_time:83327ms step_avg:150.68ms step:564/1480 train_time:83487ms step_avg:150.70ms step:565/1480 train_time:83647ms step_avg:150.71ms step:566/1480 train_time:83808ms step_avg:150.73ms step:567/1480 train_time:83967ms step_avg:150.75ms step:568/1480 train_time:84126ms step_avg:150.76ms step:569/1480 train_time:84285ms step_avg:150.78ms step:570/1480 train_time:84444ms step_avg:150.79ms step:571/1480 train_time:84603ms step_avg:150.81ms step:572/1480 train_time:84763ms 
step_avg:150.82ms step:573/1480 train_time:84923ms step_avg:150.84ms step:574/1480 train_time:85085ms step_avg:150.86ms step:575/1480 train_time:85245ms step_avg:150.88ms step:576/1480 train_time:85404ms step_avg:150.89ms step:577/1480 train_time:85563ms step_avg:150.90ms step:578/1480 train_time:85722ms step_avg:150.92ms step:579/1480 train_time:85882ms step_avg:150.93ms step:580/1480 train_time:86040ms step_avg:150.95ms step:581/1480 train_time:86200ms step_avg:150.96ms step:582/1480 train_time:86361ms step_avg:150.98ms step:583/1480 train_time:86521ms step_avg:151.00ms step:584/1480 train_time:86679ms step_avg:151.01ms step:585/1480 train_time:86837ms step_avg:151.02ms step:586/1480 train_time:86996ms step_avg:151.03ms step:587/1480 train_time:87153ms step_avg:151.05ms step:588/1480 train_time:87314ms step_avg:151.06ms step:589/1480 train_time:87473ms step_avg:151.08ms step:590/1480 train_time:87632ms step_avg:151.09ms step:591/1480 train_time:87790ms step_avg:151.10ms step:592/1480 train_time:87949ms step_avg:151.12ms step:593/1480 train_time:88110ms step_avg:151.13ms step:594/1480 train_time:88270ms step_avg:151.15ms step:595/1480 train_time:88431ms step_avg:151.16ms step:596/1480 train_time:88593ms step_avg:151.18ms step:597/1480 train_time:88751ms step_avg:151.19ms step:598/1480 train_time:88910ms step_avg:151.21ms step:599/1480 train_time:89069ms step_avg:151.22ms step:600/1480 train_time:89229ms step_avg:151.23ms step:601/1480 train_time:89388ms step_avg:151.25ms step:602/1480 train_time:89549ms step_avg:151.26ms step:603/1480 train_time:89710ms step_avg:151.28ms step:604/1480 train_time:89870ms step_avg:151.30ms step:605/1480 train_time:90029ms step_avg:151.31ms step:606/1480 train_time:90190ms step_avg:151.33ms step:607/1480 train_time:90351ms step_avg:151.34ms step:608/1480 train_time:90511ms step_avg:151.36ms step:609/1480 train_time:90671ms step_avg:151.37ms step:610/1480 train_time:90830ms step_avg:151.38ms step:611/1480 train_time:90990ms 
step_avg:151.40ms step:612/1480 train_time:91150ms step_avg:151.41ms step:613/1480 train_time:91310ms step_avg:151.43ms step:614/1480 train_time:91470ms step_avg:151.44ms step:615/1480 train_time:91630ms step_avg:151.45ms step:616/1480 train_time:91789ms step_avg:151.47ms step:617/1480 train_time:91948ms step_avg:151.48ms step:618/1480 train_time:92107ms step_avg:151.49ms step:619/1480 train_time:92268ms step_avg:151.51ms step:620/1480 train_time:92427ms step_avg:151.52ms step:621/1480 train_time:92588ms step_avg:151.54ms step:622/1480 train_time:92748ms step_avg:151.55ms step:623/1480 train_time:92910ms step_avg:151.57ms step:624/1480 train_time:93070ms step_avg:151.58ms step:625/1480 train_time:93229ms step_avg:151.59ms step:625/1480 val_loss:3.6049 train_time:93291ms step_avg:151.69ms step:626/1480 train_time:93391ms step_avg:151.61ms step:627/1480 train_time:93552ms step_avg:151.62ms step:628/1480 train_time:93711ms step_avg:151.64ms step:629/1480 train_time:93868ms step_avg:151.64ms step:630/1480 train_time:94025ms step_avg:151.65ms step:631/1480 train_time:94184ms step_avg:151.66ms step:632/1480 train_time:94344ms step_avg:151.68ms step:633/1480 train_time:94503ms step_avg:151.69ms step:634/1480 train_time:94662ms step_avg:151.70ms step:635/1480 train_time:94820ms step_avg:151.71ms step:636/1480 train_time:94980ms step_avg:151.73ms step:637/1480 train_time:95140ms step_avg:151.74ms step:638/1480 train_time:95299ms step_avg:151.75ms step:639/1480 train_time:95459ms step_avg:151.76ms step:640/1480 train_time:95619ms step_avg:151.78ms step:641/1480 train_time:95778ms step_avg:151.79ms step:642/1480 train_time:95937ms step_avg:151.80ms step:643/1480 train_time:96098ms step_avg:151.81ms step:644/1480 train_time:96257ms step_avg:151.82ms step:645/1480 train_time:96417ms step_avg:151.84ms step:646/1480 train_time:96577ms step_avg:151.85ms step:647/1480 train_time:96737ms step_avg:151.86ms step:648/1480 train_time:96898ms step_avg:151.88ms step:649/1480 
train_time:97058ms step_avg:151.89ms step:650/1480 train_time:97218ms step_avg:151.90ms step:651/1480 train_time:97378ms step_avg:151.92ms step:652/1480 train_time:97539ms step_avg:151.93ms step:653/1480 train_time:97698ms step_avg:151.94ms step:654/1480 train_time:97857ms step_avg:151.95ms step:655/1480 train_time:98017ms step_avg:151.96ms step:656/1480 train_time:98178ms step_avg:151.98ms step:657/1480 train_time:98338ms step_avg:151.99ms step:658/1480 train_time:98498ms step_avg:152.00ms step:659/1480 train_time:98660ms step_avg:152.02ms step:660/1480 train_time:98821ms step_avg:152.03ms step:661/1480 train_time:98984ms step_avg:152.05ms step:662/1480 train_time:99143ms step_avg:152.06ms step:663/1480 train_time:99302ms step_avg:152.07ms step:664/1480 train_time:99464ms step_avg:152.09ms step:665/1480 train_time:99626ms step_avg:152.10ms step:666/1480 train_time:99785ms step_avg:152.11ms step:667/1480 train_time:99946ms step_avg:152.13ms step:668/1480 train_time:100107ms step_avg:152.14ms step:669/1480 train_time:100271ms step_avg:152.16ms step:670/1480 train_time:100431ms step_avg:152.17ms step:671/1480 train_time:100591ms step_avg:152.18ms step:672/1480 train_time:100753ms step_avg:152.20ms step:673/1480 train_time:100917ms step_avg:152.21ms step:674/1480 train_time:101080ms step_avg:152.23ms step:675/1480 train_time:101242ms step_avg:152.24ms step:676/1480 train_time:101404ms step_avg:152.26ms step:677/1480 train_time:101565ms step_avg:152.27ms step:678/1480 train_time:101726ms step_avg:152.29ms step:679/1480 train_time:101886ms step_avg:152.30ms step:680/1480 train_time:102047ms step_avg:152.31ms step:681/1480 train_time:102208ms step_avg:152.32ms step:682/1480 train_time:102374ms step_avg:152.34ms step:683/1480 train_time:102537ms step_avg:152.36ms step:684/1480 train_time:102699ms step_avg:152.37ms step:685/1480 train_time:102862ms step_avg:152.39ms step:686/1480 train_time:103022ms step_avg:152.40ms step:687/1480 train_time:103183ms step_avg:152.41ms 
step:688/1480 train_time:103345ms step_avg:152.43ms step:689/1480 train_time:103508ms step_avg:152.44ms step:690/1480 train_time:103672ms step_avg:152.46ms step:691/1480 train_time:103834ms step_avg:152.47ms step:692/1480 train_time:103995ms step_avg:152.48ms step:693/1480 train_time:104158ms step_avg:152.50ms step:694/1480 train_time:104320ms step_avg:152.51ms step:695/1480 train_time:104481ms step_avg:152.53ms step:696/1480 train_time:104641ms step_avg:152.54ms step:697/1480 train_time:104804ms step_avg:152.55ms step:698/1480 train_time:104964ms step_avg:152.56ms step:699/1480 train_time:105125ms step_avg:152.58ms step:700/1480 train_time:105286ms step_avg:152.59ms step:701/1480 train_time:105446ms step_avg:152.60ms step:702/1480 train_time:105607ms step_avg:152.61ms step:703/1480 train_time:105767ms step_avg:152.62ms step:704/1480 train_time:105928ms step_avg:152.63ms step:705/1480 train_time:106092ms step_avg:152.65ms step:706/1480 train_time:106256ms step_avg:152.67ms step:707/1480 train_time:106418ms step_avg:152.68ms step:708/1480 train_time:106579ms step_avg:152.69ms step:709/1480 train_time:106741ms step_avg:152.71ms step:710/1480 train_time:106901ms step_avg:152.72ms step:711/1480 train_time:107062ms step_avg:152.73ms step:712/1480 train_time:107226ms step_avg:152.74ms step:713/1480 train_time:107390ms step_avg:152.76ms step:714/1480 train_time:107551ms step_avg:152.77ms step:715/1480 train_time:107712ms step_avg:152.78ms step:716/1480 train_time:107876ms step_avg:152.80ms step:717/1480 train_time:108040ms step_avg:152.81ms step:718/1480 train_time:108200ms step_avg:152.82ms step:719/1480 train_time:108360ms step_avg:152.84ms step:720/1480 train_time:108521ms step_avg:152.85ms step:721/1480 train_time:108682ms step_avg:152.86ms step:722/1480 train_time:108843ms step_avg:152.87ms step:723/1480 train_time:109003ms step_avg:152.88ms step:724/1480 train_time:109166ms step_avg:152.89ms step:725/1480 train_time:109329ms step_avg:152.91ms step:726/1480 
train_time:109494ms step_avg:152.92ms step:727/1480 train_time:109658ms step_avg:152.94ms step:728/1480 train_time:109819ms step_avg:152.95ms step:729/1480 train_time:109981ms step_avg:152.96ms step:730/1480 train_time:110143ms step_avg:152.98ms step:731/1480 train_time:110303ms step_avg:152.99ms step:732/1480 train_time:110463ms step_avg:153.00ms step:733/1480 train_time:110623ms step_avg:153.01ms step:734/1480 train_time:110787ms step_avg:153.02ms step:735/1480 train_time:110948ms step_avg:153.03ms step:736/1480 train_time:111109ms step_avg:153.04ms step:737/1480 train_time:111270ms step_avg:153.05ms step:738/1480 train_time:111431ms step_avg:153.06ms step:739/1480 train_time:111592ms step_avg:153.08ms step:740/1480 train_time:111759ms step_avg:153.09ms step:741/1480 train_time:111922ms step_avg:153.11ms step:742/1480 train_time:112084ms step_avg:153.12ms step:743/1480 train_time:112245ms step_avg:153.13ms step:744/1480 train_time:112408ms step_avg:153.14ms step:745/1480 train_time:112573ms step_avg:153.16ms step:746/1480 train_time:112735ms step_avg:153.17ms step:747/1480 train_time:112895ms step_avg:153.18ms step:748/1480 train_time:113061ms step_avg:153.20ms step:749/1480 train_time:113223ms step_avg:153.21ms step:750/1480 train_time:113383ms step_avg:153.22ms step:750/1480 val_loss:3.5501 train_time:113447ms step_avg:153.31ms step:751/1480 train_time:113549ms step_avg:153.24ms step:752/1480 train_time:113709ms step_avg:153.25ms step:753/1480 train_time:113869ms step_avg:153.26ms step:754/1480 train_time:114029ms step_avg:153.27ms step:755/1480 train_time:114191ms step_avg:153.28ms step:756/1480 train_time:114352ms step_avg:153.29ms step:757/1480 train_time:114515ms step_avg:153.30ms step:758/1480 train_time:114675ms step_avg:153.31ms step:759/1480 train_time:114839ms step_avg:153.32ms step:760/1480 train_time:115000ms step_avg:153.33ms step:761/1480 train_time:115165ms step_avg:153.35ms step:762/1480 train_time:115326ms step_avg:153.36ms step:763/1480 
train_time:115487ms step_avg:153.37ms step:764/1480 train_time:115649ms step_avg:153.38ms step:765/1480 train_time:115810ms step_avg:153.39ms step:766/1480 train_time:115975ms step_avg:153.41ms step:767/1480 train_time:116138ms step_avg:153.42ms step:768/1480 train_time:116302ms step_avg:153.43ms step:769/1480 train_time:116465ms step_avg:153.45ms step:770/1480 train_time:116628ms step_avg:153.46ms step:771/1480 train_time:116790ms step_avg:153.47ms step:772/1480 train_time:116952ms step_avg:153.48ms step:773/1480 train_time:117114ms step_avg:153.49ms step:774/1480 train_time:117274ms step_avg:153.50ms step:775/1480 train_time:117436ms step_avg:153.51ms step:776/1480 train_time:117602ms step_avg:153.53ms step:777/1480 train_time:117769ms step_avg:153.54ms step:778/1480 train_time:117931ms step_avg:153.56ms step:779/1480 train_time:118095ms step_avg:153.57ms step:780/1480 train_time:118259ms step_avg:153.58ms step:781/1480 train_time:118423ms step_avg:153.60ms step:782/1480 train_time:118586ms step_avg:153.61ms step:783/1480 train_time:118747ms step_avg:153.62ms step:784/1480 train_time:118910ms step_avg:153.63ms step:785/1480 train_time:119071ms step_avg:153.64ms step:786/1480 train_time:119236ms step_avg:153.66ms step:787/1480 train_time:119401ms step_avg:153.67ms step:788/1480 train_time:119565ms step_avg:153.68ms step:789/1480 train_time:119726ms step_avg:153.69ms step:790/1480 train_time:119890ms step_avg:153.70ms step:791/1480 train_time:120059ms step_avg:153.72ms step:792/1480 train_time:120225ms step_avg:153.74ms step:793/1480 train_time:120387ms step_avg:153.75ms step:794/1480 train_time:120551ms step_avg:153.76ms step:795/1480 train_time:120716ms step_avg:153.78ms step:796/1480 train_time:120884ms step_avg:153.80ms step:797/1480 train_time:121048ms step_avg:153.81ms step:798/1480 train_time:121211ms step_avg:153.82ms step:799/1480 train_time:121378ms step_avg:153.84ms step:800/1480 train_time:121543ms step_avg:153.85ms step:801/1480 train_time:121706ms 
step_avg:153.86ms step:802/1480 train_time:121872ms step_avg:153.88ms step:803/1480 train_time:122033ms step_avg:153.89ms step:804/1480 train_time:122196ms step_avg:153.90ms step:805/1480 train_time:122361ms step_avg:153.91ms step:806/1480 train_time:122523ms step_avg:153.92ms step:807/1480 train_time:122684ms step_avg:153.93ms step:808/1480 train_time:122848ms step_avg:153.94ms step:809/1480 train_time:123009ms step_avg:153.95ms step:810/1480 train_time:123171ms step_avg:153.96ms step:811/1480 train_time:123333ms step_avg:153.97ms step:812/1480 train_time:123497ms step_avg:153.99ms step:813/1480 train_time:123659ms step_avg:154.00ms step:814/1480 train_time:123823ms step_avg:154.01ms step:815/1480 train_time:123985ms step_avg:154.02ms step:816/1480 train_time:124150ms step_avg:154.03ms step:817/1480 train_time:124312ms step_avg:154.04ms step:818/1480 train_time:124473ms step_avg:154.05ms step:819/1480 train_time:124639ms step_avg:154.07ms step:820/1480 train_time:124805ms step_avg:154.08ms step:821/1480 train_time:124966ms step_avg:154.09ms step:822/1480 train_time:125129ms step_avg:154.10ms step:823/1480 train_time:125291ms step_avg:154.11ms step:824/1480 train_time:125454ms step_avg:154.12ms step:825/1480 train_time:125620ms step_avg:154.13ms step:826/1480 train_time:125787ms step_avg:154.15ms step:827/1480 train_time:125951ms step_avg:154.16ms step:828/1480 train_time:126114ms step_avg:154.17ms step:829/1480 train_time:126277ms step_avg:154.18ms step:830/1480 train_time:126444ms step_avg:154.20ms step:831/1480 train_time:126608ms step_avg:154.21ms step:832/1480 train_time:126770ms step_avg:154.22ms step:833/1480 train_time:126935ms step_avg:154.23ms step:834/1480 train_time:127100ms step_avg:154.25ms step:835/1480 train_time:127265ms step_avg:154.26ms step:836/1480 train_time:127430ms step_avg:154.27ms step:837/1480 train_time:127591ms step_avg:154.28ms step:838/1480 train_time:127754ms step_avg:154.29ms step:839/1480 train_time:127916ms step_avg:154.30ms 
step:840/1480 train_time:128078ms step_avg:154.31ms step:841/1480 train_time:128240ms step_avg:154.32ms step:842/1480 train_time:128406ms step_avg:154.33ms step:843/1480 train_time:128569ms step_avg:154.34ms step:844/1480 train_time:128731ms step_avg:154.35ms step:845/1480 train_time:128894ms step_avg:154.36ms step:846/1480 train_time:129059ms step_avg:154.38ms step:847/1480 train_time:129224ms step_avg:154.39ms step:848/1480 train_time:129386ms step_avg:154.40ms step:849/1480 train_time:129549ms step_avg:154.41ms step:850/1480 train_time:129712ms step_avg:154.42ms step:851/1480 train_time:129876ms step_avg:154.43ms step:852/1480 train_time:130037ms step_avg:154.44ms step:853/1480 train_time:130201ms step_avg:154.45ms step:854/1480 train_time:130365ms step_avg:154.46ms step:855/1480 train_time:130528ms step_avg:154.47ms step:856/1480 train_time:130689ms step_avg:154.48ms step:857/1480 train_time:130855ms step_avg:154.49ms step:858/1480 train_time:131022ms step_avg:154.51ms step:859/1480 train_time:131187ms step_avg:154.52ms step:860/1480 train_time:131348ms step_avg:154.53ms step:861/1480 train_time:131513ms step_avg:154.54ms step:862/1480 train_time:131683ms step_avg:154.56ms step:863/1480 train_time:131851ms step_avg:154.57ms step:864/1480 train_time:132015ms step_avg:154.58ms step:865/1480 train_time:132176ms step_avg:154.59ms step:866/1480 train_time:132346ms step_avg:154.61ms step:867/1480 train_time:132508ms step_avg:154.62ms step:868/1480 train_time:132669ms step_avg:154.63ms step:869/1480 train_time:132832ms step_avg:154.64ms step:870/1480 train_time:132997ms step_avg:154.65ms step:871/1480 train_time:133162ms step_avg:154.66ms step:872/1480 train_time:133326ms step_avg:154.67ms step:873/1480 train_time:133488ms step_avg:154.68ms step:874/1480 train_time:133653ms step_avg:154.69ms step:875/1480 train_time:133818ms step_avg:154.70ms step:875/1480 val_loss:3.5021 train_time:133883ms step_avg:154.78ms step:876/1480 train_time:133983ms step_avg:154.72ms 
step:877/1480 train_time:134150ms step_avg:154.73ms step:878/1480 train_time:134312ms step_avg:154.74ms step:879/1480 train_time:134476ms step_avg:154.75ms step:880/1480 train_time:134638ms step_avg:154.76ms step:881/1480 train_time:134800ms step_avg:154.76ms step:882/1480 train_time:134966ms step_avg:154.78ms step:883/1480 train_time:135133ms step_avg:154.79ms step:884/1480 train_time:135299ms step_avg:154.80ms step:885/1480 train_time:135466ms step_avg:154.82ms step:886/1480 train_time:135631ms step_avg:154.83ms step:887/1480 train_time:135798ms step_avg:154.84ms step:888/1480 train_time:135971ms step_avg:154.86ms step:889/1480 train_time:136139ms step_avg:154.88ms step:890/1480 train_time:136302ms step_avg:154.89ms step:891/1480 train_time:136469ms step_avg:154.90ms step:892/1480 train_time:136634ms step_avg:154.91ms step:893/1480 train_time:136795ms step_avg:154.92ms step:894/1480 train_time:136963ms step_avg:154.94ms step:895/1480 train_time:137130ms step_avg:154.95ms step:896/1480 train_time:137295ms step_avg:154.96ms step:897/1480 train_time:137462ms step_avg:154.97ms step:898/1480 train_time:137630ms step_avg:154.99ms step:899/1480 train_time:137794ms step_avg:155.00ms step:900/1480 train_time:137957ms step_avg:155.01ms step:901/1480 train_time:138121ms step_avg:155.02ms step:902/1480 train_time:138286ms step_avg:155.03ms step:903/1480 train_time:138458ms step_avg:155.05ms step:904/1480 train_time:138624ms step_avg:155.06ms step:905/1480 train_time:138787ms step_avg:155.07ms step:906/1480 train_time:138954ms step_avg:155.08ms step:907/1480 train_time:139122ms step_avg:155.10ms step:908/1480 train_time:139284ms step_avg:155.10ms step:909/1480 train_time:139449ms step_avg:155.12ms step:910/1480 train_time:139617ms step_avg:155.13ms step:911/1480 train_time:139782ms step_avg:155.14ms step:912/1480 train_time:139949ms step_avg:155.15ms step:913/1480 train_time:140116ms step_avg:155.17ms step:914/1480 train_time:140283ms step_avg:155.18ms step:915/1480 
train_time:140453ms step_avg:155.20ms step:916/1480 train_time:140615ms step_avg:155.20ms step:917/1480 train_time:140779ms step_avg:155.21ms step:918/1480 train_time:140949ms step_avg:155.23ms step:919/1480 train_time:141118ms step_avg:155.25ms step:920/1480 train_time:141282ms step_avg:155.26ms step:921/1480 train_time:141450ms step_avg:155.27ms step:922/1480 train_time:141616ms step_avg:155.28ms step:923/1480 train_time:141780ms step_avg:155.29ms step:924/1480 train_time:141945ms step_avg:155.30ms step:925/1480 train_time:142112ms step_avg:155.31ms step:926/1480 train_time:142274ms step_avg:155.32ms step:927/1480 train_time:142438ms step_avg:155.33ms step:928/1480 train_time:142605ms step_avg:155.34ms step:929/1480 train_time:142771ms step_avg:155.35ms step:930/1480 train_time:142935ms step_avg:155.36ms step:931/1480 train_time:143098ms step_avg:155.37ms step:932/1480 train_time:143266ms step_avg:155.39ms step:933/1480 train_time:143433ms step_avg:155.40ms step:934/1480 train_time:143600ms step_avg:155.41ms step:935/1480 train_time:143773ms step_avg:155.43ms step:936/1480 train_time:143939ms step_avg:155.44ms step:937/1480 train_time:144110ms step_avg:155.46ms step:938/1480 train_time:144272ms step_avg:155.47ms step:939/1480 train_time:144441ms step_avg:155.48ms step:940/1480 train_time:144607ms step_avg:155.49ms step:941/1480 train_time:144771ms step_avg:155.50ms step:942/1480 train_time:144936ms step_avg:155.51ms step:943/1480 train_time:145106ms step_avg:155.53ms step:944/1480 train_time:145277ms step_avg:155.54ms step:945/1480 train_time:145441ms step_avg:155.55ms step:946/1480 train_time:145611ms step_avg:155.57ms step:947/1480 train_time:145778ms step_avg:155.58ms step:948/1480 train_time:145944ms step_avg:155.59ms step:949/1480 train_time:146110ms step_avg:155.60ms step:950/1480 train_time:146273ms step_avg:155.61ms step:951/1480 train_time:146441ms step_avg:155.62ms step:952/1480 train_time:146607ms step_avg:155.63ms step:953/1480 train_time:146776ms 
step_avg:155.65ms step:954/1480 train_time:146945ms step_avg:155.66ms step:955/1480 train_time:147109ms step_avg:155.67ms step:956/1480 train_time:147274ms step_avg:155.68ms step:957/1480 train_time:147441ms step_avg:155.69ms step:958/1480 train_time:147612ms step_avg:155.71ms step:959/1480 train_time:147776ms step_avg:155.72ms step:960/1480 train_time:147944ms step_avg:155.73ms step:961/1480 train_time:148109ms step_avg:155.74ms step:962/1480 train_time:148274ms step_avg:155.75ms step:963/1480 train_time:148438ms step_avg:155.76ms step:964/1480 train_time:148605ms step_avg:155.77ms step:965/1480 train_time:148770ms step_avg:155.78ms step:966/1480 train_time:148933ms step_avg:155.79ms step:967/1480 train_time:149097ms step_avg:155.80ms step:968/1480 train_time:149264ms step_avg:155.81ms step:969/1480 train_time:149430ms step_avg:155.82ms step:970/1480 train_time:149595ms step_avg:155.83ms step:971/1480 train_time:149760ms step_avg:155.84ms step:972/1480 train_time:149925ms step_avg:155.85ms step:973/1480 train_time:150090ms step_avg:155.86ms step:974/1480 train_time:150257ms step_avg:155.87ms step:975/1480 train_time:150421ms step_avg:155.88ms step:976/1480 train_time:150587ms step_avg:155.89ms step:977/1480 train_time:150752ms step_avg:155.90ms step:978/1480 train_time:150917ms step_avg:155.91ms step:979/1480 train_time:151082ms step_avg:155.91ms step:980/1480 train_time:151248ms step_avg:155.93ms step:981/1480 train_time:151414ms step_avg:155.94ms step:982/1480 train_time:151577ms step_avg:155.94ms step:983/1480 train_time:151744ms step_avg:155.96ms step:984/1480 train_time:151909ms step_avg:155.96ms step:985/1480 train_time:152075ms step_avg:155.97ms step:986/1480 train_time:152239ms step_avg:155.98ms step:987/1480 train_time:152403ms step_avg:155.99ms step:988/1480 train_time:152572ms step_avg:156.00ms step:989/1480 train_time:152735ms step_avg:156.01ms step:990/1480 train_time:152905ms step_avg:156.03ms step:991/1480 train_time:153073ms step_avg:156.04ms 
step:992/1480 train_time:153247ms step_avg:156.06ms step:993/1480 train_time:153423ms step_avg:156.08ms step:994/1480 train_time:153590ms step_avg:156.09ms step:995/1480 train_time:153754ms step_avg:156.10ms step:996/1480 train_time:153916ms step_avg:156.10ms step:997/1480 train_time:154081ms step_avg:156.11ms step:998/1480 train_time:154244ms step_avg:156.12ms step:999/1480 train_time:154410ms step_avg:156.13ms step:1000/1480 train_time:154580ms step_avg:156.14ms step:1000/1480 val_loss:3.4395 train_time:154648ms step_avg:156.21ms step:1001/1480 train_time:154749ms step_avg:156.15ms step:1002/1480 train_time:154917ms step_avg:156.17ms step:1003/1480 train_time:155088ms step_avg:156.18ms step:1004/1480 train_time:155258ms step_avg:156.19ms step:1005/1480 train_time:155426ms step_avg:156.21ms step:1006/1480 train_time:155594ms step_avg:156.22ms step:1007/1480 train_time:155761ms step_avg:156.23ms step:1008/1480 train_time:155929ms step_avg:156.24ms step:1009/1480 train_time:156105ms step_avg:156.26ms step:1010/1480 train_time:156270ms step_avg:156.27ms step:1011/1480 train_time:156436ms step_avg:156.28ms step:1012/1480 train_time:156601ms step_avg:156.29ms step:1013/1480 train_time:156770ms step_avg:156.30ms step:1014/1480 train_time:156938ms step_avg:156.31ms step:1015/1480 train_time:157108ms step_avg:156.33ms step:1016/1480 train_time:157274ms step_avg:156.34ms step:1017/1480 train_time:157445ms step_avg:156.35ms step:1018/1480 train_time:157613ms step_avg:156.36ms step:1019/1480 train_time:157783ms step_avg:156.38ms step:1020/1480 train_time:157952ms step_avg:156.39ms step:1021/1480 train_time:158119ms step_avg:156.40ms step:1022/1480 train_time:158286ms step_avg:156.41ms step:1023/1480 train_time:158451ms step_avg:156.42ms step:1024/1480 train_time:158620ms step_avg:156.43ms step:1025/1480 train_time:158792ms step_avg:156.45ms step:1026/1480 train_time:158957ms step_avg:156.45ms step:1027/1480 train_time:159124ms step_avg:156.46ms step:1028/1480 
train_time:159295ms step_avg:156.48ms step:1029/1480 train_time:159469ms step_avg:156.50ms step:1030/1480 train_time:159637ms step_avg:156.51ms step:1031/1480 train_time:159803ms step_avg:156.52ms step:1032/1480 train_time:159972ms step_avg:156.53ms step:1033/1480 train_time:160138ms step_avg:156.54ms step:1034/1480 train_time:160307ms step_avg:156.55ms step:1035/1480 train_time:160475ms step_avg:156.56ms step:1036/1480 train_time:160640ms step_avg:156.57ms step:1037/1480 train_time:160808ms step_avg:156.58ms step:1038/1480 train_time:160975ms step_avg:156.59ms step:1039/1480 train_time:161146ms step_avg:156.60ms step:1040/1480 train_time:161311ms step_avg:156.61ms step:1041/1480 train_time:161479ms step_avg:156.62ms step:1042/1480 train_time:161643ms step_avg:156.63ms step:1043/1480 train_time:161809ms step_avg:156.64ms step:1044/1480 train_time:161973ms step_avg:156.65ms step:1045/1480 train_time:162144ms step_avg:156.66ms step:1046/1480 train_time:162312ms step_avg:156.67ms step:1047/1480 train_time:162479ms step_avg:156.68ms step:1048/1480 train_time:162646ms step_avg:156.69ms step:1049/1480 train_time:162811ms step_avg:156.70ms step:1050/1480 train_time:162982ms step_avg:156.71ms step:1051/1480 train_time:163152ms step_avg:156.73ms step:1052/1480 train_time:163320ms step_avg:156.74ms step:1053/1480 train_time:163486ms step_avg:156.75ms step:1054/1480 train_time:163654ms step_avg:156.76ms step:1055/1480 train_time:163821ms step_avg:156.77ms step:1056/1480 train_time:163986ms step_avg:156.77ms step:1057/1480 train_time:164151ms step_avg:156.78ms step:1058/1480 train_time:164322ms step_avg:156.80ms step:1059/1480 train_time:164494ms step_avg:156.81ms step:1060/1480 train_time:164663ms step_avg:156.82ms step:1061/1480 train_time:164827ms step_avg:156.83ms step:1062/1480 train_time:164991ms step_avg:156.84ms step:1063/1480 train_time:165156ms step_avg:156.84ms step:1064/1480 train_time:165322ms step_avg:156.85ms step:1065/1480 train_time:165489ms step_avg:156.86ms 
step:1066/1480 train_time:165656ms step_avg:156.87ms step:1067/1480 train_time:165826ms step_avg:156.88ms step:1068/1480 train_time:165993ms step_avg:156.89ms step:1069/1480 train_time:166164ms step_avg:156.91ms step:1070/1480 train_time:166329ms step_avg:156.91ms step:1071/1480 train_time:166503ms step_avg:156.93ms step:1072/1480 train_time:166668ms step_avg:156.94ms step:1073/1480 train_time:166831ms step_avg:156.94ms step:1074/1480 train_time:167000ms step_avg:156.95ms step:1075/1480 train_time:167171ms step_avg:156.97ms step:1076/1480 train_time:167340ms step_avg:156.98ms step:1077/1480 train_time:167507ms step_avg:156.99ms step:1078/1480 train_time:167681ms step_avg:157.00ms step:1079/1480 train_time:167853ms step_avg:157.02ms step:1080/1480 train_time:168025ms step_avg:157.03ms step:1081/1480 train_time:168190ms step_avg:157.04ms step:1082/1480 train_time:168355ms step_avg:157.05ms step:1083/1480 train_time:168524ms step_avg:157.06ms step:1084/1480 train_time:168689ms step_avg:157.07ms step:1085/1480 train_time:168857ms step_avg:157.08ms step:1086/1480 train_time:169026ms step_avg:157.09ms step:1087/1480 train_time:169192ms step_avg:157.10ms step:1088/1480 train_time:169361ms step_avg:157.11ms step:1089/1480 train_time:169533ms step_avg:157.12ms step:1090/1480 train_time:169706ms step_avg:157.14ms step:1091/1480 train_time:169873ms step_avg:157.14ms step:1092/1480 train_time:170042ms step_avg:157.16ms step:1093/1480 train_time:170210ms step_avg:157.16ms step:1094/1480 train_time:170375ms step_avg:157.17ms step:1095/1480 train_time:170541ms step_avg:157.18ms step:1096/1480 train_time:170710ms step_avg:157.19ms step:1097/1480 train_time:170879ms step_avg:157.20ms step:1098/1480 train_time:171051ms step_avg:157.22ms step:1099/1480 train_time:171222ms step_avg:157.23ms step:1100/1480 train_time:171393ms step_avg:157.24ms step:1101/1480 train_time:171565ms step_avg:157.25ms step:1102/1480 train_time:171736ms step_avg:157.27ms step:1103/1480 train_time:171912ms 
step_avg:157.28ms step:1104/1480 train_time:172079ms step_avg:157.29ms step:1105/1480 train_time:172249ms step_avg:157.30ms step:1106/1480 train_time:172417ms step_avg:157.32ms step:1107/1480 train_time:172586ms step_avg:157.33ms step:1108/1480 train_time:172751ms step_avg:157.33ms step:1109/1480 train_time:172918ms step_avg:157.34ms step:1110/1480 train_time:173083ms step_avg:157.35ms step:1111/1480 train_time:173249ms step_avg:157.36ms step:1112/1480 train_time:173422ms step_avg:157.37ms step:1113/1480 train_time:173601ms step_avg:157.39ms step:1114/1480 train_time:173772ms step_avg:157.40ms step:1115/1480 train_time:173945ms step_avg:157.42ms step:1116/1480 train_time:174112ms step_avg:157.43ms step:1117/1480 train_time:174287ms step_avg:157.44ms step:1118/1480 train_time:174461ms step_avg:157.46ms step:1119/1480 train_time:174626ms step_avg:157.46ms step:1120/1480 train_time:174794ms step_avg:157.47ms step:1121/1480 train_time:174965ms step_avg:157.48ms step:1122/1480 train_time:175130ms step_avg:157.49ms step:1123/1480 train_time:175299ms step_avg:157.50ms step:1124/1480 train_time:175466ms step_avg:157.51ms step:1125/1480 train_time:175632ms step_avg:157.52ms step:1125/1480 val_loss:3.3846 train_time:175700ms step_avg:157.58ms step:1126/1480 train_time:175804ms step_avg:157.53ms step:1127/1480 train_time:175975ms step_avg:157.54ms step:1128/1480 train_time:176146ms step_avg:157.55ms step:1129/1480 train_time:176319ms step_avg:157.57ms step:1130/1480 train_time:176488ms step_avg:157.58ms step:1131/1480 train_time:176665ms step_avg:157.60ms step:1132/1480 train_time:176832ms step_avg:157.60ms step:1133/1480 train_time:177002ms step_avg:157.62ms step:1134/1480 train_time:177173ms step_avg:157.63ms step:1135/1480 train_time:177342ms step_avg:157.64ms step:1136/1480 train_time:177514ms step_avg:157.65ms step:1137/1480 train_time:177682ms step_avg:157.66ms step:1138/1480 train_time:177855ms step_avg:157.67ms step:1139/1480 train_time:178025ms step_avg:157.68ms 
step:1140/1480 train_time:178192ms step_avg:157.69ms step:1141/1480 train_time:178365ms step_avg:157.71ms step:1142/1480 train_time:178533ms step_avg:157.71ms step:1143/1480 train_time:178704ms step_avg:157.73ms step:1144/1480 train_time:178874ms step_avg:157.74ms step:1145/1480 train_time:179038ms step_avg:157.74ms step:1146/1480 train_time:179210ms step_avg:157.75ms step:1147/1480 train_time:179377ms step_avg:157.76ms step:1148/1480 train_time:179544ms step_avg:157.77ms step:1149/1480 train_time:179716ms step_avg:157.78ms step:1150/1480 train_time:179884ms step_avg:157.79ms step:1151/1480 train_time:180057ms step_avg:157.81ms step:1152/1480 train_time:180230ms step_avg:157.82ms step:1153/1480 train_time:180403ms step_avg:157.83ms step:1154/1480 train_time:180570ms step_avg:157.84ms step:1155/1480 train_time:180741ms step_avg:157.85ms step:1156/1480 train_time:180920ms step_avg:157.87ms step:1157/1480 train_time:181090ms step_avg:157.88ms step:1158/1480 train_time:181257ms step_avg:157.89ms step:1159/1480 train_time:181426ms step_avg:157.90ms step:1160/1480 train_time:181592ms step_avg:157.91ms step:1161/1480 train_time:181761ms step_avg:157.92ms step:1162/1480 train_time:181931ms step_avg:157.93ms step:1163/1480 train_time:182100ms step_avg:157.94ms step:1164/1480 train_time:182268ms step_avg:157.94ms step:1165/1480 train_time:182435ms step_avg:157.95ms step:1166/1480 train_time:182604ms step_avg:157.96ms step:1167/1480 train_time:182773ms step_avg:157.97ms step:1168/1480 train_time:182940ms step_avg:157.98ms step:1169/1480 train_time:183110ms step_avg:157.99ms step:1170/1480 train_time:183279ms step_avg:158.00ms step:1171/1480 train_time:183447ms step_avg:158.01ms step:1172/1480 train_time:183613ms step_avg:158.01ms step:1173/1480 train_time:183784ms step_avg:158.03ms step:1174/1480 train_time:183968ms step_avg:158.05ms step:1175/1480 train_time:184140ms step_avg:158.06ms step:1176/1480 train_time:184313ms step_avg:158.07ms step:1177/1480 train_time:184489ms 
step_avg:158.09ms step:1178/1480 train_time:184657ms step_avg:158.10ms step:1179/1480 train_time:184823ms step_avg:158.10ms step:1180/1480 train_time:185002ms step_avg:158.12ms step:1181/1480 train_time:185172ms step_avg:158.13ms step:1182/1480 train_time:185340ms step_avg:158.14ms step:1183/1480 train_time:185511ms step_avg:158.15ms step:1184/1480 train_time:185678ms step_avg:158.16ms step:1185/1480 train_time:185853ms step_avg:158.17ms step:1186/1480 train_time:186024ms step_avg:158.18ms step:1187/1480 train_time:186208ms step_avg:158.21ms step:1188/1480 train_time:186375ms step_avg:158.21ms step:1189/1480 train_time:186546ms step_avg:158.22ms step:1190/1480 train_time:186713ms step_avg:158.23ms step:1191/1480 train_time:186884ms step_avg:158.24ms step:1192/1480 train_time:187051ms step_avg:158.25ms step:1193/1480 train_time:187216ms step_avg:158.25ms step:1194/1480 train_time:187384ms step_avg:158.26ms step:1195/1480 train_time:187559ms step_avg:158.28ms step:1196/1480 train_time:187744ms step_avg:158.30ms step:1197/1480 train_time:187915ms step_avg:158.31ms step:1198/1480 train_time:188095ms step_avg:158.33ms step:1199/1480 train_time:188265ms step_avg:158.34ms step:1200/1480 train_time:188435ms step_avg:158.35ms step:1201/1480 train_time:188603ms step_avg:158.36ms step:1202/1480 train_time:188784ms step_avg:158.38ms step:1203/1480 train_time:188959ms step_avg:158.39ms step:1204/1480 train_time:189134ms step_avg:158.40ms step:1205/1480 train_time:189302ms step_avg:158.41ms step:1206/1480 train_time:189471ms step_avg:158.42ms step:1207/1480 train_time:189640ms step_avg:158.43ms step:1208/1480 train_time:189808ms step_avg:158.44ms step:1209/1480 train_time:189982ms step_avg:158.45ms step:1210/1480 train_time:190157ms step_avg:158.46ms step:1211/1480 train_time:190332ms step_avg:158.48ms step:1212/1480 train_time:190502ms step_avg:158.49ms step:1213/1480 train_time:190674ms step_avg:158.50ms step:1214/1480 train_time:190851ms step_avg:158.51ms step:1215/1480 
train_time:191024ms step_avg:158.53ms step:1216/1480 train_time:191194ms step_avg:158.54ms step:1217/1480 train_time:191370ms step_avg:158.55ms step:1218/1480 train_time:191540ms step_avg:158.56ms step:1219/1480 train_time:191719ms step_avg:158.58ms step:1220/1480 train_time:191888ms step_avg:158.59ms step:1221/1480 train_time:192058ms step_avg:158.59ms step:1222/1480 train_time:192227ms step_avg:158.60ms step:1223/1480 train_time:192395ms step_avg:158.61ms step:1224/1480 train_time:192574ms step_avg:158.63ms step:1225/1480 train_time:192746ms step_avg:158.64ms step:1226/1480 train_time:192918ms step_avg:158.65ms step:1227/1480 train_time:193092ms step_avg:158.66ms step:1228/1480 train_time:193263ms step_avg:158.67ms step:1229/1480 train_time:193436ms step_avg:158.68ms step:1230/1480 train_time:193615ms step_avg:158.70ms step:1231/1480 train_time:193791ms step_avg:158.72ms step:1232/1480 train_time:193964ms step_avg:158.73ms step:1233/1480 train_time:194134ms step_avg:158.74ms step:1234/1480 train_time:194303ms step_avg:158.74ms step:1235/1480 train_time:194477ms step_avg:158.76ms step:1236/1480 train_time:194646ms step_avg:158.77ms step:1237/1480 train_time:194816ms step_avg:158.77ms step:1238/1480 train_time:195000ms step_avg:158.80ms step:1239/1480 train_time:195173ms step_avg:158.81ms step:1240/1480 train_time:195343ms step_avg:158.82ms step:1241/1480 train_time:195516ms step_avg:158.83ms step:1242/1480 train_time:195685ms step_avg:158.84ms step:1243/1480 train_time:195859ms step_avg:158.85ms step:1244/1480 train_time:196027ms step_avg:158.86ms step:1245/1480 train_time:196195ms step_avg:158.86ms step:1246/1480 train_time:196366ms step_avg:158.87ms step:1247/1480 train_time:196534ms step_avg:158.88ms step:1248/1480 train_time:196705ms step_avg:158.89ms step:1249/1480 train_time:196873ms step_avg:158.90ms step:1250/1480 train_time:197042ms step_avg:158.90ms step:1250/1480 val_loss:3.3348 train_time:197114ms step_avg:158.96ms step:1251/1480 train_time:197223ms 
step_avg:158.92ms step:1252/1480 train_time:197394ms step_avg:158.93ms step:1253/1480 train_time:197563ms step_avg:158.94ms step:1254/1480 train_time:197734ms step_avg:158.95ms step:1255/1480 train_time:197920ms step_avg:158.97ms step:1256/1480 train_time:198096ms step_avg:158.99ms step:1257/1480 train_time:198267ms step_avg:159.00ms step:1258/1480 train_time:198444ms step_avg:159.01ms step:1259/1480 train_time:198615ms step_avg:159.02ms step:1260/1480 train_time:198783ms step_avg:159.03ms step:1261/1480 train_time:198954ms step_avg:159.04ms step:1262/1480 train_time:199130ms step_avg:159.05ms step:1263/1480 train_time:199304ms step_avg:159.06ms step:1264/1480 train_time:199469ms step_avg:159.07ms step:1265/1480 train_time:199637ms step_avg:159.07ms step:1266/1480 train_time:199808ms step_avg:159.08ms step:1267/1480 train_time:199978ms step_avg:159.09ms step:1268/1480 train_time:200148ms step_avg:159.10ms step:1269/1480 train_time:200324ms step_avg:159.11ms step:1270/1480 train_time:200493ms step_avg:159.12ms step:1271/1480 train_time:200664ms step_avg:159.13ms step:1272/1480 train_time:200831ms step_avg:159.14ms step:1273/1480 train_time:201003ms step_avg:159.15ms step:1274/1480 train_time:201176ms step_avg:159.16ms step:1275/1480 train_time:201343ms step_avg:159.16ms step:1276/1480 train_time:201508ms step_avg:159.17ms step:1277/1480 train_time:201682ms step_avg:159.18ms step:1278/1480 train_time:201849ms step_avg:159.19ms step:1279/1480 train_time:202023ms step_avg:159.20ms step:1280/1480 train_time:202201ms step_avg:159.21ms step:1281/1480 train_time:202371ms step_avg:159.22ms step:1282/1480 train_time:202537ms step_avg:159.23ms step:1283/1480 train_time:202707ms step_avg:159.24ms step:1284/1480 train_time:202878ms step_avg:159.25ms step:1285/1480 train_time:203046ms step_avg:159.25ms step:1286/1480 train_time:203218ms step_avg:159.26ms step:1287/1480 train_time:203390ms step_avg:159.27ms step:1288/1480 train_time:203563ms step_avg:159.28ms step:1289/1480 
train_time:203745ms step_avg:159.30ms step:1290/1480 train_time:203921ms step_avg:159.31ms step:1291/1480 train_time:204094ms step_avg:159.32ms step:1292/1480 train_time:204267ms step_avg:159.33ms step:1293/1480 train_time:204443ms step_avg:159.35ms step:1294/1480 train_time:204614ms step_avg:159.36ms step:1295/1480 train_time:204785ms step_avg:159.37ms step:1296/1480 train_time:204959ms step_avg:159.38ms step:1297/1480 train_time:205129ms step_avg:159.39ms step:1298/1480 train_time:205301ms step_avg:159.40ms step:1299/1480 train_time:205471ms step_avg:159.40ms step:1300/1480 train_time:205639ms step_avg:159.41ms step:1301/1480 train_time:205807ms step_avg:159.42ms step:1302/1480 train_time:205983ms step_avg:159.43ms step:1303/1480 train_time:206159ms step_avg:159.44ms step:1304/1480 train_time:206334ms step_avg:159.45ms step:1305/1480 train_time:206504ms step_avg:159.46ms step:1306/1480 train_time:206680ms step_avg:159.48ms step:1307/1480 train_time:206846ms step_avg:159.48ms step:1308/1480 train_time:207015ms step_avg:159.49ms step:1309/1480 train_time:207187ms step_avg:159.50ms step:1310/1480 train_time:207358ms step_avg:159.51ms step:1311/1480 train_time:207527ms step_avg:159.51ms step:1312/1480 train_time:207701ms step_avg:159.52ms step:1313/1480 train_time:207870ms step_avg:159.53ms step:1314/1480 train_time:208043ms step_avg:159.54ms step:1315/1480 train_time:208214ms step_avg:159.55ms step:1316/1480 train_time:208381ms step_avg:159.56ms step:1317/1480 train_time:208552ms step_avg:159.57ms step:1318/1480 train_time:208733ms step_avg:159.58ms step:1319/1480 train_time:208909ms step_avg:159.59ms step:1320/1480 train_time:209086ms step_avg:159.61ms step:1321/1480 train_time:209260ms step_avg:159.62ms step:1322/1480 train_time:209439ms step_avg:159.63ms step:1323/1480 train_time:209611ms step_avg:159.64ms step:1324/1480 train_time:209787ms step_avg:159.66ms step:1325/1480 train_time:209969ms step_avg:159.67ms step:1326/1480 train_time:210144ms step_avg:159.68ms 
step:1327/1480 train_time:210314ms step_avg:159.69ms step:1328/1480 train_time:210485ms step_avg:159.70ms step:1329/1480 train_time:210682ms step_avg:159.73ms step:1330/1480 train_time:210862ms step_avg:159.74ms step:1331/1480 train_time:211033ms step_avg:159.75ms step:1332/1480 train_time:211207ms step_avg:159.76ms step:1333/1480 train_time:211383ms step_avg:159.78ms step:1334/1480 train_time:211556ms step_avg:159.79ms step:1335/1480 train_time:211724ms step_avg:159.79ms step:1336/1480 train_time:211910ms step_avg:159.81ms step:1337/1480 train_time:212087ms step_avg:159.82ms step:1338/1480 train_time:212260ms step_avg:159.83ms step:1339/1480 train_time:212434ms step_avg:159.84ms step:1340/1480 train_time:212607ms step_avg:159.86ms step:1341/1480 train_time:212777ms step_avg:159.86ms step:1342/1480 train_time:212949ms step_avg:159.87ms step:1343/1480 train_time:213120ms step_avg:159.88ms step:1344/1480 train_time:213292ms step_avg:159.89ms step:1345/1480 train_time:213472ms step_avg:159.90ms step:1346/1480 train_time:213641ms step_avg:159.91ms step:1347/1480 train_time:213810ms step_avg:159.92ms step:1348/1480 train_time:213980ms step_avg:159.93ms step:1349/1480 train_time:214149ms step_avg:159.93ms step:1350/1480 train_time:214323ms step_avg:159.94ms step:1351/1480 train_time:214495ms step_avg:159.95ms step:1352/1480 train_time:214667ms step_avg:159.96ms step:1353/1480 train_time:214844ms step_avg:159.97ms step:1354/1480 train_time:215015ms step_avg:159.98ms step:1355/1480 train_time:215182ms step_avg:159.99ms step:1356/1480 train_time:215354ms step_avg:160.00ms step:1357/1480 train_time:215526ms step_avg:160.00ms step:1358/1480 train_time:215699ms step_avg:160.01ms step:1359/1480 train_time:215871ms step_avg:160.02ms step:1360/1480 train_time:216045ms step_avg:160.03ms step:1361/1480 train_time:216222ms step_avg:160.05ms step:1362/1480 train_time:216398ms step_avg:160.06ms step:1363/1480 train_time:216579ms step_avg:160.07ms step:1364/1480 train_time:216747ms 
step_avg:160.08ms step:1365/1480 train_time:216915ms step_avg:160.08ms step:1366/1480 train_time:217086ms step_avg:160.09ms step:1367/1480 train_time:217259ms step_avg:160.10ms step:1368/1480 train_time:217432ms step_avg:160.11ms step:1369/1480 train_time:217613ms step_avg:160.13ms step:1370/1480 train_time:217792ms step_avg:160.14ms step:1371/1480 train_time:217963ms step_avg:160.15ms step:1372/1480 train_time:218142ms step_avg:160.16ms step:1373/1480 train_time:218310ms step_avg:160.17ms step:1374/1480 train_time:218487ms step_avg:160.18ms step:1375/1480 train_time:218658ms step_avg:160.19ms step:1375/1480 val_loss:3.2959 train_time:218726ms step_avg:160.24ms step:1376/1480 train_time:218834ms step_avg:160.20ms step:1377/1480 train_time:219005ms step_avg:160.21ms step:1378/1480 train_time:219175ms step_avg:160.22ms step:1379/1480 train_time:219349ms step_avg:160.23ms step:1380/1480 train_time:219523ms step_avg:160.24ms step:1381/1480 train_time:219704ms step_avg:160.25ms step:1382/1480 train_time:219875ms step_avg:160.26ms step:1383/1480 train_time:220046ms step_avg:160.27ms step:1384/1480 train_time:220224ms step_avg:160.28ms step:1385/1480 train_time:220390ms step_avg:160.28ms step:1386/1480 train_time:220560ms step_avg:160.29ms step:1387/1480 train_time:220734ms step_avg:160.30ms step:1388/1480 train_time:220903ms step_avg:160.31ms step:1389/1480 train_time:221076ms step_avg:160.32ms step:1390/1480 train_time:221244ms step_avg:160.32ms step:1391/1480 train_time:221415ms step_avg:160.33ms step:1392/1480 train_time:221586ms step_avg:160.34ms step:1393/1480 train_time:221757ms step_avg:160.35ms step:1394/1480 train_time:221927ms step_avg:160.35ms step:1395/1480 train_time:222097ms step_avg:160.36ms step:1396/1480 train_time:222265ms step_avg:160.36ms step:1397/1480 train_time:222432ms step_avg:160.37ms step:1398/1480 train_time:222599ms step_avg:160.37ms step:1399/1480 train_time:222769ms step_avg:160.38ms step:1400/1480 train_time:222945ms step_avg:160.39ms 
step:1401/1480 train_time:223113ms step_avg:160.40ms step:1402/1480 train_time:223284ms step_avg:160.40ms step:1403/1480 train_time:223462ms step_avg:160.42ms step:1404/1480 train_time:223633ms step_avg:160.43ms step:1405/1480 train_time:223806ms step_avg:160.43ms step:1406/1480 train_time:223982ms step_avg:160.45ms step:1407/1480 train_time:224152ms step_avg:160.45ms step:1408/1480 train_time:224322ms step_avg:160.46ms step:1409/1480 train_time:224505ms step_avg:160.48ms step:1410/1480 train_time:224675ms step_avg:160.48ms step:1411/1480 train_time:224843ms step_avg:160.49ms step:1412/1480 train_time:225014ms step_avg:160.50ms step:1413/1480 train_time:225183ms step_avg:160.50ms step:1414/1480 train_time:225355ms step_avg:160.51ms step:1415/1480 train_time:225529ms step_avg:160.52ms step:1416/1480 train_time:225715ms step_avg:160.54ms step:1417/1480 train_time:225890ms step_avg:160.55ms step:1418/1480 train_time:226059ms step_avg:160.55ms step:1419/1480 train_time:226233ms step_avg:160.56ms step:1420/1480 train_time:226407ms step_avg:160.57ms step:1421/1480 train_time:226581ms step_avg:160.58ms step:1422/1480 train_time:226752ms step_avg:160.59ms step:1423/1480 train_time:226922ms step_avg:160.60ms step:1424/1480 train_time:227100ms step_avg:160.61ms step:1425/1480 train_time:227282ms step_avg:160.62ms step:1426/1480 train_time:227453ms step_avg:160.63ms step:1427/1480 train_time:227627ms step_avg:160.64ms step:1428/1480 train_time:227798ms step_avg:160.65ms step:1429/1480 train_time:227967ms step_avg:160.65ms step:1430/1480 train_time:228142ms step_avg:160.66ms step:1431/1480 train_time:228318ms step_avg:160.67ms step:1432/1480 train_time:228496ms step_avg:160.69ms step:1433/1480 train_time:228675ms step_avg:160.70ms step:1434/1480 train_time:228855ms step_avg:160.71ms step:1435/1480 train_time:229032ms step_avg:160.72ms step:1436/1480 train_time:229204ms step_avg:160.73ms step:1437/1480 train_time:229376ms step_avg:160.74ms step:1438/1480 train_time:229545ms 
step_avg:160.75ms step:1439/1480 train_time:229719ms step_avg:160.75ms step:1440/1480 train_time:229888ms step_avg:160.76ms step:1441/1480 train_time:230058ms step_avg:160.77ms step:1442/1480 train_time:230236ms step_avg:160.78ms step:1443/1480 train_time:230423ms step_avg:160.80ms step:1444/1480 train_time:230594ms step_avg:160.80ms step:1445/1480 train_time:230765ms step_avg:160.81ms step:1446/1480 train_time:230941ms step_avg:160.82ms step:1447/1480 train_time:231120ms step_avg:160.83ms step:1448/1480 train_time:231293ms step_avg:160.84ms step:1449/1480 train_time:231467ms step_avg:160.85ms step:1450/1480 train_time:231639ms step_avg:160.86ms step:1451/1480 train_time:231811ms step_avg:160.87ms step:1452/1480 train_time:231984ms step_avg:160.88ms step:1453/1480 train_time:232154ms step_avg:160.88ms step:1454/1480 train_time:232326ms step_avg:160.89ms step:1455/1480 train_time:232505ms step_avg:160.90ms step:1456/1480 train_time:232680ms step_avg:160.91ms step:1457/1480 train_time:232851ms step_avg:160.92ms step:1458/1480 train_time:233021ms step_avg:160.93ms step:1459/1480 train_time:233198ms step_avg:160.94ms step:1460/1480 train_time:233370ms step_avg:160.94ms step:1461/1480 train_time:233544ms step_avg:160.95ms step:1462/1480 train_time:233715ms step_avg:160.96ms step:1463/1480 train_time:233893ms step_avg:160.97ms step:1464/1480 train_time:234066ms step_avg:160.98ms step:1465/1480 train_time:234237ms step_avg:160.99ms step:1466/1480 train_time:234408ms step_avg:160.99ms step:1467/1480 train_time:234584ms step_avg:161.00ms step:1468/1480 train_time:234754ms step_avg:161.01ms step:1469/1480 train_time:234926ms step_avg:161.02ms step:1470/1480 train_time:235106ms step_avg:161.03ms step:1471/1480 train_time:235296ms step_avg:161.05ms step:1472/1480 train_time:235477ms step_avg:161.07ms step:1473/1480 train_time:235648ms step_avg:161.07ms step:1474/1480 train_time:235827ms step_avg:161.08ms step:1475/1480 train_time:236006ms step_avg:161.10ms step:1476/1480 
train_time:236178ms step_avg:161.10ms step:1477/1480 train_time:236359ms step_avg:161.12ms step:1478/1480 train_time:236542ms step_avg:161.13ms step:1479/1480 train_time:236717ms step_avg:161.14ms step:1480/1480 train_time:236890ms step_avg:161.15ms step:1480/1480 val_loss:3.2773 train_time:236961ms step_avg:161.20ms