import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight is passed)."""
    last_dim_shape = (x.size(-1),)
    return F.rms_norm(x, last_dim_shape)


class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input dtype on each call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        weight = self.weight.to(x.dtype)
        return F.linear(x, weight)


class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        inv_freq = (1 / base) ** (torch.arange(0, dim, 2) / dim)
        self.register_buffer('inv_freq', inv_freq)
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): dim 1 is the sequence
        # length and dim 3 is split in half for the rotation below.
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # rebuild the tables only when the sequence length changes
            positions = torch.arange(seq_len, device=x.device)
            angles = torch.outer(positions, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = angles.cos()
            self.sin_cached = angles.sin()
        cos = self.cos_cached[None, :, None, :]
        sin = self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        first_half, second_half = x.chunk(2, dim=3)
        rotated_first = first_half * cos + second_half * sin
        rotated_second = first_half * (-sin) + second_half * cos
        return torch.cat((rotated_first, rotated_second), 3).type_as(x)


class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        head_shape = (B, T, self.n_head, -1)
        q: torch.Tensor = self.c_q(x).view(*head_shape)
        k: torch.Tensor = self.c_k(x).view(*head_shape)
        v: torch.Tensor = self.c_v(x).view(*head_shape)
        # mix this layer's values with the token value embedding vi
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        return self.c_proj(y)


class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.c_fc(x)
        # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        hidden = F.relu(hidden).square()
        return self.c_proj(hidden)


class Block(nn.Module):
    """One transformer layer: learned mix with x0, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        # blend the running activations with the normalized token embedding x0
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30


class GPT(nn.Module):
    """U-net style GPT: the first half of the layers feed skip connections to the second half."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """Return the cross-entropy loss of the 1-D token sequence `idx` against `target`.

        Attention is causal, restricted to `sliding_window` tokens, and never
        crosses a document boundary (a boundary is every token equal to 50256).
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running document id of every token
        docs = (idx == 50256).cumsum(0)
        docs_by_block = docs.reshape(-1, BLOCK_SIZE)
        # first / last document id seen inside each BLOCK_SIZE block
        docs_low = docs_by_block[:, 0].contiguous()
        docs_high = docs_by_block[:, -1].contiguous()

        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask handed to FlexAttention as mask_mod
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)

        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the mask above: a kv block is kept when it
            # can contain at least one token the query block may attend to
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the indices of kept blocks first
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)

        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        batched_idx = idx[None]
        x = self.transformer.wte(batched_idx) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding slice per encoder layer
        vi = self.transformer.vte(batched_idx).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for layer in range(self.num_encoder_layers):
            x = self.transformer.h[layer](x, vi[layer], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for layer in range(self.num_decoder_layers):
            x = x + self.skip_weights[layer] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            mirrored = self.num_encoder_layers - 1 - layer
            x = self.transformer.h[self.num_encoder_layers + layer](x, vi[mirrored], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        return F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Read only the header of a shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(str(file), False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)


def _load_data_shard(file: Path, ntok: int):
    """Load `ntok` uint16 tokens that follow the 256-int32 header into pinned memory."""
    header_bytes = 256 * 4
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(header_bytes)
        # read straight into the tensor's storage through its numpy view
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens


class DistributedDataLoader:
    """Cycles through token shards, handing each rank its own T-token slice per batch."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        next_shard = (self.current_shard + 1) % len(self.files)
        self.current_shard = next_shard
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[next_shard], self.ntoks[next_shard])

    def next_batch(self):
        """Return (inputs, targets) for this rank and move on by one global batch."""
        stride = self.T * self.num_processes
        start = self.current_position
        window = self.tokens[start:start + self.T + 1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = window[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = window[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += stride
        if self.current_position + stride + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # the log file lives directly under logs/; create that directory so that
    # open() below does not fail on a fresh checkout
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, also print it.

    Does nothing on every other rank.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop below performs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches each following one
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 09:26:56 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 114W / 700W | 37MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 100W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 120W / 700W | 119MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 112W / 700W | 37MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI 
PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23952ms step_avg:nanms step:2/1480 train_time:24038ms step_avg:nanms step:3/1480 train_time:24178ms step_avg:nanms step:4/1480 train_time:24319ms step_avg:nanms step:5/1480 train_time:24462ms step_avg:nanms step:6/1480 train_time:24603ms step_avg:nanms step:7/1480 train_time:24745ms step_avg:nanms step:8/1480 train_time:24887ms step_avg:nanms step:9/1480 train_time:25031ms step_avg:nanms step:10/1480 train_time:25173ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:427ms step_avg:142.46ms step:14/1480 train_time:570ms step_avg:142.57ms step:15/1480 train_time:712ms step_avg:142.41ms step:16/1480 train_time:856ms step_avg:142.62ms step:17/1480 train_time:998ms step_avg:142.53ms step:18/1480 train_time:1139ms step_avg:142.38ms step:19/1480 train_time:1283ms step_avg:142.53ms step:20/1480 train_time:1426ms step_avg:142.65ms step:21/1480 train_time:1569ms step_avg:142.65ms step:22/1480 train_time:1711ms step_avg:142.62ms step:23/1480 train_time:1854ms step_avg:142.65ms step:24/1480 train_time:1997ms step_avg:142.63ms step:25/1480 train_time:2140ms step_avg:142.64ms step:26/1480 train_time:2284ms step_avg:142.72ms step:27/1480 train_time:2426ms step_avg:142.72ms step:28/1480 train_time:2569ms step_avg:142.74ms step:29/1480 train_time:2713ms 
step_avg:142.80ms step:30/1480 train_time:2855ms step_avg:142.75ms step:31/1480 train_time:2996ms step_avg:142.68ms step:32/1480 train_time:3140ms step_avg:142.73ms step:33/1480 train_time:3285ms step_avg:142.84ms step:34/1480 train_time:3430ms step_avg:142.93ms step:35/1480 train_time:3574ms step_avg:142.97ms step:36/1480 train_time:3715ms step_avg:142.89ms step:37/1480 train_time:3856ms step_avg:142.81ms step:38/1480 train_time:3997ms step_avg:142.75ms step:39/1480 train_time:4140ms step_avg:142.74ms step:40/1480 train_time:4284ms step_avg:142.81ms step:41/1480 train_time:4430ms step_avg:142.89ms step:42/1480 train_time:4573ms step_avg:142.92ms step:43/1480 train_time:4715ms step_avg:142.88ms step:44/1480 train_time:4856ms step_avg:142.82ms step:45/1480 train_time:4997ms step_avg:142.77ms step:46/1480 train_time:5139ms step_avg:142.74ms step:47/1480 train_time:5280ms step_avg:142.71ms step:48/1480 train_time:5425ms step_avg:142.76ms step:49/1480 train_time:5569ms step_avg:142.80ms step:50/1480 train_time:5712ms step_avg:142.80ms step:51/1480 train_time:5853ms step_avg:142.76ms step:52/1480 train_time:5994ms step_avg:142.72ms step:53/1480 train_time:6137ms step_avg:142.72ms step:54/1480 train_time:6277ms step_avg:142.66ms step:55/1480 train_time:6419ms step_avg:142.65ms step:56/1480 train_time:6561ms step_avg:142.64ms step:57/1480 train_time:6706ms step_avg:142.67ms step:58/1480 train_time:6849ms step_avg:142.68ms step:59/1480 train_time:6991ms step_avg:142.67ms step:60/1480 train_time:7134ms step_avg:142.68ms step:61/1480 train_time:7275ms step_avg:142.66ms step:62/1480 train_time:7418ms step_avg:142.66ms step:63/1480 train_time:7561ms step_avg:142.67ms step:64/1480 train_time:7704ms step_avg:142.66ms step:65/1480 train_time:7847ms step_avg:142.68ms step:66/1480 train_time:7991ms step_avg:142.69ms step:67/1480 train_time:8133ms step_avg:142.69ms step:68/1480 train_time:8275ms step_avg:142.67ms step:69/1480 train_time:8417ms step_avg:142.66ms step:70/1480 
train_time:8559ms step_avg:142.65ms step:71/1480 train_time:8701ms step_avg:142.63ms step:72/1480 train_time:8844ms step_avg:142.65ms step:73/1480 train_time:8988ms step_avg:142.67ms step:74/1480 train_time:9131ms step_avg:142.68ms step:75/1480 train_time:9273ms step_avg:142.65ms step:76/1480 train_time:9415ms step_avg:142.65ms step:77/1480 train_time:9557ms step_avg:142.65ms step:78/1480 train_time:9699ms step_avg:142.64ms step:79/1480 train_time:9843ms step_avg:142.65ms step:80/1480 train_time:9987ms step_avg:142.67ms step:81/1480 train_time:10132ms step_avg:142.70ms step:82/1480 train_time:10274ms step_avg:142.70ms step:83/1480 train_time:10416ms step_avg:142.69ms step:84/1480 train_time:10557ms step_avg:142.67ms step:85/1480 train_time:10699ms step_avg:142.65ms step:86/1480 train_time:10840ms step_avg:142.63ms step:87/1480 train_time:10983ms step_avg:142.63ms step:88/1480 train_time:11126ms step_avg:142.65ms step:89/1480 train_time:11269ms step_avg:142.64ms step:90/1480 train_time:11412ms step_avg:142.65ms step:91/1480 train_time:11553ms step_avg:142.63ms step:92/1480 train_time:11695ms step_avg:142.62ms step:93/1480 train_time:11836ms step_avg:142.61ms step:94/1480 train_time:11977ms step_avg:142.59ms step:95/1480 train_time:12119ms step_avg:142.58ms step:96/1480 train_time:12260ms step_avg:142.56ms step:97/1480 train_time:12402ms step_avg:142.56ms step:98/1480 train_time:12545ms step_avg:142.55ms step:99/1480 train_time:12688ms step_avg:142.56ms step:100/1480 train_time:12831ms step_avg:142.57ms step:101/1480 train_time:12973ms step_avg:142.56ms step:102/1480 train_time:13116ms step_avg:142.57ms step:103/1480 train_time:13257ms step_avg:142.55ms step:104/1480 train_time:13398ms step_avg:142.53ms step:105/1480 train_time:13540ms step_avg:142.52ms step:106/1480 train_time:13682ms step_avg:142.52ms step:107/1480 train_time:13826ms step_avg:142.54ms step:108/1480 train_time:13968ms step_avg:142.53ms step:109/1480 train_time:14112ms step_avg:142.54ms step:110/1480 
train_time:14253ms step_avg:142.53ms step:111/1480 train_time:14396ms step_avg:142.54ms step:112/1480 train_time:14543ms step_avg:142.58ms step:113/1480 train_time:14691ms step_avg:142.63ms step:114/1480 train_time:14838ms step_avg:142.68ms step:115/1480 train_time:14986ms step_avg:142.73ms step:116/1480 train_time:15134ms step_avg:142.77ms step:117/1480 train_time:15280ms step_avg:142.80ms step:118/1480 train_time:15426ms step_avg:142.84ms step:119/1480 train_time:15574ms step_avg:142.88ms step:120/1480 train_time:15720ms step_avg:142.91ms step:121/1480 train_time:15868ms step_avg:142.95ms step:122/1480 train_time:16015ms step_avg:142.99ms step:123/1480 train_time:16160ms step_avg:143.01ms step:124/1480 train_time:16308ms step_avg:143.05ms step:125/1480 train_time:16455ms step_avg:143.09ms step:125/1480 val_loss:4.4087 train_time:16512ms step_avg:143.58ms step:126/1480 train_time:16609ms step_avg:143.18ms step:127/1480 train_time:16759ms step_avg:143.24ms step:128/1480 train_time:16905ms step_avg:143.27ms step:129/1480 train_time:17051ms step_avg:143.28ms step:130/1480 train_time:17196ms step_avg:143.30ms step:131/1480 train_time:17343ms step_avg:143.33ms step:132/1480 train_time:17488ms step_avg:143.34ms step:133/1480 train_time:17638ms step_avg:143.40ms step:134/1480 train_time:17787ms step_avg:143.44ms step:135/1480 train_time:17934ms step_avg:143.47ms step:136/1480 train_time:18081ms step_avg:143.50ms step:137/1480 train_time:18226ms step_avg:143.51ms step:138/1480 train_time:18372ms step_avg:143.53ms step:139/1480 train_time:18519ms step_avg:143.56ms step:140/1480 train_time:18666ms step_avg:143.59ms step:141/1480 train_time:18814ms step_avg:143.61ms step:142/1480 train_time:18962ms step_avg:143.65ms step:143/1480 train_time:19107ms step_avg:143.66ms step:144/1480 train_time:19256ms step_avg:143.70ms step:145/1480 train_time:19403ms step_avg:143.73ms step:146/1480 train_time:19550ms step_avg:143.75ms step:147/1480 train_time:19698ms step_avg:143.78ms 
step:148/1480 train_time:19845ms step_avg:143.80ms step:149/1480 train_time:19991ms step_avg:143.82ms step:150/1480 train_time:20139ms step_avg:143.85ms step:151/1480 train_time:20286ms step_avg:143.87ms step:152/1480 train_time:20431ms step_avg:143.88ms step:153/1480 train_time:20580ms step_avg:143.92ms step:154/1480 train_time:20727ms step_avg:143.94ms step:155/1480 train_time:20874ms step_avg:143.96ms step:156/1480 train_time:21022ms step_avg:143.99ms step:157/1480 train_time:21169ms step_avg:144.00ms step:158/1480 train_time:21316ms step_avg:144.03ms step:159/1480 train_time:21464ms step_avg:144.05ms step:160/1480 train_time:21610ms step_avg:144.07ms step:161/1480 train_time:21757ms step_avg:144.09ms step:162/1480 train_time:21905ms step_avg:144.11ms step:163/1480 train_time:22051ms step_avg:144.13ms step:164/1480 train_time:22198ms step_avg:144.14ms step:165/1480 train_time:22346ms step_avg:144.16ms step:166/1480 train_time:22491ms step_avg:144.18ms step:167/1480 train_time:22640ms step_avg:144.20ms step:168/1480 train_time:22786ms step_avg:144.21ms step:169/1480 train_time:22931ms step_avg:144.22ms step:170/1480 train_time:23079ms step_avg:144.25ms step:171/1480 train_time:23226ms step_avg:144.26ms step:172/1480 train_time:23371ms step_avg:144.27ms step:173/1480 train_time:23519ms step_avg:144.29ms step:174/1480 train_time:23668ms step_avg:144.32ms step:175/1480 train_time:23816ms step_avg:144.34ms step:176/1480 train_time:23962ms step_avg:144.35ms step:177/1480 train_time:24109ms step_avg:144.36ms step:178/1480 train_time:24256ms step_avg:144.38ms step:179/1480 train_time:24402ms step_avg:144.39ms step:180/1480 train_time:24548ms step_avg:144.40ms step:181/1480 train_time:24695ms step_avg:144.42ms step:182/1480 train_time:24843ms step_avg:144.44ms step:183/1480 train_time:24989ms step_avg:144.44ms step:184/1480 train_time:25136ms step_avg:144.46ms step:185/1480 train_time:25284ms step_avg:144.48ms step:186/1480 train_time:25430ms step_avg:144.49ms 
step:187/1480 train_time:25578ms step_avg:144.51ms step:188/1480 train_time:25726ms step_avg:144.53ms step:189/1480 train_time:25872ms step_avg:144.54ms step:190/1480 train_time:26019ms step_avg:144.55ms step:191/1480 train_time:26166ms step_avg:144.56ms step:192/1480 train_time:26311ms step_avg:144.56ms step:193/1480 train_time:26458ms step_avg:144.58ms step:194/1480 train_time:26605ms step_avg:144.59ms step:195/1480 train_time:26752ms step_avg:144.61ms step:196/1480 train_time:26900ms step_avg:144.62ms step:197/1480 train_time:27047ms step_avg:144.64ms step:198/1480 train_time:27194ms step_avg:144.65ms step:199/1480 train_time:27342ms step_avg:144.67ms step:200/1480 train_time:27488ms step_avg:144.67ms step:201/1480 train_time:27636ms step_avg:144.69ms step:202/1480 train_time:27783ms step_avg:144.70ms step:203/1480 train_time:27929ms step_avg:144.71ms step:204/1480 train_time:28077ms step_avg:144.73ms step:205/1480 train_time:28224ms step_avg:144.74ms step:206/1480 train_time:28370ms step_avg:144.74ms step:207/1480 train_time:28517ms step_avg:144.76ms step:208/1480 train_time:28664ms step_avg:144.77ms step:209/1480 train_time:28810ms step_avg:144.77ms step:210/1480 train_time:28959ms step_avg:144.79ms step:211/1480 train_time:29106ms step_avg:144.80ms step:212/1480 train_time:29253ms step_avg:144.82ms step:213/1480 train_time:29399ms step_avg:144.83ms step:214/1480 train_time:29546ms step_avg:144.83ms step:215/1480 train_time:29691ms step_avg:144.84ms step:216/1480 train_time:29839ms step_avg:144.85ms step:217/1480 train_time:29987ms step_avg:144.86ms step:218/1480 train_time:30133ms step_avg:144.87ms step:219/1480 train_time:30281ms step_avg:144.89ms step:220/1480 train_time:30427ms step_avg:144.89ms step:221/1480 train_time:30575ms step_avg:144.90ms step:222/1480 train_time:30725ms step_avg:144.93ms step:223/1480 train_time:30876ms step_avg:144.96ms step:224/1480 train_time:31026ms step_avg:144.98ms step:225/1480 train_time:31178ms step_avg:145.01ms 
step:226/1480 train_time:31328ms step_avg:145.04ms step:227/1480 train_time:31481ms step_avg:145.07ms step:228/1480 train_time:31630ms step_avg:145.09ms step:229/1480 train_time:31782ms step_avg:145.12ms step:230/1480 train_time:31932ms step_avg:145.14ms step:231/1480 train_time:32083ms step_avg:145.17ms step:232/1480 train_time:32232ms step_avg:145.19ms step:233/1480 train_time:32383ms step_avg:145.21ms step:234/1480 train_time:32532ms step_avg:145.23ms step:235/1480 train_time:32683ms step_avg:145.26ms step:236/1480 train_time:32832ms step_avg:145.28ms step:237/1480 train_time:32983ms step_avg:145.30ms step:238/1480 train_time:33133ms step_avg:145.32ms step:239/1480 train_time:33284ms step_avg:145.34ms step:240/1480 train_time:33432ms step_avg:145.36ms step:241/1480 train_time:33583ms step_avg:145.38ms step:242/1480 train_time:33733ms step_avg:145.40ms step:243/1480 train_time:33885ms step_avg:145.43ms step:244/1480 train_time:34035ms step_avg:145.45ms step:245/1480 train_time:34185ms step_avg:145.47ms step:246/1480 train_time:34336ms step_avg:145.49ms step:247/1480 train_time:34486ms step_avg:145.51ms step:248/1480 train_time:34637ms step_avg:145.53ms step:249/1480 train_time:34787ms step_avg:145.55ms step:250/1480 train_time:34938ms step_avg:145.58ms step:250/1480 val_loss:3.9942 train_time:34998ms step_avg:145.83ms step:251/1480 train_time:35096ms step_avg:145.63ms step:252/1480 train_time:35247ms step_avg:145.65ms step:253/1480 train_time:35397ms step_avg:145.67ms step:254/1480 train_time:35545ms step_avg:145.68ms step:255/1480 train_time:35696ms step_avg:145.70ms step:256/1480 train_time:35844ms step_avg:145.71ms step:257/1480 train_time:35995ms step_avg:145.73ms step:258/1480 train_time:36146ms step_avg:145.75ms step:259/1480 train_time:36298ms step_avg:145.78ms step:260/1480 train_time:36448ms step_avg:145.79ms step:261/1480 train_time:36598ms step_avg:145.81ms step:262/1480 train_time:36747ms step_avg:145.82ms step:263/1480 train_time:36897ms 
step_avg:145.84ms step:264/1480 train_time:37048ms step_avg:145.86ms step:265/1480 train_time:37199ms step_avg:145.88ms step:266/1480 train_time:37350ms step_avg:145.90ms step:267/1480 train_time:37501ms step_avg:145.92ms step:268/1480 train_time:37650ms step_avg:145.93ms step:269/1480 train_time:37800ms step_avg:145.95ms step:270/1480 train_time:37949ms step_avg:145.96ms step:271/1480 train_time:38100ms step_avg:145.98ms step:272/1480 train_time:38251ms step_avg:146.00ms step:273/1480 train_time:38402ms step_avg:146.02ms step:274/1480 train_time:38554ms step_avg:146.04ms step:275/1480 train_time:38704ms step_avg:146.05ms step:276/1480 train_time:38855ms step_avg:146.07ms step:277/1480 train_time:39004ms step_avg:146.08ms step:278/1480 train_time:39156ms step_avg:146.10ms step:279/1480 train_time:39306ms step_avg:146.12ms step:280/1480 train_time:39457ms step_avg:146.14ms step:281/1480 train_time:39607ms step_avg:146.15ms step:282/1480 train_time:39758ms step_avg:146.17ms step:283/1480 train_time:39907ms step_avg:146.18ms step:284/1480 train_time:40058ms step_avg:146.20ms step:285/1480 train_time:40208ms step_avg:146.21ms step:286/1480 train_time:40360ms step_avg:146.23ms step:287/1480 train_time:40510ms step_avg:146.24ms step:288/1480 train_time:40660ms step_avg:146.26ms step:289/1480 train_time:40809ms step_avg:146.27ms step:290/1480 train_time:40960ms step_avg:146.29ms step:291/1480 train_time:41110ms step_avg:146.30ms step:292/1480 train_time:41261ms step_avg:146.32ms step:293/1480 train_time:41411ms step_avg:146.33ms step:294/1480 train_time:41563ms step_avg:146.35ms step:295/1480 train_time:41714ms step_avg:146.36ms step:296/1480 train_time:41866ms step_avg:146.38ms step:297/1480 train_time:42018ms step_avg:146.40ms step:298/1480 train_time:42168ms step_avg:146.42ms step:299/1480 train_time:42318ms step_avg:146.43ms step:300/1480 train_time:42468ms step_avg:146.44ms step:301/1480 train_time:42618ms step_avg:146.45ms step:302/1480 train_time:42768ms 
step_avg:146.47ms step:303/1480 train_time:42918ms step_avg:146.48ms step:304/1480 train_time:43068ms step_avg:146.49ms step:305/1480 train_time:43219ms step_avg:146.50ms step:306/1480 train_time:43368ms step_avg:146.51ms step:307/1480 train_time:43520ms step_avg:146.53ms step:308/1480 train_time:43670ms step_avg:146.54ms step:309/1480 train_time:43820ms step_avg:146.55ms step:310/1480 train_time:43969ms step_avg:146.56ms step:311/1480 train_time:44120ms step_avg:146.58ms step:312/1480 train_time:44269ms step_avg:146.59ms step:313/1480 train_time:44419ms step_avg:146.60ms step:314/1480 train_time:44570ms step_avg:146.61ms step:315/1480 train_time:44721ms step_avg:146.63ms step:316/1480 train_time:44871ms step_avg:146.64ms step:317/1480 train_time:45022ms step_avg:146.65ms step:318/1480 train_time:45173ms step_avg:146.66ms step:319/1480 train_time:45323ms step_avg:146.68ms step:320/1480 train_time:45474ms step_avg:146.69ms step:321/1480 train_time:45623ms step_avg:146.70ms step:322/1480 train_time:45773ms step_avg:146.71ms step:323/1480 train_time:45923ms step_avg:146.72ms step:324/1480 train_time:46074ms step_avg:146.73ms step:325/1480 train_time:46224ms step_avg:146.74ms step:326/1480 train_time:46375ms step_avg:146.76ms step:327/1480 train_time:46525ms step_avg:146.77ms step:328/1480 train_time:46676ms step_avg:146.78ms step:329/1480 train_time:46826ms step_avg:146.79ms step:330/1480 train_time:46979ms step_avg:146.81ms step:331/1480 train_time:47133ms step_avg:146.83ms step:332/1480 train_time:47288ms step_avg:146.86ms step:333/1480 train_time:47441ms step_avg:146.88ms step:334/1480 train_time:47595ms step_avg:146.90ms step:335/1480 train_time:47749ms step_avg:146.92ms step:336/1480 train_time:47903ms step_avg:146.94ms step:337/1480 train_time:48058ms step_avg:146.97ms step:338/1480 train_time:48212ms step_avg:146.99ms step:339/1480 train_time:48365ms step_avg:147.01ms step:340/1480 train_time:48520ms step_avg:147.03ms step:341/1480 train_time:48674ms 
step_avg:147.05ms step:342/1480 train_time:48827ms step_avg:147.07ms step:343/1480 train_time:48981ms step_avg:147.09ms step:344/1480 train_time:49135ms step_avg:147.11ms step:345/1480 train_time:49291ms step_avg:147.14ms step:346/1480 train_time:49443ms step_avg:147.15ms step:347/1480 train_time:49597ms step_avg:147.17ms step:348/1480 train_time:49752ms step_avg:147.19ms step:349/1480 train_time:49905ms step_avg:147.21ms step:350/1480 train_time:50060ms step_avg:147.24ms step:351/1480 train_time:50215ms step_avg:147.26ms step:352/1480 train_time:50368ms step_avg:147.28ms step:353/1480 train_time:50522ms step_avg:147.29ms step:354/1480 train_time:50675ms step_avg:147.31ms step:355/1480 train_time:50830ms step_avg:147.33ms step:356/1480 train_time:50984ms step_avg:147.35ms step:357/1480 train_time:51137ms step_avg:147.37ms step:358/1480 train_time:51292ms step_avg:147.39ms step:359/1480 train_time:51447ms step_avg:147.41ms step:360/1480 train_time:51602ms step_avg:147.43ms step:361/1480 train_time:51756ms step_avg:147.45ms step:362/1480 train_time:51910ms step_avg:147.47ms step:363/1480 train_time:52064ms step_avg:147.49ms step:364/1480 train_time:52218ms step_avg:147.51ms step:365/1480 train_time:52371ms step_avg:147.52ms step:366/1480 train_time:52524ms step_avg:147.54ms step:367/1480 train_time:52678ms step_avg:147.56ms step:368/1480 train_time:52832ms step_avg:147.57ms step:369/1480 train_time:52986ms step_avg:147.59ms step:370/1480 train_time:53138ms step_avg:147.61ms step:371/1480 train_time:53292ms step_avg:147.62ms step:372/1480 train_time:53446ms step_avg:147.64ms step:373/1480 train_time:53600ms step_avg:147.66ms step:374/1480 train_time:53754ms step_avg:147.67ms step:375/1480 train_time:53906ms step_avg:147.69ms step:375/1480 val_loss:3.8088 train_time:53968ms step_avg:147.86ms step:376/1480 train_time:54068ms step_avg:147.73ms step:377/1480 train_time:54221ms step_avg:147.74ms step:378/1480 train_time:54375ms step_avg:147.76ms step:379/1480 
train_time:54527ms step_avg:147.77ms step:380/1480 train_time:54679ms step_avg:147.78ms step:381/1480 train_time:54831ms step_avg:147.79ms step:382/1480 train_time:54985ms step_avg:147.81ms step:383/1480 train_time:55140ms step_avg:147.83ms step:384/1480 train_time:55294ms step_avg:147.85ms step:385/1480 train_time:55449ms step_avg:147.86ms step:386/1480 train_time:55601ms step_avg:147.88ms step:387/1480 train_time:55757ms step_avg:147.90ms step:388/1480 train_time:55910ms step_avg:147.91ms step:389/1480 train_time:56064ms step_avg:147.93ms step:390/1480 train_time:56218ms step_avg:147.94ms step:391/1480 train_time:56372ms step_avg:147.96ms step:392/1480 train_time:56524ms step_avg:147.97ms step:393/1480 train_time:56677ms step_avg:147.98ms step:394/1480 train_time:56830ms step_avg:147.99ms step:395/1480 train_time:56983ms step_avg:148.01ms step:396/1480 train_time:57137ms step_avg:148.02ms step:397/1480 train_time:57293ms step_avg:148.04ms step:398/1480 train_time:57447ms step_avg:148.06ms step:399/1480 train_time:57601ms step_avg:148.07ms step:400/1480 train_time:57755ms step_avg:148.09ms step:401/1480 train_time:57909ms step_avg:148.10ms step:402/1480 train_time:58062ms step_avg:148.12ms step:403/1480 train_time:58216ms step_avg:148.13ms step:404/1480 train_time:58370ms step_avg:148.15ms step:405/1480 train_time:58524ms step_avg:148.16ms step:406/1480 train_time:58677ms step_avg:148.17ms step:407/1480 train_time:58831ms step_avg:148.19ms step:408/1480 train_time:58985ms step_avg:148.20ms step:409/1480 train_time:59138ms step_avg:148.22ms step:410/1480 train_time:59292ms step_avg:148.23ms step:411/1480 train_time:59446ms step_avg:148.25ms step:412/1480 train_time:59600ms step_avg:148.26ms step:413/1480 train_time:59754ms step_avg:148.27ms step:414/1480 train_time:59908ms step_avg:148.29ms step:415/1480 train_time:60062ms step_avg:148.30ms step:416/1480 train_time:60215ms step_avg:148.31ms step:417/1480 train_time:60368ms step_avg:148.33ms step:418/1480 
train_time:60522ms step_avg:148.34ms step:419/1480 train_time:60676ms step_avg:148.35ms step:420/1480 train_time:60829ms step_avg:148.36ms step:421/1480 train_time:60982ms step_avg:148.38ms step:422/1480 train_time:61136ms step_avg:148.39ms step:423/1480 train_time:61290ms step_avg:148.40ms step:424/1480 train_time:61444ms step_avg:148.41ms step:425/1480 train_time:61597ms step_avg:148.43ms step:426/1480 train_time:61751ms step_avg:148.44ms step:427/1480 train_time:61905ms step_avg:148.45ms step:428/1480 train_time:62059ms step_avg:148.47ms step:429/1480 train_time:62213ms step_avg:148.48ms step:430/1480 train_time:62366ms step_avg:148.49ms step:431/1480 train_time:62519ms step_avg:148.50ms step:432/1480 train_time:62672ms step_avg:148.51ms step:433/1480 train_time:62824ms step_avg:148.52ms step:434/1480 train_time:62979ms step_avg:148.53ms step:435/1480 train_time:63134ms step_avg:148.55ms step:436/1480 train_time:63288ms step_avg:148.56ms step:437/1480 train_time:63442ms step_avg:148.58ms step:438/1480 train_time:63596ms step_avg:148.59ms step:439/1480 train_time:63750ms step_avg:148.60ms step:440/1480 train_time:63904ms step_avg:148.61ms step:441/1480 train_time:64061ms step_avg:148.63ms step:442/1480 train_time:64218ms step_avg:148.65ms step:443/1480 train_time:64375ms step_avg:148.67ms step:444/1480 train_time:64532ms step_avg:148.69ms step:445/1480 train_time:64689ms step_avg:148.71ms step:446/1480 train_time:64845ms step_avg:148.73ms step:447/1480 train_time:65000ms step_avg:148.74ms step:448/1480 train_time:65157ms step_avg:148.76ms step:449/1480 train_time:65316ms step_avg:148.78ms step:450/1480 train_time:65474ms step_avg:148.80ms step:451/1480 train_time:65631ms step_avg:148.82ms step:452/1480 train_time:65787ms step_avg:148.84ms step:453/1480 train_time:65943ms step_avg:148.85ms step:454/1480 train_time:66098ms step_avg:148.87ms step:455/1480 train_time:66255ms step_avg:148.89ms step:456/1480 train_time:66412ms step_avg:148.91ms step:457/1480 
train_time:66570ms step_avg:148.93ms step:458/1480 train_time:66725ms step_avg:148.94ms step:459/1480 train_time:66881ms step_avg:148.96ms step:460/1480 train_time:67036ms step_avg:148.97ms step:461/1480 train_time:67195ms step_avg:148.99ms step:462/1480 train_time:67353ms step_avg:149.01ms step:463/1480 train_time:67510ms step_avg:149.03ms step:464/1480 train_time:67668ms step_avg:149.05ms step:465/1480 train_time:67823ms step_avg:149.06ms step:466/1480 train_time:67979ms step_avg:149.08ms step:467/1480 train_time:68136ms step_avg:149.09ms step:468/1480 train_time:68291ms step_avg:149.11ms step:469/1480 train_time:68449ms step_avg:149.13ms step:470/1480 train_time:68607ms step_avg:149.15ms step:471/1480 train_time:68764ms step_avg:149.16ms step:472/1480 train_time:68920ms step_avg:149.18ms step:473/1480 train_time:69077ms step_avg:149.19ms step:474/1480 train_time:69233ms step_avg:149.21ms step:475/1480 train_time:69391ms step_avg:149.23ms step:476/1480 train_time:69548ms step_avg:149.24ms step:477/1480 train_time:69704ms step_avg:149.26ms step:478/1480 train_time:69861ms step_avg:149.28ms step:479/1480 train_time:70018ms step_avg:149.29ms step:480/1480 train_time:70175ms step_avg:149.31ms step:481/1480 train_time:70332ms step_avg:149.33ms step:482/1480 train_time:70490ms step_avg:149.34ms step:483/1480 train_time:70647ms step_avg:149.36ms step:484/1480 train_time:70802ms step_avg:149.37ms step:485/1480 train_time:70960ms step_avg:149.39ms step:486/1480 train_time:71116ms step_avg:149.40ms step:487/1480 train_time:71273ms step_avg:149.42ms step:488/1480 train_time:71429ms step_avg:149.43ms step:489/1480 train_time:71587ms step_avg:149.45ms step:490/1480 train_time:71744ms step_avg:149.47ms step:491/1480 train_time:71900ms step_avg:149.48ms step:492/1480 train_time:72057ms step_avg:149.50ms step:493/1480 train_time:72215ms step_avg:149.51ms step:494/1480 train_time:72372ms step_avg:149.53ms step:495/1480 train_time:72530ms step_avg:149.55ms step:496/1480 
train_time:72688ms step_avg:149.56ms step:497/1480 train_time:72844ms step_avg:149.58ms step:498/1480 train_time:73002ms step_avg:149.59ms step:499/1480 train_time:73160ms step_avg:149.61ms step:500/1480 train_time:73318ms step_avg:149.63ms step:500/1480 val_loss:3.6827 train_time:73379ms step_avg:149.75ms step:501/1480 train_time:73481ms step_avg:149.66ms step:502/1480 train_time:73639ms step_avg:149.67ms step:503/1480 train_time:73796ms step_avg:149.69ms step:504/1480 train_time:73952ms step_avg:149.70ms step:505/1480 train_time:74106ms step_avg:149.71ms step:506/1480 train_time:74263ms step_avg:149.72ms step:507/1480 train_time:74418ms step_avg:149.73ms step:508/1480 train_time:74578ms step_avg:149.75ms step:509/1480 train_time:74735ms step_avg:149.77ms step:510/1480 train_time:74891ms step_avg:149.78ms step:511/1480 train_time:75048ms step_avg:149.80ms step:512/1480 train_time:75205ms step_avg:149.81ms step:513/1480 train_time:75362ms step_avg:149.82ms step:514/1480 train_time:75519ms step_avg:149.84ms step:515/1480 train_time:75677ms step_avg:149.85ms step:516/1480 train_time:75836ms step_avg:149.87ms step:517/1480 train_time:75993ms step_avg:149.89ms step:518/1480 train_time:76152ms step_avg:149.90ms step:519/1480 train_time:76309ms step_avg:149.92ms step:520/1480 train_time:76465ms step_avg:149.93ms step:521/1480 train_time:76621ms step_avg:149.94ms step:522/1480 train_time:76779ms step_avg:149.96ms step:523/1480 train_time:76938ms step_avg:149.98ms step:524/1480 train_time:77096ms step_avg:149.99ms step:525/1480 train_time:77254ms step_avg:150.01ms step:526/1480 train_time:77410ms step_avg:150.02ms step:527/1480 train_time:77567ms step_avg:150.03ms step:528/1480 train_time:77723ms step_avg:150.04ms step:529/1480 train_time:77882ms step_avg:150.06ms step:530/1480 train_time:78039ms step_avg:150.08ms step:531/1480 train_time:78196ms step_avg:150.09ms step:532/1480 train_time:78354ms step_avg:150.10ms step:533/1480 train_time:78512ms step_avg:150.12ms 
step:534/1480 train_time:78668ms step_avg:150.13ms step:535/1480 train_time:78824ms step_avg:150.14ms step:536/1480 train_time:78981ms step_avg:150.15ms step:537/1480 train_time:79138ms step_avg:150.17ms step:538/1480 train_time:79295ms step_avg:150.18ms step:539/1480 train_time:79453ms step_avg:150.20ms step:540/1480 train_time:79610ms step_avg:150.21ms step:541/1480 train_time:79766ms step_avg:150.22ms step:542/1480 train_time:79923ms step_avg:150.23ms step:543/1480 train_time:80080ms step_avg:150.24ms step:544/1480 train_time:80237ms step_avg:150.26ms step:545/1480 train_time:80393ms step_avg:150.27ms step:546/1480 train_time:80549ms step_avg:150.28ms step:547/1480 train_time:80705ms step_avg:150.29ms step:548/1480 train_time:80864ms step_avg:150.30ms step:549/1480 train_time:81020ms step_avg:150.32ms step:550/1480 train_time:81180ms step_avg:150.33ms step:551/1480 train_time:81339ms step_avg:150.35ms step:552/1480 train_time:81498ms step_avg:150.37ms step:553/1480 train_time:81660ms step_avg:150.39ms step:554/1480 train_time:81820ms step_avg:150.40ms step:555/1480 train_time:81980ms step_avg:150.42ms step:556/1480 train_time:82139ms step_avg:150.44ms step:557/1480 train_time:82299ms step_avg:150.46ms step:558/1480 train_time:82457ms step_avg:150.47ms step:559/1480 train_time:82618ms step_avg:150.49ms step:560/1480 train_time:82780ms step_avg:150.51ms step:561/1480 train_time:82940ms step_avg:150.53ms step:562/1480 train_time:83100ms step_avg:150.54ms step:563/1480 train_time:83258ms step_avg:150.56ms step:564/1480 train_time:83418ms step_avg:150.57ms step:565/1480 train_time:83577ms step_avg:150.59ms step:566/1480 train_time:83738ms step_avg:150.61ms step:567/1480 train_time:83898ms step_avg:150.62ms step:568/1480 train_time:84058ms step_avg:150.64ms step:569/1480 train_time:84217ms step_avg:150.66ms step:570/1480 train_time:84377ms step_avg:150.67ms step:571/1480 train_time:84536ms step_avg:150.69ms step:572/1480 train_time:84695ms step_avg:150.70ms 
step:573/1480 train_time:84855ms step_avg:150.72ms step:574/1480 train_time:85015ms step_avg:150.74ms step:575/1480 train_time:85177ms step_avg:150.76ms step:576/1480 train_time:85338ms step_avg:150.77ms step:577/1480 train_time:85497ms step_avg:150.79ms step:578/1480 train_time:85658ms step_avg:150.81ms step:579/1480 train_time:85818ms step_avg:150.82ms step:580/1480 train_time:85978ms step_avg:150.84ms step:581/1480 train_time:86138ms step_avg:150.85ms step:582/1480 train_time:86299ms step_avg:150.87ms step:583/1480 train_time:86459ms step_avg:150.89ms step:584/1480 train_time:86619ms step_avg:150.90ms step:585/1480 train_time:86779ms step_avg:150.92ms step:586/1480 train_time:86939ms step_avg:150.94ms step:587/1480 train_time:87099ms step_avg:150.95ms step:588/1480 train_time:87259ms step_avg:150.97ms step:589/1480 train_time:87418ms step_avg:150.98ms step:590/1480 train_time:87579ms step_avg:151.00ms step:591/1480 train_time:87739ms step_avg:151.01ms step:592/1480 train_time:87899ms step_avg:151.03ms step:593/1480 train_time:88061ms step_avg:151.05ms step:594/1480 train_time:88221ms step_avg:151.06ms step:595/1480 train_time:88384ms step_avg:151.08ms step:596/1480 train_time:88544ms step_avg:151.10ms step:597/1480 train_time:88702ms step_avg:151.11ms step:598/1480 train_time:88860ms step_avg:151.12ms step:599/1480 train_time:89019ms step_avg:151.14ms step:600/1480 train_time:89180ms step_avg:151.15ms step:601/1480 train_time:89339ms step_avg:151.17ms step:602/1480 train_time:89500ms step_avg:151.18ms step:603/1480 train_time:89660ms step_avg:151.20ms step:604/1480 train_time:89819ms step_avg:151.21ms step:605/1480 train_time:89979ms step_avg:151.22ms step:606/1480 train_time:90141ms step_avg:151.24ms step:607/1480 train_time:90302ms step_avg:151.26ms step:608/1480 train_time:90461ms step_avg:151.27ms step:609/1480 train_time:90620ms step_avg:151.29ms step:610/1480 train_time:90779ms step_avg:151.30ms step:611/1480 train_time:90940ms step_avg:151.31ms 
step:612/1480 train_time:91099ms step_avg:151.33ms step:613/1480 train_time:91260ms step_avg:151.34ms step:614/1480 train_time:91419ms step_avg:151.36ms step:615/1480 train_time:91579ms step_avg:151.37ms step:616/1480 train_time:91738ms step_avg:151.38ms step:617/1480 train_time:91897ms step_avg:151.40ms step:618/1480 train_time:92057ms step_avg:151.41ms step:619/1480 train_time:92217ms step_avg:151.42ms step:620/1480 train_time:92378ms step_avg:151.44ms step:621/1480 train_time:92539ms step_avg:151.45ms step:622/1480 train_time:92698ms step_avg:151.47ms step:623/1480 train_time:92859ms step_avg:151.48ms step:624/1480 train_time:93018ms step_avg:151.50ms step:625/1480 train_time:93178ms step_avg:151.51ms step:625/1480 val_loss:3.6062 train_time:93241ms step_avg:151.61ms step:626/1480 train_time:93339ms step_avg:151.52ms step:627/1480 train_time:93500ms step_avg:151.54ms step:628/1480 train_time:93660ms step_avg:151.55ms step:629/1480 train_time:93819ms step_avg:151.57ms step:630/1480 train_time:93977ms step_avg:151.58ms step:631/1480 train_time:94135ms step_avg:151.59ms step:632/1480 train_time:94294ms step_avg:151.60ms step:633/1480 train_time:94453ms step_avg:151.61ms step:634/1480 train_time:94613ms step_avg:151.62ms step:635/1480 train_time:94771ms step_avg:151.63ms step:636/1480 train_time:94930ms step_avg:151.65ms step:637/1480 train_time:95090ms step_avg:151.66ms step:638/1480 train_time:95249ms step_avg:151.67ms step:639/1480 train_time:95407ms step_avg:151.68ms step:640/1480 train_time:95567ms step_avg:151.69ms step:641/1480 train_time:95728ms step_avg:151.71ms step:642/1480 train_time:95886ms step_avg:151.72ms step:643/1480 train_time:96046ms step_avg:151.73ms step:644/1480 train_time:96206ms step_avg:151.74ms step:645/1480 train_time:96365ms step_avg:151.76ms step:646/1480 train_time:96525ms step_avg:151.77ms step:647/1480 train_time:96685ms step_avg:151.78ms step:648/1480 train_time:96848ms step_avg:151.80ms step:649/1480 train_time:97008ms 
step_avg:151.81ms step:650/1480 train_time:97167ms step_avg:151.82ms step:651/1480 train_time:97327ms step_avg:151.84ms step:652/1480 train_time:97487ms step_avg:151.85ms step:653/1480 train_time:97646ms step_avg:151.86ms step:654/1480 train_time:97807ms step_avg:151.87ms step:655/1480 train_time:97966ms step_avg:151.88ms step:656/1480 train_time:98126ms step_avg:151.90ms step:657/1480 train_time:98286ms step_avg:151.91ms step:658/1480 train_time:98446ms step_avg:151.92ms step:659/1480 train_time:98608ms step_avg:151.94ms step:660/1480 train_time:98770ms step_avg:151.95ms step:661/1480 train_time:98931ms step_avg:151.97ms step:662/1480 train_time:99091ms step_avg:151.98ms step:663/1480 train_time:99251ms step_avg:151.99ms step:664/1480 train_time:99414ms step_avg:152.01ms step:665/1480 train_time:99575ms step_avg:152.02ms step:666/1480 train_time:99735ms step_avg:152.04ms step:667/1480 train_time:99895ms step_avg:152.05ms step:668/1480 train_time:100057ms step_avg:152.06ms step:669/1480 train_time:100220ms step_avg:152.08ms step:670/1480 train_time:100380ms step_avg:152.09ms step:671/1480 train_time:100542ms step_avg:152.11ms step:672/1480 train_time:100704ms step_avg:152.12ms step:673/1480 train_time:100871ms step_avg:152.14ms step:674/1480 train_time:101033ms step_avg:152.16ms step:675/1480 train_time:101194ms step_avg:152.17ms step:676/1480 train_time:101355ms step_avg:152.18ms step:677/1480 train_time:101515ms step_avg:152.20ms step:678/1480 train_time:101677ms step_avg:152.21ms step:679/1480 train_time:101839ms step_avg:152.23ms step:680/1480 train_time:102001ms step_avg:152.24ms step:681/1480 train_time:102161ms step_avg:152.25ms step:682/1480 train_time:102323ms step_avg:152.27ms step:683/1480 train_time:102485ms step_avg:152.28ms step:684/1480 train_time:102647ms step_avg:152.30ms step:685/1480 train_time:102811ms step_avg:152.31ms step:686/1480 train_time:102971ms step_avg:152.32ms step:687/1480 train_time:103131ms step_avg:152.34ms step:688/1480 
train_time:103293ms step_avg:152.35ms step:689/1480 train_time:103455ms step_avg:152.36ms step:690/1480 train_time:103619ms step_avg:152.38ms step:691/1480 train_time:103781ms step_avg:152.40ms step:692/1480 train_time:103944ms step_avg:152.41ms step:693/1480 train_time:104107ms step_avg:152.43ms step:694/1480 train_time:104270ms step_avg:152.44ms step:695/1480 train_time:104430ms step_avg:152.45ms step:696/1480 train_time:104590ms step_avg:152.46ms step:697/1480 train_time:104753ms step_avg:152.48ms step:698/1480 train_time:104913ms step_avg:152.49ms step:699/1480 train_time:105076ms step_avg:152.51ms step:700/1480 train_time:105239ms step_avg:152.52ms step:701/1480 train_time:105400ms step_avg:152.53ms step:702/1480 train_time:105559ms step_avg:152.54ms step:703/1480 train_time:105721ms step_avg:152.56ms step:704/1480 train_time:105883ms step_avg:152.57ms step:705/1480 train_time:106048ms step_avg:152.59ms step:706/1480 train_time:106212ms step_avg:152.60ms step:707/1480 train_time:106375ms step_avg:152.62ms step:708/1480 train_time:106535ms step_avg:152.63ms step:709/1480 train_time:106696ms step_avg:152.64ms step:710/1480 train_time:106855ms step_avg:152.65ms step:711/1480 train_time:107016ms step_avg:152.66ms step:712/1480 train_time:107181ms step_avg:152.68ms step:713/1480 train_time:107347ms step_avg:152.70ms step:714/1480 train_time:107509ms step_avg:152.71ms step:715/1480 train_time:107669ms step_avg:152.72ms step:716/1480 train_time:107829ms step_avg:152.73ms step:717/1480 train_time:107989ms step_avg:152.74ms step:718/1480 train_time:108149ms step_avg:152.75ms step:719/1480 train_time:108309ms step_avg:152.76ms step:720/1480 train_time:108471ms step_avg:152.78ms step:721/1480 train_time:108631ms step_avg:152.79ms step:722/1480 train_time:108793ms step_avg:152.80ms step:723/1480 train_time:108953ms step_avg:152.81ms step:724/1480 train_time:109116ms step_avg:152.82ms step:725/1480 train_time:109279ms step_avg:152.84ms step:726/1480 train_time:109442ms 
step_avg:152.85ms step:727/1480 train_time:109606ms step_avg:152.87ms step:728/1480 train_time:109768ms step_avg:152.88ms step:729/1480 train_time:109929ms step_avg:152.89ms step:730/1480 train_time:110090ms step_avg:152.90ms step:731/1480 train_time:110251ms step_avg:152.91ms step:732/1480 train_time:110411ms step_avg:152.92ms step:733/1480 train_time:110573ms step_avg:152.94ms step:734/1480 train_time:110735ms step_avg:152.95ms step:735/1480 train_time:110894ms step_avg:152.96ms step:736/1480 train_time:111056ms step_avg:152.97ms step:737/1480 train_time:111220ms step_avg:152.98ms step:738/1480 train_time:111382ms step_avg:153.00ms step:739/1480 train_time:111543ms step_avg:153.01ms step:740/1480 train_time:111710ms step_avg:153.03ms step:741/1480 train_time:111873ms step_avg:153.04ms step:742/1480 train_time:112034ms step_avg:153.05ms step:743/1480 train_time:112194ms step_avg:153.06ms step:744/1480 train_time:112357ms step_avg:153.08ms step:745/1480 train_time:112523ms step_avg:153.09ms step:746/1480 train_time:112684ms step_avg:153.10ms step:747/1480 train_time:112846ms step_avg:153.12ms step:748/1480 train_time:113011ms step_avg:153.13ms step:749/1480 train_time:113174ms step_avg:153.14ms step:750/1480 train_time:113333ms step_avg:153.15ms step:750/1480 val_loss:3.5480 train_time:113396ms step_avg:153.24ms step:751/1480 train_time:113497ms step_avg:153.17ms step:752/1480 train_time:113659ms step_avg:153.18ms step:753/1480 train_time:113820ms step_avg:153.19ms step:754/1480 train_time:113981ms step_avg:153.20ms step:755/1480 train_time:114143ms step_avg:153.21ms step:756/1480 train_time:114304ms step_avg:153.22ms step:757/1480 train_time:114471ms step_avg:153.24ms step:758/1480 train_time:114632ms step_avg:153.25ms step:759/1480 train_time:114794ms step_avg:153.26ms step:760/1480 train_time:114955ms step_avg:153.27ms step:761/1480 train_time:115117ms step_avg:153.28ms step:762/1480 train_time:115277ms step_avg:153.29ms step:763/1480 train_time:115440ms 
step_avg:153.31ms step:764/1480 train_time:115601ms step_avg:153.32ms step:765/1480 train_time:115764ms step_avg:153.33ms step:766/1480 train_time:115926ms step_avg:153.34ms step:767/1480 train_time:116089ms step_avg:153.35ms step:768/1480 train_time:116251ms step_avg:153.37ms step:769/1480 train_time:116415ms step_avg:153.38ms step:770/1480 train_time:116577ms step_avg:153.39ms step:771/1480 train_time:116739ms step_avg:153.40ms step:772/1480 train_time:116902ms step_avg:153.41ms step:773/1480 train_time:117063ms step_avg:153.42ms step:774/1480 train_time:117226ms step_avg:153.44ms step:775/1480 train_time:117390ms step_avg:153.45ms step:776/1480 train_time:117555ms step_avg:153.47ms step:777/1480 train_time:117719ms step_avg:153.48ms step:778/1480 train_time:117882ms step_avg:153.49ms step:779/1480 train_time:118044ms step_avg:153.50ms step:780/1480 train_time:118209ms step_avg:153.52ms step:781/1480 train_time:118373ms step_avg:153.53ms step:782/1480 train_time:118537ms step_avg:153.55ms step:783/1480 train_time:118699ms step_avg:153.56ms step:784/1480 train_time:118861ms step_avg:153.57ms step:785/1480 train_time:119022ms step_avg:153.58ms step:786/1480 train_time:119188ms step_avg:153.59ms step:787/1480 train_time:119352ms step_avg:153.61ms step:788/1480 train_time:119515ms step_avg:153.62ms step:789/1480 train_time:119676ms step_avg:153.63ms step:790/1480 train_time:119842ms step_avg:153.64ms step:791/1480 train_time:120010ms step_avg:153.66ms step:792/1480 train_time:120174ms step_avg:153.68ms step:793/1480 train_time:120336ms step_avg:153.69ms step:794/1480 train_time:120500ms step_avg:153.70ms step:795/1480 train_time:120665ms step_avg:153.71ms step:796/1480 train_time:120832ms step_avg:153.73ms step:797/1480 train_time:120996ms step_avg:153.74ms step:798/1480 train_time:121160ms step_avg:153.76ms step:799/1480 train_time:121327ms step_avg:153.77ms step:800/1480 train_time:121491ms step_avg:153.79ms step:801/1480 train_time:121654ms step_avg:153.80ms 
step:802/1480 train_time:121821ms step_avg:153.81ms step:803/1480 train_time:121982ms step_avg:153.82ms step:804/1480 train_time:122143ms step_avg:153.83ms step:805/1480 train_time:122308ms step_avg:153.85ms step:806/1480 train_time:122471ms step_avg:153.86ms step:807/1480 train_time:122633ms step_avg:153.87ms step:808/1480 train_time:122797ms step_avg:153.88ms step:809/1480 train_time:122959ms step_avg:153.89ms step:810/1480 train_time:123120ms step_avg:153.90ms step:811/1480 train_time:123283ms step_avg:153.91ms step:812/1480 train_time:123446ms step_avg:153.92ms step:813/1480 train_time:123607ms step_avg:153.93ms step:814/1480 train_time:123772ms step_avg:153.95ms step:815/1480 train_time:123934ms step_avg:153.96ms step:816/1480 train_time:124100ms step_avg:153.97ms step:817/1480 train_time:124261ms step_avg:153.98ms step:818/1480 train_time:124421ms step_avg:153.99ms step:819/1480 train_time:124586ms step_avg:154.00ms step:820/1480 train_time:124751ms step_avg:154.01ms step:821/1480 train_time:124914ms step_avg:154.02ms step:822/1480 train_time:125077ms step_avg:154.04ms step:823/1480 train_time:125239ms step_avg:154.05ms step:824/1480 train_time:125400ms step_avg:154.05ms step:825/1480 train_time:125567ms step_avg:154.07ms step:826/1480 train_time:125733ms step_avg:154.09ms step:827/1480 train_time:125898ms step_avg:154.10ms step:828/1480 train_time:126060ms step_avg:154.11ms step:829/1480 train_time:126223ms step_avg:154.12ms step:830/1480 train_time:126388ms step_avg:154.13ms step:831/1480 train_time:126552ms step_avg:154.14ms step:832/1480 train_time:126716ms step_avg:154.16ms step:833/1480 train_time:126879ms step_avg:154.17ms step:834/1480 train_time:127043ms step_avg:154.18ms step:835/1480 train_time:127206ms step_avg:154.19ms step:836/1480 train_time:127371ms step_avg:154.20ms step:837/1480 train_time:127534ms step_avg:154.21ms step:838/1480 train_time:127697ms step_avg:154.22ms step:839/1480 train_time:127860ms step_avg:154.23ms step:840/1480 
train_time:128020ms step_avg:154.24ms step:841/1480 train_time:128181ms step_avg:154.25ms step:842/1480 train_time:128346ms step_avg:154.26ms step:843/1480 train_time:128509ms step_avg:154.27ms step:844/1480 train_time:128672ms step_avg:154.28ms step:845/1480 train_time:128836ms step_avg:154.29ms step:846/1480 train_time:129000ms step_avg:154.31ms step:847/1480 train_time:129163ms step_avg:154.32ms step:848/1480 train_time:129325ms step_avg:154.33ms step:849/1480 train_time:129489ms step_avg:154.34ms step:850/1480 train_time:129652ms step_avg:154.35ms step:851/1480 train_time:129816ms step_avg:154.36ms step:852/1480 train_time:129978ms step_avg:154.37ms step:853/1480 train_time:130139ms step_avg:154.38ms step:854/1480 train_time:130305ms step_avg:154.39ms step:855/1480 train_time:130469ms step_avg:154.40ms step:856/1480 train_time:130631ms step_avg:154.41ms step:857/1480 train_time:130797ms step_avg:154.42ms step:858/1480 train_time:130963ms step_avg:154.44ms step:859/1480 train_time:131127ms step_avg:154.45ms step:860/1480 train_time:131290ms step_avg:154.46ms step:861/1480 train_time:131455ms step_avg:154.47ms step:862/1480 train_time:131623ms step_avg:154.49ms step:863/1480 train_time:131793ms step_avg:154.51ms step:864/1480 train_time:131956ms step_avg:154.52ms step:865/1480 train_time:132117ms step_avg:154.52ms step:866/1480 train_time:132283ms step_avg:154.54ms step:867/1480 train_time:132446ms step_avg:154.55ms step:868/1480 train_time:132607ms step_avg:154.55ms step:869/1480 train_time:132770ms step_avg:154.56ms step:870/1480 train_time:132934ms step_avg:154.57ms step:871/1480 train_time:133096ms step_avg:154.58ms step:872/1480 train_time:133258ms step_avg:154.59ms step:873/1480 train_time:133420ms step_avg:154.60ms step:874/1480 train_time:133586ms step_avg:154.61ms step:875/1480 train_time:133752ms step_avg:154.63ms step:875/1480 val_loss:3.5026 train_time:133818ms step_avg:154.70ms step:876/1480 train_time:133919ms step_avg:154.64ms step:877/1480 
train_time:134084ms step_avg:154.65ms step:878/1480 train_time:134247ms step_avg:154.66ms step:879/1480 train_time:134411ms step_avg:154.67ms step:880/1480 train_time:134575ms step_avg:154.68ms step:881/1480 train_time:134738ms step_avg:154.69ms step:882/1480 train_time:134902ms step_avg:154.70ms step:883/1480 train_time:135069ms step_avg:154.72ms step:884/1480 train_time:135236ms step_avg:154.73ms step:885/1480 train_time:135401ms step_avg:154.74ms step:886/1480 train_time:135568ms step_avg:154.76ms step:887/1480 train_time:135737ms step_avg:154.77ms step:888/1480 train_time:135909ms step_avg:154.79ms step:889/1480 train_time:136077ms step_avg:154.81ms step:890/1480 train_time:136239ms step_avg:154.82ms step:891/1480 train_time:136403ms step_avg:154.83ms step:892/1480 train_time:136568ms step_avg:154.84ms step:893/1480 train_time:136732ms step_avg:154.85ms step:894/1480 train_time:136899ms step_avg:154.86ms step:895/1480 train_time:137063ms step_avg:154.87ms step:896/1480 train_time:137228ms step_avg:154.89ms step:897/1480 train_time:137396ms step_avg:154.90ms step:898/1480 train_time:137563ms step_avg:154.91ms step:899/1480 train_time:137726ms step_avg:154.92ms step:900/1480 train_time:137889ms step_avg:154.93ms step:901/1480 train_time:138053ms step_avg:154.94ms step:902/1480 train_time:138218ms step_avg:154.95ms step:903/1480 train_time:138388ms step_avg:154.97ms step:904/1480 train_time:138555ms step_avg:154.98ms step:905/1480 train_time:138719ms step_avg:154.99ms step:906/1480 train_time:138884ms step_avg:155.00ms step:907/1480 train_time:139054ms step_avg:155.02ms step:908/1480 train_time:139218ms step_avg:155.03ms step:909/1480 train_time:139383ms step_avg:155.04ms step:910/1480 train_time:139555ms step_avg:155.06ms step:911/1480 train_time:139720ms step_avg:155.07ms step:912/1480 train_time:139886ms step_avg:155.08ms step:913/1480 train_time:140055ms step_avg:155.10ms step:914/1480 train_time:140223ms step_avg:155.11ms step:915/1480 train_time:140394ms 
step_avg:155.13ms step:916/1480 train_time:140558ms step_avg:155.14ms step:917/1480 train_time:140721ms step_avg:155.15ms step:918/1480 train_time:140890ms step_avg:155.17ms step:919/1480 train_time:141060ms step_avg:155.18ms step:920/1480 train_time:141224ms step_avg:155.19ms step:921/1480 train_time:141391ms step_avg:155.20ms step:922/1480 train_time:141559ms step_avg:155.22ms step:923/1480 train_time:141721ms step_avg:155.23ms step:924/1480 train_time:141886ms step_avg:155.24ms step:925/1480 train_time:142052ms step_avg:155.25ms step:926/1480 train_time:142216ms step_avg:155.26ms step:927/1480 train_time:142380ms step_avg:155.27ms step:928/1480 train_time:142545ms step_avg:155.28ms step:929/1480 train_time:142710ms step_avg:155.29ms step:930/1480 train_time:142877ms step_avg:155.30ms step:931/1480 train_time:143040ms step_avg:155.31ms step:932/1480 train_time:143206ms step_avg:155.32ms step:933/1480 train_time:143375ms step_avg:155.34ms step:934/1480 train_time:143541ms step_avg:155.35ms step:935/1480 train_time:143711ms step_avg:155.36ms step:936/1480 train_time:143879ms step_avg:155.38ms step:937/1480 train_time:144048ms step_avg:155.39ms step:938/1480 train_time:144209ms step_avg:155.40ms step:939/1480 train_time:144378ms step_avg:155.41ms step:940/1480 train_time:144543ms step_avg:155.42ms step:941/1480 train_time:144707ms step_avg:155.43ms step:942/1480 train_time:144873ms step_avg:155.44ms step:943/1480 train_time:145044ms step_avg:155.46ms step:944/1480 train_time:145217ms step_avg:155.48ms step:945/1480 train_time:145380ms step_avg:155.49ms step:946/1480 train_time:145548ms step_avg:155.50ms step:947/1480 train_time:145717ms step_avg:155.51ms step:948/1480 train_time:145883ms step_avg:155.53ms step:949/1480 train_time:146048ms step_avg:155.54ms step:950/1480 train_time:146213ms step_avg:155.55ms step:951/1480 train_time:146380ms step_avg:155.56ms step:952/1480 train_time:146544ms step_avg:155.57ms step:953/1480 train_time:146711ms step_avg:155.58ms 
step:954/1480 train_time:146879ms step_avg:155.59ms step:955/1480 train_time:147042ms step_avg:155.60ms step:956/1480 train_time:147208ms step_avg:155.61ms step:957/1480 train_time:147377ms step_avg:155.63ms step:958/1480 train_time:147546ms step_avg:155.64ms step:959/1480 train_time:147711ms step_avg:155.65ms step:960/1480 train_time:147879ms step_avg:155.66ms step:961/1480 train_time:148044ms step_avg:155.67ms step:962/1480 train_time:148207ms step_avg:155.68ms step:963/1480 train_time:148374ms step_avg:155.69ms step:964/1480 train_time:148542ms step_avg:155.70ms step:965/1480 train_time:148705ms step_avg:155.71ms step:966/1480 train_time:148870ms step_avg:155.72ms step:967/1480 train_time:149034ms step_avg:155.73ms step:968/1480 train_time:149199ms step_avg:155.74ms step:969/1480 train_time:149365ms step_avg:155.75ms step:970/1480 train_time:149527ms step_avg:155.76ms step:971/1480 train_time:149692ms step_avg:155.77ms step:972/1480 train_time:149859ms step_avg:155.78ms step:973/1480 train_time:150022ms step_avg:155.79ms step:974/1480 train_time:150192ms step_avg:155.80ms step:975/1480 train_time:150358ms step_avg:155.81ms step:976/1480 train_time:150523ms step_avg:155.82ms step:977/1480 train_time:150686ms step_avg:155.83ms step:978/1480 train_time:150850ms step_avg:155.84ms step:979/1480 train_time:151017ms step_avg:155.85ms step:980/1480 train_time:151183ms step_avg:155.86ms step:981/1480 train_time:151354ms step_avg:155.87ms step:982/1480 train_time:151518ms step_avg:155.88ms step:983/1480 train_time:151682ms step_avg:155.89ms step:984/1480 train_time:151846ms step_avg:155.90ms step:985/1480 train_time:152013ms step_avg:155.91ms step:986/1480 train_time:152179ms step_avg:155.92ms step:987/1480 train_time:152343ms step_avg:155.93ms step:988/1480 train_time:152509ms step_avg:155.94ms step:989/1480 train_time:152675ms step_avg:155.95ms step:990/1480 train_time:152846ms step_avg:155.96ms step:991/1480 train_time:153014ms step_avg:155.98ms step:992/1480 
train_time:153187ms step_avg:155.99ms step:993/1480 train_time:153363ms step_avg:156.02ms step:994/1480 train_time:153527ms step_avg:156.02ms step:995/1480 train_time:153691ms step_avg:156.03ms step:996/1480 train_time:153854ms step_avg:156.04ms step:997/1480 train_time:154018ms step_avg:156.05ms step:998/1480 train_time:154182ms step_avg:156.05ms step:999/1480 train_time:154349ms step_avg:156.07ms step:1000/1480 train_time:154516ms step_avg:156.08ms step:1000/1480 val_loss:3.4389 train_time:154584ms step_avg:156.15ms step:1001/1480 train_time:154688ms step_avg:156.09ms step:1002/1480 train_time:154854ms step_avg:156.10ms step:1003/1480 train_time:155024ms step_avg:156.12ms step:1004/1480 train_time:155195ms step_avg:156.13ms step:1005/1480 train_time:155362ms step_avg:156.14ms step:1006/1480 train_time:155531ms step_avg:156.16ms step:1007/1480 train_time:155696ms step_avg:156.16ms step:1008/1480 train_time:155862ms step_avg:156.17ms step:1009/1480 train_time:156036ms step_avg:156.19ms step:1010/1480 train_time:156201ms step_avg:156.20ms step:1011/1480 train_time:156368ms step_avg:156.21ms step:1012/1480 train_time:156535ms step_avg:156.22ms step:1013/1480 train_time:156705ms step_avg:156.24ms step:1014/1480 train_time:156872ms step_avg:156.25ms step:1015/1480 train_time:157042ms step_avg:156.26ms step:1016/1480 train_time:157209ms step_avg:156.27ms step:1017/1480 train_time:157381ms step_avg:156.29ms step:1018/1480 train_time:157549ms step_avg:156.30ms step:1019/1480 train_time:157717ms step_avg:156.31ms step:1020/1480 train_time:157885ms step_avg:156.32ms step:1021/1480 train_time:158051ms step_avg:156.33ms step:1022/1480 train_time:158219ms step_avg:156.34ms step:1023/1480 train_time:158386ms step_avg:156.35ms step:1024/1480 train_time:158553ms step_avg:156.36ms step:1025/1480 train_time:158727ms step_avg:156.38ms step:1026/1480 train_time:158894ms step_avg:156.39ms step:1027/1480 train_time:159060ms step_avg:156.40ms step:1028/1480 train_time:159232ms 
step_avg:156.42ms step:1029/1480 train_time:159407ms step_avg:156.43ms step:1030/1480 train_time:159575ms step_avg:156.45ms step:1031/1480 train_time:159739ms step_avg:156.45ms step:1032/1480 train_time:159912ms step_avg:156.47ms step:1033/1480 train_time:160079ms step_avg:156.48ms step:1034/1480 train_time:160247ms step_avg:156.49ms step:1035/1480 train_time:160416ms step_avg:156.50ms step:1036/1480 train_time:160582ms step_avg:156.51ms step:1037/1480 train_time:160749ms step_avg:156.52ms step:1038/1480 train_time:160918ms step_avg:156.54ms step:1039/1480 train_time:161090ms step_avg:156.55ms step:1040/1480 train_time:161258ms step_avg:156.56ms step:1041/1480 train_time:161424ms step_avg:156.57ms step:1042/1480 train_time:161587ms step_avg:156.58ms step:1043/1480 train_time:161755ms step_avg:156.59ms step:1044/1480 train_time:161918ms step_avg:156.59ms step:1045/1480 train_time:162088ms step_avg:156.61ms step:1046/1480 train_time:162257ms step_avg:156.62ms step:1047/1480 train_time:162422ms step_avg:156.63ms step:1048/1480 train_time:162588ms step_avg:156.64ms step:1049/1480 train_time:162755ms step_avg:156.65ms step:1050/1480 train_time:162923ms step_avg:156.66ms step:1051/1480 train_time:163095ms step_avg:156.67ms step:1052/1480 train_time:163263ms step_avg:156.68ms step:1053/1480 train_time:163431ms step_avg:156.69ms step:1054/1480 train_time:163599ms step_avg:156.70ms step:1055/1480 train_time:163764ms step_avg:156.71ms step:1056/1480 train_time:163929ms step_avg:156.72ms step:1057/1480 train_time:164096ms step_avg:156.73ms step:1058/1480 train_time:164264ms step_avg:156.74ms step:1059/1480 train_time:164437ms step_avg:156.76ms step:1060/1480 train_time:164606ms step_avg:156.77ms step:1061/1480 train_time:164770ms step_avg:156.77ms step:1062/1480 train_time:164936ms step_avg:156.78ms step:1063/1480 train_time:165101ms step_avg:156.79ms step:1064/1480 train_time:165265ms step_avg:156.80ms step:1065/1480 train_time:165432ms step_avg:156.81ms step:1066/1480 
train_time:165599ms step_avg:156.82ms step:1067/1480 train_time:165768ms step_avg:156.83ms step:1068/1480 train_time:165934ms step_avg:156.84ms step:1069/1480 train_time:166105ms step_avg:156.85ms step:1070/1480 train_time:166271ms step_avg:156.86ms step:1071/1480 train_time:166442ms step_avg:156.87ms step:1072/1480 train_time:166606ms step_avg:156.88ms step:1073/1480 train_time:166770ms step_avg:156.89ms step:1074/1480 train_time:166938ms step_avg:156.90ms step:1075/1480 train_time:167109ms step_avg:156.91ms step:1076/1480 train_time:167277ms step_avg:156.92ms step:1077/1480 train_time:167443ms step_avg:156.93ms step:1078/1480 train_time:167618ms step_avg:156.95ms step:1079/1480 train_time:167791ms step_avg:156.96ms step:1080/1480 train_time:167961ms step_avg:156.97ms step:1081/1480 train_time:168126ms step_avg:156.98ms step:1082/1480 train_time:168294ms step_avg:156.99ms step:1083/1480 train_time:168461ms step_avg:157.00ms step:1084/1480 train_time:168628ms step_avg:157.01ms step:1085/1480 train_time:168797ms step_avg:157.02ms step:1086/1480 train_time:168963ms step_avg:157.03ms step:1087/1480 train_time:169129ms step_avg:157.04ms step:1088/1480 train_time:169300ms step_avg:157.05ms step:1089/1480 train_time:169473ms step_avg:157.06ms step:1090/1480 train_time:169644ms step_avg:157.08ms step:1091/1480 train_time:169812ms step_avg:157.09ms step:1092/1480 train_time:169980ms step_avg:157.10ms step:1093/1480 train_time:170149ms step_avg:157.11ms step:1094/1480 train_time:170316ms step_avg:157.12ms step:1095/1480 train_time:170480ms step_avg:157.12ms step:1096/1480 train_time:170649ms step_avg:157.14ms step:1097/1480 train_time:170817ms step_avg:157.15ms step:1098/1480 train_time:170988ms step_avg:157.16ms step:1099/1480 train_time:171157ms step_avg:157.17ms step:1100/1480 train_time:171329ms step_avg:157.18ms step:1101/1480 train_time:171499ms step_avg:157.19ms step:1102/1480 train_time:171671ms step_avg:157.21ms step:1103/1480 train_time:171847ms step_avg:157.22ms 
step:1104/1480 train_time:172014ms step_avg:157.23ms step:1105/1480 train_time:172183ms step_avg:157.24ms step:1106/1480 train_time:172353ms step_avg:157.26ms step:1107/1480 train_time:172522ms step_avg:157.27ms step:1108/1480 train_time:172687ms step_avg:157.27ms step:1109/1480 train_time:172854ms step_avg:157.28ms step:1110/1480 train_time:173020ms step_avg:157.29ms step:1111/1480 train_time:173190ms step_avg:157.30ms step:1112/1480 train_time:173360ms step_avg:157.31ms step:1113/1480 train_time:173542ms step_avg:157.34ms step:1114/1480 train_time:173717ms step_avg:157.35ms step:1115/1480 train_time:173888ms step_avg:157.36ms step:1116/1480 train_time:174054ms step_avg:157.37ms step:1117/1480 train_time:174226ms step_avg:157.39ms step:1118/1480 train_time:174401ms step_avg:157.40ms step:1119/1480 train_time:174569ms step_avg:157.41ms step:1120/1480 train_time:174737ms step_avg:157.42ms step:1121/1480 train_time:174908ms step_avg:157.43ms step:1122/1480 train_time:175075ms step_avg:157.44ms step:1123/1480 train_time:175240ms step_avg:157.45ms step:1124/1480 train_time:175409ms step_avg:157.46ms step:1125/1480 train_time:175577ms step_avg:157.47ms step:1125/1480 val_loss:3.3826 train_time:175646ms step_avg:157.53ms step:1126/1480 train_time:175747ms step_avg:157.48ms step:1127/1480 train_time:175919ms step_avg:157.49ms step:1128/1480 train_time:176091ms step_avg:157.51ms step:1129/1480 train_time:176265ms step_avg:157.52ms step:1130/1480 train_time:176433ms step_avg:157.53ms step:1131/1480 train_time:176611ms step_avg:157.55ms step:1132/1480 train_time:176776ms step_avg:157.55ms step:1133/1480 train_time:176946ms step_avg:157.57ms step:1134/1480 train_time:177116ms step_avg:157.58ms step:1135/1480 train_time:177283ms step_avg:157.59ms step:1136/1480 train_time:177452ms step_avg:157.60ms step:1137/1480 train_time:177622ms step_avg:157.61ms step:1138/1480 train_time:177792ms step_avg:157.62ms step:1139/1480 train_time:177961ms step_avg:157.63ms step:1140/1480 
train_time:178128ms step_avg:157.64ms step:1141/1480 train_time:178302ms step_avg:157.65ms step:1142/1480 train_time:178469ms step_avg:157.66ms step:1143/1480 train_time:178639ms step_avg:157.67ms step:1144/1480 train_time:178809ms step_avg:157.68ms step:1145/1480 train_time:178974ms step_avg:157.69ms step:1146/1480 train_time:179144ms step_avg:157.70ms step:1147/1480 train_time:179313ms step_avg:157.71ms step:1148/1480 train_time:179481ms step_avg:157.72ms step:1149/1480 train_time:179650ms step_avg:157.73ms step:1150/1480 train_time:179821ms step_avg:157.74ms step:1151/1480 train_time:179991ms step_avg:157.75ms step:1152/1480 train_time:180164ms step_avg:157.76ms step:1153/1480 train_time:180337ms step_avg:157.78ms step:1154/1480 train_time:180505ms step_avg:157.78ms step:1155/1480 train_time:180676ms step_avg:157.80ms step:1156/1480 train_time:180857ms step_avg:157.82ms step:1157/1480 train_time:181026ms step_avg:157.83ms step:1158/1480 train_time:181192ms step_avg:157.83ms step:1159/1480 train_time:181360ms step_avg:157.84ms step:1160/1480 train_time:181526ms step_avg:157.85ms step:1161/1480 train_time:181696ms step_avg:157.86ms step:1162/1480 train_time:181865ms step_avg:157.87ms step:1163/1480 train_time:182035ms step_avg:157.88ms step:1164/1480 train_time:182204ms step_avg:157.89ms step:1165/1480 train_time:182369ms step_avg:157.89ms step:1166/1480 train_time:182539ms step_avg:157.91ms step:1167/1480 train_time:182708ms step_avg:157.92ms step:1168/1480 train_time:182877ms step_avg:157.93ms step:1169/1480 train_time:183045ms step_avg:157.93ms step:1170/1480 train_time:183213ms step_avg:157.94ms step:1171/1480 train_time:183381ms step_avg:157.95ms step:1172/1480 train_time:183549ms step_avg:157.96ms step:1173/1480 train_time:183722ms step_avg:157.97ms step:1174/1480 train_time:183903ms step_avg:157.99ms step:1175/1480 train_time:184073ms step_avg:158.00ms step:1176/1480 train_time:184243ms step_avg:158.01ms step:1177/1480 train_time:184422ms step_avg:158.03ms 
step:1178/1480 train_time:184588ms step_avg:158.04ms step:1179/1480 train_time:184754ms step_avg:158.04ms step:1180/1480 train_time:184934ms step_avg:158.06ms step:1181/1480 train_time:185104ms step_avg:158.07ms step:1182/1480 train_time:185272ms step_avg:158.08ms step:1183/1480 train_time:185444ms step_avg:158.09ms step:1184/1480 train_time:185612ms step_avg:158.10ms step:1185/1480 train_time:185785ms step_avg:158.12ms step:1186/1480 train_time:185956ms step_avg:158.13ms step:1187/1480 train_time:186141ms step_avg:158.15ms step:1188/1480 train_time:186307ms step_avg:158.16ms step:1189/1480 train_time:186481ms step_avg:158.17ms step:1190/1480 train_time:186649ms step_avg:158.18ms step:1191/1480 train_time:186823ms step_avg:158.19ms step:1192/1480 train_time:186988ms step_avg:158.20ms step:1193/1480 train_time:187153ms step_avg:158.20ms step:1194/1480 train_time:187323ms step_avg:158.21ms step:1195/1480 train_time:187495ms step_avg:158.22ms step:1196/1480 train_time:187678ms step_avg:158.24ms step:1197/1480 train_time:187848ms step_avg:158.25ms step:1198/1480 train_time:188029ms step_avg:158.27ms step:1199/1480 train_time:188199ms step_avg:158.28ms step:1200/1480 train_time:188367ms step_avg:158.29ms step:1201/1480 train_time:188536ms step_avg:158.30ms step:1202/1480 train_time:188716ms step_avg:158.32ms step:1203/1480 train_time:188891ms step_avg:158.33ms step:1204/1480 train_time:189065ms step_avg:158.35ms step:1205/1480 train_time:189232ms step_avg:158.35ms step:1206/1480 train_time:189401ms step_avg:158.36ms step:1207/1480 train_time:189569ms step_avg:158.37ms step:1208/1480 train_time:189737ms step_avg:158.38ms step:1209/1480 train_time:189911ms step_avg:158.39ms step:1210/1480 train_time:190086ms step_avg:158.41ms step:1211/1480 train_time:190260ms step_avg:158.42ms step:1212/1480 train_time:190432ms step_avg:158.43ms step:1213/1480 train_time:190605ms step_avg:158.44ms step:1214/1480 train_time:190783ms step_avg:158.46ms step:1215/1480 train_time:190956ms 
step_avg:158.47ms step:1216/1480 train_time:191125ms step_avg:158.48ms step:1217/1480 train_time:191297ms step_avg:158.49ms step:1218/1480 train_time:191467ms step_avg:158.50ms step:1219/1480 train_time:191647ms step_avg:158.52ms step:1220/1480 train_time:191816ms step_avg:158.53ms step:1221/1480 train_time:191985ms step_avg:158.53ms step:1222/1480 train_time:192152ms step_avg:158.54ms step:1223/1480 train_time:192324ms step_avg:158.55ms step:1224/1480 train_time:192501ms step_avg:158.57ms step:1225/1480 train_time:192673ms step_avg:158.58ms step:1226/1480 train_time:192847ms step_avg:158.59ms step:1227/1480 train_time:193020ms step_avg:158.60ms step:1228/1480 train_time:193189ms step_avg:158.61ms step:1229/1480 train_time:193363ms step_avg:158.62ms step:1230/1480 train_time:193543ms step_avg:158.64ms step:1231/1480 train_time:193719ms step_avg:158.66ms step:1232/1480 train_time:193893ms step_avg:158.67ms step:1233/1480 train_time:194063ms step_avg:158.68ms step:1234/1480 train_time:194232ms step_avg:158.69ms step:1235/1480 train_time:194407ms step_avg:158.70ms step:1236/1480 train_time:194574ms step_avg:158.71ms step:1237/1480 train_time:194746ms step_avg:158.72ms step:1238/1480 train_time:194931ms step_avg:158.74ms step:1239/1480 train_time:195104ms step_avg:158.75ms step:1240/1480 train_time:195275ms step_avg:158.76ms step:1241/1480 train_time:195446ms step_avg:158.77ms step:1242/1480 train_time:195616ms step_avg:158.78ms step:1243/1480 train_time:195790ms step_avg:158.79ms step:1244/1480 train_time:195957ms step_avg:158.80ms step:1245/1480 train_time:196127ms step_avg:158.81ms step:1246/1480 train_time:196297ms step_avg:158.82ms step:1247/1480 train_time:196466ms step_avg:158.82ms step:1248/1480 train_time:196635ms step_avg:158.83ms step:1249/1480 train_time:196803ms step_avg:158.84ms step:1250/1480 train_time:196971ms step_avg:158.85ms step:1250/1480 val_loss:3.3326 train_time:197042ms step_avg:158.91ms step:1251/1480 train_time:197152ms step_avg:158.87ms 
step:1252/1480 train_time:197322ms step_avg:158.87ms step:1253/1480 train_time:197489ms step_avg:158.88ms step:1254/1480 train_time:197659ms step_avg:158.89ms step:1255/1480 train_time:197847ms step_avg:158.91ms step:1256/1480 train_time:198020ms step_avg:158.92ms step:1257/1480 train_time:198189ms step_avg:158.93ms step:1258/1480 train_time:198365ms step_avg:158.95ms step:1259/1480 train_time:198537ms step_avg:158.96ms step:1260/1480 train_time:198705ms step_avg:158.96ms step:1261/1480 train_time:198876ms step_avg:158.97ms step:1262/1480 train_time:199052ms step_avg:158.99ms step:1263/1480 train_time:199227ms step_avg:159.00ms step:1264/1480 train_time:199393ms step_avg:159.01ms step:1265/1480 train_time:199559ms step_avg:159.01ms step:1266/1480 train_time:199732ms step_avg:159.02ms step:1267/1480 train_time:199902ms step_avg:159.03ms step:1268/1480 train_time:200072ms step_avg:159.04ms step:1269/1480 train_time:200249ms step_avg:159.05ms step:1270/1480 train_time:200417ms step_avg:159.06ms step:1271/1480 train_time:200587ms step_avg:159.07ms step:1272/1480 train_time:200753ms step_avg:159.08ms step:1273/1480 train_time:200925ms step_avg:159.09ms step:1274/1480 train_time:201096ms step_avg:159.10ms step:1275/1480 train_time:201264ms step_avg:159.10ms step:1276/1480 train_time:201429ms step_avg:159.11ms step:1277/1480 train_time:201601ms step_avg:159.12ms step:1278/1480 train_time:201770ms step_avg:159.12ms step:1279/1480 train_time:201942ms step_avg:159.14ms step:1280/1480 train_time:202121ms step_avg:159.15ms step:1281/1480 train_time:202290ms step_avg:159.16ms step:1282/1480 train_time:202456ms step_avg:159.16ms step:1283/1480 train_time:202628ms step_avg:159.17ms step:1284/1480 train_time:202799ms step_avg:159.18ms step:1285/1480 train_time:202969ms step_avg:159.19ms step:1286/1480 train_time:203141ms step_avg:159.20ms step:1287/1480 train_time:203312ms step_avg:159.21ms step:1288/1480 train_time:203485ms step_avg:159.22ms step:1289/1480 train_time:203667ms 
step_avg:159.24ms step:1290/1480 train_time:203848ms step_avg:159.26ms step:1291/1480 train_time:204022ms step_avg:159.27ms step:1292/1480 train_time:204195ms step_avg:159.28ms step:1293/1480 train_time:204372ms step_avg:159.29ms step:1294/1480 train_time:204546ms step_avg:159.30ms step:1295/1480 train_time:204719ms step_avg:159.31ms step:1296/1480 train_time:204894ms step_avg:159.33ms step:1297/1480 train_time:205065ms step_avg:159.34ms step:1298/1480 train_time:205237ms step_avg:159.35ms step:1299/1480 train_time:205409ms step_avg:159.36ms step:1300/1480 train_time:205575ms step_avg:159.36ms step:1301/1480 train_time:205745ms step_avg:159.37ms step:1302/1480 train_time:205918ms step_avg:159.38ms step:1303/1480 train_time:206094ms step_avg:159.39ms step:1304/1480 train_time:206268ms step_avg:159.40ms step:1305/1480 train_time:206436ms step_avg:159.41ms step:1306/1480 train_time:206610ms step_avg:159.42ms step:1307/1480 train_time:206779ms step_avg:159.43ms step:1308/1480 train_time:206949ms step_avg:159.44ms step:1309/1480 train_time:207121ms step_avg:159.45ms step:1310/1480 train_time:207289ms step_avg:159.45ms step:1311/1480 train_time:207458ms step_avg:159.46ms step:1312/1480 train_time:207631ms step_avg:159.47ms step:1313/1480 train_time:207800ms step_avg:159.48ms step:1314/1480 train_time:207972ms step_avg:159.49ms step:1315/1480 train_time:208143ms step_avg:159.50ms step:1316/1480 train_time:208310ms step_avg:159.50ms step:1317/1480 train_time:208481ms step_avg:159.51ms step:1318/1480 train_time:208659ms step_avg:159.53ms step:1319/1480 train_time:208834ms step_avg:159.54ms step:1320/1480 train_time:209011ms step_avg:159.55ms step:1321/1480 train_time:209184ms step_avg:159.56ms step:1322/1480 train_time:209364ms step_avg:159.58ms step:1323/1480 train_time:209537ms step_avg:159.59ms step:1324/1480 train_time:209712ms step_avg:159.60ms step:1325/1480 train_time:209895ms step_avg:159.62ms step:1326/1480 train_time:210070ms step_avg:159.63ms step:1327/1480 
train_time:210240ms step_avg:159.64ms step:1328/1480 train_time:210409ms step_avg:159.64ms step:1329/1480 train_time:210606ms step_avg:159.67ms step:1330/1480 train_time:210786ms step_avg:159.69ms step:1331/1480 train_time:210957ms step_avg:159.69ms step:1332/1480 train_time:211133ms step_avg:159.71ms step:1333/1480 train_time:211308ms step_avg:159.72ms step:1334/1480 train_time:211479ms step_avg:159.73ms step:1335/1480 train_time:211649ms step_avg:159.73ms step:1336/1480 train_time:211831ms step_avg:159.75ms step:1337/1480 train_time:212007ms step_avg:159.76ms step:1338/1480 train_time:212179ms step_avg:159.77ms step:1339/1480 train_time:212352ms step_avg:159.78ms step:1340/1480 train_time:212525ms step_avg:159.79ms step:1341/1480 train_time:212692ms step_avg:159.80ms step:1342/1480 train_time:212866ms step_avg:159.81ms step:1343/1480 train_time:213036ms step_avg:159.82ms step:1344/1480 train_time:213209ms step_avg:159.83ms step:1345/1480 train_time:213388ms step_avg:159.84ms step:1346/1480 train_time:213557ms step_avg:159.85ms step:1347/1480 train_time:213728ms step_avg:159.86ms step:1348/1480 train_time:213897ms step_avg:159.86ms step:1349/1480 train_time:214066ms step_avg:159.87ms step:1350/1480 train_time:214243ms step_avg:159.88ms step:1351/1480 train_time:214412ms step_avg:159.89ms step:1352/1480 train_time:214583ms step_avg:159.90ms step:1353/1480 train_time:214759ms step_avg:159.91ms step:1354/1480 train_time:214931ms step_avg:159.92ms step:1355/1480 train_time:215100ms step_avg:159.93ms step:1356/1480 train_time:215272ms step_avg:159.93ms step:1357/1480 train_time:215446ms step_avg:159.95ms step:1358/1480 train_time:215619ms step_avg:159.95ms step:1359/1480 train_time:215791ms step_avg:159.96ms step:1360/1480 train_time:215966ms step_avg:159.98ms step:1361/1480 train_time:216145ms step_avg:159.99ms step:1362/1480 train_time:216321ms step_avg:160.00ms step:1363/1480 train_time:216502ms step_avg:160.02ms step:1364/1480 train_time:216669ms step_avg:160.02ms 
step:1365/1480 train_time:216838ms step_avg:160.03ms step:1366/1480 train_time:217011ms step_avg:160.04ms step:1367/1480 train_time:217183ms step_avg:160.05ms step:1368/1480 train_time:217356ms step_avg:160.06ms step:1369/1480 train_time:217538ms step_avg:160.07ms step:1370/1480 train_time:217715ms step_avg:160.08ms step:1371/1480 train_time:217887ms step_avg:160.09ms step:1372/1480 train_time:218066ms step_avg:160.11ms step:1373/1480 train_time:218234ms step_avg:160.11ms step:1374/1480 train_time:218412ms step_avg:160.13ms step:1375/1480 train_time:218583ms step_avg:160.13ms step:1375/1480 val_loss:3.2937 train_time:218651ms step_avg:160.18ms step:1376/1480 train_time:218759ms step_avg:160.15ms step:1377/1480 train_time:218931ms step_avg:160.15ms step:1378/1480 train_time:219100ms step_avg:160.16ms step:1379/1480 train_time:219273ms step_avg:160.17ms step:1380/1480 train_time:219447ms step_avg:160.18ms step:1381/1480 train_time:219627ms step_avg:160.19ms step:1382/1480 train_time:219798ms step_avg:160.20ms step:1383/1480 train_time:219969ms step_avg:160.21ms step:1384/1480 train_time:220147ms step_avg:160.22ms step:1385/1480 train_time:220313ms step_avg:160.23ms step:1386/1480 train_time:220484ms step_avg:160.24ms step:1387/1480 train_time:220658ms step_avg:160.25ms step:1388/1480 train_time:220827ms step_avg:160.25ms step:1389/1480 train_time:221001ms step_avg:160.26ms step:1390/1480 train_time:221168ms step_avg:160.27ms step:1391/1480 train_time:221339ms step_avg:160.27ms step:1392/1480 train_time:221512ms step_avg:160.28ms step:1393/1480 train_time:221683ms step_avg:160.29ms step:1394/1480 train_time:221854ms step_avg:160.30ms step:1395/1480 train_time:222022ms step_avg:160.30ms step:1396/1480 train_time:222190ms step_avg:160.31ms step:1397/1480 train_time:222359ms step_avg:160.32ms step:1398/1480 train_time:222526ms step_avg:160.32ms step:1399/1480 train_time:222697ms step_avg:160.33ms step:1400/1480 train_time:222874ms step_avg:160.34ms step:1401/1480 
train_time:223040ms step_avg:160.34ms step:1402/1480 train_time:223210ms step_avg:160.35ms step:1403/1480 train_time:223387ms step_avg:160.36ms step:1404/1480 train_time:223558ms step_avg:160.37ms step:1405/1480 train_time:223729ms step_avg:160.38ms step:1406/1480 train_time:223903ms step_avg:160.39ms step:1407/1480 train_time:224073ms step_avg:160.40ms step:1408/1480 train_time:224243ms step_avg:160.40ms step:1409/1480 train_time:224426ms step_avg:160.42ms step:1410/1480 train_time:224595ms step_avg:160.43ms step:1411/1480 train_time:224764ms step_avg:160.43ms step:1412/1480 train_time:224934ms step_avg:160.44ms step:1413/1480 train_time:225104ms step_avg:160.44ms step:1414/1480 train_time:225277ms step_avg:160.45ms step:1415/1480 train_time:225451ms step_avg:160.46ms step:1416/1480 train_time:225637ms step_avg:160.48ms step:1417/1480 train_time:225810ms step_avg:160.49ms step:1418/1480 train_time:225981ms step_avg:160.50ms step:1419/1480 train_time:226157ms step_avg:160.51ms step:1420/1480 train_time:226332ms step_avg:160.52ms step:1421/1480 train_time:226505ms step_avg:160.53ms step:1422/1480 train_time:226679ms step_avg:160.54ms step:1423/1480 train_time:226848ms step_avg:160.54ms step:1424/1480 train_time:227025ms step_avg:160.56ms step:1425/1480 train_time:227205ms step_avg:160.57ms step:1426/1480 train_time:227375ms step_avg:160.58ms step:1427/1480 train_time:227550ms step_avg:160.59ms step:1428/1480 train_time:227722ms step_avg:160.59ms step:1429/1480 train_time:227892ms step_avg:160.60ms step:1430/1480 train_time:228065ms step_avg:160.61ms step:1431/1480 train_time:228239ms step_avg:160.62ms step:1432/1480 train_time:228415ms step_avg:160.63ms step:1433/1480 train_time:228592ms step_avg:160.64ms step:1434/1480 train_time:228772ms step_avg:160.65ms step:1435/1480 train_time:228948ms step_avg:160.66ms step:1436/1480 train_time:229122ms step_avg:160.67ms step:1437/1480 train_time:229291ms step_avg:160.68ms step:1438/1480 train_time:229460ms step_avg:160.69ms 
step:1439/1480 train_time:229635ms step_avg:160.70ms step:1440/1480 train_time:229803ms step_avg:160.70ms step:1441/1480 train_time:229975ms step_avg:160.71ms step:1442/1480 train_time:230151ms step_avg:160.72ms step:1443/1480 train_time:230343ms step_avg:160.74ms step:1444/1480 train_time:230515ms step_avg:160.75ms step:1445/1480 train_time:230688ms step_avg:160.76ms step:1446/1480 train_time:230864ms step_avg:160.77ms step:1447/1480 train_time:231041ms step_avg:160.78ms step:1448/1480 train_time:231211ms step_avg:160.79ms step:1449/1480 train_time:231385ms step_avg:160.80ms step:1450/1480 train_time:231560ms step_avg:160.81ms step:1451/1480 train_time:231728ms step_avg:160.81ms step:1452/1480 train_time:231902ms step_avg:160.82ms step:1453/1480 train_time:232071ms step_avg:160.83ms step:1454/1480 train_time:232244ms step_avg:160.83ms step:1455/1480 train_time:232423ms step_avg:160.85ms step:1456/1480 train_time:232596ms step_avg:160.85ms step:1457/1480 train_time:232768ms step_avg:160.86ms step:1458/1480 train_time:232940ms step_avg:160.87ms step:1459/1480 train_time:233115ms step_avg:160.88ms step:1460/1480 train_time:233285ms step_avg:160.89ms step:1461/1480 train_time:233460ms step_avg:160.90ms step:1462/1480 train_time:233629ms step_avg:160.90ms step:1463/1480 train_time:233806ms step_avg:160.91ms step:1464/1480 train_time:233982ms step_avg:160.92ms step:1465/1480 train_time:234155ms step_avg:160.93ms step:1466/1480 train_time:234325ms step_avg:160.94ms step:1467/1480 train_time:234499ms step_avg:160.95ms step:1468/1480 train_time:234667ms step_avg:160.95ms step:1469/1480 train_time:234841ms step_avg:160.96ms step:1470/1480 train_time:235021ms step_avg:160.97ms step:1471/1480 train_time:235205ms step_avg:160.99ms step:1472/1480 train_time:235384ms step_avg:161.00ms step:1473/1480 train_time:235556ms step_avg:161.01ms step:1474/1480 train_time:235733ms step_avg:161.02ms step:1475/1480 train_time:235912ms step_avg:161.03ms step:1476/1480 train_time:236084ms 
step_avg:161.04ms step:1477/1480 train_time:236266ms step_avg:161.05ms step:1478/1480 train_time:236447ms step_avg:161.07ms step:1479/1480 train_time:236622ms step_avg:161.08ms step:1480/1480 train_time:236793ms step_avg:161.08ms step:1480/1480 val_loss:3.2745 train_time:236864ms step_avg:161.13ms