import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding applied along dim 1 (sequence) of a 4D input."""

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of features: base ** -(2i / dim)
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        # NOTE(review): the cos/sin cache is keyed on sequence length only; a change of
        # device with an unchanged length would reuse the old tables.
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # broadcast the (seq, dim/2) tables over dims 0 and 2 of x
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK-norm, rotary embeddings and a learned mix of the
    layer's own values with externally supplied value embeddings (`vi`)."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # vi is reshaped to v's (B, T, n_head, head_dim) layout before mixing
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim) tensors, hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: a learned mix of the running stream with `x0`, then
    pre-norm attention and pre-norm MLP, each added residually."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initial mix is 1 * x + 0 * x0, i.e. x0 is ignored at initialization
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) by a scaled tanh

class GPT(nn.Module):
    """GPT model with U-net style skip connections between the first and second half
    of the layers; `forward` returns the cross-entropy loss directly."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """Compute the mean cross-entropy loss of `idx` against `target`.

        `idx` must be 1-D and its length a multiple of 128 (it is reshaped into rows
        of BLOCK_SIZE). `sliding_window` bounds how far back a token may attend.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token id 50256 = a document index per position
        # (presumably GPT-2's end-of-text id — confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        # first / last document index inside each block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: not in the future, same document, inside the window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the rule above: a (query block, kv block) pair is
            # kept when their document ranges overlap and the blocks are close enough.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the indices of kept blocks first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one n_embd-wide value embedding per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        # NOTE(review): for odd n_layer there is one more decoder than encoder layer, so
        # the last pop() hits an empty list (IndexError); only even n_layer works.
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256 * 4-byte header into pinned memory."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        # fills the tensor in place through its numpy view
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Cycles through token shards; each rank reads its own T-token slice of every
    `T * num_processes`-token stride."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on CUDA: T tokens and the same tokens shifted by one."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # the per-run directory above is not created, so make sure at least the
    # parent of the log file exists before opening it (fresh checkouts have no logs/)
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """Append `s` to the log file on the master process; also print it unless `logonly`.

    Does nothing on every other rank.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not implemented: one sequence per rank per step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# the CastedLinear weights are put back in float32 after the bf16 cast
# (they are cast to the activation dtype on every forward)
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# embeddings, lm_head and scalar parameters use Adam; the 2D matrices of the blocks use Muon
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the multiplier applied to each optimizer's base lr at iteration `it`."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    # NOTE(review): with cooldown_iters == 0 this branch is reached at
    # it == num_iterations and divides by zero.
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size is kept in a CUDA tensor and updated in place during training
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64

# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    # divisor for the step_avg figures; NaN until the clock restart above has taken effect
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # (grows linearly with `step` from 64 towards 1792, rounded down to a multiple of 64)
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average across ranks, then across validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # (the torch.save call below is commented out, so `log` is built but never written)
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # (linear from 0.85 to 0.95 over the first 300 steps)
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # no synchronize here, so this is the host-side elapsed time only
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 08:43:58 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 98W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 123W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 40C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 93W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI 
CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23370ms step_avg:nanms step:2/1480 train_time:23456ms step_avg:nanms step:3/1480 train_time:23595ms step_avg:nanms step:4/1480 train_time:23737ms step_avg:nanms step:5/1480 train_time:23880ms step_avg:nanms step:6/1480 train_time:24022ms step_avg:nanms step:7/1480 train_time:24163ms step_avg:nanms step:8/1480 train_time:24308ms step_avg:nanms step:9/1480 train_time:24452ms step_avg:nanms step:10/1480 train_time:24595ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:286ms step_avg:nanms step:13/1480 train_time:428ms step_avg:142.55ms step:14/1480 train_time:569ms step_avg:142.15ms step:15/1480 train_time:711ms step_avg:142.19ms step:16/1480 train_time:854ms step_avg:142.41ms step:17/1480 train_time:999ms step_avg:142.65ms step:18/1480 train_time:1140ms step_avg:142.49ms step:19/1480 train_time:1284ms step_avg:142.66ms step:20/1480 train_time:1428ms step_avg:142.76ms step:21/1480 train_time:1569ms step_avg:142.66ms step:22/1480 train_time:1712ms step_avg:142.64ms step:23/1480 train_time:1854ms step_avg:142.65ms step:24/1480 train_time:1997ms step_avg:142.64ms step:25/1480 train_time:2139ms step_avg:142.58ms step:26/1480 train_time:2281ms step_avg:142.54ms step:27/1480 train_time:2424ms step_avg:142.60ms step:28/1480 train_time:2567ms step_avg:142.60ms step:29/1480 train_time:2710ms 
step_avg:142.64ms step:30/1480 train_time:2853ms step_avg:142.63ms step:31/1480 train_time:2994ms step_avg:142.59ms step:32/1480 train_time:3135ms step_avg:142.52ms step:33/1480 train_time:3277ms step_avg:142.49ms step:34/1480 train_time:3422ms step_avg:142.56ms step:35/1480 train_time:3566ms step_avg:142.65ms step:36/1480 train_time:3709ms step_avg:142.64ms step:37/1480 train_time:3852ms step_avg:142.66ms step:38/1480 train_time:3993ms step_avg:142.61ms step:39/1480 train_time:4136ms step_avg:142.61ms step:40/1480 train_time:4278ms step_avg:142.61ms step:41/1480 train_time:4422ms step_avg:142.64ms step:42/1480 train_time:4566ms step_avg:142.68ms step:43/1480 train_time:4709ms step_avg:142.71ms step:44/1480 train_time:4852ms step_avg:142.69ms step:45/1480 train_time:4995ms step_avg:142.71ms step:46/1480 train_time:5135ms step_avg:142.64ms step:47/1480 train_time:5279ms step_avg:142.67ms step:48/1480 train_time:5424ms step_avg:142.72ms step:49/1480 train_time:5567ms step_avg:142.75ms step:50/1480 train_time:5710ms step_avg:142.76ms step:51/1480 train_time:5852ms step_avg:142.74ms step:52/1480 train_time:5996ms step_avg:142.75ms step:53/1480 train_time:6137ms step_avg:142.73ms step:54/1480 train_time:6280ms step_avg:142.73ms step:55/1480 train_time:6424ms step_avg:142.75ms step:56/1480 train_time:6568ms step_avg:142.78ms step:57/1480 train_time:6710ms step_avg:142.77ms step:58/1480 train_time:6852ms step_avg:142.76ms step:59/1480 train_time:6994ms step_avg:142.74ms step:60/1480 train_time:7135ms step_avg:142.71ms step:61/1480 train_time:7280ms step_avg:142.75ms step:62/1480 train_time:7424ms step_avg:142.77ms step:63/1480 train_time:7567ms step_avg:142.77ms step:64/1480 train_time:7709ms step_avg:142.76ms step:65/1480 train_time:7851ms step_avg:142.75ms step:66/1480 train_time:7994ms step_avg:142.75ms step:67/1480 train_time:8136ms step_avg:142.73ms step:68/1480 train_time:8279ms step_avg:142.75ms step:69/1480 train_time:8423ms step_avg:142.77ms step:70/1480 
train_time:8568ms step_avg:142.80ms step:71/1480 train_time:8710ms step_avg:142.79ms step:72/1480 train_time:8853ms step_avg:142.80ms step:73/1480 train_time:8996ms step_avg:142.79ms step:74/1480 train_time:9137ms step_avg:142.76ms step:75/1480 train_time:9279ms step_avg:142.75ms step:76/1480 train_time:9423ms step_avg:142.78ms step:77/1480 train_time:9569ms step_avg:142.82ms step:78/1480 train_time:9712ms step_avg:142.83ms step:79/1480 train_time:9855ms step_avg:142.83ms step:80/1480 train_time:9998ms step_avg:142.83ms step:81/1480 train_time:10140ms step_avg:142.82ms step:82/1480 train_time:10283ms step_avg:142.81ms step:83/1480 train_time:10427ms step_avg:142.83ms step:84/1480 train_time:10570ms step_avg:142.83ms step:85/1480 train_time:10712ms step_avg:142.82ms step:86/1480 train_time:10854ms step_avg:142.82ms step:87/1480 train_time:10997ms step_avg:142.82ms step:88/1480 train_time:11141ms step_avg:142.83ms step:89/1480 train_time:11284ms step_avg:142.83ms step:90/1480 train_time:11427ms step_avg:142.84ms step:91/1480 train_time:11569ms step_avg:142.83ms step:92/1480 train_time:11712ms step_avg:142.83ms step:93/1480 train_time:11854ms step_avg:142.81ms step:94/1480 train_time:11995ms step_avg:142.79ms step:95/1480 train_time:12135ms step_avg:142.77ms step:96/1480 train_time:12277ms step_avg:142.75ms step:97/1480 train_time:12420ms step_avg:142.75ms step:98/1480 train_time:12563ms step_avg:142.76ms step:99/1480 train_time:12705ms step_avg:142.76ms step:100/1480 train_time:12849ms step_avg:142.76ms step:101/1480 train_time:12990ms step_avg:142.75ms step:102/1480 train_time:13131ms step_avg:142.73ms step:103/1480 train_time:13272ms step_avg:142.71ms step:104/1480 train_time:13414ms step_avg:142.71ms step:105/1480 train_time:13556ms step_avg:142.70ms step:106/1480 train_time:13698ms step_avg:142.69ms step:107/1480 train_time:13839ms step_avg:142.67ms step:108/1480 train_time:13983ms step_avg:142.69ms step:109/1480 train_time:14127ms step_avg:142.69ms step:110/1480 
train_time:14270ms step_avg:142.70ms step:111/1480 train_time:14415ms step_avg:142.72ms step:112/1480 train_time:14561ms step_avg:142.75ms step:113/1480 train_time:14708ms step_avg:142.80ms step:114/1480 train_time:14855ms step_avg:142.83ms step:115/1480 train_time:15001ms step_avg:142.86ms step:116/1480 train_time:15148ms step_avg:142.91ms step:117/1480 train_time:15295ms step_avg:142.94ms step:118/1480 train_time:15441ms step_avg:142.97ms step:119/1480 train_time:15589ms step_avg:143.02ms step:120/1480 train_time:15734ms step_avg:143.04ms step:121/1480 train_time:15880ms step_avg:143.06ms step:122/1480 train_time:16029ms step_avg:143.11ms step:123/1480 train_time:16174ms step_avg:143.13ms step:124/1480 train_time:16321ms step_avg:143.16ms step:125/1480 train_time:16468ms step_avg:143.20ms step:125/1480 val_loss:4.4117 train_time:16525ms step_avg:143.69ms step:126/1480 train_time:16620ms step_avg:143.28ms step:127/1480 train_time:16770ms step_avg:143.33ms step:128/1480 train_time:16917ms step_avg:143.36ms step:129/1480 train_time:17063ms step_avg:143.39ms step:130/1480 train_time:17208ms step_avg:143.40ms step:131/1480 train_time:17354ms step_avg:143.42ms step:132/1480 train_time:17502ms step_avg:143.46ms step:133/1480 train_time:17652ms step_avg:143.51ms step:134/1480 train_time:17800ms step_avg:143.55ms step:135/1480 train_time:17948ms step_avg:143.58ms step:136/1480 train_time:18094ms step_avg:143.60ms step:137/1480 train_time:18242ms step_avg:143.64ms step:138/1480 train_time:18388ms step_avg:143.65ms step:139/1480 train_time:18534ms step_avg:143.68ms step:140/1480 train_time:18682ms step_avg:143.71ms step:141/1480 train_time:18829ms step_avg:143.73ms step:142/1480 train_time:18975ms step_avg:143.75ms step:143/1480 train_time:19123ms step_avg:143.78ms step:144/1480 train_time:19268ms step_avg:143.79ms step:145/1480 train_time:19416ms step_avg:143.83ms step:146/1480 train_time:19564ms step_avg:143.85ms step:147/1480 train_time:19710ms step_avg:143.87ms 
step:148/1480 train_time:19859ms step_avg:143.90ms step:149/1480 train_time:20006ms step_avg:143.93ms step:150/1480 train_time:20153ms step_avg:143.95ms step:151/1480 train_time:20300ms step_avg:143.97ms step:152/1480 train_time:20447ms step_avg:144.00ms step:153/1480 train_time:20595ms step_avg:144.02ms step:154/1480 train_time:20743ms step_avg:144.05ms step:155/1480 train_time:20889ms step_avg:144.06ms step:156/1480 train_time:21036ms step_avg:144.09ms step:157/1480 train_time:21184ms step_avg:144.11ms step:158/1480 train_time:21330ms step_avg:144.12ms step:159/1480 train_time:21476ms step_avg:144.13ms step:160/1480 train_time:21624ms step_avg:144.16ms step:161/1480 train_time:21770ms step_avg:144.17ms step:162/1480 train_time:21917ms step_avg:144.19ms step:163/1480 train_time:22063ms step_avg:144.21ms step:164/1480 train_time:22209ms step_avg:144.21ms step:165/1480 train_time:22356ms step_avg:144.23ms step:166/1480 train_time:22503ms step_avg:144.25ms step:167/1480 train_time:22649ms step_avg:144.26ms step:168/1480 train_time:22796ms step_avg:144.28ms step:169/1480 train_time:22944ms step_avg:144.30ms step:170/1480 train_time:23091ms step_avg:144.32ms step:171/1480 train_time:23238ms step_avg:144.34ms step:172/1480 train_time:23385ms step_avg:144.35ms step:173/1480 train_time:23531ms step_avg:144.36ms step:174/1480 train_time:23679ms step_avg:144.39ms step:175/1480 train_time:23827ms step_avg:144.41ms step:176/1480 train_time:23973ms step_avg:144.41ms step:177/1480 train_time:24121ms step_avg:144.44ms step:178/1480 train_time:24267ms step_avg:144.45ms step:179/1480 train_time:24414ms step_avg:144.46ms step:180/1480 train_time:24563ms step_avg:144.49ms step:181/1480 train_time:24709ms step_avg:144.50ms step:182/1480 train_time:24858ms step_avg:144.52ms step:183/1480 train_time:25005ms step_avg:144.54ms step:184/1480 train_time:25151ms step_avg:144.55ms step:185/1480 train_time:25298ms step_avg:144.56ms step:186/1480 train_time:25445ms step_avg:144.57ms 
step:187/1480 train_time:25591ms step_avg:144.58ms step:188/1480 train_time:25739ms step_avg:144.60ms step:189/1480 train_time:25886ms step_avg:144.62ms step:190/1480 train_time:26034ms step_avg:144.63ms step:191/1480 train_time:26181ms step_avg:144.65ms step:192/1480 train_time:26328ms step_avg:144.66ms step:193/1480 train_time:26473ms step_avg:144.66ms step:194/1480 train_time:26621ms step_avg:144.68ms step:195/1480 train_time:26768ms step_avg:144.69ms step:196/1480 train_time:26915ms step_avg:144.70ms step:197/1480 train_time:27063ms step_avg:144.72ms step:198/1480 train_time:27209ms step_avg:144.73ms step:199/1480 train_time:27357ms step_avg:144.75ms step:200/1480 train_time:27504ms step_avg:144.76ms step:201/1480 train_time:27650ms step_avg:144.77ms step:202/1480 train_time:27798ms step_avg:144.78ms step:203/1480 train_time:27945ms step_avg:144.79ms step:204/1480 train_time:28090ms step_avg:144.79ms step:205/1480 train_time:28236ms step_avg:144.80ms step:206/1480 train_time:28383ms step_avg:144.81ms step:207/1480 train_time:28529ms step_avg:144.82ms step:208/1480 train_time:28674ms step_avg:144.82ms step:209/1480 train_time:28822ms step_avg:144.84ms step:210/1480 train_time:28968ms step_avg:144.84ms step:211/1480 train_time:29116ms step_avg:144.85ms step:212/1480 train_time:29263ms step_avg:144.87ms step:213/1480 train_time:29409ms step_avg:144.87ms step:214/1480 train_time:29557ms step_avg:144.89ms step:215/1480 train_time:29704ms step_avg:144.90ms step:216/1480 train_time:29850ms step_avg:144.90ms step:217/1480 train_time:29997ms step_avg:144.91ms step:218/1480 train_time:30144ms step_avg:144.92ms step:219/1480 train_time:30290ms step_avg:144.93ms step:220/1480 train_time:30438ms step_avg:144.94ms step:221/1480 train_time:30586ms step_avg:144.96ms step:222/1480 train_time:30735ms step_avg:144.98ms step:223/1480 train_time:30886ms step_avg:145.01ms step:224/1480 train_time:31037ms step_avg:145.03ms step:225/1480 train_time:31187ms step_avg:145.06ms 
step:226/1480 train_time:31339ms step_avg:145.09ms step:227/1480 train_time:31490ms step_avg:145.11ms step:228/1480 train_time:31641ms step_avg:145.14ms step:229/1480 train_time:31791ms step_avg:145.16ms step:230/1480 train_time:31942ms step_avg:145.19ms step:231/1480 train_time:32091ms step_avg:145.21ms step:232/1480 train_time:32242ms step_avg:145.24ms step:233/1480 train_time:32391ms step_avg:145.25ms step:234/1480 train_time:32542ms step_avg:145.28ms step:235/1480 train_time:32693ms step_avg:145.30ms step:236/1480 train_time:32843ms step_avg:145.32ms step:237/1480 train_time:32993ms step_avg:145.34ms step:238/1480 train_time:33144ms step_avg:145.37ms step:239/1480 train_time:33293ms step_avg:145.38ms step:240/1480 train_time:33444ms step_avg:145.41ms step:241/1480 train_time:33593ms step_avg:145.42ms step:242/1480 train_time:33744ms step_avg:145.45ms step:243/1480 train_time:33893ms step_avg:145.47ms step:244/1480 train_time:34047ms step_avg:145.50ms step:245/1480 train_time:34193ms step_avg:145.50ms step:246/1480 train_time:34344ms step_avg:145.52ms step:247/1480 train_time:34494ms step_avg:145.54ms step:248/1480 train_time:34645ms step_avg:145.57ms step:249/1480 train_time:34795ms step_avg:145.59ms step:250/1480 train_time:34946ms step_avg:145.61ms step:250/1480 val_loss:3.9968 train_time:35004ms step_avg:145.85ms step:251/1480 train_time:35102ms step_avg:145.65ms step:252/1480 train_time:35254ms step_avg:145.68ms step:253/1480 train_time:35404ms step_avg:145.69ms step:254/1480 train_time:35552ms step_avg:145.71ms step:255/1480 train_time:35702ms step_avg:145.72ms step:256/1480 train_time:35850ms step_avg:145.73ms step:257/1480 train_time:36000ms step_avg:145.75ms step:258/1480 train_time:36153ms step_avg:145.78ms step:259/1480 train_time:36305ms step_avg:145.80ms step:260/1480 train_time:36456ms step_avg:145.82ms step:261/1480 train_time:36606ms step_avg:145.84ms step:262/1480 train_time:36757ms step_avg:145.86ms step:263/1480 train_time:36907ms 
step_avg:145.88ms step:264/1480 train_time:37058ms step_avg:145.90ms step:265/1480 train_time:37208ms step_avg:145.92ms step:266/1480 train_time:37360ms step_avg:145.94ms step:267/1480 train_time:37509ms step_avg:145.95ms step:268/1480 train_time:37660ms step_avg:145.97ms step:269/1480 train_time:37810ms step_avg:145.98ms step:270/1480 train_time:37960ms step_avg:146.00ms step:271/1480 train_time:38109ms step_avg:146.01ms step:272/1480 train_time:38260ms step_avg:146.03ms step:273/1480 train_time:38410ms step_avg:146.05ms step:274/1480 train_time:38562ms step_avg:146.07ms step:275/1480 train_time:38710ms step_avg:146.08ms step:276/1480 train_time:38862ms step_avg:146.10ms step:277/1480 train_time:39013ms step_avg:146.11ms step:278/1480 train_time:39164ms step_avg:146.13ms step:279/1480 train_time:39314ms step_avg:146.15ms step:280/1480 train_time:39466ms step_avg:146.17ms step:281/1480 train_time:39614ms step_avg:146.18ms step:282/1480 train_time:39766ms step_avg:146.20ms step:283/1480 train_time:39917ms step_avg:146.22ms step:284/1480 train_time:40068ms step_avg:146.23ms step:285/1480 train_time:40219ms step_avg:146.25ms step:286/1480 train_time:40368ms step_avg:146.26ms step:287/1480 train_time:40519ms step_avg:146.28ms step:288/1480 train_time:40667ms step_avg:146.29ms step:289/1480 train_time:40819ms step_avg:146.31ms step:290/1480 train_time:40969ms step_avg:146.32ms step:291/1480 train_time:41120ms step_avg:146.33ms step:292/1480 train_time:41270ms step_avg:146.35ms step:293/1480 train_time:41421ms step_avg:146.37ms step:294/1480 train_time:41572ms step_avg:146.38ms step:295/1480 train_time:41722ms step_avg:146.39ms step:296/1480 train_time:41873ms step_avg:146.41ms step:297/1480 train_time:42025ms step_avg:146.43ms step:298/1480 train_time:42177ms step_avg:146.45ms step:299/1480 train_time:42327ms step_avg:146.46ms step:300/1480 train_time:42479ms step_avg:146.48ms step:301/1480 train_time:42627ms step_avg:146.49ms step:302/1480 train_time:42781ms 
step_avg:146.51ms step:303/1480 train_time:42930ms step_avg:146.52ms step:304/1480 train_time:43081ms step_avg:146.53ms step:305/1480 train_time:43232ms step_avg:146.55ms step:306/1480 train_time:43383ms step_avg:146.56ms step:307/1480 train_time:43533ms step_avg:146.58ms step:308/1480 train_time:43684ms step_avg:146.59ms step:309/1480 train_time:43834ms step_avg:146.60ms step:310/1480 train_time:43985ms step_avg:146.62ms step:311/1480 train_time:44136ms step_avg:146.63ms step:312/1480 train_time:44286ms step_avg:146.64ms step:313/1480 train_time:44437ms step_avg:146.66ms step:314/1480 train_time:44587ms step_avg:146.67ms step:315/1480 train_time:44738ms step_avg:146.68ms step:316/1480 train_time:44888ms step_avg:146.69ms step:317/1480 train_time:45039ms step_avg:146.71ms step:318/1480 train_time:45189ms step_avg:146.72ms step:319/1480 train_time:45341ms step_avg:146.74ms step:320/1480 train_time:45491ms step_avg:146.75ms step:321/1480 train_time:45642ms step_avg:146.76ms step:322/1480 train_time:45793ms step_avg:146.77ms step:323/1480 train_time:45944ms step_avg:146.79ms step:324/1480 train_time:46094ms step_avg:146.80ms step:325/1480 train_time:46245ms step_avg:146.81ms step:326/1480 train_time:46396ms step_avg:146.82ms step:327/1480 train_time:46546ms step_avg:146.83ms step:328/1480 train_time:46697ms step_avg:146.85ms step:329/1480 train_time:46848ms step_avg:146.86ms step:330/1480 train_time:47001ms step_avg:146.88ms step:331/1480 train_time:47156ms step_avg:146.90ms step:332/1480 train_time:47309ms step_avg:146.92ms step:333/1480 train_time:47463ms step_avg:146.95ms step:334/1480 train_time:47616ms step_avg:146.96ms step:335/1480 train_time:47769ms step_avg:146.98ms step:336/1480 train_time:47922ms step_avg:147.00ms step:337/1480 train_time:48078ms step_avg:147.03ms step:338/1480 train_time:48234ms step_avg:147.05ms step:339/1480 train_time:48387ms step_avg:147.07ms step:340/1480 train_time:48542ms step_avg:147.10ms step:341/1480 train_time:48697ms 
step_avg:147.12ms step:342/1480 train_time:48851ms step_avg:147.14ms step:343/1480 train_time:49005ms step_avg:147.16ms step:344/1480 train_time:49160ms step_avg:147.19ms step:345/1480 train_time:49316ms step_avg:147.21ms step:346/1480 train_time:49471ms step_avg:147.23ms step:347/1480 train_time:49624ms step_avg:147.25ms step:348/1480 train_time:49778ms step_avg:147.27ms step:349/1480 train_time:49931ms step_avg:147.29ms step:350/1480 train_time:50085ms step_avg:147.31ms step:351/1480 train_time:50239ms step_avg:147.33ms step:352/1480 train_time:50393ms step_avg:147.35ms step:353/1480 train_time:50548ms step_avg:147.37ms step:354/1480 train_time:50701ms step_avg:147.39ms step:355/1480 train_time:50856ms step_avg:147.41ms step:356/1480 train_time:51010ms step_avg:147.43ms step:357/1480 train_time:51164ms step_avg:147.45ms step:358/1480 train_time:51318ms step_avg:147.47ms step:359/1480 train_time:51473ms step_avg:147.49ms step:360/1480 train_time:51627ms step_avg:147.51ms step:361/1480 train_time:51782ms step_avg:147.53ms step:362/1480 train_time:51936ms step_avg:147.55ms step:363/1480 train_time:52089ms step_avg:147.56ms step:364/1480 train_time:52243ms step_avg:147.58ms step:365/1480 train_time:52397ms step_avg:147.60ms step:366/1480 train_time:52551ms step_avg:147.61ms step:367/1480 train_time:52704ms step_avg:147.63ms step:368/1480 train_time:52858ms step_avg:147.65ms step:369/1480 train_time:53011ms step_avg:147.66ms step:370/1480 train_time:53164ms step_avg:147.68ms step:371/1480 train_time:53317ms step_avg:147.69ms step:372/1480 train_time:53470ms step_avg:147.71ms step:373/1480 train_time:53622ms step_avg:147.72ms step:374/1480 train_time:53776ms step_avg:147.74ms step:375/1480 train_time:53929ms step_avg:147.75ms step:375/1480 val_loss:3.8079 train_time:53989ms step_avg:147.91ms step:376/1480 train_time:54086ms step_avg:147.78ms step:377/1480 train_time:54241ms step_avg:147.80ms step:378/1480 train_time:54395ms step_avg:147.81ms step:379/1480 
train_time:54547ms step_avg:147.82ms step:380/1480 train_time:54700ms step_avg:147.84ms step:381/1480 train_time:54852ms step_avg:147.85ms step:382/1480 train_time:55006ms step_avg:147.86ms step:383/1480 train_time:55162ms step_avg:147.89ms step:384/1480 train_time:55315ms step_avg:147.90ms step:385/1480 train_time:55470ms step_avg:147.92ms step:386/1480 train_time:55623ms step_avg:147.93ms step:387/1480 train_time:55777ms step_avg:147.95ms step:388/1480 train_time:55929ms step_avg:147.96ms step:389/1480 train_time:56084ms step_avg:147.98ms step:390/1480 train_time:56239ms step_avg:148.00ms step:391/1480 train_time:56393ms step_avg:148.01ms step:392/1480 train_time:56546ms step_avg:148.03ms step:393/1480 train_time:56700ms step_avg:148.04ms step:394/1480 train_time:56853ms step_avg:148.06ms step:395/1480 train_time:57007ms step_avg:148.07ms step:396/1480 train_time:57162ms step_avg:148.09ms step:397/1480 train_time:57317ms step_avg:148.11ms step:398/1480 train_time:57471ms step_avg:148.12ms step:399/1480 train_time:57625ms step_avg:148.14ms step:400/1480 train_time:57780ms step_avg:148.16ms step:401/1480 train_time:57934ms step_avg:148.17ms step:402/1480 train_time:58087ms step_avg:148.18ms step:403/1480 train_time:58242ms step_avg:148.20ms step:404/1480 train_time:58394ms step_avg:148.21ms step:405/1480 train_time:58549ms step_avg:148.23ms step:406/1480 train_time:58703ms step_avg:148.24ms step:407/1480 train_time:58857ms step_avg:148.25ms step:408/1480 train_time:59011ms step_avg:148.27ms step:409/1480 train_time:59164ms step_avg:148.28ms step:410/1480 train_time:59318ms step_avg:148.29ms step:411/1480 train_time:59472ms step_avg:148.31ms step:412/1480 train_time:59625ms step_avg:148.32ms step:413/1480 train_time:59780ms step_avg:148.34ms step:414/1480 train_time:59934ms step_avg:148.35ms step:415/1480 train_time:60087ms step_avg:148.36ms step:416/1480 train_time:60241ms step_avg:148.38ms step:417/1480 train_time:60396ms step_avg:148.39ms step:418/1480 
train_time:60550ms step_avg:148.41ms step:419/1480 train_time:60703ms step_avg:148.42ms step:420/1480 train_time:60857ms step_avg:148.43ms step:421/1480 train_time:61010ms step_avg:148.44ms step:422/1480 train_time:61163ms step_avg:148.45ms step:423/1480 train_time:61316ms step_avg:148.47ms step:424/1480 train_time:61471ms step_avg:148.48ms step:425/1480 train_time:61624ms step_avg:148.49ms step:426/1480 train_time:61779ms step_avg:148.51ms step:427/1480 train_time:61933ms step_avg:148.52ms step:428/1480 train_time:62086ms step_avg:148.53ms step:429/1480 train_time:62241ms step_avg:148.55ms step:430/1480 train_time:62395ms step_avg:148.56ms step:431/1480 train_time:62549ms step_avg:148.57ms step:432/1480 train_time:62703ms step_avg:148.58ms step:433/1480 train_time:62856ms step_avg:148.60ms step:434/1480 train_time:63010ms step_avg:148.61ms step:435/1480 train_time:63165ms step_avg:148.62ms step:436/1480 train_time:63320ms step_avg:148.64ms step:437/1480 train_time:63475ms step_avg:148.65ms step:438/1480 train_time:63630ms step_avg:148.67ms step:439/1480 train_time:63784ms step_avg:148.68ms step:440/1480 train_time:63940ms step_avg:148.70ms step:441/1480 train_time:64098ms step_avg:148.72ms step:442/1480 train_time:64258ms step_avg:148.74ms step:443/1480 train_time:64414ms step_avg:148.76ms step:444/1480 train_time:64570ms step_avg:148.78ms step:445/1480 train_time:64725ms step_avg:148.79ms step:446/1480 train_time:64882ms step_avg:148.81ms step:447/1480 train_time:65039ms step_avg:148.83ms step:448/1480 train_time:65196ms step_avg:148.85ms step:449/1480 train_time:65354ms step_avg:148.87ms step:450/1480 train_time:65512ms step_avg:148.89ms step:451/1480 train_time:65670ms step_avg:148.91ms step:452/1480 train_time:65825ms step_avg:148.93ms step:453/1480 train_time:65981ms step_avg:148.94ms step:454/1480 train_time:66139ms step_avg:148.96ms step:455/1480 train_time:66295ms step_avg:148.98ms step:456/1480 train_time:66452ms step_avg:149.00ms step:457/1480 
train_time:66608ms step_avg:149.01ms step:458/1480 train_time:66764ms step_avg:149.03ms step:459/1480 train_time:66921ms step_avg:149.04ms step:460/1480 train_time:67078ms step_avg:149.06ms step:461/1480 train_time:67237ms step_avg:149.08ms step:462/1480 train_time:67394ms step_avg:149.10ms step:463/1480 train_time:67550ms step_avg:149.12ms step:464/1480 train_time:67706ms step_avg:149.13ms step:465/1480 train_time:67862ms step_avg:149.15ms step:466/1480 train_time:68018ms step_avg:149.16ms step:467/1480 train_time:68176ms step_avg:149.18ms step:468/1480 train_time:68334ms step_avg:149.20ms step:469/1480 train_time:68490ms step_avg:149.22ms step:470/1480 train_time:68647ms step_avg:149.23ms step:471/1480 train_time:68804ms step_avg:149.25ms step:472/1480 train_time:68960ms step_avg:149.26ms step:473/1480 train_time:69117ms step_avg:149.28ms step:474/1480 train_time:69273ms step_avg:149.29ms step:475/1480 train_time:69428ms step_avg:149.31ms step:476/1480 train_time:69585ms step_avg:149.32ms step:477/1480 train_time:69743ms step_avg:149.34ms step:478/1480 train_time:69900ms step_avg:149.36ms step:479/1480 train_time:70058ms step_avg:149.38ms step:480/1480 train_time:70216ms step_avg:149.40ms step:481/1480 train_time:70373ms step_avg:149.41ms step:482/1480 train_time:70529ms step_avg:149.43ms step:483/1480 train_time:70685ms step_avg:149.44ms step:484/1480 train_time:70842ms step_avg:149.46ms step:485/1480 train_time:70999ms step_avg:149.47ms step:486/1480 train_time:71156ms step_avg:149.49ms step:487/1480 train_time:71314ms step_avg:149.51ms step:488/1480 train_time:71472ms step_avg:149.52ms step:489/1480 train_time:71628ms step_avg:149.54ms step:490/1480 train_time:71784ms step_avg:149.55ms step:491/1480 train_time:71941ms step_avg:149.56ms step:492/1480 train_time:72096ms step_avg:149.58ms step:493/1480 train_time:72252ms step_avg:149.59ms step:494/1480 train_time:72407ms step_avg:149.60ms step:495/1480 train_time:72564ms step_avg:149.62ms step:496/1480 
train_time:72720ms step_avg:149.63ms step:497/1480 train_time:72878ms step_avg:149.65ms step:498/1480 train_time:73035ms step_avg:149.66ms step:499/1480 train_time:73193ms step_avg:149.68ms step:500/1480 train_time:73350ms step_avg:149.69ms step:500/1480 val_loss:3.6869 train_time:73411ms step_avg:149.82ms step:501/1480 train_time:73509ms step_avg:149.71ms step:502/1480 train_time:73668ms step_avg:149.73ms step:503/1480 train_time:73825ms step_avg:149.75ms step:504/1480 train_time:73981ms step_avg:149.76ms step:505/1480 train_time:74137ms step_avg:149.77ms step:506/1480 train_time:74294ms step_avg:149.79ms step:507/1480 train_time:74450ms step_avg:149.80ms step:508/1480 train_time:74607ms step_avg:149.81ms step:509/1480 train_time:74764ms step_avg:149.83ms step:510/1480 train_time:74921ms step_avg:149.84ms step:511/1480 train_time:75079ms step_avg:149.86ms step:512/1480 train_time:75237ms step_avg:149.87ms step:513/1480 train_time:75393ms step_avg:149.89ms step:514/1480 train_time:75550ms step_avg:149.90ms step:515/1480 train_time:75707ms step_avg:149.91ms step:516/1480 train_time:75866ms step_avg:149.93ms step:517/1480 train_time:76023ms step_avg:149.95ms step:518/1480 train_time:76181ms step_avg:149.96ms step:519/1480 train_time:76339ms step_avg:149.98ms step:520/1480 train_time:76496ms step_avg:149.99ms step:521/1480 train_time:76654ms step_avg:150.01ms step:522/1480 train_time:76812ms step_avg:150.02ms step:523/1480 train_time:76968ms step_avg:150.04ms step:524/1480 train_time:77125ms step_avg:150.05ms step:525/1480 train_time:77281ms step_avg:150.06ms step:526/1480 train_time:77440ms step_avg:150.08ms step:527/1480 train_time:77596ms step_avg:150.09ms step:528/1480 train_time:77754ms step_avg:150.10ms step:529/1480 train_time:77911ms step_avg:150.12ms step:530/1480 train_time:78068ms step_avg:150.13ms step:531/1480 train_time:78225ms step_avg:150.14ms step:532/1480 train_time:78381ms step_avg:150.16ms step:533/1480 train_time:78538ms step_avg:150.17ms 
step:534/1480 train_time:78695ms step_avg:150.18ms step:535/1480 train_time:78852ms step_avg:150.19ms step:536/1480 train_time:79009ms step_avg:150.21ms step:537/1480 train_time:79167ms step_avg:150.22ms step:538/1480 train_time:79325ms step_avg:150.24ms step:539/1480 train_time:79483ms step_avg:150.25ms step:540/1480 train_time:79642ms step_avg:150.27ms step:541/1480 train_time:79800ms step_avg:150.28ms step:542/1480 train_time:79957ms step_avg:150.29ms step:543/1480 train_time:80113ms step_avg:150.31ms step:544/1480 train_time:80269ms step_avg:150.32ms step:545/1480 train_time:80425ms step_avg:150.33ms step:546/1480 train_time:80581ms step_avg:150.34ms step:547/1480 train_time:80737ms step_avg:150.35ms step:548/1480 train_time:80894ms step_avg:150.36ms step:549/1480 train_time:81048ms step_avg:150.37ms step:550/1480 train_time:81205ms step_avg:150.38ms step:551/1480 train_time:81364ms step_avg:150.40ms step:552/1480 train_time:81524ms step_avg:150.41ms step:553/1480 train_time:81684ms step_avg:150.43ms step:554/1480 train_time:81845ms step_avg:150.45ms step:555/1480 train_time:82004ms step_avg:150.47ms step:556/1480 train_time:82163ms step_avg:150.48ms step:557/1480 train_time:82324ms step_avg:150.50ms step:558/1480 train_time:82483ms step_avg:150.52ms step:559/1480 train_time:82643ms step_avg:150.53ms step:560/1480 train_time:82803ms step_avg:150.55ms step:561/1480 train_time:82961ms step_avg:150.56ms step:562/1480 train_time:83123ms step_avg:150.58ms step:563/1480 train_time:83282ms step_avg:150.60ms step:564/1480 train_time:83443ms step_avg:150.62ms step:565/1480 train_time:83603ms step_avg:150.64ms step:566/1480 train_time:83764ms step_avg:150.65ms step:567/1480 train_time:83924ms step_avg:150.67ms step:568/1480 train_time:84083ms step_avg:150.69ms step:569/1480 train_time:84243ms step_avg:150.70ms step:570/1480 train_time:84403ms step_avg:150.72ms step:571/1480 train_time:84563ms step_avg:150.74ms step:572/1480 train_time:84723ms step_avg:150.75ms 
step:573/1480 train_time:84883ms step_avg:150.77ms step:574/1480 train_time:85043ms step_avg:150.79ms step:575/1480 train_time:85203ms step_avg:150.80ms step:576/1480 train_time:85364ms step_avg:150.82ms step:577/1480 train_time:85524ms step_avg:150.84ms step:578/1480 train_time:85683ms step_avg:150.85ms step:579/1480 train_time:85842ms step_avg:150.86ms step:580/1480 train_time:86001ms step_avg:150.88ms step:581/1480 train_time:86162ms step_avg:150.90ms step:582/1480 train_time:86321ms step_avg:150.91ms step:583/1480 train_time:86481ms step_avg:150.93ms step:584/1480 train_time:86641ms step_avg:150.94ms step:585/1480 train_time:86801ms step_avg:150.96ms step:586/1480 train_time:86962ms step_avg:150.98ms step:587/1480 train_time:87123ms step_avg:150.99ms step:588/1480 train_time:87282ms step_avg:151.01ms step:589/1480 train_time:87445ms step_avg:151.03ms step:590/1480 train_time:87604ms step_avg:151.04ms step:591/1480 train_time:87762ms step_avg:151.05ms step:592/1480 train_time:87922ms step_avg:151.07ms step:593/1480 train_time:88084ms step_avg:151.09ms step:594/1480 train_time:88245ms step_avg:151.10ms step:595/1480 train_time:88406ms step_avg:151.12ms step:596/1480 train_time:88567ms step_avg:151.14ms step:597/1480 train_time:88726ms step_avg:151.15ms step:598/1480 train_time:88883ms step_avg:151.16ms step:599/1480 train_time:89042ms step_avg:151.18ms step:600/1480 train_time:89203ms step_avg:151.19ms step:601/1480 train_time:89364ms step_avg:151.21ms step:602/1480 train_time:89525ms step_avg:151.22ms step:603/1480 train_time:89684ms step_avg:151.24ms step:604/1480 train_time:89843ms step_avg:151.25ms step:605/1480 train_time:90002ms step_avg:151.26ms step:606/1480 train_time:90165ms step_avg:151.28ms step:607/1480 train_time:90325ms step_avg:151.30ms step:608/1480 train_time:90485ms step_avg:151.31ms step:609/1480 train_time:90645ms step_avg:151.33ms step:610/1480 train_time:90804ms step_avg:151.34ms step:611/1480 train_time:90965ms step_avg:151.36ms 
step:612/1480 train_time:91124ms step_avg:151.37ms step:613/1480 train_time:91284ms step_avg:151.38ms step:614/1480 train_time:91444ms step_avg:151.40ms step:615/1480 train_time:91603ms step_avg:151.41ms step:616/1480 train_time:91764ms step_avg:151.43ms step:617/1480 train_time:91924ms step_avg:151.44ms step:618/1480 train_time:92083ms step_avg:151.45ms step:619/1480 train_time:92243ms step_avg:151.47ms step:620/1480 train_time:92401ms step_avg:151.48ms step:621/1480 train_time:92561ms step_avg:151.49ms step:622/1480 train_time:92720ms step_avg:151.50ms step:623/1480 train_time:92882ms step_avg:151.52ms step:624/1480 train_time:93042ms step_avg:151.53ms step:625/1480 train_time:93202ms step_avg:151.55ms step:625/1480 val_loss:3.6070 train_time:93265ms step_avg:151.65ms step:626/1480 train_time:93364ms step_avg:151.56ms step:627/1480 train_time:93523ms step_avg:151.58ms step:628/1480 train_time:93680ms step_avg:151.59ms step:629/1480 train_time:93840ms step_avg:151.60ms step:630/1480 train_time:93998ms step_avg:151.61ms step:631/1480 train_time:94156ms step_avg:151.62ms step:632/1480 train_time:94316ms step_avg:151.63ms step:633/1480 train_time:94476ms step_avg:151.65ms step:634/1480 train_time:94635ms step_avg:151.66ms step:635/1480 train_time:94795ms step_avg:151.67ms step:636/1480 train_time:94955ms step_avg:151.69ms step:637/1480 train_time:95117ms step_avg:151.70ms step:638/1480 train_time:95277ms step_avg:151.71ms step:639/1480 train_time:95437ms step_avg:151.73ms step:640/1480 train_time:95596ms step_avg:151.74ms step:641/1480 train_time:95755ms step_avg:151.75ms step:642/1480 train_time:95916ms step_avg:151.77ms step:643/1480 train_time:96076ms step_avg:151.78ms step:644/1480 train_time:96236ms step_avg:151.79ms step:645/1480 train_time:96395ms step_avg:151.80ms step:646/1480 train_time:96555ms step_avg:151.82ms step:647/1480 train_time:96714ms step_avg:151.83ms step:648/1480 train_time:96875ms step_avg:151.84ms step:649/1480 train_time:97035ms 
step_avg:151.85ms step:650/1480 train_time:97195ms step_avg:151.87ms step:651/1480 train_time:97356ms step_avg:151.88ms step:652/1480 train_time:97517ms step_avg:151.90ms step:653/1480 train_time:97677ms step_avg:151.91ms step:654/1480 train_time:97837ms step_avg:151.92ms step:655/1480 train_time:97997ms step_avg:151.93ms step:656/1480 train_time:98155ms step_avg:151.94ms step:657/1480 train_time:98317ms step_avg:151.96ms step:658/1480 train_time:98477ms step_avg:151.97ms step:659/1480 train_time:98639ms step_avg:151.99ms step:660/1480 train_time:98801ms step_avg:152.00ms step:661/1480 train_time:98962ms step_avg:152.02ms step:662/1480 train_time:99122ms step_avg:152.03ms step:663/1480 train_time:99281ms step_avg:152.04ms step:664/1480 train_time:99444ms step_avg:152.05ms step:665/1480 train_time:99606ms step_avg:152.07ms step:666/1480 train_time:99765ms step_avg:152.08ms step:667/1480 train_time:99925ms step_avg:152.09ms step:668/1480 train_time:100090ms step_avg:152.11ms step:669/1480 train_time:100254ms step_avg:152.13ms step:670/1480 train_time:100417ms step_avg:152.15ms step:671/1480 train_time:100577ms step_avg:152.16ms step:672/1480 train_time:100739ms step_avg:152.17ms step:673/1480 train_time:100900ms step_avg:152.19ms step:674/1480 train_time:101061ms step_avg:152.20ms step:675/1480 train_time:101223ms step_avg:152.21ms step:676/1480 train_time:101384ms step_avg:152.23ms step:677/1480 train_time:101544ms step_avg:152.24ms step:678/1480 train_time:101704ms step_avg:152.25ms step:679/1480 train_time:101867ms step_avg:152.27ms step:680/1480 train_time:102032ms step_avg:152.29ms step:681/1480 train_time:102192ms step_avg:152.30ms step:682/1480 train_time:102356ms step_avg:152.32ms step:683/1480 train_time:102519ms step_avg:152.33ms step:684/1480 train_time:102680ms step_avg:152.34ms step:685/1480 train_time:102843ms step_avg:152.36ms step:686/1480 train_time:103004ms step_avg:152.37ms step:687/1480 train_time:103164ms step_avg:152.38ms step:688/1480 
train_time:103327ms step_avg:152.40ms step:689/1480 train_time:103491ms step_avg:152.42ms step:690/1480 train_time:103655ms step_avg:152.43ms step:691/1480 train_time:103818ms step_avg:152.45ms step:692/1480 train_time:103979ms step_avg:152.46ms step:693/1480 train_time:104141ms step_avg:152.48ms step:694/1480 train_time:104302ms step_avg:152.49ms step:695/1480 train_time:104461ms step_avg:152.50ms step:696/1480 train_time:104622ms step_avg:152.51ms step:697/1480 train_time:104784ms step_avg:152.52ms step:698/1480 train_time:104945ms step_avg:152.54ms step:699/1480 train_time:105108ms step_avg:152.55ms step:700/1480 train_time:105272ms step_avg:152.57ms step:701/1480 train_time:105433ms step_avg:152.58ms step:702/1480 train_time:105594ms step_avg:152.59ms step:703/1480 train_time:105755ms step_avg:152.60ms step:704/1480 train_time:105917ms step_avg:152.62ms step:705/1480 train_time:106079ms step_avg:152.63ms step:706/1480 train_time:106243ms step_avg:152.65ms step:707/1480 train_time:106403ms step_avg:152.66ms step:708/1480 train_time:106563ms step_avg:152.67ms step:709/1480 train_time:106724ms step_avg:152.68ms step:710/1480 train_time:106884ms step_avg:152.69ms step:711/1480 train_time:107047ms step_avg:152.71ms step:712/1480 train_time:107215ms step_avg:152.73ms step:713/1480 train_time:107378ms step_avg:152.74ms step:714/1480 train_time:107539ms step_avg:152.75ms step:715/1480 train_time:107699ms step_avg:152.76ms step:716/1480 train_time:107860ms step_avg:152.78ms step:717/1480 train_time:108022ms step_avg:152.79ms step:718/1480 train_time:108181ms step_avg:152.80ms step:719/1480 train_time:108341ms step_avg:152.81ms step:720/1480 train_time:108505ms step_avg:152.82ms step:721/1480 train_time:108666ms step_avg:152.84ms step:722/1480 train_time:108827ms step_avg:152.85ms step:723/1480 train_time:108987ms step_avg:152.86ms step:724/1480 train_time:109150ms step_avg:152.87ms step:725/1480 train_time:109315ms step_avg:152.89ms step:726/1480 train_time:109478ms 
step_avg:152.90ms step:727/1480 train_time:109642ms step_avg:152.92ms step:728/1480 train_time:109801ms step_avg:152.93ms step:729/1480 train_time:109961ms step_avg:152.94ms step:730/1480 train_time:110123ms step_avg:152.95ms step:731/1480 train_time:110284ms step_avg:152.96ms step:732/1480 train_time:110444ms step_avg:152.97ms step:733/1480 train_time:110606ms step_avg:152.98ms step:734/1480 train_time:110766ms step_avg:152.99ms step:735/1480 train_time:110927ms step_avg:153.00ms step:736/1480 train_time:111091ms step_avg:153.02ms step:737/1480 train_time:111253ms step_avg:153.03ms step:738/1480 train_time:111415ms step_avg:153.04ms step:739/1480 train_time:111576ms step_avg:153.05ms step:740/1480 train_time:111742ms step_avg:153.07ms step:741/1480 train_time:111905ms step_avg:153.08ms step:742/1480 train_time:112067ms step_avg:153.10ms step:743/1480 train_time:112228ms step_avg:153.11ms step:744/1480 train_time:112393ms step_avg:153.12ms step:745/1480 train_time:112559ms step_avg:153.14ms step:746/1480 train_time:112719ms step_avg:153.15ms step:747/1480 train_time:112881ms step_avg:153.16ms step:748/1480 train_time:113045ms step_avg:153.18ms step:749/1480 train_time:113209ms step_avg:153.19ms step:750/1480 train_time:113370ms step_avg:153.20ms step:750/1480 val_loss:3.5509 train_time:113436ms step_avg:153.29ms step:751/1480 train_time:113538ms step_avg:153.22ms step:752/1480 train_time:113700ms step_avg:153.23ms step:753/1480 train_time:113861ms step_avg:153.24ms step:754/1480 train_time:114021ms step_avg:153.25ms step:755/1480 train_time:114182ms step_avg:153.26ms step:756/1480 train_time:114342ms step_avg:153.27ms step:757/1480 train_time:114504ms step_avg:153.29ms step:758/1480 train_time:114665ms step_avg:153.30ms step:759/1480 train_time:114826ms step_avg:153.31ms step:760/1480 train_time:114987ms step_avg:153.32ms step:761/1480 train_time:115152ms step_avg:153.33ms step:762/1480 train_time:115315ms step_avg:153.34ms step:763/1480 train_time:115478ms 
step_avg:153.36ms step:764/1480 train_time:115639ms step_avg:153.37ms step:765/1480 train_time:115801ms step_avg:153.38ms step:766/1480 train_time:115964ms step_avg:153.39ms step:767/1480 train_time:116124ms step_avg:153.40ms step:768/1480 train_time:116287ms step_avg:153.41ms step:769/1480 train_time:116451ms step_avg:153.43ms step:770/1480 train_time:116615ms step_avg:153.44ms step:771/1480 train_time:116778ms step_avg:153.45ms step:772/1480 train_time:116940ms step_avg:153.46ms step:773/1480 train_time:117102ms step_avg:153.48ms step:774/1480 train_time:117265ms step_avg:153.49ms step:775/1480 train_time:117427ms step_avg:153.50ms step:776/1480 train_time:117592ms step_avg:153.51ms step:777/1480 train_time:117760ms step_avg:153.53ms step:778/1480 train_time:117922ms step_avg:153.54ms step:779/1480 train_time:118084ms step_avg:153.56ms step:780/1480 train_time:118247ms step_avg:153.57ms step:781/1480 train_time:118409ms step_avg:153.58ms step:782/1480 train_time:118574ms step_avg:153.59ms step:783/1480 train_time:118735ms step_avg:153.60ms step:784/1480 train_time:118901ms step_avg:153.62ms step:785/1480 train_time:119063ms step_avg:153.63ms step:786/1480 train_time:119227ms step_avg:153.64ms step:787/1480 train_time:119391ms step_avg:153.66ms step:788/1480 train_time:119556ms step_avg:153.67ms step:789/1480 train_time:119717ms step_avg:153.68ms step:790/1480 train_time:119883ms step_avg:153.70ms step:791/1480 train_time:120049ms step_avg:153.71ms step:792/1480 train_time:120215ms step_avg:153.73ms step:793/1480 train_time:120377ms step_avg:153.74ms step:794/1480 train_time:120540ms step_avg:153.75ms step:795/1480 train_time:120704ms step_avg:153.76ms step:796/1480 train_time:120869ms step_avg:153.78ms step:797/1480 train_time:121034ms step_avg:153.79ms step:798/1480 train_time:121199ms step_avg:153.81ms step:799/1480 train_time:121366ms step_avg:153.82ms step:800/1480 train_time:121529ms step_avg:153.83ms step:801/1480 train_time:121693ms step_avg:153.85ms 
step:802/1480 train_time:121861ms step_avg:153.86ms step:803/1480 train_time:122023ms step_avg:153.87ms step:804/1480 train_time:122184ms step_avg:153.88ms step:805/1480 train_time:122350ms step_avg:153.90ms step:806/1480 train_time:122511ms step_avg:153.91ms step:807/1480 train_time:122673ms step_avg:153.92ms step:808/1480 train_time:122838ms step_avg:153.93ms step:809/1480 train_time:123000ms step_avg:153.94ms step:810/1480 train_time:123162ms step_avg:153.95ms step:811/1480 train_time:123324ms step_avg:153.96ms step:812/1480 train_time:123486ms step_avg:153.97ms step:813/1480 train_time:123647ms step_avg:153.98ms step:814/1480 train_time:123810ms step_avg:153.99ms step:815/1480 train_time:123974ms step_avg:154.01ms step:816/1480 train_time:124138ms step_avg:154.02ms step:817/1480 train_time:124301ms step_avg:154.03ms step:818/1480 train_time:124462ms step_avg:154.04ms step:819/1480 train_time:124626ms step_avg:154.05ms step:820/1480 train_time:124790ms step_avg:154.06ms step:821/1480 train_time:124951ms step_avg:154.07ms step:822/1480 train_time:125117ms step_avg:154.08ms step:823/1480 train_time:125279ms step_avg:154.10ms step:824/1480 train_time:125440ms step_avg:154.10ms step:825/1480 train_time:125606ms step_avg:154.12ms step:826/1480 train_time:125775ms step_avg:154.14ms step:827/1480 train_time:125940ms step_avg:154.15ms step:828/1480 train_time:126102ms step_avg:154.16ms step:829/1480 train_time:126265ms step_avg:154.17ms step:830/1480 train_time:126432ms step_avg:154.18ms step:831/1480 train_time:126597ms step_avg:154.20ms step:832/1480 train_time:126760ms step_avg:154.21ms step:833/1480 train_time:126925ms step_avg:154.22ms step:834/1480 train_time:127090ms step_avg:154.24ms step:835/1480 train_time:127253ms step_avg:154.25ms step:836/1480 train_time:127418ms step_avg:154.26ms step:837/1480 train_time:127580ms step_avg:154.27ms step:838/1480 train_time:127743ms step_avg:154.28ms step:839/1480 train_time:127906ms step_avg:154.29ms step:840/1480 
train_time:128068ms step_avg:154.30ms step:841/1480 train_time:128230ms step_avg:154.31ms step:842/1480 train_time:128394ms step_avg:154.32ms step:843/1480 train_time:128557ms step_avg:154.33ms step:844/1480 train_time:128718ms step_avg:154.34ms step:845/1480 train_time:128882ms step_avg:154.35ms step:846/1480 train_time:129047ms step_avg:154.36ms step:847/1480 train_time:129211ms step_avg:154.37ms step:848/1480 train_time:129375ms step_avg:154.38ms step:849/1480 train_time:129537ms step_avg:154.39ms step:850/1480 train_time:129700ms step_avg:154.40ms step:851/1480 train_time:129863ms step_avg:154.41ms step:852/1480 train_time:130024ms step_avg:154.42ms step:853/1480 train_time:130185ms step_avg:154.43ms step:854/1480 train_time:130349ms step_avg:154.44ms step:855/1480 train_time:130512ms step_avg:154.45ms step:856/1480 train_time:130676ms step_avg:154.46ms step:857/1480 train_time:130841ms step_avg:154.48ms step:858/1480 train_time:131007ms step_avg:154.49ms step:859/1480 train_time:131171ms step_avg:154.50ms step:860/1480 train_time:131335ms step_avg:154.51ms step:861/1480 train_time:131501ms step_avg:154.53ms step:862/1480 train_time:131670ms step_avg:154.54ms step:863/1480 train_time:131839ms step_avg:154.56ms step:864/1480 train_time:132002ms step_avg:154.57ms step:865/1480 train_time:132163ms step_avg:154.58ms step:866/1480 train_time:132328ms step_avg:154.59ms step:867/1480 train_time:132493ms step_avg:154.60ms step:868/1480 train_time:132656ms step_avg:154.61ms step:869/1480 train_time:132820ms step_avg:154.62ms step:870/1480 train_time:132985ms step_avg:154.63ms step:871/1480 train_time:133149ms step_avg:154.64ms step:872/1480 train_time:133315ms step_avg:154.66ms step:873/1480 train_time:133478ms step_avg:154.67ms step:874/1480 train_time:133643ms step_avg:154.68ms step:875/1480 train_time:133807ms step_avg:154.69ms step:875/1480 val_loss:3.5084 train_time:133872ms step_avg:154.77ms step:876/1480 train_time:133971ms step_avg:154.70ms step:877/1480 
train_time:134135ms step_avg:154.71ms step:878/1480 train_time:134298ms step_avg:154.72ms step:879/1480 train_time:134462ms step_avg:154.73ms step:880/1480 train_time:134627ms step_avg:154.74ms step:881/1480 train_time:134789ms step_avg:154.75ms step:882/1480 train_time:134954ms step_avg:154.76ms step:883/1480 train_time:135119ms step_avg:154.78ms step:884/1480 train_time:135286ms step_avg:154.79ms step:885/1480 train_time:135451ms step_avg:154.80ms step:886/1480 train_time:135618ms step_avg:154.82ms step:887/1480 train_time:135787ms step_avg:154.83ms step:888/1480 train_time:135960ms step_avg:154.85ms step:889/1480 train_time:136128ms step_avg:154.87ms step:890/1480 train_time:136290ms step_avg:154.87ms step:891/1480 train_time:136454ms step_avg:154.89ms step:892/1480 train_time:136619ms step_avg:154.90ms step:893/1480 train_time:136782ms step_avg:154.91ms step:894/1480 train_time:136950ms step_avg:154.92ms step:895/1480 train_time:137115ms step_avg:154.93ms step:896/1480 train_time:137281ms step_avg:154.94ms step:897/1480 train_time:137448ms step_avg:154.96ms step:898/1480 train_time:137615ms step_avg:154.97ms step:899/1480 train_time:137778ms step_avg:154.98ms step:900/1480 train_time:137943ms step_avg:154.99ms step:901/1480 train_time:138107ms step_avg:155.00ms step:902/1480 train_time:138271ms step_avg:155.01ms step:903/1480 train_time:138442ms step_avg:155.03ms step:904/1480 train_time:138608ms step_avg:155.04ms step:905/1480 train_time:138770ms step_avg:155.05ms step:906/1480 train_time:138937ms step_avg:155.06ms step:907/1480 train_time:139107ms step_avg:155.08ms step:908/1480 train_time:139270ms step_avg:155.09ms step:909/1480 train_time:139435ms step_avg:155.10ms step:910/1480 train_time:139607ms step_avg:155.12ms step:911/1480 train_time:139772ms step_avg:155.13ms step:912/1480 train_time:139937ms step_avg:155.14ms step:913/1480 train_time:140104ms step_avg:155.15ms step:914/1480 train_time:140272ms step_avg:155.17ms step:915/1480 train_time:140442ms 
step_avg:155.18ms step:916/1480 train_time:140607ms step_avg:155.20ms step:917/1480 train_time:140770ms step_avg:155.20ms step:918/1480 train_time:140937ms step_avg:155.22ms step:919/1480 train_time:141108ms step_avg:155.23ms step:920/1480 train_time:141275ms step_avg:155.25ms step:921/1480 train_time:141440ms step_avg:155.26ms step:922/1480 train_time:141608ms step_avg:155.27ms step:923/1480 train_time:141771ms step_avg:155.28ms step:924/1480 train_time:141936ms step_avg:155.29ms step:925/1480 train_time:142103ms step_avg:155.30ms step:926/1480 train_time:142267ms step_avg:155.31ms step:927/1480 train_time:142430ms step_avg:155.32ms step:928/1480 train_time:142597ms step_avg:155.33ms step:929/1480 train_time:142764ms step_avg:155.35ms step:930/1480 train_time:142932ms step_avg:155.36ms step:931/1480 train_time:143095ms step_avg:155.37ms step:932/1480 train_time:143259ms step_avg:155.38ms step:933/1480 train_time:143429ms step_avg:155.39ms step:934/1480 train_time:143594ms step_avg:155.40ms step:935/1480 train_time:143765ms step_avg:155.42ms step:936/1480 train_time:143933ms step_avg:155.44ms step:937/1480 train_time:144105ms step_avg:155.45ms step:938/1480 train_time:144268ms step_avg:155.46ms step:939/1480 train_time:144437ms step_avg:155.48ms step:940/1480 train_time:144605ms step_avg:155.49ms step:941/1480 train_time:144769ms step_avg:155.50ms step:942/1480 train_time:144934ms step_avg:155.51ms step:943/1480 train_time:145104ms step_avg:155.52ms step:944/1480 train_time:145276ms step_avg:155.54ms step:945/1480 train_time:145440ms step_avg:155.55ms step:946/1480 train_time:145609ms step_avg:155.57ms step:947/1480 train_time:145777ms step_avg:155.58ms step:948/1480 train_time:145943ms step_avg:155.59ms step:949/1480 train_time:146110ms step_avg:155.60ms step:950/1480 train_time:146273ms step_avg:155.61ms step:951/1480 train_time:146441ms step_avg:155.62ms step:952/1480 train_time:146608ms step_avg:155.63ms step:953/1480 train_time:146776ms step_avg:155.65ms 
step:954/1480 train_time:146946ms step_avg:155.66ms step:955/1480 train_time:147109ms step_avg:155.67ms step:956/1480 train_time:147273ms step_avg:155.68ms step:957/1480 train_time:147441ms step_avg:155.69ms step:958/1480 train_time:147611ms step_avg:155.71ms step:959/1480 train_time:147776ms step_avg:155.72ms step:960/1480 train_time:147945ms step_avg:155.73ms step:961/1480 train_time:148110ms step_avg:155.74ms step:962/1480 train_time:148274ms step_avg:155.75ms step:963/1480 train_time:148440ms step_avg:155.76ms step:964/1480 train_time:148608ms step_avg:155.77ms step:965/1480 train_time:148773ms step_avg:155.78ms step:966/1480 train_time:148938ms step_avg:155.79ms step:967/1480 train_time:149101ms step_avg:155.80ms step:968/1480 train_time:149267ms step_avg:155.81ms step:969/1480 train_time:149432ms step_avg:155.82ms step:970/1480 train_time:149593ms step_avg:155.83ms step:971/1480 train_time:149759ms step_avg:155.84ms step:972/1480 train_time:149927ms step_avg:155.85ms step:973/1480 train_time:150091ms step_avg:155.86ms step:974/1480 train_time:150260ms step_avg:155.87ms step:975/1480 train_time:150428ms step_avg:155.88ms step:976/1480 train_time:150592ms step_avg:155.89ms step:977/1480 train_time:150755ms step_avg:155.90ms step:978/1480 train_time:150922ms step_avg:155.91ms step:979/1480 train_time:151088ms step_avg:155.92ms step:980/1480 train_time:151253ms step_avg:155.93ms step:981/1480 train_time:151422ms step_avg:155.94ms step:982/1480 train_time:151587ms step_avg:155.95ms step:983/1480 train_time:151753ms step_avg:155.96ms step:984/1480 train_time:151915ms step_avg:155.97ms step:985/1480 train_time:152083ms step_avg:155.98ms step:986/1480 train_time:152250ms step_avg:155.99ms step:987/1480 train_time:152413ms step_avg:156.00ms step:988/1480 train_time:152580ms step_avg:156.01ms step:989/1480 train_time:152748ms step_avg:156.02ms step:990/1480 train_time:152916ms step_avg:156.04ms step:991/1480 train_time:153083ms step_avg:156.05ms step:992/1480 
train_time:153256ms step_avg:156.07ms step:993/1480 train_time:153433ms step_avg:156.09ms step:994/1480 train_time:153598ms step_avg:156.10ms step:995/1480 train_time:153762ms step_avg:156.10ms step:996/1480 train_time:153927ms step_avg:156.11ms step:997/1480 train_time:154090ms step_avg:156.12ms step:998/1480 train_time:154253ms step_avg:156.13ms step:999/1480 train_time:154419ms step_avg:156.14ms step:1000/1480 train_time:154589ms step_avg:156.15ms step:1000/1480 val_loss:3.4438 train_time:154656ms step_avg:156.22ms step:1001/1480 train_time:154758ms step_avg:156.16ms step:1002/1480 train_time:154924ms step_avg:156.17ms step:1003/1480 train_time:155096ms step_avg:156.19ms step:1004/1480 train_time:155265ms step_avg:156.20ms step:1005/1480 train_time:155433ms step_avg:156.21ms step:1006/1480 train_time:155599ms step_avg:156.22ms step:1007/1480 train_time:155766ms step_avg:156.23ms step:1008/1480 train_time:155933ms step_avg:156.25ms step:1009/1480 train_time:156107ms step_avg:156.26ms step:1010/1480 train_time:156272ms step_avg:156.27ms step:1011/1480 train_time:156439ms step_avg:156.28ms step:1012/1480 train_time:156604ms step_avg:156.29ms step:1013/1480 train_time:156774ms step_avg:156.31ms step:1014/1480 train_time:156942ms step_avg:156.32ms step:1015/1480 train_time:157112ms step_avg:156.33ms step:1016/1480 train_time:157278ms step_avg:156.34ms step:1017/1480 train_time:157450ms step_avg:156.36ms step:1018/1480 train_time:157619ms step_avg:156.37ms step:1019/1480 train_time:157788ms step_avg:156.38ms step:1020/1480 train_time:157956ms step_avg:156.39ms step:1021/1480 train_time:158122ms step_avg:156.40ms step:1022/1480 train_time:158290ms step_avg:156.41ms step:1023/1480 train_time:158456ms step_avg:156.42ms step:1024/1480 train_time:158622ms step_avg:156.43ms step:1025/1480 train_time:158793ms step_avg:156.45ms step:1026/1480 train_time:158957ms step_avg:156.45ms step:1027/1480 train_time:159125ms step_avg:156.47ms step:1028/1480 train_time:159298ms 
step_avg:156.48ms step:1029/1480 train_time:159473ms step_avg:156.50ms step:1030/1480 train_time:159641ms step_avg:156.51ms step:1031/1480 train_time:159807ms step_avg:156.52ms step:1032/1480 train_time:159981ms step_avg:156.54ms step:1033/1480 train_time:160148ms step_avg:156.55ms step:1034/1480 train_time:160316ms step_avg:156.56ms step:1035/1480 train_time:160483ms step_avg:156.57ms step:1036/1480 train_time:160649ms step_avg:156.58ms step:1037/1480 train_time:160815ms step_avg:156.59ms step:1038/1480 train_time:160984ms step_avg:156.60ms step:1039/1480 train_time:161155ms step_avg:156.61ms step:1040/1480 train_time:161320ms step_avg:156.62ms step:1041/1480 train_time:161489ms step_avg:156.63ms step:1042/1480 train_time:161653ms step_avg:156.64ms step:1043/1480 train_time:161817ms step_avg:156.65ms step:1044/1480 train_time:161982ms step_avg:156.66ms step:1045/1480 train_time:162151ms step_avg:156.67ms step:1046/1480 train_time:162318ms step_avg:156.68ms step:1047/1480 train_time:162487ms step_avg:156.69ms step:1048/1480 train_time:162653ms step_avg:156.70ms step:1049/1480 train_time:162818ms step_avg:156.71ms step:1050/1480 train_time:162988ms step_avg:156.72ms step:1051/1480 train_time:163155ms step_avg:156.73ms step:1052/1480 train_time:163323ms step_avg:156.74ms step:1053/1480 train_time:163490ms step_avg:156.75ms step:1054/1480 train_time:163657ms step_avg:156.76ms step:1055/1480 train_time:163822ms step_avg:156.77ms step:1056/1480 train_time:163988ms step_avg:156.78ms step:1057/1480 train_time:164155ms step_avg:156.79ms step:1058/1480 train_time:164325ms step_avg:156.80ms step:1059/1480 train_time:164498ms step_avg:156.81ms step:1060/1480 train_time:164667ms step_avg:156.83ms step:1061/1480 train_time:164830ms step_avg:156.83ms step:1062/1480 train_time:164996ms step_avg:156.84ms step:1063/1480 train_time:165161ms step_avg:156.85ms step:1064/1480 train_time:165325ms step_avg:156.85ms step:1065/1480 train_time:165493ms step_avg:156.87ms step:1066/1480 
train_time:165662ms step_avg:156.88ms step:1067/1480 train_time:165832ms step_avg:156.89ms step:1068/1480 train_time:165997ms step_avg:156.90ms step:1069/1480 train_time:166170ms step_avg:156.91ms step:1070/1480 train_time:166335ms step_avg:156.92ms step:1071/1480 train_time:166508ms step_avg:156.94ms step:1072/1480 train_time:166675ms step_avg:156.94ms step:1073/1480 train_time:166838ms step_avg:156.95ms step:1074/1480 train_time:167005ms step_avg:156.96ms step:1075/1480 train_time:167176ms step_avg:156.97ms step:1076/1480 train_time:167344ms step_avg:156.98ms step:1077/1480 train_time:167512ms step_avg:156.99ms step:1078/1480 train_time:167688ms step_avg:157.01ms step:1079/1480 train_time:167860ms step_avg:157.02ms step:1080/1480 train_time:168030ms step_avg:157.04ms step:1081/1480 train_time:168196ms step_avg:157.05ms step:1082/1480 train_time:168363ms step_avg:157.06ms step:1083/1480 train_time:168530ms step_avg:157.06ms step:1084/1480 train_time:168695ms step_avg:157.07ms step:1085/1480 train_time:168866ms step_avg:157.09ms step:1086/1480 train_time:169033ms step_avg:157.09ms step:1087/1480 train_time:169200ms step_avg:157.10ms step:1088/1480 train_time:169370ms step_avg:157.12ms step:1089/1480 train_time:169543ms step_avg:157.13ms step:1090/1480 train_time:169713ms step_avg:157.14ms step:1091/1480 train_time:169880ms step_avg:157.15ms step:1092/1480 train_time:170047ms step_avg:157.16ms step:1093/1480 train_time:170214ms step_avg:157.17ms step:1094/1480 train_time:170380ms step_avg:157.18ms step:1095/1480 train_time:170545ms step_avg:157.18ms step:1096/1480 train_time:170713ms step_avg:157.19ms step:1097/1480 train_time:170882ms step_avg:157.21ms step:1098/1480 train_time:171053ms step_avg:157.22ms step:1099/1480 train_time:171224ms step_avg:157.23ms step:1100/1480 train_time:171395ms step_avg:157.24ms step:1101/1480 train_time:171566ms step_avg:157.26ms step:1102/1480 train_time:171736ms step_avg:157.27ms step:1103/1480 train_time:171913ms step_avg:157.29ms 
step:1104/1480 train_time:172080ms step_avg:157.29ms step:1105/1480 train_time:172250ms step_avg:157.31ms step:1106/1480 train_time:172418ms step_avg:157.32ms step:1107/1480 train_time:172588ms step_avg:157.33ms step:1108/1480 train_time:172752ms step_avg:157.33ms step:1109/1480 train_time:172918ms step_avg:157.34ms step:1110/1480 train_time:173084ms step_avg:157.35ms step:1111/1480 train_time:173250ms step_avg:157.36ms step:1112/1480 train_time:173420ms step_avg:157.37ms step:1113/1480 train_time:173599ms step_avg:157.39ms step:1114/1480 train_time:173772ms step_avg:157.40ms step:1115/1480 train_time:173945ms step_avg:157.42ms step:1116/1480 train_time:174114ms step_avg:157.43ms step:1117/1480 train_time:174288ms step_avg:157.44ms step:1118/1480 train_time:174461ms step_avg:157.46ms step:1119/1480 train_time:174626ms step_avg:157.46ms step:1120/1480 train_time:174794ms step_avg:157.47ms step:1121/1480 train_time:174964ms step_avg:157.48ms step:1122/1480 train_time:175131ms step_avg:157.49ms step:1123/1480 train_time:175297ms step_avg:157.50ms step:1124/1480 train_time:175467ms step_avg:157.51ms step:1125/1480 train_time:175634ms step_avg:157.52ms step:1125/1480 val_loss:3.3885 train_time:175702ms step_avg:157.58ms step:1126/1480 train_time:175805ms step_avg:157.53ms step:1127/1480 train_time:175974ms step_avg:157.54ms step:1128/1480 train_time:176146ms step_avg:157.55ms step:1129/1480 train_time:176321ms step_avg:157.57ms step:1130/1480 train_time:176490ms step_avg:157.58ms step:1131/1480 train_time:176667ms step_avg:157.60ms step:1132/1480 train_time:176833ms step_avg:157.60ms step:1133/1480 train_time:177007ms step_avg:157.62ms step:1134/1480 train_time:177178ms step_avg:157.63ms step:1135/1480 train_time:177346ms step_avg:157.64ms step:1136/1480 train_time:177518ms step_avg:157.65ms step:1137/1480 train_time:177688ms step_avg:157.66ms step:1138/1480 train_time:177861ms step_avg:157.68ms step:1139/1480 train_time:178029ms step_avg:157.69ms step:1140/1480 
train_time:178197ms step_avg:157.70ms step:1141/1480 train_time:178368ms step_avg:157.71ms step:1142/1480 train_time:178538ms step_avg:157.72ms step:1143/1480 train_time:178708ms step_avg:157.73ms step:1144/1480 train_time:178877ms step_avg:157.74ms step:1145/1480 train_time:179044ms step_avg:157.75ms step:1146/1480 train_time:179216ms step_avg:157.76ms step:1147/1480 train_time:179386ms step_avg:157.77ms step:1148/1480 train_time:179555ms step_avg:157.78ms step:1149/1480 train_time:179727ms step_avg:157.79ms step:1150/1480 train_time:179895ms step_avg:157.80ms step:1151/1480 train_time:180067ms step_avg:157.81ms step:1152/1480 train_time:180239ms step_avg:157.83ms step:1153/1480 train_time:180411ms step_avg:157.84ms step:1154/1480 train_time:180579ms step_avg:157.85ms step:1155/1480 train_time:180750ms step_avg:157.86ms step:1156/1480 train_time:180928ms step_avg:157.88ms step:1157/1480 train_time:181099ms step_avg:157.89ms step:1158/1480 train_time:181266ms step_avg:157.90ms step:1159/1480 train_time:181434ms step_avg:157.91ms step:1160/1480 train_time:181601ms step_avg:157.91ms step:1161/1480 train_time:181769ms step_avg:157.92ms step:1162/1480 train_time:181939ms step_avg:157.93ms step:1163/1480 train_time:182109ms step_avg:157.94ms step:1164/1480 train_time:182278ms step_avg:157.95ms step:1165/1480 train_time:182443ms step_avg:157.96ms step:1166/1480 train_time:182613ms step_avg:157.97ms step:1167/1480 train_time:182782ms step_avg:157.98ms step:1168/1480 train_time:182949ms step_avg:157.99ms step:1169/1480 train_time:183119ms step_avg:158.00ms step:1170/1480 train_time:183288ms step_avg:158.01ms step:1171/1480 train_time:183455ms step_avg:158.01ms step:1172/1480 train_time:183623ms step_avg:158.02ms step:1173/1480 train_time:183793ms step_avg:158.03ms step:1174/1480 train_time:183977ms step_avg:158.06ms step:1175/1480 train_time:184149ms step_avg:158.07ms step:1176/1480 train_time:184322ms step_avg:158.08ms step:1177/1480 train_time:184498ms step_avg:158.10ms 
step:1178/1480 train_time:184665ms step_avg:158.10ms step:1179/1480 train_time:184831ms step_avg:158.11ms step:1180/1480 train_time:185012ms step_avg:158.13ms step:1181/1480 train_time:185183ms step_avg:158.14ms step:1182/1480 train_time:185351ms step_avg:158.15ms step:1183/1480 train_time:185522ms step_avg:158.16ms step:1184/1480 train_time:185689ms step_avg:158.17ms step:1185/1480 train_time:185862ms step_avg:158.18ms step:1186/1480 train_time:186033ms step_avg:158.19ms step:1187/1480 train_time:186218ms step_avg:158.21ms step:1188/1480 train_time:186386ms step_avg:158.22ms step:1189/1480 train_time:186559ms step_avg:158.23ms step:1190/1480 train_time:186726ms step_avg:158.24ms step:1191/1480 train_time:186897ms step_avg:158.25ms step:1192/1480 train_time:187064ms step_avg:158.26ms step:1193/1480 train_time:187231ms step_avg:158.27ms step:1194/1480 train_time:187401ms step_avg:158.28ms step:1195/1480 train_time:187574ms step_avg:158.29ms step:1196/1480 train_time:187759ms step_avg:158.31ms step:1197/1480 train_time:187930ms step_avg:158.32ms step:1198/1480 train_time:188114ms step_avg:158.34ms step:1199/1480 train_time:188285ms step_avg:158.36ms step:1200/1480 train_time:188454ms step_avg:158.36ms step:1201/1480 train_time:188622ms step_avg:158.37ms step:1202/1480 train_time:188803ms step_avg:158.39ms step:1203/1480 train_time:188981ms step_avg:158.41ms step:1204/1480 train_time:189155ms step_avg:158.42ms step:1205/1480 train_time:189324ms step_avg:158.43ms step:1206/1480 train_time:189491ms step_avg:158.44ms step:1207/1480 train_time:189662ms step_avg:158.45ms step:1208/1480 train_time:189829ms step_avg:158.45ms step:1209/1480 train_time:190003ms step_avg:158.47ms step:1210/1480 train_time:190178ms step_avg:158.48ms step:1211/1480 train_time:190353ms step_avg:158.50ms step:1212/1480 train_time:190526ms step_avg:158.51ms step:1213/1480 train_time:190700ms step_avg:158.52ms step:1214/1480 train_time:190878ms step_avg:158.54ms step:1215/1480 train_time:191051ms 
step_avg:158.55ms step:1216/1480 train_time:191220ms step_avg:158.56ms step:1217/1480 train_time:191393ms step_avg:158.57ms step:1218/1480 train_time:191563ms step_avg:158.58ms step:1219/1480 train_time:191742ms step_avg:158.60ms step:1220/1480 train_time:191910ms step_avg:158.60ms step:1221/1480 train_time:192080ms step_avg:158.61ms step:1222/1480 train_time:192246ms step_avg:158.62ms step:1223/1480 train_time:192418ms step_avg:158.63ms step:1224/1480 train_time:192595ms step_avg:158.64ms step:1225/1480 train_time:192766ms step_avg:158.66ms step:1226/1480 train_time:192940ms step_avg:158.67ms step:1227/1480 train_time:193112ms step_avg:158.68ms step:1228/1480 train_time:193282ms step_avg:158.69ms step:1229/1480 train_time:193455ms step_avg:158.70ms step:1230/1480 train_time:193635ms step_avg:158.72ms step:1231/1480 train_time:193810ms step_avg:158.73ms step:1232/1480 train_time:193985ms step_avg:158.74ms step:1233/1480 train_time:194155ms step_avg:158.75ms step:1234/1480 train_time:194326ms step_avg:158.76ms step:1235/1480 train_time:194501ms step_avg:158.78ms step:1236/1480 train_time:194669ms step_avg:158.78ms step:1237/1480 train_time:194840ms step_avg:158.79ms step:1238/1480 train_time:195025ms step_avg:158.81ms step:1239/1480 train_time:195196ms step_avg:158.83ms step:1240/1480 train_time:195366ms step_avg:158.83ms step:1241/1480 train_time:195540ms step_avg:158.85ms step:1242/1480 train_time:195709ms step_avg:158.86ms step:1243/1480 train_time:195884ms step_avg:158.87ms step:1244/1480 train_time:196049ms step_avg:158.87ms step:1245/1480 train_time:196219ms step_avg:158.88ms step:1246/1480 train_time:196388ms step_avg:158.89ms step:1247/1480 train_time:196558ms step_avg:158.90ms step:1248/1480 train_time:196727ms step_avg:158.91ms step:1249/1480 train_time:196894ms step_avg:158.91ms step:1250/1480 train_time:197063ms step_avg:158.92ms step:1250/1480 val_loss:3.3386 train_time:197135ms step_avg:158.98ms step:1251/1480 train_time:197244ms step_avg:158.94ms 
step:1252/1480 train_time:197413ms step_avg:158.95ms step:1253/1480 train_time:197581ms step_avg:158.96ms step:1254/1480 train_time:197752ms step_avg:158.96ms step:1255/1480 train_time:197939ms step_avg:158.99ms step:1256/1480 train_time:198113ms step_avg:159.00ms step:1257/1480 train_time:198282ms step_avg:159.01ms step:1258/1480 train_time:198458ms step_avg:159.02ms step:1259/1480 train_time:198629ms step_avg:159.03ms step:1260/1480 train_time:198797ms step_avg:159.04ms step:1261/1480 train_time:198969ms step_avg:159.05ms step:1262/1480 train_time:199144ms step_avg:159.06ms step:1263/1480 train_time:199318ms step_avg:159.07ms step:1264/1480 train_time:199485ms step_avg:159.08ms step:1265/1480 train_time:199653ms step_avg:159.09ms step:1266/1480 train_time:199825ms step_avg:159.10ms step:1267/1480 train_time:199996ms step_avg:159.11ms step:1268/1480 train_time:200166ms step_avg:159.11ms step:1269/1480 train_time:200342ms step_avg:159.13ms step:1270/1480 train_time:200512ms step_avg:159.14ms step:1271/1480 train_time:200682ms step_avg:159.14ms step:1272/1480 train_time:200847ms step_avg:159.15ms step:1273/1480 train_time:201018ms step_avg:159.16ms step:1274/1480 train_time:201191ms step_avg:159.17ms step:1275/1480 train_time:201359ms step_avg:159.18ms step:1276/1480 train_time:201524ms step_avg:159.18ms step:1277/1480 train_time:201697ms step_avg:159.19ms step:1278/1480 train_time:201865ms step_avg:159.20ms step:1279/1480 train_time:202037ms step_avg:159.21ms step:1280/1480 train_time:202219ms step_avg:159.23ms step:1281/1480 train_time:202386ms step_avg:159.23ms step:1282/1480 train_time:202551ms step_avg:159.24ms step:1283/1480 train_time:202723ms step_avg:159.25ms step:1284/1480 train_time:202893ms step_avg:159.26ms step:1285/1480 train_time:203062ms step_avg:159.26ms step:1286/1480 train_time:203232ms step_avg:159.27ms step:1287/1480 train_time:203404ms step_avg:159.28ms step:1288/1480 train_time:203575ms step_avg:159.29ms step:1289/1480 train_time:203760ms 
step_avg:159.31ms step:1290/1480 train_time:203939ms step_avg:159.33ms step:1291/1480 train_time:204112ms step_avg:159.34ms step:1292/1480 train_time:204285ms step_avg:159.35ms step:1293/1480 train_time:204462ms step_avg:159.36ms step:1294/1480 train_time:204635ms step_avg:159.37ms step:1295/1480 train_time:204807ms step_avg:159.38ms step:1296/1480 train_time:204981ms step_avg:159.39ms step:1297/1480 train_time:205153ms step_avg:159.40ms step:1298/1480 train_time:205324ms step_avg:159.41ms step:1299/1480 train_time:205494ms step_avg:159.42ms step:1300/1480 train_time:205661ms step_avg:159.43ms step:1301/1480 train_time:205830ms step_avg:159.43ms step:1302/1480 train_time:206005ms step_avg:159.45ms step:1303/1480 train_time:206181ms step_avg:159.46ms step:1304/1480 train_time:206355ms step_avg:159.47ms step:1305/1480 train_time:206524ms step_avg:159.48ms step:1306/1480 train_time:206700ms step_avg:159.49ms step:1307/1480 train_time:206867ms step_avg:159.50ms step:1308/1480 train_time:207038ms step_avg:159.51ms step:1309/1480 train_time:207209ms step_avg:159.51ms step:1310/1480 train_time:207378ms step_avg:159.52ms step:1311/1480 train_time:207547ms step_avg:159.53ms step:1312/1480 train_time:207720ms step_avg:159.54ms step:1313/1480 train_time:207887ms step_avg:159.54ms step:1314/1480 train_time:208060ms step_avg:159.56ms step:1315/1480 train_time:208230ms step_avg:159.56ms step:1316/1480 train_time:208399ms step_avg:159.57ms step:1317/1480 train_time:208569ms step_avg:159.58ms step:1318/1480 train_time:208748ms step_avg:159.59ms step:1319/1480 train_time:208922ms step_avg:159.60ms step:1320/1480 train_time:209100ms step_avg:159.62ms step:1321/1480 train_time:209272ms step_avg:159.63ms step:1322/1480 train_time:209455ms step_avg:159.65ms step:1323/1480 train_time:209627ms step_avg:159.65ms step:1324/1480 train_time:209802ms step_avg:159.67ms step:1325/1480 train_time:209982ms step_avg:159.68ms step:1326/1480 train_time:210158ms step_avg:159.69ms step:1327/1480 
train_time:210328ms step_avg:159.70ms step:1328/1480 train_time:210499ms step_avg:159.71ms step:1329/1480 train_time:210695ms step_avg:159.74ms step:1330/1480 train_time:210876ms step_avg:159.75ms step:1331/1480 train_time:211046ms step_avg:159.76ms step:1332/1480 train_time:211222ms step_avg:159.77ms step:1333/1480 train_time:211396ms step_avg:159.79ms step:1334/1480 train_time:211566ms step_avg:159.79ms step:1335/1480 train_time:211735ms step_avg:159.80ms step:1336/1480 train_time:211919ms step_avg:159.82ms step:1337/1480 train_time:212093ms step_avg:159.83ms step:1338/1480 train_time:212266ms step_avg:159.84ms step:1339/1480 train_time:212441ms step_avg:159.85ms step:1340/1480 train_time:212613ms step_avg:159.86ms step:1341/1480 train_time:212780ms step_avg:159.86ms step:1342/1480 train_time:212954ms step_avg:159.88ms step:1343/1480 train_time:213123ms step_avg:159.88ms step:1344/1480 train_time:213296ms step_avg:159.89ms step:1345/1480 train_time:213476ms step_avg:159.91ms step:1346/1480 train_time:213644ms step_avg:159.91ms step:1347/1480 train_time:213813ms step_avg:159.92ms step:1348/1480 train_time:213982ms step_avg:159.93ms step:1349/1480 train_time:214151ms step_avg:159.93ms step:1350/1480 train_time:214327ms step_avg:159.95ms step:1351/1480 train_time:214497ms step_avg:159.95ms step:1352/1480 train_time:214669ms step_avg:159.96ms step:1353/1480 train_time:214845ms step_avg:159.97ms step:1354/1480 train_time:215017ms step_avg:159.98ms step:1355/1480 train_time:215185ms step_avg:159.99ms step:1356/1480 train_time:215358ms step_avg:160.00ms step:1357/1480 train_time:215532ms step_avg:160.01ms step:1358/1480 train_time:215704ms step_avg:160.02ms step:1359/1480 train_time:215876ms step_avg:160.03ms step:1360/1480 train_time:216050ms step_avg:160.04ms step:1361/1480 train_time:216228ms step_avg:160.05ms step:1362/1480 train_time:216404ms step_avg:160.06ms step:1363/1480 train_time:216583ms step_avg:160.08ms step:1364/1480 train_time:216752ms step_avg:160.08ms 
step:1365/1480 train_time:216920ms step_avg:160.09ms step:1366/1480 train_time:217091ms step_avg:160.10ms step:1367/1480 train_time:217262ms step_avg:160.10ms step:1368/1480 train_time:217437ms step_avg:160.12ms step:1369/1480 train_time:217617ms step_avg:160.13ms step:1370/1480 train_time:217797ms step_avg:160.14ms step:1371/1480 train_time:217968ms step_avg:160.15ms step:1372/1480 train_time:218145ms step_avg:160.17ms step:1373/1480 train_time:218314ms step_avg:160.17ms step:1374/1480 train_time:218489ms step_avg:160.18ms step:1375/1480 train_time:218661ms step_avg:160.19ms step:1375/1480 val_loss:3.2996 train_time:218728ms step_avg:160.24ms step:1376/1480 train_time:218834ms step_avg:160.20ms step:1377/1480 train_time:219007ms step_avg:160.21ms step:1378/1480 train_time:219176ms step_avg:160.22ms step:1379/1480 train_time:219351ms step_avg:160.23ms step:1380/1480 train_time:219524ms step_avg:160.24ms step:1381/1480 train_time:219704ms step_avg:160.25ms step:1382/1480 train_time:219875ms step_avg:160.26ms step:1383/1480 train_time:220047ms step_avg:160.27ms step:1384/1480 train_time:220224ms step_avg:160.28ms step:1385/1480 train_time:220390ms step_avg:160.28ms step:1386/1480 train_time:220561ms step_avg:160.29ms step:1387/1480 train_time:220731ms step_avg:160.30ms step:1388/1480 train_time:220901ms step_avg:160.31ms step:1389/1480 train_time:221075ms step_avg:160.32ms step:1390/1480 train_time:221243ms step_avg:160.32ms step:1391/1480 train_time:221412ms step_avg:160.33ms step:1392/1480 train_time:221584ms step_avg:160.34ms step:1393/1480 train_time:221757ms step_avg:160.34ms step:1394/1480 train_time:221928ms step_avg:160.35ms step:1395/1480 train_time:222097ms step_avg:160.36ms step:1396/1480 train_time:222265ms step_avg:160.36ms step:1397/1480 train_time:222431ms step_avg:160.37ms step:1398/1480 train_time:222598ms step_avg:160.37ms step:1399/1480 train_time:222767ms step_avg:160.38ms step:1400/1480 train_time:222945ms step_avg:160.39ms step:1401/1480 
train_time:223110ms step_avg:160.40ms step:1402/1480 train_time:223281ms step_avg:160.40ms step:1403/1480 train_time:223460ms step_avg:160.42ms step:1404/1480 train_time:223631ms step_avg:160.42ms step:1405/1480 train_time:223805ms step_avg:160.43ms step:1406/1480 train_time:223980ms step_avg:160.44ms step:1407/1480 train_time:224149ms step_avg:160.45ms step:1408/1480 train_time:224318ms step_avg:160.46ms step:1409/1480 train_time:224501ms step_avg:160.47ms step:1410/1480 train_time:224669ms step_avg:160.48ms step:1411/1480 train_time:224837ms step_avg:160.48ms step:1412/1480 train_time:225007ms step_avg:160.49ms step:1413/1480 train_time:225178ms step_avg:160.50ms step:1414/1480 train_time:225350ms step_avg:160.51ms step:1415/1480 train_time:225525ms step_avg:160.52ms step:1416/1480 train_time:225712ms step_avg:160.53ms step:1417/1480 train_time:225886ms step_avg:160.54ms step:1418/1480 train_time:226058ms step_avg:160.55ms step:1419/1480 train_time:226231ms step_avg:160.56ms step:1420/1480 train_time:226405ms step_avg:160.57ms step:1421/1480 train_time:226580ms step_avg:160.58ms step:1422/1480 train_time:226751ms step_avg:160.59ms step:1423/1480 train_time:226921ms step_avg:160.59ms step:1424/1480 train_time:227098ms step_avg:160.61ms step:1425/1480 train_time:227275ms step_avg:160.62ms step:1426/1480 train_time:227447ms step_avg:160.63ms step:1427/1480 train_time:227622ms step_avg:160.64ms step:1428/1480 train_time:227791ms step_avg:160.64ms step:1429/1480 train_time:227960ms step_avg:160.65ms step:1430/1480 train_time:228132ms step_avg:160.66ms step:1431/1480 train_time:228307ms step_avg:160.67ms step:1432/1480 train_time:228484ms step_avg:160.68ms step:1433/1480 train_time:228664ms step_avg:160.69ms step:1434/1480 train_time:228845ms step_avg:160.71ms step:1435/1480 train_time:229021ms step_avg:160.72ms step:1436/1480 train_time:229195ms step_avg:160.73ms step:1437/1480 train_time:229366ms step_avg:160.73ms step:1438/1480 train_time:229535ms step_avg:160.74ms 
step:1439/1480 train_time:229708ms step_avg:160.75ms step:1440/1480 train_time:229877ms step_avg:160.75ms step:1441/1480 train_time:230048ms step_avg:160.76ms step:1442/1480 train_time:230225ms step_avg:160.77ms step:1443/1480 train_time:230413ms step_avg:160.79ms step:1444/1480 train_time:230585ms step_avg:160.80ms step:1445/1480 train_time:230757ms step_avg:160.81ms step:1446/1480 train_time:230932ms step_avg:160.82ms step:1447/1480 train_time:231109ms step_avg:160.83ms step:1448/1480 train_time:231281ms step_avg:160.84ms step:1449/1480 train_time:231456ms step_avg:160.85ms step:1450/1480 train_time:231628ms step_avg:160.85ms step:1451/1480 train_time:231798ms step_avg:160.86ms step:1452/1480 train_time:231971ms step_avg:160.87ms step:1453/1480 train_time:232140ms step_avg:160.87ms step:1454/1480 train_time:232311ms step_avg:160.88ms step:1455/1480 train_time:232490ms step_avg:160.89ms step:1456/1480 train_time:232663ms step_avg:160.90ms step:1457/1480 train_time:232833ms step_avg:160.91ms step:1458/1480 train_time:233004ms step_avg:160.91ms step:1459/1480 train_time:233182ms step_avg:160.93ms step:1460/1480 train_time:233355ms step_avg:160.93ms step:1461/1480 train_time:233528ms step_avg:160.94ms step:1462/1480 train_time:233698ms step_avg:160.95ms step:1463/1480 train_time:233875ms step_avg:160.96ms step:1464/1480 train_time:234049ms step_avg:160.97ms step:1465/1480 train_time:234222ms step_avg:160.98ms step:1466/1480 train_time:234392ms step_avg:160.98ms step:1467/1480 train_time:234567ms step_avg:160.99ms step:1468/1480 train_time:234739ms step_avg:161.00ms step:1469/1480 train_time:234912ms step_avg:161.01ms step:1470/1480 train_time:235092ms step_avg:161.02ms step:1471/1480 train_time:235279ms step_avg:161.04ms step:1472/1480 train_time:235460ms step_avg:161.05ms step:1473/1480 train_time:235631ms step_avg:161.06ms step:1474/1480 train_time:235808ms step_avg:161.07ms step:1475/1480 train_time:235988ms step_avg:161.08ms step:1476/1480 train_time:236160ms 
step_avg:161.09ms step:1477/1480 train_time:236344ms step_avg:161.11ms step:1478/1480 train_time:236526ms step_avg:161.12ms step:1479/1480 train_time:236700ms step_avg:161.13ms step:1480/1480 train_time:236874ms step_avg:161.14ms step:1480/1480 val_loss:3.2807 train_time:236944ms step_avg:161.19ms