import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; the cos/sin tables are cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): dim 1 is the sequence, dim 3 is rotated
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # rebuild the tables only when the sequence length changes
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # mix the projected values with the token value embedding `vi`
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: blend with the embedding x0, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # learnable mix of the running activation and x0; initialised to pass x through
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """GPT-2 style model with U-net skip connections and token value embeddings."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss of `target` given the 1D token sequence `idx`.

        Attention is causal, restricted to tokens of the same document (documents are
        delimited by token id 50256) and to a window of `sliding_window` tokens.
        The sequence length must be a multiple of BLOCK_SIZE (asserted).
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        S = len(idx)
        # the per-block document bounds below reshape the sequence into whole blocks
        assert S % BLOCK_SIZE == 0, "sequence length must be a multiple of BLOCK_SIZE"
        # running document index: incremented at every token equal to 50256
        docs = (idx == 50256).cumsum(0)
        # lowest / highest document index that occurs inside each block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level mask: a kv block is kept if it may contain any allowed token pair
            # (built on the input's own device instead of a hard-coded "cuda")
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device=idx.device)
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # the document ranges of the two blocks overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the indices of the kept blocks first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)

        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding chunk per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # skips are consumed last-in-first-out
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a data shard and return its claimed token count."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256-int32 header into pinned memory."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Cycles through the data shards matching a glob pattern; each call to `next_batch`
    returns this rank's T-token slice of a (num_processes * T)-token global batch.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one global batch plus the shifted target token
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # the log file lives directly under logs/, so that directory must exist before
    # the file is opened (the per-run directory above is only needed for checkpoints)
    os.makedirs('logs', exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """
    Append `s` to the log file on the master process; other ranks do nothing.

    Arguments:
        s: The string to log; a newline is appended.
        logonly: If True, write to the log file only and skip printing to the console.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop below runs a single forward/backward per step, so accumulation is unsupported
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 13:39:39 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 130W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 118W / 700W | 533MiB / 81559MiB | 2% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 38C P0 83W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 119W / 700W | 119MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22890ms step_avg:nanms step:2/1480 train_time:22993ms step_avg:nanms step:3/1480 train_time:23132ms step_avg:nanms step:4/1480 train_time:23271ms step_avg:nanms step:5/1480 train_time:23412ms step_avg:nanms step:6/1480 train_time:23554ms step_avg:nanms step:7/1480 train_time:23694ms step_avg:nanms step:8/1480 train_time:23835ms step_avg:nanms step:9/1480 train_time:23982ms step_avg:nanms step:10/1480 train_time:24126ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.58ms step:14/1480 train_time:566ms step_avg:141.51ms step:15/1480 train_time:707ms step_avg:141.48ms step:16/1480 train_time:851ms step_avg:141.81ms step:17/1480 train_time:995ms step_avg:142.07ms step:18/1480 train_time:1138ms step_avg:142.27ms step:19/1480 train_time:1282ms step_avg:142.42ms step:20/1480 train_time:1424ms step_avg:142.43ms step:21/1480 train_time:1566ms step_avg:142.38ms step:22/1480 train_time:1707ms step_avg:142.22ms step:23/1480 train_time:1849ms step_avg:142.20ms step:24/1480 train_time:1993ms step_avg:142.37ms step:25/1480 train_time:2138ms step_avg:142.51ms step:26/1480 train_time:2282ms step_avg:142.61ms step:27/1480 train_time:2424ms step_avg:142.62ms step:28/1480 train_time:2566ms step_avg:142.56ms step:29/1480 
train_time:2708ms step_avg:142.53ms step:30/1480 train_time:2850ms step_avg:142.51ms step:31/1480 train_time:2996ms step_avg:142.65ms step:32/1480 train_time:3140ms step_avg:142.73ms step:33/1480 train_time:3283ms step_avg:142.74ms step:34/1480 train_time:3425ms step_avg:142.71ms step:35/1480 train_time:3566ms step_avg:142.66ms step:36/1480 train_time:3707ms step_avg:142.56ms step:37/1480 train_time:3850ms step_avg:142.58ms step:38/1480 train_time:3993ms step_avg:142.62ms step:39/1480 train_time:4139ms step_avg:142.71ms step:40/1480 train_time:4283ms step_avg:142.75ms step:41/1480 train_time:4425ms step_avg:142.74ms step:42/1480 train_time:4566ms step_avg:142.69ms step:43/1480 train_time:4707ms step_avg:142.62ms step:44/1480 train_time:4850ms step_avg:142.65ms step:45/1480 train_time:4994ms step_avg:142.69ms step:46/1480 train_time:5138ms step_avg:142.73ms step:47/1480 train_time:5283ms step_avg:142.78ms step:48/1480 train_time:5425ms step_avg:142.75ms step:49/1480 train_time:5566ms step_avg:142.72ms step:50/1480 train_time:5707ms step_avg:142.67ms step:51/1480 train_time:5849ms step_avg:142.66ms step:52/1480 train_time:5991ms step_avg:142.65ms step:53/1480 train_time:6135ms step_avg:142.68ms step:54/1480 train_time:6279ms step_avg:142.70ms step:55/1480 train_time:6421ms step_avg:142.70ms step:56/1480 train_time:6564ms step_avg:142.69ms step:57/1480 train_time:6705ms step_avg:142.66ms step:58/1480 train_time:6846ms step_avg:142.62ms step:59/1480 train_time:6988ms step_avg:142.62ms step:60/1480 train_time:7133ms step_avg:142.67ms step:61/1480 train_time:7278ms step_avg:142.71ms step:62/1480 train_time:7422ms step_avg:142.72ms step:63/1480 train_time:7564ms step_avg:142.72ms step:64/1480 train_time:7705ms step_avg:142.69ms step:65/1480 train_time:7845ms step_avg:142.64ms step:66/1480 train_time:7987ms step_avg:142.63ms step:67/1480 train_time:8132ms step_avg:142.66ms step:68/1480 train_time:8275ms step_avg:142.67ms step:69/1480 train_time:8418ms step_avg:142.69ms 
step:70/1480 train_time:8562ms step_avg:142.70ms step:71/1480 train_time:8704ms step_avg:142.69ms step:72/1480 train_time:8845ms step_avg:142.66ms step:73/1480 train_time:8986ms step_avg:142.63ms step:74/1480 train_time:9127ms step_avg:142.62ms step:75/1480 train_time:9271ms step_avg:142.64ms step:76/1480 train_time:9415ms step_avg:142.65ms step:77/1480 train_time:9559ms step_avg:142.67ms step:78/1480 train_time:9701ms step_avg:142.67ms step:79/1480 train_time:9842ms step_avg:142.64ms step:80/1480 train_time:9984ms step_avg:142.63ms step:81/1480 train_time:10125ms step_avg:142.60ms step:82/1480 train_time:10266ms step_avg:142.59ms step:83/1480 train_time:10408ms step_avg:142.57ms step:84/1480 train_time:10553ms step_avg:142.60ms step:85/1480 train_time:10697ms step_avg:142.63ms step:86/1480 train_time:10840ms step_avg:142.64ms step:87/1480 train_time:10983ms step_avg:142.64ms step:88/1480 train_time:11124ms step_avg:142.62ms step:89/1480 train_time:11268ms step_avg:142.63ms step:90/1480 train_time:11409ms step_avg:142.61ms step:91/1480 train_time:11549ms step_avg:142.59ms step:92/1480 train_time:11691ms step_avg:142.58ms step:93/1480 train_time:11835ms step_avg:142.59ms step:94/1480 train_time:11979ms step_avg:142.60ms step:95/1480 train_time:12122ms step_avg:142.61ms step:96/1480 train_time:12264ms step_avg:142.60ms step:97/1480 train_time:12405ms step_avg:142.59ms step:98/1480 train_time:12546ms step_avg:142.57ms step:99/1480 train_time:12689ms step_avg:142.57ms step:100/1480 train_time:12831ms step_avg:142.57ms step:101/1480 train_time:12976ms step_avg:142.59ms step:102/1480 train_time:13120ms step_avg:142.61ms step:103/1480 train_time:13263ms step_avg:142.61ms step:104/1480 train_time:13404ms step_avg:142.60ms step:105/1480 train_time:13545ms step_avg:142.58ms step:106/1480 train_time:13688ms step_avg:142.58ms step:107/1480 train_time:13831ms step_avg:142.58ms step:108/1480 train_time:13975ms step_avg:142.60ms step:109/1480 train_time:14119ms step_avg:142.61ms 
step:110/1480 train_time:14263ms step_avg:142.63ms step:111/1480 train_time:14407ms step_avg:142.64ms step:112/1480 train_time:14554ms step_avg:142.69ms step:113/1480 train_time:14702ms step_avg:142.74ms step:114/1480 train_time:14848ms step_avg:142.77ms step:115/1480 train_time:14994ms step_avg:142.80ms step:116/1480 train_time:15142ms step_avg:142.85ms step:117/1480 train_time:15289ms step_avg:142.89ms step:118/1480 train_time:15436ms step_avg:142.92ms step:119/1480 train_time:15583ms step_avg:142.96ms step:120/1480 train_time:15729ms step_avg:142.99ms step:121/1480 train_time:15876ms step_avg:143.03ms step:122/1480 train_time:16023ms step_avg:143.07ms step:123/1480 train_time:16170ms step_avg:143.10ms step:124/1480 train_time:16318ms step_avg:143.14ms step:125/1480 train_time:16465ms step_avg:143.17ms step:125/1480 val_loss:4.4229 train_time:16521ms step_avg:143.66ms step:126/1480 train_time:16618ms step_avg:143.26ms step:127/1480 train_time:16766ms step_avg:143.30ms step:128/1480 train_time:16914ms step_avg:143.34ms step:129/1480 train_time:17059ms step_avg:143.35ms step:130/1480 train_time:17204ms step_avg:143.36ms step:131/1480 train_time:17349ms step_avg:143.38ms step:132/1480 train_time:17497ms step_avg:143.42ms step:133/1480 train_time:17644ms step_avg:143.44ms step:134/1480 train_time:17792ms step_avg:143.49ms step:135/1480 train_time:17939ms step_avg:143.52ms step:136/1480 train_time:18085ms step_avg:143.53ms step:137/1480 train_time:18233ms step_avg:143.57ms step:138/1480 train_time:18379ms step_avg:143.59ms step:139/1480 train_time:18527ms step_avg:143.62ms step:140/1480 train_time:18675ms step_avg:143.66ms step:141/1480 train_time:18822ms step_avg:143.68ms step:142/1480 train_time:18969ms step_avg:143.70ms step:143/1480 train_time:19116ms step_avg:143.73ms step:144/1480 train_time:19262ms step_avg:143.74ms step:145/1480 train_time:19407ms step_avg:143.75ms step:146/1480 train_time:19554ms step_avg:143.78ms step:147/1480 train_time:19701ms 
step_avg:143.80ms step:148/1480 train_time:19848ms step_avg:143.83ms step:149/1480 train_time:19996ms step_avg:143.86ms step:150/1480 train_time:20142ms step_avg:143.87ms step:151/1480 train_time:20290ms step_avg:143.90ms step:152/1480 train_time:20436ms step_avg:143.92ms step:153/1480 train_time:20581ms step_avg:143.93ms step:154/1480 train_time:20729ms step_avg:143.95ms step:155/1480 train_time:20876ms step_avg:143.97ms step:156/1480 train_time:21022ms step_avg:143.99ms step:157/1480 train_time:21170ms step_avg:144.02ms step:158/1480 train_time:21317ms step_avg:144.03ms step:159/1480 train_time:21463ms step_avg:144.05ms step:160/1480 train_time:21610ms step_avg:144.07ms step:161/1480 train_time:21756ms step_avg:144.08ms step:162/1480 train_time:21904ms step_avg:144.10ms step:163/1480 train_time:22051ms step_avg:144.13ms step:164/1480 train_time:22198ms step_avg:144.14ms step:165/1480 train_time:22343ms step_avg:144.15ms step:166/1480 train_time:22491ms step_avg:144.17ms step:167/1480 train_time:22638ms step_avg:144.19ms step:168/1480 train_time:22785ms step_avg:144.21ms step:169/1480 train_time:22934ms step_avg:144.24ms step:170/1480 train_time:23080ms step_avg:144.25ms step:171/1480 train_time:23226ms step_avg:144.26ms step:172/1480 train_time:23375ms step_avg:144.29ms step:173/1480 train_time:23521ms step_avg:144.30ms step:174/1480 train_time:23667ms step_avg:144.31ms step:175/1480 train_time:23815ms step_avg:144.33ms step:176/1480 train_time:23963ms step_avg:144.35ms step:177/1480 train_time:24111ms step_avg:144.38ms step:178/1480 train_time:24258ms step_avg:144.39ms step:179/1480 train_time:24404ms step_avg:144.41ms step:180/1480 train_time:24550ms step_avg:144.41ms step:181/1480 train_time:24697ms step_avg:144.43ms step:182/1480 train_time:24843ms step_avg:144.44ms step:183/1480 train_time:24990ms step_avg:144.45ms step:184/1480 train_time:25138ms step_avg:144.47ms step:185/1480 train_time:25283ms step_avg:144.48ms step:186/1480 train_time:25430ms 
step_avg:144.49ms step:187/1480 train_time:25577ms step_avg:144.50ms step:188/1480 train_time:25723ms step_avg:144.51ms step:189/1480 train_time:25869ms step_avg:144.52ms step:190/1480 train_time:26016ms step_avg:144.54ms step:191/1480 train_time:26162ms step_avg:144.54ms step:192/1480 train_time:26308ms step_avg:144.55ms step:193/1480 train_time:26455ms step_avg:144.56ms step:194/1480 train_time:26603ms step_avg:144.58ms step:195/1480 train_time:26749ms step_avg:144.59ms step:196/1480 train_time:26896ms step_avg:144.60ms step:197/1480 train_time:27043ms step_avg:144.61ms step:198/1480 train_time:27190ms step_avg:144.63ms step:199/1480 train_time:27338ms step_avg:144.64ms step:200/1480 train_time:27485ms step_avg:144.66ms step:201/1480 train_time:27634ms step_avg:144.68ms step:202/1480 train_time:27781ms step_avg:144.69ms step:203/1480 train_time:27927ms step_avg:144.70ms step:204/1480 train_time:28075ms step_avg:144.72ms step:205/1480 train_time:28223ms step_avg:144.73ms step:206/1480 train_time:28370ms step_avg:144.74ms step:207/1480 train_time:28516ms step_avg:144.75ms step:208/1480 train_time:28663ms step_avg:144.76ms step:209/1480 train_time:28810ms step_avg:144.77ms step:210/1480 train_time:28957ms step_avg:144.79ms step:211/1480 train_time:29103ms step_avg:144.79ms step:212/1480 train_time:29249ms step_avg:144.80ms step:213/1480 train_time:29396ms step_avg:144.81ms step:214/1480 train_time:29542ms step_avg:144.82ms step:215/1480 train_time:29690ms step_avg:144.83ms step:216/1480 train_time:29836ms step_avg:144.84ms step:217/1480 train_time:29984ms step_avg:144.85ms step:218/1480 train_time:30133ms step_avg:144.87ms step:219/1480 train_time:30280ms step_avg:144.88ms step:220/1480 train_time:30428ms step_avg:144.89ms step:221/1480 train_time:30577ms step_avg:144.91ms step:222/1480 train_time:30728ms step_avg:144.94ms step:223/1480 train_time:30878ms step_avg:144.97ms step:224/1480 train_time:31029ms step_avg:145.00ms step:225/1480 train_time:31179ms 
step_avg:145.02ms step:226/1480 train_time:31329ms step_avg:145.04ms step:227/1480 train_time:31480ms step_avg:145.07ms step:228/1480 train_time:31631ms step_avg:145.10ms step:229/1480 train_time:31783ms step_avg:145.13ms step:230/1480 train_time:31935ms step_avg:145.16ms step:231/1480 train_time:32084ms step_avg:145.18ms step:232/1480 train_time:32235ms step_avg:145.20ms step:233/1480 train_time:32384ms step_avg:145.22ms step:234/1480 train_time:32536ms step_avg:145.25ms step:235/1480 train_time:32686ms step_avg:145.27ms step:236/1480 train_time:32837ms step_avg:145.30ms step:237/1480 train_time:32987ms step_avg:145.32ms step:238/1480 train_time:33138ms step_avg:145.34ms step:239/1480 train_time:33289ms step_avg:145.37ms step:240/1480 train_time:33440ms step_avg:145.39ms step:241/1480 train_time:33590ms step_avg:145.41ms step:242/1480 train_time:33742ms step_avg:145.44ms step:243/1480 train_time:33893ms step_avg:145.46ms step:244/1480 train_time:34042ms step_avg:145.48ms step:245/1480 train_time:34193ms step_avg:145.50ms step:246/1480 train_time:34343ms step_avg:145.52ms step:247/1480 train_time:34495ms step_avg:145.55ms step:248/1480 train_time:34646ms step_avg:145.57ms step:249/1480 train_time:34797ms step_avg:145.59ms step:250/1480 train_time:34947ms step_avg:145.61ms step:250/1480 val_loss:3.9999 train_time:35006ms step_avg:145.86ms step:251/1480 train_time:35104ms step_avg:145.66ms step:252/1480 train_time:35254ms step_avg:145.68ms step:253/1480 train_time:35403ms step_avg:145.69ms step:254/1480 train_time:35553ms step_avg:145.71ms step:255/1480 train_time:35701ms step_avg:145.72ms step:256/1480 train_time:35851ms step_avg:145.74ms step:257/1480 train_time:36001ms step_avg:145.75ms step:258/1480 train_time:36154ms step_avg:145.78ms step:259/1480 train_time:36305ms step_avg:145.80ms step:260/1480 train_time:36457ms step_avg:145.83ms step:261/1480 train_time:36608ms step_avg:145.85ms step:262/1480 train_time:36757ms step_avg:145.86ms step:263/1480 
train_time:36908ms step_avg:145.88ms step:264/1480 train_time:37058ms step_avg:145.90ms step:265/1480 train_time:37210ms step_avg:145.92ms step:266/1480 train_time:37360ms step_avg:145.94ms step:267/1480 train_time:37512ms step_avg:145.96ms step:268/1480 train_time:37663ms step_avg:145.98ms step:269/1480 train_time:37813ms step_avg:146.00ms step:270/1480 train_time:37963ms step_avg:146.01ms step:271/1480 train_time:38113ms step_avg:146.03ms step:272/1480 train_time:38264ms step_avg:146.05ms step:273/1480 train_time:38415ms step_avg:146.06ms step:274/1480 train_time:38567ms step_avg:146.09ms step:275/1480 train_time:38718ms step_avg:146.10ms step:276/1480 train_time:38868ms step_avg:146.12ms step:277/1480 train_time:39017ms step_avg:146.13ms step:278/1480 train_time:39166ms step_avg:146.14ms step:279/1480 train_time:39317ms step_avg:146.16ms step:280/1480 train_time:39469ms step_avg:146.18ms step:281/1480 train_time:39619ms step_avg:146.19ms step:282/1480 train_time:39771ms step_avg:146.22ms step:283/1480 train_time:39922ms step_avg:146.24ms step:284/1480 train_time:40073ms step_avg:146.25ms step:285/1480 train_time:40224ms step_avg:146.27ms step:286/1480 train_time:40375ms step_avg:146.29ms step:287/1480 train_time:40527ms step_avg:146.31ms step:288/1480 train_time:40676ms step_avg:146.32ms step:289/1480 train_time:40828ms step_avg:146.34ms step:290/1480 train_time:40978ms step_avg:146.35ms step:291/1480 train_time:41129ms step_avg:146.37ms step:292/1480 train_time:41278ms step_avg:146.38ms step:293/1480 train_time:41429ms step_avg:146.39ms step:294/1480 train_time:41580ms step_avg:146.41ms step:295/1480 train_time:41731ms step_avg:146.42ms step:296/1480 train_time:41881ms step_avg:146.44ms step:297/1480 train_time:42032ms step_avg:146.45ms step:298/1480 train_time:42182ms step_avg:146.47ms step:299/1480 train_time:42333ms step_avg:146.48ms step:300/1480 train_time:42482ms step_avg:146.49ms step:301/1480 train_time:42632ms step_avg:146.50ms step:302/1480 
train_time:42781ms step_avg:146.51ms step:303/1480 train_time:42932ms step_avg:146.53ms step:304/1480 train_time:43082ms step_avg:146.54ms step:305/1480 train_time:43233ms step_avg:146.55ms step:306/1480 train_time:43385ms step_avg:146.57ms step:307/1480 train_time:43536ms step_avg:146.58ms step:308/1480 train_time:43686ms step_avg:146.60ms step:309/1480 train_time:43836ms step_avg:146.61ms step:310/1480 train_time:43986ms step_avg:146.62ms step:311/1480 train_time:44136ms step_avg:146.63ms step:312/1480 train_time:44286ms step_avg:146.64ms step:313/1480 train_time:44438ms step_avg:146.66ms step:314/1480 train_time:44588ms step_avg:146.67ms step:315/1480 train_time:44739ms step_avg:146.68ms step:316/1480 train_time:44890ms step_avg:146.70ms step:317/1480 train_time:45041ms step_avg:146.71ms step:318/1480 train_time:45191ms step_avg:146.72ms step:319/1480 train_time:45341ms step_avg:146.73ms step:320/1480 train_time:45492ms step_avg:146.75ms step:321/1480 train_time:45643ms step_avg:146.76ms step:322/1480 train_time:45793ms step_avg:146.77ms step:323/1480 train_time:45944ms step_avg:146.79ms step:324/1480 train_time:46094ms step_avg:146.80ms step:325/1480 train_time:46245ms step_avg:146.81ms step:326/1480 train_time:46395ms step_avg:146.82ms step:327/1480 train_time:46547ms step_avg:146.84ms step:328/1480 train_time:46698ms step_avg:146.85ms step:329/1480 train_time:46850ms step_avg:146.86ms step:330/1480 train_time:47001ms step_avg:146.88ms step:331/1480 train_time:47155ms step_avg:146.90ms step:332/1480 train_time:47309ms step_avg:146.92ms step:333/1480 train_time:47462ms step_avg:146.94ms step:334/1480 train_time:47617ms step_avg:146.96ms step:335/1480 train_time:47771ms step_avg:146.99ms step:336/1480 train_time:47927ms step_avg:147.01ms step:337/1480 train_time:48081ms step_avg:147.04ms step:338/1480 train_time:48234ms step_avg:147.06ms step:339/1480 train_time:48389ms step_avg:147.08ms step:340/1480 train_time:48541ms step_avg:147.09ms step:341/1480 
train_time:48695ms step_avg:147.11ms step:342/1480 train_time:48850ms step_avg:147.14ms step:343/1480 train_time:49005ms step_avg:147.16ms step:344/1480 train_time:49158ms step_avg:147.18ms step:345/1480 train_time:49312ms step_avg:147.20ms step:346/1480 train_time:49468ms step_avg:147.23ms step:347/1480 train_time:49620ms step_avg:147.24ms step:348/1480 train_time:49774ms step_avg:147.26ms step:349/1480 train_time:49929ms step_avg:147.28ms step:350/1480 train_time:50084ms step_avg:147.31ms step:351/1480 train_time:50238ms step_avg:147.33ms step:352/1480 train_time:50392ms step_avg:147.35ms step:353/1480 train_time:50548ms step_avg:147.37ms step:354/1480 train_time:50701ms step_avg:147.39ms step:355/1480 train_time:50855ms step_avg:147.40ms step:356/1480 train_time:51011ms step_avg:147.43ms step:357/1480 train_time:51165ms step_avg:147.45ms step:358/1480 train_time:51319ms step_avg:147.47ms step:359/1480 train_time:51473ms step_avg:147.49ms step:360/1480 train_time:51631ms step_avg:147.52ms step:361/1480 train_time:51787ms step_avg:147.54ms step:362/1480 train_time:51941ms step_avg:147.56ms step:363/1480 train_time:52095ms step_avg:147.58ms step:364/1480 train_time:52250ms step_avg:147.60ms step:365/1480 train_time:52402ms step_avg:147.61ms step:366/1480 train_time:52556ms step_avg:147.63ms step:367/1480 train_time:52709ms step_avg:147.64ms step:368/1480 train_time:52863ms step_avg:147.66ms step:369/1480 train_time:53017ms step_avg:147.68ms step:370/1480 train_time:53170ms step_avg:147.69ms step:371/1480 train_time:53327ms step_avg:147.72ms step:372/1480 train_time:53481ms step_avg:147.74ms step:373/1480 train_time:53634ms step_avg:147.75ms step:374/1480 train_time:53788ms step_avg:147.77ms step:375/1480 train_time:53941ms step_avg:147.78ms step:375/1480 val_loss:3.8031 train_time:54001ms step_avg:147.95ms step:376/1480 train_time:54097ms step_avg:147.81ms step:377/1480 train_time:54253ms step_avg:147.83ms step:378/1480 train_time:54407ms step_avg:147.84ms 
step:379/1480 train_time:54559ms step_avg:147.86ms step:380/1480 train_time:54713ms step_avg:147.87ms step:381/1480 train_time:54866ms step_avg:147.89ms step:382/1480 train_time:55020ms step_avg:147.90ms step:383/1480 train_time:55174ms step_avg:147.92ms step:384/1480 train_time:55328ms step_avg:147.94ms step:385/1480 train_time:55483ms step_avg:147.95ms step:386/1480 train_time:55636ms step_avg:147.97ms step:387/1480 train_time:55789ms step_avg:147.98ms step:388/1480 train_time:55941ms step_avg:147.99ms step:389/1480 train_time:56094ms step_avg:148.01ms step:390/1480 train_time:56250ms step_avg:148.03ms step:391/1480 train_time:56406ms step_avg:148.05ms step:392/1480 train_time:56559ms step_avg:148.06ms step:393/1480 train_time:56713ms step_avg:148.08ms step:394/1480 train_time:56868ms step_avg:148.09ms step:395/1480 train_time:57021ms step_avg:148.11ms step:396/1480 train_time:57174ms step_avg:148.12ms step:397/1480 train_time:57327ms step_avg:148.13ms step:398/1480 train_time:57480ms step_avg:148.15ms step:399/1480 train_time:57634ms step_avg:148.16ms step:400/1480 train_time:57787ms step_avg:148.17ms step:401/1480 train_time:57942ms step_avg:148.19ms step:402/1480 train_time:58094ms step_avg:148.20ms step:403/1480 train_time:58249ms step_avg:148.22ms step:404/1480 train_time:58405ms step_avg:148.24ms step:405/1480 train_time:58559ms step_avg:148.25ms step:406/1480 train_time:58714ms step_avg:148.27ms step:407/1480 train_time:58867ms step_avg:148.28ms step:408/1480 train_time:59020ms step_avg:148.29ms step:409/1480 train_time:59173ms step_avg:148.30ms step:410/1480 train_time:59327ms step_avg:148.32ms step:411/1480 train_time:59481ms step_avg:148.33ms step:412/1480 train_time:59634ms step_avg:148.34ms step:413/1480 train_time:59788ms step_avg:148.36ms step:414/1480 train_time:59943ms step_avg:148.37ms step:415/1480 train_time:60096ms step_avg:148.39ms step:416/1480 train_time:60250ms step_avg:148.40ms step:417/1480 train_time:60405ms step_avg:148.42ms 
step:418/1480 train_time:60558ms step_avg:148.43ms step:419/1480 train_time:60713ms step_avg:148.44ms step:420/1480 train_time:60868ms step_avg:148.46ms step:421/1480 train_time:61021ms step_avg:148.47ms step:422/1480 train_time:61174ms step_avg:148.48ms step:423/1480 train_time:61328ms step_avg:148.49ms step:424/1480 train_time:61481ms step_avg:148.51ms step:425/1480 train_time:61635ms step_avg:148.52ms step:426/1480 train_time:61789ms step_avg:148.53ms step:427/1480 train_time:61945ms step_avg:148.55ms step:428/1480 train_time:62097ms step_avg:148.56ms step:429/1480 train_time:62251ms step_avg:148.57ms step:430/1480 train_time:62406ms step_avg:148.59ms step:431/1480 train_time:62559ms step_avg:148.60ms step:432/1480 train_time:62713ms step_avg:148.61ms step:433/1480 train_time:62869ms step_avg:148.63ms step:434/1480 train_time:63024ms step_avg:148.64ms step:435/1480 train_time:63178ms step_avg:148.65ms step:436/1480 train_time:63331ms step_avg:148.66ms step:437/1480 train_time:63485ms step_avg:148.68ms step:438/1480 train_time:63638ms step_avg:148.69ms step:439/1480 train_time:63792ms step_avg:148.70ms step:440/1480 train_time:63950ms step_avg:148.72ms step:441/1480 train_time:64109ms step_avg:148.74ms step:442/1480 train_time:64267ms step_avg:148.77ms step:443/1480 train_time:64424ms step_avg:148.78ms step:444/1480 train_time:64579ms step_avg:148.80ms step:445/1480 train_time:64734ms step_avg:148.81ms step:446/1480 train_time:64889ms step_avg:148.83ms step:447/1480 train_time:65046ms step_avg:148.85ms step:448/1480 train_time:65201ms step_avg:148.86ms step:449/1480 train_time:65358ms step_avg:148.88ms step:450/1480 train_time:65516ms step_avg:148.90ms step:451/1480 train_time:65673ms step_avg:148.92ms step:452/1480 train_time:65829ms step_avg:148.94ms step:453/1480 train_time:65986ms step_avg:148.95ms step:454/1480 train_time:66142ms step_avg:148.97ms step:455/1480 train_time:66299ms step_avg:148.99ms step:456/1480 train_time:66456ms step_avg:149.00ms 
step:457/1480 train_time:66613ms step_avg:149.02ms step:458/1480 train_time:66769ms step_avg:149.04ms step:459/1480 train_time:66928ms step_avg:149.06ms step:460/1480 train_time:67086ms step_avg:149.08ms step:461/1480 train_time:67246ms step_avg:149.10ms step:462/1480 train_time:67403ms step_avg:149.12ms step:463/1480 train_time:67559ms step_avg:149.14ms step:464/1480 train_time:67716ms step_avg:149.15ms step:465/1480 train_time:67872ms step_avg:149.17ms step:466/1480 train_time:68030ms step_avg:149.19ms step:467/1480 train_time:68188ms step_avg:149.21ms step:468/1480 train_time:68345ms step_avg:149.23ms step:469/1480 train_time:68502ms step_avg:149.24ms step:470/1480 train_time:68658ms step_avg:149.26ms step:471/1480 train_time:68814ms step_avg:149.27ms step:472/1480 train_time:68973ms step_avg:149.29ms step:473/1480 train_time:69130ms step_avg:149.31ms step:474/1480 train_time:69287ms step_avg:149.32ms step:475/1480 train_time:69444ms step_avg:149.34ms step:476/1480 train_time:69601ms step_avg:149.36ms step:477/1480 train_time:69757ms step_avg:149.37ms step:478/1480 train_time:69914ms step_avg:149.39ms step:479/1480 train_time:70070ms step_avg:149.40ms step:480/1480 train_time:70228ms step_avg:149.42ms step:481/1480 train_time:70385ms step_avg:149.44ms step:482/1480 train_time:70543ms step_avg:149.45ms step:483/1480 train_time:70698ms step_avg:149.47ms step:484/1480 train_time:70854ms step_avg:149.48ms step:485/1480 train_time:71011ms step_avg:149.50ms step:486/1480 train_time:71170ms step_avg:149.52ms step:487/1480 train_time:71328ms step_avg:149.53ms step:488/1480 train_time:71485ms step_avg:149.55ms step:489/1480 train_time:71641ms step_avg:149.56ms step:490/1480 train_time:71798ms step_avg:149.58ms step:491/1480 train_time:71955ms step_avg:149.59ms step:492/1480 train_time:72111ms step_avg:149.61ms step:493/1480 train_time:72270ms step_avg:149.63ms step:494/1480 train_time:72429ms step_avg:149.65ms step:495/1480 train_time:72587ms step_avg:149.66ms 
step:496/1480 train_time:72744ms step_avg:149.68ms step:497/1480 train_time:72900ms step_avg:149.69ms step:498/1480 train_time:73056ms step_avg:149.71ms step:499/1480 train_time:73213ms step_avg:149.72ms step:500/1480 train_time:73370ms step_avg:149.74ms step:500/1480 val_loss:3.6818 train_time:73433ms step_avg:149.86ms step:501/1480 train_time:73532ms step_avg:149.76ms step:502/1480 train_time:73691ms step_avg:149.78ms step:503/1480 train_time:73848ms step_avg:149.79ms step:504/1480 train_time:74004ms step_avg:149.81ms step:505/1480 train_time:74159ms step_avg:149.82ms step:506/1480 train_time:74314ms step_avg:149.83ms step:507/1480 train_time:74471ms step_avg:149.84ms step:508/1480 train_time:74632ms step_avg:149.86ms step:509/1480 train_time:74790ms step_avg:149.88ms step:510/1480 train_time:74948ms step_avg:149.90ms step:511/1480 train_time:75105ms step_avg:149.91ms step:512/1480 train_time:75262ms step_avg:149.92ms step:513/1480 train_time:75417ms step_avg:149.93ms step:514/1480 train_time:75574ms step_avg:149.95ms step:515/1480 train_time:75732ms step_avg:149.96ms step:516/1480 train_time:75892ms step_avg:149.98ms step:517/1480 train_time:76050ms step_avg:150.00ms step:518/1480 train_time:76207ms step_avg:150.01ms step:519/1480 train_time:76362ms step_avg:150.02ms step:520/1480 train_time:76520ms step_avg:150.04ms step:521/1480 train_time:76676ms step_avg:150.05ms step:522/1480 train_time:76834ms step_avg:150.07ms step:523/1480 train_time:76992ms step_avg:150.08ms step:524/1480 train_time:77149ms step_avg:150.09ms step:525/1480 train_time:77305ms step_avg:150.11ms step:526/1480 train_time:77462ms step_avg:150.12ms step:527/1480 train_time:77618ms step_avg:150.13ms step:528/1480 train_time:77773ms step_avg:150.14ms step:529/1480 train_time:77931ms step_avg:150.16ms step:530/1480 train_time:78089ms step_avg:150.17ms step:531/1480 train_time:78246ms step_avg:150.19ms step:532/1480 train_time:78403ms step_avg:150.20ms step:533/1480 train_time:78558ms 
step_avg:150.21ms step:534/1480 train_time:78715ms step_avg:150.22ms step:535/1480 train_time:78871ms step_avg:150.23ms step:536/1480 train_time:79032ms step_avg:150.25ms step:537/1480 train_time:79190ms step_avg:150.27ms step:538/1480 train_time:79348ms step_avg:150.28ms step:539/1480 train_time:79506ms step_avg:150.30ms step:540/1480 train_time:79662ms step_avg:150.31ms step:541/1480 train_time:79818ms step_avg:150.32ms step:542/1480 train_time:79973ms step_avg:150.33ms step:543/1480 train_time:80131ms step_avg:150.34ms step:544/1480 train_time:80290ms step_avg:150.36ms step:545/1480 train_time:80447ms step_avg:150.37ms step:546/1480 train_time:80604ms step_avg:150.38ms step:547/1480 train_time:80760ms step_avg:150.39ms step:548/1480 train_time:80917ms step_avg:150.40ms step:549/1480 train_time:81073ms step_avg:150.41ms step:550/1480 train_time:81232ms step_avg:150.43ms step:551/1480 train_time:81391ms step_avg:150.45ms step:552/1480 train_time:81551ms step_avg:150.46ms step:553/1480 train_time:81712ms step_avg:150.48ms step:554/1480 train_time:81872ms step_avg:150.50ms step:555/1480 train_time:82033ms step_avg:150.52ms step:556/1480 train_time:82193ms step_avg:150.54ms step:557/1480 train_time:82352ms step_avg:150.55ms step:558/1480 train_time:82511ms step_avg:150.57ms step:559/1480 train_time:82671ms step_avg:150.59ms step:560/1480 train_time:82832ms step_avg:150.60ms step:561/1480 train_time:82991ms step_avg:150.62ms step:562/1480 train_time:83151ms step_avg:150.64ms step:563/1480 train_time:83310ms step_avg:150.65ms step:564/1480 train_time:83470ms step_avg:150.67ms step:565/1480 train_time:83630ms step_avg:150.69ms step:566/1480 train_time:83791ms step_avg:150.70ms step:567/1480 train_time:83950ms step_avg:150.72ms step:568/1480 train_time:84110ms step_avg:150.73ms step:569/1480 train_time:84268ms step_avg:150.75ms step:570/1480 train_time:84428ms step_avg:150.76ms step:571/1480 train_time:84588ms step_avg:150.78ms step:572/1480 train_time:84747ms 
step_avg:150.80ms step:573/1480 train_time:84907ms step_avg:150.81ms step:574/1480 train_time:85068ms step_avg:150.83ms step:575/1480 train_time:85229ms step_avg:150.85ms step:576/1480 train_time:85389ms step_avg:150.86ms step:577/1480 train_time:85548ms step_avg:150.88ms step:578/1480 train_time:85708ms step_avg:150.89ms step:579/1480 train_time:85867ms step_avg:150.91ms step:580/1480 train_time:86025ms step_avg:150.92ms step:581/1480 train_time:86186ms step_avg:150.94ms step:582/1480 train_time:86348ms step_avg:150.96ms step:583/1480 train_time:86507ms step_avg:150.97ms step:584/1480 train_time:86667ms step_avg:150.99ms step:585/1480 train_time:86826ms step_avg:151.00ms step:586/1480 train_time:86986ms step_avg:151.02ms step:587/1480 train_time:87145ms step_avg:151.03ms step:588/1480 train_time:87303ms step_avg:151.04ms step:589/1480 train_time:87462ms step_avg:151.06ms step:590/1480 train_time:87621ms step_avg:151.07ms step:591/1480 train_time:87778ms step_avg:151.08ms step:592/1480 train_time:87938ms step_avg:151.10ms step:593/1480 train_time:88098ms step_avg:151.11ms step:594/1480 train_time:88258ms step_avg:151.13ms step:595/1480 train_time:88418ms step_avg:151.14ms step:596/1480 train_time:88579ms step_avg:151.16ms step:597/1480 train_time:88737ms step_avg:151.17ms step:598/1480 train_time:88896ms step_avg:151.18ms step:599/1480 train_time:89053ms step_avg:151.19ms step:600/1480 train_time:89212ms step_avg:151.21ms step:601/1480 train_time:89371ms step_avg:151.22ms step:602/1480 train_time:89531ms step_avg:151.23ms step:603/1480 train_time:89692ms step_avg:151.25ms step:604/1480 train_time:89852ms step_avg:151.27ms step:605/1480 train_time:90012ms step_avg:151.28ms step:606/1480 train_time:90174ms step_avg:151.30ms step:607/1480 train_time:90336ms step_avg:151.32ms step:608/1480 train_time:90496ms step_avg:151.33ms step:609/1480 train_time:90655ms step_avg:151.34ms step:610/1480 train_time:90813ms step_avg:151.36ms step:611/1480 train_time:90972ms 
step_avg:151.37ms step:612/1480 train_time:91133ms step_avg:151.38ms step:613/1480 train_time:91293ms step_avg:151.40ms step:614/1480 train_time:91453ms step_avg:151.41ms step:615/1480 train_time:91612ms step_avg:151.43ms step:616/1480 train_time:91770ms step_avg:151.44ms step:617/1480 train_time:91931ms step_avg:151.45ms step:618/1480 train_time:92092ms step_avg:151.47ms step:619/1480 train_time:92253ms step_avg:151.48ms step:620/1480 train_time:92413ms step_avg:151.50ms step:621/1480 train_time:92572ms step_avg:151.51ms step:622/1480 train_time:92733ms step_avg:151.52ms step:623/1480 train_time:92894ms step_avg:151.54ms step:624/1480 train_time:93053ms step_avg:151.55ms step:625/1480 train_time:93213ms step_avg:151.57ms step:625/1480 val_loss:3.6035 train_time:93278ms step_avg:151.67ms step:626/1480 train_time:93379ms step_avg:151.59ms step:627/1480 train_time:93538ms step_avg:151.60ms step:628/1480 train_time:93697ms step_avg:151.61ms step:629/1480 train_time:93856ms step_avg:151.63ms step:630/1480 train_time:94015ms step_avg:151.64ms step:631/1480 train_time:94173ms step_avg:151.65ms step:632/1480 train_time:94333ms step_avg:151.66ms step:633/1480 train_time:94494ms step_avg:151.68ms step:634/1480 train_time:94652ms step_avg:151.69ms step:635/1480 train_time:94813ms step_avg:151.70ms step:636/1480 train_time:94973ms step_avg:151.71ms step:637/1480 train_time:95132ms step_avg:151.73ms step:638/1480 train_time:95291ms step_avg:151.74ms step:639/1480 train_time:95448ms step_avg:151.75ms step:640/1480 train_time:95606ms step_avg:151.75ms step:641/1480 train_time:95764ms step_avg:151.77ms step:642/1480 train_time:95924ms step_avg:151.78ms step:643/1480 train_time:96083ms step_avg:151.79ms step:644/1480 train_time:96241ms step_avg:151.80ms step:645/1480 train_time:96399ms step_avg:151.81ms step:646/1480 train_time:96560ms step_avg:151.82ms step:647/1480 train_time:96720ms step_avg:151.84ms step:648/1480 train_time:96881ms step_avg:151.85ms step:649/1480 
train_time:97040ms step_avg:151.86ms step:650/1480 train_time:97201ms step_avg:151.88ms step:651/1480 train_time:97361ms step_avg:151.89ms step:652/1480 train_time:97520ms step_avg:151.90ms step:653/1480 train_time:97680ms step_avg:151.91ms step:654/1480 train_time:97839ms step_avg:151.92ms step:655/1480 train_time:98000ms step_avg:151.94ms step:656/1480 train_time:98159ms step_avg:151.95ms step:657/1480 train_time:98320ms step_avg:151.96ms step:658/1480 train_time:98481ms step_avg:151.98ms step:659/1480 train_time:98642ms step_avg:151.99ms step:660/1480 train_time:98805ms step_avg:152.01ms step:661/1480 train_time:98966ms step_avg:152.02ms step:662/1480 train_time:99125ms step_avg:152.03ms step:663/1480 train_time:99285ms step_avg:152.04ms step:664/1480 train_time:99447ms step_avg:152.06ms step:665/1480 train_time:99609ms step_avg:152.08ms step:666/1480 train_time:99769ms step_avg:152.09ms step:667/1480 train_time:99931ms step_avg:152.10ms step:668/1480 train_time:100093ms step_avg:152.12ms step:669/1480 train_time:100255ms step_avg:152.13ms step:670/1480 train_time:100417ms step_avg:152.15ms step:671/1480 train_time:100579ms step_avg:152.16ms step:672/1480 train_time:100740ms step_avg:152.18ms step:673/1480 train_time:100903ms step_avg:152.19ms step:674/1480 train_time:101065ms step_avg:152.21ms step:675/1480 train_time:101226ms step_avg:152.22ms step:676/1480 train_time:101388ms step_avg:152.23ms step:677/1480 train_time:101550ms step_avg:152.25ms step:678/1480 train_time:101711ms step_avg:152.26ms step:679/1480 train_time:101873ms step_avg:152.28ms step:680/1480 train_time:102034ms step_avg:152.29ms step:681/1480 train_time:102195ms step_avg:152.30ms step:682/1480 train_time:102359ms step_avg:152.32ms step:683/1480 train_time:102521ms step_avg:152.33ms step:684/1480 train_time:102683ms step_avg:152.35ms step:685/1480 train_time:102844ms step_avg:152.36ms step:686/1480 train_time:103005ms step_avg:152.37ms step:687/1480 train_time:103166ms step_avg:152.39ms 
step:688/1480 train_time:103326ms step_avg:152.40ms step:689/1480 train_time:103489ms step_avg:152.41ms step:690/1480 train_time:103655ms step_avg:152.43ms step:691/1480 train_time:103818ms step_avg:152.45ms step:692/1480 train_time:103980ms step_avg:152.46ms step:693/1480 train_time:104142ms step_avg:152.48ms step:694/1480 train_time:104303ms step_avg:152.49ms step:695/1480 train_time:104463ms step_avg:152.50ms step:696/1480 train_time:104624ms step_avg:152.51ms step:697/1480 train_time:104786ms step_avg:152.53ms step:698/1480 train_time:104946ms step_avg:152.54ms step:699/1480 train_time:105108ms step_avg:152.55ms step:700/1480 train_time:105269ms step_avg:152.56ms step:701/1480 train_time:105427ms step_avg:152.57ms step:702/1480 train_time:105588ms step_avg:152.58ms step:703/1480 train_time:105748ms step_avg:152.59ms step:704/1480 train_time:105908ms step_avg:152.60ms step:705/1480 train_time:106073ms step_avg:152.62ms step:706/1480 train_time:106236ms step_avg:152.64ms step:707/1480 train_time:106399ms step_avg:152.65ms step:708/1480 train_time:106561ms step_avg:152.67ms step:709/1480 train_time:106723ms step_avg:152.68ms step:710/1480 train_time:106885ms step_avg:152.69ms step:711/1480 train_time:107045ms step_avg:152.70ms step:712/1480 train_time:107212ms step_avg:152.72ms step:713/1480 train_time:107377ms step_avg:152.74ms step:714/1480 train_time:107539ms step_avg:152.75ms step:715/1480 train_time:107700ms step_avg:152.77ms step:716/1480 train_time:107861ms step_avg:152.78ms step:717/1480 train_time:108023ms step_avg:152.79ms step:718/1480 train_time:108182ms step_avg:152.80ms step:719/1480 train_time:108341ms step_avg:152.81ms step:720/1480 train_time:108505ms step_avg:152.82ms step:721/1480 train_time:108666ms step_avg:152.84ms step:722/1480 train_time:108828ms step_avg:152.85ms step:723/1480 train_time:108988ms step_avg:152.86ms step:724/1480 train_time:109150ms step_avg:152.87ms step:725/1480 train_time:109313ms step_avg:152.89ms step:726/1480 
train_time:109478ms step_avg:152.90ms step:727/1480 train_time:109640ms step_avg:152.91ms step:728/1480 train_time:109802ms step_avg:152.93ms step:729/1480 train_time:109963ms step_avg:152.94ms step:730/1480 train_time:110126ms step_avg:152.95ms step:731/1480 train_time:110286ms step_avg:152.96ms step:732/1480 train_time:110446ms step_avg:152.97ms step:733/1480 train_time:110606ms step_avg:152.98ms step:734/1480 train_time:110768ms step_avg:152.99ms step:735/1480 train_time:110928ms step_avg:153.00ms step:736/1480 train_time:111091ms step_avg:153.02ms step:737/1480 train_time:111251ms step_avg:153.03ms step:738/1480 train_time:111414ms step_avg:153.04ms step:739/1480 train_time:111575ms step_avg:153.05ms step:740/1480 train_time:111741ms step_avg:153.07ms step:741/1480 train_time:111904ms step_avg:153.08ms step:742/1480 train_time:112065ms step_avg:153.09ms step:743/1480 train_time:112225ms step_avg:153.10ms step:744/1480 train_time:112389ms step_avg:153.12ms step:745/1480 train_time:112554ms step_avg:153.13ms step:746/1480 train_time:112715ms step_avg:153.14ms step:747/1480 train_time:112877ms step_avg:153.16ms step:748/1480 train_time:113042ms step_avg:153.17ms step:749/1480 train_time:113205ms step_avg:153.19ms step:750/1480 train_time:113365ms step_avg:153.20ms step:750/1480 val_loss:3.5484 train_time:113429ms step_avg:153.28ms step:751/1480 train_time:113531ms step_avg:153.21ms step:752/1480 train_time:113693ms step_avg:153.23ms step:753/1480 train_time:113855ms step_avg:153.24ms step:754/1480 train_time:114015ms step_avg:153.25ms step:755/1480 train_time:114176ms step_avg:153.26ms step:756/1480 train_time:114337ms step_avg:153.27ms step:757/1480 train_time:114502ms step_avg:153.28ms step:758/1480 train_time:114661ms step_avg:153.29ms step:759/1480 train_time:114826ms step_avg:153.31ms step:760/1480 train_time:114988ms step_avg:153.32ms step:761/1480 train_time:115152ms step_avg:153.33ms step:762/1480 train_time:115315ms step_avg:153.34ms step:763/1480 
train_time:115476ms step_avg:153.35ms step:764/1480 train_time:115637ms step_avg:153.37ms step:765/1480 train_time:115798ms step_avg:153.38ms step:766/1480 train_time:115960ms step_avg:153.39ms step:767/1480 train_time:116121ms step_avg:153.40ms step:768/1480 train_time:116283ms step_avg:153.41ms step:769/1480 train_time:116448ms step_avg:153.42ms step:770/1480 train_time:116612ms step_avg:153.44ms step:771/1480 train_time:116777ms step_avg:153.45ms step:772/1480 train_time:116938ms step_avg:153.46ms step:773/1480 train_time:117099ms step_avg:153.47ms step:774/1480 train_time:117259ms step_avg:153.48ms step:775/1480 train_time:117421ms step_avg:153.49ms step:776/1480 train_time:117586ms step_avg:153.51ms step:777/1480 train_time:117753ms step_avg:153.52ms step:778/1480 train_time:117916ms step_avg:153.54ms step:779/1480 train_time:118078ms step_avg:153.55ms step:780/1480 train_time:118241ms step_avg:153.56ms step:781/1480 train_time:118402ms step_avg:153.57ms step:782/1480 train_time:118566ms step_avg:153.58ms step:783/1480 train_time:118728ms step_avg:153.59ms step:784/1480 train_time:118892ms step_avg:153.61ms step:785/1480 train_time:119056ms step_avg:153.62ms step:786/1480 train_time:119221ms step_avg:153.63ms step:787/1480 train_time:119383ms step_avg:153.65ms step:788/1480 train_time:119548ms step_avg:153.66ms step:789/1480 train_time:119710ms step_avg:153.67ms step:790/1480 train_time:119875ms step_avg:153.69ms step:791/1480 train_time:120041ms step_avg:153.70ms step:792/1480 train_time:120206ms step_avg:153.72ms step:793/1480 train_time:120367ms step_avg:153.73ms step:794/1480 train_time:120534ms step_avg:153.74ms step:795/1480 train_time:120698ms step_avg:153.76ms step:796/1480 train_time:120863ms step_avg:153.77ms step:797/1480 train_time:121026ms step_avg:153.78ms step:798/1480 train_time:121190ms step_avg:153.79ms step:799/1480 train_time:121357ms step_avg:153.81ms step:800/1480 train_time:121519ms step_avg:153.82ms step:801/1480 train_time:121682ms 
step_avg:153.83ms step:802/1480 train_time:121851ms step_avg:153.85ms step:803/1480 train_time:122014ms step_avg:153.86ms step:804/1480 train_time:122176ms step_avg:153.87ms step:805/1480 train_time:122342ms step_avg:153.89ms step:806/1480 train_time:122504ms step_avg:153.90ms step:807/1480 train_time:122664ms step_avg:153.91ms step:808/1480 train_time:122831ms step_avg:153.92ms step:809/1480 train_time:122994ms step_avg:153.93ms step:810/1480 train_time:123157ms step_avg:153.95ms step:811/1480 train_time:123319ms step_avg:153.96ms step:812/1480 train_time:123482ms step_avg:153.97ms step:813/1480 train_time:123642ms step_avg:153.98ms step:814/1480 train_time:123806ms step_avg:153.99ms step:815/1480 train_time:123969ms step_avg:154.00ms step:816/1480 train_time:124135ms step_avg:154.01ms step:817/1480 train_time:124297ms step_avg:154.02ms step:818/1480 train_time:124458ms step_avg:154.03ms step:819/1480 train_time:124623ms step_avg:154.05ms step:820/1480 train_time:124787ms step_avg:154.06ms step:821/1480 train_time:124949ms step_avg:154.07ms step:822/1480 train_time:125113ms step_avg:154.08ms step:823/1480 train_time:125276ms step_avg:154.09ms step:824/1480 train_time:125437ms step_avg:154.10ms step:825/1480 train_time:125602ms step_avg:154.11ms step:826/1480 train_time:125768ms step_avg:154.13ms step:827/1480 train_time:125933ms step_avg:154.14ms step:828/1480 train_time:126095ms step_avg:154.15ms step:829/1480 train_time:126259ms step_avg:154.16ms step:830/1480 train_time:126423ms step_avg:154.17ms step:831/1480 train_time:126587ms step_avg:154.19ms step:832/1480 train_time:126752ms step_avg:154.20ms step:833/1480 train_time:126917ms step_avg:154.21ms step:834/1480 train_time:127080ms step_avg:154.22ms step:835/1480 train_time:127245ms step_avg:154.24ms step:836/1480 train_time:127411ms step_avg:154.25ms step:837/1480 train_time:127573ms step_avg:154.26ms step:838/1480 train_time:127737ms step_avg:154.27ms step:839/1480 train_time:127900ms step_avg:154.28ms 
step:840/1480 train_time:128061ms step_avg:154.29ms step:841/1480 train_time:128222ms step_avg:154.30ms step:842/1480 train_time:128388ms step_avg:154.31ms step:843/1480 train_time:128551ms step_avg:154.32ms step:844/1480 train_time:128713ms step_avg:154.33ms step:845/1480 train_time:128877ms step_avg:154.34ms step:846/1480 train_time:129043ms step_avg:154.36ms step:847/1480 train_time:129208ms step_avg:154.37ms step:848/1480 train_time:129370ms step_avg:154.38ms step:849/1480 train_time:129533ms step_avg:154.39ms step:850/1480 train_time:129696ms step_avg:154.40ms step:851/1480 train_time:129861ms step_avg:154.41ms step:852/1480 train_time:130022ms step_avg:154.42ms step:853/1480 train_time:130183ms step_avg:154.43ms step:854/1480 train_time:130349ms step_avg:154.44ms step:855/1480 train_time:130513ms step_avg:154.45ms step:856/1480 train_time:130675ms step_avg:154.46ms step:857/1480 train_time:130840ms step_avg:154.47ms step:858/1480 train_time:131007ms step_avg:154.49ms step:859/1480 train_time:131172ms step_avg:154.50ms step:860/1480 train_time:131333ms step_avg:154.51ms step:861/1480 train_time:131500ms step_avg:154.52ms step:862/1480 train_time:131669ms step_avg:154.54ms step:863/1480 train_time:131838ms step_avg:154.56ms step:864/1480 train_time:132000ms step_avg:154.57ms step:865/1480 train_time:132160ms step_avg:154.57ms step:866/1480 train_time:132327ms step_avg:154.59ms step:867/1480 train_time:132492ms step_avg:154.60ms step:868/1480 train_time:132654ms step_avg:154.61ms step:869/1480 train_time:132816ms step_avg:154.62ms step:870/1480 train_time:132980ms step_avg:154.63ms step:871/1480 train_time:133143ms step_avg:154.64ms step:872/1480 train_time:133307ms step_avg:154.65ms step:873/1480 train_time:133470ms step_avg:154.66ms step:874/1480 train_time:133636ms step_avg:154.67ms step:875/1480 train_time:133800ms step_avg:154.68ms step:875/1480 val_loss:3.4995 train_time:133866ms step_avg:154.76ms step:876/1480 train_time:133965ms step_avg:154.69ms 
step:877/1480 train_time:134129ms step_avg:154.70ms step:878/1480 train_time:134292ms step_avg:154.71ms step:879/1480 train_time:134458ms step_avg:154.73ms step:880/1480 train_time:134621ms step_avg:154.74ms step:881/1480 train_time:134784ms step_avg:154.75ms step:882/1480 train_time:134948ms step_avg:154.76ms step:883/1480 train_time:135113ms step_avg:154.77ms step:884/1480 train_time:135281ms step_avg:154.78ms step:885/1480 train_time:135446ms step_avg:154.79ms step:886/1480 train_time:135611ms step_avg:154.81ms step:887/1480 train_time:135780ms step_avg:154.82ms step:888/1480 train_time:135952ms step_avg:154.84ms step:889/1480 train_time:136119ms step_avg:154.86ms step:890/1480 train_time:136282ms step_avg:154.87ms step:891/1480 train_time:136447ms step_avg:154.88ms step:892/1480 train_time:136611ms step_avg:154.89ms step:893/1480 train_time:136775ms step_avg:154.90ms step:894/1480 train_time:136941ms step_avg:154.91ms step:895/1480 train_time:137105ms step_avg:154.92ms step:896/1480 train_time:137270ms step_avg:154.93ms step:897/1480 train_time:137438ms step_avg:154.95ms step:898/1480 train_time:137605ms step_avg:154.96ms step:899/1480 train_time:137768ms step_avg:154.97ms step:900/1480 train_time:137931ms step_avg:154.98ms step:901/1480 train_time:138097ms step_avg:154.99ms step:902/1480 train_time:138261ms step_avg:155.00ms step:903/1480 train_time:138434ms step_avg:155.02ms step:904/1480 train_time:138600ms step_avg:155.03ms step:905/1480 train_time:138762ms step_avg:155.04ms step:906/1480 train_time:138928ms step_avg:155.05ms step:907/1480 train_time:139094ms step_avg:155.07ms step:908/1480 train_time:139258ms step_avg:155.08ms step:909/1480 train_time:139423ms step_avg:155.09ms step:910/1480 train_time:139593ms step_avg:155.10ms step:911/1480 train_time:139758ms step_avg:155.11ms step:912/1480 train_time:139924ms step_avg:155.13ms step:913/1480 train_time:140092ms step_avg:155.14ms step:914/1480 train_time:140259ms step_avg:155.15ms step:915/1480 
train_time:140426ms step_avg:155.17ms step:916/1480 train_time:140592ms step_avg:155.18ms step:917/1480 train_time:140756ms step_avg:155.19ms step:918/1480 train_time:140924ms step_avg:155.20ms step:919/1480 train_time:141094ms step_avg:155.22ms step:920/1480 train_time:141261ms step_avg:155.23ms step:921/1480 train_time:141426ms step_avg:155.24ms step:922/1480 train_time:141595ms step_avg:155.26ms step:923/1480 train_time:141758ms step_avg:155.27ms step:924/1480 train_time:141923ms step_avg:155.28ms step:925/1480 train_time:142089ms step_avg:155.29ms step:926/1480 train_time:142252ms step_avg:155.30ms step:927/1480 train_time:142416ms step_avg:155.31ms step:928/1480 train_time:142582ms step_avg:155.32ms step:929/1480 train_time:142746ms step_avg:155.33ms step:930/1480 train_time:142912ms step_avg:155.34ms step:931/1480 train_time:143076ms step_avg:155.35ms step:932/1480 train_time:143242ms step_avg:155.36ms step:933/1480 train_time:143409ms step_avg:155.37ms step:934/1480 train_time:143579ms step_avg:155.39ms step:935/1480 train_time:143748ms step_avg:155.40ms step:936/1480 train_time:143914ms step_avg:155.41ms step:937/1480 train_time:144083ms step_avg:155.43ms step:938/1480 train_time:144245ms step_avg:155.44ms step:939/1480 train_time:144414ms step_avg:155.45ms step:940/1480 train_time:144581ms step_avg:155.46ms step:941/1480 train_time:144744ms step_avg:155.47ms step:942/1480 train_time:144908ms step_avg:155.48ms step:943/1480 train_time:145079ms step_avg:155.50ms step:944/1480 train_time:145251ms step_avg:155.52ms step:945/1480 train_time:145415ms step_avg:155.52ms step:946/1480 train_time:145585ms step_avg:155.54ms step:947/1480 train_time:145753ms step_avg:155.55ms step:948/1480 train_time:145919ms step_avg:155.56ms step:949/1480 train_time:146085ms step_avg:155.58ms step:950/1480 train_time:146249ms step_avg:155.58ms step:951/1480 train_time:146418ms step_avg:155.60ms step:952/1480 train_time:146584ms step_avg:155.61ms step:953/1480 train_time:146753ms 
step_avg:155.62ms step:954/1480 train_time:146921ms step_avg:155.64ms step:955/1480 train_time:147085ms step_avg:155.65ms step:956/1480 train_time:147251ms step_avg:155.66ms step:957/1480 train_time:147418ms step_avg:155.67ms step:958/1480 train_time:147587ms step_avg:155.68ms step:959/1480 train_time:147751ms step_avg:155.69ms step:960/1480 train_time:147919ms step_avg:155.70ms step:961/1480 train_time:148084ms step_avg:155.71ms step:962/1480 train_time:148247ms step_avg:155.72ms step:963/1480 train_time:148412ms step_avg:155.73ms step:964/1480 train_time:148581ms step_avg:155.75ms step:965/1480 train_time:148745ms step_avg:155.75ms step:966/1480 train_time:148909ms step_avg:155.76ms step:967/1480 train_time:149073ms step_avg:155.77ms step:968/1480 train_time:149238ms step_avg:155.78ms step:969/1480 train_time:149406ms step_avg:155.79ms step:970/1480 train_time:149568ms step_avg:155.80ms step:971/1480 train_time:149733ms step_avg:155.81ms step:972/1480 train_time:149899ms step_avg:155.82ms step:973/1480 train_time:150064ms step_avg:155.83ms step:974/1480 train_time:150233ms step_avg:155.84ms step:975/1480 train_time:150399ms step_avg:155.85ms step:976/1480 train_time:150564ms step_avg:155.86ms step:977/1480 train_time:150727ms step_avg:155.87ms step:978/1480 train_time:150894ms step_avg:155.88ms step:979/1480 train_time:151062ms step_avg:155.89ms step:980/1480 train_time:151227ms step_avg:155.90ms step:981/1480 train_time:151397ms step_avg:155.92ms step:982/1480 train_time:151560ms step_avg:155.93ms step:983/1480 train_time:151725ms step_avg:155.93ms step:984/1480 train_time:151889ms step_avg:155.94ms step:985/1480 train_time:152058ms step_avg:155.96ms step:986/1480 train_time:152223ms step_avg:155.97ms step:987/1480 train_time:152387ms step_avg:155.97ms step:988/1480 train_time:152556ms step_avg:155.99ms step:989/1480 train_time:152722ms step_avg:156.00ms step:990/1480 train_time:152892ms step_avg:156.01ms step:991/1480 train_time:153059ms step_avg:156.02ms 
step:992/1480 train_time:153234ms step_avg:156.04ms step:993/1480 train_time:153409ms step_avg:156.06ms step:994/1480 train_time:153575ms step_avg:156.07ms step:995/1480 train_time:153740ms step_avg:156.08ms step:996/1480 train_time:153902ms step_avg:156.09ms step:997/1480 train_time:154067ms step_avg:156.10ms step:998/1480 train_time:154231ms step_avg:156.10ms step:999/1480 train_time:154399ms step_avg:156.12ms step:1000/1480 train_time:154567ms step_avg:156.13ms step:1000/1480 val_loss:3.4387 train_time:154634ms step_avg:156.20ms step:1001/1480 train_time:154734ms step_avg:156.14ms step:1002/1480 train_time:154899ms step_avg:156.15ms step:1003/1480 train_time:155070ms step_avg:156.16ms step:1004/1480 train_time:155239ms step_avg:156.18ms step:1005/1480 train_time:155408ms step_avg:156.19ms step:1006/1480 train_time:155574ms step_avg:156.20ms step:1007/1480 train_time:155739ms step_avg:156.21ms step:1008/1480 train_time:155908ms step_avg:156.22ms step:1009/1480 train_time:156082ms step_avg:156.24ms step:1010/1480 train_time:156248ms step_avg:156.25ms step:1011/1480 train_time:156413ms step_avg:156.26ms step:1012/1480 train_time:156577ms step_avg:156.26ms step:1013/1480 train_time:156748ms step_avg:156.28ms step:1014/1480 train_time:156914ms step_avg:156.29ms step:1015/1480 train_time:157084ms step_avg:156.30ms step:1016/1480 train_time:157252ms step_avg:156.31ms step:1017/1480 train_time:157423ms step_avg:156.33ms step:1018/1480 train_time:157593ms step_avg:156.34ms step:1019/1480 train_time:157761ms step_avg:156.35ms step:1020/1480 train_time:157931ms step_avg:156.37ms step:1021/1480 train_time:158096ms step_avg:156.38ms step:1022/1480 train_time:158263ms step_avg:156.39ms step:1023/1480 train_time:158431ms step_avg:156.40ms step:1024/1480 train_time:158596ms step_avg:156.41ms step:1025/1480 train_time:158767ms step_avg:156.42ms step:1026/1480 train_time:158933ms step_avg:156.43ms step:1027/1480 train_time:159098ms step_avg:156.44ms step:1028/1480 
train_time:159272ms step_avg:156.46ms step:1029/1480 train_time:159447ms step_avg:156.47ms step:1030/1480 train_time:159615ms step_avg:156.49ms step:1031/1480 train_time:159780ms step_avg:156.49ms step:1032/1480 train_time:159953ms step_avg:156.51ms step:1033/1480 train_time:160118ms step_avg:156.52ms step:1034/1480 train_time:160284ms step_avg:156.53ms step:1035/1480 train_time:160452ms step_avg:156.54ms step:1036/1480 train_time:160617ms step_avg:156.55ms step:1037/1480 train_time:160784ms step_avg:156.56ms step:1038/1480 train_time:160953ms step_avg:156.57ms step:1039/1480 train_time:161124ms step_avg:156.58ms step:1040/1480 train_time:161291ms step_avg:156.59ms step:1041/1480 train_time:161458ms step_avg:156.60ms step:1042/1480 train_time:161620ms step_avg:156.61ms step:1043/1480 train_time:161787ms step_avg:156.62ms step:1044/1480 train_time:161952ms step_avg:156.63ms step:1045/1480 train_time:162123ms step_avg:156.64ms step:1046/1480 train_time:162292ms step_avg:156.65ms step:1047/1480 train_time:162457ms step_avg:156.66ms step:1048/1480 train_time:162626ms step_avg:156.67ms step:1049/1480 train_time:162792ms step_avg:156.68ms step:1050/1480 train_time:162959ms step_avg:156.69ms step:1051/1480 train_time:163131ms step_avg:156.71ms step:1052/1480 train_time:163297ms step_avg:156.72ms step:1053/1480 train_time:163464ms step_avg:156.73ms step:1054/1480 train_time:163634ms step_avg:156.74ms step:1055/1480 train_time:163799ms step_avg:156.75ms step:1056/1480 train_time:163964ms step_avg:156.75ms step:1057/1480 train_time:164131ms step_avg:156.76ms step:1058/1480 train_time:164300ms step_avg:156.77ms step:1059/1480 train_time:164472ms step_avg:156.79ms step:1060/1480 train_time:164640ms step_avg:156.80ms step:1061/1480 train_time:164804ms step_avg:156.81ms step:1062/1480 train_time:164970ms step_avg:156.82ms step:1063/1480 train_time:165135ms step_avg:156.82ms step:1064/1480 train_time:165299ms step_avg:156.83ms step:1065/1480 train_time:165466ms step_avg:156.84ms 
step:1066/1480 train_time:165634ms step_avg:156.85ms step:1067/1480 train_time:165804ms step_avg:156.86ms step:1068/1480 train_time:165971ms step_avg:156.87ms step:1069/1480 train_time:166141ms step_avg:156.88ms step:1070/1480 train_time:166307ms step_avg:156.89ms step:1071/1480 train_time:166478ms step_avg:156.91ms step:1072/1480 train_time:166644ms step_avg:156.92ms step:1073/1480 train_time:166808ms step_avg:156.92ms step:1074/1480 train_time:166975ms step_avg:156.93ms step:1075/1480 train_time:167149ms step_avg:156.95ms step:1076/1480 train_time:167317ms step_avg:156.96ms step:1077/1480 train_time:167482ms step_avg:156.97ms step:1078/1480 train_time:167657ms step_avg:156.98ms step:1079/1480 train_time:167829ms step_avg:157.00ms step:1080/1480 train_time:167999ms step_avg:157.01ms step:1081/1480 train_time:168166ms step_avg:157.02ms step:1082/1480 train_time:168333ms step_avg:157.03ms step:1083/1480 train_time:168500ms step_avg:157.04ms step:1084/1480 train_time:168668ms step_avg:157.05ms step:1085/1480 train_time:168836ms step_avg:157.06ms step:1086/1480 train_time:169005ms step_avg:157.07ms step:1087/1480 train_time:169171ms step_avg:157.08ms step:1088/1480 train_time:169341ms step_avg:157.09ms step:1089/1480 train_time:169515ms step_avg:157.10ms step:1090/1480 train_time:169687ms step_avg:157.12ms step:1091/1480 train_time:169855ms step_avg:157.13ms step:1092/1480 train_time:170024ms step_avg:157.14ms step:1093/1480 train_time:170191ms step_avg:157.15ms step:1094/1480 train_time:170357ms step_avg:157.16ms step:1095/1480 train_time:170522ms step_avg:157.16ms step:1096/1480 train_time:170691ms step_avg:157.17ms step:1097/1480 train_time:170858ms step_avg:157.18ms step:1098/1480 train_time:171030ms step_avg:157.20ms step:1099/1480 train_time:171200ms step_avg:157.21ms step:1100/1480 train_time:171372ms step_avg:157.22ms step:1101/1480 train_time:171542ms step_avg:157.23ms step:1102/1480 train_time:171714ms step_avg:157.25ms step:1103/1480 train_time:171891ms 
step_avg:157.27ms step:1104/1480 train_time:172058ms step_avg:157.27ms step:1105/1480 train_time:172230ms step_avg:157.29ms step:1106/1480 train_time:172399ms step_avg:157.30ms step:1107/1480 train_time:172568ms step_avg:157.31ms step:1108/1480 train_time:172733ms step_avg:157.32ms step:1109/1480 train_time:172898ms step_avg:157.32ms step:1110/1480 train_time:173063ms step_avg:157.33ms step:1111/1480 train_time:173231ms step_avg:157.34ms step:1112/1480 train_time:173400ms step_avg:157.35ms step:1113/1480 train_time:173581ms step_avg:157.37ms step:1114/1480 train_time:173754ms step_avg:157.39ms step:1115/1480 train_time:173927ms step_avg:157.40ms step:1116/1480 train_time:174093ms step_avg:157.41ms step:1117/1480 train_time:174266ms step_avg:157.42ms step:1118/1480 train_time:174442ms step_avg:157.44ms step:1119/1480 train_time:174610ms step_avg:157.45ms step:1120/1480 train_time:174777ms step_avg:157.46ms step:1121/1480 train_time:174948ms step_avg:157.47ms step:1122/1480 train_time:175114ms step_avg:157.48ms step:1123/1480 train_time:175280ms step_avg:157.48ms step:1124/1480 train_time:175448ms step_avg:157.49ms step:1125/1480 train_time:175616ms step_avg:157.50ms step:1125/1480 val_loss:3.3834 train_time:175684ms step_avg:157.56ms step:1126/1480 train_time:175785ms step_avg:157.51ms step:1127/1480 train_time:175957ms step_avg:157.53ms step:1128/1480 train_time:176127ms step_avg:157.54ms step:1129/1480 train_time:176301ms step_avg:157.55ms step:1130/1480 train_time:176470ms step_avg:157.56ms step:1131/1480 train_time:176648ms step_avg:157.58ms step:1132/1480 train_time:176815ms step_avg:157.59ms step:1133/1480 train_time:176987ms step_avg:157.60ms step:1134/1480 train_time:177158ms step_avg:157.61ms step:1135/1480 train_time:177326ms step_avg:157.62ms step:1136/1480 train_time:177495ms step_avg:157.63ms step:1137/1480 train_time:177665ms step_avg:157.64ms step:1138/1480 train_time:177837ms step_avg:157.66ms step:1139/1480 train_time:178005ms step_avg:157.67ms 
step:1140/1480 train_time:178172ms step_avg:157.67ms step:1141/1480 train_time:178347ms step_avg:157.69ms step:1142/1480 train_time:178515ms step_avg:157.70ms step:1143/1480 train_time:178686ms step_avg:157.71ms step:1144/1480 train_time:178854ms step_avg:157.72ms step:1145/1480 train_time:179019ms step_avg:157.73ms step:1146/1480 train_time:179189ms step_avg:157.74ms step:1147/1480 train_time:179358ms step_avg:157.75ms step:1148/1480 train_time:179526ms step_avg:157.76ms step:1149/1480 train_time:179695ms step_avg:157.77ms step:1150/1480 train_time:179864ms step_avg:157.78ms step:1151/1480 train_time:180037ms step_avg:157.79ms step:1152/1480 train_time:180209ms step_avg:157.80ms step:1153/1480 train_time:180382ms step_avg:157.81ms step:1154/1480 train_time:180549ms step_avg:157.82ms step:1155/1480 train_time:180722ms step_avg:157.84ms step:1156/1480 train_time:180903ms step_avg:157.86ms step:1157/1480 train_time:181071ms step_avg:157.86ms step:1158/1480 train_time:181239ms step_avg:157.87ms step:1159/1480 train_time:181406ms step_avg:157.88ms step:1160/1480 train_time:181572ms step_avg:157.89ms step:1161/1480 train_time:181744ms step_avg:157.90ms step:1162/1480 train_time:181914ms step_avg:157.91ms step:1163/1480 train_time:182083ms step_avg:157.92ms step:1164/1480 train_time:182252ms step_avg:157.93ms step:1165/1480 train_time:182418ms step_avg:157.94ms step:1166/1480 train_time:182586ms step_avg:157.95ms step:1167/1480 train_time:182756ms step_avg:157.96ms step:1168/1480 train_time:182923ms step_avg:157.96ms step:1169/1480 train_time:183090ms step_avg:157.97ms step:1170/1480 train_time:183260ms step_avg:157.98ms step:1171/1480 train_time:183427ms step_avg:157.99ms step:1172/1480 train_time:183595ms step_avg:158.00ms step:1173/1480 train_time:183767ms step_avg:158.01ms step:1174/1480 train_time:183948ms step_avg:158.03ms step:1175/1480 train_time:184121ms step_avg:158.04ms step:1176/1480 train_time:184292ms step_avg:158.06ms step:1177/1480 train_time:184470ms 
step_avg:158.07ms step:1178/1480 train_time:184637ms step_avg:158.08ms step:1179/1480 train_time:184803ms step_avg:158.09ms step:1180/1480 train_time:184981ms step_avg:158.10ms step:1181/1480 train_time:185150ms step_avg:158.11ms step:1182/1480 train_time:185320ms step_avg:158.12ms step:1183/1480 train_time:185493ms step_avg:158.14ms step:1184/1480 train_time:185662ms step_avg:158.14ms step:1185/1480 train_time:185833ms step_avg:158.16ms step:1186/1480 train_time:186005ms step_avg:158.17ms step:1187/1480 train_time:186187ms step_avg:158.19ms step:1188/1480 train_time:186354ms step_avg:158.19ms step:1189/1480 train_time:186526ms step_avg:158.21ms step:1190/1480 train_time:186693ms step_avg:158.21ms step:1191/1480 train_time:186865ms step_avg:158.23ms step:1192/1480 train_time:187031ms step_avg:158.23ms step:1193/1480 train_time:187199ms step_avg:158.24ms step:1194/1480 train_time:187367ms step_avg:158.25ms step:1195/1480 train_time:187541ms step_avg:158.26ms step:1196/1480 train_time:187723ms step_avg:158.28ms step:1197/1480 train_time:187893ms step_avg:158.29ms step:1198/1480 train_time:188077ms step_avg:158.31ms step:1199/1480 train_time:188248ms step_avg:158.32ms step:1200/1480 train_time:188415ms step_avg:158.33ms step:1201/1480 train_time:188583ms step_avg:158.34ms step:1202/1480 train_time:188765ms step_avg:158.36ms step:1203/1480 train_time:188943ms step_avg:158.38ms step:1204/1480 train_time:189116ms step_avg:158.39ms step:1205/1480 train_time:189283ms step_avg:158.40ms step:1206/1480 train_time:189450ms step_avg:158.40ms step:1207/1480 train_time:189619ms step_avg:158.41ms step:1208/1480 train_time:189787ms step_avg:158.42ms step:1209/1480 train_time:189963ms step_avg:158.43ms step:1210/1480 train_time:190139ms step_avg:158.45ms step:1211/1480 train_time:190311ms step_avg:158.46ms step:1212/1480 train_time:190483ms step_avg:158.47ms step:1213/1480 train_time:190656ms step_avg:158.48ms step:1214/1480 train_time:190832ms step_avg:158.50ms step:1215/1480 
train_time:191003ms step_avg:158.51ms step:1216/1480 train_time:191171ms step_avg:158.52ms step:1217/1480 train_time:191345ms step_avg:158.53ms step:1218/1480 train_time:191514ms step_avg:158.54ms step:1219/1480 train_time:191691ms step_avg:158.55ms step:1220/1480 train_time:191861ms step_avg:158.56ms step:1221/1480 train_time:192029ms step_avg:158.57ms step:1222/1480 train_time:192196ms step_avg:158.58ms step:1223/1480 train_time:192365ms step_avg:158.59ms step:1224/1480 train_time:192542ms step_avg:158.60ms step:1225/1480 train_time:192714ms step_avg:158.61ms step:1226/1480 train_time:192887ms step_avg:158.62ms step:1227/1480 train_time:193059ms step_avg:158.64ms step:1228/1480 train_time:193229ms step_avg:158.64ms step:1229/1480 train_time:193402ms step_avg:158.66ms step:1230/1480 train_time:193583ms step_avg:158.67ms step:1231/1480 train_time:193759ms step_avg:158.69ms step:1232/1480 train_time:193933ms step_avg:158.70ms step:1233/1480 train_time:194103ms step_avg:158.71ms step:1234/1480 train_time:194274ms step_avg:158.72ms step:1235/1480 train_time:194449ms step_avg:158.73ms step:1236/1480 train_time:194618ms step_avg:158.74ms step:1237/1480 train_time:194789ms step_avg:158.75ms step:1238/1480 train_time:194973ms step_avg:158.77ms step:1239/1480 train_time:195145ms step_avg:158.78ms step:1240/1480 train_time:195315ms step_avg:158.79ms step:1241/1480 train_time:195488ms step_avg:158.80ms step:1242/1480 train_time:195659ms step_avg:158.81ms step:1243/1480 train_time:195831ms step_avg:158.82ms step:1244/1480 train_time:195999ms step_avg:158.83ms step:1245/1480 train_time:196168ms step_avg:158.84ms step:1246/1480 train_time:196339ms step_avg:158.85ms step:1247/1480 train_time:196509ms step_avg:158.86ms step:1248/1480 train_time:196680ms step_avg:158.87ms step:1249/1480 train_time:196847ms step_avg:158.88ms step:1250/1480 train_time:197016ms step_avg:158.88ms step:1250/1480 val_loss:3.3343 train_time:197087ms step_avg:158.94ms step:1251/1480 train_time:197193ms 
step_avg:158.90ms step:1252/1480 train_time:197363ms step_avg:158.91ms step:1253/1480 train_time:197531ms step_avg:158.91ms step:1254/1480 train_time:197702ms step_avg:158.92ms step:1255/1480 train_time:197890ms step_avg:158.95ms step:1256/1480 train_time:198066ms step_avg:158.96ms step:1257/1480 train_time:198236ms step_avg:158.97ms step:1258/1480 train_time:198411ms step_avg:158.98ms step:1259/1480 train_time:198584ms step_avg:158.99ms step:1260/1480 train_time:198751ms step_avg:159.00ms step:1261/1480 train_time:198923ms step_avg:159.01ms step:1262/1480 train_time:199097ms step_avg:159.02ms step:1263/1480 train_time:199273ms step_avg:159.04ms step:1264/1480 train_time:199439ms step_avg:159.04ms step:1265/1480 train_time:199606ms step_avg:159.05ms step:1266/1480 train_time:199779ms step_avg:159.06ms step:1267/1480 train_time:199948ms step_avg:159.07ms step:1268/1480 train_time:200120ms step_avg:159.08ms step:1269/1480 train_time:200295ms step_avg:159.09ms step:1270/1480 train_time:200464ms step_avg:159.10ms step:1271/1480 train_time:200632ms step_avg:159.11ms step:1272/1480 train_time:200798ms step_avg:159.11ms step:1273/1480 train_time:200969ms step_avg:159.12ms step:1274/1480 train_time:201142ms step_avg:159.13ms step:1275/1480 train_time:201310ms step_avg:159.14ms step:1276/1480 train_time:201475ms step_avg:159.14ms step:1277/1480 train_time:201647ms step_avg:159.15ms step:1278/1480 train_time:201814ms step_avg:159.16ms step:1279/1480 train_time:201987ms step_avg:159.17ms step:1280/1480 train_time:202166ms step_avg:159.19ms step:1281/1480 train_time:202333ms step_avg:159.19ms step:1282/1480 train_time:202498ms step_avg:159.20ms step:1283/1480 train_time:202670ms step_avg:159.21ms step:1284/1480 train_time:202841ms step_avg:159.22ms step:1285/1480 train_time:203010ms step_avg:159.22ms step:1286/1480 train_time:203181ms step_avg:159.23ms step:1287/1480 train_time:203353ms step_avg:159.24ms step:1288/1480 train_time:203526ms step_avg:159.25ms step:1289/1480 
train_time:203709ms step_avg:159.27ms step:1290/1480 train_time:203889ms step_avg:159.29ms step:1291/1480 train_time:204061ms step_avg:159.30ms step:1292/1480 train_time:204235ms step_avg:159.31ms step:1293/1480 train_time:204410ms step_avg:159.32ms step:1294/1480 train_time:204583ms step_avg:159.33ms step:1295/1480 train_time:204754ms step_avg:159.34ms step:1296/1480 train_time:204927ms step_avg:159.35ms step:1297/1480 train_time:205099ms step_avg:159.36ms step:1298/1480 train_time:205271ms step_avg:159.37ms step:1299/1480 train_time:205443ms step_avg:159.38ms step:1300/1480 train_time:205609ms step_avg:159.39ms step:1301/1480 train_time:205779ms step_avg:159.40ms step:1302/1480 train_time:205952ms step_avg:159.41ms step:1303/1480 train_time:206129ms step_avg:159.42ms step:1304/1480 train_time:206304ms step_avg:159.43ms step:1305/1480 train_time:206473ms step_avg:159.44ms step:1306/1480 train_time:206649ms step_avg:159.45ms step:1307/1480 train_time:206817ms step_avg:159.46ms step:1308/1480 train_time:206987ms step_avg:159.47ms step:1309/1480 train_time:207160ms step_avg:159.48ms step:1310/1480 train_time:207328ms step_avg:159.48ms step:1311/1480 train_time:207495ms step_avg:159.49ms step:1312/1480 train_time:207669ms step_avg:159.50ms step:1313/1480 train_time:207838ms step_avg:159.51ms step:1314/1480 train_time:208010ms step_avg:159.52ms step:1315/1480 train_time:208181ms step_avg:159.53ms step:1316/1480 train_time:208348ms step_avg:159.53ms step:1317/1480 train_time:208520ms step_avg:159.54ms step:1318/1480 train_time:208701ms step_avg:159.56ms step:1319/1480 train_time:208878ms step_avg:159.57ms step:1320/1480 train_time:209055ms step_avg:159.58ms step:1321/1480 train_time:209228ms step_avg:159.59ms step:1322/1480 train_time:209409ms step_avg:159.61ms step:1323/1480 train_time:209581ms step_avg:159.62ms step:1324/1480 train_time:209756ms step_avg:159.63ms step:1325/1480 train_time:209941ms step_avg:159.65ms step:1326/1480 train_time:210115ms step_avg:159.66ms 
step:1327/1480 train_time:210285ms step_avg:159.67ms step:1328/1480 train_time:210454ms step_avg:159.68ms step:1329/1480 train_time:210652ms step_avg:159.71ms step:1330/1480 train_time:210831ms step_avg:159.72ms step:1331/1480 train_time:211001ms step_avg:159.73ms step:1332/1480 train_time:211174ms step_avg:159.74ms step:1333/1480 train_time:211349ms step_avg:159.75ms step:1334/1480 train_time:211521ms step_avg:159.76ms step:1335/1480 train_time:211690ms step_avg:159.77ms step:1336/1480 train_time:211872ms step_avg:159.78ms step:1337/1480 train_time:212047ms step_avg:159.79ms step:1338/1480 train_time:212220ms step_avg:159.80ms step:1339/1480 train_time:212393ms step_avg:159.81ms step:1340/1480 train_time:212565ms step_avg:159.82ms step:1341/1480 train_time:212733ms step_avg:159.83ms step:1342/1480 train_time:212906ms step_avg:159.84ms step:1343/1480 train_time:213077ms step_avg:159.85ms step:1344/1480 train_time:213249ms step_avg:159.86ms step:1345/1480 train_time:213426ms step_avg:159.87ms step:1346/1480 train_time:213593ms step_avg:159.88ms step:1347/1480 train_time:213765ms step_avg:159.88ms step:1348/1480 train_time:213933ms step_avg:159.89ms step:1349/1480 train_time:214103ms step_avg:159.90ms step:1350/1480 train_time:214277ms step_avg:159.91ms step:1351/1480 train_time:214447ms step_avg:159.92ms step:1352/1480 train_time:214619ms step_avg:159.92ms step:1353/1480 train_time:214795ms step_avg:159.94ms step:1354/1480 train_time:214967ms step_avg:159.95ms step:1355/1480 train_time:215136ms step_avg:159.95ms step:1356/1480 train_time:215309ms step_avg:159.96ms step:1357/1480 train_time:215482ms step_avg:159.97ms step:1358/1480 train_time:215653ms step_avg:159.98ms step:1359/1480 train_time:215826ms step_avg:159.99ms step:1360/1480 train_time:216000ms step_avg:160.00ms step:1361/1480 train_time:216177ms step_avg:160.01ms step:1362/1480 train_time:216351ms step_avg:160.02ms step:1363/1480 train_time:216531ms step_avg:160.04ms step:1364/1480 train_time:216700ms 
step_avg:160.04ms step:1365/1480 train_time:216867ms step_avg:160.05ms step:1366/1480 train_time:217039ms step_avg:160.06ms step:1367/1480 train_time:217209ms step_avg:160.07ms step:1368/1480 train_time:217383ms step_avg:160.08ms step:1369/1480 train_time:217564ms step_avg:160.09ms step:1370/1480 train_time:217741ms step_avg:160.10ms step:1371/1480 train_time:217911ms step_avg:160.11ms step:1372/1480 train_time:218089ms step_avg:160.12ms step:1373/1480 train_time:218258ms step_avg:160.13ms step:1374/1480 train_time:218432ms step_avg:160.14ms step:1375/1480 train_time:218604ms step_avg:160.15ms step:1375/1480 val_loss:3.2959 train_time:218672ms step_avg:160.20ms step:1376/1480 train_time:218778ms step_avg:160.16ms step:1377/1480 train_time:218951ms step_avg:160.17ms step:1378/1480 train_time:219120ms step_avg:160.18ms step:1379/1480 train_time:219293ms step_avg:160.18ms step:1380/1480 train_time:219466ms step_avg:160.19ms step:1381/1480 train_time:219645ms step_avg:160.21ms step:1382/1480 train_time:219817ms step_avg:160.22ms step:1383/1480 train_time:219989ms step_avg:160.23ms step:1384/1480 train_time:220167ms step_avg:160.24ms step:1385/1480 train_time:220332ms step_avg:160.24ms step:1386/1480 train_time:220503ms step_avg:160.25ms step:1387/1480 train_time:220676ms step_avg:160.26ms step:1388/1480 train_time:220843ms step_avg:160.26ms step:1389/1480 train_time:221017ms step_avg:160.27ms step:1390/1480 train_time:221184ms step_avg:160.28ms step:1391/1480 train_time:221355ms step_avg:160.29ms step:1392/1480 train_time:221525ms step_avg:160.29ms step:1393/1480 train_time:221698ms step_avg:160.30ms step:1394/1480 train_time:221867ms step_avg:160.31ms step:1395/1480 train_time:222037ms step_avg:160.32ms step:1396/1480 train_time:222205ms step_avg:160.32ms step:1397/1480 train_time:222374ms step_avg:160.33ms step:1398/1480 train_time:222540ms step_avg:160.33ms step:1399/1480 train_time:222709ms step_avg:160.34ms step:1400/1480 train_time:222886ms step_avg:160.35ms 
step:1401/1480 train_time:223052ms step_avg:160.35ms step:1402/1480 train_time:223224ms step_avg:160.36ms step:1403/1480 train_time:223402ms step_avg:160.37ms step:1404/1480 train_time:223573ms step_avg:160.38ms step:1405/1480 train_time:223746ms step_avg:160.39ms step:1406/1480 train_time:223920ms step_avg:160.40ms step:1407/1480 train_time:224087ms step_avg:160.41ms step:1408/1480 train_time:224257ms step_avg:160.41ms step:1409/1480 train_time:224439ms step_avg:160.43ms step:1410/1480 train_time:224606ms step_avg:160.43ms step:1411/1480 train_time:224776ms step_avg:160.44ms step:1412/1480 train_time:224945ms step_avg:160.45ms step:1413/1480 train_time:225116ms step_avg:160.45ms step:1414/1480 train_time:225287ms step_avg:160.46ms step:1415/1480 train_time:225463ms step_avg:160.47ms step:1416/1480 train_time:225649ms step_avg:160.49ms step:1417/1480 train_time:225823ms step_avg:160.50ms step:1418/1480 train_time:225995ms step_avg:160.51ms step:1419/1480 train_time:226168ms step_avg:160.52ms step:1420/1480 train_time:226342ms step_avg:160.53ms step:1421/1480 train_time:226515ms step_avg:160.54ms step:1422/1480 train_time:226686ms step_avg:160.54ms step:1423/1480 train_time:226855ms step_avg:160.55ms step:1424/1480 train_time:227031ms step_avg:160.56ms step:1425/1480 train_time:227210ms step_avg:160.57ms step:1426/1480 train_time:227382ms step_avg:160.58ms step:1427/1480 train_time:227558ms step_avg:160.59ms step:1428/1480 train_time:227730ms step_avg:160.60ms step:1429/1480 train_time:227899ms step_avg:160.61ms step:1430/1480 train_time:228073ms step_avg:160.61ms step:1431/1480 train_time:228247ms step_avg:160.62ms step:1432/1480 train_time:228424ms step_avg:160.64ms step:1433/1480 train_time:228604ms step_avg:160.65ms step:1434/1480 train_time:228785ms step_avg:160.66ms step:1435/1480 train_time:228960ms step_avg:160.67ms step:1436/1480 train_time:229134ms step_avg:160.68ms step:1437/1480 train_time:229305ms step_avg:160.69ms step:1438/1480 train_time:229475ms 
step_avg:160.70ms step:1439/1480 train_time:229647ms step_avg:160.70ms step:1440/1480 train_time:229818ms step_avg:160.71ms step:1441/1480 train_time:229991ms step_avg:160.72ms step:1442/1480 train_time:230168ms step_avg:160.73ms step:1443/1480 train_time:230358ms step_avg:160.75ms step:1444/1480 train_time:230529ms step_avg:160.76ms step:1445/1480 train_time:230701ms step_avg:160.77ms step:1446/1480 train_time:230876ms step_avg:160.78ms step:1447/1480 train_time:231053ms step_avg:160.79ms step:1448/1480 train_time:231225ms step_avg:160.80ms step:1449/1480 train_time:231399ms step_avg:160.81ms step:1450/1480 train_time:231571ms step_avg:160.81ms step:1451/1480 train_time:231742ms step_avg:160.82ms step:1452/1480 train_time:231915ms step_avg:160.83ms step:1453/1480 train_time:232084ms step_avg:160.83ms step:1454/1480 train_time:232257ms step_avg:160.84ms step:1455/1480 train_time:232436ms step_avg:160.86ms step:1456/1480 train_time:232608ms step_avg:160.86ms step:1457/1480 train_time:232780ms step_avg:160.87ms step:1458/1480 train_time:232951ms step_avg:160.88ms step:1459/1480 train_time:233130ms step_avg:160.89ms step:1460/1480 train_time:233302ms step_avg:160.90ms step:1461/1480 train_time:233478ms step_avg:160.91ms step:1462/1480 train_time:233649ms step_avg:160.92ms step:1463/1480 train_time:233826ms step_avg:160.93ms step:1464/1480 train_time:234001ms step_avg:160.94ms step:1465/1480 train_time:234173ms step_avg:160.94ms step:1466/1480 train_time:234344ms step_avg:160.95ms step:1467/1480 train_time:234518ms step_avg:160.96ms step:1468/1480 train_time:234687ms step_avg:160.97ms step:1469/1480 train_time:234860ms step_avg:160.97ms step:1470/1480 train_time:235039ms step_avg:160.99ms step:1471/1480 train_time:235225ms step_avg:161.00ms step:1472/1480 train_time:235404ms step_avg:161.02ms step:1473/1480 train_time:235576ms step_avg:161.02ms step:1474/1480 train_time:235754ms step_avg:161.03ms step:1475/1480 train_time:235934ms step_avg:161.05ms step:1476/1480 
train_time:236106ms step_avg:161.05ms step:1477/1480 train_time:236288ms step_avg:161.07ms step:1478/1480 train_time:236472ms step_avg:161.08ms step:1479/1480 train_time:236645ms step_avg:161.09ms step:1480/1480 train_time:236819ms step_avg:161.10ms step:1480/1480 val_loss:3.2773 train_time:236891ms step_avg:161.15ms