import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G via SVD: with U, S, V = G.svd(), return U @ V.T.

    `steps` is ignored; it is accepted so that this backend can be called with the same
    keyword arguments as the iterative backend.
    """
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: The matrix to orthogonalize; must be 2D (asserted).
        steps: Number of Newton-Schulz iterations to run.
        eps: Added to the norm before dividing, so an all-zero G does not divide by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # iterate on the orientation with rows <= cols, so that A = X @ X.T is the smaller square matrix
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose from above so the result has the shape of G
    if G.size(0) > G.size(1):
        X = X.T
    return X

# name -> orthogonalization function; Muon looks its `backend` option up in this dict
zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Distributed behaviour (as implemented below):
    - WORLD_SIZE and RANK are read from the environment in __init__ (KeyError if unset).
    - Parameters are bucketed into one param group per distinct numel; every group owns
      WORLD_SIZE flat bfloat16 buffers on "cuda" that receive the all_gather results.
    - step() asserts that each group's parameter count is divisible by WORLD_SIZE and uses a
      collective, so it must be called on every rank.

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        self.num_process = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ["RANK"])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        # materialize the iterable once: it is scanned again for every distinct size below
        params: "list[torch.Tensor]" = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one group per distinct element count, so all parameters of a group fit the same buffers
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                "params": [p for p in params if p.numel() == size],
                # one flat receive buffer per rank for dist.all_gather in step()
                "update_buffer": [
                    torch.empty(size, device="cuda", dtype=torch.bfloat16)
                    for _ in range(self.num_process)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter of every group.

        Within a group the parameters are processed in chunks of `num_process`: this rank
        orthogonalizes the update of params[base_i + rank], the flattened updates of all ranks
        are exchanged with an async all_gather, and the gathered updates are applied while the
        next chunk is being computed.

        Every processed parameter must have a gradient (asserted). Note that with nesterov=True
        the parameter's .grad tensor is overwritten in place.
        """
        for group in self.param_groups:
            lr: float = group["lr"]
            momentum: float = group["momentum"]
            nesterov: bool = group["nesterov"]
            zeropower_backend = zeropower_backends[group["backend"]]
            backend_steps: int = group["backend_steps"]
            update_buffers: "list[torch.Tensor]" = group["update_buffer"]
            # generate weight updates in distributed fashion
            params: "list[torch.Tensor]" = group["params"]
            assert len(params) % self.num_process == 0
            # state of the all_gather that is currently in flight (None until the first launch)
            handle = None
            params_world = None
            def update_prev():
                # Applies the previously launched all_gather. `handle` and `params_world` are
                # looked up in the enclosing scope at call time, so this always sees the values
                # assigned by the most recent loop iteration.
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                # update_buffers[r] holds the update computed by rank r, i.e. for params_world[r]
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # step size is lr * sqrt(max(1, rows / cols))
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.num_process]:
                # this rank is responsible for exactly one parameter of the current chunk
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if "momentum_buffer" not in state:
                    state["momentum_buffer"] = torch.zeros_like(g)
                buf: torch.Tensor = state["momentum_buffer"]
                # buf <- momentum * buf + (1 - momentum) * g
                buf.lerp_(g, 1 - momentum)
                # nesterov: g <- (1 - momentum) * g + momentum * buf, written in place into p.grad
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_backend(g, steps=backend_steps).flatten()
                # finish and apply the previous chunk before its receive buffers are reused
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.num_process]
            # apply the last chunk of this group
            update_prev()
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (F.rms_norm called without a weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free nn.Linear whose weight is cast to the input's dtype on every forward call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        # the stored weight keeps its own dtype; only the copy used for the matmul is cast
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied to the last dimension of a 4-D tensor.

    The sequence axis is dimension 1 and the rotated axis is dimension 3 (see forward); the
    attention module below calls this with tensors viewed as (B, T, n_head, head_dim).
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of channels: base ** (-2i / dim) for i in range(dim // 2)
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        # cos/sin tables are built lazily in forward and reused while the sequence length is unchanged
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        # NOTE(review): the cache is keyed on sequence length only, not on x.device
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # add singleton batch and head axes so the tables broadcast against x
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # the two halves of the last dimension are rotated against each other
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention using FlexAttention with QK-norm, rotary embeddings and a learned
    mix between the layer's own values and an externally supplied value embedding `vi`.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        x is (B, T, dim) with B == 1 (asserted); vi must be viewable as (B, T, n_head, head_dim);
        block_mask is forwarded to flex_attention. Returns a tensor shaped like x.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim), hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim, squared ReLU, 4*dim -> dim (output projection zero-initialized)."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: attention and MLP, each applied to a normed input and added residually.
    Before that, the input is re-mixed with the initial embedding x0 using two learned scalars.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialized to [1, 0], i.e. the block starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Model size configuration consumed by GPT and Block."""
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) with tanh in GPT.forward

class GPT(nn.Module):
    """
    GPT-2 style language model with a U-net layout: the first half of the blocks ("encoder")
    store their outputs, which the second half ("decoder") adds back in reverse order through
    learned skip weights. Token value embeddings (vte) are fed to every attention layer.

    NOTE(review): forward pops one skip connection per decoder layer, so an odd n_layer (which
    gives one more decoder than encoder layers) would pop from an empty list.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            # (one n_embd-wide slice per encoder layer; forward splits it with chunk())
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Return the cross-entropy loss of predicting `target` from the token sequence `idx`.

        idx: 1-D tensor of token ids (asserted). Its length must be a multiple of BLOCK_SIZE
            (128) because it is reshaped to (-1, BLOCK_SIZE) below.
        target: target token ids; flattened before the loss.
        sliding_window: attention window length in tokens, passed as a tensor.

        Attention is restricted to positions that are causal, inside the same document and
        closer than `sliding_window` tokens. The block mask tensors are created on "cuda".
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token id 50256 -> a document index per position
        # (presumably 50256 is the GPT-2 end-of-text token; confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        # document index at the first / last position of every BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask_mod handed to BlockMask; b and h are unused
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Build the block-level sparsity structure directly: a (q_block, kv_block) pair is
            # kept when it can contain at least one allowed token pair.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # the document ranges covered by the two blocks overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window length rounded up to whole blocks
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            # per query block: how many kv blocks are kept ...
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # ... and their indices first (stable descending sort of the 0/1 mask keeps them in ascending order)
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add leading singleton dims (presumably batch and head; see BlockMask.from_kv_blocks)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # tuple of num_encoder_layers value embeddings, each n_embd wide
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() pairs decoder layer i with the encoder output num_encoder_layers-1-i
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims (header[2])."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the header of `file` into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header: 256 int32 values
        nbytes = f.readinto(tokens.numpy()) # fills the tensor's memory in place
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over token shards for one rank of a data-parallel job.

    Every call to next_batch() returns T input tokens and the T targets shifted by one. Rank r
    starts at offset r * T inside a shard and moves forward by T * num_processes per call.
    Shards are visited in sorted filename order and wrap around after the last one.
    """
    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        # (the pattern is resolved relative to the current working directory)
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full step for all ranks (+1 for the shifted targets)
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # restart from the first shard: advance() increments -1 to 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) as CUDA tensors of dtype int32 / int64 and move the cursor."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Run configuration; instantiated once below as `args` with the defaults."""
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0 # NOTE(review): not passed to any optimizer in the visible script
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
# Only the master process owns a log file; on every other rank `logfile` stays None and
# print0() below is a no-op.
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    # The log file lives directly under logs/, so that directory has to exist before it is
    # opened; without this a fresh checkout fails with FileNotFoundError.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append the string `s` plus a newline to the run's log file, on the master process only.

    Unless `logonly` is true the string is also printed to stdout. On non-master ranks the
    call does nothing.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
# note: every rank runs nvidia-smi, but only the master process writes its output to the log
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop does exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the training loop fetches the next one after each backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
# run the model in bfloat16 on the GPU, except for the CastedLinear modules, which are converted
# back to float32 (their forward casts the weight to the activation dtype)
model = model.cuda().bfloat16()
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
# `config` here is torch._inductor.config (imported at the top of the file)
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# 1: both embedding tables, 2: lm_head, 3: 2-D matrices inside the blocks (Muon),
# 4: remaining <2-D block parameters plus the U-net skip weights
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the learning-rate multiplier for iteration `it` (used as the LambdaLR function).

    Linear warmup over args.warmup_iters, then 1.0, then a linear decay to 0 over the last
    args.cooldown_iters iterations.

    NOTE(review): with cooldown_iters == 0 the final branch divides by zero once `it` reaches
    num_iterations.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# The attention window is kept in a CUDA tensor that is updated in place, and a Python-side
# copy of the last value (sw_size_prev) is used to skip redundant updates.
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    # divisor for the step_avg printouts; NaN makes the average print as "nan" during the early steps
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # (grows linearly from 64 at step 0 to 1792 at the last step, rounded down to a multiple of 64)
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        # starts as a Python float and becomes a tensor after the first accumulation
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average over ranks, then over the validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE(review): the torch.save call below is commented out, so `log` is built but never written
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # (linear from 0.85 at step 0 to 0.95 at step 300, constant afterwards)
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # (no synchronize here, so this is only an approximate wall-clock reading)
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 11:57:32 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 44C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 44C P0 75W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 97W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 38C P0 77W / 700W | 19MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 44C P0 75W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 45C P0 95W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 37C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type 
Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22867ms step_avg:nanms step:2/1480 train_time:22964ms step_avg:nanms step:3/1480 train_time:23102ms step_avg:nanms step:4/1480 train_time:23243ms step_avg:nanms step:5/1480 train_time:23384ms step_avg:nanms step:6/1480 train_time:23526ms step_avg:nanms step:7/1480 train_time:23667ms step_avg:nanms step:8/1480 train_time:23810ms step_avg:nanms step:9/1480 train_time:23955ms step_avg:nanms step:10/1480 train_time:24096ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.62ms step:14/1480 train_time:566ms step_avg:141.61ms step:15/1480 train_time:709ms step_avg:141.87ms step:16/1480 train_time:853ms step_avg:142.21ms step:17/1480 train_time:997ms step_avg:142.45ms step:18/1480 train_time:1140ms step_avg:142.45ms step:19/1480 train_time:1281ms step_avg:142.36ms step:20/1480 train_time:1423ms step_avg:142.33ms step:21/1480 train_time:1565ms step_avg:142.26ms step:22/1480 train_time:1707ms step_avg:142.27ms step:23/1480 train_time:1851ms step_avg:142.40ms step:24/1480 train_time:1994ms step_avg:142.46ms step:25/1480 train_time:2139ms step_avg:142.58ms step:26/1480 train_time:2281ms step_avg:142.57ms step:27/1480 train_time:2423ms step_avg:142.55ms step:28/1480 train_time:2565ms step_avg:142.52ms step:29/1480 train_time:2708ms 
step_avg:142.54ms step:30/1480 train_time:2852ms step_avg:142.62ms step:31/1480 train_time:2994ms step_avg:142.58ms step:32/1480 train_time:3137ms step_avg:142.57ms step:33/1480 train_time:3279ms step_avg:142.57ms step:34/1480 train_time:3421ms step_avg:142.56ms step:35/1480 train_time:3563ms step_avg:142.53ms step:36/1480 train_time:3706ms step_avg:142.54ms step:37/1480 train_time:3851ms step_avg:142.62ms step:38/1480 train_time:3995ms step_avg:142.67ms step:39/1480 train_time:4137ms step_avg:142.67ms step:40/1480 train_time:4279ms step_avg:142.64ms step:41/1480 train_time:4422ms step_avg:142.64ms step:42/1480 train_time:4563ms step_avg:142.60ms step:43/1480 train_time:4707ms step_avg:142.65ms step:44/1480 train_time:4852ms step_avg:142.71ms step:45/1480 train_time:4996ms step_avg:142.74ms step:46/1480 train_time:5139ms step_avg:142.74ms step:47/1480 train_time:5280ms step_avg:142.70ms step:48/1480 train_time:5421ms step_avg:142.65ms step:49/1480 train_time:5561ms step_avg:142.59ms step:50/1480 train_time:5702ms step_avg:142.56ms step:51/1480 train_time:5844ms step_avg:142.54ms step:52/1480 train_time:5989ms step_avg:142.59ms step:53/1480 train_time:6133ms step_avg:142.63ms step:54/1480 train_time:6275ms step_avg:142.62ms step:55/1480 train_time:6417ms step_avg:142.61ms step:56/1480 train_time:6559ms step_avg:142.58ms step:57/1480 train_time:6702ms step_avg:142.59ms step:58/1480 train_time:6845ms step_avg:142.60ms step:59/1480 train_time:6989ms step_avg:142.62ms step:60/1480 train_time:7132ms step_avg:142.64ms step:61/1480 train_time:7275ms step_avg:142.64ms step:62/1480 train_time:7418ms step_avg:142.65ms step:63/1480 train_time:7559ms step_avg:142.62ms step:64/1480 train_time:7700ms step_avg:142.59ms step:65/1480 train_time:7843ms step_avg:142.61ms step:66/1480 train_time:7989ms step_avg:142.67ms step:67/1480 train_time:8135ms step_avg:142.73ms step:68/1480 train_time:8277ms step_avg:142.70ms step:69/1480 train_time:8418ms step_avg:142.69ms step:70/1480 
train_time:8559ms step_avg:142.66ms step:71/1480 train_time:8702ms step_avg:142.65ms step:72/1480 train_time:8844ms step_avg:142.65ms step:73/1480 train_time:8988ms step_avg:142.67ms step:74/1480 train_time:9133ms step_avg:142.70ms step:75/1480 train_time:9276ms step_avg:142.71ms step:76/1480 train_time:9419ms step_avg:142.71ms step:77/1480 train_time:9560ms step_avg:142.69ms step:78/1480 train_time:9704ms step_avg:142.71ms step:79/1480 train_time:9848ms step_avg:142.73ms step:80/1480 train_time:9991ms step_avg:142.73ms step:81/1480 train_time:10134ms step_avg:142.74ms step:82/1480 train_time:10276ms step_avg:142.73ms step:83/1480 train_time:10419ms step_avg:142.72ms step:84/1480 train_time:10560ms step_avg:142.70ms step:85/1480 train_time:10701ms step_avg:142.68ms step:86/1480 train_time:10844ms step_avg:142.68ms step:87/1480 train_time:10987ms step_avg:142.69ms step:88/1480 train_time:11132ms step_avg:142.71ms step:89/1480 train_time:11277ms step_avg:142.74ms step:90/1480 train_time:11419ms step_avg:142.74ms step:91/1480 train_time:11559ms step_avg:142.71ms step:92/1480 train_time:11701ms step_avg:142.70ms step:93/1480 train_time:11844ms step_avg:142.70ms step:94/1480 train_time:11987ms step_avg:142.70ms step:95/1480 train_time:12131ms step_avg:142.72ms step:96/1480 train_time:12274ms step_avg:142.72ms step:97/1480 train_time:12416ms step_avg:142.72ms step:98/1480 train_time:12557ms step_avg:142.70ms step:99/1480 train_time:12698ms step_avg:142.68ms step:100/1480 train_time:12841ms step_avg:142.68ms step:101/1480 train_time:12985ms step_avg:142.69ms step:102/1480 train_time:13128ms step_avg:142.70ms step:103/1480 train_time:13271ms step_avg:142.69ms step:104/1480 train_time:13414ms step_avg:142.70ms step:105/1480 train_time:13555ms step_avg:142.69ms step:106/1480 train_time:13696ms step_avg:142.67ms step:107/1480 train_time:13839ms step_avg:142.67ms step:108/1480 train_time:13983ms step_avg:142.68ms step:109/1480 train_time:14126ms step_avg:142.68ms step:110/1480 
train_time:14269ms step_avg:142.69ms step:111/1480 train_time:14415ms step_avg:142.72ms step:112/1480 train_time:14560ms step_avg:142.75ms step:113/1480 train_time:14707ms step_avg:142.78ms step:114/1480 train_time:14855ms step_avg:142.83ms step:115/1480 train_time:15002ms step_avg:142.87ms step:116/1480 train_time:15149ms step_avg:142.91ms step:117/1480 train_time:15296ms step_avg:142.95ms step:118/1480 train_time:15443ms step_avg:142.99ms step:119/1480 train_time:15588ms step_avg:143.01ms step:120/1480 train_time:15735ms step_avg:143.04ms step:121/1480 train_time:15881ms step_avg:143.07ms step:122/1480 train_time:16027ms step_avg:143.10ms step:123/1480 train_time:16175ms step_avg:143.14ms step:124/1480 train_time:16322ms step_avg:143.17ms step:125/1480 train_time:16468ms step_avg:143.20ms step:125/1480 val_loss:4.4102 train_time:16526ms step_avg:143.70ms step:126/1480 train_time:16623ms step_avg:143.30ms step:127/1480 train_time:16772ms step_avg:143.35ms step:128/1480 train_time:16919ms step_avg:143.38ms step:129/1480 train_time:17065ms step_avg:143.41ms step:130/1480 train_time:17210ms step_avg:143.42ms step:131/1480 train_time:17356ms step_avg:143.44ms step:132/1480 train_time:17502ms step_avg:143.46ms step:133/1480 train_time:17651ms step_avg:143.50ms step:134/1480 train_time:17799ms step_avg:143.54ms step:135/1480 train_time:17947ms step_avg:143.57ms step:136/1480 train_time:18092ms step_avg:143.59ms step:137/1480 train_time:18240ms step_avg:143.62ms step:138/1480 train_time:18385ms step_avg:143.64ms step:139/1480 train_time:18531ms step_avg:143.65ms step:140/1480 train_time:18680ms step_avg:143.69ms step:141/1480 train_time:18827ms step_avg:143.72ms step:142/1480 train_time:18974ms step_avg:143.74ms step:143/1480 train_time:19120ms step_avg:143.76ms step:144/1480 train_time:19267ms step_avg:143.79ms step:145/1480 train_time:19414ms step_avg:143.81ms step:146/1480 train_time:19561ms step_avg:143.83ms step:147/1480 train_time:19707ms step_avg:143.85ms 
step:148/1480 train_time:19855ms step_avg:143.88ms step:149/1480 train_time:20003ms step_avg:143.90ms step:150/1480 train_time:20150ms step_avg:143.93ms step:151/1480 train_time:20297ms step_avg:143.95ms step:152/1480 train_time:20445ms step_avg:143.98ms step:153/1480 train_time:20591ms step_avg:143.99ms step:154/1480 train_time:20740ms step_avg:144.03ms step:155/1480 train_time:20887ms step_avg:144.05ms step:156/1480 train_time:21035ms step_avg:144.07ms step:157/1480 train_time:21182ms step_avg:144.10ms step:158/1480 train_time:21328ms step_avg:144.11ms step:159/1480 train_time:21474ms step_avg:144.12ms step:160/1480 train_time:21621ms step_avg:144.14ms step:161/1480 train_time:21768ms step_avg:144.16ms step:162/1480 train_time:21917ms step_avg:144.19ms step:163/1480 train_time:22065ms step_avg:144.22ms step:164/1480 train_time:22212ms step_avg:144.23ms step:165/1480 train_time:22359ms step_avg:144.25ms step:166/1480 train_time:22506ms step_avg:144.27ms step:167/1480 train_time:22654ms step_avg:144.29ms step:168/1480 train_time:22801ms step_avg:144.31ms step:169/1480 train_time:22948ms step_avg:144.33ms step:170/1480 train_time:23095ms step_avg:144.34ms step:171/1480 train_time:23242ms step_avg:144.36ms step:172/1480 train_time:23388ms step_avg:144.37ms step:173/1480 train_time:23537ms step_avg:144.40ms step:174/1480 train_time:23685ms step_avg:144.42ms step:175/1480 train_time:23832ms step_avg:144.44ms step:176/1480 train_time:23981ms step_avg:144.47ms step:177/1480 train_time:24128ms step_avg:144.48ms step:178/1480 train_time:24276ms step_avg:144.50ms step:179/1480 train_time:24423ms step_avg:144.52ms step:180/1480 train_time:24570ms step_avg:144.53ms step:181/1480 train_time:24718ms step_avg:144.55ms step:182/1480 train_time:24865ms step_avg:144.57ms step:183/1480 train_time:25010ms step_avg:144.57ms step:184/1480 train_time:25158ms step_avg:144.58ms step:185/1480 train_time:25304ms step_avg:144.60ms step:186/1480 train_time:25450ms step_avg:144.60ms 
step:187/1480 train_time:25598ms step_avg:144.62ms step:188/1480 train_time:25746ms step_avg:144.64ms step:189/1480 train_time:25892ms step_avg:144.65ms step:190/1480 train_time:26041ms step_avg:144.67ms step:191/1480 train_time:26188ms step_avg:144.68ms step:192/1480 train_time:26335ms step_avg:144.70ms step:193/1480 train_time:26482ms step_avg:144.71ms step:194/1480 train_time:26627ms step_avg:144.71ms step:195/1480 train_time:26774ms step_avg:144.73ms step:196/1480 train_time:26921ms step_avg:144.74ms step:197/1480 train_time:27068ms step_avg:144.75ms step:198/1480 train_time:27216ms step_avg:144.77ms step:199/1480 train_time:27364ms step_avg:144.78ms step:200/1480 train_time:27510ms step_avg:144.79ms step:201/1480 train_time:27658ms step_avg:144.81ms step:202/1480 train_time:27805ms step_avg:144.82ms step:203/1480 train_time:27951ms step_avg:144.82ms step:204/1480 train_time:28098ms step_avg:144.83ms step:205/1480 train_time:28246ms step_avg:144.85ms step:206/1480 train_time:28393ms step_avg:144.86ms step:207/1480 train_time:28540ms step_avg:144.87ms step:208/1480 train_time:28686ms step_avg:144.88ms step:209/1480 train_time:28834ms step_avg:144.89ms step:210/1480 train_time:28982ms step_avg:144.91ms step:211/1480 train_time:29127ms step_avg:144.91ms step:212/1480 train_time:29275ms step_avg:144.93ms step:213/1480 train_time:29423ms step_avg:144.94ms step:214/1480 train_time:29570ms step_avg:144.95ms step:215/1480 train_time:29718ms step_avg:144.97ms step:216/1480 train_time:29865ms step_avg:144.98ms step:217/1480 train_time:30011ms step_avg:144.98ms step:218/1480 train_time:30160ms step_avg:145.00ms step:219/1480 train_time:30307ms step_avg:145.01ms step:220/1480 train_time:30455ms step_avg:145.03ms step:221/1480 train_time:30604ms step_avg:145.04ms step:222/1480 train_time:30754ms step_avg:145.07ms step:223/1480 train_time:30905ms step_avg:145.09ms step:224/1480 train_time:31055ms step_avg:145.12ms step:225/1480 train_time:31206ms step_avg:145.14ms 
step:226/1480 train_time:31358ms step_avg:145.18ms step:227/1480 train_time:31508ms step_avg:145.20ms step:228/1480 train_time:31659ms step_avg:145.22ms step:229/1480 train_time:31810ms step_avg:145.25ms step:230/1480 train_time:31961ms step_avg:145.28ms step:231/1480 train_time:32111ms step_avg:145.30ms step:232/1480 train_time:32262ms step_avg:145.32ms step:233/1480 train_time:32412ms step_avg:145.34ms step:234/1480 train_time:32564ms step_avg:145.37ms step:235/1480 train_time:32715ms step_avg:145.40ms step:236/1480 train_time:32866ms step_avg:145.42ms step:237/1480 train_time:33017ms step_avg:145.45ms step:238/1480 train_time:33168ms step_avg:145.47ms step:239/1480 train_time:33319ms step_avg:145.50ms step:240/1480 train_time:33469ms step_avg:145.52ms step:241/1480 train_time:33620ms step_avg:145.54ms step:242/1480 train_time:33770ms step_avg:145.56ms step:243/1480 train_time:33921ms step_avg:145.58ms step:244/1480 train_time:34071ms step_avg:145.60ms step:245/1480 train_time:34223ms step_avg:145.63ms step:246/1480 train_time:34374ms step_avg:145.65ms step:247/1480 train_time:34524ms step_avg:145.67ms step:248/1480 train_time:34676ms step_avg:145.70ms step:249/1480 train_time:34826ms step_avg:145.72ms step:250/1480 train_time:34977ms step_avg:145.74ms step:250/1480 val_loss:3.9844 train_time:35036ms step_avg:145.98ms step:251/1480 train_time:35132ms step_avg:145.78ms step:252/1480 train_time:35284ms step_avg:145.80ms step:253/1480 train_time:35434ms step_avg:145.82ms step:254/1480 train_time:35583ms step_avg:145.83ms step:255/1480 train_time:35732ms step_avg:145.85ms step:256/1480 train_time:35882ms step_avg:145.86ms step:257/1480 train_time:36034ms step_avg:145.88ms step:258/1480 train_time:36187ms step_avg:145.91ms step:259/1480 train_time:36338ms step_avg:145.94ms step:260/1480 train_time:36490ms step_avg:145.96ms step:261/1480 train_time:36640ms step_avg:145.97ms step:262/1480 train_time:36790ms step_avg:145.99ms step:263/1480 train_time:36940ms 
step_avg:146.01ms step:264/1480 train_time:37091ms step_avg:146.03ms step:265/1480 train_time:37243ms step_avg:146.05ms step:266/1480 train_time:37394ms step_avg:146.07ms step:267/1480 train_time:37544ms step_avg:146.09ms step:268/1480 train_time:37695ms step_avg:146.10ms step:269/1480 train_time:37845ms step_avg:146.12ms step:270/1480 train_time:37996ms step_avg:146.14ms step:271/1480 train_time:38148ms step_avg:146.16ms step:272/1480 train_time:38298ms step_avg:146.18ms step:273/1480 train_time:38450ms step_avg:146.20ms step:274/1480 train_time:38600ms step_avg:146.21ms step:275/1480 train_time:38751ms step_avg:146.23ms step:276/1480 train_time:38901ms step_avg:146.24ms step:277/1480 train_time:39051ms step_avg:146.26ms step:278/1480 train_time:39203ms step_avg:146.28ms step:279/1480 train_time:39354ms step_avg:146.30ms step:280/1480 train_time:39504ms step_avg:146.31ms step:281/1480 train_time:39654ms step_avg:146.33ms step:282/1480 train_time:39804ms step_avg:146.34ms step:283/1480 train_time:39955ms step_avg:146.35ms step:284/1480 train_time:40104ms step_avg:146.37ms step:285/1480 train_time:40254ms step_avg:146.38ms step:286/1480 train_time:40405ms step_avg:146.39ms step:287/1480 train_time:40555ms step_avg:146.41ms step:288/1480 train_time:40706ms step_avg:146.43ms step:289/1480 train_time:40856ms step_avg:146.44ms step:290/1480 train_time:41005ms step_avg:146.45ms step:291/1480 train_time:41157ms step_avg:146.46ms step:292/1480 train_time:41308ms step_avg:146.48ms step:293/1480 train_time:41457ms step_avg:146.49ms step:294/1480 train_time:41608ms step_avg:146.51ms step:295/1480 train_time:41758ms step_avg:146.52ms step:296/1480 train_time:41909ms step_avg:146.53ms step:297/1480 train_time:42059ms step_avg:146.55ms step:298/1480 train_time:42209ms step_avg:146.56ms step:299/1480 train_time:42360ms step_avg:146.57ms step:300/1480 train_time:42511ms step_avg:146.59ms step:301/1480 train_time:42661ms step_avg:146.60ms step:302/1480 train_time:42811ms 
step_avg:146.61ms step:303/1480 train_time:42962ms step_avg:146.63ms step:304/1480 train_time:43113ms step_avg:146.64ms step:305/1480 train_time:43264ms step_avg:146.66ms step:306/1480 train_time:43413ms step_avg:146.67ms step:307/1480 train_time:43563ms step_avg:146.68ms step:308/1480 train_time:43714ms step_avg:146.69ms step:309/1480 train_time:43864ms step_avg:146.70ms step:310/1480 train_time:44014ms step_avg:146.71ms step:311/1480 train_time:44166ms step_avg:146.73ms step:312/1480 train_time:44316ms step_avg:146.74ms step:313/1480 train_time:44467ms step_avg:146.75ms step:314/1480 train_time:44617ms step_avg:146.77ms step:315/1480 train_time:44768ms step_avg:146.78ms step:316/1480 train_time:44917ms step_avg:146.79ms step:317/1480 train_time:45068ms step_avg:146.80ms step:318/1480 train_time:45218ms step_avg:146.81ms step:319/1480 train_time:45368ms step_avg:146.82ms step:320/1480 train_time:45518ms step_avg:146.83ms step:321/1480 train_time:45669ms step_avg:146.84ms step:322/1480 train_time:45818ms step_avg:146.85ms step:323/1480 train_time:45969ms step_avg:146.87ms step:324/1480 train_time:46120ms step_avg:146.88ms step:325/1480 train_time:46270ms step_avg:146.89ms step:326/1480 train_time:46420ms step_avg:146.90ms step:327/1480 train_time:46570ms step_avg:146.91ms step:328/1480 train_time:46720ms step_avg:146.92ms step:329/1480 train_time:46870ms step_avg:146.93ms step:330/1480 train_time:47023ms step_avg:146.95ms step:331/1480 train_time:47177ms step_avg:146.97ms step:332/1480 train_time:47330ms step_avg:146.99ms step:333/1480 train_time:47486ms step_avg:147.01ms step:334/1480 train_time:47638ms step_avg:147.03ms step:335/1480 train_time:47792ms step_avg:147.05ms step:336/1480 train_time:47945ms step_avg:147.07ms step:337/1480 train_time:48099ms step_avg:147.09ms step:338/1480 train_time:48254ms step_avg:147.12ms step:339/1480 train_time:48406ms step_avg:147.13ms step:340/1480 train_time:48558ms step_avg:147.15ms step:341/1480 train_time:48712ms 
step_avg:147.17ms step:342/1480 train_time:48866ms step_avg:147.19ms step:343/1480 train_time:49021ms step_avg:147.21ms step:344/1480 train_time:49176ms step_avg:147.23ms step:345/1480 train_time:49330ms step_avg:147.25ms step:346/1480 train_time:49485ms step_avg:147.28ms step:347/1480 train_time:49638ms step_avg:147.29ms step:348/1480 train_time:49792ms step_avg:147.31ms step:349/1480 train_time:49945ms step_avg:147.33ms step:350/1480 train_time:50099ms step_avg:147.35ms step:351/1480 train_time:50255ms step_avg:147.37ms step:352/1480 train_time:50408ms step_avg:147.39ms step:353/1480 train_time:50562ms step_avg:147.41ms step:354/1480 train_time:50714ms step_avg:147.43ms step:355/1480 train_time:50868ms step_avg:147.44ms step:356/1480 train_time:51022ms step_avg:147.46ms step:357/1480 train_time:51176ms step_avg:147.48ms step:358/1480 train_time:51330ms step_avg:147.50ms step:359/1480 train_time:51486ms step_avg:147.52ms step:360/1480 train_time:51640ms step_avg:147.54ms step:361/1480 train_time:51793ms step_avg:147.56ms step:362/1480 train_time:51947ms step_avg:147.58ms step:363/1480 train_time:52100ms step_avg:147.59ms step:364/1480 train_time:52255ms step_avg:147.61ms step:365/1480 train_time:52410ms step_avg:147.63ms step:366/1480 train_time:52564ms step_avg:147.65ms step:367/1480 train_time:52717ms step_avg:147.67ms step:368/1480 train_time:52871ms step_avg:147.68ms step:369/1480 train_time:53023ms step_avg:147.70ms step:370/1480 train_time:53177ms step_avg:147.71ms step:371/1480 train_time:53332ms step_avg:147.73ms step:372/1480 train_time:53488ms step_avg:147.76ms step:373/1480 train_time:53642ms step_avg:147.77ms step:374/1480 train_time:53794ms step_avg:147.79ms step:375/1480 train_time:53948ms step_avg:147.80ms step:375/1480 val_loss:3.8030 train_time:54008ms step_avg:147.97ms step:376/1480 train_time:54106ms step_avg:147.83ms step:377/1480 train_time:54262ms step_avg:147.85ms step:378/1480 train_time:54415ms step_avg:147.87ms step:379/1480 
train_time:54568ms step_avg:147.88ms step:380/1480 train_time:54720ms step_avg:147.89ms step:381/1480 train_time:54871ms step_avg:147.90ms step:382/1480 train_time:55025ms step_avg:147.92ms step:383/1480 train_time:55183ms step_avg:147.94ms step:384/1480 train_time:55335ms step_avg:147.96ms step:385/1480 train_time:55488ms step_avg:147.97ms step:386/1480 train_time:55641ms step_avg:147.98ms step:387/1480 train_time:55794ms step_avg:147.99ms step:388/1480 train_time:55948ms step_avg:148.01ms step:389/1480 train_time:56103ms step_avg:148.03ms step:390/1480 train_time:56257ms step_avg:148.04ms step:391/1480 train_time:56411ms step_avg:148.06ms step:392/1480 train_time:56564ms step_avg:148.07ms step:393/1480 train_time:56719ms step_avg:148.09ms step:394/1480 train_time:56872ms step_avg:148.10ms step:395/1480 train_time:57026ms step_avg:148.12ms step:396/1480 train_time:57180ms step_avg:148.13ms step:397/1480 train_time:57334ms step_avg:148.15ms step:398/1480 train_time:57488ms step_avg:148.17ms step:399/1480 train_time:57642ms step_avg:148.18ms step:400/1480 train_time:57795ms step_avg:148.19ms step:401/1480 train_time:57949ms step_avg:148.21ms step:402/1480 train_time:58101ms step_avg:148.22ms step:403/1480 train_time:58256ms step_avg:148.23ms step:404/1480 train_time:58409ms step_avg:148.25ms step:405/1480 train_time:58564ms step_avg:148.26ms step:406/1480 train_time:58718ms step_avg:148.28ms step:407/1480 train_time:58872ms step_avg:148.29ms step:408/1480 train_time:59026ms step_avg:148.31ms step:409/1480 train_time:59179ms step_avg:148.32ms step:410/1480 train_time:59331ms step_avg:148.33ms step:411/1480 train_time:59486ms step_avg:148.34ms step:412/1480 train_time:59641ms step_avg:148.36ms step:413/1480 train_time:59793ms step_avg:148.37ms step:414/1480 train_time:59948ms step_avg:148.39ms step:415/1480 train_time:60102ms step_avg:148.40ms step:416/1480 train_time:60256ms step_avg:148.41ms step:417/1480 train_time:60410ms step_avg:148.43ms step:418/1480 
train_time:60564ms step_avg:148.44ms step:419/1480 train_time:60717ms step_avg:148.45ms step:420/1480 train_time:60870ms step_avg:148.46ms step:421/1480 train_time:61023ms step_avg:148.48ms step:422/1480 train_time:61178ms step_avg:148.49ms step:423/1480 train_time:61330ms step_avg:148.50ms step:424/1480 train_time:61484ms step_avg:148.51ms step:425/1480 train_time:61639ms step_avg:148.53ms step:426/1480 train_time:61792ms step_avg:148.54ms step:427/1480 train_time:61945ms step_avg:148.55ms step:428/1480 train_time:62098ms step_avg:148.56ms step:429/1480 train_time:62252ms step_avg:148.57ms step:430/1480 train_time:62406ms step_avg:148.58ms step:431/1480 train_time:62560ms step_avg:148.60ms step:432/1480 train_time:62713ms step_avg:148.61ms step:433/1480 train_time:62867ms step_avg:148.62ms step:434/1480 train_time:63020ms step_avg:148.63ms step:435/1480 train_time:63174ms step_avg:148.64ms step:436/1480 train_time:63328ms step_avg:148.66ms step:437/1480 train_time:63483ms step_avg:148.67ms step:438/1480 train_time:63637ms step_avg:148.68ms step:439/1480 train_time:63790ms step_avg:148.70ms step:440/1480 train_time:63946ms step_avg:148.71ms step:441/1480 train_time:64104ms step_avg:148.73ms step:442/1480 train_time:64264ms step_avg:148.76ms step:443/1480 train_time:64421ms step_avg:148.78ms step:444/1480 train_time:64577ms step_avg:148.80ms step:445/1480 train_time:64732ms step_avg:148.81ms step:446/1480 train_time:64889ms step_avg:148.83ms step:447/1480 train_time:65044ms step_avg:148.84ms step:448/1480 train_time:65201ms step_avg:148.86ms step:449/1480 train_time:65361ms step_avg:148.89ms step:450/1480 train_time:65518ms step_avg:148.91ms step:451/1480 train_time:65678ms step_avg:148.93ms step:452/1480 train_time:65835ms step_avg:148.95ms step:453/1480 train_time:65990ms step_avg:148.96ms step:454/1480 train_time:66146ms step_avg:148.98ms step:455/1480 train_time:66303ms step_avg:149.00ms step:456/1480 train_time:66461ms step_avg:149.02ms step:457/1480 
train_time:66617ms step_avg:149.03ms step:458/1480 train_time:66773ms step_avg:149.05ms step:459/1480 train_time:66930ms step_avg:149.07ms step:460/1480 train_time:67086ms step_avg:149.08ms step:461/1480 train_time:67244ms step_avg:149.10ms step:462/1480 train_time:67401ms step_avg:149.12ms step:463/1480 train_time:67560ms step_avg:149.14ms step:464/1480 train_time:67718ms step_avg:149.16ms step:465/1480 train_time:67873ms step_avg:149.17ms step:466/1480 train_time:68028ms step_avg:149.18ms step:467/1480 train_time:68187ms step_avg:149.21ms step:468/1480 train_time:68344ms step_avg:149.22ms step:469/1480 train_time:68500ms step_avg:149.24ms step:470/1480 train_time:68655ms step_avg:149.25ms step:471/1480 train_time:68812ms step_avg:149.27ms step:472/1480 train_time:68969ms step_avg:149.28ms step:473/1480 train_time:69125ms step_avg:149.30ms step:474/1480 train_time:69284ms step_avg:149.32ms step:475/1480 train_time:69441ms step_avg:149.34ms step:476/1480 train_time:69598ms step_avg:149.35ms step:477/1480 train_time:69756ms step_avg:149.37ms step:478/1480 train_time:69912ms step_avg:149.38ms step:479/1480 train_time:70068ms step_avg:149.40ms step:480/1480 train_time:70226ms step_avg:149.42ms step:481/1480 train_time:70385ms step_avg:149.44ms step:482/1480 train_time:70542ms step_avg:149.45ms step:483/1480 train_time:70698ms step_avg:149.47ms step:484/1480 train_time:70855ms step_avg:149.48ms step:485/1480 train_time:71011ms step_avg:149.50ms step:486/1480 train_time:71167ms step_avg:149.51ms step:487/1480 train_time:71324ms step_avg:149.53ms step:488/1480 train_time:71483ms step_avg:149.55ms step:489/1480 train_time:71640ms step_avg:149.56ms step:490/1480 train_time:71795ms step_avg:149.57ms step:491/1480 train_time:71953ms step_avg:149.59ms step:492/1480 train_time:72109ms step_avg:149.60ms step:493/1480 train_time:72265ms step_avg:149.62ms step:494/1480 train_time:72423ms step_avg:149.64ms step:495/1480 train_time:72583ms step_avg:149.65ms step:496/1480 
train_time:72740ms step_avg:149.67ms step:497/1480 train_time:72896ms step_avg:149.68ms step:498/1480 train_time:73052ms step_avg:149.70ms step:499/1480 train_time:73210ms step_avg:149.71ms step:500/1480 train_time:73368ms step_avg:149.73ms step:500/1480 val_loss:3.6817 train_time:73428ms step_avg:149.85ms step:501/1480 train_time:73527ms step_avg:149.75ms step:502/1480 train_time:73684ms step_avg:149.76ms step:503/1480 train_time:73841ms step_avg:149.78ms step:504/1480 train_time:73997ms step_avg:149.79ms step:505/1480 train_time:74151ms step_avg:149.80ms step:506/1480 train_time:74308ms step_avg:149.81ms step:507/1480 train_time:74466ms step_avg:149.83ms step:508/1480 train_time:74625ms step_avg:149.85ms step:509/1480 train_time:74782ms step_avg:149.86ms step:510/1480 train_time:74939ms step_avg:149.88ms step:511/1480 train_time:75096ms step_avg:149.89ms step:512/1480 train_time:75252ms step_avg:149.91ms step:513/1480 train_time:75409ms step_avg:149.92ms step:514/1480 train_time:75567ms step_avg:149.93ms step:515/1480 train_time:75725ms step_avg:149.95ms step:516/1480 train_time:75884ms step_avg:149.97ms step:517/1480 train_time:76043ms step_avg:149.99ms step:518/1480 train_time:76200ms step_avg:150.00ms step:519/1480 train_time:76358ms step_avg:150.01ms step:520/1480 train_time:76515ms step_avg:150.03ms step:521/1480 train_time:76671ms step_avg:150.04ms step:522/1480 train_time:76828ms step_avg:150.05ms step:523/1480 train_time:76985ms step_avg:150.07ms step:524/1480 train_time:77143ms step_avg:150.08ms step:525/1480 train_time:77300ms step_avg:150.10ms step:526/1480 train_time:77459ms step_avg:150.11ms step:527/1480 train_time:77615ms step_avg:150.13ms step:528/1480 train_time:77771ms step_avg:150.14ms step:529/1480 train_time:77927ms step_avg:150.15ms step:530/1480 train_time:78083ms step_avg:150.16ms step:531/1480 train_time:78240ms step_avg:150.17ms step:532/1480 train_time:78395ms step_avg:150.18ms step:533/1480 train_time:78552ms step_avg:150.20ms 
step:534/1480 train_time:78708ms step_avg:150.21ms step:535/1480 train_time:78866ms step_avg:150.22ms step:536/1480 train_time:79025ms step_avg:150.24ms step:537/1480 train_time:79182ms step_avg:150.25ms step:538/1480 train_time:79339ms step_avg:150.26ms step:539/1480 train_time:79497ms step_avg:150.28ms step:540/1480 train_time:79654ms step_avg:150.29ms step:541/1480 train_time:79809ms step_avg:150.30ms step:542/1480 train_time:79965ms step_avg:150.31ms step:543/1480 train_time:80123ms step_avg:150.32ms step:544/1480 train_time:80280ms step_avg:150.34ms step:545/1480 train_time:80435ms step_avg:150.35ms step:546/1480 train_time:80591ms step_avg:150.36ms step:547/1480 train_time:80748ms step_avg:150.37ms step:548/1480 train_time:80906ms step_avg:150.38ms step:549/1480 train_time:81064ms step_avg:150.40ms step:550/1480 train_time:81223ms step_avg:150.41ms step:551/1480 train_time:81381ms step_avg:150.43ms step:552/1480 train_time:81540ms step_avg:150.44ms step:553/1480 train_time:81701ms step_avg:150.46ms step:554/1480 train_time:81861ms step_avg:150.48ms step:555/1480 train_time:82022ms step_avg:150.50ms step:556/1480 train_time:82181ms step_avg:150.51ms step:557/1480 train_time:82342ms step_avg:150.53ms step:558/1480 train_time:82501ms step_avg:150.55ms step:559/1480 train_time:82661ms step_avg:150.57ms step:560/1480 train_time:82821ms step_avg:150.58ms step:561/1480 train_time:82980ms step_avg:150.60ms step:562/1480 train_time:83141ms step_avg:150.62ms step:563/1480 train_time:83300ms step_avg:150.63ms step:564/1480 train_time:83460ms step_avg:150.65ms step:565/1480 train_time:83619ms step_avg:150.67ms step:566/1480 train_time:83778ms step_avg:150.68ms step:567/1480 train_time:83938ms step_avg:150.70ms step:568/1480 train_time:84095ms step_avg:150.71ms step:569/1480 train_time:84254ms step_avg:150.72ms step:570/1480 train_time:84412ms step_avg:150.74ms step:571/1480 train_time:84571ms step_avg:150.75ms step:572/1480 train_time:84731ms step_avg:150.77ms 
step:573/1480 train_time:84890ms step_avg:150.78ms step:574/1480 train_time:85051ms step_avg:150.80ms step:575/1480 train_time:85210ms step_avg:150.81ms step:576/1480 train_time:85370ms step_avg:150.83ms step:577/1480 train_time:85529ms step_avg:150.84ms step:578/1480 train_time:85687ms step_avg:150.86ms step:579/1480 train_time:85846ms step_avg:150.87ms step:580/1480 train_time:86005ms step_avg:150.89ms step:581/1480 train_time:86166ms step_avg:150.90ms step:582/1480 train_time:86327ms step_avg:150.92ms step:583/1480 train_time:86486ms step_avg:150.94ms step:584/1480 train_time:86646ms step_avg:150.95ms step:585/1480 train_time:86805ms step_avg:150.97ms step:586/1480 train_time:86966ms step_avg:150.98ms step:587/1480 train_time:87125ms step_avg:151.00ms step:588/1480 train_time:87284ms step_avg:151.01ms step:589/1480 train_time:87446ms step_avg:151.03ms step:590/1480 train_time:87606ms step_avg:151.05ms step:591/1480 train_time:87766ms step_avg:151.06ms step:592/1480 train_time:87925ms step_avg:151.07ms step:593/1480 train_time:88086ms step_avg:151.09ms step:594/1480 train_time:88247ms step_avg:151.11ms step:595/1480 train_time:88408ms step_avg:151.12ms step:596/1480 train_time:88569ms step_avg:151.14ms step:597/1480 train_time:88728ms step_avg:151.15ms step:598/1480 train_time:88885ms step_avg:151.16ms step:599/1480 train_time:89045ms step_avg:151.18ms step:600/1480 train_time:89204ms step_avg:151.19ms step:601/1480 train_time:89365ms step_avg:151.21ms step:602/1480 train_time:89525ms step_avg:151.22ms step:603/1480 train_time:89687ms step_avg:151.24ms step:604/1480 train_time:89847ms step_avg:151.26ms step:605/1480 train_time:90006ms step_avg:151.27ms step:606/1480 train_time:90168ms step_avg:151.29ms step:607/1480 train_time:90328ms step_avg:151.30ms step:608/1480 train_time:90487ms step_avg:151.32ms step:609/1480 train_time:90647ms step_avg:151.33ms step:610/1480 train_time:90806ms step_avg:151.34ms step:611/1480 train_time:90967ms step_avg:151.36ms 
step:612/1480 train_time:91127ms step_avg:151.37ms step:613/1480 train_time:91287ms step_avg:151.39ms step:614/1480 train_time:91447ms step_avg:151.40ms step:615/1480 train_time:91605ms step_avg:151.41ms step:616/1480 train_time:91765ms step_avg:151.43ms step:617/1480 train_time:91925ms step_avg:151.44ms step:618/1480 train_time:92085ms step_avg:151.46ms step:619/1480 train_time:92246ms step_avg:151.47ms step:620/1480 train_time:92405ms step_avg:151.48ms step:621/1480 train_time:92566ms step_avg:151.50ms step:622/1480 train_time:92726ms step_avg:151.51ms step:623/1480 train_time:92887ms step_avg:151.53ms step:624/1480 train_time:93047ms step_avg:151.54ms step:625/1480 train_time:93205ms step_avg:151.55ms step:625/1480 val_loss:3.6044 train_time:93270ms step_avg:151.66ms step:626/1480 train_time:93369ms step_avg:151.57ms step:627/1480 train_time:93530ms step_avg:151.59ms step:628/1480 train_time:93689ms step_avg:151.60ms step:629/1480 train_time:93847ms step_avg:151.61ms step:630/1480 train_time:94006ms step_avg:151.62ms step:631/1480 train_time:94164ms step_avg:151.63ms step:632/1480 train_time:94325ms step_avg:151.65ms step:633/1480 train_time:94485ms step_avg:151.66ms step:634/1480 train_time:94644ms step_avg:151.67ms step:635/1480 train_time:94803ms step_avg:151.69ms step:636/1480 train_time:94963ms step_avg:151.70ms step:637/1480 train_time:95122ms step_avg:151.71ms step:638/1480 train_time:95281ms step_avg:151.72ms step:639/1480 train_time:95438ms step_avg:151.73ms step:640/1480 train_time:95595ms step_avg:151.74ms step:641/1480 train_time:95753ms step_avg:151.75ms step:642/1480 train_time:95912ms step_avg:151.76ms step:643/1480 train_time:96071ms step_avg:151.77ms step:644/1480 train_time:96231ms step_avg:151.78ms step:645/1480 train_time:96390ms step_avg:151.80ms step:646/1480 train_time:96550ms step_avg:151.81ms step:647/1480 train_time:96710ms step_avg:151.82ms step:648/1480 train_time:96871ms step_avg:151.84ms step:649/1480 train_time:97031ms 
step_avg:151.85ms step:650/1480 train_time:97191ms step_avg:151.86ms step:651/1480 train_time:97352ms step_avg:151.88ms step:652/1480 train_time:97512ms step_avg:151.89ms step:653/1480 train_time:97670ms step_avg:151.90ms step:654/1480 train_time:97832ms step_avg:151.91ms step:655/1480 train_time:97992ms step_avg:151.93ms step:656/1480 train_time:98152ms step_avg:151.94ms step:657/1480 train_time:98314ms step_avg:151.95ms step:658/1480 train_time:98473ms step_avg:151.96ms step:659/1480 train_time:98634ms step_avg:151.98ms step:660/1480 train_time:98796ms step_avg:151.99ms step:661/1480 train_time:98957ms step_avg:152.01ms step:662/1480 train_time:99116ms step_avg:152.02ms step:663/1480 train_time:99275ms step_avg:152.03ms step:664/1480 train_time:99437ms step_avg:152.05ms step:665/1480 train_time:99599ms step_avg:152.06ms step:666/1480 train_time:99760ms step_avg:152.07ms step:667/1480 train_time:99921ms step_avg:152.09ms step:668/1480 train_time:100082ms step_avg:152.10ms step:669/1480 train_time:100244ms step_avg:152.12ms step:670/1480 train_time:100405ms step_avg:152.13ms step:671/1480 train_time:100567ms step_avg:152.14ms step:672/1480 train_time:100731ms step_avg:152.16ms step:673/1480 train_time:100893ms step_avg:152.18ms step:674/1480 train_time:101055ms step_avg:152.19ms step:675/1480 train_time:101216ms step_avg:152.20ms step:676/1480 train_time:101377ms step_avg:152.22ms step:677/1480 train_time:101537ms step_avg:152.23ms step:678/1480 train_time:101699ms step_avg:152.24ms step:679/1480 train_time:101861ms step_avg:152.26ms step:680/1480 train_time:102023ms step_avg:152.27ms step:681/1480 train_time:102184ms step_avg:152.29ms step:682/1480 train_time:102347ms step_avg:152.30ms step:683/1480 train_time:102509ms step_avg:152.32ms step:684/1480 train_time:102671ms step_avg:152.33ms step:685/1480 train_time:102836ms step_avg:152.35ms step:686/1480 train_time:102998ms step_avg:152.36ms step:687/1480 train_time:103158ms step_avg:152.37ms step:688/1480 
train_time:103320ms step_avg:152.39ms step:689/1480 train_time:103484ms step_avg:152.41ms step:690/1480 train_time:103648ms step_avg:152.42ms step:691/1480 train_time:103810ms step_avg:152.44ms step:692/1480 train_time:103972ms step_avg:152.45ms step:693/1480 train_time:104134ms step_avg:152.47ms step:694/1480 train_time:104296ms step_avg:152.48ms step:695/1480 train_time:104456ms step_avg:152.49ms step:696/1480 train_time:104618ms step_avg:152.50ms step:697/1480 train_time:104780ms step_avg:152.52ms step:698/1480 train_time:104938ms step_avg:152.53ms step:699/1480 train_time:105100ms step_avg:152.54ms step:700/1480 train_time:105261ms step_avg:152.55ms step:701/1480 train_time:105420ms step_avg:152.56ms step:702/1480 train_time:105582ms step_avg:152.57ms step:703/1480 train_time:105741ms step_avg:152.59ms step:704/1480 train_time:105902ms step_avg:152.60ms step:705/1480 train_time:106067ms step_avg:152.61ms step:706/1480 train_time:106233ms step_avg:152.63ms step:707/1480 train_time:106393ms step_avg:152.64ms step:708/1480 train_time:106553ms step_avg:152.65ms step:709/1480 train_time:106715ms step_avg:152.67ms step:710/1480 train_time:106874ms step_avg:152.68ms step:711/1480 train_time:107036ms step_avg:152.69ms step:712/1480 train_time:107203ms step_avg:152.71ms step:713/1480 train_time:107366ms step_avg:152.73ms step:714/1480 train_time:107528ms step_avg:152.74ms step:715/1480 train_time:107690ms step_avg:152.75ms step:716/1480 train_time:107850ms step_avg:152.76ms step:717/1480 train_time:108014ms step_avg:152.78ms step:718/1480 train_time:108173ms step_avg:152.79ms step:719/1480 train_time:108333ms step_avg:152.80ms step:720/1480 train_time:108496ms step_avg:152.81ms step:721/1480 train_time:108657ms step_avg:152.82ms step:722/1480 train_time:108819ms step_avg:152.84ms step:723/1480 train_time:108978ms step_avg:152.84ms step:724/1480 train_time:109139ms step_avg:152.86ms step:725/1480 train_time:109302ms step_avg:152.87ms step:726/1480 train_time:109465ms 
step_avg:152.88ms step:727/1480 train_time:109631ms step_avg:152.90ms step:728/1480 train_time:109792ms step_avg:152.91ms step:729/1480 train_time:109954ms step_avg:152.93ms step:730/1480 train_time:110117ms step_avg:152.94ms step:731/1480 train_time:110277ms step_avg:152.95ms step:732/1480 train_time:110436ms step_avg:152.96ms step:733/1480 train_time:110597ms step_avg:152.97ms step:734/1480 train_time:110757ms step_avg:152.98ms step:735/1480 train_time:110918ms step_avg:152.99ms step:736/1480 train_time:111079ms step_avg:153.00ms step:737/1480 train_time:111239ms step_avg:153.01ms step:738/1480 train_time:111400ms step_avg:153.02ms step:739/1480 train_time:111561ms step_avg:153.03ms step:740/1480 train_time:111727ms step_avg:153.05ms step:741/1480 train_time:111891ms step_avg:153.07ms step:742/1480 train_time:112053ms step_avg:153.08ms step:743/1480 train_time:112215ms step_avg:153.09ms step:744/1480 train_time:112377ms step_avg:153.10ms step:745/1480 train_time:112540ms step_avg:153.12ms step:746/1480 train_time:112699ms step_avg:153.12ms step:747/1480 train_time:112861ms step_avg:153.14ms step:748/1480 train_time:113027ms step_avg:153.15ms step:749/1480 train_time:113191ms step_avg:153.17ms step:750/1480 train_time:113351ms step_avg:153.18ms step:750/1480 val_loss:3.5495 train_time:113416ms step_avg:153.27ms step:751/1480 train_time:113517ms step_avg:153.19ms step:752/1480 train_time:113682ms step_avg:153.21ms step:753/1480 train_time:113844ms step_avg:153.22ms step:754/1480 train_time:114005ms step_avg:153.23ms step:755/1480 train_time:114167ms step_avg:153.24ms step:756/1480 train_time:114327ms step_avg:153.25ms step:757/1480 train_time:114493ms step_avg:153.27ms step:758/1480 train_time:114653ms step_avg:153.28ms step:759/1480 train_time:114818ms step_avg:153.29ms step:760/1480 train_time:114980ms step_avg:153.31ms step:761/1480 train_time:115144ms step_avg:153.32ms step:762/1480 train_time:115307ms step_avg:153.33ms step:763/1480 train_time:115469ms 
step_avg:153.34ms step:764/1480 train_time:115630ms step_avg:153.35ms step:765/1480 train_time:115791ms step_avg:153.37ms step:766/1480 train_time:115953ms step_avg:153.38ms step:767/1480 train_time:116114ms step_avg:153.39ms step:768/1480 train_time:116276ms step_avg:153.40ms step:769/1480 train_time:116441ms step_avg:153.41ms step:770/1480 train_time:116604ms step_avg:153.43ms step:771/1480 train_time:116769ms step_avg:153.44ms step:772/1480 train_time:116930ms step_avg:153.45ms step:773/1480 train_time:117091ms step_avg:153.46ms step:774/1480 train_time:117253ms step_avg:153.47ms step:775/1480 train_time:117414ms step_avg:153.48ms step:776/1480 train_time:117579ms step_avg:153.50ms step:777/1480 train_time:117747ms step_avg:153.52ms step:778/1480 train_time:117909ms step_avg:153.53ms step:779/1480 train_time:118071ms step_avg:153.54ms step:780/1480 train_time:118233ms step_avg:153.55ms step:781/1480 train_time:118395ms step_avg:153.56ms step:782/1480 train_time:118559ms step_avg:153.57ms step:783/1480 train_time:118721ms step_avg:153.59ms step:784/1480 train_time:118886ms step_avg:153.60ms step:785/1480 train_time:119049ms step_avg:153.61ms step:786/1480 train_time:119213ms step_avg:153.62ms step:787/1480 train_time:119375ms step_avg:153.64ms step:788/1480 train_time:119541ms step_avg:153.65ms step:789/1480 train_time:119702ms step_avg:153.66ms step:790/1480 train_time:119868ms step_avg:153.68ms step:791/1480 train_time:120034ms step_avg:153.69ms step:792/1480 train_time:120198ms step_avg:153.71ms step:793/1480 train_time:120359ms step_avg:153.72ms step:794/1480 train_time:120524ms step_avg:153.73ms step:795/1480 train_time:120690ms step_avg:153.75ms step:796/1480 train_time:120855ms step_avg:153.76ms step:797/1480 train_time:121019ms step_avg:153.77ms step:798/1480 train_time:121184ms step_avg:153.79ms step:799/1480 train_time:121350ms step_avg:153.80ms step:800/1480 train_time:121513ms step_avg:153.81ms step:801/1480 train_time:121675ms step_avg:153.82ms 
step:802/1480 train_time:121843ms step_avg:153.84ms step:803/1480 train_time:122006ms step_avg:153.85ms step:804/1480 train_time:122169ms step_avg:153.86ms step:805/1480 train_time:122334ms step_avg:153.88ms step:806/1480 train_time:122495ms step_avg:153.89ms step:807/1480 train_time:122658ms step_avg:153.90ms step:808/1480 train_time:122823ms step_avg:153.91ms step:809/1480 train_time:122987ms step_avg:153.93ms step:810/1480 train_time:123149ms step_avg:153.94ms step:811/1480 train_time:123311ms step_avg:153.95ms step:812/1480 train_time:123474ms step_avg:153.96ms step:813/1480 train_time:123634ms step_avg:153.96ms step:814/1480 train_time:123795ms step_avg:153.97ms step:815/1480 train_time:123958ms step_avg:153.98ms step:816/1480 train_time:124124ms step_avg:154.00ms step:817/1480 train_time:124288ms step_avg:154.01ms step:818/1480 train_time:124449ms step_avg:154.02ms step:819/1480 train_time:124613ms step_avg:154.03ms step:820/1480 train_time:124775ms step_avg:154.04ms step:821/1480 train_time:124937ms step_avg:154.05ms step:822/1480 train_time:125102ms step_avg:154.07ms step:823/1480 train_time:125265ms step_avg:154.08ms step:824/1480 train_time:125428ms step_avg:154.09ms step:825/1480 train_time:125595ms step_avg:154.10ms step:826/1480 train_time:125763ms step_avg:154.12ms step:827/1480 train_time:125928ms step_avg:154.13ms step:828/1480 train_time:126090ms step_avg:154.14ms step:829/1480 train_time:126253ms step_avg:154.16ms step:830/1480 train_time:126418ms step_avg:154.17ms step:831/1480 train_time:126582ms step_avg:154.18ms step:832/1480 train_time:126747ms step_avg:154.19ms step:833/1480 train_time:126912ms step_avg:154.21ms step:834/1480 train_time:127077ms step_avg:154.22ms step:835/1480 train_time:127240ms step_avg:154.23ms step:836/1480 train_time:127406ms step_avg:154.24ms step:837/1480 train_time:127567ms step_avg:154.25ms step:838/1480 train_time:127730ms step_avg:154.26ms step:839/1480 train_time:127891ms step_avg:154.27ms step:840/1480 
train_time:128051ms step_avg:154.28ms step:841/1480 train_time:128213ms step_avg:154.29ms step:842/1480 train_time:128375ms step_avg:154.30ms step:843/1480 train_time:128538ms step_avg:154.31ms step:844/1480 train_time:128700ms step_avg:154.32ms step:845/1480 train_time:128865ms step_avg:154.33ms step:846/1480 train_time:129028ms step_avg:154.34ms step:847/1480 train_time:129191ms step_avg:154.35ms step:848/1480 train_time:129352ms step_avg:154.36ms step:849/1480 train_time:129515ms step_avg:154.37ms step:850/1480 train_time:129677ms step_avg:154.38ms step:851/1480 train_time:129843ms step_avg:154.39ms step:852/1480 train_time:130006ms step_avg:154.40ms step:853/1480 train_time:130168ms step_avg:154.41ms step:854/1480 train_time:130331ms step_avg:154.42ms step:855/1480 train_time:130493ms step_avg:154.43ms step:856/1480 train_time:130655ms step_avg:154.44ms step:857/1480 train_time:130819ms step_avg:154.45ms step:858/1480 train_time:130987ms step_avg:154.47ms step:859/1480 train_time:131151ms step_avg:154.48ms step:860/1480 train_time:131312ms step_avg:154.49ms step:861/1480 train_time:131480ms step_avg:154.50ms step:862/1480 train_time:131650ms step_avg:154.52ms step:863/1480 train_time:131816ms step_avg:154.53ms step:864/1480 train_time:131982ms step_avg:154.55ms step:865/1480 train_time:132144ms step_avg:154.55ms step:866/1480 train_time:132311ms step_avg:154.57ms step:867/1480 train_time:132473ms step_avg:154.58ms step:868/1480 train_time:132632ms step_avg:154.58ms step:869/1480 train_time:132794ms step_avg:154.59ms step:870/1480 train_time:132959ms step_avg:154.60ms step:871/1480 train_time:133122ms step_avg:154.61ms step:872/1480 train_time:133287ms step_avg:154.63ms step:873/1480 train_time:133451ms step_avg:154.64ms step:874/1480 train_time:133616ms step_avg:154.65ms step:875/1480 train_time:133781ms step_avg:154.66ms step:875/1480 val_loss:3.5044 train_time:133846ms step_avg:154.74ms step:876/1480 train_time:133947ms step_avg:154.67ms step:877/1480 
train_time:134113ms step_avg:154.69ms step:878/1480 train_time:134275ms step_avg:154.69ms step:879/1480 train_time:134440ms step_avg:154.71ms step:880/1480 train_time:134602ms step_avg:154.71ms step:881/1480 train_time:134764ms step_avg:154.72ms step:882/1480 train_time:134929ms step_avg:154.74ms step:883/1480 train_time:135097ms step_avg:154.75ms step:884/1480 train_time:135262ms step_avg:154.76ms step:885/1480 train_time:135428ms step_avg:154.77ms step:886/1480 train_time:135594ms step_avg:154.79ms step:887/1480 train_time:135761ms step_avg:154.80ms step:888/1480 train_time:135936ms step_avg:154.82ms step:889/1480 train_time:136104ms step_avg:154.84ms step:890/1480 train_time:136266ms step_avg:154.85ms step:891/1480 train_time:136433ms step_avg:154.86ms step:892/1480 train_time:136599ms step_avg:154.87ms step:893/1480 train_time:136760ms step_avg:154.88ms step:894/1480 train_time:136929ms step_avg:154.90ms step:895/1480 train_time:137095ms step_avg:154.91ms step:896/1480 train_time:137260ms step_avg:154.92ms step:897/1480 train_time:137426ms step_avg:154.93ms step:898/1480 train_time:137594ms step_avg:154.95ms step:899/1480 train_time:137758ms step_avg:154.96ms step:900/1480 train_time:137921ms step_avg:154.97ms step:901/1480 train_time:138085ms step_avg:154.98ms step:902/1480 train_time:138250ms step_avg:154.99ms step:903/1480 train_time:138422ms step_avg:155.01ms step:904/1480 train_time:138588ms step_avg:155.02ms step:905/1480 train_time:138752ms step_avg:155.03ms step:906/1480 train_time:138919ms step_avg:155.04ms step:907/1480 train_time:139088ms step_avg:155.06ms step:908/1480 train_time:139251ms step_avg:155.07ms step:909/1480 train_time:139417ms step_avg:155.08ms step:910/1480 train_time:139587ms step_avg:155.10ms step:911/1480 train_time:139752ms step_avg:155.11ms step:912/1480 train_time:139918ms step_avg:155.12ms step:913/1480 train_time:140085ms step_avg:155.13ms step:914/1480 train_time:140252ms step_avg:155.15ms step:915/1480 train_time:140421ms 
step_avg:155.16ms step:916/1480 train_time:140587ms step_avg:155.17ms step:917/1480 train_time:140749ms step_avg:155.18ms step:918/1480 train_time:140918ms step_avg:155.20ms step:919/1480 train_time:141088ms step_avg:155.21ms step:920/1480 train_time:141254ms step_avg:155.22ms step:921/1480 train_time:141419ms step_avg:155.24ms step:922/1480 train_time:141587ms step_avg:155.25ms step:923/1480 train_time:141751ms step_avg:155.26ms step:924/1480 train_time:141917ms step_avg:155.27ms step:925/1480 train_time:142082ms step_avg:155.28ms step:926/1480 train_time:142244ms step_avg:155.29ms step:927/1480 train_time:142409ms step_avg:155.30ms step:928/1480 train_time:142575ms step_avg:155.31ms step:929/1480 train_time:142739ms step_avg:155.32ms step:930/1480 train_time:142904ms step_avg:155.33ms step:931/1480 train_time:143067ms step_avg:155.34ms step:932/1480 train_time:143234ms step_avg:155.35ms step:933/1480 train_time:143402ms step_avg:155.37ms step:934/1480 train_time:143571ms step_avg:155.38ms step:935/1480 train_time:143742ms step_avg:155.40ms step:936/1480 train_time:143910ms step_avg:155.41ms step:937/1480 train_time:144079ms step_avg:155.42ms step:938/1480 train_time:144240ms step_avg:155.43ms step:939/1480 train_time:144410ms step_avg:155.45ms step:940/1480 train_time:144576ms step_avg:155.46ms step:941/1480 train_time:144740ms step_avg:155.47ms step:942/1480 train_time:144904ms step_avg:155.48ms step:943/1480 train_time:145074ms step_avg:155.49ms step:944/1480 train_time:145247ms step_avg:155.51ms step:945/1480 train_time:145412ms step_avg:155.52ms step:946/1480 train_time:145580ms step_avg:155.53ms step:947/1480 train_time:145747ms step_avg:155.55ms step:948/1480 train_time:145913ms step_avg:155.56ms step:949/1480 train_time:146077ms step_avg:155.57ms step:950/1480 train_time:146239ms step_avg:155.57ms step:951/1480 train_time:146408ms step_avg:155.59ms step:952/1480 train_time:146574ms step_avg:155.60ms step:953/1480 train_time:146743ms step_avg:155.61ms 
step:954/1480 train_time:146913ms step_avg:155.63ms step:955/1480 train_time:147076ms step_avg:155.64ms step:956/1480 train_time:147241ms step_avg:155.65ms step:957/1480 train_time:147410ms step_avg:155.66ms step:958/1480 train_time:147580ms step_avg:155.68ms step:959/1480 train_time:147744ms step_avg:155.68ms step:960/1480 train_time:147911ms step_avg:155.70ms step:961/1480 train_time:148077ms step_avg:155.71ms step:962/1480 train_time:148241ms step_avg:155.72ms step:963/1480 train_time:148408ms step_avg:155.73ms step:964/1480 train_time:148576ms step_avg:155.74ms step:965/1480 train_time:148740ms step_avg:155.75ms step:966/1480 train_time:148905ms step_avg:155.76ms step:967/1480 train_time:149070ms step_avg:155.77ms step:968/1480 train_time:149235ms step_avg:155.78ms step:969/1480 train_time:149400ms step_avg:155.79ms step:970/1480 train_time:149563ms step_avg:155.79ms step:971/1480 train_time:149728ms step_avg:155.80ms step:972/1480 train_time:149894ms step_avg:155.81ms step:973/1480 train_time:150057ms step_avg:155.82ms step:974/1480 train_time:150225ms step_avg:155.84ms step:975/1480 train_time:150393ms step_avg:155.85ms step:976/1480 train_time:150558ms step_avg:155.86ms step:977/1480 train_time:150721ms step_avg:155.86ms step:978/1480 train_time:150887ms step_avg:155.87ms step:979/1480 train_time:151053ms step_avg:155.89ms step:980/1480 train_time:151218ms step_avg:155.90ms step:981/1480 train_time:151387ms step_avg:155.91ms step:982/1480 train_time:151551ms step_avg:155.92ms step:983/1480 train_time:151717ms step_avg:155.93ms step:984/1480 train_time:151880ms step_avg:155.93ms step:985/1480 train_time:152048ms step_avg:155.95ms step:986/1480 train_time:152214ms step_avg:155.96ms step:987/1480 train_time:152378ms step_avg:155.97ms step:988/1480 train_time:152544ms step_avg:155.98ms step:989/1480 train_time:152712ms step_avg:155.99ms step:990/1480 train_time:152879ms step_avg:156.00ms step:991/1480 train_time:153045ms step_avg:156.01ms step:992/1480 
train_time:153220ms step_avg:156.03ms step:993/1480 train_time:153397ms step_avg:156.05ms step:994/1480 train_time:153561ms step_avg:156.06ms step:995/1480 train_time:153726ms step_avg:156.07ms step:996/1480 train_time:153890ms step_avg:156.08ms step:997/1480 train_time:154055ms step_avg:156.08ms step:998/1480 train_time:154218ms step_avg:156.09ms step:999/1480 train_time:154382ms step_avg:156.10ms step:1000/1480 train_time:154552ms step_avg:156.11ms step:1000/1480 val_loss:3.4396 train_time:154620ms step_avg:156.18ms step:1001/1480 train_time:154722ms step_avg:156.13ms step:1002/1480 train_time:154888ms step_avg:156.14ms step:1003/1480 train_time:155062ms step_avg:156.16ms step:1004/1480 train_time:155231ms step_avg:156.17ms step:1005/1480 train_time:155398ms step_avg:156.18ms step:1006/1480 train_time:155567ms step_avg:156.19ms step:1007/1480 train_time:155733ms step_avg:156.20ms step:1008/1480 train_time:155900ms step_avg:156.21ms step:1009/1480 train_time:156073ms step_avg:156.23ms step:1010/1480 train_time:156238ms step_avg:156.24ms step:1011/1480 train_time:156404ms step_avg:156.25ms step:1012/1480 train_time:156570ms step_avg:156.26ms step:1013/1480 train_time:156739ms step_avg:156.27ms step:1014/1480 train_time:156906ms step_avg:156.28ms step:1015/1480 train_time:157074ms step_avg:156.29ms step:1016/1480 train_time:157241ms step_avg:156.30ms step:1017/1480 train_time:157412ms step_avg:156.32ms step:1018/1480 train_time:157580ms step_avg:156.33ms step:1019/1480 train_time:157749ms step_avg:156.34ms step:1020/1480 train_time:157918ms step_avg:156.35ms step:1021/1480 train_time:158083ms step_avg:156.36ms step:1022/1480 train_time:158251ms step_avg:156.37ms step:1023/1480 train_time:158419ms step_avg:156.39ms step:1024/1480 train_time:158587ms step_avg:156.40ms step:1025/1480 train_time:158758ms step_avg:156.41ms step:1026/1480 train_time:158925ms step_avg:156.42ms step:1027/1480 train_time:159091ms step_avg:156.43ms step:1028/1480 train_time:159266ms 
step_avg:156.45ms step:1029/1480 train_time:159439ms step_avg:156.47ms step:1030/1480 train_time:159607ms step_avg:156.48ms step:1031/1480 train_time:159771ms step_avg:156.48ms step:1032/1480 train_time:159944ms step_avg:156.50ms step:1033/1480 train_time:160110ms step_avg:156.51ms step:1034/1480 train_time:160278ms step_avg:156.52ms step:1035/1480 train_time:160446ms step_avg:156.53ms step:1036/1480 train_time:160611ms step_avg:156.54ms step:1037/1480 train_time:160779ms step_avg:156.55ms step:1038/1480 train_time:160948ms step_avg:156.56ms step:1039/1480 train_time:161117ms step_avg:156.58ms step:1040/1480 train_time:161285ms step_avg:156.59ms step:1041/1480 train_time:161452ms step_avg:156.60ms step:1042/1480 train_time:161616ms step_avg:156.60ms step:1043/1480 train_time:161782ms step_avg:156.61ms step:1044/1480 train_time:161947ms step_avg:156.62ms step:1045/1480 train_time:162118ms step_avg:156.64ms step:1046/1480 train_time:162288ms step_avg:156.65ms step:1047/1480 train_time:162454ms step_avg:156.66ms step:1048/1480 train_time:162620ms step_avg:156.67ms step:1049/1480 train_time:162786ms step_avg:156.68ms step:1050/1480 train_time:162955ms step_avg:156.69ms step:1051/1480 train_time:163124ms step_avg:156.70ms step:1052/1480 train_time:163291ms step_avg:156.71ms step:1053/1480 train_time:163457ms step_avg:156.72ms step:1054/1480 train_time:163626ms step_avg:156.73ms step:1055/1480 train_time:163792ms step_avg:156.74ms step:1056/1480 train_time:163956ms step_avg:156.75ms step:1057/1480 train_time:164126ms step_avg:156.76ms step:1058/1480 train_time:164293ms step_avg:156.77ms step:1059/1480 train_time:164468ms step_avg:156.79ms step:1060/1480 train_time:164636ms step_avg:156.80ms step:1061/1480 train_time:164800ms step_avg:156.80ms step:1062/1480 train_time:164967ms step_avg:156.81ms step:1063/1480 train_time:165131ms step_avg:156.82ms step:1064/1480 train_time:165294ms step_avg:156.83ms step:1065/1480 train_time:165463ms step_avg:156.84ms step:1066/1480 
train_time:165630ms step_avg:156.85ms step:1067/1480 train_time:165801ms step_avg:156.86ms step:1068/1480 train_time:165968ms step_avg:156.87ms step:1069/1480 train_time:166140ms step_avg:156.88ms step:1070/1480 train_time:166307ms step_avg:156.89ms step:1071/1480 train_time:166481ms step_avg:156.91ms step:1072/1480 train_time:166648ms step_avg:156.92ms step:1073/1480 train_time:166811ms step_avg:156.92ms step:1074/1480 train_time:166977ms step_avg:156.93ms step:1075/1480 train_time:167148ms step_avg:156.95ms step:1076/1480 train_time:167314ms step_avg:156.95ms step:1077/1480 train_time:167479ms step_avg:156.96ms step:1078/1480 train_time:167653ms step_avg:156.98ms step:1079/1480 train_time:167826ms step_avg:156.99ms step:1080/1480 train_time:167996ms step_avg:157.01ms step:1081/1480 train_time:168163ms step_avg:157.02ms step:1082/1480 train_time:168330ms step_avg:157.02ms step:1083/1480 train_time:168496ms step_avg:157.03ms step:1084/1480 train_time:168665ms step_avg:157.04ms step:1085/1480 train_time:168833ms step_avg:157.05ms step:1086/1480 train_time:169000ms step_avg:157.06ms step:1087/1480 train_time:169167ms step_avg:157.07ms step:1088/1480 train_time:169336ms step_avg:157.08ms step:1089/1480 train_time:169509ms step_avg:157.10ms step:1090/1480 train_time:169681ms step_avg:157.11ms step:1091/1480 train_time:169849ms step_avg:157.12ms step:1092/1480 train_time:170016ms step_avg:157.13ms step:1093/1480 train_time:170184ms step_avg:157.14ms step:1094/1480 train_time:170351ms step_avg:157.15ms step:1095/1480 train_time:170515ms step_avg:157.16ms step:1096/1480 train_time:170685ms step_avg:157.17ms step:1097/1480 train_time:170852ms step_avg:157.18ms step:1098/1480 train_time:171024ms step_avg:157.19ms step:1099/1480 train_time:171196ms step_avg:157.20ms step:1100/1480 train_time:171368ms step_avg:157.22ms step:1101/1480 train_time:171537ms step_avg:157.23ms step:1102/1480 train_time:171708ms step_avg:157.24ms step:1103/1480 train_time:171885ms step_avg:157.26ms 
step:1104/1480 train_time:172053ms step_avg:157.27ms step:1105/1480 train_time:172223ms step_avg:157.28ms step:1106/1480 train_time:172392ms step_avg:157.29ms step:1107/1480 train_time:172563ms step_avg:157.30ms step:1108/1480 train_time:172728ms step_avg:157.31ms step:1109/1480 train_time:172894ms step_avg:157.32ms step:1110/1480 train_time:173061ms step_avg:157.33ms step:1111/1480 train_time:173227ms step_avg:157.34ms step:1112/1480 train_time:173397ms step_avg:157.35ms step:1113/1480 train_time:173579ms step_avg:157.37ms step:1114/1480 train_time:173751ms step_avg:157.38ms step:1115/1480 train_time:173923ms step_avg:157.40ms step:1116/1480 train_time:174090ms step_avg:157.40ms step:1117/1480 train_time:174263ms step_avg:157.42ms step:1118/1480 train_time:174438ms step_avg:157.43ms step:1119/1480 train_time:174604ms step_avg:157.44ms step:1120/1480 train_time:174772ms step_avg:157.45ms step:1121/1480 train_time:174941ms step_avg:157.46ms step:1122/1480 train_time:175107ms step_avg:157.47ms step:1123/1480 train_time:175273ms step_avg:157.48ms step:1124/1480 train_time:175443ms step_avg:157.49ms step:1125/1480 train_time:175611ms step_avg:157.50ms step:1125/1480 val_loss:3.3847 train_time:175679ms step_avg:157.56ms step:1126/1480 train_time:175781ms step_avg:157.51ms step:1127/1480 train_time:175951ms step_avg:157.52ms step:1128/1480 train_time:176122ms step_avg:157.53ms step:1129/1480 train_time:176296ms step_avg:157.55ms step:1130/1480 train_time:176466ms step_avg:157.56ms step:1131/1480 train_time:176645ms step_avg:157.58ms step:1132/1480 train_time:176811ms step_avg:157.59ms step:1133/1480 train_time:176982ms step_avg:157.60ms step:1134/1480 train_time:177153ms step_avg:157.61ms step:1135/1480 train_time:177322ms step_avg:157.62ms step:1136/1480 train_time:177493ms step_avg:157.63ms step:1137/1480 train_time:177662ms step_avg:157.64ms step:1138/1480 train_time:177833ms step_avg:157.65ms step:1139/1480 train_time:178000ms step_avg:157.66ms step:1140/1480 
train_time:178168ms step_avg:157.67ms step:1141/1480 train_time:178339ms step_avg:157.68ms step:1142/1480 train_time:178506ms step_avg:157.69ms step:1143/1480 train_time:178678ms step_avg:157.70ms step:1144/1480 train_time:178846ms step_avg:157.71ms step:1145/1480 train_time:179013ms step_avg:157.72ms step:1146/1480 train_time:179184ms step_avg:157.73ms step:1147/1480 train_time:179354ms step_avg:157.74ms step:1148/1480 train_time:179522ms step_avg:157.75ms step:1149/1480 train_time:179694ms step_avg:157.76ms step:1150/1480 train_time:179861ms step_avg:157.77ms step:1151/1480 train_time:180032ms step_avg:157.78ms step:1152/1480 train_time:180204ms step_avg:157.80ms step:1153/1480 train_time:180377ms step_avg:157.81ms step:1154/1480 train_time:180543ms step_avg:157.82ms step:1155/1480 train_time:180716ms step_avg:157.83ms step:1156/1480 train_time:180895ms step_avg:157.85ms step:1157/1480 train_time:181063ms step_avg:157.86ms step:1158/1480 train_time:181231ms step_avg:157.87ms step:1159/1480 train_time:181398ms step_avg:157.87ms step:1160/1480 train_time:181564ms step_avg:157.88ms step:1161/1480 train_time:181734ms step_avg:157.89ms step:1162/1480 train_time:181903ms step_avg:157.90ms step:1163/1480 train_time:182075ms step_avg:157.91ms step:1164/1480 train_time:182245ms step_avg:157.92ms step:1165/1480 train_time:182411ms step_avg:157.93ms step:1166/1480 train_time:182579ms step_avg:157.94ms step:1167/1480 train_time:182749ms step_avg:157.95ms step:1168/1480 train_time:182915ms step_avg:157.96ms step:1169/1480 train_time:183086ms step_avg:157.97ms step:1170/1480 train_time:183255ms step_avg:157.98ms step:1171/1480 train_time:183422ms step_avg:157.99ms step:1172/1480 train_time:183589ms step_avg:157.99ms step:1173/1480 train_time:183760ms step_avg:158.01ms step:1174/1480 train_time:183942ms step_avg:158.03ms step:1175/1480 train_time:184114ms step_avg:158.04ms step:1176/1480 train_time:184289ms step_avg:158.05ms step:1177/1480 train_time:184465ms step_avg:158.07ms 
step:1178/1480 train_time:184632ms step_avg:158.08ms step:1179/1480 train_time:184798ms step_avg:158.08ms step:1180/1480 train_time:184977ms step_avg:158.10ms step:1181/1480 train_time:185147ms step_avg:158.11ms step:1182/1480 train_time:185315ms step_avg:158.12ms step:1183/1480 train_time:185486ms step_avg:158.13ms step:1184/1480 train_time:185653ms step_avg:158.14ms step:1185/1480 train_time:185825ms step_avg:158.15ms step:1186/1480 train_time:185996ms step_avg:158.16ms step:1187/1480 train_time:186179ms step_avg:158.18ms step:1188/1480 train_time:186345ms step_avg:158.19ms step:1189/1480 train_time:186515ms step_avg:158.20ms step:1190/1480 train_time:186684ms step_avg:158.21ms step:1191/1480 train_time:186856ms step_avg:158.22ms step:1192/1480 train_time:187022ms step_avg:158.22ms step:1193/1480 train_time:187190ms step_avg:158.23ms step:1194/1480 train_time:187358ms step_avg:158.24ms step:1195/1480 train_time:187532ms step_avg:158.25ms step:1196/1480 train_time:187714ms step_avg:158.27ms step:1197/1480 train_time:187887ms step_avg:158.29ms step:1198/1480 train_time:188068ms step_avg:158.31ms step:1199/1480 train_time:188238ms step_avg:158.32ms step:1200/1480 train_time:188407ms step_avg:158.32ms step:1201/1480 train_time:188575ms step_avg:158.33ms step:1202/1480 train_time:188757ms step_avg:158.35ms step:1203/1480 train_time:188933ms step_avg:158.37ms step:1204/1480 train_time:189107ms step_avg:158.38ms step:1205/1480 train_time:189274ms step_avg:158.39ms step:1206/1480 train_time:189441ms step_avg:158.40ms step:1207/1480 train_time:189612ms step_avg:158.41ms step:1208/1480 train_time:189779ms step_avg:158.41ms step:1209/1480 train_time:189952ms step_avg:158.43ms step:1210/1480 train_time:190128ms step_avg:158.44ms step:1211/1480 train_time:190300ms step_avg:158.45ms step:1212/1480 train_time:190473ms step_avg:158.46ms step:1213/1480 train_time:190647ms step_avg:158.48ms step:1214/1480 train_time:190823ms step_avg:158.49ms step:1215/1480 train_time:190996ms 
step_avg:158.50ms step:1216/1480 train_time:191164ms step_avg:158.51ms step:1217/1480 train_time:191337ms step_avg:158.52ms step:1218/1480 train_time:191506ms step_avg:158.53ms step:1219/1480 train_time:191683ms step_avg:158.55ms step:1220/1480 train_time:191853ms step_avg:158.56ms step:1221/1480 train_time:192022ms step_avg:158.57ms step:1222/1480 train_time:192192ms step_avg:158.57ms step:1223/1480 train_time:192361ms step_avg:158.58ms step:1224/1480 train_time:192538ms step_avg:158.60ms step:1225/1480 train_time:192709ms step_avg:158.61ms step:1226/1480 train_time:192881ms step_avg:158.62ms step:1227/1480 train_time:193054ms step_avg:158.63ms step:1228/1480 train_time:193223ms step_avg:158.64ms step:1229/1480 train_time:193397ms step_avg:158.65ms step:1230/1480 train_time:193577ms step_avg:158.67ms step:1231/1480 train_time:193753ms step_avg:158.68ms step:1232/1480 train_time:193928ms step_avg:158.70ms step:1233/1480 train_time:194098ms step_avg:158.71ms step:1234/1480 train_time:194268ms step_avg:158.72ms step:1235/1480 train_time:194442ms step_avg:158.73ms step:1236/1480 train_time:194611ms step_avg:158.74ms step:1237/1480 train_time:194782ms step_avg:158.75ms step:1238/1480 train_time:194967ms step_avg:158.77ms step:1239/1480 train_time:195136ms step_avg:158.78ms step:1240/1480 train_time:195307ms step_avg:158.79ms step:1241/1480 train_time:195480ms step_avg:158.80ms step:1242/1480 train_time:195649ms step_avg:158.81ms step:1243/1480 train_time:195822ms step_avg:158.82ms step:1244/1480 train_time:195989ms step_avg:158.82ms step:1245/1480 train_time:196157ms step_avg:158.83ms step:1246/1480 train_time:196327ms step_avg:158.84ms step:1247/1480 train_time:196496ms step_avg:158.85ms step:1248/1480 train_time:196664ms step_avg:158.86ms step:1249/1480 train_time:196833ms step_avg:158.86ms step:1250/1480 train_time:197002ms step_avg:158.87ms step:1250/1480 val_loss:3.3356 train_time:197074ms step_avg:158.93ms step:1251/1480 train_time:197182ms step_avg:158.89ms 
step:1252/1480 train_time:197351ms step_avg:158.90ms step:1253/1480 train_time:197520ms step_avg:158.91ms step:1254/1480 train_time:197692ms step_avg:158.92ms step:1255/1480 train_time:197878ms step_avg:158.94ms step:1256/1480 train_time:198051ms step_avg:158.95ms step:1257/1480 train_time:198222ms step_avg:158.96ms step:1258/1480 train_time:198396ms step_avg:158.97ms step:1259/1480 train_time:198568ms step_avg:158.98ms step:1260/1480 train_time:198734ms step_avg:158.99ms step:1261/1480 train_time:198907ms step_avg:159.00ms step:1262/1480 train_time:199083ms step_avg:159.01ms step:1263/1480 train_time:199258ms step_avg:159.02ms step:1264/1480 train_time:199424ms step_avg:159.03ms step:1265/1480 train_time:199592ms step_avg:159.04ms step:1266/1480 train_time:199764ms step_avg:159.05ms step:1267/1480 train_time:199933ms step_avg:159.06ms step:1268/1480 train_time:200106ms step_avg:159.07ms step:1269/1480 train_time:200283ms step_avg:159.08ms step:1270/1480 train_time:200452ms step_avg:159.09ms step:1271/1480 train_time:200622ms step_avg:159.10ms step:1272/1480 train_time:200788ms step_avg:159.10ms step:1273/1480 train_time:200959ms step_avg:159.11ms step:1274/1480 train_time:201132ms step_avg:159.12ms step:1275/1480 train_time:201302ms step_avg:159.13ms step:1276/1480 train_time:201468ms step_avg:159.14ms step:1277/1480 train_time:201640ms step_avg:159.15ms step:1278/1480 train_time:201808ms step_avg:159.15ms step:1279/1480 train_time:201978ms step_avg:159.16ms step:1280/1480 train_time:202157ms step_avg:159.18ms step:1281/1480 train_time:202326ms step_avg:159.19ms step:1282/1480 train_time:202492ms step_avg:159.19ms step:1283/1480 train_time:202662ms step_avg:159.20ms step:1284/1480 train_time:202832ms step_avg:159.21ms step:1285/1480 train_time:203002ms step_avg:159.22ms step:1286/1480 train_time:203172ms step_avg:159.23ms step:1287/1480 train_time:203344ms step_avg:159.24ms step:1288/1480 train_time:203515ms step_avg:159.25ms step:1289/1480 train_time:203698ms 
step_avg:159.26ms step:1290/1480 train_time:203876ms step_avg:159.28ms step:1291/1480 train_time:204050ms step_avg:159.29ms step:1292/1480 train_time:204225ms step_avg:159.30ms step:1293/1480 train_time:204400ms step_avg:159.31ms step:1294/1480 train_time:204571ms step_avg:159.32ms step:1295/1480 train_time:204742ms step_avg:159.33ms step:1296/1480 train_time:204916ms step_avg:159.34ms step:1297/1480 train_time:205088ms step_avg:159.35ms step:1298/1480 train_time:205258ms step_avg:159.36ms step:1299/1480 train_time:205429ms step_avg:159.37ms step:1300/1480 train_time:205596ms step_avg:159.38ms step:1301/1480 train_time:205766ms step_avg:159.38ms step:1302/1480 train_time:205940ms step_avg:159.40ms step:1303/1480 train_time:206116ms step_avg:159.41ms step:1304/1480 train_time:206290ms step_avg:159.42ms step:1305/1480 train_time:206459ms step_avg:159.43ms step:1306/1480 train_time:206634ms step_avg:159.44ms step:1307/1480 train_time:206801ms step_avg:159.45ms step:1308/1480 train_time:206970ms step_avg:159.45ms step:1309/1480 train_time:207144ms step_avg:159.46ms step:1310/1480 train_time:207312ms step_avg:159.47ms step:1311/1480 train_time:207482ms step_avg:159.48ms step:1312/1480 train_time:207655ms step_avg:159.49ms step:1313/1480 train_time:207825ms step_avg:159.50ms step:1314/1480 train_time:207998ms step_avg:159.51ms step:1315/1480 train_time:208169ms step_avg:159.52ms step:1316/1480 train_time:208335ms step_avg:159.52ms step:1317/1480 train_time:208507ms step_avg:159.53ms step:1318/1480 train_time:208688ms step_avg:159.55ms step:1319/1480 train_time:208864ms step_avg:159.56ms step:1320/1480 train_time:209042ms step_avg:159.57ms step:1321/1480 train_time:209215ms step_avg:159.58ms step:1322/1480 train_time:209396ms step_avg:159.60ms step:1323/1480 train_time:209570ms step_avg:159.61ms step:1324/1480 train_time:209746ms step_avg:159.62ms step:1325/1480 train_time:209927ms step_avg:159.64ms step:1326/1480 train_time:210104ms step_avg:159.65ms step:1327/1480 
train_time:210274ms step_avg:159.66ms step:1328/1480 train_time:210445ms step_avg:159.67ms step:1329/1480 train_time:210640ms step_avg:159.70ms step:1330/1480 train_time:210818ms step_avg:159.71ms step:1331/1480 train_time:210988ms step_avg:159.72ms step:1332/1480 train_time:211163ms step_avg:159.73ms step:1333/1480 train_time:211341ms step_avg:159.74ms step:1334/1480 train_time:211511ms step_avg:159.75ms step:1335/1480 train_time:211679ms step_avg:159.76ms step:1336/1480 train_time:211863ms step_avg:159.78ms step:1337/1480 train_time:212037ms step_avg:159.79ms step:1338/1480 train_time:212210ms step_avg:159.80ms step:1339/1480 train_time:212384ms step_avg:159.81ms step:1340/1480 train_time:212557ms step_avg:159.82ms step:1341/1480 train_time:212726ms step_avg:159.82ms step:1342/1480 train_time:212899ms step_avg:159.83ms step:1343/1480 train_time:213069ms step_avg:159.84ms step:1344/1480 train_time:213241ms step_avg:159.85ms step:1345/1480 train_time:213420ms step_avg:159.87ms step:1346/1480 train_time:213591ms step_avg:159.87ms step:1347/1480 train_time:213760ms step_avg:159.88ms step:1348/1480 train_time:213930ms step_avg:159.89ms step:1349/1480 train_time:214101ms step_avg:159.90ms step:1350/1480 train_time:214276ms step_avg:159.91ms step:1351/1480 train_time:214446ms step_avg:159.92ms step:1352/1480 train_time:214617ms step_avg:159.92ms step:1353/1480 train_time:214793ms step_avg:159.93ms step:1354/1480 train_time:214964ms step_avg:159.94ms step:1355/1480 train_time:215133ms step_avg:159.95ms step:1356/1480 train_time:215307ms step_avg:159.96ms step:1357/1480 train_time:215479ms step_avg:159.97ms step:1358/1480 train_time:215651ms step_avg:159.98ms step:1359/1480 train_time:215824ms step_avg:159.99ms step:1360/1480 train_time:216000ms step_avg:160.00ms step:1361/1480 train_time:216177ms step_avg:160.01ms step:1362/1480 train_time:216352ms step_avg:160.02ms step:1363/1480 train_time:216532ms step_avg:160.04ms step:1364/1480 train_time:216701ms step_avg:160.04ms 
step:1365/1480 train_time:216868ms step_avg:160.05ms step:1366/1480 train_time:217040ms step_avg:160.06ms step:1367/1480 train_time:217211ms step_avg:160.07ms step:1368/1480 train_time:217386ms step_avg:160.08ms step:1369/1480 train_time:217568ms step_avg:160.09ms step:1370/1480 train_time:217746ms step_avg:160.11ms step:1371/1480 train_time:217916ms step_avg:160.11ms step:1372/1480 train_time:218093ms step_avg:160.13ms step:1373/1480 train_time:218262ms step_avg:160.13ms step:1374/1480 train_time:218437ms step_avg:160.14ms step:1375/1480 train_time:218608ms step_avg:160.15ms step:1375/1480 val_loss:3.2968 train_time:218676ms step_avg:160.20ms step:1376/1480 train_time:218782ms step_avg:160.16ms step:1377/1480 train_time:218956ms step_avg:160.17ms step:1378/1480 train_time:219124ms step_avg:160.18ms step:1379/1480 train_time:219299ms step_avg:160.19ms step:1380/1480 train_time:219474ms step_avg:160.20ms step:1381/1480 train_time:219654ms step_avg:160.21ms step:1382/1480 train_time:219824ms step_avg:160.22ms step:1383/1480 train_time:219996ms step_avg:160.23ms step:1384/1480 train_time:220174ms step_avg:160.24ms step:1385/1480 train_time:220341ms step_avg:160.25ms step:1386/1480 train_time:220511ms step_avg:160.26ms step:1387/1480 train_time:220682ms step_avg:160.26ms step:1388/1480 train_time:220851ms step_avg:160.27ms step:1389/1480 train_time:221023ms step_avg:160.28ms step:1390/1480 train_time:221190ms step_avg:160.28ms step:1391/1480 train_time:221359ms step_avg:160.29ms step:1392/1480 train_time:221531ms step_avg:160.30ms step:1393/1480 train_time:221703ms step_avg:160.31ms step:1394/1480 train_time:221875ms step_avg:160.31ms step:1395/1480 train_time:222044ms step_avg:160.32ms step:1396/1480 train_time:222213ms step_avg:160.33ms step:1397/1480 train_time:222380ms step_avg:160.33ms step:1398/1480 train_time:222546ms step_avg:160.34ms step:1399/1480 train_time:222717ms step_avg:160.34ms step:1400/1480 train_time:222895ms step_avg:160.36ms step:1401/1480 
train_time:223061ms step_avg:160.36ms step:1402/1480 train_time:223232ms step_avg:160.37ms step:1403/1480 train_time:223409ms step_avg:160.38ms step:1404/1480 train_time:223580ms step_avg:160.39ms step:1405/1480 train_time:223755ms step_avg:160.40ms step:1406/1480 train_time:223929ms step_avg:160.41ms step:1407/1480 train_time:224099ms step_avg:160.41ms step:1408/1480 train_time:224267ms step_avg:160.42ms step:1409/1480 train_time:224451ms step_avg:160.44ms step:1410/1480 train_time:224620ms step_avg:160.44ms step:1411/1480 train_time:224787ms step_avg:160.45ms step:1412/1480 train_time:224957ms step_avg:160.45ms step:1413/1480 train_time:225127ms step_avg:160.46ms step:1414/1480 train_time:225298ms step_avg:160.47ms step:1415/1480 train_time:225474ms step_avg:160.48ms step:1416/1480 train_time:225660ms step_avg:160.50ms step:1417/1480 train_time:225835ms step_avg:160.51ms step:1418/1480 train_time:226005ms step_avg:160.51ms step:1419/1480 train_time:226180ms step_avg:160.53ms step:1420/1480 train_time:226355ms step_avg:160.54ms step:1421/1480 train_time:226528ms step_avg:160.54ms step:1422/1480 train_time:226701ms step_avg:160.55ms step:1423/1480 train_time:226871ms step_avg:160.56ms step:1424/1480 train_time:227048ms step_avg:160.57ms step:1425/1480 train_time:227227ms step_avg:160.58ms step:1426/1480 train_time:227399ms step_avg:160.59ms step:1427/1480 train_time:227575ms step_avg:160.60ms step:1428/1480 train_time:227745ms step_avg:160.61ms step:1429/1480 train_time:227915ms step_avg:160.62ms step:1430/1480 train_time:228090ms step_avg:160.63ms step:1431/1480 train_time:228265ms step_avg:160.64ms step:1432/1480 train_time:228442ms step_avg:160.65ms step:1433/1480 train_time:228622ms step_avg:160.66ms step:1434/1480 train_time:228802ms step_avg:160.68ms step:1435/1480 train_time:228978ms step_avg:160.69ms step:1436/1480 train_time:229151ms step_avg:160.70ms step:1437/1480 train_time:229322ms step_avg:160.70ms step:1438/1480 train_time:229490ms step_avg:160.71ms 
step:1439/1480 train_time:229665ms step_avg:160.72ms step:1440/1480 train_time:229835ms step_avg:160.72ms step:1441/1480 train_time:230007ms step_avg:160.73ms step:1442/1480 train_time:230185ms step_avg:160.74ms step:1443/1480 train_time:230374ms step_avg:160.76ms step:1444/1480 train_time:230544ms step_avg:160.77ms step:1445/1480 train_time:230716ms step_avg:160.78ms step:1446/1480 train_time:230890ms step_avg:160.79ms step:1447/1480 train_time:231068ms step_avg:160.80ms step:1448/1480 train_time:231239ms step_avg:160.81ms step:1449/1480 train_time:231413ms step_avg:160.82ms step:1450/1480 train_time:231585ms step_avg:160.82ms step:1451/1480 train_time:231757ms step_avg:160.83ms step:1452/1480 train_time:231931ms step_avg:160.84ms step:1453/1480 train_time:232100ms step_avg:160.85ms step:1454/1480 train_time:232272ms step_avg:160.85ms step:1455/1480 train_time:232448ms step_avg:160.86ms step:1456/1480 train_time:232621ms step_avg:160.87ms step:1457/1480 train_time:232793ms step_avg:160.88ms step:1458/1480 train_time:232963ms step_avg:160.89ms step:1459/1480 train_time:233139ms step_avg:160.90ms step:1460/1480 train_time:233311ms step_avg:160.90ms step:1461/1480 train_time:233485ms step_avg:160.91ms step:1462/1480 train_time:233657ms step_avg:160.92ms step:1463/1480 train_time:233835ms step_avg:160.93ms step:1464/1480 train_time:234012ms step_avg:160.94ms step:1465/1480 train_time:234183ms step_avg:160.95ms step:1466/1480 train_time:234354ms step_avg:160.96ms step:1467/1480 train_time:234529ms step_avg:160.97ms step:1468/1480 train_time:234699ms step_avg:160.97ms step:1469/1480 train_time:234871ms step_avg:160.98ms step:1470/1480 train_time:235052ms step_avg:160.99ms step:1471/1480 train_time:235239ms step_avg:161.01ms step:1472/1480 train_time:235420ms step_avg:161.03ms step:1473/1480 train_time:235591ms step_avg:161.03ms step:1474/1480 train_time:235769ms step_avg:161.04ms step:1475/1480 train_time:235948ms step_avg:161.06ms step:1476/1480 train_time:236120ms 
step_avg:161.06ms step:1477/1480 train_time:236302ms step_avg:161.08ms step:1478/1480 train_time:236484ms step_avg:161.09ms step:1479/1480 train_time:236659ms step_avg:161.10ms step:1480/1480 train_time:236832ms step_avg:161.11ms step:1480/1480 val_loss:3.2782 train_time:236903ms step_avg:161.16ms