import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension; no learnable weight is passed."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied to a 4D tensor.

    The sequence length is read from dimension 1 and the rotated features from
    dimension 3 of the input. The cos/sin tables are cached and only rebuilt
    when the sequence length changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of features: base ** (-2i / dim)
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        # the cache is keyed on the sequence length only
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # insert singleton axes so the tables broadcast over dimensions 0 and 2 of x
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # the two halves of the last dimension are rotated against each other
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention computed with FlexAttention.

    Queries and keys are RMS-normalized and rotated with `Rotary`; the values
    are mixed with an externally supplied tensor `vi` through two learnable
    scalars before attention.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Arguments:
            x: Input whose first two dimensions are (batch, sequence); batch must be 1.
            vi: Tensor that is viewed like the per-head values and blended into them.
            block_mask: FlexAttention block mask restricting which keys each query sees.

        Returns:
            A tensor with the same shape as `x`.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # q, k, v are (B, T, n_head, head_dim) here; the head axis is moved in front of the sequence axis for the call
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block with a 4x hidden width and a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc   = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: attention and MLP, each applied to a normalized
    input and added back residually. Before that, the incoming activations are
    blended with `x0` through two learnable scalars (initialized to 1 and 0).
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Size parameters of the GPT model."""
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """
    The language model: token embedding, a stack of `Block`s arranged as an
    encoder half and a decoder half with weighted skip connections, and a
    soft-capped output head. `forward` returns the cross-entropy loss.
    """

    # NOTE: within this method the parameter `config` shadows the module-level
    # `config` (torch._inductor.config) imported at the top of the file.
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            idx: 1D tensor of token ids (asserted). Its length must be a multiple
                of 128, because it is reshaped into blocks of that size.
            target: Token ids to predict; flattened before the loss is computed.
            sliding_window: Tensor holding the attention window size in tokens.

        Returns:
            The cross-entropy loss between the soft-capped logits and `target`.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running count of token id 50256 gives every position a document index.
        # NOTE(review): 50256 is presumably the GPT-2 end-of-text id - confirm against the tokenizer.
        docs = (idx == 50256).cumsum(0)
        # document index at the first / last position of every block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: attend only backwards, within the same document,
            # and at a distance smaller than the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the rule above, evaluated for every pair of blocks.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks may share a document only if their document-index ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window size rounded up to whole blocks
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            # number of key/value blocks each query block may attend to
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort moves the allowed block indices to the front of each row
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton dimensions in front of the block axes
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one slice of the value embedding per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() returns the most recent encoder output first
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        # tanh soft cap keeps the logits within (-lm_head_softcap, lm_head_softcap)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """
    Read only the header of a data shard and return the token count it claims.

    The first three int32 header entries are the magic number (20240520), the
    format version (1) and the number of tokens; the first two are asserted.
    """
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """
    Read `ntok` uint16 tokens that follow the header of `file` into a pinned
    CPU tensor, asserting that exactly that many tokens were read.
    """
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header of 256 int32 values
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Cycles through the shards matching a glob pattern and hands every process
    its own slice of `T` tokens per batch.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        # the pattern is resolved relative to the current working directory
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full batch for all processes plus one target token
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # wraps around to the first shard after the last one
        self.current_shard = (self.current_shard + 1) % len(self.files)
        # every process starts at its own offset inside the shard
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """
        Return `(x, y)`: `T` input tokens as int32 and the same tokens shifted by
        one position as int64 targets, both copied to the GPU.
        """
        batch_size = self.T * self.num_processes
        # T + 1 tokens, so that inputs and targets can be offset by one
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Settings of the training run; instantiated once as `args` below."""
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
# RANK, LOCAL_RANK and WORLD_SIZE are read from the environment below
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = str(uuid.uuid4()) logdir = 'logs/%s/' % run_id # os.makedirs(logdir, exist_ok=True) logfile = 'logs/%s.txt' % run_id # create the log file with open(logfile, "w") as f: # begin the log by printing this file (the Python code) f.write(code) f.write('='*100 + '\n') def print0(s, logonly=False): if master_process: with open(logfile, "a") as f: if not logonly: print(s) f.write(s+'\n') # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # convenience variables T = args.sequence_length # calculate the number of steps to take in the val loop. assert args.val_tokens % (T * ddp_world_size) == 0 val_steps = args.val_tokens // (T * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size assert train_accumulation_steps == 1 # load tokens train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files") print0('='*100, logonly=True) x, y = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. 
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 12:29:49 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 125W / 700W | 533MiB / 81559MiB | 2% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 44C P0 75W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 119W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 45C P0 104W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI 
CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22927ms step_avg:nanms step:2/1480 train_time:23019ms step_avg:nanms step:3/1480 train_time:23159ms step_avg:nanms step:4/1480 train_time:23302ms step_avg:nanms step:5/1480 train_time:23442ms step_avg:nanms step:6/1480 train_time:23584ms step_avg:nanms step:7/1480 train_time:23724ms step_avg:nanms step:8/1480 train_time:23867ms step_avg:nanms step:9/1480 train_time:24011ms step_avg:nanms step:10/1480 train_time:24156ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:285ms step_avg:nanms step:13/1480 train_time:427ms step_avg:142.26ms step:14/1480 train_time:569ms step_avg:142.14ms step:15/1480 train_time:711ms step_avg:142.20ms step:16/1480 train_time:852ms step_avg:142.06ms step:17/1480 train_time:995ms step_avg:142.12ms step:18/1480 train_time:1138ms step_avg:142.25ms step:19/1480 train_time:1283ms step_avg:142.51ms step:20/1480 train_time:1425ms step_avg:142.51ms step:21/1480 train_time:1567ms step_avg:142.42ms step:22/1480 train_time:1708ms step_avg:142.36ms step:23/1480 train_time:1851ms step_avg:142.35ms step:24/1480 train_time:1993ms step_avg:142.38ms step:25/1480 train_time:2137ms step_avg:142.45ms step:26/1480 train_time:2281ms step_avg:142.58ms step:27/1480 train_time:2425ms step_avg:142.67ms step:28/1480 train_time:2567ms step_avg:142.62ms step:29/1480 train_time:2709ms 
step_avg:142.59ms step:30/1480 train_time:2852ms step_avg:142.58ms step:31/1480 train_time:2994ms step_avg:142.59ms step:32/1480 train_time:3139ms step_avg:142.67ms step:33/1480 train_time:3284ms step_avg:142.78ms step:34/1480 train_time:3429ms step_avg:142.86ms step:35/1480 train_time:3570ms step_avg:142.82ms step:36/1480 train_time:3713ms step_avg:142.80ms step:37/1480 train_time:3853ms step_avg:142.72ms step:38/1480 train_time:3994ms step_avg:142.66ms step:39/1480 train_time:4137ms step_avg:142.65ms step:40/1480 train_time:4280ms step_avg:142.68ms step:41/1480 train_time:4425ms step_avg:142.73ms step:42/1480 train_time:4567ms step_avg:142.72ms step:43/1480 train_time:4709ms step_avg:142.69ms step:44/1480 train_time:4850ms step_avg:142.65ms step:45/1480 train_time:4993ms step_avg:142.65ms step:46/1480 train_time:5135ms step_avg:142.63ms step:47/1480 train_time:5277ms step_avg:142.62ms step:48/1480 train_time:5421ms step_avg:142.67ms step:49/1480 train_time:5565ms step_avg:142.68ms step:50/1480 train_time:5708ms step_avg:142.70ms step:51/1480 train_time:5850ms step_avg:142.68ms step:52/1480 train_time:5991ms step_avg:142.65ms step:53/1480 train_time:6134ms step_avg:142.65ms step:54/1480 train_time:6277ms step_avg:142.67ms step:55/1480 train_time:6422ms step_avg:142.71ms step:56/1480 train_time:6564ms step_avg:142.70ms step:57/1480 train_time:6708ms step_avg:142.71ms step:58/1480 train_time:6848ms step_avg:142.68ms step:59/1480 train_time:6990ms step_avg:142.65ms step:60/1480 train_time:7133ms step_avg:142.66ms step:61/1480 train_time:7277ms step_avg:142.68ms step:62/1480 train_time:7421ms step_avg:142.71ms step:63/1480 train_time:7565ms step_avg:142.73ms step:64/1480 train_time:7708ms step_avg:142.74ms step:65/1480 train_time:7849ms step_avg:142.72ms step:66/1480 train_time:7990ms step_avg:142.68ms step:67/1480 train_time:8132ms step_avg:142.67ms step:68/1480 train_time:8274ms step_avg:142.66ms step:69/1480 train_time:8416ms step_avg:142.64ms step:70/1480 
train_time:8560ms step_avg:142.67ms step:71/1480 train_time:8705ms step_avg:142.70ms step:72/1480 train_time:8847ms step_avg:142.70ms step:73/1480 train_time:8989ms step_avg:142.69ms step:74/1480 train_time:9132ms step_avg:142.68ms step:75/1480 train_time:9275ms step_avg:142.70ms step:76/1480 train_time:9418ms step_avg:142.70ms step:77/1480 train_time:9560ms step_avg:142.69ms step:78/1480 train_time:9704ms step_avg:142.71ms step:79/1480 train_time:9847ms step_avg:142.71ms step:80/1480 train_time:9990ms step_avg:142.71ms step:81/1480 train_time:10132ms step_avg:142.71ms step:82/1480 train_time:10276ms step_avg:142.72ms step:83/1480 train_time:10420ms step_avg:142.73ms step:84/1480 train_time:10562ms step_avg:142.73ms step:85/1480 train_time:10706ms step_avg:142.75ms step:86/1480 train_time:10849ms step_avg:142.75ms step:87/1480 train_time:10992ms step_avg:142.75ms step:88/1480 train_time:11134ms step_avg:142.74ms step:89/1480 train_time:11277ms step_avg:142.75ms step:90/1480 train_time:11420ms step_avg:142.76ms step:91/1480 train_time:11564ms step_avg:142.77ms step:92/1480 train_time:11707ms step_avg:142.77ms step:93/1480 train_time:11850ms step_avg:142.77ms step:94/1480 train_time:11992ms step_avg:142.76ms step:95/1480 train_time:12134ms step_avg:142.75ms step:96/1480 train_time:12276ms step_avg:142.74ms step:97/1480 train_time:12420ms step_avg:142.76ms step:98/1480 train_time:12564ms step_avg:142.78ms step:99/1480 train_time:12708ms step_avg:142.78ms step:100/1480 train_time:12851ms step_avg:142.79ms step:101/1480 train_time:12993ms step_avg:142.78ms step:102/1480 train_time:13135ms step_avg:142.77ms step:103/1480 train_time:13278ms step_avg:142.77ms step:104/1480 train_time:13422ms step_avg:142.78ms step:105/1480 train_time:13565ms step_avg:142.79ms step:106/1480 train_time:13709ms step_avg:142.80ms step:107/1480 train_time:13851ms step_avg:142.79ms step:108/1480 train_time:13992ms step_avg:142.77ms step:109/1480 train_time:14135ms step_avg:142.78ms step:110/1480 
train_time:14278ms step_avg:142.78ms step:111/1480 train_time:14424ms step_avg:142.81ms step:112/1480 train_time:14572ms step_avg:142.86ms step:113/1480 train_time:14719ms step_avg:142.90ms step:114/1480 train_time:14866ms step_avg:142.94ms step:115/1480 train_time:15012ms step_avg:142.97ms step:116/1480 train_time:15158ms step_avg:143.00ms step:117/1480 train_time:15306ms step_avg:143.05ms step:118/1480 train_time:15452ms step_avg:143.08ms step:119/1480 train_time:15600ms step_avg:143.12ms step:120/1480 train_time:15748ms step_avg:143.17ms step:121/1480 train_time:15895ms step_avg:143.20ms step:122/1480 train_time:16043ms step_avg:143.24ms step:123/1480 train_time:16189ms step_avg:143.27ms step:124/1480 train_time:16336ms step_avg:143.30ms step:125/1480 train_time:16484ms step_avg:143.34ms step:125/1480 val_loss:4.4176 train_time:16542ms step_avg:143.84ms step:126/1480 train_time:16640ms step_avg:143.45ms step:127/1480 train_time:16790ms step_avg:143.51ms step:128/1480 train_time:16939ms step_avg:143.55ms step:129/1480 train_time:17084ms step_avg:143.56ms step:130/1480 train_time:17230ms step_avg:143.58ms step:131/1480 train_time:17377ms step_avg:143.61ms step:132/1480 train_time:17523ms step_avg:143.63ms step:133/1480 train_time:17673ms step_avg:143.68ms step:134/1480 train_time:17821ms step_avg:143.72ms step:135/1480 train_time:17970ms step_avg:143.76ms step:136/1480 train_time:18117ms step_avg:143.79ms step:137/1480 train_time:18263ms step_avg:143.80ms step:138/1480 train_time:18409ms step_avg:143.82ms step:139/1480 train_time:18558ms step_avg:143.86ms step:140/1480 train_time:18705ms step_avg:143.88ms step:141/1480 train_time:18853ms step_avg:143.92ms step:142/1480 train_time:19001ms step_avg:143.95ms step:143/1480 train_time:19147ms step_avg:143.96ms step:144/1480 train_time:19294ms step_avg:143.99ms step:145/1480 train_time:19441ms step_avg:144.01ms step:146/1480 train_time:19586ms step_avg:144.02ms step:147/1480 train_time:19734ms step_avg:144.05ms 
step:148/1480 train_time:19882ms step_avg:144.07ms step:149/1480 train_time:20029ms step_avg:144.09ms step:150/1480 train_time:20177ms step_avg:144.12ms step:151/1480 train_time:20323ms step_avg:144.13ms step:152/1480 train_time:20470ms step_avg:144.15ms step:153/1480 train_time:20618ms step_avg:144.18ms step:154/1480 train_time:20766ms step_avg:144.21ms step:155/1480 train_time:20914ms step_avg:144.23ms step:156/1480 train_time:21062ms step_avg:144.26ms step:157/1480 train_time:21208ms step_avg:144.27ms step:158/1480 train_time:21356ms step_avg:144.30ms step:159/1480 train_time:21503ms step_avg:144.31ms step:160/1480 train_time:21650ms step_avg:144.33ms step:161/1480 train_time:21797ms step_avg:144.35ms step:162/1480 train_time:21943ms step_avg:144.36ms step:163/1480 train_time:22089ms step_avg:144.38ms step:164/1480 train_time:22238ms step_avg:144.40ms step:165/1480 train_time:22385ms step_avg:144.42ms step:166/1480 train_time:22533ms step_avg:144.44ms step:167/1480 train_time:22681ms step_avg:144.46ms step:168/1480 train_time:22828ms step_avg:144.48ms step:169/1480 train_time:22976ms step_avg:144.50ms step:170/1480 train_time:23123ms step_avg:144.52ms step:171/1480 train_time:23270ms step_avg:144.54ms step:172/1480 train_time:23417ms step_avg:144.55ms step:173/1480 train_time:23563ms step_avg:144.56ms step:174/1480 train_time:23711ms step_avg:144.58ms step:175/1480 train_time:23859ms step_avg:144.60ms step:176/1480 train_time:24005ms step_avg:144.61ms step:177/1480 train_time:24153ms step_avg:144.63ms step:178/1480 train_time:24300ms step_avg:144.64ms step:179/1480 train_time:24447ms step_avg:144.65ms step:180/1480 train_time:24594ms step_avg:144.67ms step:181/1480 train_time:24741ms step_avg:144.69ms step:182/1480 train_time:24888ms step_avg:144.70ms step:183/1480 train_time:25036ms step_avg:144.72ms step:184/1480 train_time:25182ms step_avg:144.73ms step:185/1480 train_time:25328ms step_avg:144.73ms step:186/1480 train_time:25476ms step_avg:144.75ms 
step:187/1480 train_time:25622ms step_avg:144.76ms step:188/1480 train_time:25769ms step_avg:144.77ms step:189/1480 train_time:25917ms step_avg:144.79ms step:190/1480 train_time:26064ms step_avg:144.80ms step:191/1480 train_time:26210ms step_avg:144.81ms step:192/1480 train_time:26358ms step_avg:144.83ms step:193/1480 train_time:26505ms step_avg:144.83ms step:194/1480 train_time:26651ms step_avg:144.84ms step:195/1480 train_time:26798ms step_avg:144.85ms step:196/1480 train_time:26945ms step_avg:144.87ms step:197/1480 train_time:27093ms step_avg:144.88ms step:198/1480 train_time:27241ms step_avg:144.90ms step:199/1480 train_time:27387ms step_avg:144.91ms step:200/1480 train_time:27536ms step_avg:144.93ms step:201/1480 train_time:27682ms step_avg:144.93ms step:202/1480 train_time:27829ms step_avg:144.95ms step:203/1480 train_time:27976ms step_avg:144.95ms step:204/1480 train_time:28123ms step_avg:144.96ms step:205/1480 train_time:28271ms step_avg:144.98ms step:206/1480 train_time:28418ms step_avg:144.99ms step:207/1480 train_time:28565ms step_avg:145.00ms step:208/1480 train_time:28713ms step_avg:145.01ms step:209/1480 train_time:28861ms step_avg:145.03ms step:210/1480 train_time:29007ms step_avg:145.04ms step:211/1480 train_time:29155ms step_avg:145.05ms step:212/1480 train_time:29301ms step_avg:145.06ms step:213/1480 train_time:29448ms step_avg:145.07ms step:214/1480 train_time:29595ms step_avg:145.07ms step:215/1480 train_time:29743ms step_avg:145.09ms step:216/1480 train_time:29889ms step_avg:145.09ms step:217/1480 train_time:30038ms step_avg:145.11ms step:218/1480 train_time:30184ms step_avg:145.12ms step:219/1480 train_time:30332ms step_avg:145.13ms step:220/1480 train_time:30479ms step_avg:145.14ms step:221/1480 train_time:30628ms step_avg:145.15ms step:222/1480 train_time:30779ms step_avg:145.18ms step:223/1480 train_time:30929ms step_avg:145.21ms step:224/1480 train_time:31080ms step_avg:145.23ms step:225/1480 train_time:31229ms step_avg:145.25ms 
step:226/1480 train_time:31381ms step_avg:145.28ms step:227/1480 train_time:31531ms step_avg:145.31ms step:228/1480 train_time:31682ms step_avg:145.33ms step:229/1480 train_time:31833ms step_avg:145.35ms step:230/1480 train_time:31983ms step_avg:145.38ms step:231/1480 train_time:32133ms step_avg:145.40ms step:232/1480 train_time:32284ms step_avg:145.42ms step:233/1480 train_time:32435ms step_avg:145.45ms step:234/1480 train_time:32584ms step_avg:145.46ms step:235/1480 train_time:32736ms step_avg:145.49ms step:236/1480 train_time:32887ms step_avg:145.52ms step:237/1480 train_time:33037ms step_avg:145.54ms step:238/1480 train_time:33188ms step_avg:145.56ms step:239/1480 train_time:33339ms step_avg:145.59ms step:240/1480 train_time:33490ms step_avg:145.61ms step:241/1480 train_time:33641ms step_avg:145.63ms step:242/1480 train_time:33791ms step_avg:145.65ms step:243/1480 train_time:33942ms step_avg:145.67ms step:244/1480 train_time:34093ms step_avg:145.70ms step:245/1480 train_time:34244ms step_avg:145.72ms step:246/1480 train_time:34394ms step_avg:145.74ms step:247/1480 train_time:34545ms step_avg:145.76ms step:248/1480 train_time:34695ms step_avg:145.78ms step:249/1480 train_time:34845ms step_avg:145.80ms step:250/1480 train_time:34996ms step_avg:145.82ms step:250/1480 val_loss:3.9957 train_time:35056ms step_avg:146.07ms step:251/1480 train_time:35153ms step_avg:145.86ms step:252/1480 train_time:35305ms step_avg:145.89ms step:253/1480 train_time:35455ms step_avg:145.90ms step:254/1480 train_time:35604ms step_avg:145.92ms step:255/1480 train_time:35752ms step_avg:145.93ms step:256/1480 train_time:35902ms step_avg:145.94ms step:257/1480 train_time:36053ms step_avg:145.97ms step:258/1480 train_time:36207ms step_avg:145.99ms step:259/1480 train_time:36359ms step_avg:146.02ms step:260/1480 train_time:36510ms step_avg:146.04ms step:261/1480 train_time:36660ms step_avg:146.06ms step:262/1480 train_time:36811ms step_avg:146.07ms step:263/1480 train_time:36961ms 
step_avg:146.09ms step:264/1480 train_time:37112ms step_avg:146.11ms step:265/1480 train_time:37265ms step_avg:146.14ms step:266/1480 train_time:37415ms step_avg:146.15ms step:267/1480 train_time:37566ms step_avg:146.17ms step:268/1480 train_time:37716ms step_avg:146.19ms step:269/1480 train_time:37866ms step_avg:146.20ms step:270/1480 train_time:38016ms step_avg:146.22ms step:271/1480 train_time:38168ms step_avg:146.24ms step:272/1480 train_time:38318ms step_avg:146.25ms step:273/1480 train_time:38470ms step_avg:146.27ms step:274/1480 train_time:38621ms step_avg:146.29ms step:275/1480 train_time:38771ms step_avg:146.31ms step:276/1480 train_time:38922ms step_avg:146.32ms step:277/1480 train_time:39073ms step_avg:146.34ms step:278/1480 train_time:39225ms step_avg:146.36ms step:279/1480 train_time:39375ms step_avg:146.38ms step:280/1480 train_time:39526ms step_avg:146.39ms step:281/1480 train_time:39677ms step_avg:146.41ms step:282/1480 train_time:39828ms step_avg:146.43ms step:283/1480 train_time:39979ms step_avg:146.44ms step:284/1480 train_time:40129ms step_avg:146.46ms step:285/1480 train_time:40281ms step_avg:146.48ms step:286/1480 train_time:40430ms step_avg:146.49ms step:287/1480 train_time:40582ms step_avg:146.51ms step:288/1480 train_time:40732ms step_avg:146.52ms step:289/1480 train_time:40884ms step_avg:146.54ms step:290/1480 train_time:41033ms step_avg:146.55ms step:291/1480 train_time:41183ms step_avg:146.56ms step:292/1480 train_time:41333ms step_avg:146.57ms step:293/1480 train_time:41484ms step_avg:146.59ms step:294/1480 train_time:41633ms step_avg:146.60ms step:295/1480 train_time:41784ms step_avg:146.61ms step:296/1480 train_time:41934ms step_avg:146.62ms step:297/1480 train_time:42085ms step_avg:146.64ms step:298/1480 train_time:42233ms step_avg:146.64ms step:299/1480 train_time:42386ms step_avg:146.66ms step:300/1480 train_time:42538ms step_avg:146.68ms step:301/1480 train_time:42687ms step_avg:146.69ms step:302/1480 train_time:42838ms 
step_avg:146.70ms step:303/1480 train_time:42988ms step_avg:146.72ms step:304/1480 train_time:43138ms step_avg:146.73ms step:305/1480 train_time:43288ms step_avg:146.74ms step:306/1480 train_time:43439ms step_avg:146.75ms step:307/1480 train_time:43590ms step_avg:146.77ms step:308/1480 train_time:43740ms step_avg:146.78ms step:309/1480 train_time:43891ms step_avg:146.79ms step:310/1480 train_time:44042ms step_avg:146.81ms step:311/1480 train_time:44191ms step_avg:146.82ms step:312/1480 train_time:44343ms step_avg:146.83ms step:313/1480 train_time:44492ms step_avg:146.84ms step:314/1480 train_time:44643ms step_avg:146.85ms step:315/1480 train_time:44793ms step_avg:146.86ms step:316/1480 train_time:44944ms step_avg:146.88ms step:317/1480 train_time:45095ms step_avg:146.89ms step:318/1480 train_time:45246ms step_avg:146.90ms step:319/1480 train_time:45398ms step_avg:146.92ms step:320/1480 train_time:45548ms step_avg:146.93ms step:321/1480 train_time:45698ms step_avg:146.94ms step:322/1480 train_time:45849ms step_avg:146.95ms step:323/1480 train_time:45999ms step_avg:146.96ms step:324/1480 train_time:46149ms step_avg:146.97ms step:325/1480 train_time:46299ms step_avg:146.98ms step:326/1480 train_time:46451ms step_avg:147.00ms step:327/1480 train_time:46600ms step_avg:147.00ms step:328/1480 train_time:46750ms step_avg:147.01ms step:329/1480 train_time:46900ms step_avg:147.02ms step:330/1480 train_time:47052ms step_avg:147.04ms step:331/1480 train_time:47206ms step_avg:147.06ms step:332/1480 train_time:47360ms step_avg:147.08ms step:333/1480 train_time:47512ms step_avg:147.10ms step:334/1480 train_time:47666ms step_avg:147.12ms step:335/1480 train_time:47820ms step_avg:147.14ms step:336/1480 train_time:47974ms step_avg:147.16ms step:337/1480 train_time:48128ms step_avg:147.18ms step:338/1480 train_time:48283ms step_avg:147.21ms step:339/1480 train_time:48438ms step_avg:147.23ms step:340/1480 train_time:48592ms step_avg:147.25ms step:341/1480 train_time:48745ms 
step_avg:147.27ms step:342/1480 train_time:48900ms step_avg:147.29ms step:343/1480 train_time:49054ms step_avg:147.31ms step:344/1480 train_time:49209ms step_avg:147.33ms step:345/1480 train_time:49364ms step_avg:147.36ms step:346/1480 train_time:49519ms step_avg:147.38ms step:347/1480 train_time:49673ms step_avg:147.40ms step:348/1480 train_time:49826ms step_avg:147.41ms step:349/1480 train_time:49980ms step_avg:147.43ms step:350/1480 train_time:50133ms step_avg:147.45ms step:351/1480 train_time:50288ms step_avg:147.47ms step:352/1480 train_time:50442ms step_avg:147.49ms step:353/1480 train_time:50596ms step_avg:147.51ms step:354/1480 train_time:50749ms step_avg:147.52ms step:355/1480 train_time:50902ms step_avg:147.54ms step:356/1480 train_time:51055ms step_avg:147.56ms step:357/1480 train_time:51209ms step_avg:147.58ms step:358/1480 train_time:51363ms step_avg:147.60ms step:359/1480 train_time:51519ms step_avg:147.62ms step:360/1480 train_time:51675ms step_avg:147.64ms step:361/1480 train_time:51829ms step_avg:147.66ms step:362/1480 train_time:51983ms step_avg:147.68ms step:363/1480 train_time:52136ms step_avg:147.69ms step:364/1480 train_time:52288ms step_avg:147.71ms step:365/1480 train_time:52443ms step_avg:147.73ms step:366/1480 train_time:52597ms step_avg:147.74ms step:367/1480 train_time:52751ms step_avg:147.76ms step:368/1480 train_time:52905ms step_avg:147.78ms step:369/1480 train_time:53059ms step_avg:147.80ms step:370/1480 train_time:53212ms step_avg:147.81ms step:371/1480 train_time:53365ms step_avg:147.83ms step:372/1480 train_time:53519ms step_avg:147.84ms step:373/1480 train_time:53674ms step_avg:147.86ms step:374/1480 train_time:53828ms step_avg:147.88ms step:375/1480 train_time:53981ms step_avg:147.89ms step:375/1480 val_loss:3.8040 train_time:54042ms step_avg:148.06ms step:376/1480 train_time:54140ms step_avg:147.92ms step:377/1480 train_time:54297ms step_avg:147.95ms step:378/1480 train_time:54450ms step_avg:147.96ms step:379/1480 
train_time:54603ms step_avg:147.97ms step:380/1480 train_time:54755ms step_avg:147.99ms step:381/1480 train_time:54907ms step_avg:148.00ms step:382/1480 train_time:55062ms step_avg:148.02ms step:383/1480 train_time:55217ms step_avg:148.03ms step:384/1480 train_time:55372ms step_avg:148.05ms step:385/1480 train_time:55526ms step_avg:148.07ms step:386/1480 train_time:55680ms step_avg:148.08ms step:387/1480 train_time:55832ms step_avg:148.10ms step:388/1480 train_time:55985ms step_avg:148.11ms step:389/1480 train_time:56139ms step_avg:148.12ms step:390/1480 train_time:56295ms step_avg:148.14ms step:391/1480 train_time:56449ms step_avg:148.16ms step:392/1480 train_time:56602ms step_avg:148.17ms step:393/1480 train_time:56756ms step_avg:148.19ms step:394/1480 train_time:56908ms step_avg:148.20ms step:395/1480 train_time:57062ms step_avg:148.21ms step:396/1480 train_time:57216ms step_avg:148.23ms step:397/1480 train_time:57370ms step_avg:148.24ms step:398/1480 train_time:57524ms step_avg:148.26ms step:399/1480 train_time:57678ms step_avg:148.27ms step:400/1480 train_time:57833ms step_avg:148.29ms step:401/1480 train_time:57985ms step_avg:148.30ms step:402/1480 train_time:58139ms step_avg:148.31ms step:403/1480 train_time:58294ms step_avg:148.33ms step:404/1480 train_time:58447ms step_avg:148.34ms step:405/1480 train_time:58601ms step_avg:148.36ms step:406/1480 train_time:58755ms step_avg:148.37ms step:407/1480 train_time:58908ms step_avg:148.38ms step:408/1480 train_time:59061ms step_avg:148.40ms step:409/1480 train_time:59215ms step_avg:148.41ms step:410/1480 train_time:59368ms step_avg:148.42ms step:411/1480 train_time:59521ms step_avg:148.43ms step:412/1480 train_time:59675ms step_avg:148.45ms step:413/1480 train_time:59829ms step_avg:148.46ms step:414/1480 train_time:59982ms step_avg:148.47ms step:415/1480 train_time:60136ms step_avg:148.48ms step:416/1480 train_time:60290ms step_avg:148.50ms step:417/1480 train_time:60443ms step_avg:148.51ms step:418/1480 
train_time:60597ms step_avg:148.52ms step:419/1480 train_time:60750ms step_avg:148.53ms step:420/1480 train_time:60904ms step_avg:148.55ms step:421/1480 train_time:61058ms step_avg:148.56ms step:422/1480 train_time:61212ms step_avg:148.57ms step:423/1480 train_time:61365ms step_avg:148.58ms step:424/1480 train_time:61519ms step_avg:148.60ms step:425/1480 train_time:61673ms step_avg:148.61ms step:426/1480 train_time:61828ms step_avg:148.63ms step:427/1480 train_time:61982ms step_avg:148.64ms step:428/1480 train_time:62135ms step_avg:148.65ms step:429/1480 train_time:62288ms step_avg:148.66ms step:430/1480 train_time:62441ms step_avg:148.67ms step:431/1480 train_time:62594ms step_avg:148.68ms step:432/1480 train_time:62748ms step_avg:148.69ms step:433/1480 train_time:62901ms step_avg:148.70ms step:434/1480 train_time:63056ms step_avg:148.72ms step:435/1480 train_time:63210ms step_avg:148.73ms step:436/1480 train_time:63364ms step_avg:148.74ms step:437/1480 train_time:63517ms step_avg:148.75ms step:438/1480 train_time:63673ms step_avg:148.77ms step:439/1480 train_time:63826ms step_avg:148.78ms step:440/1480 train_time:63981ms step_avg:148.79ms step:441/1480 train_time:64138ms step_avg:148.81ms step:442/1480 train_time:64297ms step_avg:148.83ms step:443/1480 train_time:64454ms step_avg:148.85ms step:444/1480 train_time:64610ms step_avg:148.87ms step:445/1480 train_time:64764ms step_avg:148.88ms step:446/1480 train_time:64919ms step_avg:148.90ms step:447/1480 train_time:65076ms step_avg:148.92ms step:448/1480 train_time:65232ms step_avg:148.93ms step:449/1480 train_time:65391ms step_avg:148.95ms step:450/1480 train_time:65549ms step_avg:148.98ms step:451/1480 train_time:65705ms step_avg:148.99ms step:452/1480 train_time:65861ms step_avg:149.01ms step:453/1480 train_time:66017ms step_avg:149.02ms step:454/1480 train_time:66175ms step_avg:149.04ms step:455/1480 train_time:66331ms step_avg:149.06ms step:456/1480 train_time:66487ms step_avg:149.07ms step:457/1480 
train_time:66644ms step_avg:149.09ms step:458/1480 train_time:66800ms step_avg:149.11ms step:459/1480 train_time:66959ms step_avg:149.13ms step:460/1480 train_time:67117ms step_avg:149.15ms step:461/1480 train_time:67277ms step_avg:149.17ms step:462/1480 train_time:67435ms step_avg:149.19ms step:463/1480 train_time:67592ms step_avg:149.21ms step:464/1480 train_time:67748ms step_avg:149.22ms step:465/1480 train_time:67904ms step_avg:149.24ms step:466/1480 train_time:68061ms step_avg:149.26ms step:467/1480 train_time:68218ms step_avg:149.27ms step:468/1480 train_time:68377ms step_avg:149.29ms step:469/1480 train_time:68534ms step_avg:149.31ms step:470/1480 train_time:68691ms step_avg:149.33ms step:471/1480 train_time:68848ms step_avg:149.34ms step:472/1480 train_time:69004ms step_avg:149.36ms step:473/1480 train_time:69161ms step_avg:149.38ms step:474/1480 train_time:69318ms step_avg:149.39ms step:475/1480 train_time:69476ms step_avg:149.41ms step:476/1480 train_time:69632ms step_avg:149.43ms step:477/1480 train_time:69788ms step_avg:149.44ms step:478/1480 train_time:69944ms step_avg:149.45ms step:479/1480 train_time:70101ms step_avg:149.47ms step:480/1480 train_time:70259ms step_avg:149.49ms step:481/1480 train_time:70415ms step_avg:149.50ms step:482/1480 train_time:70574ms step_avg:149.52ms step:483/1480 train_time:70731ms step_avg:149.54ms step:484/1480 train_time:70889ms step_avg:149.55ms step:485/1480 train_time:71044ms step_avg:149.57ms step:486/1480 train_time:71201ms step_avg:149.58ms step:487/1480 train_time:71358ms step_avg:149.60ms step:488/1480 train_time:71516ms step_avg:149.61ms step:489/1480 train_time:71672ms step_avg:149.63ms step:490/1480 train_time:71827ms step_avg:149.64ms step:491/1480 train_time:71984ms step_avg:149.65ms step:492/1480 train_time:72140ms step_avg:149.67ms step:493/1480 train_time:72298ms step_avg:149.69ms step:494/1480 train_time:72457ms step_avg:149.70ms step:495/1480 train_time:72615ms step_avg:149.72ms step:496/1480 
train_time:72774ms step_avg:149.74ms step:497/1480 train_time:72931ms step_avg:149.76ms step:498/1480 train_time:73087ms step_avg:149.77ms step:499/1480 train_time:73244ms step_avg:149.78ms step:500/1480 train_time:73401ms step_avg:149.80ms step:500/1480 val_loss:3.6823 train_time:73463ms step_avg:149.92ms step:501/1480 train_time:73564ms step_avg:149.82ms step:502/1480 train_time:73723ms step_avg:149.84ms step:503/1480 train_time:73881ms step_avg:149.86ms step:504/1480 train_time:74036ms step_avg:149.87ms step:505/1480 train_time:74190ms step_avg:149.88ms step:506/1480 train_time:74347ms step_avg:149.89ms step:507/1480 train_time:74504ms step_avg:149.91ms step:508/1480 train_time:74665ms step_avg:149.93ms step:509/1480 train_time:74823ms step_avg:149.95ms step:510/1480 train_time:74978ms step_avg:149.96ms step:511/1480 train_time:75135ms step_avg:149.97ms step:512/1480 train_time:75291ms step_avg:149.98ms step:513/1480 train_time:75447ms step_avg:149.99ms step:514/1480 train_time:75604ms step_avg:150.01ms step:515/1480 train_time:75763ms step_avg:150.03ms step:516/1480 train_time:75922ms step_avg:150.04ms step:517/1480 train_time:76078ms step_avg:150.06ms step:518/1480 train_time:76236ms step_avg:150.07ms step:519/1480 train_time:76392ms step_avg:150.08ms step:520/1480 train_time:76549ms step_avg:150.10ms step:521/1480 train_time:76704ms step_avg:150.11ms step:522/1480 train_time:76864ms step_avg:150.12ms step:523/1480 train_time:77021ms step_avg:150.14ms step:524/1480 train_time:77178ms step_avg:150.15ms step:525/1480 train_time:77336ms step_avg:150.17ms step:526/1480 train_time:77492ms step_avg:150.18ms step:527/1480 train_time:77648ms step_avg:150.19ms step:528/1480 train_time:77804ms step_avg:150.20ms step:529/1480 train_time:77963ms step_avg:150.22ms step:530/1480 train_time:78122ms step_avg:150.24ms step:531/1480 train_time:78279ms step_avg:150.25ms step:532/1480 train_time:78435ms step_avg:150.26ms step:533/1480 train_time:78591ms step_avg:150.27ms 
step:534/1480 train_time:78747ms step_avg:150.28ms step:535/1480 train_time:78903ms step_avg:150.29ms step:536/1480 train_time:79062ms step_avg:150.31ms step:537/1480 train_time:79218ms step_avg:150.32ms step:538/1480 train_time:79375ms step_avg:150.33ms step:539/1480 train_time:79533ms step_avg:150.35ms step:540/1480 train_time:79691ms step_avg:150.36ms step:541/1480 train_time:79846ms step_avg:150.37ms step:542/1480 train_time:80002ms step_avg:150.38ms step:543/1480 train_time:80158ms step_avg:150.39ms step:544/1480 train_time:80314ms step_avg:150.40ms step:545/1480 train_time:80469ms step_avg:150.41ms step:546/1480 train_time:80626ms step_avg:150.42ms step:547/1480 train_time:80784ms step_avg:150.44ms step:548/1480 train_time:80944ms step_avg:150.45ms step:549/1480 train_time:81102ms step_avg:150.47ms step:550/1480 train_time:81262ms step_avg:150.49ms step:551/1480 train_time:81421ms step_avg:150.50ms step:552/1480 train_time:81579ms step_avg:150.51ms step:553/1480 train_time:81740ms step_avg:150.53ms step:554/1480 train_time:81900ms step_avg:150.55ms step:555/1480 train_time:82062ms step_avg:150.57ms step:556/1480 train_time:82221ms step_avg:150.59ms step:557/1480 train_time:82381ms step_avg:150.61ms step:558/1480 train_time:82541ms step_avg:150.62ms step:559/1480 train_time:82700ms step_avg:150.64ms step:560/1480 train_time:82859ms step_avg:150.65ms step:561/1480 train_time:83017ms step_avg:150.67ms step:562/1480 train_time:83175ms step_avg:150.68ms step:563/1480 train_time:83333ms step_avg:150.69ms step:564/1480 train_time:83491ms step_avg:150.71ms step:565/1480 train_time:83651ms step_avg:150.72ms step:566/1480 train_time:83811ms step_avg:150.74ms step:567/1480 train_time:83970ms step_avg:150.75ms step:568/1480 train_time:84127ms step_avg:150.76ms step:569/1480 train_time:84285ms step_avg:150.78ms step:570/1480 train_time:84445ms step_avg:150.79ms step:571/1480 train_time:84605ms step_avg:150.81ms step:572/1480 train_time:84765ms step_avg:150.83ms 
step:573/1480 train_time:84926ms step_avg:150.85ms step:574/1480 train_time:85088ms step_avg:150.87ms step:575/1480 train_time:85249ms step_avg:150.88ms step:576/1480 train_time:85407ms step_avg:150.90ms step:577/1480 train_time:85568ms step_avg:150.91ms step:578/1480 train_time:85726ms step_avg:150.93ms step:579/1480 train_time:85885ms step_avg:150.94ms step:580/1480 train_time:86044ms step_avg:150.96ms step:581/1480 train_time:86206ms step_avg:150.97ms step:582/1480 train_time:86368ms step_avg:150.99ms step:583/1480 train_time:86527ms step_avg:151.01ms step:584/1480 train_time:86686ms step_avg:151.02ms step:585/1480 train_time:86844ms step_avg:151.03ms step:586/1480 train_time:87004ms step_avg:151.05ms step:587/1480 train_time:87165ms step_avg:151.07ms step:588/1480 train_time:87323ms step_avg:151.08ms step:589/1480 train_time:87484ms step_avg:151.10ms step:590/1480 train_time:87646ms step_avg:151.11ms step:591/1480 train_time:87805ms step_avg:151.13ms step:592/1480 train_time:87966ms step_avg:151.14ms step:593/1480 train_time:88126ms step_avg:151.16ms step:594/1480 train_time:88287ms step_avg:151.18ms step:595/1480 train_time:88448ms step_avg:151.19ms step:596/1480 train_time:88608ms step_avg:151.21ms step:597/1480 train_time:88767ms step_avg:151.22ms step:598/1480 train_time:88925ms step_avg:151.23ms step:599/1480 train_time:89084ms step_avg:151.25ms step:600/1480 train_time:89245ms step_avg:151.26ms step:601/1480 train_time:89405ms step_avg:151.28ms step:602/1480 train_time:89566ms step_avg:151.29ms step:603/1480 train_time:89727ms step_avg:151.31ms step:604/1480 train_time:89886ms step_avg:151.32ms step:605/1480 train_time:90047ms step_avg:151.34ms step:606/1480 train_time:90207ms step_avg:151.35ms step:607/1480 train_time:90370ms step_avg:151.37ms step:608/1480 train_time:90530ms step_avg:151.39ms step:609/1480 train_time:90690ms step_avg:151.40ms step:610/1480 train_time:90848ms step_avg:151.41ms step:611/1480 train_time:91008ms step_avg:151.43ms 
step:612/1480 train_time:91169ms step_avg:151.44ms step:613/1480 train_time:91329ms step_avg:151.46ms step:614/1480 train_time:91488ms step_avg:151.47ms step:615/1480 train_time:91646ms step_avg:151.48ms step:616/1480 train_time:91804ms step_avg:151.49ms step:617/1480 train_time:91964ms step_avg:151.51ms step:618/1480 train_time:92124ms step_avg:151.52ms step:619/1480 train_time:92284ms step_avg:151.53ms step:620/1480 train_time:92444ms step_avg:151.55ms step:621/1480 train_time:92604ms step_avg:151.56ms step:622/1480 train_time:92765ms step_avg:151.58ms step:623/1480 train_time:92927ms step_avg:151.59ms step:624/1480 train_time:93087ms step_avg:151.61ms step:625/1480 train_time:93246ms step_avg:151.62ms step:625/1480 val_loss:3.6048 train_time:93309ms step_avg:151.72ms step:626/1480 train_time:93407ms step_avg:151.64ms step:627/1480 train_time:93568ms step_avg:151.65ms step:628/1480 train_time:93727ms step_avg:151.66ms step:629/1480 train_time:93886ms step_avg:151.67ms step:630/1480 train_time:94044ms step_avg:151.68ms step:631/1480 train_time:94203ms step_avg:151.70ms step:632/1480 train_time:94363ms step_avg:151.71ms step:633/1480 train_time:94524ms step_avg:151.72ms step:634/1480 train_time:94684ms step_avg:151.74ms step:635/1480 train_time:94844ms step_avg:151.75ms step:636/1480 train_time:95004ms step_avg:151.76ms step:637/1480 train_time:95164ms step_avg:151.78ms step:638/1480 train_time:95324ms step_avg:151.79ms step:639/1480 train_time:95482ms step_avg:151.80ms step:640/1480 train_time:95640ms step_avg:151.81ms step:641/1480 train_time:95798ms step_avg:151.82ms step:642/1480 train_time:95956ms step_avg:151.83ms step:643/1480 train_time:96115ms step_avg:151.84ms step:644/1480 train_time:96272ms step_avg:151.85ms step:645/1480 train_time:96430ms step_avg:151.86ms step:646/1480 train_time:96590ms step_avg:151.87ms step:647/1480 train_time:96749ms step_avg:151.88ms step:648/1480 train_time:96910ms step_avg:151.90ms step:649/1480 train_time:97070ms 
step_avg:151.91ms step:650/1480 train_time:97231ms step_avg:151.92ms step:651/1480 train_time:97390ms step_avg:151.93ms step:652/1480 train_time:97550ms step_avg:151.95ms step:653/1480 train_time:97709ms step_avg:151.96ms step:654/1480 train_time:97868ms step_avg:151.97ms step:655/1480 train_time:98029ms step_avg:151.98ms step:656/1480 train_time:98189ms step_avg:152.00ms step:657/1480 train_time:98349ms step_avg:152.01ms step:658/1480 train_time:98509ms step_avg:152.02ms step:659/1480 train_time:98671ms step_avg:152.04ms step:660/1480 train_time:98833ms step_avg:152.05ms step:661/1480 train_time:98995ms step_avg:152.07ms step:662/1480 train_time:99156ms step_avg:152.08ms step:663/1480 train_time:99316ms step_avg:152.09ms step:664/1480 train_time:99476ms step_avg:152.10ms step:665/1480 train_time:99638ms step_avg:152.12ms step:666/1480 train_time:99798ms step_avg:152.13ms step:667/1480 train_time:99958ms step_avg:152.14ms step:668/1480 train_time:100119ms step_avg:152.16ms step:669/1480 train_time:100281ms step_avg:152.17ms step:670/1480 train_time:100441ms step_avg:152.18ms step:671/1480 train_time:100602ms step_avg:152.20ms step:672/1480 train_time:100765ms step_avg:152.21ms step:673/1480 train_time:100928ms step_avg:152.23ms step:674/1480 train_time:101090ms step_avg:152.24ms step:675/1480 train_time:101253ms step_avg:152.26ms step:676/1480 train_time:101415ms step_avg:152.27ms step:677/1480 train_time:101574ms step_avg:152.28ms step:678/1480 train_time:101735ms step_avg:152.30ms step:679/1480 train_time:101898ms step_avg:152.31ms step:680/1480 train_time:102058ms step_avg:152.32ms step:681/1480 train_time:102218ms step_avg:152.34ms step:682/1480 train_time:102380ms step_avg:152.35ms step:683/1480 train_time:102540ms step_avg:152.36ms step:684/1480 train_time:102702ms step_avg:152.38ms step:685/1480 train_time:102864ms step_avg:152.39ms step:686/1480 train_time:103026ms step_avg:152.41ms step:687/1480 train_time:103188ms step_avg:152.42ms step:688/1480 
train_time:103352ms step_avg:152.44ms step:689/1480 train_time:103515ms step_avg:152.45ms step:690/1480 train_time:103676ms step_avg:152.47ms step:691/1480 train_time:103836ms step_avg:152.48ms step:692/1480 train_time:103996ms step_avg:152.49ms step:693/1480 train_time:104157ms step_avg:152.50ms step:694/1480 train_time:104318ms step_avg:152.51ms step:695/1480 train_time:104478ms step_avg:152.52ms step:696/1480 train_time:104637ms step_avg:152.53ms step:697/1480 train_time:104800ms step_avg:152.55ms step:698/1480 train_time:104960ms step_avg:152.56ms step:699/1480 train_time:105123ms step_avg:152.57ms step:700/1480 train_time:105287ms step_avg:152.59ms step:701/1480 train_time:105448ms step_avg:152.60ms step:702/1480 train_time:105610ms step_avg:152.62ms step:703/1480 train_time:105770ms step_avg:152.63ms step:704/1480 train_time:105931ms step_avg:152.64ms step:705/1480 train_time:106092ms step_avg:152.65ms step:706/1480 train_time:106256ms step_avg:152.67ms step:707/1480 train_time:106417ms step_avg:152.68ms step:708/1480 train_time:106577ms step_avg:152.69ms step:709/1480 train_time:106738ms step_avg:152.70ms step:710/1480 train_time:106898ms step_avg:152.71ms step:711/1480 train_time:107061ms step_avg:152.73ms step:712/1480 train_time:107228ms step_avg:152.75ms step:713/1480 train_time:107392ms step_avg:152.76ms step:714/1480 train_time:107552ms step_avg:152.77ms step:715/1480 train_time:107712ms step_avg:152.78ms step:716/1480 train_time:107870ms step_avg:152.79ms step:717/1480 train_time:108034ms step_avg:152.81ms step:718/1480 train_time:108193ms step_avg:152.82ms step:719/1480 train_time:108353ms step_avg:152.83ms step:720/1480 train_time:108517ms step_avg:152.84ms step:721/1480 train_time:108679ms step_avg:152.85ms step:722/1480 train_time:108839ms step_avg:152.86ms step:723/1480 train_time:108999ms step_avg:152.87ms step:724/1480 train_time:109160ms step_avg:152.89ms step:725/1480 train_time:109324ms step_avg:152.90ms step:726/1480 train_time:109488ms 
step_avg:152.92ms step:727/1480 train_time:109651ms step_avg:152.93ms step:728/1480 train_time:109812ms step_avg:152.94ms step:729/1480 train_time:109973ms step_avg:152.95ms step:730/1480 train_time:110136ms step_avg:152.97ms step:731/1480 train_time:110296ms step_avg:152.98ms step:732/1480 train_time:110456ms step_avg:152.99ms step:733/1480 train_time:110619ms step_avg:153.00ms step:734/1480 train_time:110780ms step_avg:153.01ms step:735/1480 train_time:110941ms step_avg:153.02ms step:736/1480 train_time:111104ms step_avg:153.04ms step:737/1480 train_time:111264ms step_avg:153.05ms step:738/1480 train_time:111428ms step_avg:153.06ms step:739/1480 train_time:111589ms step_avg:153.07ms step:740/1480 train_time:111754ms step_avg:153.09ms step:741/1480 train_time:111916ms step_avg:153.10ms step:742/1480 train_time:112077ms step_avg:153.11ms step:743/1480 train_time:112237ms step_avg:153.12ms step:744/1480 train_time:112401ms step_avg:153.14ms step:745/1480 train_time:112566ms step_avg:153.15ms step:746/1480 train_time:112728ms step_avg:153.16ms step:747/1480 train_time:112889ms step_avg:153.17ms step:748/1480 train_time:113055ms step_avg:153.19ms step:749/1480 train_time:113217ms step_avg:153.20ms step:750/1480 train_time:113377ms step_avg:153.21ms step:750/1480 val_loss:3.5474 train_time:113441ms step_avg:153.30ms step:751/1480 train_time:113541ms step_avg:153.23ms step:752/1480 train_time:113703ms step_avg:153.24ms step:753/1480 train_time:113863ms step_avg:153.25ms step:754/1480 train_time:114023ms step_avg:153.26ms step:755/1480 train_time:114184ms step_avg:153.27ms step:756/1480 train_time:114345ms step_avg:153.28ms step:757/1480 train_time:114509ms step_avg:153.29ms step:758/1480 train_time:114669ms step_avg:153.30ms step:759/1480 train_time:114832ms step_avg:153.31ms step:760/1480 train_time:114995ms step_avg:153.33ms step:761/1480 train_time:115158ms step_avg:153.34ms step:762/1480 train_time:115319ms step_avg:153.35ms step:763/1480 train_time:115480ms 
step_avg:153.36ms step:764/1480 train_time:115642ms step_avg:153.37ms step:765/1480 train_time:115803ms step_avg:153.38ms step:766/1480 train_time:115966ms step_avg:153.39ms step:767/1480 train_time:116128ms step_avg:153.41ms step:768/1480 train_time:116290ms step_avg:153.42ms step:769/1480 train_time:116454ms step_avg:153.43ms step:770/1480 train_time:116616ms step_avg:153.44ms step:771/1480 train_time:116780ms step_avg:153.46ms step:772/1480 train_time:116942ms step_avg:153.47ms step:773/1480 train_time:117105ms step_avg:153.48ms step:774/1480 train_time:117266ms step_avg:153.49ms step:775/1480 train_time:117429ms step_avg:153.50ms step:776/1480 train_time:117595ms step_avg:153.52ms step:777/1480 train_time:117761ms step_avg:153.53ms step:778/1480 train_time:117923ms step_avg:153.55ms step:779/1480 train_time:118085ms step_avg:153.56ms step:780/1480 train_time:118248ms step_avg:153.57ms step:781/1480 train_time:118412ms step_avg:153.58ms step:782/1480 train_time:118576ms step_avg:153.60ms step:783/1480 train_time:118737ms step_avg:153.61ms step:784/1480 train_time:118900ms step_avg:153.62ms step:785/1480 train_time:119061ms step_avg:153.63ms step:786/1480 train_time:119225ms step_avg:153.64ms step:787/1480 train_time:119389ms step_avg:153.65ms step:788/1480 train_time:119553ms step_avg:153.67ms step:789/1480 train_time:119716ms step_avg:153.68ms step:790/1480 train_time:119882ms step_avg:153.69ms step:791/1480 train_time:120048ms step_avg:153.71ms step:792/1480 train_time:120212ms step_avg:153.72ms step:793/1480 train_time:120375ms step_avg:153.74ms step:794/1480 train_time:120539ms step_avg:153.75ms step:795/1480 train_time:120704ms step_avg:153.76ms step:796/1480 train_time:120869ms step_avg:153.78ms step:797/1480 train_time:121034ms step_avg:153.79ms step:798/1480 train_time:121198ms step_avg:153.80ms step:799/1480 train_time:121365ms step_avg:153.82ms step:800/1480 train_time:121529ms step_avg:153.83ms step:801/1480 train_time:121693ms step_avg:153.85ms 
step:802/1480 train_time:121860ms step_avg:153.86ms step:803/1480 train_time:122022ms step_avg:153.87ms step:804/1480 train_time:122185ms step_avg:153.89ms step:805/1480 train_time:122351ms step_avg:153.90ms step:806/1480 train_time:122512ms step_avg:153.91ms step:807/1480 train_time:122675ms step_avg:153.92ms step:808/1480 train_time:122839ms step_avg:153.93ms step:809/1480 train_time:123001ms step_avg:153.94ms step:810/1480 train_time:123163ms step_avg:153.95ms step:811/1480 train_time:123324ms step_avg:153.96ms step:812/1480 train_time:123487ms step_avg:153.97ms step:813/1480 train_time:123647ms step_avg:153.98ms step:814/1480 train_time:123811ms step_avg:153.99ms step:815/1480 train_time:123974ms step_avg:154.01ms step:816/1480 train_time:124138ms step_avg:154.02ms step:817/1480 train_time:124301ms step_avg:154.03ms step:818/1480 train_time:124462ms step_avg:154.04ms step:819/1480 train_time:124624ms step_avg:154.05ms step:820/1480 train_time:124790ms step_avg:154.06ms step:821/1480 train_time:124952ms step_avg:154.07ms step:822/1480 train_time:125115ms step_avg:154.08ms step:823/1480 train_time:125278ms step_avg:154.09ms step:824/1480 train_time:125439ms step_avg:154.10ms step:825/1480 train_time:125603ms step_avg:154.11ms step:826/1480 train_time:125771ms step_avg:154.13ms step:827/1480 train_time:125936ms step_avg:154.14ms step:828/1480 train_time:126099ms step_avg:154.16ms step:829/1480 train_time:126262ms step_avg:154.17ms step:830/1480 train_time:126426ms step_avg:154.18ms step:831/1480 train_time:126590ms step_avg:154.19ms step:832/1480 train_time:126754ms step_avg:154.20ms step:833/1480 train_time:126919ms step_avg:154.22ms step:834/1480 train_time:127085ms step_avg:154.23ms step:835/1480 train_time:127248ms step_avg:154.24ms step:836/1480 train_time:127413ms step_avg:154.25ms step:837/1480 train_time:127575ms step_avg:154.26ms step:838/1480 train_time:127739ms step_avg:154.27ms step:839/1480 train_time:127901ms step_avg:154.28ms step:840/1480 
train_time:128062ms step_avg:154.29ms step:841/1480 train_time:128223ms step_avg:154.30ms step:842/1480 train_time:128386ms step_avg:154.31ms step:843/1480 train_time:128547ms step_avg:154.32ms step:844/1480 train_time:128708ms step_avg:154.33ms step:845/1480 train_time:128873ms step_avg:154.34ms step:846/1480 train_time:129037ms step_avg:154.35ms step:847/1480 train_time:129201ms step_avg:154.36ms step:848/1480 train_time:129362ms step_avg:154.37ms step:849/1480 train_time:129525ms step_avg:154.38ms step:850/1480 train_time:129688ms step_avg:154.39ms step:851/1480 train_time:129852ms step_avg:154.40ms step:852/1480 train_time:130015ms step_avg:154.41ms step:853/1480 train_time:130178ms step_avg:154.42ms step:854/1480 train_time:130342ms step_avg:154.43ms step:855/1480 train_time:130506ms step_avg:154.44ms step:856/1480 train_time:130667ms step_avg:154.45ms step:857/1480 train_time:130833ms step_avg:154.47ms step:858/1480 train_time:130999ms step_avg:154.48ms step:859/1480 train_time:131163ms step_avg:154.49ms step:860/1480 train_time:131324ms step_avg:154.50ms step:861/1480 train_time:131490ms step_avg:154.51ms step:862/1480 train_time:131660ms step_avg:154.53ms step:863/1480 train_time:131827ms step_avg:154.54ms step:864/1480 train_time:131991ms step_avg:154.56ms step:865/1480 train_time:132153ms step_avg:154.56ms step:866/1480 train_time:132319ms step_avg:154.58ms step:867/1480 train_time:132483ms step_avg:154.59ms step:868/1480 train_time:132644ms step_avg:154.60ms step:869/1480 train_time:132807ms step_avg:154.61ms step:870/1480 train_time:132973ms step_avg:154.62ms step:871/1480 train_time:133136ms step_avg:154.63ms step:872/1480 train_time:133300ms step_avg:154.64ms step:873/1480 train_time:133462ms step_avg:154.65ms step:874/1480 train_time:133627ms step_avg:154.66ms step:875/1480 train_time:133794ms step_avg:154.68ms step:875/1480 val_loss:3.5029 train_time:133859ms step_avg:154.75ms step:876/1480 train_time:133959ms step_avg:154.69ms step:877/1480 
train_time:134128ms step_avg:154.70ms step:878/1480 train_time:134291ms step_avg:154.71ms step:879/1480 train_time:134454ms step_avg:154.72ms step:880/1480 train_time:134616ms step_avg:154.73ms step:881/1480 train_time:134778ms step_avg:154.74ms step:882/1480 train_time:134944ms step_avg:154.75ms step:883/1480 train_time:135111ms step_avg:154.77ms step:884/1480 train_time:135277ms step_avg:154.78ms step:885/1480 train_time:135442ms step_avg:154.79ms step:886/1480 train_time:135609ms step_avg:154.80ms step:887/1480 train_time:135777ms step_avg:154.82ms step:888/1480 train_time:135952ms step_avg:154.84ms step:889/1480 train_time:136120ms step_avg:154.86ms step:890/1480 train_time:136282ms step_avg:154.87ms step:891/1480 train_time:136449ms step_avg:154.88ms step:892/1480 train_time:136613ms step_avg:154.89ms step:893/1480 train_time:136775ms step_avg:154.90ms step:894/1480 train_time:136940ms step_avg:154.91ms step:895/1480 train_time:137106ms step_avg:154.92ms step:896/1480 train_time:137272ms step_avg:154.93ms step:897/1480 train_time:137439ms step_avg:154.95ms step:898/1480 train_time:137607ms step_avg:154.96ms step:899/1480 train_time:137772ms step_avg:154.97ms step:900/1480 train_time:137936ms step_avg:154.98ms step:901/1480 train_time:138100ms step_avg:154.99ms step:902/1480 train_time:138264ms step_avg:155.01ms step:903/1480 train_time:138438ms step_avg:155.03ms step:904/1480 train_time:138604ms step_avg:155.04ms step:905/1480 train_time:138767ms step_avg:155.05ms step:906/1480 train_time:138934ms step_avg:155.06ms step:907/1480 train_time:139103ms step_avg:155.08ms step:908/1480 train_time:139266ms step_avg:155.08ms step:909/1480 train_time:139432ms step_avg:155.10ms step:910/1480 train_time:139603ms step_avg:155.11ms step:911/1480 train_time:139769ms step_avg:155.13ms step:912/1480 train_time:139935ms step_avg:155.14ms step:913/1480 train_time:140103ms step_avg:155.15ms step:914/1480 train_time:140270ms step_avg:155.17ms step:915/1480 train_time:140440ms 
step_avg:155.18ms step:916/1480 train_time:140604ms step_avg:155.19ms step:917/1480 train_time:140767ms step_avg:155.20ms step:918/1480 train_time:140934ms step_avg:155.21ms step:919/1480 train_time:141104ms step_avg:155.23ms step:920/1480 train_time:141271ms step_avg:155.24ms step:921/1480 train_time:141435ms step_avg:155.25ms step:922/1480 train_time:141603ms step_avg:155.27ms step:923/1480 train_time:141765ms step_avg:155.27ms step:924/1480 train_time:141930ms step_avg:155.28ms step:925/1480 train_time:142095ms step_avg:155.30ms step:926/1480 train_time:142258ms step_avg:155.30ms step:927/1480 train_time:142424ms step_avg:155.32ms step:928/1480 train_time:142590ms step_avg:155.33ms step:929/1480 train_time:142754ms step_avg:155.34ms step:930/1480 train_time:142918ms step_avg:155.35ms step:931/1480 train_time:143081ms step_avg:155.35ms step:932/1480 train_time:143248ms step_avg:155.37ms step:933/1480 train_time:143414ms step_avg:155.38ms step:934/1480 train_time:143580ms step_avg:155.39ms step:935/1480 train_time:143751ms step_avg:155.41ms step:936/1480 train_time:143918ms step_avg:155.42ms step:937/1480 train_time:144087ms step_avg:155.43ms step:938/1480 train_time:144251ms step_avg:155.44ms step:939/1480 train_time:144420ms step_avg:155.46ms step:940/1480 train_time:144588ms step_avg:155.47ms step:941/1480 train_time:144751ms step_avg:155.48ms step:942/1480 train_time:144916ms step_avg:155.49ms step:943/1480 train_time:145087ms step_avg:155.51ms step:944/1480 train_time:145258ms step_avg:155.52ms step:945/1480 train_time:145423ms step_avg:155.53ms step:946/1480 train_time:145591ms step_avg:155.55ms step:947/1480 train_time:145758ms step_avg:155.56ms step:948/1480 train_time:145924ms step_avg:155.57ms step:949/1480 train_time:146090ms step_avg:155.58ms step:950/1480 train_time:146254ms step_avg:155.59ms step:951/1480 train_time:146423ms step_avg:155.60ms step:952/1480 train_time:146591ms step_avg:155.62ms step:953/1480 train_time:146759ms step_avg:155.63ms 
step:954/1480 train_time:146929ms step_avg:155.64ms step:955/1480 train_time:147092ms step_avg:155.65ms step:956/1480 train_time:147258ms step_avg:155.66ms step:957/1480 train_time:147426ms step_avg:155.68ms step:958/1480 train_time:147596ms step_avg:155.69ms step:959/1480 train_time:147763ms step_avg:155.70ms step:960/1480 train_time:147931ms step_avg:155.72ms step:961/1480 train_time:148095ms step_avg:155.73ms step:962/1480 train_time:148259ms step_avg:155.73ms step:963/1480 train_time:148425ms step_avg:155.75ms step:964/1480 train_time:148593ms step_avg:155.76ms step:965/1480 train_time:148757ms step_avg:155.77ms step:966/1480 train_time:148923ms step_avg:155.78ms step:967/1480 train_time:149087ms step_avg:155.79ms step:968/1480 train_time:149253ms step_avg:155.80ms step:969/1480 train_time:149421ms step_avg:155.81ms step:970/1480 train_time:149584ms step_avg:155.82ms step:971/1480 train_time:149749ms step_avg:155.83ms step:972/1480 train_time:149912ms step_avg:155.83ms step:973/1480 train_time:150075ms step_avg:155.84ms step:974/1480 train_time:150247ms step_avg:155.86ms step:975/1480 train_time:150413ms step_avg:155.87ms step:976/1480 train_time:150578ms step_avg:155.88ms step:977/1480 train_time:150742ms step_avg:155.89ms step:978/1480 train_time:150908ms step_avg:155.90ms step:979/1480 train_time:151074ms step_avg:155.91ms step:980/1480 train_time:151239ms step_avg:155.92ms step:981/1480 train_time:151408ms step_avg:155.93ms step:982/1480 train_time:151572ms step_avg:155.94ms step:983/1480 train_time:151736ms step_avg:155.95ms step:984/1480 train_time:151900ms step_avg:155.95ms step:985/1480 train_time:152067ms step_avg:155.97ms step:986/1480 train_time:152232ms step_avg:155.98ms step:987/1480 train_time:152395ms step_avg:155.98ms step:988/1480 train_time:152563ms step_avg:156.00ms step:989/1480 train_time:152731ms step_avg:156.01ms step:990/1480 train_time:152902ms step_avg:156.02ms step:991/1480 train_time:153070ms step_avg:156.03ms step:992/1480 
train_time:153245ms step_avg:156.05ms step:993/1480 train_time:153423ms step_avg:156.08ms step:994/1480 train_time:153589ms step_avg:156.09ms step:995/1480 train_time:153753ms step_avg:156.09ms step:996/1480 train_time:153915ms step_avg:156.10ms step:997/1480 train_time:154081ms step_avg:156.11ms step:998/1480 train_time:154246ms step_avg:156.12ms step:999/1480 train_time:154411ms step_avg:156.13ms step:1000/1480 train_time:154580ms step_avg:156.14ms step:1000/1480 val_loss:3.4404 train_time:154648ms step_avg:156.21ms step:1001/1480 train_time:154748ms step_avg:156.15ms step:1002/1480 train_time:154916ms step_avg:156.17ms step:1003/1480 train_time:155087ms step_avg:156.18ms step:1004/1480 train_time:155255ms step_avg:156.19ms step:1005/1480 train_time:155422ms step_avg:156.20ms step:1006/1480 train_time:155589ms step_avg:156.21ms step:1007/1480 train_time:155755ms step_avg:156.22ms step:1008/1480 train_time:155922ms step_avg:156.23ms step:1009/1480 train_time:156095ms step_avg:156.25ms step:1010/1480 train_time:156261ms step_avg:156.26ms step:1011/1480 train_time:156425ms step_avg:156.27ms step:1012/1480 train_time:156591ms step_avg:156.28ms step:1013/1480 train_time:156761ms step_avg:156.29ms step:1014/1480 train_time:156927ms step_avg:156.30ms step:1015/1480 train_time:157098ms step_avg:156.32ms step:1016/1480 train_time:157265ms step_avg:156.33ms step:1017/1480 train_time:157438ms step_avg:156.34ms step:1018/1480 train_time:157607ms step_avg:156.36ms step:1019/1480 train_time:157778ms step_avg:156.37ms step:1020/1480 train_time:157947ms step_avg:156.38ms step:1021/1480 train_time:158112ms step_avg:156.39ms step:1022/1480 train_time:158280ms step_avg:156.40ms step:1023/1480 train_time:158447ms step_avg:156.41ms step:1024/1480 train_time:158614ms step_avg:156.42ms step:1025/1480 train_time:158784ms step_avg:156.44ms step:1026/1480 train_time:158949ms step_avg:156.45ms step:1027/1480 train_time:159117ms step_avg:156.46ms step:1028/1480 train_time:159290ms 
step_avg:156.47ms step:1029/1480 train_time:159466ms step_avg:156.49ms step:1030/1480 train_time:159633ms step_avg:156.50ms step:1031/1480 train_time:159798ms step_avg:156.51ms step:1032/1480 train_time:159971ms step_avg:156.53ms step:1033/1480 train_time:160138ms step_avg:156.54ms step:1034/1480 train_time:160307ms step_avg:156.55ms step:1035/1480 train_time:160476ms step_avg:156.56ms step:1036/1480 train_time:160642ms step_avg:156.57ms step:1037/1480 train_time:160809ms step_avg:156.58ms step:1038/1480 train_time:160979ms step_avg:156.59ms step:1039/1480 train_time:161148ms step_avg:156.61ms step:1040/1480 train_time:161315ms step_avg:156.62ms step:1041/1480 train_time:161482ms step_avg:156.63ms step:1042/1480 train_time:161645ms step_avg:156.63ms step:1043/1480 train_time:161811ms step_avg:156.64ms step:1044/1480 train_time:161977ms step_avg:156.65ms step:1045/1480 train_time:162145ms step_avg:156.66ms step:1046/1480 train_time:162312ms step_avg:156.67ms step:1047/1480 train_time:162480ms step_avg:156.68ms step:1048/1480 train_time:162646ms step_avg:156.69ms step:1049/1480 train_time:162811ms step_avg:156.70ms step:1050/1480 train_time:162981ms step_avg:156.71ms step:1051/1480 train_time:163149ms step_avg:156.72ms step:1052/1480 train_time:163318ms step_avg:156.73ms step:1053/1480 train_time:163484ms step_avg:156.74ms step:1054/1480 train_time:163652ms step_avg:156.76ms step:1055/1480 train_time:163819ms step_avg:156.76ms step:1056/1480 train_time:163983ms step_avg:156.77ms step:1057/1480 train_time:164150ms step_avg:156.78ms step:1058/1480 train_time:164318ms step_avg:156.79ms step:1059/1480 train_time:164491ms step_avg:156.81ms step:1060/1480 train_time:164660ms step_avg:156.82ms step:1061/1480 train_time:164824ms step_avg:156.83ms step:1062/1480 train_time:164991ms step_avg:156.84ms step:1063/1480 train_time:165157ms step_avg:156.84ms step:1064/1480 train_time:165320ms step_avg:156.85ms step:1065/1480 train_time:165488ms step_avg:156.86ms step:1066/1480 
train_time:165657ms step_avg:156.87ms step:1067/1480 train_time:165824ms step_avg:156.88ms step:1068/1480 train_time:165989ms step_avg:156.89ms step:1069/1480 train_time:166161ms step_avg:156.90ms step:1070/1480 train_time:166326ms step_avg:156.91ms step:1071/1480 train_time:166499ms step_avg:156.93ms step:1072/1480 train_time:166665ms step_avg:156.94ms step:1073/1480 train_time:166829ms step_avg:156.94ms step:1074/1480 train_time:166996ms step_avg:156.95ms step:1075/1480 train_time:167168ms step_avg:156.97ms step:1076/1480 train_time:167334ms step_avg:156.97ms step:1077/1480 train_time:167501ms step_avg:156.98ms step:1078/1480 train_time:167675ms step_avg:157.00ms step:1079/1480 train_time:167847ms step_avg:157.01ms step:1080/1480 train_time:168018ms step_avg:157.03ms step:1081/1480 train_time:168184ms step_avg:157.03ms step:1082/1480 train_time:168349ms step_avg:157.04ms step:1083/1480 train_time:168515ms step_avg:157.05ms step:1084/1480 train_time:168682ms step_avg:157.06ms step:1085/1480 train_time:168852ms step_avg:157.07ms step:1086/1480 train_time:169019ms step_avg:157.08ms step:1087/1480 train_time:169185ms step_avg:157.09ms step:1088/1480 train_time:169354ms step_avg:157.10ms step:1089/1480 train_time:169525ms step_avg:157.11ms step:1090/1480 train_time:169698ms step_avg:157.13ms step:1091/1480 train_time:169867ms step_avg:157.14ms step:1092/1480 train_time:170034ms step_avg:157.15ms step:1093/1480 train_time:170202ms step_avg:157.16ms step:1094/1480 train_time:170367ms step_avg:157.17ms step:1095/1480 train_time:170532ms step_avg:157.17ms step:1096/1480 train_time:170700ms step_avg:157.18ms step:1097/1480 train_time:170868ms step_avg:157.19ms step:1098/1480 train_time:171039ms step_avg:157.21ms step:1099/1480 train_time:171211ms step_avg:157.22ms step:1100/1480 train_time:171383ms step_avg:157.23ms step:1101/1480 train_time:171553ms step_avg:157.24ms step:1102/1480 train_time:171724ms step_avg:157.26ms step:1103/1480 train_time:171900ms step_avg:157.27ms 
step:1104/1480 train_time:172067ms step_avg:157.28ms step:1105/1480 train_time:172237ms step_avg:157.29ms step:1106/1480 train_time:172406ms step_avg:157.30ms step:1107/1480 train_time:172576ms step_avg:157.32ms step:1108/1480 train_time:172741ms step_avg:157.32ms step:1109/1480 train_time:172907ms step_avg:157.33ms step:1110/1480 train_time:173073ms step_avg:157.34ms step:1111/1480 train_time:173240ms step_avg:157.35ms step:1112/1480 train_time:173408ms step_avg:157.36ms step:1113/1480 train_time:173589ms step_avg:157.38ms step:1114/1480 train_time:173763ms step_avg:157.39ms step:1115/1480 train_time:173935ms step_avg:157.41ms step:1116/1480 train_time:174101ms step_avg:157.42ms step:1117/1480 train_time:174274ms step_avg:157.43ms step:1118/1480 train_time:174448ms step_avg:157.44ms step:1119/1480 train_time:174615ms step_avg:157.45ms step:1120/1480 train_time:174783ms step_avg:157.46ms step:1121/1480 train_time:174953ms step_avg:157.47ms step:1122/1480 train_time:175119ms step_avg:157.48ms step:1123/1480 train_time:175286ms step_avg:157.49ms step:1124/1480 train_time:175455ms step_avg:157.50ms step:1125/1480 train_time:175622ms step_avg:157.51ms step:1125/1480 val_loss:3.3839 train_time:175689ms step_avg:157.57ms step:1126/1480 train_time:175789ms step_avg:157.52ms step:1127/1480 train_time:175961ms step_avg:157.53ms step:1128/1480 train_time:176133ms step_avg:157.54ms step:1129/1480 train_time:176305ms step_avg:157.56ms step:1130/1480 train_time:176474ms step_avg:157.57ms step:1131/1480 train_time:176652ms step_avg:157.58ms step:1132/1480 train_time:176818ms step_avg:157.59ms step:1133/1480 train_time:176991ms step_avg:157.61ms step:1134/1480 train_time:177162ms step_avg:157.62ms step:1135/1480 train_time:177330ms step_avg:157.63ms step:1136/1480 train_time:177501ms step_avg:157.64ms step:1137/1480 train_time:177668ms step_avg:157.65ms step:1138/1480 train_time:177840ms step_avg:157.66ms step:1139/1480 train_time:178007ms step_avg:157.67ms step:1140/1480 
train_time:178174ms step_avg:157.68ms step:1141/1480 train_time:178346ms step_avg:157.69ms step:1142/1480 train_time:178514ms step_avg:157.70ms step:1143/1480 train_time:178686ms step_avg:157.71ms step:1144/1480 train_time:178855ms step_avg:157.72ms step:1145/1480 train_time:179021ms step_avg:157.73ms step:1146/1480 train_time:179193ms step_avg:157.74ms step:1147/1480 train_time:179361ms step_avg:157.75ms step:1148/1480 train_time:179529ms step_avg:157.76ms step:1149/1480 train_time:179701ms step_avg:157.77ms step:1150/1480 train_time:179869ms step_avg:157.78ms step:1151/1480 train_time:180041ms step_avg:157.79ms step:1152/1480 train_time:180212ms step_avg:157.80ms step:1153/1480 train_time:180385ms step_avg:157.82ms step:1154/1480 train_time:180551ms step_avg:157.82ms step:1155/1480 train_time:180724ms step_avg:157.84ms step:1156/1480 train_time:180903ms step_avg:157.86ms step:1157/1480 train_time:181073ms step_avg:157.87ms step:1158/1480 train_time:181240ms step_avg:157.87ms step:1159/1480 train_time:181407ms step_avg:157.88ms step:1160/1480 train_time:181573ms step_avg:157.89ms step:1161/1480 train_time:181742ms step_avg:157.90ms step:1162/1480 train_time:181911ms step_avg:157.91ms step:1163/1480 train_time:182082ms step_avg:157.92ms step:1164/1480 train_time:182251ms step_avg:157.93ms step:1165/1480 train_time:182417ms step_avg:157.94ms step:1166/1480 train_time:182587ms step_avg:157.95ms step:1167/1480 train_time:182755ms step_avg:157.96ms step:1168/1480 train_time:182922ms step_avg:157.96ms step:1169/1480 train_time:183091ms step_avg:157.97ms step:1170/1480 train_time:183260ms step_avg:157.98ms step:1171/1480 train_time:183427ms step_avg:157.99ms step:1172/1480 train_time:183594ms step_avg:158.00ms step:1173/1480 train_time:183766ms step_avg:158.01ms step:1174/1480 train_time:183948ms step_avg:158.03ms step:1175/1480 train_time:184120ms step_avg:158.04ms step:1176/1480 train_time:184292ms step_avg:158.06ms step:1177/1480 train_time:184471ms step_avg:158.07ms 
step:1178/1480 train_time:184638ms step_avg:158.08ms step:1179/1480 train_time:184804ms step_avg:158.09ms step:1180/1480 train_time:184984ms step_avg:158.11ms step:1181/1480 train_time:185153ms step_avg:158.12ms step:1182/1480 train_time:185322ms step_avg:158.12ms step:1183/1480 train_time:185493ms step_avg:158.14ms step:1184/1480 train_time:185661ms step_avg:158.14ms step:1185/1480 train_time:185833ms step_avg:158.16ms step:1186/1480 train_time:186005ms step_avg:158.17ms step:1187/1480 train_time:186188ms step_avg:158.19ms step:1188/1480 train_time:186354ms step_avg:158.20ms step:1189/1480 train_time:186525ms step_avg:158.21ms step:1190/1480 train_time:186692ms step_avg:158.21ms step:1191/1480 train_time:186863ms step_avg:158.22ms step:1192/1480 train_time:187029ms step_avg:158.23ms step:1193/1480 train_time:187198ms step_avg:158.24ms step:1194/1480 train_time:187366ms step_avg:158.25ms step:1195/1480 train_time:187540ms step_avg:158.26ms step:1196/1480 train_time:187722ms step_avg:158.28ms step:1197/1480 train_time:187894ms step_avg:158.29ms step:1198/1480 train_time:188076ms step_avg:158.31ms step:1199/1480 train_time:188245ms step_avg:158.32ms step:1200/1480 train_time:188414ms step_avg:158.33ms step:1201/1480 train_time:188582ms step_avg:158.34ms step:1202/1480 train_time:188766ms step_avg:158.36ms step:1203/1480 train_time:188942ms step_avg:158.38ms step:1204/1480 train_time:189117ms step_avg:158.39ms step:1205/1480 train_time:189285ms step_avg:158.40ms step:1206/1480 train_time:189453ms step_avg:158.41ms step:1207/1480 train_time:189623ms step_avg:158.42ms step:1208/1480 train_time:189790ms step_avg:158.42ms step:1209/1480 train_time:189964ms step_avg:158.44ms step:1210/1480 train_time:190140ms step_avg:158.45ms step:1211/1480 train_time:190314ms step_avg:158.46ms step:1212/1480 train_time:190485ms step_avg:158.47ms step:1213/1480 train_time:190658ms step_avg:158.49ms step:1214/1480 train_time:190835ms step_avg:158.50ms step:1215/1480 train_time:191008ms 
step_avg:158.51ms step:1216/1480 train_time:191177ms step_avg:158.52ms step:1217/1480 train_time:191349ms step_avg:158.53ms step:1218/1480 train_time:191518ms step_avg:158.54ms step:1219/1480 train_time:191699ms step_avg:158.56ms step:1220/1480 train_time:191868ms step_avg:158.57ms step:1221/1480 train_time:192035ms step_avg:158.58ms step:1222/1480 train_time:192204ms step_avg:158.58ms step:1223/1480 train_time:192374ms step_avg:158.59ms step:1224/1480 train_time:192550ms step_avg:158.61ms step:1225/1480 train_time:192721ms step_avg:158.62ms step:1226/1480 train_time:192894ms step_avg:158.63ms step:1227/1480 train_time:193067ms step_avg:158.64ms step:1228/1480 train_time:193236ms step_avg:158.65ms step:1229/1480 train_time:193409ms step_avg:158.66ms step:1230/1480 train_time:193590ms step_avg:158.68ms step:1231/1480 train_time:193764ms step_avg:158.69ms step:1232/1480 train_time:193939ms step_avg:158.71ms step:1233/1480 train_time:194108ms step_avg:158.71ms step:1234/1480 train_time:194280ms step_avg:158.73ms step:1235/1480 train_time:194456ms step_avg:158.74ms step:1236/1480 train_time:194625ms step_avg:158.75ms step:1237/1480 train_time:194795ms step_avg:158.76ms step:1238/1480 train_time:194982ms step_avg:158.78ms step:1239/1480 train_time:195151ms step_avg:158.79ms step:1240/1480 train_time:195322ms step_avg:158.80ms step:1241/1480 train_time:195495ms step_avg:158.81ms step:1242/1480 train_time:195664ms step_avg:158.82ms step:1243/1480 train_time:195838ms step_avg:158.83ms step:1244/1480 train_time:196004ms step_avg:158.84ms step:1245/1480 train_time:196173ms step_avg:158.84ms step:1246/1480 train_time:196342ms step_avg:158.85ms step:1247/1480 train_time:196509ms step_avg:158.86ms step:1248/1480 train_time:196680ms step_avg:158.87ms step:1249/1480 train_time:196848ms step_avg:158.88ms step:1250/1480 train_time:197017ms step_avg:158.88ms step:1250/1480 val_loss:3.3339 train_time:197088ms step_avg:158.94ms step:1251/1480 train_time:197196ms step_avg:158.90ms 
step:1252/1480 train_time:197365ms step_avg:158.91ms step:1253/1480 train_time:197534ms step_avg:158.92ms step:1254/1480 train_time:197707ms step_avg:158.93ms step:1255/1480 train_time:197892ms step_avg:158.95ms step:1256/1480 train_time:198067ms step_avg:158.96ms step:1257/1480 train_time:198239ms step_avg:158.97ms step:1258/1480 train_time:198414ms step_avg:158.99ms step:1259/1480 train_time:198586ms step_avg:159.00ms step:1260/1480 train_time:198753ms step_avg:159.00ms step:1261/1480 train_time:198927ms step_avg:159.01ms step:1262/1480 train_time:199101ms step_avg:159.03ms step:1263/1480 train_time:199275ms step_avg:159.04ms step:1264/1480 train_time:199443ms step_avg:159.05ms step:1265/1480 train_time:199611ms step_avg:159.05ms step:1266/1480 train_time:199783ms step_avg:159.06ms step:1267/1480 train_time:199951ms step_avg:159.07ms step:1268/1480 train_time:200124ms step_avg:159.08ms step:1269/1480 train_time:200300ms step_avg:159.09ms step:1270/1480 train_time:200469ms step_avg:159.10ms step:1271/1480 train_time:200638ms step_avg:159.11ms step:1272/1480 train_time:200805ms step_avg:159.12ms step:1273/1480 train_time:200976ms step_avg:159.13ms step:1274/1480 train_time:201148ms step_avg:159.14ms step:1275/1480 train_time:201315ms step_avg:159.14ms step:1276/1480 train_time:201482ms step_avg:159.15ms step:1277/1480 train_time:201654ms step_avg:159.16ms step:1278/1480 train_time:201821ms step_avg:159.16ms step:1279/1480 train_time:201994ms step_avg:159.18ms step:1280/1480 train_time:202173ms step_avg:159.19ms step:1281/1480 train_time:202342ms step_avg:159.20ms step:1282/1480 train_time:202508ms step_avg:159.20ms step:1283/1480 train_time:202677ms step_avg:159.21ms step:1284/1480 train_time:202846ms step_avg:159.22ms step:1285/1480 train_time:203014ms step_avg:159.23ms step:1286/1480 train_time:203184ms step_avg:159.24ms step:1287/1480 train_time:203357ms step_avg:159.25ms step:1288/1480 train_time:203530ms step_avg:159.26ms step:1289/1480 train_time:203712ms 
step_avg:159.27ms step:1290/1480 train_time:203892ms step_avg:159.29ms step:1291/1480 train_time:204065ms step_avg:159.30ms step:1292/1480 train_time:204237ms step_avg:159.31ms step:1293/1480 train_time:204413ms step_avg:159.32ms step:1294/1480 train_time:204584ms step_avg:159.33ms step:1295/1480 train_time:204755ms step_avg:159.34ms step:1296/1480 train_time:204930ms step_avg:159.35ms step:1297/1480 train_time:205100ms step_avg:159.36ms step:1298/1480 train_time:205272ms step_avg:159.37ms step:1299/1480 train_time:205443ms step_avg:159.38ms step:1300/1480 train_time:205611ms step_avg:159.39ms step:1301/1480 train_time:205781ms step_avg:159.40ms step:1302/1480 train_time:205954ms step_avg:159.41ms step:1303/1480 train_time:206132ms step_avg:159.42ms step:1304/1480 train_time:206307ms step_avg:159.43ms step:1305/1480 train_time:206476ms step_avg:159.44ms step:1306/1480 train_time:206650ms step_avg:159.45ms step:1307/1480 train_time:206817ms step_avg:159.46ms step:1308/1480 train_time:206987ms step_avg:159.47ms step:1309/1480 train_time:207159ms step_avg:159.48ms step:1310/1480 train_time:207328ms step_avg:159.48ms step:1311/1480 train_time:207495ms step_avg:159.49ms step:1312/1480 train_time:207667ms step_avg:159.50ms step:1313/1480 train_time:207837ms step_avg:159.51ms step:1314/1480 train_time:208010ms step_avg:159.52ms step:1315/1480 train_time:208181ms step_avg:159.53ms step:1316/1480 train_time:208349ms step_avg:159.53ms step:1317/1480 train_time:208520ms step_avg:159.54ms step:1318/1480 train_time:208700ms step_avg:159.56ms step:1319/1480 train_time:208875ms step_avg:159.57ms step:1320/1480 train_time:209052ms step_avg:159.58ms step:1321/1480 train_time:209225ms step_avg:159.59ms step:1322/1480 train_time:209406ms step_avg:159.61ms step:1323/1480 train_time:209578ms step_avg:159.62ms step:1324/1480 train_time:209753ms step_avg:159.63ms step:1325/1480 train_time:209933ms step_avg:159.64ms step:1326/1480 train_time:210109ms step_avg:159.66ms step:1327/1480 
train_time:210279ms step_avg:159.67ms step:1328/1480 train_time:210449ms step_avg:159.67ms step:1329/1480 train_time:210645ms step_avg:159.70ms step:1330/1480 train_time:210826ms step_avg:159.72ms step:1331/1480 train_time:210996ms step_avg:159.72ms step:1332/1480 train_time:211170ms step_avg:159.74ms step:1333/1480 train_time:211346ms step_avg:159.75ms step:1334/1480 train_time:211517ms step_avg:159.76ms step:1335/1480 train_time:211687ms step_avg:159.76ms step:1336/1480 train_time:211870ms step_avg:159.78ms step:1337/1480 train_time:212048ms step_avg:159.79ms step:1338/1480 train_time:212220ms step_avg:159.80ms step:1339/1480 train_time:212393ms step_avg:159.81ms step:1340/1480 train_time:212565ms step_avg:159.82ms step:1341/1480 train_time:212733ms step_avg:159.83ms step:1342/1480 train_time:212907ms step_avg:159.84ms step:1343/1480 train_time:213077ms step_avg:159.85ms step:1344/1480 train_time:213250ms step_avg:159.86ms step:1345/1480 train_time:213428ms step_avg:159.87ms step:1346/1480 train_time:213596ms step_avg:159.88ms step:1347/1480 train_time:213766ms step_avg:159.88ms step:1348/1480 train_time:213935ms step_avg:159.89ms step:1349/1480 train_time:214106ms step_avg:159.90ms step:1350/1480 train_time:214282ms step_avg:159.91ms step:1351/1480 train_time:214453ms step_avg:159.92ms step:1352/1480 train_time:214624ms step_avg:159.93ms step:1353/1480 train_time:214801ms step_avg:159.94ms step:1354/1480 train_time:214972ms step_avg:159.95ms step:1355/1480 train_time:215138ms step_avg:159.95ms step:1356/1480 train_time:215310ms step_avg:159.96ms step:1357/1480 train_time:215485ms step_avg:159.97ms step:1358/1480 train_time:215655ms step_avg:159.98ms step:1359/1480 train_time:215826ms step_avg:159.99ms step:1360/1480 train_time:215999ms step_avg:160.00ms step:1361/1480 train_time:216175ms step_avg:160.01ms step:1362/1480 train_time:216349ms step_avg:160.02ms step:1363/1480 train_time:216530ms step_avg:160.04ms step:1364/1480 train_time:216700ms step_avg:160.04ms 
step:1365/1480 train_time:216866ms step_avg:160.05ms step:1366/1480 train_time:217037ms step_avg:160.06ms step:1367/1480 train_time:217210ms step_avg:160.07ms step:1368/1480 train_time:217384ms step_avg:160.08ms step:1369/1480 train_time:217565ms step_avg:160.09ms step:1370/1480 train_time:217743ms step_avg:160.11ms step:1371/1480 train_time:217914ms step_avg:160.11ms step:1372/1480 train_time:218092ms step_avg:160.13ms step:1373/1480 train_time:218262ms step_avg:160.13ms step:1374/1480 train_time:218437ms step_avg:160.14ms step:1375/1480 train_time:218608ms step_avg:160.15ms step:1375/1480 val_loss:3.2956 train_time:218675ms step_avg:160.20ms step:1376/1480 train_time:218785ms step_avg:160.16ms step:1377/1480 train_time:218957ms step_avg:160.17ms step:1378/1480 train_time:219125ms step_avg:160.18ms step:1379/1480 train_time:219300ms step_avg:160.19ms step:1380/1480 train_time:219474ms step_avg:160.20ms step:1381/1480 train_time:219655ms step_avg:160.21ms step:1382/1480 train_time:219826ms step_avg:160.22ms step:1383/1480 train_time:219999ms step_avg:160.23ms step:1384/1480 train_time:220174ms step_avg:160.24ms step:1385/1480 train_time:220341ms step_avg:160.25ms step:1386/1480 train_time:220510ms step_avg:160.25ms step:1387/1480 train_time:220682ms step_avg:160.26ms step:1388/1480 train_time:220851ms step_avg:160.27ms step:1389/1480 train_time:221024ms step_avg:160.28ms step:1390/1480 train_time:221191ms step_avg:160.28ms step:1391/1480 train_time:221362ms step_avg:160.29ms step:1392/1480 train_time:221536ms step_avg:160.30ms step:1393/1480 train_time:221705ms step_avg:160.31ms step:1394/1480 train_time:221876ms step_avg:160.32ms step:1395/1480 train_time:222045ms step_avg:160.32ms step:1396/1480 train_time:222213ms step_avg:160.33ms step:1397/1480 train_time:222381ms step_avg:160.33ms step:1398/1480 train_time:222548ms step_avg:160.34ms step:1399/1480 train_time:222717ms step_avg:160.34ms step:1400/1480 train_time:222894ms step_avg:160.36ms step:1401/1480 
train_time:223060ms step_avg:160.36ms step:1402/1480 train_time:223230ms step_avg:160.37ms step:1403/1480 train_time:223407ms step_avg:160.38ms step:1404/1480 train_time:223578ms step_avg:160.39ms step:1405/1480 train_time:223751ms step_avg:160.40ms step:1406/1480 train_time:223926ms step_avg:160.41ms step:1407/1480 train_time:224094ms step_avg:160.41ms step:1408/1480 train_time:224264ms step_avg:160.42ms step:1409/1480 train_time:224447ms step_avg:160.43ms step:1410/1480 train_time:224616ms step_avg:160.44ms step:1411/1480 train_time:224785ms step_avg:160.45ms step:1412/1480 train_time:224954ms step_avg:160.45ms step:1413/1480 train_time:225125ms step_avg:160.46ms step:1414/1480 train_time:225297ms step_avg:160.47ms step:1415/1480 train_time:225471ms step_avg:160.48ms step:1416/1480 train_time:225657ms step_avg:160.50ms step:1417/1480 train_time:225829ms step_avg:160.50ms step:1418/1480 train_time:226002ms step_avg:160.51ms step:1419/1480 train_time:226176ms step_avg:160.52ms step:1420/1480 train_time:226351ms step_avg:160.53ms step:1421/1480 train_time:226526ms step_avg:160.54ms step:1422/1480 train_time:226699ms step_avg:160.55ms step:1423/1480 train_time:226869ms step_avg:160.56ms step:1424/1480 train_time:227045ms step_avg:160.57ms step:1425/1480 train_time:227226ms step_avg:160.58ms step:1426/1480 train_time:227398ms step_avg:160.59ms step:1427/1480 train_time:227573ms step_avg:160.60ms step:1428/1480 train_time:227745ms step_avg:160.61ms step:1429/1480 train_time:227913ms step_avg:160.62ms step:1430/1480 train_time:228086ms step_avg:160.62ms step:1431/1480 train_time:228261ms step_avg:160.63ms step:1432/1480 train_time:228438ms step_avg:160.65ms step:1433/1480 train_time:228616ms step_avg:160.66ms step:1434/1480 train_time:228796ms step_avg:160.67ms step:1435/1480 train_time:228971ms step_avg:160.68ms step:1436/1480 train_time:229145ms step_avg:160.69ms step:1437/1480 train_time:229313ms step_avg:160.70ms step:1438/1480 train_time:229483ms step_avg:160.70ms 
step:1439/1480 train_time:229655ms step_avg:160.71ms step:1440/1480 train_time:229825ms step_avg:160.72ms step:1441/1480 train_time:229996ms step_avg:160.72ms step:1442/1480 train_time:230172ms step_avg:160.73ms step:1443/1480 train_time:230361ms step_avg:160.75ms step:1444/1480 train_time:230531ms step_avg:160.76ms step:1445/1480 train_time:230703ms step_avg:160.77ms step:1446/1480 train_time:230880ms step_avg:160.78ms step:1447/1480 train_time:231057ms step_avg:160.79ms step:1448/1480 train_time:231228ms step_avg:160.80ms step:1449/1480 train_time:231401ms step_avg:160.81ms step:1450/1480 train_time:231574ms step_avg:160.82ms step:1451/1480 train_time:231745ms step_avg:160.82ms step:1452/1480 train_time:231918ms step_avg:160.83ms step:1453/1480 train_time:232087ms step_avg:160.84ms step:1454/1480 train_time:232257ms step_avg:160.84ms step:1455/1480 train_time:232436ms step_avg:160.86ms step:1456/1480 train_time:232608ms step_avg:160.86ms step:1457/1480 train_time:232779ms step_avg:160.87ms step:1458/1480 train_time:232950ms step_avg:160.88ms step:1459/1480 train_time:233128ms step_avg:160.89ms step:1460/1480 train_time:233299ms step_avg:160.90ms step:1461/1480 train_time:233473ms step_avg:160.91ms step:1462/1480 train_time:233645ms step_avg:160.91ms step:1463/1480 train_time:233823ms step_avg:160.92ms step:1464/1480 train_time:233997ms step_avg:160.93ms step:1465/1480 train_time:234169ms step_avg:160.94ms step:1466/1480 train_time:234339ms step_avg:160.95ms step:1467/1480 train_time:234514ms step_avg:160.96ms step:1468/1480 train_time:234682ms step_avg:160.96ms step:1469/1480 train_time:234855ms step_avg:160.97ms step:1470/1480 train_time:235035ms step_avg:160.98ms step:1471/1480 train_time:235223ms step_avg:161.00ms step:1472/1480 train_time:235406ms step_avg:161.02ms step:1473/1480 train_time:235578ms step_avg:161.02ms step:1474/1480 train_time:235755ms step_avg:161.03ms step:1475/1480 train_time:235934ms step_avg:161.05ms step:1476/1480 train_time:236106ms 
step_avg:161.05ms step:1477/1480 train_time:236288ms step_avg:161.07ms step:1478/1480 train_time:236471ms step_avg:161.08ms step:1479/1480 train_time:236645ms step_avg:161.09ms step:1480/1480 train_time:236817ms step_avg:161.10ms step:1480/1480 val_loss:3.2766 train_time:236887ms step_avg:161.15ms