import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding.

    `forward` expects a 4D tensor with the sequence on dim 1 and the rotated features on
    dim 3 (the attention layer below passes (B, T, n_head, head_dim)). The cos/sin tables
    are cached and rebuilt only when the sequence length changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention via FlexAttention, with QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Arguments:
            x: Input of shape (B, T, dim); B must be 1.
            vi: Token value embedding, viewed to the shape of `v` and mixed into it.
            block_mask: FlexAttention block mask.

        Returns a tensor shaped like `x`.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: learned mix of `x` with `x0`, then pre-norm attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to [1, 0], i.e. the x0 term starts switched off
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """
    GPT model with a U-net layout: the first `n_layer // 2` blocks act as an encoder whose
    outputs are fed, through learned skip weights, into the remaining decoder blocks.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # NOTE(review): with an odd n_layer the decoder has one more layer than the encoder,
        # and forward() would pop from an empty skip list
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss for one flat token sequence.

        Arguments:
            idx: 1D tensor of input token ids (asserted); its length must be a multiple of 128
                because it is reshaped into blocks of BLOCK_SIZE.
            target: Target token ids, flattened before the loss.
            sliding_window: Attention window size in tokens (a tensor, so it can change
                between steps).

        Returns the scalar loss.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token id 50256, used as a per-token document id
        # (presumably the GPT-2 end-of-text token - confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        # first / last document id inside every block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal, same document, inside the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the mask above: a kv block is kept when it can contain
            # at least one token visible from the query block
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # the two blocks' document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window size rounded up to whole blocks
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the indices of the kept blocks first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value embedding per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256 * 4 byte header into a pinned tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over the token shards matching a glob pattern (relative to the current
    working directory). Each rank reads its own T + 1 token slice of every global batch.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full global batch
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU: int32 inputs and int64 targets shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    # The per-run directory above is intentionally not created, but the parent 'logs/'
    # directory must exist or opening the log file below raises FileNotFoundError.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append the string `s` to the log file and, unless `logonly` is set, also print it.
    Does nothing on processes other than the master process.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not implemented: exactly one sequence per process per step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch ahead of the loop
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 13:34:16 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 111W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 108W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 115W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 128W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23157ms step_avg:nanms step:2/1480 train_time:23243ms step_avg:nanms step:3/1480 train_time:23382ms step_avg:nanms step:4/1480 train_time:23523ms step_avg:nanms step:5/1480 train_time:23663ms step_avg:nanms step:6/1480 train_time:23805ms step_avg:nanms step:7/1480 train_time:23945ms step_avg:nanms step:8/1480 train_time:24089ms step_avg:nanms step:9/1480 train_time:24235ms step_avg:nanms step:10/1480 train_time:24379ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:285ms step_avg:nanms step:13/1480 train_time:426ms step_avg:142.11ms step:14/1480 train_time:567ms step_avg:141.68ms step:15/1480 train_time:708ms step_avg:141.61ms step:16/1480 train_time:853ms step_avg:142.09ms step:17/1480 train_time:997ms step_avg:142.39ms step:18/1480 train_time:1141ms step_avg:142.60ms step:19/1480 train_time:1283ms step_avg:142.53ms step:20/1480 train_time:1425ms step_avg:142.50ms step:21/1480 train_time:1566ms step_avg:142.39ms step:22/1480 train_time:1708ms step_avg:142.34ms step:23/1480 train_time:1852ms step_avg:142.43ms step:24/1480 train_time:1995ms step_avg:142.52ms step:25/1480 train_time:2140ms step_avg:142.65ms step:26/1480 train_time:2283ms step_avg:142.71ms step:27/1480 train_time:2426ms step_avg:142.68ms step:28/1480 train_time:2567ms step_avg:142.61ms step:29/1480 
train_time:2707ms step_avg:142.49ms step:30/1480 train_time:2849ms step_avg:142.47ms step:31/1480 train_time:2994ms step_avg:142.58ms step:32/1480 train_time:3140ms step_avg:142.71ms step:33/1480 train_time:3283ms step_avg:142.73ms step:34/1480 train_time:3426ms step_avg:142.76ms step:35/1480 train_time:3568ms step_avg:142.70ms step:36/1480 train_time:3709ms step_avg:142.64ms step:37/1480 train_time:3851ms step_avg:142.62ms step:38/1480 train_time:3994ms step_avg:142.63ms step:39/1480 train_time:4139ms step_avg:142.72ms step:40/1480 train_time:4283ms step_avg:142.76ms step:41/1480 train_time:4425ms step_avg:142.75ms step:42/1480 train_time:4569ms step_avg:142.77ms step:43/1480 train_time:4708ms step_avg:142.68ms step:44/1480 train_time:4850ms step_avg:142.66ms step:45/1480 train_time:4992ms step_avg:142.62ms step:46/1480 train_time:5136ms step_avg:142.66ms step:47/1480 train_time:5280ms step_avg:142.70ms step:48/1480 train_time:5423ms step_avg:142.72ms step:49/1480 train_time:5566ms step_avg:142.71ms step:50/1480 train_time:5707ms step_avg:142.66ms step:51/1480 train_time:5849ms step_avg:142.65ms step:52/1480 train_time:5991ms step_avg:142.64ms step:53/1480 train_time:6133ms step_avg:142.63ms step:54/1480 train_time:6280ms step_avg:142.72ms step:55/1480 train_time:6421ms step_avg:142.70ms step:56/1480 train_time:6565ms step_avg:142.72ms step:57/1480 train_time:6707ms step_avg:142.71ms step:58/1480 train_time:6849ms step_avg:142.69ms step:59/1480 train_time:6990ms step_avg:142.66ms step:60/1480 train_time:7132ms step_avg:142.65ms step:61/1480 train_time:7274ms step_avg:142.62ms step:62/1480 train_time:7418ms step_avg:142.65ms step:63/1480 train_time:7561ms step_avg:142.66ms step:64/1480 train_time:7704ms step_avg:142.66ms step:65/1480 train_time:7848ms step_avg:142.69ms step:66/1480 train_time:7989ms step_avg:142.67ms step:67/1480 train_time:8131ms step_avg:142.64ms step:68/1480 train_time:8272ms step_avg:142.62ms step:69/1480 train_time:8414ms step_avg:142.60ms 
step:70/1480 train_time:8558ms step_avg:142.63ms step:71/1480 train_time:8703ms step_avg:142.67ms step:72/1480 train_time:8845ms step_avg:142.66ms step:73/1480 train_time:8987ms step_avg:142.64ms step:74/1480 train_time:9128ms step_avg:142.62ms step:75/1480 train_time:9269ms step_avg:142.60ms step:76/1480 train_time:9412ms step_avg:142.61ms step:77/1480 train_time:9555ms step_avg:142.61ms step:78/1480 train_time:9699ms step_avg:142.63ms step:79/1480 train_time:9843ms step_avg:142.65ms step:80/1480 train_time:9985ms step_avg:142.64ms step:81/1480 train_time:10128ms step_avg:142.65ms step:82/1480 train_time:10269ms step_avg:142.63ms step:83/1480 train_time:10409ms step_avg:142.60ms step:84/1480 train_time:10554ms step_avg:142.62ms step:85/1480 train_time:10699ms step_avg:142.66ms step:86/1480 train_time:10844ms step_avg:142.69ms step:87/1480 train_time:10987ms step_avg:142.69ms step:88/1480 train_time:11129ms step_avg:142.68ms step:89/1480 train_time:11270ms step_avg:142.66ms step:90/1480 train_time:11410ms step_avg:142.62ms step:91/1480 train_time:11552ms step_avg:142.62ms step:92/1480 train_time:11696ms step_avg:142.64ms step:93/1480 train_time:11841ms step_avg:142.66ms step:94/1480 train_time:11984ms step_avg:142.66ms step:95/1480 train_time:12126ms step_avg:142.66ms step:96/1480 train_time:12267ms step_avg:142.64ms step:97/1480 train_time:12408ms step_avg:142.62ms step:98/1480 train_time:12551ms step_avg:142.62ms step:99/1480 train_time:12693ms step_avg:142.62ms step:100/1480 train_time:12839ms step_avg:142.65ms step:101/1480 train_time:12982ms step_avg:142.66ms step:102/1480 train_time:13124ms step_avg:142.66ms step:103/1480 train_time:13266ms step_avg:142.65ms step:104/1480 train_time:13407ms step_avg:142.62ms step:105/1480 train_time:13549ms step_avg:142.62ms step:106/1480 train_time:13691ms step_avg:142.62ms step:107/1480 train_time:13833ms step_avg:142.60ms step:108/1480 train_time:13977ms step_avg:142.62ms step:109/1480 train_time:14121ms step_avg:142.63ms 
step:110/1480 train_time:14264ms step_avg:142.64ms step:111/1480 train_time:14407ms step_avg:142.65ms step:112/1480 train_time:14554ms step_avg:142.69ms step:113/1480 train_time:14701ms step_avg:142.73ms step:114/1480 train_time:14848ms step_avg:142.77ms step:115/1480 train_time:14994ms step_avg:142.80ms step:116/1480 train_time:15142ms step_avg:142.85ms step:117/1480 train_time:15290ms step_avg:142.89ms step:118/1480 train_time:15438ms step_avg:142.94ms step:119/1480 train_time:15585ms step_avg:142.98ms step:120/1480 train_time:15732ms step_avg:143.01ms step:121/1480 train_time:15879ms step_avg:143.05ms step:122/1480 train_time:16027ms step_avg:143.10ms step:123/1480 train_time:16173ms step_avg:143.13ms step:124/1480 train_time:16322ms step_avg:143.18ms step:125/1480 train_time:16469ms step_avg:143.21ms step:125/1480 val_loss:4.4254 train_time:16526ms step_avg:143.71ms step:126/1480 train_time:16621ms step_avg:143.28ms step:127/1480 train_time:16769ms step_avg:143.32ms step:128/1480 train_time:16915ms step_avg:143.35ms step:129/1480 train_time:17061ms step_avg:143.37ms step:130/1480 train_time:17206ms step_avg:143.39ms step:131/1480 train_time:17353ms step_avg:143.42ms step:132/1480 train_time:17500ms step_avg:143.44ms step:133/1480 train_time:17650ms step_avg:143.50ms step:134/1480 train_time:17798ms step_avg:143.53ms step:135/1480 train_time:17944ms step_avg:143.55ms step:136/1480 train_time:18092ms step_avg:143.59ms step:137/1480 train_time:18238ms step_avg:143.61ms step:138/1480 train_time:18383ms step_avg:143.62ms step:139/1480 train_time:18530ms step_avg:143.65ms step:140/1480 train_time:18678ms step_avg:143.68ms step:141/1480 train_time:18825ms step_avg:143.70ms step:142/1480 train_time:18973ms step_avg:143.73ms step:143/1480 train_time:19120ms step_avg:143.76ms step:144/1480 train_time:19266ms step_avg:143.77ms step:145/1480 train_time:19413ms step_avg:143.80ms step:146/1480 train_time:19560ms step_avg:143.82ms step:147/1480 train_time:19706ms 
step_avg:143.84ms step:148/1480 train_time:19853ms step_avg:143.87ms step:149/1480 train_time:20000ms step_avg:143.88ms step:150/1480 train_time:20148ms step_avg:143.91ms step:151/1480 train_time:20297ms step_avg:143.95ms step:152/1480 train_time:20441ms step_avg:143.95ms step:153/1480 train_time:20589ms step_avg:143.98ms step:154/1480 train_time:20736ms step_avg:144.00ms step:155/1480 train_time:20882ms step_avg:144.02ms step:156/1480 train_time:21030ms step_avg:144.04ms step:157/1480 train_time:21177ms step_avg:144.06ms step:158/1480 train_time:21323ms step_avg:144.08ms step:159/1480 train_time:21472ms step_avg:144.11ms step:160/1480 train_time:21618ms step_avg:144.12ms step:161/1480 train_time:21763ms step_avg:144.12ms step:162/1480 train_time:21909ms step_avg:144.14ms step:163/1480 train_time:22057ms step_avg:144.17ms step:164/1480 train_time:22204ms step_avg:144.18ms step:165/1480 train_time:22352ms step_avg:144.21ms step:166/1480 train_time:22500ms step_avg:144.23ms step:167/1480 train_time:22646ms step_avg:144.24ms step:168/1480 train_time:22794ms step_avg:144.27ms step:169/1480 train_time:22940ms step_avg:144.28ms step:170/1480 train_time:23087ms step_avg:144.29ms step:171/1480 train_time:23235ms step_avg:144.31ms step:172/1480 train_time:23381ms step_avg:144.32ms step:173/1480 train_time:23528ms step_avg:144.35ms step:174/1480 train_time:23676ms step_avg:144.37ms step:175/1480 train_time:23822ms step_avg:144.38ms step:176/1480 train_time:23970ms step_avg:144.40ms step:177/1480 train_time:24118ms step_avg:144.42ms step:178/1480 train_time:24265ms step_avg:144.44ms step:179/1480 train_time:24413ms step_avg:144.46ms step:180/1480 train_time:24560ms step_avg:144.47ms step:181/1480 train_time:24707ms step_avg:144.48ms step:182/1480 train_time:24854ms step_avg:144.50ms step:183/1480 train_time:25000ms step_avg:144.51ms step:184/1480 train_time:25148ms step_avg:144.53ms step:185/1480 train_time:25296ms step_avg:144.55ms step:186/1480 train_time:25443ms 
step_avg:144.56ms step:187/1480 train_time:25591ms step_avg:144.58ms step:188/1480 train_time:25738ms step_avg:144.59ms step:189/1480 train_time:25885ms step_avg:144.61ms step:190/1480 train_time:26032ms step_avg:144.62ms step:191/1480 train_time:26179ms step_avg:144.64ms step:192/1480 train_time:26327ms step_avg:144.66ms step:193/1480 train_time:26475ms step_avg:144.67ms step:194/1480 train_time:26620ms step_avg:144.68ms step:195/1480 train_time:26767ms step_avg:144.69ms step:196/1480 train_time:26914ms step_avg:144.70ms step:197/1480 train_time:27060ms step_avg:144.70ms step:198/1480 train_time:27206ms step_avg:144.71ms step:199/1480 train_time:27354ms step_avg:144.73ms step:200/1480 train_time:27501ms step_avg:144.74ms step:201/1480 train_time:27646ms step_avg:144.74ms step:202/1480 train_time:27793ms step_avg:144.75ms step:203/1480 train_time:27939ms step_avg:144.76ms step:204/1480 train_time:28084ms step_avg:144.76ms step:205/1480 train_time:28231ms step_avg:144.78ms step:206/1480 train_time:28380ms step_avg:144.80ms step:207/1480 train_time:28527ms step_avg:144.81ms step:208/1480 train_time:28675ms step_avg:144.82ms step:209/1480 train_time:28821ms step_avg:144.83ms step:210/1480 train_time:28970ms step_avg:144.85ms step:211/1480 train_time:29117ms step_avg:144.86ms step:212/1480 train_time:29263ms step_avg:144.87ms step:213/1480 train_time:29410ms step_avg:144.88ms step:214/1480 train_time:29556ms step_avg:144.88ms step:215/1480 train_time:29702ms step_avg:144.89ms step:216/1480 train_time:29849ms step_avg:144.90ms step:217/1480 train_time:29996ms step_avg:144.91ms step:218/1480 train_time:30143ms step_avg:144.92ms step:219/1480 train_time:30291ms step_avg:144.93ms step:220/1480 train_time:30438ms step_avg:144.94ms step:221/1480 train_time:30585ms step_avg:144.95ms step:222/1480 train_time:30737ms step_avg:144.98ms step:223/1480 train_time:30887ms step_avg:145.01ms step:224/1480 train_time:31037ms step_avg:145.03ms step:225/1480 train_time:31188ms 
step_avg:145.06ms step:226/1480 train_time:31339ms step_avg:145.09ms step:227/1480 train_time:31490ms step_avg:145.11ms step:228/1480 train_time:31640ms step_avg:145.14ms step:229/1480 train_time:31791ms step_avg:145.16ms step:230/1480 train_time:31940ms step_avg:145.18ms step:231/1480 train_time:32091ms step_avg:145.21ms step:232/1480 train_time:32241ms step_avg:145.23ms step:233/1480 train_time:32390ms step_avg:145.25ms step:234/1480 train_time:32541ms step_avg:145.27ms step:235/1480 train_time:32692ms step_avg:145.30ms step:236/1480 train_time:32842ms step_avg:145.32ms step:237/1480 train_time:32994ms step_avg:145.35ms step:238/1480 train_time:33142ms step_avg:145.36ms step:239/1480 train_time:33293ms step_avg:145.38ms step:240/1480 train_time:33443ms step_avg:145.41ms step:241/1480 train_time:33594ms step_avg:145.43ms step:242/1480 train_time:33745ms step_avg:145.45ms step:243/1480 train_time:33896ms step_avg:145.48ms step:244/1480 train_time:34045ms step_avg:145.49ms step:245/1480 train_time:34195ms step_avg:145.51ms step:246/1480 train_time:34346ms step_avg:145.53ms step:247/1480 train_time:34496ms step_avg:145.55ms step:248/1480 train_time:34646ms step_avg:145.57ms step:249/1480 train_time:34797ms step_avg:145.60ms step:250/1480 train_time:34948ms step_avg:145.62ms step:250/1480 val_loss:3.9947 train_time:35008ms step_avg:145.86ms step:251/1480 train_time:35105ms step_avg:145.67ms step:252/1480 train_time:35256ms step_avg:145.69ms step:253/1480 train_time:35407ms step_avg:145.71ms step:254/1480 train_time:35556ms step_avg:145.72ms step:255/1480 train_time:35706ms step_avg:145.74ms step:256/1480 train_time:35855ms step_avg:145.75ms step:257/1480 train_time:36005ms step_avg:145.77ms step:258/1480 train_time:36157ms step_avg:145.80ms step:259/1480 train_time:36309ms step_avg:145.82ms step:260/1480 train_time:36459ms step_avg:145.84ms step:261/1480 train_time:36608ms step_avg:145.85ms step:262/1480 train_time:36757ms step_avg:145.86ms step:263/1480 
train_time:36907ms step_avg:145.88ms step:264/1480 train_time:37056ms step_avg:145.89ms step:265/1480 train_time:37208ms step_avg:145.91ms step:266/1480 train_time:37359ms step_avg:145.93ms step:267/1480 train_time:37512ms step_avg:145.96ms step:268/1480 train_time:37659ms step_avg:145.97ms step:269/1480 train_time:37809ms step_avg:145.98ms step:270/1480 train_time:37959ms step_avg:145.99ms step:271/1480 train_time:38108ms step_avg:146.01ms step:272/1480 train_time:38260ms step_avg:146.03ms step:273/1480 train_time:38410ms step_avg:146.05ms step:274/1480 train_time:38561ms step_avg:146.06ms step:275/1480 train_time:38712ms step_avg:146.08ms step:276/1480 train_time:38862ms step_avg:146.10ms step:277/1480 train_time:39012ms step_avg:146.11ms step:278/1480 train_time:39164ms step_avg:146.13ms step:279/1480 train_time:39313ms step_avg:146.15ms step:280/1480 train_time:39466ms step_avg:146.17ms step:281/1480 train_time:39616ms step_avg:146.18ms step:282/1480 train_time:39767ms step_avg:146.20ms step:283/1480 train_time:39917ms step_avg:146.22ms step:284/1480 train_time:40068ms step_avg:146.23ms step:285/1480 train_time:40222ms step_avg:146.26ms step:286/1480 train_time:40369ms step_avg:146.27ms step:287/1480 train_time:40520ms step_avg:146.28ms step:288/1480 train_time:40671ms step_avg:146.30ms step:289/1480 train_time:40821ms step_avg:146.31ms step:290/1480 train_time:40971ms step_avg:146.33ms step:291/1480 train_time:41123ms step_avg:146.35ms step:292/1480 train_time:41272ms step_avg:146.35ms step:293/1480 train_time:41421ms step_avg:146.36ms step:294/1480 train_time:41573ms step_avg:146.38ms step:295/1480 train_time:41723ms step_avg:146.40ms step:296/1480 train_time:41875ms step_avg:146.42ms step:297/1480 train_time:42026ms step_avg:146.43ms step:298/1480 train_time:42176ms step_avg:146.45ms step:299/1480 train_time:42327ms step_avg:146.46ms step:300/1480 train_time:42478ms step_avg:146.48ms step:301/1480 train_time:42627ms step_avg:146.49ms step:302/1480 
train_time:42777ms step_avg:146.50ms step:303/1480 train_time:42928ms step_avg:146.51ms step:304/1480 train_time:43080ms step_avg:146.53ms step:305/1480 train_time:43230ms step_avg:146.54ms step:306/1480 train_time:43381ms step_avg:146.56ms step:307/1480 train_time:43531ms step_avg:146.57ms step:308/1480 train_time:43683ms step_avg:146.59ms step:309/1480 train_time:43833ms step_avg:146.60ms step:310/1480 train_time:43982ms step_avg:146.61ms step:311/1480 train_time:44132ms step_avg:146.62ms step:312/1480 train_time:44284ms step_avg:146.63ms step:313/1480 train_time:44434ms step_avg:146.65ms step:314/1480 train_time:44584ms step_avg:146.66ms step:315/1480 train_time:44734ms step_avg:146.67ms step:316/1480 train_time:44885ms step_avg:146.68ms step:317/1480 train_time:45036ms step_avg:146.70ms step:318/1480 train_time:45187ms step_avg:146.71ms step:319/1480 train_time:45337ms step_avg:146.72ms step:320/1480 train_time:45488ms step_avg:146.74ms step:321/1480 train_time:45638ms step_avg:146.75ms step:322/1480 train_time:45789ms step_avg:146.76ms step:323/1480 train_time:45939ms step_avg:146.77ms step:324/1480 train_time:46089ms step_avg:146.78ms step:325/1480 train_time:46240ms step_avg:146.79ms step:326/1480 train_time:46390ms step_avg:146.81ms step:327/1480 train_time:46540ms step_avg:146.81ms step:328/1480 train_time:46691ms step_avg:146.83ms step:329/1480 train_time:46842ms step_avg:146.84ms step:330/1480 train_time:46994ms step_avg:146.86ms step:331/1480 train_time:47146ms step_avg:146.87ms step:332/1480 train_time:47303ms step_avg:146.90ms step:333/1480 train_time:47457ms step_avg:146.93ms step:334/1480 train_time:47610ms step_avg:146.95ms step:335/1480 train_time:47765ms step_avg:146.97ms step:336/1480 train_time:47918ms step_avg:146.99ms step:337/1480 train_time:48072ms step_avg:147.01ms step:338/1480 train_time:48225ms step_avg:147.03ms step:339/1480 train_time:48379ms step_avg:147.05ms step:340/1480 train_time:48532ms step_avg:147.07ms step:341/1480 
train_time:48685ms step_avg:147.09ms step:342/1480 train_time:48839ms step_avg:147.11ms step:343/1480 train_time:48993ms step_avg:147.13ms step:344/1480 train_time:49147ms step_avg:147.15ms step:345/1480 train_time:49303ms step_avg:147.17ms step:346/1480 train_time:49456ms step_avg:147.19ms step:347/1480 train_time:49610ms step_avg:147.21ms step:348/1480 train_time:49764ms step_avg:147.23ms step:349/1480 train_time:49918ms step_avg:147.25ms step:350/1480 train_time:50072ms step_avg:147.27ms step:351/1480 train_time:50225ms step_avg:147.29ms step:352/1480 train_time:50379ms step_avg:147.31ms step:353/1480 train_time:50533ms step_avg:147.33ms step:354/1480 train_time:50686ms step_avg:147.34ms step:355/1480 train_time:50840ms step_avg:147.36ms step:356/1480 train_time:50994ms step_avg:147.38ms step:357/1480 train_time:51147ms step_avg:147.40ms step:358/1480 train_time:51302ms step_avg:147.42ms step:359/1480 train_time:51455ms step_avg:147.44ms step:360/1480 train_time:51609ms step_avg:147.46ms step:361/1480 train_time:51764ms step_avg:147.48ms step:362/1480 train_time:51918ms step_avg:147.50ms step:363/1480 train_time:52071ms step_avg:147.51ms step:364/1480 train_time:52225ms step_avg:147.53ms step:365/1480 train_time:52380ms step_avg:147.55ms step:366/1480 train_time:52534ms step_avg:147.57ms step:367/1480 train_time:52687ms step_avg:147.58ms step:368/1480 train_time:52840ms step_avg:147.60ms step:369/1480 train_time:52993ms step_avg:147.61ms step:370/1480 train_time:53146ms step_avg:147.63ms step:371/1480 train_time:53303ms step_avg:147.65ms step:372/1480 train_time:53458ms step_avg:147.67ms step:373/1480 train_time:53611ms step_avg:147.69ms step:374/1480 train_time:53764ms step_avg:147.70ms step:375/1480 train_time:53918ms step_avg:147.72ms step:375/1480 val_loss:3.8012 train_time:53977ms step_avg:147.88ms step:376/1480 train_time:54074ms step_avg:147.74ms step:377/1480 train_time:54228ms step_avg:147.76ms step:378/1480 train_time:54381ms step_avg:147.77ms 
step:379/1480 train_time:54534ms step_avg:147.79ms step:380/1480 train_time:54686ms step_avg:147.80ms step:381/1480 train_time:54839ms step_avg:147.81ms step:382/1480 train_time:54993ms step_avg:147.83ms step:383/1480 train_time:55147ms step_avg:147.85ms step:384/1480 train_time:55302ms step_avg:147.87ms step:385/1480 train_time:55455ms step_avg:147.88ms step:386/1480 train_time:55611ms step_avg:147.90ms step:387/1480 train_time:55762ms step_avg:147.91ms step:388/1480 train_time:55915ms step_avg:147.92ms step:389/1480 train_time:56069ms step_avg:147.94ms step:390/1480 train_time:56223ms step_avg:147.95ms step:391/1480 train_time:56378ms step_avg:147.97ms step:392/1480 train_time:56529ms step_avg:147.98ms step:393/1480 train_time:56683ms step_avg:148.00ms step:394/1480 train_time:56837ms step_avg:148.01ms step:395/1480 train_time:56991ms step_avg:148.03ms step:396/1480 train_time:57145ms step_avg:148.04ms step:397/1480 train_time:57300ms step_avg:148.06ms step:398/1480 train_time:57455ms step_avg:148.08ms step:399/1480 train_time:57607ms step_avg:148.09ms step:400/1480 train_time:57762ms step_avg:148.11ms step:401/1480 train_time:57916ms step_avg:148.12ms step:402/1480 train_time:58069ms step_avg:148.14ms step:403/1480 train_time:58224ms step_avg:148.15ms step:404/1480 train_time:58379ms step_avg:148.17ms step:405/1480 train_time:58533ms step_avg:148.19ms step:406/1480 train_time:58687ms step_avg:148.20ms step:407/1480 train_time:58840ms step_avg:148.21ms step:408/1480 train_time:58994ms step_avg:148.23ms step:409/1480 train_time:59147ms step_avg:148.24ms step:410/1480 train_time:59303ms step_avg:148.26ms step:411/1480 train_time:59457ms step_avg:148.27ms step:412/1480 train_time:59611ms step_avg:148.29ms step:413/1480 train_time:59765ms step_avg:148.30ms step:414/1480 train_time:59920ms step_avg:148.32ms step:415/1480 train_time:60074ms step_avg:148.33ms step:416/1480 train_time:60228ms step_avg:148.34ms step:417/1480 train_time:60381ms step_avg:148.36ms 
step:418/1480 train_time:60534ms step_avg:148.37ms step:419/1480 train_time:60687ms step_avg:148.38ms step:420/1480 train_time:60841ms step_avg:148.39ms step:421/1480 train_time:60994ms step_avg:148.40ms step:422/1480 train_time:61147ms step_avg:148.42ms step:423/1480 train_time:61302ms step_avg:148.43ms step:424/1480 train_time:61457ms step_avg:148.45ms step:425/1480 train_time:61609ms step_avg:148.46ms step:426/1480 train_time:61764ms step_avg:148.47ms step:427/1480 train_time:61919ms step_avg:148.49ms step:428/1480 train_time:62072ms step_avg:148.50ms step:429/1480 train_time:62225ms step_avg:148.51ms step:430/1480 train_time:62379ms step_avg:148.52ms step:431/1480 train_time:62533ms step_avg:148.53ms step:432/1480 train_time:62686ms step_avg:148.54ms step:433/1480 train_time:62839ms step_avg:148.56ms step:434/1480 train_time:62994ms step_avg:148.57ms step:435/1480 train_time:63147ms step_avg:148.58ms step:436/1480 train_time:63302ms step_avg:148.60ms step:437/1480 train_time:63457ms step_avg:148.61ms step:438/1480 train_time:63610ms step_avg:148.62ms step:439/1480 train_time:63764ms step_avg:148.63ms step:440/1480 train_time:63919ms step_avg:148.65ms step:441/1480 train_time:64075ms step_avg:148.67ms step:442/1480 train_time:64231ms step_avg:148.68ms step:443/1480 train_time:64387ms step_avg:148.70ms step:444/1480 train_time:64542ms step_avg:148.71ms step:445/1480 train_time:64698ms step_avg:148.73ms step:446/1480 train_time:64853ms step_avg:148.75ms step:447/1480 train_time:65008ms step_avg:148.76ms step:448/1480 train_time:65165ms step_avg:148.78ms step:449/1480 train_time:65324ms step_avg:148.80ms step:450/1480 train_time:65482ms step_avg:148.82ms step:451/1480 train_time:65641ms step_avg:148.85ms step:452/1480 train_time:65798ms step_avg:148.86ms step:453/1480 train_time:65955ms step_avg:148.88ms step:454/1480 train_time:66110ms step_avg:148.90ms step:455/1480 train_time:66266ms step_avg:148.91ms step:456/1480 train_time:66423ms step_avg:148.93ms 
step:457/1480 train_time:66581ms step_avg:148.95ms step:458/1480 train_time:66736ms step_avg:148.97ms step:459/1480 train_time:66896ms step_avg:148.99ms step:460/1480 train_time:67052ms step_avg:149.00ms step:461/1480 train_time:67209ms step_avg:149.02ms step:462/1480 train_time:67365ms step_avg:149.04ms step:463/1480 train_time:67523ms step_avg:149.06ms step:464/1480 train_time:67681ms step_avg:149.08ms step:465/1480 train_time:67838ms step_avg:149.09ms step:466/1480 train_time:67997ms step_avg:149.12ms step:467/1480 train_time:68155ms step_avg:149.13ms step:468/1480 train_time:68310ms step_avg:149.15ms step:469/1480 train_time:68466ms step_avg:149.16ms step:470/1480 train_time:68623ms step_avg:149.18ms step:471/1480 train_time:68781ms step_avg:149.20ms step:472/1480 train_time:68938ms step_avg:149.22ms step:473/1480 train_time:69095ms step_avg:149.23ms step:474/1480 train_time:69251ms step_avg:149.25ms step:475/1480 train_time:69407ms step_avg:149.26ms step:476/1480 train_time:69563ms step_avg:149.28ms step:477/1480 train_time:69721ms step_avg:149.30ms step:478/1480 train_time:69879ms step_avg:149.31ms step:479/1480 train_time:70036ms step_avg:149.33ms step:480/1480 train_time:70193ms step_avg:149.35ms step:481/1480 train_time:70348ms step_avg:149.36ms step:482/1480 train_time:70504ms step_avg:149.37ms step:483/1480 train_time:70660ms step_avg:149.39ms step:484/1480 train_time:70816ms step_avg:149.40ms step:485/1480 train_time:70972ms step_avg:149.41ms step:486/1480 train_time:71129ms step_avg:149.43ms step:487/1480 train_time:71287ms step_avg:149.45ms step:488/1480 train_time:71444ms step_avg:149.46ms step:489/1480 train_time:71601ms step_avg:149.48ms step:490/1480 train_time:71757ms step_avg:149.49ms step:491/1480 train_time:71913ms step_avg:149.51ms step:492/1480 train_time:72069ms step_avg:149.52ms step:493/1480 train_time:72226ms step_avg:149.54ms step:494/1480 train_time:72384ms step_avg:149.55ms step:495/1480 train_time:72543ms step_avg:149.57ms 
step:496/1480 train_time:72702ms step_avg:149.59ms step:497/1480 train_time:72860ms step_avg:149.61ms step:498/1480 train_time:73019ms step_avg:149.63ms step:499/1480 train_time:73178ms step_avg:149.65ms step:500/1480 train_time:73334ms step_avg:149.66ms step:500/1480 val_loss:3.6827 train_time:73397ms step_avg:149.79ms step:501/1480 train_time:73495ms step_avg:149.68ms step:502/1480 train_time:73652ms step_avg:149.70ms step:503/1480 train_time:73807ms step_avg:149.71ms step:504/1480 train_time:73964ms step_avg:149.72ms step:505/1480 train_time:74120ms step_avg:149.74ms step:506/1480 train_time:74277ms step_avg:149.75ms step:507/1480 train_time:74432ms step_avg:149.76ms step:508/1480 train_time:74590ms step_avg:149.78ms step:509/1480 train_time:74747ms step_avg:149.79ms step:510/1480 train_time:74904ms step_avg:149.81ms step:511/1480 train_time:75062ms step_avg:149.82ms step:512/1480 train_time:75219ms step_avg:149.84ms step:513/1480 train_time:75376ms step_avg:149.85ms step:514/1480 train_time:75533ms step_avg:149.87ms step:515/1480 train_time:75689ms step_avg:149.88ms step:516/1480 train_time:75846ms step_avg:149.89ms step:517/1480 train_time:76003ms step_avg:149.91ms step:518/1480 train_time:76162ms step_avg:149.93ms step:519/1480 train_time:76319ms step_avg:149.94ms step:520/1480 train_time:76479ms step_avg:149.96ms step:521/1480 train_time:76637ms step_avg:149.97ms step:522/1480 train_time:76793ms step_avg:149.99ms step:523/1480 train_time:76949ms step_avg:150.00ms step:524/1480 train_time:77105ms step_avg:150.01ms step:525/1480 train_time:77263ms step_avg:150.02ms step:526/1480 train_time:77419ms step_avg:150.04ms step:527/1480 train_time:77576ms step_avg:150.05ms step:528/1480 train_time:77732ms step_avg:150.06ms step:529/1480 train_time:77889ms step_avg:150.07ms step:530/1480 train_time:78046ms step_avg:150.09ms step:531/1480 train_time:78202ms step_avg:150.10ms step:532/1480 train_time:78359ms step_avg:150.11ms step:533/1480 train_time:78515ms 
step_avg:150.12ms step:534/1480 train_time:78671ms step_avg:150.14ms step:535/1480 train_time:78827ms step_avg:150.15ms step:536/1480 train_time:78986ms step_avg:150.16ms step:537/1480 train_time:79145ms step_avg:150.18ms step:538/1480 train_time:79302ms step_avg:150.19ms step:539/1480 train_time:79460ms step_avg:150.21ms step:540/1480 train_time:79618ms step_avg:150.22ms step:541/1480 train_time:79774ms step_avg:150.23ms step:542/1480 train_time:79929ms step_avg:150.24ms step:543/1480 train_time:80087ms step_avg:150.26ms step:544/1480 train_time:80243ms step_avg:150.27ms step:545/1480 train_time:80401ms step_avg:150.28ms step:546/1480 train_time:80557ms step_avg:150.29ms step:547/1480 train_time:80713ms step_avg:150.30ms step:548/1480 train_time:80871ms step_avg:150.32ms step:549/1480 train_time:81027ms step_avg:150.33ms step:550/1480 train_time:81187ms step_avg:150.35ms step:551/1480 train_time:81345ms step_avg:150.36ms step:552/1480 train_time:81505ms step_avg:150.38ms step:553/1480 train_time:81666ms step_avg:150.40ms step:554/1480 train_time:81826ms step_avg:150.42ms step:555/1480 train_time:81987ms step_avg:150.44ms step:556/1480 train_time:82146ms step_avg:150.45ms step:557/1480 train_time:82306ms step_avg:150.47ms step:558/1480 train_time:82465ms step_avg:150.48ms step:559/1480 train_time:82624ms step_avg:150.50ms step:560/1480 train_time:82785ms step_avg:150.52ms step:561/1480 train_time:82944ms step_avg:150.53ms step:562/1480 train_time:83104ms step_avg:150.55ms step:563/1480 train_time:83263ms step_avg:150.57ms step:564/1480 train_time:83423ms step_avg:150.58ms step:565/1480 train_time:83584ms step_avg:150.60ms step:566/1480 train_time:83744ms step_avg:150.62ms step:567/1480 train_time:83904ms step_avg:150.64ms step:568/1480 train_time:84064ms step_avg:150.65ms step:569/1480 train_time:84223ms step_avg:150.67ms step:570/1480 train_time:84384ms step_avg:150.68ms step:571/1480 train_time:84544ms step_avg:150.70ms step:572/1480 train_time:84703ms 
step_avg:150.72ms step:573/1480 train_time:84864ms step_avg:150.74ms step:574/1480 train_time:85026ms step_avg:150.76ms step:575/1480 train_time:85187ms step_avg:150.77ms step:576/1480 train_time:85346ms step_avg:150.79ms step:577/1480 train_time:85506ms step_avg:150.80ms step:578/1480 train_time:85665ms step_avg:150.82ms step:579/1480 train_time:85824ms step_avg:150.83ms step:580/1480 train_time:85985ms step_avg:150.85ms step:581/1480 train_time:86145ms step_avg:150.87ms step:582/1480 train_time:86306ms step_avg:150.88ms step:583/1480 train_time:86466ms step_avg:150.90ms step:584/1480 train_time:86625ms step_avg:150.91ms step:585/1480 train_time:86784ms step_avg:150.93ms step:586/1480 train_time:86944ms step_avg:150.94ms step:587/1480 train_time:87104ms step_avg:150.96ms step:588/1480 train_time:87263ms step_avg:150.97ms step:589/1480 train_time:87423ms step_avg:150.99ms step:590/1480 train_time:87584ms step_avg:151.01ms step:591/1480 train_time:87743ms step_avg:151.02ms step:592/1480 train_time:87903ms step_avg:151.04ms step:593/1480 train_time:88064ms step_avg:151.05ms step:594/1480 train_time:88224ms step_avg:151.07ms step:595/1480 train_time:88387ms step_avg:151.09ms step:596/1480 train_time:88548ms step_avg:151.11ms step:597/1480 train_time:88707ms step_avg:151.12ms step:598/1480 train_time:88865ms step_avg:151.13ms step:599/1480 train_time:89023ms step_avg:151.14ms step:600/1480 train_time:89184ms step_avg:151.16ms step:601/1480 train_time:89343ms step_avg:151.17ms step:602/1480 train_time:89503ms step_avg:151.19ms step:603/1480 train_time:89665ms step_avg:151.21ms step:604/1480 train_time:89825ms step_avg:151.22ms step:605/1480 train_time:89986ms step_avg:151.24ms step:606/1480 train_time:90148ms step_avg:151.26ms step:607/1480 train_time:90309ms step_avg:151.27ms step:608/1480 train_time:90469ms step_avg:151.29ms step:609/1480 train_time:90627ms step_avg:151.30ms step:610/1480 train_time:90787ms step_avg:151.31ms step:611/1480 train_time:90946ms 
step_avg:151.32ms step:612/1480 train_time:91107ms step_avg:151.34ms step:613/1480 train_time:91268ms step_avg:151.36ms step:614/1480 train_time:91427ms step_avg:151.37ms step:615/1480 train_time:91586ms step_avg:151.38ms step:616/1480 train_time:91745ms step_avg:151.39ms step:617/1480 train_time:91906ms step_avg:151.41ms step:618/1480 train_time:92065ms step_avg:151.42ms step:619/1480 train_time:92224ms step_avg:151.44ms step:620/1480 train_time:92385ms step_avg:151.45ms step:621/1480 train_time:92545ms step_avg:151.47ms step:622/1480 train_time:92706ms step_avg:151.48ms step:623/1480 train_time:92867ms step_avg:151.50ms step:624/1480 train_time:93026ms step_avg:151.51ms step:625/1480 train_time:93187ms step_avg:151.52ms step:625/1480 val_loss:3.6026 train_time:93250ms step_avg:151.63ms step:626/1480 train_time:93352ms step_avg:151.55ms step:627/1480 train_time:93512ms step_avg:151.56ms step:628/1480 train_time:93670ms step_avg:151.57ms step:629/1480 train_time:93829ms step_avg:151.58ms step:630/1480 train_time:93988ms step_avg:151.59ms step:631/1480 train_time:94145ms step_avg:151.60ms step:632/1480 train_time:94304ms step_avg:151.61ms step:633/1480 train_time:94462ms step_avg:151.62ms step:634/1480 train_time:94623ms step_avg:151.64ms step:635/1480 train_time:94782ms step_avg:151.65ms step:636/1480 train_time:94940ms step_avg:151.66ms step:637/1480 train_time:95100ms step_avg:151.67ms step:638/1480 train_time:95257ms step_avg:151.68ms step:639/1480 train_time:95416ms step_avg:151.70ms step:640/1480 train_time:95577ms step_avg:151.71ms step:641/1480 train_time:95737ms step_avg:151.72ms step:642/1480 train_time:95896ms step_avg:151.73ms step:643/1480 train_time:96057ms step_avg:151.75ms step:644/1480 train_time:96216ms step_avg:151.76ms step:645/1480 train_time:96374ms step_avg:151.77ms step:646/1480 train_time:96534ms step_avg:151.78ms step:647/1480 train_time:96695ms step_avg:151.80ms step:648/1480 train_time:96856ms step_avg:151.81ms step:649/1480 
train_time:97017ms step_avg:151.83ms step:650/1480 train_time:97177ms step_avg:151.84ms step:651/1480 train_time:97337ms step_avg:151.85ms step:652/1480 train_time:97496ms step_avg:151.86ms step:653/1480 train_time:97656ms step_avg:151.88ms step:654/1480 train_time:97816ms step_avg:151.89ms step:655/1480 train_time:97976ms step_avg:151.90ms step:656/1480 train_time:98135ms step_avg:151.91ms step:657/1480 train_time:98294ms step_avg:151.92ms step:658/1480 train_time:98456ms step_avg:151.94ms step:659/1480 train_time:98618ms step_avg:151.95ms step:660/1480 train_time:98780ms step_avg:151.97ms step:661/1480 train_time:98941ms step_avg:151.98ms step:662/1480 train_time:99100ms step_avg:151.99ms step:663/1480 train_time:99259ms step_avg:152.00ms step:664/1480 train_time:99420ms step_avg:152.02ms step:665/1480 train_time:99583ms step_avg:152.03ms step:666/1480 train_time:99743ms step_avg:152.05ms step:667/1480 train_time:99904ms step_avg:152.06ms step:668/1480 train_time:100066ms step_avg:152.08ms step:669/1480 train_time:100230ms step_avg:152.09ms step:670/1480 train_time:100390ms step_avg:152.11ms step:671/1480 train_time:100552ms step_avg:152.12ms step:672/1480 train_time:100714ms step_avg:152.14ms step:673/1480 train_time:100876ms step_avg:152.15ms step:674/1480 train_time:101039ms step_avg:152.17ms step:675/1480 train_time:101200ms step_avg:152.18ms step:676/1480 train_time:101362ms step_avg:152.20ms step:677/1480 train_time:101523ms step_avg:152.21ms step:678/1480 train_time:101683ms step_avg:152.22ms step:679/1480 train_time:101844ms step_avg:152.23ms step:680/1480 train_time:102006ms step_avg:152.25ms step:681/1480 train_time:102168ms step_avg:152.26ms step:682/1480 train_time:102332ms step_avg:152.28ms step:683/1480 train_time:102492ms step_avg:152.29ms step:684/1480 train_time:102655ms step_avg:152.31ms step:685/1480 train_time:102818ms step_avg:152.32ms step:686/1480 train_time:102980ms step_avg:152.34ms step:687/1480 train_time:103141ms step_avg:152.35ms 
step:688/1480 train_time:103302ms step_avg:152.36ms step:689/1480 train_time:103465ms step_avg:152.38ms step:690/1480 train_time:103631ms step_avg:152.40ms step:691/1480 train_time:103793ms step_avg:152.41ms step:692/1480 train_time:103955ms step_avg:152.43ms step:693/1480 train_time:104118ms step_avg:152.44ms step:694/1480 train_time:104280ms step_avg:152.46ms step:695/1480 train_time:104441ms step_avg:152.47ms step:696/1480 train_time:104600ms step_avg:152.48ms step:697/1480 train_time:104762ms step_avg:152.49ms step:698/1480 train_time:104923ms step_avg:152.51ms step:699/1480 train_time:105085ms step_avg:152.52ms step:700/1480 train_time:105249ms step_avg:152.53ms step:701/1480 train_time:105409ms step_avg:152.55ms step:702/1480 train_time:105569ms step_avg:152.56ms step:703/1480 train_time:105731ms step_avg:152.57ms step:704/1480 train_time:105892ms step_avg:152.58ms step:705/1480 train_time:106055ms step_avg:152.60ms step:706/1480 train_time:106217ms step_avg:152.61ms step:707/1480 train_time:106378ms step_avg:152.62ms step:708/1480 train_time:106539ms step_avg:152.64ms step:709/1480 train_time:106700ms step_avg:152.65ms step:710/1480 train_time:106860ms step_avg:152.66ms step:711/1480 train_time:107021ms step_avg:152.67ms step:712/1480 train_time:107184ms step_avg:152.68ms step:713/1480 train_time:107346ms step_avg:152.70ms step:714/1480 train_time:107508ms step_avg:152.71ms step:715/1480 train_time:107668ms step_avg:152.72ms step:716/1480 train_time:107829ms step_avg:152.73ms step:717/1480 train_time:107991ms step_avg:152.75ms step:718/1480 train_time:108151ms step_avg:152.76ms step:719/1480 train_time:108312ms step_avg:152.77ms step:720/1480 train_time:108474ms step_avg:152.78ms step:721/1480 train_time:108638ms step_avg:152.80ms step:722/1480 train_time:108800ms step_avg:152.81ms step:723/1480 train_time:108961ms step_avg:152.82ms step:724/1480 train_time:109124ms step_avg:152.83ms step:725/1480 train_time:109284ms step_avg:152.85ms step:726/1480 
train_time:109448ms step_avg:152.86ms step:727/1480 train_time:109613ms step_avg:152.88ms step:728/1480 train_time:109775ms step_avg:152.89ms step:729/1480 train_time:109937ms step_avg:152.90ms step:730/1480 train_time:110101ms step_avg:152.92ms step:731/1480 train_time:110261ms step_avg:152.93ms step:732/1480 train_time:110421ms step_avg:152.94ms step:733/1480 train_time:110581ms step_avg:152.95ms step:734/1480 train_time:110742ms step_avg:152.96ms step:735/1480 train_time:110903ms step_avg:152.97ms step:736/1480 train_time:111065ms step_avg:152.98ms step:737/1480 train_time:111225ms step_avg:152.99ms step:738/1480 train_time:111384ms step_avg:153.00ms step:739/1480 train_time:111543ms step_avg:153.01ms step:740/1480 train_time:111708ms step_avg:153.03ms step:741/1480 train_time:111872ms step_avg:153.04ms step:742/1480 train_time:112037ms step_avg:153.06ms step:743/1480 train_time:112198ms step_avg:153.07ms step:744/1480 train_time:112363ms step_avg:153.08ms step:745/1480 train_time:112527ms step_avg:153.10ms step:746/1480 train_time:112686ms step_avg:153.11ms step:747/1480 train_time:112848ms step_avg:153.12ms step:748/1480 train_time:113015ms step_avg:153.14ms step:749/1480 train_time:113179ms step_avg:153.15ms step:750/1480 train_time:113338ms step_avg:153.16ms step:750/1480 val_loss:3.5488 train_time:113402ms step_avg:153.25ms step:751/1480 train_time:113502ms step_avg:153.17ms step:752/1480 train_time:113664ms step_avg:153.19ms step:753/1480 train_time:113825ms step_avg:153.20ms step:754/1480 train_time:113986ms step_avg:153.21ms step:755/1480 train_time:114147ms step_avg:153.22ms step:756/1480 train_time:114310ms step_avg:153.23ms step:757/1480 train_time:114474ms step_avg:153.25ms step:758/1480 train_time:114636ms step_avg:153.26ms step:759/1480 train_time:114798ms step_avg:153.27ms step:760/1480 train_time:114959ms step_avg:153.28ms step:761/1480 train_time:115120ms step_avg:153.29ms step:762/1480 train_time:115281ms step_avg:153.30ms step:763/1480 
train_time:115443ms step_avg:153.31ms step:764/1480 train_time:115605ms step_avg:153.32ms step:765/1480 train_time:115766ms step_avg:153.33ms step:766/1480 train_time:115930ms step_avg:153.35ms step:767/1480 train_time:116093ms step_avg:153.36ms step:768/1480 train_time:116255ms step_avg:153.37ms step:769/1480 train_time:116418ms step_avg:153.38ms step:770/1480 train_time:116580ms step_avg:153.40ms step:771/1480 train_time:116742ms step_avg:153.41ms step:772/1480 train_time:116904ms step_avg:153.42ms step:773/1480 train_time:117069ms step_avg:153.43ms step:774/1480 train_time:117233ms step_avg:153.45ms step:775/1480 train_time:117395ms step_avg:153.46ms step:776/1480 train_time:117558ms step_avg:153.47ms step:777/1480 train_time:117723ms step_avg:153.49ms step:778/1480 train_time:117887ms step_avg:153.50ms step:779/1480 train_time:118050ms step_avg:153.51ms step:780/1480 train_time:118214ms step_avg:153.53ms step:781/1480 train_time:118378ms step_avg:153.54ms step:782/1480 train_time:118542ms step_avg:153.55ms step:783/1480 train_time:118703ms step_avg:153.56ms step:784/1480 train_time:118869ms step_avg:153.58ms step:785/1480 train_time:119032ms step_avg:153.59ms step:786/1480 train_time:119197ms step_avg:153.60ms step:787/1480 train_time:119360ms step_avg:153.62ms step:788/1480 train_time:119526ms step_avg:153.63ms step:789/1480 train_time:119688ms step_avg:153.64ms step:790/1480 train_time:119852ms step_avg:153.66ms step:791/1480 train_time:120017ms step_avg:153.67ms step:792/1480 train_time:120182ms step_avg:153.69ms step:793/1480 train_time:120345ms step_avg:153.70ms step:794/1480 train_time:120509ms step_avg:153.71ms step:795/1480 train_time:120675ms step_avg:153.73ms step:796/1480 train_time:120840ms step_avg:153.74ms step:797/1480 train_time:121004ms step_avg:153.75ms step:798/1480 train_time:121170ms step_avg:153.77ms step:799/1480 train_time:121337ms step_avg:153.79ms step:800/1480 train_time:121499ms step_avg:153.80ms step:801/1480 train_time:121660ms 
step_avg:153.81ms step:802/1480 train_time:121829ms step_avg:153.82ms step:803/1480 train_time:121992ms step_avg:153.84ms step:804/1480 train_time:122154ms step_avg:153.85ms step:805/1480 train_time:122319ms step_avg:153.86ms step:806/1480 train_time:122481ms step_avg:153.87ms step:807/1480 train_time:122642ms step_avg:153.88ms step:808/1480 train_time:122805ms step_avg:153.89ms step:809/1480 train_time:122968ms step_avg:153.90ms step:810/1480 train_time:123131ms step_avg:153.91ms step:811/1480 train_time:123295ms step_avg:153.93ms step:812/1480 train_time:123458ms step_avg:153.94ms step:813/1480 train_time:123618ms step_avg:153.95ms step:814/1480 train_time:123783ms step_avg:153.96ms step:815/1480 train_time:123945ms step_avg:153.97ms step:816/1480 train_time:124112ms step_avg:153.99ms step:817/1480 train_time:124275ms step_avg:154.00ms step:818/1480 train_time:124435ms step_avg:154.00ms step:819/1480 train_time:124598ms step_avg:154.02ms step:820/1480 train_time:124760ms step_avg:154.02ms step:821/1480 train_time:124921ms step_avg:154.03ms step:822/1480 train_time:125086ms step_avg:154.05ms step:823/1480 train_time:125250ms step_avg:154.06ms step:824/1480 train_time:125413ms step_avg:154.07ms step:825/1480 train_time:125578ms step_avg:154.08ms step:826/1480 train_time:125744ms step_avg:154.10ms step:827/1480 train_time:125908ms step_avg:154.11ms step:828/1480 train_time:126072ms step_avg:154.12ms step:829/1480 train_time:126236ms step_avg:154.13ms step:830/1480 train_time:126400ms step_avg:154.15ms step:831/1480 train_time:126563ms step_avg:154.16ms step:832/1480 train_time:126726ms step_avg:154.17ms step:833/1480 train_time:126892ms step_avg:154.18ms step:834/1480 train_time:127057ms step_avg:154.20ms step:835/1480 train_time:127219ms step_avg:154.20ms step:836/1480 train_time:127386ms step_avg:154.22ms step:837/1480 train_time:127549ms step_avg:154.23ms step:838/1480 train_time:127714ms step_avg:154.24ms step:839/1480 train_time:127876ms step_avg:154.25ms 
step:840/1480 train_time:128037ms step_avg:154.26ms step:841/1480 train_time:128198ms step_avg:154.27ms step:842/1480 train_time:128361ms step_avg:154.28ms step:843/1480 train_time:128521ms step_avg:154.29ms step:844/1480 train_time:128683ms step_avg:154.30ms step:845/1480 train_time:128847ms step_avg:154.31ms step:846/1480 train_time:129012ms step_avg:154.32ms step:847/1480 train_time:129176ms step_avg:154.33ms step:848/1480 train_time:129337ms step_avg:154.34ms step:849/1480 train_time:129500ms step_avg:154.35ms step:850/1480 train_time:129661ms step_avg:154.36ms step:851/1480 train_time:129827ms step_avg:154.37ms step:852/1480 train_time:129989ms step_avg:154.38ms step:853/1480 train_time:130152ms step_avg:154.39ms step:854/1480 train_time:130317ms step_avg:154.40ms step:855/1480 train_time:130479ms step_avg:154.41ms step:856/1480 train_time:130640ms step_avg:154.42ms step:857/1480 train_time:130805ms step_avg:154.43ms step:858/1480 train_time:130972ms step_avg:154.45ms step:859/1480 train_time:131137ms step_avg:154.46ms step:860/1480 train_time:131298ms step_avg:154.47ms step:861/1480 train_time:131463ms step_avg:154.48ms step:862/1480 train_time:131633ms step_avg:154.50ms step:863/1480 train_time:131801ms step_avg:154.51ms step:864/1480 train_time:131965ms step_avg:154.53ms step:865/1480 train_time:132126ms step_avg:154.53ms step:866/1480 train_time:132294ms step_avg:154.55ms step:867/1480 train_time:132457ms step_avg:154.56ms step:868/1480 train_time:132620ms step_avg:154.57ms step:869/1480 train_time:132781ms step_avg:154.58ms step:870/1480 train_time:132947ms step_avg:154.59ms step:871/1480 train_time:133110ms step_avg:154.60ms step:872/1480 train_time:133274ms step_avg:154.61ms step:873/1480 train_time:133437ms step_avg:154.62ms step:874/1480 train_time:133602ms step_avg:154.63ms step:875/1480 train_time:133767ms step_avg:154.64ms step:875/1480 val_loss:3.5029 train_time:133832ms step_avg:154.72ms step:876/1480 train_time:133934ms step_avg:154.66ms 
step:877/1480 train_time:134099ms step_avg:154.67ms step:878/1480 train_time:134262ms step_avg:154.68ms step:879/1480 train_time:134425ms step_avg:154.69ms step:880/1480 train_time:134588ms step_avg:154.70ms step:881/1480 train_time:134750ms step_avg:154.71ms step:882/1480 train_time:134915ms step_avg:154.72ms step:883/1480 train_time:135082ms step_avg:154.73ms step:884/1480 train_time:135249ms step_avg:154.75ms step:885/1480 train_time:135414ms step_avg:154.76ms step:886/1480 train_time:135580ms step_avg:154.77ms step:887/1480 train_time:135748ms step_avg:154.79ms step:888/1480 train_time:135921ms step_avg:154.81ms step:889/1480 train_time:136089ms step_avg:154.82ms step:890/1480 train_time:136251ms step_avg:154.83ms step:891/1480 train_time:136418ms step_avg:154.84ms step:892/1480 train_time:136584ms step_avg:154.86ms step:893/1480 train_time:136744ms step_avg:154.86ms step:894/1480 train_time:136913ms step_avg:154.88ms step:895/1480 train_time:137081ms step_avg:154.89ms step:896/1480 train_time:137246ms step_avg:154.90ms step:897/1480 train_time:137413ms step_avg:154.92ms step:898/1480 train_time:137580ms step_avg:154.93ms step:899/1480 train_time:137744ms step_avg:154.94ms step:900/1480 train_time:137907ms step_avg:154.95ms step:901/1480 train_time:138070ms step_avg:154.96ms step:902/1480 train_time:138236ms step_avg:154.97ms step:903/1480 train_time:138405ms step_avg:154.99ms step:904/1480 train_time:138570ms step_avg:155.00ms step:905/1480 train_time:138732ms step_avg:155.01ms step:906/1480 train_time:138898ms step_avg:155.02ms step:907/1480 train_time:139066ms step_avg:155.03ms step:908/1480 train_time:139228ms step_avg:155.04ms step:909/1480 train_time:139391ms step_avg:155.05ms step:910/1480 train_time:139561ms step_avg:155.07ms step:911/1480 train_time:139726ms step_avg:155.08ms step:912/1480 train_time:139893ms step_avg:155.09ms step:913/1480 train_time:140061ms step_avg:155.11ms step:914/1480 train_time:140227ms step_avg:155.12ms step:915/1480 
train_time:140397ms step_avg:155.13ms step:916/1480 train_time:140561ms step_avg:155.14ms step:917/1480 train_time:140724ms step_avg:155.15ms step:918/1480 train_time:140892ms step_avg:155.17ms step:919/1480 train_time:141062ms step_avg:155.18ms step:920/1480 train_time:141226ms step_avg:155.19ms step:921/1480 train_time:141393ms step_avg:155.21ms step:922/1480 train_time:141561ms step_avg:155.22ms step:923/1480 train_time:141723ms step_avg:155.23ms step:924/1480 train_time:141887ms step_avg:155.24ms step:925/1480 train_time:142053ms step_avg:155.25ms step:926/1480 train_time:142216ms step_avg:155.26ms step:927/1480 train_time:142381ms step_avg:155.27ms step:928/1480 train_time:142547ms step_avg:155.28ms step:929/1480 train_time:142714ms step_avg:155.29ms step:930/1480 train_time:142879ms step_avg:155.30ms step:931/1480 train_time:143042ms step_avg:155.31ms step:932/1480 train_time:143207ms step_avg:155.32ms step:933/1480 train_time:143374ms step_avg:155.33ms step:934/1480 train_time:143542ms step_avg:155.35ms step:935/1480 train_time:143711ms step_avg:155.36ms step:936/1480 train_time:143880ms step_avg:155.38ms step:937/1480 train_time:144052ms step_avg:155.40ms step:938/1480 train_time:144215ms step_avg:155.40ms step:939/1480 train_time:144383ms step_avg:155.42ms step:940/1480 train_time:144550ms step_avg:155.43ms step:941/1480 train_time:144715ms step_avg:155.44ms step:942/1480 train_time:144881ms step_avg:155.45ms step:943/1480 train_time:145051ms step_avg:155.47ms step:944/1480 train_time:145224ms step_avg:155.49ms step:945/1480 train_time:145386ms step_avg:155.49ms step:946/1480 train_time:145556ms step_avg:155.51ms step:947/1480 train_time:145724ms step_avg:155.52ms step:948/1480 train_time:145888ms step_avg:155.53ms step:949/1480 train_time:146053ms step_avg:155.54ms step:950/1480 train_time:146217ms step_avg:155.55ms step:951/1480 train_time:146384ms step_avg:155.56ms step:952/1480 train_time:146549ms step_avg:155.57ms step:953/1480 train_time:146716ms 
step_avg:155.58ms step:954/1480 train_time:146884ms step_avg:155.60ms step:955/1480 train_time:147048ms step_avg:155.61ms step:956/1480 train_time:147212ms step_avg:155.62ms step:957/1480 train_time:147381ms step_avg:155.63ms step:958/1480 train_time:147549ms step_avg:155.64ms step:959/1480 train_time:147714ms step_avg:155.65ms step:960/1480 train_time:147881ms step_avg:155.66ms step:961/1480 train_time:148046ms step_avg:155.67ms step:962/1480 train_time:148210ms step_avg:155.68ms step:963/1480 train_time:148377ms step_avg:155.69ms step:964/1480 train_time:148545ms step_avg:155.71ms step:965/1480 train_time:148708ms step_avg:155.72ms step:966/1480 train_time:148870ms step_avg:155.72ms step:967/1480 train_time:149035ms step_avg:155.73ms step:968/1480 train_time:149200ms step_avg:155.74ms step:969/1480 train_time:149367ms step_avg:155.75ms step:970/1480 train_time:149531ms step_avg:155.76ms step:971/1480 train_time:149695ms step_avg:155.77ms step:972/1480 train_time:149861ms step_avg:155.78ms step:973/1480 train_time:150024ms step_avg:155.79ms step:974/1480 train_time:150194ms step_avg:155.80ms step:975/1480 train_time:150360ms step_avg:155.81ms step:976/1480 train_time:150525ms step_avg:155.82ms step:977/1480 train_time:150688ms step_avg:155.83ms step:978/1480 train_time:150855ms step_avg:155.84ms step:979/1480 train_time:151022ms step_avg:155.85ms step:980/1480 train_time:151187ms step_avg:155.86ms step:981/1480 train_time:151356ms step_avg:155.88ms step:982/1480 train_time:151519ms step_avg:155.88ms step:983/1480 train_time:151684ms step_avg:155.89ms step:984/1480 train_time:151847ms step_avg:155.90ms step:985/1480 train_time:152017ms step_avg:155.91ms step:986/1480 train_time:152183ms step_avg:155.92ms step:987/1480 train_time:152346ms step_avg:155.93ms step:988/1480 train_time:152512ms step_avg:155.94ms step:989/1480 train_time:152678ms step_avg:155.95ms step:990/1480 train_time:152848ms step_avg:155.97ms step:991/1480 train_time:153016ms step_avg:155.98ms 
step:992/1480 train_time:153190ms step_avg:156.00ms step:993/1480 train_time:153366ms step_avg:156.02ms step:994/1480 train_time:153530ms step_avg:156.03ms step:995/1480 train_time:153695ms step_avg:156.04ms step:996/1480 train_time:153858ms step_avg:156.04ms step:997/1480 train_time:154023ms step_avg:156.05ms step:998/1480 train_time:154185ms step_avg:156.06ms step:999/1480 train_time:154351ms step_avg:156.07ms step:1000/1480 train_time:154523ms step_avg:156.08ms step:1000/1480 val_loss:3.4405 train_time:154591ms step_avg:156.15ms step:1001/1480 train_time:154694ms step_avg:156.10ms step:1002/1480 train_time:154862ms step_avg:156.11ms step:1003/1480 train_time:155033ms step_avg:156.13ms step:1004/1480 train_time:155202ms step_avg:156.14ms step:1005/1480 train_time:155369ms step_avg:156.15ms step:1006/1480 train_time:155538ms step_avg:156.16ms step:1007/1480 train_time:155703ms step_avg:156.17ms step:1008/1480 train_time:155869ms step_avg:156.18ms step:1009/1480 train_time:156042ms step_avg:156.20ms step:1010/1480 train_time:156206ms step_avg:156.21ms step:1011/1480 train_time:156372ms step_avg:156.22ms step:1012/1480 train_time:156538ms step_avg:156.23ms step:1013/1480 train_time:156709ms step_avg:156.24ms step:1014/1480 train_time:156877ms step_avg:156.25ms step:1015/1480 train_time:157046ms step_avg:156.26ms step:1016/1480 train_time:157214ms step_avg:156.28ms step:1017/1480 train_time:157387ms step_avg:156.29ms step:1018/1480 train_time:157555ms step_avg:156.30ms step:1019/1480 train_time:157723ms step_avg:156.32ms step:1020/1480 train_time:157892ms step_avg:156.33ms step:1021/1480 train_time:158058ms step_avg:156.34ms step:1022/1480 train_time:158226ms step_avg:156.35ms step:1023/1480 train_time:158394ms step_avg:156.36ms step:1024/1480 train_time:158561ms step_avg:156.37ms step:1025/1480 train_time:158733ms step_avg:156.39ms step:1026/1480 train_time:158899ms step_avg:156.40ms step:1027/1480 train_time:159065ms step_avg:156.41ms step:1028/1480 
train_time:159238ms step_avg:156.42ms step:1029/1480 train_time:159414ms step_avg:156.44ms step:1030/1480 train_time:159581ms step_avg:156.45ms step:1031/1480 train_time:159745ms step_avg:156.46ms step:1032/1480 train_time:159919ms step_avg:156.48ms step:1033/1480 train_time:160084ms step_avg:156.49ms step:1034/1480 train_time:160252ms step_avg:156.50ms step:1035/1480 train_time:160421ms step_avg:156.51ms step:1036/1480 train_time:160585ms step_avg:156.52ms step:1037/1480 train_time:160752ms step_avg:156.53ms step:1038/1480 train_time:160921ms step_avg:156.54ms step:1039/1480 train_time:161091ms step_avg:156.55ms step:1040/1480 train_time:161258ms step_avg:156.56ms step:1041/1480 train_time:161425ms step_avg:156.57ms step:1042/1480 train_time:161589ms step_avg:156.58ms step:1043/1480 train_time:161754ms step_avg:156.59ms step:1044/1480 train_time:161919ms step_avg:156.59ms step:1045/1480 train_time:162089ms step_avg:156.61ms step:1046/1480 train_time:162257ms step_avg:156.62ms step:1047/1480 train_time:162423ms step_avg:156.63ms step:1048/1480 train_time:162589ms step_avg:156.64ms step:1049/1480 train_time:162755ms step_avg:156.65ms step:1050/1480 train_time:162924ms step_avg:156.66ms step:1051/1480 train_time:163094ms step_avg:156.67ms step:1052/1480 train_time:163263ms step_avg:156.68ms step:1053/1480 train_time:163428ms step_avg:156.69ms step:1054/1480 train_time:163597ms step_avg:156.70ms step:1055/1480 train_time:163764ms step_avg:156.71ms step:1056/1480 train_time:163928ms step_avg:156.72ms step:1057/1480 train_time:164096ms step_avg:156.73ms step:1058/1480 train_time:164265ms step_avg:156.74ms step:1059/1480 train_time:164438ms step_avg:156.76ms step:1060/1480 train_time:164606ms step_avg:156.77ms step:1061/1480 train_time:164770ms step_avg:156.77ms step:1062/1480 train_time:164936ms step_avg:156.78ms step:1063/1480 train_time:165101ms step_avg:156.79ms step:1064/1480 train_time:165266ms step_avg:156.80ms step:1065/1480 train_time:165431ms step_avg:156.81ms 
step:1066/1480 train_time:165601ms step_avg:156.82ms step:1067/1480 train_time:165768ms step_avg:156.83ms step:1068/1480 train_time:165935ms step_avg:156.84ms step:1069/1480 train_time:166106ms step_avg:156.85ms step:1070/1480 train_time:166271ms step_avg:156.86ms step:1071/1480 train_time:166443ms step_avg:156.87ms step:1072/1480 train_time:166609ms step_avg:156.88ms step:1073/1480 train_time:166773ms step_avg:156.89ms step:1074/1480 train_time:166940ms step_avg:156.90ms step:1075/1480 train_time:167111ms step_avg:156.91ms step:1076/1480 train_time:167280ms step_avg:156.92ms step:1077/1480 train_time:167445ms step_avg:156.93ms step:1078/1480 train_time:167619ms step_avg:156.95ms step:1079/1480 train_time:167793ms step_avg:156.96ms step:1080/1480 train_time:167962ms step_avg:156.97ms step:1081/1480 train_time:168130ms step_avg:156.98ms step:1082/1480 train_time:168298ms step_avg:156.99ms step:1083/1480 train_time:168464ms step_avg:157.00ms step:1084/1480 train_time:168631ms step_avg:157.01ms step:1085/1480 train_time:168799ms step_avg:157.02ms step:1086/1480 train_time:168966ms step_avg:157.03ms step:1087/1480 train_time:169132ms step_avg:157.04ms step:1088/1480 train_time:169302ms step_avg:157.05ms step:1089/1480 train_time:169475ms step_avg:157.07ms step:1090/1480 train_time:169646ms step_avg:157.08ms step:1091/1480 train_time:169814ms step_avg:157.09ms step:1092/1480 train_time:169982ms step_avg:157.10ms step:1093/1480 train_time:170149ms step_avg:157.11ms step:1094/1480 train_time:170314ms step_avg:157.12ms step:1095/1480 train_time:170479ms step_avg:157.12ms step:1096/1480 train_time:170647ms step_avg:157.13ms step:1097/1480 train_time:170817ms step_avg:157.15ms step:1098/1480 train_time:170988ms step_avg:157.16ms step:1099/1480 train_time:171159ms step_avg:157.17ms step:1100/1480 train_time:171330ms step_avg:157.18ms step:1101/1480 train_time:171501ms step_avg:157.20ms step:1102/1480 train_time:171672ms step_avg:157.21ms step:1103/1480 train_time:171849ms 
step_avg:157.23ms step:1104/1480 train_time:172018ms step_avg:157.24ms step:1105/1480 train_time:172188ms step_avg:157.25ms step:1106/1480 train_time:172356ms step_avg:157.26ms step:1107/1480 train_time:172526ms step_avg:157.27ms step:1108/1480 train_time:172690ms step_avg:157.28ms step:1109/1480 train_time:172857ms step_avg:157.29ms step:1110/1480 train_time:173022ms step_avg:157.29ms step:1111/1480 train_time:173189ms step_avg:157.30ms step:1112/1480 train_time:173360ms step_avg:157.31ms step:1113/1480 train_time:173539ms step_avg:157.33ms step:1114/1480 train_time:173713ms step_avg:157.35ms step:1115/1480 train_time:173885ms step_avg:157.36ms step:1116/1480 train_time:174052ms step_avg:157.37ms step:1117/1480 train_time:174224ms step_avg:157.38ms step:1118/1480 train_time:174399ms step_avg:157.40ms step:1119/1480 train_time:174566ms step_avg:157.41ms step:1120/1480 train_time:174736ms step_avg:157.42ms step:1121/1480 train_time:174905ms step_avg:157.43ms step:1122/1480 train_time:175072ms step_avg:157.44ms step:1123/1480 train_time:175239ms step_avg:157.45ms step:1124/1480 train_time:175406ms step_avg:157.46ms step:1125/1480 train_time:175574ms step_avg:157.47ms step:1125/1480 val_loss:3.3850 train_time:175642ms step_avg:157.53ms step:1126/1480 train_time:175746ms step_avg:157.48ms step:1127/1480 train_time:175917ms step_avg:157.49ms step:1128/1480 train_time:176089ms step_avg:157.50ms step:1129/1480 train_time:176263ms step_avg:157.52ms step:1130/1480 train_time:176432ms step_avg:157.53ms step:1131/1480 train_time:176608ms step_avg:157.55ms step:1132/1480 train_time:176773ms step_avg:157.55ms step:1133/1480 train_time:176946ms step_avg:157.57ms step:1134/1480 train_time:177116ms step_avg:157.58ms step:1135/1480 train_time:177285ms step_avg:157.59ms step:1136/1480 train_time:177454ms step_avg:157.60ms step:1137/1480 train_time:177623ms step_avg:157.61ms step:1138/1480 train_time:177794ms step_avg:157.62ms step:1139/1480 train_time:177962ms step_avg:157.63ms 
step:1140/1480 train_time:178130ms step_avg:157.64ms step:1141/1480 train_time:178302ms step_avg:157.65ms step:1142/1480 train_time:178470ms step_avg:157.66ms step:1143/1480 train_time:178639ms step_avg:157.67ms step:1144/1480 train_time:178809ms step_avg:157.68ms step:1145/1480 train_time:178974ms step_avg:157.69ms step:1146/1480 train_time:179145ms step_avg:157.70ms step:1147/1480 train_time:179313ms step_avg:157.71ms step:1148/1480 train_time:179482ms step_avg:157.72ms step:1149/1480 train_time:179652ms step_avg:157.73ms step:1150/1480 train_time:179821ms step_avg:157.74ms step:1151/1480 train_time:179992ms step_avg:157.75ms step:1152/1480 train_time:180163ms step_avg:157.76ms step:1153/1480 train_time:180335ms step_avg:157.77ms step:1154/1480 train_time:180503ms step_avg:157.78ms step:1155/1480 train_time:180675ms step_avg:157.79ms step:1156/1480 train_time:180854ms step_avg:157.81ms step:1157/1480 train_time:181024ms step_avg:157.82ms step:1158/1480 train_time:181191ms step_avg:157.83ms step:1159/1480 train_time:181357ms step_avg:157.84ms step:1160/1480 train_time:181523ms step_avg:157.85ms step:1161/1480 train_time:181694ms step_avg:157.86ms step:1162/1480 train_time:181865ms step_avg:157.87ms step:1163/1480 train_time:182034ms step_avg:157.88ms step:1164/1480 train_time:182203ms step_avg:157.89ms step:1165/1480 train_time:182368ms step_avg:157.89ms step:1166/1480 train_time:182536ms step_avg:157.90ms step:1167/1480 train_time:182703ms step_avg:157.91ms step:1168/1480 train_time:182871ms step_avg:157.92ms step:1169/1480 train_time:183040ms step_avg:157.93ms step:1170/1480 train_time:183210ms step_avg:157.94ms step:1171/1480 train_time:183377ms step_avg:157.95ms step:1172/1480 train_time:183544ms step_avg:157.95ms step:1173/1480 train_time:183716ms step_avg:157.97ms step:1174/1480 train_time:183896ms step_avg:157.99ms step:1175/1480 train_time:184068ms step_avg:158.00ms step:1176/1480 train_time:184239ms step_avg:158.01ms step:1177/1480 train_time:184418ms 
step_avg:158.03ms step:1178/1480 train_time:184585ms step_avg:158.04ms step:1179/1480 train_time:184751ms step_avg:158.04ms step:1180/1480 train_time:184933ms step_avg:158.06ms step:1181/1480 train_time:185103ms step_avg:158.07ms step:1182/1480 train_time:185271ms step_avg:158.08ms step:1183/1480 train_time:185442ms step_avg:158.09ms step:1184/1480 train_time:185610ms step_avg:158.10ms step:1185/1480 train_time:185783ms step_avg:158.11ms step:1186/1480 train_time:185955ms step_avg:158.12ms step:1187/1480 train_time:186137ms step_avg:158.15ms step:1188/1480 train_time:186304ms step_avg:158.15ms step:1189/1480 train_time:186475ms step_avg:158.16ms step:1190/1480 train_time:186642ms step_avg:158.17ms step:1191/1480 train_time:186814ms step_avg:158.18ms step:1192/1480 train_time:186980ms step_avg:158.19ms step:1193/1480 train_time:187146ms step_avg:158.20ms step:1194/1480 train_time:187314ms step_avg:158.20ms step:1195/1480 train_time:187489ms step_avg:158.22ms step:1196/1480 train_time:187669ms step_avg:158.24ms step:1197/1480 train_time:187839ms step_avg:158.25ms step:1198/1480 train_time:188023ms step_avg:158.27ms step:1199/1480 train_time:188194ms step_avg:158.28ms step:1200/1480 train_time:188362ms step_avg:158.29ms step:1201/1480 train_time:188530ms step_avg:158.30ms step:1202/1480 train_time:188711ms step_avg:158.31ms step:1203/1480 train_time:188888ms step_avg:158.33ms step:1204/1480 train_time:189062ms step_avg:158.34ms step:1205/1480 train_time:189230ms step_avg:158.35ms step:1206/1480 train_time:189397ms step_avg:158.36ms step:1207/1480 train_time:189567ms step_avg:158.37ms step:1208/1480 train_time:189734ms step_avg:158.38ms step:1209/1480 train_time:189908ms step_avg:158.39ms step:1210/1480 train_time:190084ms step_avg:158.40ms step:1211/1480 train_time:190257ms step_avg:158.42ms step:1212/1480 train_time:190429ms step_avg:158.43ms step:1213/1480 train_time:190602ms step_avg:158.44ms step:1214/1480 train_time:190778ms step_avg:158.45ms step:1215/1480 
train_time:190951ms step_avg:158.47ms step:1216/1480 train_time:191118ms step_avg:158.47ms step:1217/1480 train_time:191291ms step_avg:158.48ms step:1218/1480 train_time:191460ms step_avg:158.49ms step:1219/1480 train_time:191638ms step_avg:158.51ms step:1220/1480 train_time:191807ms step_avg:158.52ms step:1221/1480 train_time:191976ms step_avg:158.53ms step:1222/1480 train_time:192143ms step_avg:158.53ms step:1223/1480 train_time:192315ms step_avg:158.55ms step:1224/1480 train_time:192493ms step_avg:158.56ms step:1225/1480 train_time:192664ms step_avg:158.57ms step:1226/1480 train_time:192835ms step_avg:158.58ms step:1227/1480 train_time:193009ms step_avg:158.59ms step:1228/1480 train_time:193179ms step_avg:158.60ms step:1229/1480 train_time:193351ms step_avg:158.61ms step:1230/1480 train_time:193532ms step_avg:158.63ms step:1231/1480 train_time:193708ms step_avg:158.65ms step:1232/1480 train_time:193882ms step_avg:158.66ms step:1233/1480 train_time:194052ms step_avg:158.67ms step:1234/1480 train_time:194223ms step_avg:158.68ms step:1235/1480 train_time:194396ms step_avg:158.69ms step:1236/1480 train_time:194563ms step_avg:158.70ms step:1237/1480 train_time:194734ms step_avg:158.71ms step:1238/1480 train_time:194919ms step_avg:158.73ms step:1239/1480 train_time:195091ms step_avg:158.74ms step:1240/1480 train_time:195260ms step_avg:158.75ms step:1241/1480 train_time:195433ms step_avg:158.76ms step:1242/1480 train_time:195601ms step_avg:158.77ms step:1243/1480 train_time:195773ms step_avg:158.78ms step:1244/1480 train_time:195940ms step_avg:158.78ms step:1245/1480 train_time:196109ms step_avg:158.79ms step:1246/1480 train_time:196277ms step_avg:158.80ms step:1247/1480 train_time:196446ms step_avg:158.81ms step:1248/1480 train_time:196615ms step_avg:158.82ms step:1249/1480 train_time:196785ms step_avg:158.83ms step:1250/1480 train_time:196955ms step_avg:158.83ms step:1250/1480 val_loss:3.3355 train_time:197027ms step_avg:158.89ms step:1251/1480 train_time:197136ms 
step_avg:158.85ms step:1252/1480 train_time:197305ms step_avg:158.86ms step:1253/1480 train_time:197474ms step_avg:158.87ms step:1254/1480 train_time:197646ms step_avg:158.88ms step:1255/1480 train_time:197833ms step_avg:158.90ms step:1256/1480 train_time:198006ms step_avg:158.91ms step:1257/1480 train_time:198177ms step_avg:158.92ms step:1258/1480 train_time:198353ms step_avg:158.94ms step:1259/1480 train_time:198525ms step_avg:158.95ms step:1260/1480 train_time:198693ms step_avg:158.95ms step:1261/1480 train_time:198865ms step_avg:158.97ms step:1262/1480 train_time:199041ms step_avg:158.98ms step:1263/1480 train_time:199216ms step_avg:158.99ms step:1264/1480 train_time:199384ms step_avg:159.00ms step:1265/1480 train_time:199551ms step_avg:159.00ms step:1266/1480 train_time:199722ms step_avg:159.01ms step:1267/1480 train_time:199892ms step_avg:159.02ms step:1268/1480 train_time:200064ms step_avg:159.03ms step:1269/1480 train_time:200240ms step_avg:159.05ms step:1270/1480 train_time:200410ms step_avg:159.06ms step:1271/1480 train_time:200579ms step_avg:159.06ms step:1272/1480 train_time:200745ms step_avg:159.07ms step:1273/1480 train_time:200915ms step_avg:159.08ms step:1274/1480 train_time:201088ms step_avg:159.09ms step:1275/1480 train_time:201255ms step_avg:159.10ms step:1276/1480 train_time:201421ms step_avg:159.10ms step:1277/1480 train_time:201592ms step_avg:159.11ms step:1278/1480 train_time:201761ms step_avg:159.12ms step:1279/1480 train_time:201933ms step_avg:159.13ms step:1280/1480 train_time:202112ms step_avg:159.14ms step:1281/1480 train_time:202281ms step_avg:159.15ms step:1282/1480 train_time:202448ms step_avg:159.16ms step:1283/1480 train_time:202617ms step_avg:159.17ms step:1284/1480 train_time:202787ms step_avg:159.17ms step:1285/1480 train_time:202957ms step_avg:159.18ms step:1286/1480 train_time:203129ms step_avg:159.19ms step:1287/1480 train_time:203302ms step_avg:159.20ms step:1288/1480 train_time:203474ms step_avg:159.21ms step:1289/1480 
train_time:203657ms step_avg:159.23ms step:1290/1480 train_time:203836ms step_avg:159.25ms step:1291/1480 train_time:204011ms step_avg:159.26ms step:1292/1480 train_time:204185ms step_avg:159.27ms step:1293/1480 train_time:204359ms step_avg:159.28ms step:1294/1480 train_time:204533ms step_avg:159.29ms step:1295/1480 train_time:204706ms step_avg:159.30ms step:1296/1480 train_time:204880ms step_avg:159.32ms step:1297/1480 train_time:205053ms step_avg:159.33ms step:1298/1480 train_time:205222ms step_avg:159.33ms step:1299/1480 train_time:205392ms step_avg:159.34ms step:1300/1480 train_time:205558ms step_avg:159.35ms step:1301/1480 train_time:205728ms step_avg:159.36ms step:1302/1480 train_time:205900ms step_avg:159.37ms step:1303/1480 train_time:206076ms step_avg:159.38ms step:1304/1480 train_time:206251ms step_avg:159.39ms step:1305/1480 train_time:206420ms step_avg:159.40ms step:1306/1480 train_time:206595ms step_avg:159.41ms step:1307/1480 train_time:206763ms step_avg:159.42ms step:1308/1480 train_time:206933ms step_avg:159.42ms step:1309/1480 train_time:207105ms step_avg:159.43ms step:1310/1480 train_time:207273ms step_avg:159.44ms step:1311/1480 train_time:207441ms step_avg:159.45ms step:1312/1480 train_time:207617ms step_avg:159.46ms step:1313/1480 train_time:207786ms step_avg:159.47ms step:1314/1480 train_time:207959ms step_avg:159.48ms step:1315/1480 train_time:208129ms step_avg:159.49ms step:1316/1480 train_time:208296ms step_avg:159.49ms step:1317/1480 train_time:208467ms step_avg:159.50ms step:1318/1480 train_time:208648ms step_avg:159.52ms step:1319/1480 train_time:208825ms step_avg:159.53ms step:1320/1480 train_time:209001ms step_avg:159.54ms step:1321/1480 train_time:209174ms step_avg:159.55ms step:1322/1480 train_time:209356ms step_avg:159.57ms step:1323/1480 train_time:209531ms step_avg:159.58ms step:1324/1480 train_time:209706ms step_avg:159.59ms step:1325/1480 train_time:209887ms step_avg:159.61ms step:1326/1480 train_time:210061ms step_avg:159.62ms 
step:1327/1480 train_time:210231ms step_avg:159.63ms step:1328/1480 train_time:210400ms step_avg:159.64ms step:1329/1480 train_time:210595ms step_avg:159.66ms step:1330/1480 train_time:210774ms step_avg:159.68ms step:1331/1480 train_time:210945ms step_avg:159.69ms step:1332/1480 train_time:211118ms step_avg:159.70ms step:1333/1480 train_time:211294ms step_avg:159.71ms step:1334/1480 train_time:211464ms step_avg:159.72ms step:1335/1480 train_time:211634ms step_avg:159.72ms step:1336/1480 train_time:211815ms step_avg:159.74ms step:1337/1480 train_time:211989ms step_avg:159.75ms step:1338/1480 train_time:212158ms step_avg:159.76ms step:1339/1480 train_time:212333ms step_avg:159.77ms step:1340/1480 train_time:212505ms step_avg:159.78ms step:1341/1480 train_time:212672ms step_avg:159.78ms step:1342/1480 train_time:212846ms step_avg:159.79ms step:1343/1480 train_time:213016ms step_avg:159.80ms step:1344/1480 train_time:213188ms step_avg:159.81ms step:1345/1480 train_time:213367ms step_avg:159.83ms step:1346/1480 train_time:213535ms step_avg:159.83ms step:1347/1480 train_time:213705ms step_avg:159.84ms step:1348/1480 train_time:213874ms step_avg:159.85ms step:1349/1480 train_time:214044ms step_avg:159.85ms step:1350/1480 train_time:214218ms step_avg:159.86ms step:1351/1480 train_time:214389ms step_avg:159.87ms step:1352/1480 train_time:214560ms step_avg:159.88ms step:1353/1480 train_time:214736ms step_avg:159.89ms step:1354/1480 train_time:214908ms step_avg:159.90ms step:1355/1480 train_time:215075ms step_avg:159.91ms step:1356/1480 train_time:215249ms step_avg:159.92ms step:1357/1480 train_time:215422ms step_avg:159.93ms step:1358/1480 train_time:215595ms step_avg:159.94ms step:1359/1480 train_time:215768ms step_avg:159.95ms step:1360/1480 train_time:215942ms step_avg:159.96ms step:1361/1480 train_time:216119ms step_avg:159.97ms step:1362/1480 train_time:216294ms step_avg:159.98ms step:1363/1480 train_time:216475ms step_avg:160.00ms step:1364/1480 train_time:216642ms 
step_avg:160.00ms step:1365/1480 train_time:216810ms step_avg:160.01ms step:1366/1480 train_time:216983ms step_avg:160.02ms step:1367/1480 train_time:217155ms step_avg:160.03ms step:1368/1480 train_time:217329ms step_avg:160.04ms step:1369/1480 train_time:217510ms step_avg:160.05ms step:1370/1480 train_time:217688ms step_avg:160.06ms step:1371/1480 train_time:217858ms step_avg:160.07ms step:1372/1480 train_time:218036ms step_avg:160.09ms step:1373/1480 train_time:218206ms step_avg:160.09ms step:1374/1480 train_time:218381ms step_avg:160.10ms step:1375/1480 train_time:218552ms step_avg:160.11ms step:1375/1480 val_loss:3.2971 train_time:218620ms step_avg:160.16ms step:1376/1480 train_time:218728ms step_avg:160.12ms step:1377/1480 train_time:218900ms step_avg:160.13ms step:1378/1480 train_time:219068ms step_avg:160.14ms step:1379/1480 train_time:219245ms step_avg:160.15ms step:1380/1480 train_time:219419ms step_avg:160.16ms step:1381/1480 train_time:219602ms step_avg:160.18ms step:1382/1480 train_time:219773ms step_avg:160.18ms step:1383/1480 train_time:219947ms step_avg:160.19ms step:1384/1480 train_time:220124ms step_avg:160.21ms step:1385/1480 train_time:220290ms step_avg:160.21ms step:1386/1480 train_time:220462ms step_avg:160.22ms step:1387/1480 train_time:220632ms step_avg:160.23ms step:1388/1480 train_time:220801ms step_avg:160.23ms step:1389/1480 train_time:220972ms step_avg:160.24ms step:1390/1480 train_time:221140ms step_avg:160.25ms step:1391/1480 train_time:221309ms step_avg:160.25ms step:1392/1480 train_time:221482ms step_avg:160.26ms step:1393/1480 train_time:221654ms step_avg:160.27ms step:1394/1480 train_time:221824ms step_avg:160.28ms step:1395/1480 train_time:221991ms step_avg:160.28ms step:1396/1480 train_time:222159ms step_avg:160.29ms step:1397/1480 train_time:222326ms step_avg:160.29ms step:1398/1480 train_time:222492ms step_avg:160.30ms step:1399/1480 train_time:222663ms step_avg:160.30ms step:1400/1480 train_time:222840ms step_avg:160.32ms 
step:1401/1480 train_time:223006ms step_avg:160.32ms step:1402/1480 train_time:223178ms step_avg:160.33ms step:1403/1480 train_time:223355ms step_avg:160.34ms step:1404/1480 train_time:223527ms step_avg:160.35ms step:1405/1480 train_time:223701ms step_avg:160.36ms step:1406/1480 train_time:223876ms step_avg:160.37ms step:1407/1480 train_time:224046ms step_avg:160.38ms step:1408/1480 train_time:224213ms step_avg:160.38ms step:1409/1480 train_time:224398ms step_avg:160.40ms step:1410/1480 train_time:224567ms step_avg:160.40ms step:1411/1480 train_time:224735ms step_avg:160.41ms step:1412/1480 train_time:224903ms step_avg:160.42ms step:1413/1480 train_time:225073ms step_avg:160.42ms step:1414/1480 train_time:225246ms step_avg:160.43ms step:1415/1480 train_time:225420ms step_avg:160.44ms step:1416/1480 train_time:225607ms step_avg:160.46ms step:1417/1480 train_time:225782ms step_avg:160.47ms step:1418/1480 train_time:225952ms step_avg:160.48ms step:1419/1480 train_time:226128ms step_avg:160.49ms step:1420/1480 train_time:226304ms step_avg:160.50ms step:1421/1480 train_time:226477ms step_avg:160.51ms step:1422/1480 train_time:226649ms step_avg:160.52ms step:1423/1480 train_time:226819ms step_avg:160.52ms step:1424/1480 train_time:226994ms step_avg:160.53ms step:1425/1480 train_time:227174ms step_avg:160.55ms step:1426/1480 train_time:227346ms step_avg:160.56ms step:1427/1480 train_time:227522ms step_avg:160.57ms step:1428/1480 train_time:227691ms step_avg:160.57ms step:1429/1480 train_time:227860ms step_avg:160.58ms step:1430/1480 train_time:228034ms step_avg:160.59ms step:1431/1480 train_time:228210ms step_avg:160.60ms step:1432/1480 train_time:228387ms step_avg:160.61ms step:1433/1480 train_time:228566ms step_avg:160.62ms step:1434/1480 train_time:228746ms step_avg:160.64ms step:1435/1480 train_time:228921ms step_avg:160.65ms step:1436/1480 train_time:229094ms step_avg:160.65ms step:1437/1480 train_time:229265ms step_avg:160.66ms step:1438/1480 train_time:229432ms 
step_avg:160.67ms step:1439/1480 train_time:229607ms step_avg:160.68ms step:1440/1480 train_time:229778ms step_avg:160.68ms step:1441/1480 train_time:229949ms step_avg:160.69ms step:1442/1480 train_time:230128ms step_avg:160.70ms step:1443/1480 train_time:230316ms step_avg:160.72ms step:1444/1480 train_time:230487ms step_avg:160.73ms step:1445/1480 train_time:230660ms step_avg:160.74ms step:1446/1480 train_time:230835ms step_avg:160.75ms step:1447/1480 train_time:231013ms step_avg:160.76ms step:1448/1480 train_time:231186ms step_avg:160.77ms step:1449/1480 train_time:231361ms step_avg:160.78ms step:1450/1480 train_time:231532ms step_avg:160.79ms step:1451/1480 train_time:231703ms step_avg:160.79ms step:1452/1480 train_time:231875ms step_avg:160.80ms step:1453/1480 train_time:232044ms step_avg:160.81ms step:1454/1480 train_time:232216ms step_avg:160.81ms step:1455/1480 train_time:232394ms step_avg:160.83ms step:1456/1480 train_time:232567ms step_avg:160.83ms step:1457/1480 train_time:232739ms step_avg:160.84ms step:1458/1480 train_time:232908ms step_avg:160.85ms step:1459/1480 train_time:233084ms step_avg:160.86ms step:1460/1480 train_time:233257ms step_avg:160.87ms step:1461/1480 train_time:233431ms step_avg:160.88ms step:1462/1480 train_time:233602ms step_avg:160.88ms step:1463/1480 train_time:233778ms step_avg:160.89ms step:1464/1480 train_time:233953ms step_avg:160.90ms step:1465/1480 train_time:234127ms step_avg:160.91ms step:1466/1480 train_time:234297ms step_avg:160.92ms step:1467/1480 train_time:234470ms step_avg:160.93ms step:1468/1480 train_time:234640ms step_avg:160.93ms step:1469/1480 train_time:234812ms step_avg:160.94ms step:1470/1480 train_time:234991ms step_avg:160.95ms step:1471/1480 train_time:235177ms step_avg:160.97ms step:1472/1480 train_time:235359ms step_avg:160.98ms step:1473/1480 train_time:235529ms step_avg:160.99ms step:1474/1480 train_time:235707ms step_avg:161.00ms step:1475/1480 train_time:235887ms step_avg:161.01ms step:1476/1480 
train_time:236060ms step_avg:161.02ms step:1477/1480 train_time:236244ms step_avg:161.04ms step:1478/1480 train_time:236426ms step_avg:161.05ms step:1479/1480 train_time:236600ms step_avg:161.06ms step:1480/1480 train_time:236771ms step_avg:161.07ms step:1480/1480 val_loss:3.2782 train_time:236843ms step_avg:161.12ms