import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable scale is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding applied over dimension 3 of a 4D input."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        # cos/sin tables are cached and rebuilt only when the sequence length changes
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # dimension 1 of x is treated as the sequence axis
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # add singleton axes at positions 0 and 2 so the tables broadcast against x
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        x: input activations; vi: tensor mixed into the values (viewed to the shape of v);
        block_mask: FlexAttention mask. Returns a tensor shaped like x.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head axis moved in front of the sequence axis
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block with a 4x hidden width and squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer block: learned mix of x and x0, then pre-norm attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # mixing weights for (x, x0); initialized so the block starts from x alone
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """
    GPT-style language model with U-net skip connections between the first and second half of
    the blocks, per-layer token value embeddings, and tanh soft-capped logits.
    `forward` returns the cross-entropy loss, not the logits.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        idx: 1D tensor of token ids (asserted); its length must be a multiple of BLOCK_SIZE
        for the reshapes below to succeed. target: token ids used as cross-entropy targets.
        sliding_window: attention window size in tokens. Returns the scalar loss.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of tokens equal to 50256, used as a per-position document id
        # (50256 is presumably the GPT-2 end-of-text token - confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        # document id at the first / last position of every BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document id, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the rule above, evaluated on block indices
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can share a document only if their document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window size rounded up to whole blocks
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort places the indices of the kept kv blocks first in each row
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding chunk per encoder layer; the decoder loop below reuses them in reverse order
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it declares."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read ntok uint16 tokens that follow the 256*4-byte header into a pinned-memory tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        # read straight into the tensor's storage through its numpy view
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over token shards matching a glob pattern (relative to the current working
    directory). Every call to `next_batch` consumes `T * num_processes` tokens of the current
    shard, of which this process reads the T-token slice starting at its own offset.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full global batch plus one token for the shifted targets
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # wraps around to the first shard after the last one
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) as CUDA tensors of length T; targets are the inputs shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    # per-run directory name; the directory itself is not created because checkpoint
    # saving is currently disabled
    logdir = 'logs/%s/' % run_id
    logfile = 'logs/%s.txt' % run_id
    # make sure the parent directory exists, otherwise opening the log file below
    # raises FileNotFoundError on a fresh checkout
    os.makedirs('logs', exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append s to the run's log file and, unless logonly is set, also print it.
    Does nothing on processes other than the master process.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop runs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches each following batch right after backward
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# the model is cast to bfloat16 as a whole; CastedLinear modules are converted back to float32
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# 1: both embedding tables, 2: lm_head, 3: 2D parameters of the blocks (Muon),
# 4: parameters of the blocks with fewer than 2 dims plus the skip weights
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the learning-rate multiplier for iteration `it` (used as the LambdaLR function):
    linear warmup, then 1.0, then a linear decay that reaches 0 at args.num_iterations.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size is kept in a CUDA tensor and updated in place with copy_ inside the loop
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # grows linearly with step from 64 to 1792, rounded down to a multiple of 64
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average over processes, then over validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # (the state is assembled but not written: the torch.save call below is commented out)
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # momentum goes linearly from 0.85 to 0.95 over the first 300 steps
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # no synchronize here, so this is only an approximate reading of elapsed time
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
====================================================================================================
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
nvidia-smi:
Sun Dec 8 11:41:26 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M.
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 130W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 110W / 700W | 533MiB / 81559MiB | 2% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 123W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23287ms step_avg:nanms step:2/1480 train_time:23374ms step_avg:nanms step:3/1480 train_time:23512ms step_avg:nanms step:4/1480 train_time:23655ms step_avg:nanms step:5/1480 train_time:23796ms step_avg:nanms step:6/1480 train_time:23938ms step_avg:nanms step:7/1480 train_time:24079ms step_avg:nanms step:8/1480 train_time:24221ms step_avg:nanms step:9/1480 train_time:24365ms step_avg:nanms step:10/1480 train_time:24510ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:286ms step_avg:nanms step:13/1480 train_time:428ms step_avg:142.71ms step:14/1480 train_time:570ms step_avg:142.38ms step:15/1480 train_time:711ms step_avg:142.23ms step:16/1480 train_time:855ms step_avg:142.50ms step:17/1480 train_time:999ms step_avg:142.76ms step:18/1480 train_time:1144ms step_avg:142.97ms step:19/1480 train_time:1287ms step_avg:143.02ms step:20/1480 train_time:1430ms step_avg:142.99ms step:21/1480 train_time:1570ms step_avg:142.73ms step:22/1480 train_time:1711ms step_avg:142.61ms step:23/1480 train_time:1854ms step_avg:142.62ms step:24/1480 train_time:1998ms step_avg:142.71ms step:25/1480 train_time:2141ms step_avg:142.75ms step:26/1480 train_time:2285ms step_avg:142.79ms step:27/1480 train_time:2427ms step_avg:142.77ms step:28/1480 train_time:2568ms step_avg:142.69ms step:29/1480 
train_time:2709ms step_avg:142.60ms step:30/1480 train_time:2852ms step_avg:142.62ms step:31/1480 train_time:2997ms step_avg:142.70ms step:32/1480 train_time:3142ms step_avg:142.81ms step:33/1480 train_time:3287ms step_avg:142.90ms step:34/1480 train_time:3430ms step_avg:142.93ms step:35/1480 train_time:3571ms step_avg:142.84ms step:36/1480 train_time:3712ms step_avg:142.77ms step:37/1480 train_time:3853ms step_avg:142.69ms step:38/1480 train_time:3994ms step_avg:142.65ms step:39/1480 train_time:4137ms step_avg:142.67ms step:40/1480 train_time:4281ms step_avg:142.69ms step:41/1480 train_time:4425ms step_avg:142.74ms step:42/1480 train_time:4568ms step_avg:142.75ms step:43/1480 train_time:4710ms step_avg:142.72ms step:44/1480 train_time:4851ms step_avg:142.69ms step:45/1480 train_time:4993ms step_avg:142.65ms step:46/1480 train_time:5135ms step_avg:142.64ms step:47/1480 train_time:5278ms step_avg:142.66ms step:48/1480 train_time:5424ms step_avg:142.73ms step:49/1480 train_time:5569ms step_avg:142.80ms step:50/1480 train_time:5710ms step_avg:142.74ms step:51/1480 train_time:5852ms step_avg:142.73ms step:52/1480 train_time:5993ms step_avg:142.69ms step:53/1480 train_time:6134ms step_avg:142.66ms step:54/1480 train_time:6277ms step_avg:142.66ms step:55/1480 train_time:6422ms step_avg:142.72ms step:56/1480 train_time:6566ms step_avg:142.74ms step:57/1480 train_time:6709ms step_avg:142.74ms step:58/1480 train_time:6852ms step_avg:142.74ms step:59/1480 train_time:6994ms step_avg:142.74ms step:60/1480 train_time:7136ms step_avg:142.73ms step:61/1480 train_time:7277ms step_avg:142.69ms step:62/1480 train_time:7420ms step_avg:142.70ms step:63/1480 train_time:7564ms step_avg:142.72ms step:64/1480 train_time:7708ms step_avg:142.73ms step:65/1480 train_time:7850ms step_avg:142.73ms step:66/1480 train_time:7991ms step_avg:142.70ms step:67/1480 train_time:8132ms step_avg:142.67ms step:68/1480 train_time:8273ms step_avg:142.63ms step:69/1480 train_time:8415ms step_avg:142.62ms 
step:70/1480 train_time:8558ms step_avg:142.64ms step:71/1480 train_time:8703ms step_avg:142.67ms step:72/1480 train_time:8846ms step_avg:142.68ms step:73/1480 train_time:8989ms step_avg:142.69ms step:74/1480 train_time:9131ms step_avg:142.67ms step:75/1480 train_time:9271ms step_avg:142.63ms step:76/1480 train_time:9412ms step_avg:142.61ms step:77/1480 train_time:9554ms step_avg:142.60ms step:78/1480 train_time:9700ms step_avg:142.65ms step:79/1480 train_time:9844ms step_avg:142.66ms step:80/1480 train_time:9987ms step_avg:142.67ms step:81/1480 train_time:10130ms step_avg:142.68ms step:82/1480 train_time:10271ms step_avg:142.65ms step:83/1480 train_time:10412ms step_avg:142.63ms step:84/1480 train_time:10554ms step_avg:142.62ms step:85/1480 train_time:10697ms step_avg:142.63ms step:86/1480 train_time:10840ms step_avg:142.63ms step:87/1480 train_time:10982ms step_avg:142.63ms step:88/1480 train_time:11127ms step_avg:142.65ms step:89/1480 train_time:11270ms step_avg:142.65ms step:90/1480 train_time:11411ms step_avg:142.64ms step:91/1480 train_time:11554ms step_avg:142.64ms step:92/1480 train_time:11697ms step_avg:142.65ms step:93/1480 train_time:11842ms step_avg:142.67ms step:94/1480 train_time:11984ms step_avg:142.67ms step:95/1480 train_time:12128ms step_avg:142.68ms step:96/1480 train_time:12270ms step_avg:142.67ms step:97/1480 train_time:12411ms step_avg:142.65ms step:98/1480 train_time:12554ms step_avg:142.66ms step:99/1480 train_time:12698ms step_avg:142.67ms step:100/1480 train_time:12843ms step_avg:142.70ms step:101/1480 train_time:12986ms step_avg:142.70ms step:102/1480 train_time:13127ms step_avg:142.69ms step:103/1480 train_time:13269ms step_avg:142.67ms step:104/1480 train_time:13409ms step_avg:142.65ms step:105/1480 train_time:13552ms step_avg:142.66ms step:106/1480 train_time:13694ms step_avg:142.64ms step:107/1480 train_time:13837ms step_avg:142.65ms step:108/1480 train_time:13979ms step_avg:142.65ms step:109/1480 train_time:14123ms step_avg:142.65ms 
step:110/1480 train_time:14266ms step_avg:142.66ms step:111/1480 train_time:14409ms step_avg:142.67ms step:112/1480 train_time:14556ms step_avg:142.71ms step:113/1480 train_time:14704ms step_avg:142.76ms step:114/1480 train_time:14852ms step_avg:142.81ms step:115/1480 train_time:14997ms step_avg:142.83ms step:116/1480 train_time:15145ms step_avg:142.88ms step:117/1480 train_time:15291ms step_avg:142.90ms step:118/1480 train_time:15436ms step_avg:142.93ms step:119/1480 train_time:15582ms step_avg:142.95ms step:120/1480 train_time:15730ms step_avg:143.00ms step:121/1480 train_time:15876ms step_avg:143.03ms step:122/1480 train_time:16024ms step_avg:143.08ms step:123/1480 train_time:16173ms step_avg:143.12ms step:124/1480 train_time:16318ms step_avg:143.14ms step:125/1480 train_time:16465ms step_avg:143.18ms step:125/1480 val_loss:4.4084 train_time:16522ms step_avg:143.67ms step:126/1480 train_time:16617ms step_avg:143.25ms step:127/1480 train_time:16767ms step_avg:143.31ms step:128/1480 train_time:16913ms step_avg:143.33ms step:129/1480 train_time:17059ms step_avg:143.35ms step:130/1480 train_time:17205ms step_avg:143.37ms step:131/1480 train_time:17350ms step_avg:143.39ms step:132/1480 train_time:17496ms step_avg:143.41ms step:133/1480 train_time:17646ms step_avg:143.46ms step:134/1480 train_time:17794ms step_avg:143.50ms step:135/1480 train_time:17942ms step_avg:143.53ms step:136/1480 train_time:18089ms step_avg:143.56ms step:137/1480 train_time:18235ms step_avg:143.58ms step:138/1480 train_time:18381ms step_avg:143.60ms step:139/1480 train_time:18527ms step_avg:143.62ms step:140/1480 train_time:18673ms step_avg:143.64ms step:141/1480 train_time:18820ms step_avg:143.66ms step:142/1480 train_time:18968ms step_avg:143.70ms step:143/1480 train_time:19115ms step_avg:143.72ms step:144/1480 train_time:19262ms step_avg:143.74ms step:145/1480 train_time:19408ms step_avg:143.76ms step:146/1480 train_time:19554ms step_avg:143.78ms step:147/1480 train_time:19701ms 
step_avg:143.80ms step:148/1480 train_time:19848ms step_avg:143.82ms step:149/1480 train_time:19995ms step_avg:143.85ms step:150/1480 train_time:20143ms step_avg:143.88ms step:151/1480 train_time:20289ms step_avg:143.90ms step:152/1480 train_time:20435ms step_avg:143.91ms step:153/1480 train_time:20582ms step_avg:143.93ms step:154/1480 train_time:20730ms step_avg:143.96ms step:155/1480 train_time:20876ms step_avg:143.97ms step:156/1480 train_time:21023ms step_avg:143.99ms step:157/1480 train_time:21171ms step_avg:144.02ms step:158/1480 train_time:21317ms step_avg:144.03ms step:159/1480 train_time:21465ms step_avg:144.06ms step:160/1480 train_time:21611ms step_avg:144.08ms step:161/1480 train_time:21757ms step_avg:144.08ms step:162/1480 train_time:21905ms step_avg:144.11ms step:163/1480 train_time:22051ms step_avg:144.12ms step:164/1480 train_time:22197ms step_avg:144.14ms step:165/1480 train_time:22344ms step_avg:144.16ms step:166/1480 train_time:22491ms step_avg:144.17ms step:167/1480 train_time:22637ms step_avg:144.18ms step:168/1480 train_time:22783ms step_avg:144.20ms step:169/1480 train_time:22930ms step_avg:144.21ms step:170/1480 train_time:23075ms step_avg:144.22ms step:171/1480 train_time:23221ms step_avg:144.23ms step:172/1480 train_time:23369ms step_avg:144.25ms step:173/1480 train_time:23515ms step_avg:144.26ms step:174/1480 train_time:23662ms step_avg:144.28ms step:175/1480 train_time:23809ms step_avg:144.30ms step:176/1480 train_time:23955ms step_avg:144.31ms step:177/1480 train_time:24102ms step_avg:144.32ms step:178/1480 train_time:24248ms step_avg:144.33ms step:179/1480 train_time:24393ms step_avg:144.34ms step:180/1480 train_time:24540ms step_avg:144.36ms step:181/1480 train_time:24687ms step_avg:144.37ms step:182/1480 train_time:24832ms step_avg:144.37ms step:183/1480 train_time:24980ms step_avg:144.39ms step:184/1480 train_time:25128ms step_avg:144.41ms step:185/1480 train_time:25274ms step_avg:144.43ms step:186/1480 train_time:25421ms 
step_avg:144.44ms step:187/1480 train_time:25569ms step_avg:144.46ms step:188/1480 train_time:25715ms step_avg:144.46ms step:189/1480 train_time:25862ms step_avg:144.48ms step:190/1480 train_time:26009ms step_avg:144.50ms step:191/1480 train_time:26156ms step_avg:144.51ms step:192/1480 train_time:26304ms step_avg:144.53ms step:193/1480 train_time:26450ms step_avg:144.53ms step:194/1480 train_time:26595ms step_avg:144.54ms step:195/1480 train_time:26742ms step_avg:144.55ms step:196/1480 train_time:26889ms step_avg:144.56ms step:197/1480 train_time:27034ms step_avg:144.57ms step:198/1480 train_time:27182ms step_avg:144.58ms step:199/1480 train_time:27329ms step_avg:144.60ms step:200/1480 train_time:27476ms step_avg:144.61ms step:201/1480 train_time:27623ms step_avg:144.62ms step:202/1480 train_time:27770ms step_avg:144.64ms step:203/1480 train_time:27915ms step_avg:144.64ms step:204/1480 train_time:28063ms step_avg:144.66ms step:205/1480 train_time:28210ms step_avg:144.67ms step:206/1480 train_time:28357ms step_avg:144.68ms step:207/1480 train_time:28505ms step_avg:144.69ms step:208/1480 train_time:28650ms step_avg:144.70ms step:209/1480 train_time:28797ms step_avg:144.71ms step:210/1480 train_time:28946ms step_avg:144.73ms step:211/1480 train_time:29092ms step_avg:144.73ms step:212/1480 train_time:29237ms step_avg:144.74ms step:213/1480 train_time:29383ms step_avg:144.74ms step:214/1480 train_time:29530ms step_avg:144.76ms step:215/1480 train_time:29676ms step_avg:144.76ms step:216/1480 train_time:29823ms step_avg:144.77ms step:217/1480 train_time:29971ms step_avg:144.79ms step:218/1480 train_time:30115ms step_avg:144.78ms step:219/1480 train_time:30262ms step_avg:144.79ms step:220/1480 train_time:30408ms step_avg:144.80ms step:221/1480 train_time:30555ms step_avg:144.81ms step:222/1480 train_time:30706ms step_avg:144.84ms step:223/1480 train_time:30855ms step_avg:144.86ms step:224/1480 train_time:31008ms step_avg:144.90ms step:225/1480 train_time:31159ms 
step_avg:144.93ms step:226/1480 train_time:31309ms step_avg:144.95ms step:227/1480 train_time:31459ms step_avg:144.97ms step:228/1480 train_time:31610ms step_avg:145.00ms step:229/1480 train_time:31760ms step_avg:145.02ms step:230/1480 train_time:31911ms step_avg:145.05ms step:231/1480 train_time:32061ms step_avg:145.07ms step:232/1480 train_time:32212ms step_avg:145.10ms step:233/1480 train_time:32361ms step_avg:145.12ms step:234/1480 train_time:32510ms step_avg:145.14ms step:235/1480 train_time:32661ms step_avg:145.16ms step:236/1480 train_time:32813ms step_avg:145.19ms step:237/1480 train_time:32962ms step_avg:145.21ms step:238/1480 train_time:33112ms step_avg:145.23ms step:239/1480 train_time:33262ms step_avg:145.25ms step:240/1480 train_time:33412ms step_avg:145.27ms step:241/1480 train_time:33561ms step_avg:145.29ms step:242/1480 train_time:33713ms step_avg:145.31ms step:243/1480 train_time:33864ms step_avg:145.34ms step:244/1480 train_time:34015ms step_avg:145.36ms step:245/1480 train_time:34167ms step_avg:145.39ms step:246/1480 train_time:34316ms step_avg:145.41ms step:247/1480 train_time:34467ms step_avg:145.43ms step:248/1480 train_time:34617ms step_avg:145.45ms step:249/1480 train_time:34768ms step_avg:145.47ms step:250/1480 train_time:34918ms step_avg:145.49ms step:250/1480 val_loss:3.9926 train_time:34978ms step_avg:145.74ms step:251/1480 train_time:35074ms step_avg:145.54ms step:252/1480 train_time:35227ms step_avg:145.56ms step:253/1480 train_time:35375ms step_avg:145.58ms step:254/1480 train_time:35525ms step_avg:145.60ms step:255/1480 train_time:35673ms step_avg:145.61ms step:256/1480 train_time:35826ms step_avg:145.63ms step:257/1480 train_time:35973ms step_avg:145.64ms step:258/1480 train_time:36126ms step_avg:145.67ms step:259/1480 train_time:36277ms step_avg:145.69ms step:260/1480 train_time:36427ms step_avg:145.71ms step:261/1480 train_time:36576ms step_avg:145.72ms step:262/1480 train_time:36726ms step_avg:145.74ms step:263/1480 
train_time:36875ms step_avg:145.75ms step:264/1480 train_time:37026ms step_avg:145.77ms step:265/1480 train_time:37176ms step_avg:145.79ms step:266/1480 train_time:37326ms step_avg:145.80ms step:267/1480 train_time:37476ms step_avg:145.82ms step:268/1480 train_time:37626ms step_avg:145.84ms step:269/1480 train_time:37776ms step_avg:145.85ms step:270/1480 train_time:37927ms step_avg:145.87ms step:271/1480 train_time:38076ms step_avg:145.89ms step:272/1480 train_time:38230ms step_avg:145.91ms step:273/1480 train_time:38377ms step_avg:145.92ms step:274/1480 train_time:38528ms step_avg:145.94ms step:275/1480 train_time:38678ms step_avg:145.96ms step:276/1480 train_time:38829ms step_avg:145.97ms step:277/1480 train_time:38977ms step_avg:145.98ms step:278/1480 train_time:39128ms step_avg:146.00ms step:279/1480 train_time:39277ms step_avg:146.01ms step:280/1480 train_time:39429ms step_avg:146.03ms step:281/1480 train_time:39578ms step_avg:146.04ms step:282/1480 train_time:39728ms step_avg:146.06ms step:283/1480 train_time:39878ms step_avg:146.07ms step:284/1480 train_time:40028ms step_avg:146.09ms step:285/1480 train_time:40179ms step_avg:146.10ms step:286/1480 train_time:40329ms step_avg:146.12ms step:287/1480 train_time:40479ms step_avg:146.13ms step:288/1480 train_time:40629ms step_avg:146.15ms step:289/1480 train_time:40779ms step_avg:146.16ms step:290/1480 train_time:40929ms step_avg:146.18ms step:291/1480 train_time:41080ms step_avg:146.19ms step:292/1480 train_time:41230ms step_avg:146.21ms step:293/1480 train_time:41381ms step_avg:146.22ms step:294/1480 train_time:41531ms step_avg:146.24ms step:295/1480 train_time:41681ms step_avg:146.25ms step:296/1480 train_time:41832ms step_avg:146.26ms step:297/1480 train_time:41982ms step_avg:146.28ms step:298/1480 train_time:42133ms step_avg:146.30ms step:299/1480 train_time:42285ms step_avg:146.31ms step:300/1480 train_time:42435ms step_avg:146.33ms step:301/1480 train_time:42586ms step_avg:146.34ms step:302/1480 
train_time:42734ms step_avg:146.35ms step:303/1480 train_time:42885ms step_avg:146.36ms step:304/1480 train_time:43034ms step_avg:146.38ms step:305/1480 train_time:43185ms step_avg:146.39ms step:306/1480 train_time:43334ms step_avg:146.40ms step:307/1480 train_time:43486ms step_avg:146.42ms step:308/1480 train_time:43636ms step_avg:146.43ms step:309/1480 train_time:43787ms step_avg:146.44ms step:310/1480 train_time:43937ms step_avg:146.46ms step:311/1480 train_time:44087ms step_avg:146.47ms step:312/1480 train_time:44237ms step_avg:146.48ms step:313/1480 train_time:44387ms step_avg:146.49ms step:314/1480 train_time:44538ms step_avg:146.51ms step:315/1480 train_time:44688ms step_avg:146.52ms step:316/1480 train_time:44837ms step_avg:146.53ms step:317/1480 train_time:44988ms step_avg:146.54ms step:318/1480 train_time:45139ms step_avg:146.55ms step:319/1480 train_time:45290ms step_avg:146.57ms step:320/1480 train_time:45440ms step_avg:146.58ms step:321/1480 train_time:45591ms step_avg:146.60ms step:322/1480 train_time:45742ms step_avg:146.61ms step:323/1480 train_time:45893ms step_avg:146.62ms step:324/1480 train_time:46044ms step_avg:146.64ms step:325/1480 train_time:46194ms step_avg:146.65ms step:326/1480 train_time:46345ms step_avg:146.66ms step:327/1480 train_time:46495ms step_avg:146.67ms step:328/1480 train_time:46646ms step_avg:146.68ms step:329/1480 train_time:46796ms step_avg:146.70ms step:330/1480 train_time:46948ms step_avg:146.71ms step:331/1480 train_time:47102ms step_avg:146.74ms step:332/1480 train_time:47255ms step_avg:146.76ms step:333/1480 train_time:47409ms step_avg:146.78ms step:334/1480 train_time:47562ms step_avg:146.80ms step:335/1480 train_time:47716ms step_avg:146.82ms step:336/1480 train_time:47869ms step_avg:146.84ms step:337/1480 train_time:48024ms step_avg:146.86ms step:338/1480 train_time:48178ms step_avg:146.89ms step:339/1480 train_time:48332ms step_avg:146.91ms step:340/1480 train_time:48486ms step_avg:146.93ms step:341/1480 
train_time:48640ms step_avg:146.95ms step:342/1480 train_time:48794ms step_avg:146.97ms step:343/1480 train_time:48948ms step_avg:146.99ms step:344/1480 train_time:49103ms step_avg:147.02ms step:345/1480 train_time:49258ms step_avg:147.04ms step:346/1480 train_time:49412ms step_avg:147.06ms step:347/1480 train_time:49566ms step_avg:147.08ms step:348/1480 train_time:49720ms step_avg:147.10ms step:349/1480 train_time:49873ms step_avg:147.12ms step:350/1480 train_time:50028ms step_avg:147.14ms step:351/1480 train_time:50182ms step_avg:147.16ms step:352/1480 train_time:50335ms step_avg:147.18ms step:353/1480 train_time:50488ms step_avg:147.20ms step:354/1480 train_time:50640ms step_avg:147.21ms step:355/1480 train_time:50795ms step_avg:147.23ms step:356/1480 train_time:50949ms step_avg:147.25ms step:357/1480 train_time:51104ms step_avg:147.27ms step:358/1480 train_time:51257ms step_avg:147.29ms step:359/1480 train_time:51410ms step_avg:147.31ms step:360/1480 train_time:51565ms step_avg:147.33ms step:361/1480 train_time:51723ms step_avg:147.36ms step:362/1480 train_time:51875ms step_avg:147.37ms step:363/1480 train_time:52029ms step_avg:147.39ms step:364/1480 train_time:52183ms step_avg:147.41ms step:365/1480 train_time:52337ms step_avg:147.43ms step:366/1480 train_time:52490ms step_avg:147.44ms step:367/1480 train_time:52644ms step_avg:147.46ms step:368/1480 train_time:52797ms step_avg:147.48ms step:369/1480 train_time:52950ms step_avg:147.49ms step:370/1480 train_time:53104ms step_avg:147.51ms step:371/1480 train_time:53257ms step_avg:147.53ms step:372/1480 train_time:53411ms step_avg:147.54ms step:373/1480 train_time:53565ms step_avg:147.56ms step:374/1480 train_time:53718ms step_avg:147.58ms step:375/1480 train_time:53871ms step_avg:147.59ms step:375/1480 val_loss:3.8061 train_time:53930ms step_avg:147.75ms step:376/1480 train_time:54030ms step_avg:147.62ms step:377/1480 train_time:54185ms step_avg:147.64ms step:378/1480 train_time:54338ms step_avg:147.66ms 
step:379/1480 train_time:54492ms step_avg:147.67ms step:380/1480 train_time:54643ms step_avg:147.68ms step:381/1480 train_time:54796ms step_avg:147.70ms step:382/1480 train_time:54949ms step_avg:147.71ms step:383/1480 train_time:55107ms step_avg:147.74ms step:384/1480 train_time:55261ms step_avg:147.76ms step:385/1480 train_time:55413ms step_avg:147.77ms step:386/1480 train_time:55566ms step_avg:147.78ms step:387/1480 train_time:55719ms step_avg:147.80ms step:388/1480 train_time:55872ms step_avg:147.81ms step:389/1480 train_time:56026ms step_avg:147.83ms step:390/1480 train_time:56180ms step_avg:147.84ms step:391/1480 train_time:56334ms step_avg:147.86ms step:392/1480 train_time:56486ms step_avg:147.87ms step:393/1480 train_time:56639ms step_avg:147.88ms step:394/1480 train_time:56792ms step_avg:147.90ms step:395/1480 train_time:56946ms step_avg:147.91ms step:396/1480 train_time:57100ms step_avg:147.93ms step:397/1480 train_time:57254ms step_avg:147.94ms step:398/1480 train_time:57408ms step_avg:147.96ms step:399/1480 train_time:57562ms step_avg:147.97ms step:400/1480 train_time:57716ms step_avg:147.99ms step:401/1480 train_time:57869ms step_avg:148.00ms step:402/1480 train_time:58024ms step_avg:148.02ms step:403/1480 train_time:58179ms step_avg:148.04ms step:404/1480 train_time:58332ms step_avg:148.05ms step:405/1480 train_time:58485ms step_avg:148.06ms step:406/1480 train_time:58639ms step_avg:148.08ms step:407/1480 train_time:58793ms step_avg:148.09ms step:408/1480 train_time:58947ms step_avg:148.11ms step:409/1480 train_time:59101ms step_avg:148.12ms step:410/1480 train_time:59254ms step_avg:148.14ms step:411/1480 train_time:59408ms step_avg:148.15ms step:412/1480 train_time:59562ms step_avg:148.16ms step:413/1480 train_time:59715ms step_avg:148.18ms step:414/1480 train_time:59869ms step_avg:148.19ms step:415/1480 train_time:60025ms step_avg:148.21ms step:416/1480 train_time:60178ms step_avg:148.22ms step:417/1480 train_time:60332ms step_avg:148.24ms 
step:418/1480 train_time:60485ms step_avg:148.25ms step:419/1480 train_time:60638ms step_avg:148.26ms step:420/1480 train_time:60791ms step_avg:148.27ms step:421/1480 train_time:60946ms step_avg:148.29ms step:422/1480 train_time:61099ms step_avg:148.30ms step:423/1480 train_time:61252ms step_avg:148.31ms step:424/1480 train_time:61407ms step_avg:148.33ms step:425/1480 train_time:61562ms step_avg:148.34ms step:426/1480 train_time:61715ms step_avg:148.35ms step:427/1480 train_time:61868ms step_avg:148.37ms step:428/1480 train_time:62022ms step_avg:148.38ms step:429/1480 train_time:62176ms step_avg:148.39ms step:430/1480 train_time:62330ms step_avg:148.40ms step:431/1480 train_time:62483ms step_avg:148.42ms step:432/1480 train_time:62635ms step_avg:148.42ms step:433/1480 train_time:62789ms step_avg:148.44ms step:434/1480 train_time:62943ms step_avg:148.45ms step:435/1480 train_time:63097ms step_avg:148.46ms step:436/1480 train_time:63251ms step_avg:148.48ms step:437/1480 train_time:63404ms step_avg:148.49ms step:438/1480 train_time:63557ms step_avg:148.50ms step:439/1480 train_time:63713ms step_avg:148.51ms step:440/1480 train_time:63868ms step_avg:148.53ms step:441/1480 train_time:64026ms step_avg:148.55ms step:442/1480 train_time:64184ms step_avg:148.57ms step:443/1480 train_time:64341ms step_avg:148.59ms step:444/1480 train_time:64496ms step_avg:148.61ms step:445/1480 train_time:64652ms step_avg:148.63ms step:446/1480 train_time:64808ms step_avg:148.64ms step:447/1480 train_time:64964ms step_avg:148.66ms step:448/1480 train_time:65120ms step_avg:148.68ms step:449/1480 train_time:65278ms step_avg:148.70ms step:450/1480 train_time:65435ms step_avg:148.72ms step:451/1480 train_time:65593ms step_avg:148.74ms step:452/1480 train_time:65748ms step_avg:148.75ms step:453/1480 train_time:65905ms step_avg:148.77ms step:454/1480 train_time:66061ms step_avg:148.79ms step:455/1480 train_time:66217ms step_avg:148.80ms step:456/1480 train_time:66374ms step_avg:148.82ms 
step:457/1480 train_time:66531ms step_avg:148.84ms step:458/1480 train_time:66688ms step_avg:148.86ms step:459/1480 train_time:66846ms step_avg:148.88ms step:460/1480 train_time:67002ms step_avg:148.89ms step:461/1480 train_time:67160ms step_avg:148.91ms step:462/1480 train_time:67317ms step_avg:148.93ms step:463/1480 train_time:67473ms step_avg:148.95ms step:464/1480 train_time:67630ms step_avg:148.97ms step:465/1480 train_time:67787ms step_avg:148.98ms step:466/1480 train_time:67944ms step_avg:149.00ms step:467/1480 train_time:68102ms step_avg:149.02ms step:468/1480 train_time:68259ms step_avg:149.04ms step:469/1480 train_time:68416ms step_avg:149.05ms step:470/1480 train_time:68573ms step_avg:149.07ms step:471/1480 train_time:68730ms step_avg:149.09ms step:472/1480 train_time:68888ms step_avg:149.11ms step:473/1480 train_time:69043ms step_avg:149.12ms step:474/1480 train_time:69199ms step_avg:149.14ms step:475/1480 train_time:69355ms step_avg:149.15ms step:476/1480 train_time:69512ms step_avg:149.17ms step:477/1480 train_time:69669ms step_avg:149.18ms step:478/1480 train_time:69827ms step_avg:149.20ms step:479/1480 train_time:69984ms step_avg:149.22ms step:480/1480 train_time:70141ms step_avg:149.24ms step:481/1480 train_time:70296ms step_avg:149.25ms step:482/1480 train_time:70452ms step_avg:149.26ms step:483/1480 train_time:70609ms step_avg:149.28ms step:484/1480 train_time:70765ms step_avg:149.29ms step:485/1480 train_time:70923ms step_avg:149.31ms step:486/1480 train_time:71080ms step_avg:149.33ms step:487/1480 train_time:71236ms step_avg:149.34ms step:488/1480 train_time:71393ms step_avg:149.36ms step:489/1480 train_time:71548ms step_avg:149.37ms step:490/1480 train_time:71705ms step_avg:149.39ms step:491/1480 train_time:71862ms step_avg:149.40ms step:492/1480 train_time:72018ms step_avg:149.42ms step:493/1480 train_time:72175ms step_avg:149.43ms step:494/1480 train_time:72333ms step_avg:149.45ms step:495/1480 train_time:72492ms step_avg:149.47ms 
step:496/1480 train_time:72649ms step_avg:149.48ms step:497/1480 train_time:72805ms step_avg:149.50ms step:498/1480 train_time:72961ms step_avg:149.51ms step:499/1480 train_time:73118ms step_avg:149.53ms step:500/1480 train_time:73276ms step_avg:149.54ms step:500/1480 val_loss:3.6820 train_time:73338ms step_avg:149.67ms step:501/1480 train_time:73436ms step_avg:149.56ms step:502/1480 train_time:73593ms step_avg:149.58ms step:503/1480 train_time:73749ms step_avg:149.59ms step:504/1480 train_time:73905ms step_avg:149.61ms step:505/1480 train_time:74061ms step_avg:149.62ms step:506/1480 train_time:74216ms step_avg:149.63ms step:507/1480 train_time:74373ms step_avg:149.64ms step:508/1480 train_time:74532ms step_avg:149.66ms step:509/1480 train_time:74690ms step_avg:149.68ms step:510/1480 train_time:74848ms step_avg:149.70ms step:511/1480 train_time:75004ms step_avg:149.71ms step:512/1480 train_time:75162ms step_avg:149.73ms step:513/1480 train_time:75318ms step_avg:149.74ms step:514/1480 train_time:75475ms step_avg:149.75ms step:515/1480 train_time:75631ms step_avg:149.76ms step:516/1480 train_time:75791ms step_avg:149.79ms step:517/1480 train_time:75950ms step_avg:149.80ms step:518/1480 train_time:76109ms step_avg:149.82ms step:519/1480 train_time:76268ms step_avg:149.84ms step:520/1480 train_time:76426ms step_avg:149.85ms step:521/1480 train_time:76582ms step_avg:149.87ms step:522/1480 train_time:76738ms step_avg:149.88ms step:523/1480 train_time:76895ms step_avg:149.89ms step:524/1480 train_time:77051ms step_avg:149.90ms step:525/1480 train_time:77210ms step_avg:149.92ms step:526/1480 train_time:77367ms step_avg:149.94ms step:527/1480 train_time:77523ms step_avg:149.95ms step:528/1480 train_time:77678ms step_avg:149.96ms step:529/1480 train_time:77835ms step_avg:149.97ms step:530/1480 train_time:77991ms step_avg:149.98ms step:531/1480 train_time:78149ms step_avg:150.00ms step:532/1480 train_time:78306ms step_avg:150.01ms step:533/1480 train_time:78462ms 
step_avg:150.02ms step:534/1480 train_time:78617ms step_avg:150.03ms step:535/1480 train_time:78774ms step_avg:150.05ms step:536/1480 train_time:78933ms step_avg:150.06ms step:537/1480 train_time:79090ms step_avg:150.08ms step:538/1480 train_time:79248ms step_avg:150.09ms step:539/1480 train_time:79406ms step_avg:150.11ms step:540/1480 train_time:79563ms step_avg:150.12ms step:541/1480 train_time:79718ms step_avg:150.13ms step:542/1480 train_time:79874ms step_avg:150.14ms step:543/1480 train_time:80031ms step_avg:150.15ms step:544/1480 train_time:80189ms step_avg:150.17ms step:545/1480 train_time:80345ms step_avg:150.18ms step:546/1480 train_time:80502ms step_avg:150.19ms step:547/1480 train_time:80658ms step_avg:150.20ms step:548/1480 train_time:80816ms step_avg:150.22ms step:549/1480 train_time:80972ms step_avg:150.23ms step:550/1480 train_time:81131ms step_avg:150.24ms step:551/1480 train_time:81289ms step_avg:150.26ms step:552/1480 train_time:81448ms step_avg:150.27ms step:553/1480 train_time:81608ms step_avg:150.29ms step:554/1480 train_time:81769ms step_avg:150.31ms step:555/1480 train_time:81929ms step_avg:150.33ms step:556/1480 train_time:82089ms step_avg:150.35ms step:557/1480 train_time:82249ms step_avg:150.36ms step:558/1480 train_time:82409ms step_avg:150.38ms step:559/1480 train_time:82569ms step_avg:150.40ms step:560/1480 train_time:82728ms step_avg:150.41ms step:561/1480 train_time:82886ms step_avg:150.43ms step:562/1480 train_time:83045ms step_avg:150.44ms step:563/1480 train_time:83203ms step_avg:150.46ms step:564/1480 train_time:83363ms step_avg:150.47ms step:565/1480 train_time:83519ms step_avg:150.49ms step:566/1480 train_time:83679ms step_avg:150.50ms step:567/1480 train_time:83837ms step_avg:150.51ms step:568/1480 train_time:83993ms step_avg:150.53ms step:569/1480 train_time:84153ms step_avg:150.54ms step:570/1480 train_time:84314ms step_avg:150.56ms step:571/1480 train_time:84473ms step_avg:150.58ms step:572/1480 train_time:84633ms 
step_avg:150.59ms step:573/1480 train_time:84794ms step_avg:150.61ms step:574/1480 train_time:84955ms step_avg:150.63ms step:575/1480 train_time:85115ms step_avg:150.65ms step:576/1480 train_time:85274ms step_avg:150.66ms step:577/1480 train_time:85434ms step_avg:150.68ms step:578/1480 train_time:85593ms step_avg:150.69ms step:579/1480 train_time:85753ms step_avg:150.71ms step:580/1480 train_time:85913ms step_avg:150.73ms step:581/1480 train_time:86072ms step_avg:150.74ms step:582/1480 train_time:86233ms step_avg:150.76ms step:583/1480 train_time:86393ms step_avg:150.77ms step:584/1480 train_time:86553ms step_avg:150.79ms step:585/1480 train_time:86713ms step_avg:150.81ms step:586/1480 train_time:86873ms step_avg:150.82ms step:587/1480 train_time:87032ms step_avg:150.84ms step:588/1480 train_time:87192ms step_avg:150.85ms step:589/1480 train_time:87353ms step_avg:150.87ms step:590/1480 train_time:87514ms step_avg:150.89ms step:591/1480 train_time:87673ms step_avg:150.90ms step:592/1480 train_time:87832ms step_avg:150.91ms step:593/1480 train_time:87994ms step_avg:150.93ms step:594/1480 train_time:88154ms step_avg:150.95ms step:595/1480 train_time:88315ms step_avg:150.97ms step:596/1480 train_time:88476ms step_avg:150.98ms step:597/1480 train_time:88635ms step_avg:151.00ms step:598/1480 train_time:88794ms step_avg:151.01ms step:599/1480 train_time:88952ms step_avg:151.02ms step:600/1480 train_time:89113ms step_avg:151.04ms step:601/1480 train_time:89272ms step_avg:151.05ms step:602/1480 train_time:89431ms step_avg:151.07ms step:603/1480 train_time:89593ms step_avg:151.08ms step:604/1480 train_time:89752ms step_avg:151.10ms step:605/1480 train_time:89912ms step_avg:151.11ms step:606/1480 train_time:90074ms step_avg:151.13ms step:607/1480 train_time:90236ms step_avg:151.15ms step:608/1480 train_time:90396ms step_avg:151.16ms step:609/1480 train_time:90555ms step_avg:151.18ms step:610/1480 train_time:90714ms step_avg:151.19ms step:611/1480 train_time:90875ms 
step_avg:151.21ms step:612/1480 train_time:91034ms step_avg:151.22ms step:613/1480 train_time:91194ms step_avg:151.23ms step:614/1480 train_time:91354ms step_avg:151.25ms step:615/1480 train_time:91513ms step_avg:151.26ms step:616/1480 train_time:91673ms step_avg:151.27ms step:617/1480 train_time:91832ms step_avg:151.29ms step:618/1480 train_time:91991ms step_avg:151.30ms step:619/1480 train_time:92150ms step_avg:151.31ms step:620/1480 train_time:92312ms step_avg:151.33ms step:621/1480 train_time:92472ms step_avg:151.34ms step:622/1480 train_time:92632ms step_avg:151.36ms step:623/1480 train_time:92793ms step_avg:151.38ms step:624/1480 train_time:92953ms step_avg:151.39ms step:625/1480 train_time:93113ms step_avg:151.40ms step:625/1480 val_loss:3.6035 train_time:93176ms step_avg:151.51ms step:626/1480 train_time:93275ms step_avg:151.42ms step:627/1480 train_time:93434ms step_avg:151.43ms step:628/1480 train_time:93592ms step_avg:151.44ms step:629/1480 train_time:93751ms step_avg:151.46ms step:630/1480 train_time:93909ms step_avg:151.47ms step:631/1480 train_time:94068ms step_avg:151.48ms step:632/1480 train_time:94227ms step_avg:151.49ms step:633/1480 train_time:94388ms step_avg:151.51ms step:634/1480 train_time:94548ms step_avg:151.52ms step:635/1480 train_time:94707ms step_avg:151.53ms step:636/1480 train_time:94866ms step_avg:151.54ms step:637/1480 train_time:95026ms step_avg:151.56ms step:638/1480 train_time:95186ms step_avg:151.57ms step:639/1480 train_time:95345ms step_avg:151.58ms step:640/1480 train_time:95506ms step_avg:151.60ms step:641/1480 train_time:95667ms step_avg:151.61ms step:642/1480 train_time:95826ms step_avg:151.62ms step:643/1480 train_time:95986ms step_avg:151.64ms step:644/1480 train_time:96146ms step_avg:151.65ms step:645/1480 train_time:96305ms step_avg:151.66ms step:646/1480 train_time:96466ms step_avg:151.68ms step:647/1480 train_time:96627ms step_avg:151.69ms step:648/1480 train_time:96788ms step_avg:151.71ms step:649/1480 
train_time:96947ms step_avg:151.72ms step:650/1480 train_time:97106ms step_avg:151.73ms step:651/1480 train_time:97267ms step_avg:151.74ms step:652/1480 train_time:97426ms step_avg:151.75ms step:653/1480 train_time:97585ms step_avg:151.77ms step:654/1480 train_time:97746ms step_avg:151.78ms step:655/1480 train_time:97906ms step_avg:151.79ms step:656/1480 train_time:98066ms step_avg:151.80ms step:657/1480 train_time:98225ms step_avg:151.82ms step:658/1480 train_time:98386ms step_avg:151.83ms step:659/1480 train_time:98548ms step_avg:151.85ms step:660/1480 train_time:98710ms step_avg:151.86ms step:661/1480 train_time:98871ms step_avg:151.88ms step:662/1480 train_time:99031ms step_avg:151.89ms step:663/1480 train_time:99191ms step_avg:151.90ms step:664/1480 train_time:99352ms step_avg:151.91ms step:665/1480 train_time:99514ms step_avg:151.93ms step:666/1480 train_time:99675ms step_avg:151.94ms step:667/1480 train_time:99836ms step_avg:151.96ms step:668/1480 train_time:99998ms step_avg:151.97ms step:669/1480 train_time:100160ms step_avg:151.99ms step:670/1480 train_time:100320ms step_avg:152.00ms step:671/1480 train_time:100482ms step_avg:152.02ms step:672/1480 train_time:100643ms step_avg:152.03ms step:673/1480 train_time:100807ms step_avg:152.05ms step:674/1480 train_time:100968ms step_avg:152.06ms step:675/1480 train_time:101130ms step_avg:152.07ms step:676/1480 train_time:101292ms step_avg:152.09ms step:677/1480 train_time:101453ms step_avg:152.10ms step:678/1480 train_time:101614ms step_avg:152.12ms step:679/1480 train_time:101775ms step_avg:152.13ms step:680/1480 train_time:101936ms step_avg:152.14ms step:681/1480 train_time:102097ms step_avg:152.16ms step:682/1480 train_time:102260ms step_avg:152.17ms step:683/1480 train_time:102422ms step_avg:152.19ms step:684/1480 train_time:102585ms step_avg:152.20ms step:685/1480 train_time:102747ms step_avg:152.22ms step:686/1480 train_time:102911ms step_avg:152.23ms step:687/1480 train_time:103071ms step_avg:152.25ms 
step:688/1480 train_time:103234ms step_avg:152.26ms step:689/1480 train_time:103395ms step_avg:152.28ms step:690/1480 train_time:103559ms step_avg:152.29ms step:691/1480 train_time:103722ms step_avg:152.31ms step:692/1480 train_time:103884ms step_avg:152.32ms step:693/1480 train_time:104047ms step_avg:152.34ms step:694/1480 train_time:104209ms step_avg:152.35ms step:695/1480 train_time:104369ms step_avg:152.36ms step:696/1480 train_time:104529ms step_avg:152.37ms step:697/1480 train_time:104691ms step_avg:152.39ms step:698/1480 train_time:104852ms step_avg:152.40ms step:699/1480 train_time:105014ms step_avg:152.41ms step:700/1480 train_time:105175ms step_avg:152.43ms step:701/1480 train_time:105334ms step_avg:152.44ms step:702/1480 train_time:105494ms step_avg:152.45ms step:703/1480 train_time:105654ms step_avg:152.46ms step:704/1480 train_time:105814ms step_avg:152.47ms step:705/1480 train_time:105976ms step_avg:152.48ms step:706/1480 train_time:106141ms step_avg:152.50ms step:707/1480 train_time:106302ms step_avg:152.51ms step:708/1480 train_time:106463ms step_avg:152.53ms step:709/1480 train_time:106626ms step_avg:152.54ms step:710/1480 train_time:106788ms step_avg:152.55ms step:711/1480 train_time:106950ms step_avg:152.57ms step:712/1480 train_time:107113ms step_avg:152.58ms step:713/1480 train_time:107277ms step_avg:152.60ms step:714/1480 train_time:107438ms step_avg:152.61ms step:715/1480 train_time:107597ms step_avg:152.62ms step:716/1480 train_time:107757ms step_avg:152.63ms step:717/1480 train_time:107921ms step_avg:152.65ms step:718/1480 train_time:108082ms step_avg:152.66ms step:719/1480 train_time:108241ms step_avg:152.67ms step:720/1480 train_time:108404ms step_avg:152.68ms step:721/1480 train_time:108567ms step_avg:152.70ms step:722/1480 train_time:108729ms step_avg:152.71ms step:723/1480 train_time:108890ms step_avg:152.72ms step:724/1480 train_time:109051ms step_avg:152.73ms step:725/1480 train_time:109214ms step_avg:152.75ms step:726/1480 
train_time:109378ms step_avg:152.76ms step:727/1480 train_time:109542ms step_avg:152.78ms step:728/1480 train_time:109703ms step_avg:152.79ms step:729/1480 train_time:109865ms step_avg:152.80ms step:730/1480 train_time:110028ms step_avg:152.82ms step:731/1480 train_time:110190ms step_avg:152.83ms step:732/1480 train_time:110350ms step_avg:152.84ms step:733/1480 train_time:110510ms step_avg:152.85ms step:734/1480 train_time:110670ms step_avg:152.86ms step:735/1480 train_time:110830ms step_avg:152.87ms step:736/1480 train_time:110991ms step_avg:152.88ms step:737/1480 train_time:111152ms step_avg:152.89ms step:738/1480 train_time:111313ms step_avg:152.90ms step:739/1480 train_time:111473ms step_avg:152.91ms step:740/1480 train_time:111637ms step_avg:152.93ms step:741/1480 train_time:111799ms step_avg:152.94ms step:742/1480 train_time:111962ms step_avg:152.95ms step:743/1480 train_time:112123ms step_avg:152.97ms step:744/1480 train_time:112288ms step_avg:152.98ms step:745/1480 train_time:112452ms step_avg:153.00ms step:746/1480 train_time:112611ms step_avg:153.00ms step:747/1480 train_time:112773ms step_avg:153.02ms step:748/1480 train_time:112939ms step_avg:153.03ms step:749/1480 train_time:113102ms step_avg:153.05ms step:750/1480 train_time:113262ms step_avg:153.06ms step:750/1480 val_loss:3.5463 train_time:113326ms step_avg:153.14ms step:751/1480 train_time:113427ms step_avg:153.07ms step:752/1480 train_time:113588ms step_avg:153.08ms step:753/1480 train_time:113749ms step_avg:153.09ms step:754/1480 train_time:113909ms step_avg:153.10ms step:755/1480 train_time:114071ms step_avg:153.12ms step:756/1480 train_time:114233ms step_avg:153.13ms step:757/1480 train_time:114398ms step_avg:153.14ms step:758/1480 train_time:114559ms step_avg:153.15ms step:759/1480 train_time:114722ms step_avg:153.17ms step:760/1480 train_time:114884ms step_avg:153.18ms step:761/1480 train_time:115047ms step_avg:153.19ms step:762/1480 train_time:115208ms step_avg:153.20ms step:763/1480 
train_time:115370ms step_avg:153.21ms step:764/1480 train_time:115531ms step_avg:153.22ms step:765/1480 train_time:115692ms step_avg:153.23ms step:766/1480 train_time:115854ms step_avg:153.25ms step:767/1480 train_time:116014ms step_avg:153.25ms step:768/1480 train_time:116175ms step_avg:153.26ms step:769/1480 train_time:116340ms step_avg:153.28ms step:770/1480 train_time:116502ms step_avg:153.29ms step:771/1480 train_time:116667ms step_avg:153.31ms step:772/1480 train_time:116828ms step_avg:153.32ms step:773/1480 train_time:116990ms step_avg:153.33ms step:774/1480 train_time:117151ms step_avg:153.34ms step:775/1480 train_time:117312ms step_avg:153.35ms step:776/1480 train_time:117478ms step_avg:153.37ms step:777/1480 train_time:117646ms step_avg:153.39ms step:778/1480 train_time:117809ms step_avg:153.40ms step:779/1480 train_time:117971ms step_avg:153.41ms step:780/1480 train_time:118133ms step_avg:153.42ms step:781/1480 train_time:118296ms step_avg:153.43ms step:782/1480 train_time:118459ms step_avg:153.44ms step:783/1480 train_time:118621ms step_avg:153.46ms step:784/1480 train_time:118785ms step_avg:153.47ms step:785/1480 train_time:118948ms step_avg:153.48ms step:786/1480 train_time:119112ms step_avg:153.49ms step:787/1480 train_time:119275ms step_avg:153.51ms step:788/1480 train_time:119440ms step_avg:153.52ms step:789/1480 train_time:119603ms step_avg:153.53ms step:790/1480 train_time:119768ms step_avg:153.55ms step:791/1480 train_time:119935ms step_avg:153.57ms step:792/1480 train_time:120100ms step_avg:153.58ms step:793/1480 train_time:120262ms step_avg:153.59ms step:794/1480 train_time:120428ms step_avg:153.61ms step:795/1480 train_time:120593ms step_avg:153.62ms step:796/1480 train_time:120757ms step_avg:153.63ms step:797/1480 train_time:120922ms step_avg:153.65ms step:798/1480 train_time:121085ms step_avg:153.66ms step:799/1480 train_time:121251ms step_avg:153.68ms step:800/1480 train_time:121414ms step_avg:153.69ms step:801/1480 train_time:121576ms 
step_avg:153.70ms step:802/1480 train_time:121745ms step_avg:153.72ms step:803/1480 train_time:121908ms step_avg:153.73ms step:804/1480 train_time:122069ms step_avg:153.74ms step:805/1480 train_time:122233ms step_avg:153.75ms step:806/1480 train_time:122395ms step_avg:153.76ms step:807/1480 train_time:122556ms step_avg:153.77ms step:808/1480 train_time:122719ms step_avg:153.78ms step:809/1480 train_time:122882ms step_avg:153.79ms step:810/1480 train_time:123045ms step_avg:153.81ms step:811/1480 train_time:123208ms step_avg:153.82ms step:812/1480 train_time:123371ms step_avg:153.83ms step:813/1480 train_time:123532ms step_avg:153.84ms step:814/1480 train_time:123693ms step_avg:153.85ms step:815/1480 train_time:123856ms step_avg:153.86ms step:816/1480 train_time:124024ms step_avg:153.88ms step:817/1480 train_time:124186ms step_avg:153.89ms step:818/1480 train_time:124347ms step_avg:153.89ms step:819/1480 train_time:124511ms step_avg:153.91ms step:820/1480 train_time:124673ms step_avg:153.92ms step:821/1480 train_time:124833ms step_avg:153.93ms step:822/1480 train_time:124999ms step_avg:153.94ms step:823/1480 train_time:125161ms step_avg:153.95ms step:824/1480 train_time:125325ms step_avg:153.96ms step:825/1480 train_time:125489ms step_avg:153.97ms step:826/1480 train_time:125658ms step_avg:153.99ms step:827/1480 train_time:125823ms step_avg:154.01ms step:828/1480 train_time:125986ms step_avg:154.02ms step:829/1480 train_time:126150ms step_avg:154.03ms step:830/1480 train_time:126313ms step_avg:154.04ms step:831/1480 train_time:126478ms step_avg:154.05ms step:832/1480 train_time:126643ms step_avg:154.07ms step:833/1480 train_time:126807ms step_avg:154.08ms step:834/1480 train_time:126970ms step_avg:154.09ms step:835/1480 train_time:127132ms step_avg:154.10ms step:836/1480 train_time:127298ms step_avg:154.11ms step:837/1480 train_time:127461ms step_avg:154.12ms step:838/1480 train_time:127626ms step_avg:154.14ms step:839/1480 train_time:127787ms step_avg:154.15ms 
step:840/1480 train_time:127948ms step_avg:154.15ms step:841/1480 train_time:128109ms step_avg:154.16ms step:842/1480 train_time:128273ms step_avg:154.17ms step:843/1480 train_time:128434ms step_avg:154.18ms step:844/1480 train_time:128596ms step_avg:154.19ms step:845/1480 train_time:128761ms step_avg:154.20ms step:846/1480 train_time:128926ms step_avg:154.22ms step:847/1480 train_time:129088ms step_avg:154.23ms step:848/1480 train_time:129250ms step_avg:154.24ms step:849/1480 train_time:129411ms step_avg:154.24ms step:850/1480 train_time:129574ms step_avg:154.25ms step:851/1480 train_time:129741ms step_avg:154.27ms step:852/1480 train_time:129904ms step_avg:154.28ms step:853/1480 train_time:130067ms step_avg:154.29ms step:854/1480 train_time:130229ms step_avg:154.30ms step:855/1480 train_time:130392ms step_avg:154.31ms step:856/1480 train_time:130553ms step_avg:154.32ms step:857/1480 train_time:130718ms step_avg:154.33ms step:858/1480 train_time:130884ms step_avg:154.34ms step:859/1480 train_time:131049ms step_avg:154.36ms step:860/1480 train_time:131210ms step_avg:154.37ms step:861/1480 train_time:131377ms step_avg:154.38ms step:862/1480 train_time:131547ms step_avg:154.40ms step:863/1480 train_time:131713ms step_avg:154.41ms step:864/1480 train_time:131877ms step_avg:154.42ms step:865/1480 train_time:132039ms step_avg:154.43ms step:866/1480 train_time:132206ms step_avg:154.45ms step:867/1480 train_time:132369ms step_avg:154.46ms step:868/1480 train_time:132532ms step_avg:154.47ms step:869/1480 train_time:132693ms step_avg:154.47ms step:870/1480 train_time:132857ms step_avg:154.49ms step:871/1480 train_time:133023ms step_avg:154.50ms step:872/1480 train_time:133186ms step_avg:154.51ms step:873/1480 train_time:133349ms step_avg:154.52ms step:874/1480 train_time:133515ms step_avg:154.53ms step:875/1480 train_time:133681ms step_avg:154.54ms step:875/1480 val_loss:3.5019 train_time:133747ms step_avg:154.62ms step:876/1480 train_time:133850ms step_avg:154.56ms 
step:877/1480 train_time:134017ms step_avg:154.58ms step:878/1480 train_time:134178ms step_avg:154.58ms step:879/1480 train_time:134341ms step_avg:154.59ms step:880/1480 train_time:134503ms step_avg:154.60ms step:881/1480 train_time:134666ms step_avg:154.61ms step:882/1480 train_time:134832ms step_avg:154.62ms step:883/1480 train_time:134999ms step_avg:154.64ms step:884/1480 train_time:135165ms step_avg:154.65ms step:885/1480 train_time:135330ms step_avg:154.66ms step:886/1480 train_time:135496ms step_avg:154.68ms step:887/1480 train_time:135662ms step_avg:154.69ms step:888/1480 train_time:135835ms step_avg:154.71ms step:889/1480 train_time:136002ms step_avg:154.72ms step:890/1480 train_time:136164ms step_avg:154.73ms step:891/1480 train_time:136329ms step_avg:154.74ms step:892/1480 train_time:136495ms step_avg:154.76ms step:893/1480 train_time:136658ms step_avg:154.77ms step:894/1480 train_time:136824ms step_avg:154.78ms step:895/1480 train_time:136992ms step_avg:154.79ms step:896/1480 train_time:137157ms step_avg:154.80ms step:897/1480 train_time:137322ms step_avg:154.82ms step:898/1480 train_time:137490ms step_avg:154.83ms step:899/1480 train_time:137655ms step_avg:154.84ms step:900/1480 train_time:137819ms step_avg:154.85ms step:901/1480 train_time:137982ms step_avg:154.86ms step:902/1480 train_time:138145ms step_avg:154.87ms step:903/1480 train_time:138319ms step_avg:154.89ms step:904/1480 train_time:138484ms step_avg:154.90ms step:905/1480 train_time:138647ms step_avg:154.91ms step:906/1480 train_time:138814ms step_avg:154.93ms step:907/1480 train_time:138981ms step_avg:154.94ms step:908/1480 train_time:139142ms step_avg:154.95ms step:909/1480 train_time:139307ms step_avg:154.96ms step:910/1480 train_time:139477ms step_avg:154.97ms step:911/1480 train_time:139641ms step_avg:154.98ms step:912/1480 train_time:139809ms step_avg:155.00ms step:913/1480 train_time:139977ms step_avg:155.01ms step:914/1480 train_time:140144ms step_avg:155.03ms step:915/1480 
train_time:140316ms step_avg:155.05ms step:916/1480 train_time:140480ms step_avg:155.06ms step:917/1480 train_time:140643ms step_avg:155.06ms step:918/1480 train_time:140812ms step_avg:155.08ms step:919/1480 train_time:140982ms step_avg:155.10ms step:920/1480 train_time:141148ms step_avg:155.11ms step:921/1480 train_time:141314ms step_avg:155.12ms step:922/1480 train_time:141481ms step_avg:155.13ms step:923/1480 train_time:141645ms step_avg:155.14ms step:924/1480 train_time:141810ms step_avg:155.15ms step:925/1480 train_time:141975ms step_avg:155.16ms step:926/1480 train_time:142138ms step_avg:155.17ms step:927/1480 train_time:142302ms step_avg:155.18ms step:928/1480 train_time:142470ms step_avg:155.20ms step:929/1480 train_time:142634ms step_avg:155.21ms step:930/1480 train_time:142801ms step_avg:155.22ms step:931/1480 train_time:142964ms step_avg:155.23ms step:932/1480 train_time:143131ms step_avg:155.24ms step:933/1480 train_time:143298ms step_avg:155.25ms step:934/1480 train_time:143464ms step_avg:155.26ms step:935/1480 train_time:143635ms step_avg:155.28ms step:936/1480 train_time:143803ms step_avg:155.29ms step:937/1480 train_time:143973ms step_avg:155.31ms step:938/1480 train_time:144136ms step_avg:155.32ms step:939/1480 train_time:144304ms step_avg:155.33ms step:940/1480 train_time:144472ms step_avg:155.35ms step:941/1480 train_time:144636ms step_avg:155.36ms step:942/1480 train_time:144801ms step_avg:155.37ms step:943/1480 train_time:144970ms step_avg:155.38ms step:944/1480 train_time:145142ms step_avg:155.40ms step:945/1480 train_time:145307ms step_avg:155.41ms step:946/1480 train_time:145478ms step_avg:155.42ms step:947/1480 train_time:145645ms step_avg:155.44ms step:948/1480 train_time:145811ms step_avg:155.45ms step:949/1480 train_time:145976ms step_avg:155.46ms step:950/1480 train_time:146140ms step_avg:155.47ms step:951/1480 train_time:146308ms step_avg:155.48ms step:952/1480 train_time:146475ms step_avg:155.49ms step:953/1480 train_time:146643ms 
step_avg:155.51ms step:954/1480 train_time:146812ms step_avg:155.52ms step:955/1480 train_time:146976ms step_avg:155.53ms step:956/1480 train_time:147140ms step_avg:155.54ms step:957/1480 train_time:147308ms step_avg:155.55ms step:958/1480 train_time:147478ms step_avg:155.57ms step:959/1480 train_time:147642ms step_avg:155.58ms step:960/1480 train_time:147808ms step_avg:155.59ms step:961/1480 train_time:147975ms step_avg:155.60ms step:962/1480 train_time:148138ms step_avg:155.61ms step:963/1480 train_time:148304ms step_avg:155.62ms step:964/1480 train_time:148473ms step_avg:155.63ms step:965/1480 train_time:148636ms step_avg:155.64ms step:966/1480 train_time:148801ms step_avg:155.65ms step:967/1480 train_time:148965ms step_avg:155.66ms step:968/1480 train_time:149133ms step_avg:155.67ms step:969/1480 train_time:149299ms step_avg:155.68ms step:970/1480 train_time:149461ms step_avg:155.69ms step:971/1480 train_time:149626ms step_avg:155.70ms step:972/1480 train_time:149793ms step_avg:155.71ms step:973/1480 train_time:149957ms step_avg:155.72ms step:974/1480 train_time:150125ms step_avg:155.73ms step:975/1480 train_time:150290ms step_avg:155.74ms step:976/1480 train_time:150456ms step_avg:155.75ms step:977/1480 train_time:150620ms step_avg:155.76ms step:978/1480 train_time:150783ms step_avg:155.77ms step:979/1480 train_time:150949ms step_avg:155.78ms step:980/1480 train_time:151115ms step_avg:155.79ms step:981/1480 train_time:151281ms step_avg:155.80ms step:982/1480 train_time:151443ms step_avg:155.81ms step:983/1480 train_time:151610ms step_avg:155.82ms step:984/1480 train_time:151774ms step_avg:155.83ms step:985/1480 train_time:151941ms step_avg:155.84ms step:986/1480 train_time:152105ms step_avg:155.85ms step:987/1480 train_time:152269ms step_avg:155.85ms step:988/1480 train_time:152437ms step_avg:155.87ms step:989/1480 train_time:152602ms step_avg:155.88ms step:990/1480 train_time:152772ms step_avg:155.89ms step:991/1480 train_time:152939ms step_avg:155.90ms 
step:992/1480 train_time:153115ms step_avg:155.92ms step:993/1480 train_time:153293ms step_avg:155.94ms step:994/1480 train_time:153459ms step_avg:155.95ms step:995/1480 train_time:153622ms step_avg:155.96ms step:996/1480 train_time:153784ms step_avg:155.97ms step:997/1480 train_time:153950ms step_avg:155.98ms step:998/1480 train_time:154114ms step_avg:155.99ms step:999/1480 train_time:154279ms step_avg:156.00ms step:1000/1480 train_time:154449ms step_avg:156.01ms step:1000/1480 val_loss:3.4381 train_time:154516ms step_avg:156.08ms step:1001/1480 train_time:154617ms step_avg:156.02ms step:1002/1480 train_time:154783ms step_avg:156.03ms step:1003/1480 train_time:154955ms step_avg:156.05ms step:1004/1480 train_time:155124ms step_avg:156.06ms step:1005/1480 train_time:155291ms step_avg:156.07ms step:1006/1480 train_time:155457ms step_avg:156.08ms step:1007/1480 train_time:155622ms step_avg:156.09ms step:1008/1480 train_time:155791ms step_avg:156.10ms step:1009/1480 train_time:155964ms step_avg:156.12ms step:1010/1480 train_time:156130ms step_avg:156.13ms step:1011/1480 train_time:156295ms step_avg:156.14ms step:1012/1480 train_time:156459ms step_avg:156.15ms step:1013/1480 train_time:156630ms step_avg:156.16ms step:1014/1480 train_time:156795ms step_avg:156.17ms step:1015/1480 train_time:156968ms step_avg:156.19ms step:1016/1480 train_time:157136ms step_avg:156.20ms step:1017/1480 train_time:157307ms step_avg:156.21ms step:1018/1480 train_time:157475ms step_avg:156.23ms step:1019/1480 train_time:157644ms step_avg:156.24ms step:1020/1480 train_time:157813ms step_avg:156.25ms step:1021/1480 train_time:157977ms step_avg:156.26ms step:1022/1480 train_time:158144ms step_avg:156.27ms step:1023/1480 train_time:158311ms step_avg:156.28ms step:1024/1480 train_time:158476ms step_avg:156.29ms step:1025/1480 train_time:158646ms step_avg:156.30ms step:1026/1480 train_time:158812ms step_avg:156.31ms step:1027/1480 train_time:158978ms step_avg:156.32ms step:1028/1480 
train_time:159152ms step_avg:156.34ms step:1029/1480 train_time:159327ms step_avg:156.36ms step:1030/1480 train_time:159494ms step_avg:156.37ms step:1031/1480 train_time:159657ms step_avg:156.37ms step:1032/1480 train_time:159832ms step_avg:156.39ms step:1033/1480 train_time:159997ms step_avg:156.40ms step:1034/1480 train_time:160166ms step_avg:156.41ms step:1035/1480 train_time:160334ms step_avg:156.42ms step:1036/1480 train_time:160498ms step_avg:156.43ms step:1037/1480 train_time:160664ms step_avg:156.44ms step:1038/1480 train_time:160832ms step_avg:156.45ms step:1039/1480 train_time:161002ms step_avg:156.46ms step:1040/1480 train_time:161169ms step_avg:156.47ms step:1041/1480 train_time:161337ms step_avg:156.49ms step:1042/1480 train_time:161499ms step_avg:156.49ms step:1043/1480 train_time:161664ms step_avg:156.50ms step:1044/1480 train_time:161831ms step_avg:156.51ms step:1045/1480 train_time:161999ms step_avg:156.52ms step:1046/1480 train_time:162167ms step_avg:156.53ms step:1047/1480 train_time:162334ms step_avg:156.54ms step:1048/1480 train_time:162500ms step_avg:156.55ms step:1049/1480 train_time:162665ms step_avg:156.56ms step:1050/1480 train_time:162835ms step_avg:156.57ms step:1051/1480 train_time:163004ms step_avg:156.58ms step:1052/1480 train_time:163171ms step_avg:156.59ms step:1053/1480 train_time:163336ms step_avg:156.60ms step:1054/1480 train_time:163504ms step_avg:156.61ms step:1055/1480 train_time:163670ms step_avg:156.62ms step:1056/1480 train_time:163835ms step_avg:156.63ms step:1057/1480 train_time:164000ms step_avg:156.64ms step:1058/1480 train_time:164171ms step_avg:156.65ms step:1059/1480 train_time:164345ms step_avg:156.67ms step:1060/1480 train_time:164514ms step_avg:156.68ms step:1061/1480 train_time:164676ms step_avg:156.69ms step:1062/1480 train_time:164841ms step_avg:156.69ms step:1063/1480 train_time:165008ms step_avg:156.70ms step:1064/1480 train_time:165171ms step_avg:156.71ms step:1065/1480 train_time:165338ms step_avg:156.72ms 
step:1066/1480 train_time:165506ms step_avg:156.73ms step:1067/1480 train_time:165674ms step_avg:156.74ms step:1068/1480 train_time:165839ms step_avg:156.75ms step:1069/1480 train_time:166011ms step_avg:156.76ms step:1070/1480 train_time:166177ms step_avg:156.77ms step:1071/1480 train_time:166352ms step_avg:156.79ms step:1072/1480 train_time:166518ms step_avg:156.80ms step:1073/1480 train_time:166681ms step_avg:156.80ms step:1074/1480 train_time:166848ms step_avg:156.81ms step:1075/1480 train_time:167019ms step_avg:156.83ms step:1076/1480 train_time:167186ms step_avg:156.84ms step:1077/1480 train_time:167353ms step_avg:156.84ms step:1078/1480 train_time:167528ms step_avg:156.86ms step:1079/1480 train_time:167699ms step_avg:156.88ms step:1080/1480 train_time:167870ms step_avg:156.89ms step:1081/1480 train_time:168037ms step_avg:156.90ms step:1082/1480 train_time:168203ms step_avg:156.91ms step:1083/1480 train_time:168370ms step_avg:156.92ms step:1084/1480 train_time:168537ms step_avg:156.92ms step:1085/1480 train_time:168706ms step_avg:156.94ms step:1086/1480 train_time:168874ms step_avg:156.95ms step:1087/1480 train_time:169039ms step_avg:156.95ms step:1088/1480 train_time:169210ms step_avg:156.97ms step:1089/1480 train_time:169381ms step_avg:156.98ms step:1090/1480 train_time:169553ms step_avg:156.99ms step:1091/1480 train_time:169720ms step_avg:157.00ms step:1092/1480 train_time:169890ms step_avg:157.01ms step:1093/1480 train_time:170056ms step_avg:157.02ms step:1094/1480 train_time:170223ms step_avg:157.03ms step:1095/1480 train_time:170389ms step_avg:157.04ms step:1096/1480 train_time:170557ms step_avg:157.05ms step:1097/1480 train_time:170726ms step_avg:157.06ms step:1098/1480 train_time:170896ms step_avg:157.07ms step:1099/1480 train_time:171066ms step_avg:157.09ms step:1100/1480 train_time:171237ms step_avg:157.10ms step:1101/1480 train_time:171409ms step_avg:157.11ms step:1102/1480 train_time:171580ms step_avg:157.12ms step:1103/1480 train_time:171757ms 
step_avg:157.14ms step:1104/1480 train_time:171924ms step_avg:157.15ms step:1105/1480 train_time:172095ms step_avg:157.16ms step:1106/1480 train_time:172263ms step_avg:157.17ms step:1107/1480 train_time:172433ms step_avg:157.19ms step:1108/1480 train_time:172597ms step_avg:157.19ms step:1109/1480 train_time:172763ms step_avg:157.20ms step:1110/1480 train_time:172931ms step_avg:157.21ms step:1111/1480 train_time:173097ms step_avg:157.22ms step:1112/1480 train_time:173269ms step_avg:157.23ms step:1113/1480 train_time:173449ms step_avg:157.25ms step:1114/1480 train_time:173623ms step_avg:157.27ms step:1115/1480 train_time:173796ms step_avg:157.28ms step:1116/1480 train_time:173965ms step_avg:157.29ms step:1117/1480 train_time:174138ms step_avg:157.31ms step:1118/1480 train_time:174311ms step_avg:157.32ms step:1119/1480 train_time:174477ms step_avg:157.33ms step:1120/1480 train_time:174645ms step_avg:157.34ms step:1121/1480 train_time:174815ms step_avg:157.35ms step:1122/1480 train_time:174982ms step_avg:157.36ms step:1123/1480 train_time:175149ms step_avg:157.37ms step:1124/1480 train_time:175318ms step_avg:157.38ms step:1125/1480 train_time:175486ms step_avg:157.39ms step:1125/1480 val_loss:3.3836 train_time:175555ms step_avg:157.45ms step:1126/1480 train_time:175658ms step_avg:157.40ms step:1127/1480 train_time:175825ms step_avg:157.41ms step:1128/1480 train_time:175997ms step_avg:157.42ms step:1129/1480 train_time:176171ms step_avg:157.44ms step:1130/1480 train_time:176341ms step_avg:157.45ms step:1131/1480 train_time:176519ms step_avg:157.47ms step:1132/1480 train_time:176684ms step_avg:157.47ms step:1133/1480 train_time:176857ms step_avg:157.49ms step:1134/1480 train_time:177027ms step_avg:157.50ms step:1135/1480 train_time:177195ms step_avg:157.51ms step:1136/1480 train_time:177365ms step_avg:157.52ms step:1137/1480 train_time:177535ms step_avg:157.53ms step:1138/1480 train_time:177706ms step_avg:157.54ms step:1139/1480 train_time:177875ms step_avg:157.55ms 
step:1140/1480 train_time:178043ms step_avg:157.56ms step:1141/1480 train_time:178218ms step_avg:157.58ms step:1142/1480 train_time:178386ms step_avg:157.58ms step:1143/1480 train_time:178557ms step_avg:157.60ms step:1144/1480 train_time:178725ms step_avg:157.61ms step:1145/1480 train_time:178890ms step_avg:157.61ms step:1146/1480 train_time:179061ms step_avg:157.62ms step:1147/1480 train_time:179228ms step_avg:157.63ms step:1148/1480 train_time:179396ms step_avg:157.64ms step:1149/1480 train_time:179566ms step_avg:157.65ms step:1150/1480 train_time:179734ms step_avg:157.66ms step:1151/1480 train_time:179905ms step_avg:157.67ms step:1152/1480 train_time:180078ms step_avg:157.69ms step:1153/1480 train_time:180252ms step_avg:157.70ms step:1154/1480 train_time:180419ms step_avg:157.71ms step:1155/1480 train_time:180590ms step_avg:157.72ms step:1156/1480 train_time:180769ms step_avg:157.74ms step:1157/1480 train_time:180938ms step_avg:157.75ms step:1158/1480 train_time:181104ms step_avg:157.76ms step:1159/1480 train_time:181270ms step_avg:157.76ms step:1160/1480 train_time:181436ms step_avg:157.77ms step:1161/1480 train_time:181605ms step_avg:157.78ms step:1162/1480 train_time:181776ms step_avg:157.79ms step:1163/1480 train_time:181945ms step_avg:157.80ms step:1164/1480 train_time:182116ms step_avg:157.81ms step:1165/1480 train_time:182281ms step_avg:157.82ms step:1166/1480 train_time:182452ms step_avg:157.83ms step:1167/1480 train_time:182620ms step_avg:157.84ms step:1168/1480 train_time:182788ms step_avg:157.85ms step:1169/1480 train_time:182957ms step_avg:157.86ms step:1170/1480 train_time:183125ms step_avg:157.87ms step:1171/1480 train_time:183292ms step_avg:157.87ms step:1172/1480 train_time:183460ms step_avg:157.88ms step:1173/1480 train_time:183630ms step_avg:157.89ms step:1174/1480 train_time:183812ms step_avg:157.91ms step:1175/1480 train_time:183984ms step_avg:157.93ms step:1176/1480 train_time:184157ms step_avg:157.94ms step:1177/1480 train_time:184334ms 
step_avg:157.96ms step:1178/1480 train_time:184501ms step_avg:157.96ms step:1179/1480 train_time:184666ms step_avg:157.97ms step:1180/1480 train_time:184847ms step_avg:157.99ms step:1181/1480 train_time:185017ms step_avg:158.00ms step:1182/1480 train_time:185183ms step_avg:158.01ms step:1183/1480 train_time:185355ms step_avg:158.02ms step:1184/1480 train_time:185522ms step_avg:158.03ms step:1185/1480 train_time:185696ms step_avg:158.04ms step:1186/1480 train_time:185867ms step_avg:158.05ms step:1187/1480 train_time:186052ms step_avg:158.07ms step:1188/1480 train_time:186219ms step_avg:158.08ms step:1189/1480 train_time:186390ms step_avg:158.09ms step:1190/1480 train_time:186558ms step_avg:158.10ms step:1191/1480 train_time:186728ms step_avg:158.11ms step:1192/1480 train_time:186895ms step_avg:158.12ms step:1193/1480 train_time:187061ms step_avg:158.12ms step:1194/1480 train_time:187230ms step_avg:158.13ms step:1195/1480 train_time:187404ms step_avg:158.15ms step:1196/1480 train_time:187588ms step_avg:158.17ms step:1197/1480 train_time:187760ms step_avg:158.18ms step:1198/1480 train_time:187940ms step_avg:158.20ms step:1199/1480 train_time:188110ms step_avg:158.21ms step:1200/1480 train_time:188279ms step_avg:158.22ms step:1201/1480 train_time:188447ms step_avg:158.23ms step:1202/1480 train_time:188629ms step_avg:158.25ms step:1203/1480 train_time:188805ms step_avg:158.26ms step:1204/1480 train_time:188979ms step_avg:158.27ms step:1205/1480 train_time:189146ms step_avg:158.28ms step:1206/1480 train_time:189315ms step_avg:158.29ms step:1207/1480 train_time:189484ms step_avg:158.30ms step:1208/1480 train_time:189651ms step_avg:158.31ms step:1209/1480 train_time:189826ms step_avg:158.32ms step:1210/1480 train_time:190001ms step_avg:158.33ms step:1211/1480 train_time:190175ms step_avg:158.35ms step:1212/1480 train_time:190347ms step_avg:158.36ms step:1213/1480 train_time:190520ms step_avg:158.37ms step:1214/1480 train_time:190698ms step_avg:158.39ms step:1215/1480 
train_time:190871ms step_avg:158.40ms step:1216/1480 train_time:191040ms step_avg:158.41ms step:1217/1480 train_time:191213ms step_avg:158.42ms step:1218/1480 train_time:191383ms step_avg:158.43ms step:1219/1480 train_time:191563ms step_avg:158.45ms step:1220/1480 train_time:191732ms step_avg:158.46ms step:1221/1480 train_time:191901ms step_avg:158.47ms step:1222/1480 train_time:192068ms step_avg:158.47ms step:1223/1480 train_time:192240ms step_avg:158.48ms step:1224/1480 train_time:192418ms step_avg:158.50ms step:1225/1480 train_time:192591ms step_avg:158.51ms step:1226/1480 train_time:192763ms step_avg:158.52ms step:1227/1480 train_time:192937ms step_avg:158.53ms step:1228/1480 train_time:193106ms step_avg:158.54ms step:1229/1480 train_time:193280ms step_avg:158.56ms step:1230/1480 train_time:193461ms step_avg:158.57ms step:1231/1480 train_time:193637ms step_avg:158.59ms step:1232/1480 train_time:193812ms step_avg:158.60ms step:1233/1480 train_time:193981ms step_avg:158.61ms step:1234/1480 train_time:194153ms step_avg:158.62ms step:1235/1480 train_time:194325ms step_avg:158.63ms step:1236/1480 train_time:194494ms step_avg:158.64ms step:1237/1480 train_time:194665ms step_avg:158.65ms step:1238/1480 train_time:194850ms step_avg:158.67ms step:1239/1480 train_time:195020ms step_avg:158.68ms step:1240/1480 train_time:195190ms step_avg:158.69ms step:1241/1480 train_time:195362ms step_avg:158.70ms step:1242/1480 train_time:195531ms step_avg:158.71ms step:1243/1480 train_time:195704ms step_avg:158.72ms step:1244/1480 train_time:195873ms step_avg:158.73ms step:1245/1480 train_time:196041ms step_avg:158.74ms step:1246/1480 train_time:196210ms step_avg:158.75ms step:1247/1480 train_time:196380ms step_avg:158.75ms step:1248/1480 train_time:196549ms step_avg:158.76ms step:1249/1480 train_time:196719ms step_avg:158.77ms step:1250/1480 train_time:196887ms step_avg:158.78ms step:1250/1480 val_loss:3.3336 train_time:196959ms step_avg:158.84ms step:1251/1480 train_time:197066ms 
step_avg:158.80ms step:1252/1480 train_time:197236ms step_avg:158.81ms step:1253/1480 train_time:197403ms step_avg:158.81ms step:1254/1480 train_time:197576ms step_avg:158.82ms step:1255/1480 train_time:197761ms step_avg:158.84ms step:1256/1480 train_time:197936ms step_avg:158.86ms step:1257/1480 train_time:198107ms step_avg:158.87ms step:1258/1480 train_time:198282ms step_avg:158.88ms step:1259/1480 train_time:198454ms step_avg:158.89ms step:1260/1480 train_time:198621ms step_avg:158.90ms step:1261/1480 train_time:198795ms step_avg:158.91ms step:1262/1480 train_time:198972ms step_avg:158.92ms step:1263/1480 train_time:199145ms step_avg:158.93ms step:1264/1480 train_time:199313ms step_avg:158.94ms step:1265/1480 train_time:199480ms step_avg:158.95ms step:1266/1480 train_time:199653ms step_avg:158.96ms step:1267/1480 train_time:199823ms step_avg:158.97ms step:1268/1480 train_time:199995ms step_avg:158.98ms step:1269/1480 train_time:200171ms step_avg:158.99ms step:1270/1480 train_time:200340ms step_avg:159.00ms step:1271/1480 train_time:200510ms step_avg:159.01ms step:1272/1480 train_time:200676ms step_avg:159.01ms step:1273/1480 train_time:200845ms step_avg:159.02ms step:1274/1480 train_time:201017ms step_avg:159.03ms step:1275/1480 train_time:201185ms step_avg:159.04ms step:1276/1480 train_time:201352ms step_avg:159.05ms step:1277/1480 train_time:201523ms step_avg:159.06ms step:1278/1480 train_time:201693ms step_avg:159.06ms step:1279/1480 train_time:201864ms step_avg:159.07ms step:1280/1480 train_time:202042ms step_avg:159.09ms step:1281/1480 train_time:202210ms step_avg:159.09ms step:1282/1480 train_time:202377ms step_avg:159.10ms step:1283/1480 train_time:202547ms step_avg:159.11ms step:1284/1480 train_time:202717ms step_avg:159.12ms step:1285/1480 train_time:202885ms step_avg:159.13ms step:1286/1480 train_time:203056ms step_avg:159.13ms step:1287/1480 train_time:203227ms step_avg:159.14ms step:1288/1480 train_time:203399ms step_avg:159.15ms step:1289/1480 
train_time:203582ms step_avg:159.17ms step:1290/1480 train_time:203762ms step_avg:159.19ms step:1291/1480 train_time:203936ms step_avg:159.20ms step:1292/1480 train_time:204109ms step_avg:159.21ms step:1293/1480 train_time:204283ms step_avg:159.22ms step:1294/1480 train_time:204456ms step_avg:159.23ms step:1295/1480 train_time:204629ms step_avg:159.24ms step:1296/1480 train_time:204801ms step_avg:159.25ms step:1297/1480 train_time:204973ms step_avg:159.26ms step:1298/1480 train_time:205142ms step_avg:159.27ms step:1299/1480 train_time:205312ms step_avg:159.28ms step:1300/1480 train_time:205480ms step_avg:159.29ms step:1301/1480 train_time:205650ms step_avg:159.29ms step:1302/1480 train_time:205823ms step_avg:159.31ms step:1303/1480 train_time:205999ms step_avg:159.32ms step:1304/1480 train_time:206173ms step_avg:159.33ms step:1305/1480 train_time:206341ms step_avg:159.34ms step:1306/1480 train_time:206515ms step_avg:159.35ms step:1307/1480 train_time:206684ms step_avg:159.36ms step:1308/1480 train_time:206853ms step_avg:159.36ms step:1309/1480 train_time:207023ms step_avg:159.37ms step:1310/1480 train_time:207192ms step_avg:159.38ms step:1311/1480 train_time:207360ms step_avg:159.38ms step:1312/1480 train_time:207533ms step_avg:159.40ms step:1313/1480 train_time:207700ms step_avg:159.40ms step:1314/1480 train_time:207876ms step_avg:159.41ms step:1315/1480 train_time:208046ms step_avg:159.42ms step:1316/1480 train_time:208215ms step_avg:159.43ms step:1317/1480 train_time:208386ms step_avg:159.44ms step:1318/1480 train_time:208567ms step_avg:159.45ms step:1319/1480 train_time:208742ms step_avg:159.47ms step:1320/1480 train_time:208918ms step_avg:159.48ms step:1321/1480 train_time:209090ms step_avg:159.49ms step:1322/1480 train_time:209272ms step_avg:159.51ms step:1323/1480 train_time:209443ms step_avg:159.51ms step:1324/1480 train_time:209617ms step_avg:159.53ms step:1325/1480 train_time:209797ms step_avg:159.54ms step:1326/1480 train_time:209974ms step_avg:159.55ms 
step:1327/1480 train_time:210144ms step_avg:159.56ms step:1328/1480 train_time:210316ms step_avg:159.57ms step:1329/1480 train_time:210511ms step_avg:159.60ms step:1330/1480 train_time:210690ms step_avg:159.61ms step:1331/1480 train_time:210859ms step_avg:159.62ms step:1332/1480 train_time:211032ms step_avg:159.63ms step:1333/1480 train_time:211208ms step_avg:159.64ms step:1334/1480 train_time:211379ms step_avg:159.65ms step:1335/1480 train_time:211546ms step_avg:159.66ms step:1336/1480 train_time:211731ms step_avg:159.68ms step:1337/1480 train_time:211904ms step_avg:159.69ms step:1338/1480 train_time:212076ms step_avg:159.70ms step:1339/1480 train_time:212250ms step_avg:159.71ms step:1340/1480 train_time:212420ms step_avg:159.71ms step:1341/1480 train_time:212589ms step_avg:159.72ms step:1342/1480 train_time:212763ms step_avg:159.73ms step:1343/1480 train_time:212932ms step_avg:159.74ms step:1344/1480 train_time:213104ms step_avg:159.75ms step:1345/1480 train_time:213282ms step_avg:159.76ms step:1346/1480 train_time:213452ms step_avg:159.77ms step:1347/1480 train_time:213620ms step_avg:159.78ms step:1348/1480 train_time:213790ms step_avg:159.78ms step:1349/1480 train_time:213959ms step_avg:159.79ms step:1350/1480 train_time:214135ms step_avg:159.80ms step:1351/1480 train_time:214305ms step_avg:159.81ms step:1352/1480 train_time:214476ms step_avg:159.82ms step:1353/1480 train_time:214652ms step_avg:159.83ms step:1354/1480 train_time:214822ms step_avg:159.84ms step:1355/1480 train_time:214990ms step_avg:159.84ms step:1356/1480 train_time:215162ms step_avg:159.85ms step:1357/1480 train_time:215336ms step_avg:159.86ms step:1358/1480 train_time:215508ms step_avg:159.87ms step:1359/1480 train_time:215680ms step_avg:159.88ms step:1360/1480 train_time:215855ms step_avg:159.89ms step:1361/1480 train_time:216034ms step_avg:159.91ms step:1362/1480 train_time:216210ms step_avg:159.92ms step:1363/1480 train_time:216391ms step_avg:159.93ms step:1364/1480 train_time:216560ms 
step_avg:159.94ms step:1365/1480 train_time:216726ms step_avg:159.95ms step:1366/1480 train_time:216898ms step_avg:159.95ms step:1367/1480 train_time:217071ms step_avg:159.96ms step:1368/1480 train_time:217241ms step_avg:159.97ms step:1369/1480 train_time:217421ms step_avg:159.99ms step:1370/1480 train_time:217598ms step_avg:160.00ms step:1371/1480 train_time:217770ms step_avg:160.01ms step:1372/1480 train_time:217947ms step_avg:160.02ms step:1373/1480 train_time:218117ms step_avg:160.03ms step:1374/1480 train_time:218295ms step_avg:160.04ms step:1375/1480 train_time:218466ms step_avg:160.05ms step:1375/1480 val_loss:3.2949 train_time:218534ms step_avg:160.10ms step:1376/1480 train_time:218639ms step_avg:160.06ms step:1377/1480 train_time:218813ms step_avg:160.07ms step:1378/1480 train_time:218981ms step_avg:160.07ms step:1379/1480 train_time:219156ms step_avg:160.08ms step:1380/1480 train_time:219331ms step_avg:160.10ms step:1381/1480 train_time:219513ms step_avg:160.11ms step:1382/1480 train_time:219685ms step_avg:160.12ms step:1383/1480 train_time:219858ms step_avg:160.13ms step:1384/1480 train_time:220035ms step_avg:160.14ms step:1385/1480 train_time:220200ms step_avg:160.15ms step:1386/1480 train_time:220370ms step_avg:160.15ms step:1387/1480 train_time:220541ms step_avg:160.16ms step:1388/1480 train_time:220709ms step_avg:160.17ms step:1389/1480 train_time:220882ms step_avg:160.18ms step:1390/1480 train_time:221050ms step_avg:160.18ms step:1391/1480 train_time:221220ms step_avg:160.19ms step:1392/1480 train_time:221392ms step_avg:160.20ms step:1393/1480 train_time:221562ms step_avg:160.20ms step:1394/1480 train_time:221733ms step_avg:160.21ms step:1395/1480 train_time:221900ms step_avg:160.22ms step:1396/1480 train_time:222069ms step_avg:160.22ms step:1397/1480 train_time:222236ms step_avg:160.23ms step:1398/1480 train_time:222402ms step_avg:160.23ms step:1399/1480 train_time:222573ms step_avg:160.24ms step:1400/1480 train_time:222752ms step_avg:160.25ms 
step:1401/1480 train_time:222918ms step_avg:160.26ms step:1402/1480 train_time:223090ms step_avg:160.27ms step:1403/1480 train_time:223266ms step_avg:160.28ms step:1404/1480 train_time:223437ms step_avg:160.28ms step:1405/1480 train_time:223614ms step_avg:160.30ms step:1406/1480 train_time:223789ms step_avg:160.31ms step:1407/1480 train_time:223957ms step_avg:160.31ms step:1408/1480 train_time:224124ms step_avg:160.32ms step:1409/1480 train_time:224306ms step_avg:160.33ms step:1410/1480 train_time:224476ms step_avg:160.34ms step:1411/1480 train_time:224642ms step_avg:160.34ms step:1412/1480 train_time:224813ms step_avg:160.35ms step:1413/1480 train_time:224983ms step_avg:160.36ms step:1414/1480 train_time:225156ms step_avg:160.37ms step:1415/1480 train_time:225330ms step_avg:160.38ms step:1416/1480 train_time:225515ms step_avg:160.39ms step:1417/1480 train_time:225690ms step_avg:160.41ms step:1418/1480 train_time:225860ms step_avg:160.41ms step:1419/1480 train_time:226036ms step_avg:160.42ms step:1420/1480 train_time:226212ms step_avg:160.43ms step:1421/1480 train_time:226386ms step_avg:160.44ms step:1422/1480 train_time:226558ms step_avg:160.45ms step:1423/1480 train_time:226728ms step_avg:160.46ms step:1424/1480 train_time:226904ms step_avg:160.47ms step:1425/1480 train_time:227084ms step_avg:160.48ms step:1426/1480 train_time:227256ms step_avg:160.49ms step:1427/1480 train_time:227429ms step_avg:160.50ms step:1428/1480 train_time:227600ms step_avg:160.51ms step:1429/1480 train_time:227769ms step_avg:160.51ms step:1430/1480 train_time:227942ms step_avg:160.52ms step:1431/1480 train_time:228119ms step_avg:160.53ms step:1432/1480 train_time:228296ms step_avg:160.55ms step:1433/1480 train_time:228476ms step_avg:160.56ms step:1434/1480 train_time:228657ms step_avg:160.57ms step:1435/1480 train_time:228831ms step_avg:160.58ms step:1436/1480 train_time:229004ms step_avg:160.59ms step:1437/1480 train_time:229175ms step_avg:160.60ms step:1438/1480 train_time:229344ms 
step_avg:160.61ms step:1439/1480 train_time:229518ms step_avg:160.61ms step:1440/1480 train_time:229688ms step_avg:160.62ms step:1441/1480 train_time:229859ms step_avg:160.63ms step:1442/1480 train_time:230037ms step_avg:160.64ms step:1443/1480 train_time:230225ms step_avg:160.66ms step:1444/1480 train_time:230396ms step_avg:160.67ms step:1445/1480 train_time:230566ms step_avg:160.67ms step:1446/1480 train_time:230740ms step_avg:160.68ms step:1447/1480 train_time:230917ms step_avg:160.69ms step:1448/1480 train_time:231087ms step_avg:160.70ms step:1449/1480 train_time:231259ms step_avg:160.71ms step:1450/1480 train_time:231431ms step_avg:160.72ms step:1451/1480 train_time:231601ms step_avg:160.72ms step:1452/1480 train_time:231776ms step_avg:160.73ms step:1453/1480 train_time:231945ms step_avg:160.74ms step:1454/1480 train_time:232116ms step_avg:160.75ms step:1455/1480 train_time:232296ms step_avg:160.76ms step:1456/1480 train_time:232470ms step_avg:160.77ms step:1457/1480 train_time:232640ms step_avg:160.77ms step:1458/1480 train_time:232813ms step_avg:160.78ms step:1459/1480 train_time:232991ms step_avg:160.79ms step:1460/1480 train_time:233163ms step_avg:160.80ms step:1461/1480 train_time:233336ms step_avg:160.81ms step:1462/1480 train_time:233508ms step_avg:160.82ms step:1463/1480 train_time:233684ms step_avg:160.83ms step:1464/1480 train_time:233858ms step_avg:160.84ms step:1465/1480 train_time:234029ms step_avg:160.84ms step:1466/1480 train_time:234199ms step_avg:160.85ms step:1467/1480 train_time:234374ms step_avg:160.86ms step:1468/1480 train_time:234543ms step_avg:160.87ms step:1469/1480 train_time:234717ms step_avg:160.88ms step:1470/1480 train_time:234898ms step_avg:160.89ms step:1471/1480 train_time:235083ms step_avg:160.91ms step:1472/1480 train_time:235262ms step_avg:160.92ms step:1473/1480 train_time:235432ms step_avg:160.92ms step:1474/1480 train_time:235613ms step_avg:160.94ms step:1475/1480 train_time:235793ms step_avg:160.95ms step:1476/1480 
train_time:235964ms step_avg:160.96ms step:1477/1480 train_time:236148ms step_avg:160.97ms step:1478/1480 train_time:236334ms step_avg:160.99ms step:1479/1480 train_time:236507ms step_avg:161.00ms step:1480/1480 train_time:236679ms step_avg:161.01ms step:1480/1480 val_loss:3.2760 train_time:236750ms step_avg:161.05ms