import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. """ assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. """ def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, n_head): super().__init__() assert dim % n_head == 0 self.n_head = n_head self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) # value residual lambda self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977 # rotary embeddings self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim # output projection self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor: B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1) k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1) v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1) v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977 q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim: int): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.n_embd, config.n_head) self.mlp = MLP(config.n_embd) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor: x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 n_layer : int = 12 n_head : int = 6 # head dim 128 suggested by @Grad62304977 n_embd : int = 768 lm_head_softcap : int = 30 class GPT(nn.Module): def __init__(self, config: GPTConfig): super().__init__() self.n_layer = config.n_layer self.lm_head_softcap = config.lm_head_softcap # U-net design by @brendanh0gan self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection weights for decoder layers self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers)) self.transformer = nn.ModuleDict(dict( wte = nn.Embedding(config.vocab_size, config.n_embd), # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning # U-net structure on token value embeddings by @leloykun vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers), h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), )) self.lm_head = CastedLinear(config.n_embd, config.vocab_size) self.lm_head.weight.data.zero_() # @Grad62304977 def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor: BLOCK_SIZE = 128 assert idx.ndim == 1 docs = (idx == 50256).cumsum(0) docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous() docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous() def document_sliding_window_causal(b, h, q_idx, kv_idx): causal_mask = q_idx >= kv_idx document_mask = docs[q_idx] == docs[kv_idx] window_mask = q_idx - kv_idx < sliding_window return causal_mask & document_mask & window_mask S = len(idx) def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor): kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda") q_idx = block_idx[:, None] causal_mask = q_idx >= kv_idx document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx]) window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE) dense_mask = causal_mask & document_mask & window_mask dense_mask = dense_mask.to(torch.int32) num_blocks = dense_mask.sum(dim=-1).to(torch.int32) indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32) num_blocks = num_blocks[None, None, :].contiguous() indices = indices[None, None, :].contiguous() return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal) block_mask = create_sliding_window_causal_mask(S, sliding_window) # forward the GPT model itself x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd) x = norm(x) # @Grad62304977 x0 = x vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1) # Store outputs for U-Net skip connections skip_connections = [] # Encoder pass - process only the first half of the blocks for i in range(self.num_encoder_layers): x = self.transformer.h[i](x, vi[i], x0, block_mask) skip_connections.append(x) # Decoder pass - process the remaining blocks with weighted skip connections for i in range(self.num_decoder_layers): x = x + self.skip_weights[i] * skip_connections.pop() # U-net structure on token value embeddings by @leloykun x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask) x = norm(x) logits = self.lm_head(x) logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977 logits = logits.float() loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1)) return loss # ----------------------------------------------------------------------------- # Our own simple Distributed Data Loader def _peek_data_shard(file: Path): # only reads the header, returns header data # header is 256 int32 header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) assert header[0] == 20240520, "magic number mismatch in the data .bin file" assert header[1] == 1, "unsupported version" return int(header[2]) # number of tokens (claimed) def _load_data_shard(file: Path, ntok: int): with file.open("rb") as f: tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True) f.seek(256 * 4) nbytes = f.readinto(tokens.numpy()) assert nbytes == 2 * ntok, "number of tokens read does not match header?" return tokens class DistributedDataLoader: def __init__(self, filename_pattern, T, process_rank, num_processes): self.process_rank = process_rank self.num_processes = num_processes self.T = T # glob files that match the pattern self.files = sorted(Path.cwd().glob(filename_pattern)) assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" # load and validate all data shards, count number of tokens in total self.ntoks = [_peek_data_shard(file) for file in self.files] assert min(self.ntoks) >= num_processes * T + 1 self.ntok_total = sum(self.ntoks) self.reset() def reset(self): self.current_shard = -1 self.advance() def advance(self): # advance to next data shard self.current_shard = (self.current_shard + 1) % len(self.files) self.current_position = self.process_rank * self.T self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard]) def next_batch(self): batch_size = self.T * self.num_processes buf = self.tokens[self.current_position:self.current_position+self.T+1] # host side async is sufficient; # no performance improvement was observed when introducing a separate stream. x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets # advance current position and load next shard if necessary self.current_position += batch_size if self.current_position + batch_size + 1 >= len(self.tokens): self.advance() return x, y # ----------------------------------------------------------------------------- # int main @dataclass class Hyperparameters: # data hyperparams input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on # optimization hyperparams batch_size : int = 8 # batch size, in sequences, across all devices sequence_length : int = 64*1024 # sequence length, in tokens num_iterations : int = 1480 # number of iterations to run warmup_iters : int = 0 cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule weight_decay : float = 0 # evaluation and logging hyperparams val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end args = Hyperparameters() # set up DDP (distributed data parallel). torchrun sets this env variable assert torch.cuda.is_available() dist.init_process_group(backend='nccl') ddp_rank = int(os.environ['RANK']) ddp_local_rank = int(os.environ['LOCAL_RANK']) ddp_world_size = int(os.environ['WORLD_SIZE']) device = f'cuda:{ddp_local_rank}' torch.cuda.set_device(device) print(f"using device: {device}") master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc. # begin logging logfile = None if master_process: run_id = str(uuid.uuid4()) logdir = 'logs/%s/' % run_id # os.makedirs(logdir, exist_ok=True) logfile = 'logs/%s.txt' % run_id # create the log file with open(logfile, "w") as f: # begin the log by printing this file (the Python code) f.write(code) f.write('='*100 + '\n') def print0(s, logonly=False): if master_process: with open(logfile, "a") as f: if not logonly: print(s) f.write(s+'\n') # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # convenience variables T = args.sequence_length # calculate the number of steps to take in the val loop. assert args.val_tokens % (T * ddp_world_size) == 0 val_steps = args.val_tokens // (T * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size assert train_accumulation_steps == 1 # load tokens train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files") print0('='*100, logonly=True) x, y = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 12:02:54 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 131W / 700W | 533MiB / 81559MiB | 2% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 94W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 100W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 98W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 102W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22969ms step_avg:nanms step:2/1480 train_time:23060ms step_avg:nanms step:3/1480 train_time:23199ms step_avg:nanms step:4/1480 train_time:23340ms step_avg:nanms step:5/1480 train_time:23481ms step_avg:nanms step:6/1480 train_time:23623ms step_avg:nanms step:7/1480 train_time:23765ms step_avg:nanms step:8/1480 train_time:23907ms step_avg:nanms step:9/1480 train_time:24053ms step_avg:nanms step:10/1480 train_time:24199ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:424ms step_avg:141.48ms step:14/1480 train_time:566ms step_avg:141.43ms step:15/1480 train_time:707ms step_avg:141.47ms step:16/1480 train_time:851ms step_avg:141.85ms step:17/1480 train_time:996ms step_avg:142.35ms step:18/1480 train_time:1141ms step_avg:142.58ms step:19/1480 train_time:1283ms step_avg:142.55ms step:20/1480 train_time:1425ms step_avg:142.50ms step:21/1480 train_time:1566ms step_avg:142.39ms step:22/1480 train_time:1707ms step_avg:142.26ms step:23/1480 train_time:1849ms step_avg:142.26ms step:24/1480 train_time:1993ms step_avg:142.38ms step:25/1480 train_time:2139ms step_avg:142.58ms step:26/1480 train_time:2283ms step_avg:142.68ms step:27/1480 train_time:2425ms step_avg:142.65ms step:28/1480 train_time:2567ms step_avg:142.59ms step:29/1480 train_time:2708ms step_avg:142.51ms step:30/1480 train_time:2850ms step_avg:142.50ms step:31/1480 train_time:2994ms step_avg:142.59ms step:32/1480 train_time:3139ms step_avg:142.66ms step:33/1480 train_time:3283ms step_avg:142.72ms step:34/1480 train_time:3424ms step_avg:142.67ms step:35/1480 train_time:3566ms step_avg:142.66ms step:36/1480 train_time:3707ms step_avg:142.57ms step:37/1480 train_time:3848ms step_avg:142.51ms step:38/1480 train_time:3990ms step_avg:142.50ms step:39/1480 train_time:4135ms step_avg:142.58ms step:40/1480 train_time:4280ms step_avg:142.67ms step:41/1480 train_time:4424ms step_avg:142.70ms step:42/1480 train_time:4566ms step_avg:142.70ms step:43/1480 train_time:4708ms step_avg:142.66ms step:44/1480 train_time:4849ms step_avg:142.62ms step:45/1480 train_time:4991ms step_avg:142.61ms step:46/1480 train_time:5135ms step_avg:142.63ms step:47/1480 train_time:5279ms step_avg:142.68ms step:48/1480 train_time:5422ms step_avg:142.67ms step:49/1480 train_time:5565ms step_avg:142.69ms step:50/1480 train_time:5707ms step_avg:142.67ms step:51/1480 train_time:5848ms step_avg:142.64ms step:52/1480 train_time:5990ms step_avg:142.63ms step:53/1480 train_time:6135ms step_avg:142.68ms step:54/1480 train_time:6281ms step_avg:142.74ms step:55/1480 train_time:6423ms step_avg:142.73ms step:56/1480 train_time:6566ms step_avg:142.74ms step:57/1480 train_time:6707ms step_avg:142.70ms step:58/1480 train_time:6849ms step_avg:142.70ms step:59/1480 train_time:6991ms step_avg:142.68ms step:60/1480 train_time:7133ms step_avg:142.67ms step:61/1480 train_time:7278ms step_avg:142.71ms step:62/1480 train_time:7422ms step_avg:142.74ms step:63/1480 train_time:7566ms step_avg:142.76ms step:64/1480 train_time:7707ms step_avg:142.72ms step:65/1480 train_time:7848ms step_avg:142.70ms step:66/1480 train_time:7991ms step_avg:142.69ms step:67/1480 train_time:8134ms step_avg:142.70ms step:68/1480 train_time:8278ms step_avg:142.73ms step:69/1480 train_time:8422ms step_avg:142.74ms step:70/1480 train_time:8565ms step_avg:142.75ms step:71/1480 train_time:8706ms step_avg:142.73ms step:72/1480 train_time:8847ms step_avg:142.70ms step:73/1480 train_time:8988ms step_avg:142.67ms step:74/1480 train_time:9130ms step_avg:142.66ms step:75/1480 train_time:9275ms step_avg:142.69ms step:76/1480 train_time:9419ms step_avg:142.71ms step:77/1480 train_time:9563ms step_avg:142.74ms step:78/1480 train_time:9706ms step_avg:142.73ms step:79/1480 train_time:9847ms step_avg:142.71ms step:80/1480 train_time:9989ms step_avg:142.70ms step:81/1480 train_time:10131ms step_avg:142.69ms step:82/1480 train_time:10274ms step_avg:142.70ms step:83/1480 train_time:10417ms step_avg:142.70ms step:84/1480 train_time:10560ms step_avg:142.71ms step:85/1480 train_time:10704ms step_avg:142.72ms step:86/1480 train_time:10844ms step_avg:142.69ms step:87/1480 train_time:10987ms step_avg:142.69ms step:88/1480 train_time:11128ms step_avg:142.66ms step:89/1480 train_time:11272ms step_avg:142.69ms step:90/1480 train_time:11416ms step_avg:142.70ms step:91/1480 train_time:11560ms step_avg:142.71ms step:92/1480 train_time:11703ms step_avg:142.72ms step:93/1480 train_time:11845ms step_avg:142.70ms step:94/1480 train_time:11987ms step_avg:142.70ms step:95/1480 train_time:12129ms step_avg:142.69ms step:96/1480 train_time:12270ms step_avg:142.68ms step:97/1480 train_time:12412ms step_avg:142.67ms step:98/1480 train_time:12557ms step_avg:142.69ms step:99/1480 train_time:12701ms step_avg:142.70ms step:100/1480 train_time:12843ms step_avg:142.71ms step:101/1480 train_time:12986ms step_avg:142.70ms step:102/1480 train_time:13127ms step_avg:142.68ms step:103/1480 train_time:13268ms step_avg:142.67ms step:104/1480 train_time:13409ms step_avg:142.64ms step:105/1480 train_time:13551ms step_avg:142.65ms step:106/1480 train_time:13694ms step_avg:142.65ms step:107/1480 train_time:13837ms step_avg:142.65ms step:108/1480 train_time:13981ms step_avg:142.66ms step:109/1480 train_time:14122ms step_avg:142.65ms step:110/1480 train_time:14265ms step_avg:142.65ms step:111/1480 train_time:14407ms step_avg:142.65ms step:112/1480 train_time:14553ms step_avg:142.68ms step:113/1480 train_time:14701ms step_avg:142.73ms step:114/1480 train_time:14849ms step_avg:142.77ms step:115/1480 train_time:14996ms step_avg:142.82ms step:116/1480 train_time:15144ms step_avg:142.86ms step:117/1480 train_time:15290ms step_avg:142.89ms step:118/1480 train_time:15437ms step_avg:142.93ms step:119/1480 train_time:15585ms step_avg:142.98ms step:120/1480 train_time:15732ms step_avg:143.02ms step:121/1480 train_time:15879ms step_avg:143.06ms step:122/1480 train_time:16026ms step_avg:143.09ms step:123/1480 train_time:16171ms step_avg:143.11ms step:124/1480 train_time:16319ms step_avg:143.15ms step:125/1480 train_time:16467ms step_avg:143.19ms step:125/1480 val_loss:4.4229 train_time:16524ms step_avg:143.69ms step:126/1480 train_time:16619ms step_avg:143.27ms step:127/1480 train_time:16768ms step_avg:143.31ms step:128/1480 train_time:16914ms step_avg:143.34ms step:129/1480 train_time:17060ms step_avg:143.36ms step:130/1480 train_time:17207ms step_avg:143.39ms step:131/1480 train_time:17353ms step_avg:143.41ms step:132/1480 train_time:17499ms step_avg:143.43ms step:133/1480 train_time:17648ms step_avg:143.48ms step:134/1480 train_time:17797ms step_avg:143.52ms step:135/1480 train_time:17944ms step_avg:143.55ms step:136/1480 train_time:18092ms step_avg:143.59ms step:137/1480 train_time:18237ms step_avg:143.60ms step:138/1480 train_time:18385ms step_avg:143.63ms step:139/1480 train_time:18532ms step_avg:143.66ms step:140/1480 train_time:18679ms step_avg:143.69ms step:141/1480 train_time:18828ms step_avg:143.73ms step:142/1480 train_time:18975ms step_avg:143.75ms step:143/1480 train_time:19122ms step_avg:143.77ms step:144/1480 train_time:19269ms step_avg:143.80ms step:145/1480 train_time:19416ms step_avg:143.82ms step:146/1480 train_time:19564ms step_avg:143.86ms step:147/1480 train_time:19712ms step_avg:143.88ms step:148/1480 train_time:19859ms step_avg:143.90ms step:149/1480 train_time:20007ms step_avg:143.94ms step:150/1480 train_time:20155ms step_avg:143.96ms step:151/1480 train_time:20301ms step_avg:143.98ms step:152/1480 train_time:20449ms step_avg:144.01ms step:153/1480 train_time:20596ms step_avg:144.03ms step:154/1480 train_time:20741ms step_avg:144.03ms step:155/1480 train_time:20890ms step_avg:144.07ms step:156/1480 train_time:21036ms step_avg:144.08ms step:157/1480 train_time:21182ms step_avg:144.10ms step:158/1480 train_time:21330ms step_avg:144.12ms step:159/1480 train_time:21476ms step_avg:144.14ms step:160/1480 train_time:21624ms step_avg:144.16ms step:161/1480 train_time:21771ms step_avg:144.18ms step:162/1480 train_time:21917ms step_avg:144.19ms step:163/1480 train_time:22065ms step_avg:144.21ms step:164/1480 train_time:22213ms step_avg:144.24ms step:165/1480 train_time:22359ms step_avg:144.25ms step:166/1480 train_time:22507ms step_avg:144.28ms step:167/1480 train_time:22655ms step_avg:144.30ms step:168/1480 train_time:22801ms step_avg:144.31ms step:169/1480 train_time:22948ms step_avg:144.33ms step:170/1480 train_time:23095ms step_avg:144.34ms step:171/1480 train_time:23242ms step_avg:144.36ms step:172/1480 train_time:23390ms step_avg:144.38ms step:173/1480 train_time:23536ms step_avg:144.39ms step:174/1480 train_time:23682ms step_avg:144.41ms step:175/1480 train_time:23830ms step_avg:144.43ms step:176/1480 train_time:23976ms step_avg:144.44ms step:177/1480 train_time:24123ms step_avg:144.45ms step:178/1480 train_time:24270ms step_avg:144.46ms step:179/1480 train_time:24417ms step_avg:144.48ms step:180/1480 train_time:24565ms step_avg:144.50ms step:181/1480 train_time:24712ms step_avg:144.51ms step:182/1480 train_time:24857ms step_avg:144.52ms step:183/1480 train_time:25007ms step_avg:144.55ms step:184/1480 train_time:25154ms step_avg:144.56ms step:185/1480 train_time:25300ms step_avg:144.57ms step:186/1480 train_time:25448ms step_avg:144.59ms step:187/1480 train_time:25594ms step_avg:144.60ms step:188/1480 train_time:25740ms step_avg:144.61ms step:189/1480 train_time:25888ms step_avg:144.62ms step:190/1480 train_time:26035ms step_avg:144.64ms step:191/1480 train_time:26184ms step_avg:144.66ms step:192/1480 train_time:26331ms step_avg:144.67ms step:193/1480 train_time:26477ms step_avg:144.69ms step:194/1480 train_time:26625ms step_avg:144.70ms step:195/1480 train_time:26771ms step_avg:144.71ms step:196/1480 train_time:26918ms step_avg:144.72ms step:197/1480 train_time:27066ms step_avg:144.74ms step:198/1480 train_time:27213ms step_avg:144.75ms step:199/1480 train_time:27359ms step_avg:144.76ms step:200/1480 train_time:27508ms step_avg:144.78ms step:201/1480 train_time:27654ms step_avg:144.79ms step:202/1480 train_time:27800ms step_avg:144.79ms step:203/1480 train_time:27948ms step_avg:144.81ms step:204/1480 train_time:28095ms step_avg:144.82ms step:205/1480 train_time:28242ms step_avg:144.83ms step:206/1480 train_time:28390ms step_avg:144.85ms step:207/1480 train_time:28536ms step_avg:144.85ms step:208/1480 train_time:28685ms step_avg:144.88ms step:209/1480 train_time:28833ms step_avg:144.89ms step:210/1480 train_time:28979ms step_avg:144.89ms step:211/1480 train_time:29126ms step_avg:144.91ms step:212/1480 train_time:29274ms step_avg:144.92ms step:213/1480 train_time:29421ms step_avg:144.93ms step:214/1480 train_time:29568ms step_avg:144.94ms step:215/1480 train_time:29716ms step_avg:144.95ms step:216/1480 train_time:29864ms step_avg:144.97ms step:217/1480 train_time:30011ms step_avg:144.98ms step:218/1480 train_time:30156ms step_avg:144.98ms step:219/1480 train_time:30303ms step_avg:144.99ms step:220/1480 train_time:30451ms step_avg:145.00ms step:221/1480 train_time:30598ms step_avg:145.02ms step:222/1480 train_time:30751ms step_avg:145.05ms step:223/1480 train_time:30903ms step_avg:145.08ms step:224/1480 train_time:31053ms step_avg:145.11ms step:225/1480 train_time:31204ms step_avg:145.13ms step:226/1480 train_time:31354ms step_avg:145.16ms step:227/1480 train_time:31504ms step_avg:145.18ms step:228/1480 train_time:31654ms step_avg:145.20ms step:229/1480 train_time:31806ms step_avg:145.23ms step:230/1480 train_time:31955ms step_avg:145.25ms step:231/1480 train_time:32108ms step_avg:145.28ms step:232/1480 train_time:32258ms step_avg:145.31ms step:233/1480 train_time:32409ms step_avg:145.33ms step:234/1480 train_time:32558ms step_avg:145.35ms step:235/1480 train_time:32711ms step_avg:145.38ms step:236/1480 train_time:32862ms step_avg:145.41ms step:237/1480 train_time:33013ms step_avg:145.43ms step:238/1480 train_time:33164ms step_avg:145.45ms step:239/1480 train_time:33315ms step_avg:145.48ms step:240/1480 train_time:33464ms step_avg:145.50ms step:241/1480 train_time:33615ms step_avg:145.52ms step:242/1480 train_time:33764ms step_avg:145.54ms step:243/1480 train_time:33914ms step_avg:145.55ms step:244/1480 train_time:34064ms step_avg:145.57ms step:245/1480 train_time:34215ms step_avg:145.60ms step:246/1480 train_time:34365ms step_avg:145.62ms step:247/1480 train_time:34517ms step_avg:145.64ms step:248/1480 train_time:34668ms step_avg:145.66ms step:249/1480 train_time:34817ms step_avg:145.68ms step:250/1480 train_time:34967ms step_avg:145.70ms step:250/1480 val_loss:4.0016 train_time:35026ms step_avg:145.94ms step:251/1480 train_time:35123ms step_avg:145.74ms step:252/1480 train_time:35274ms step_avg:145.76ms step:253/1480 train_time:35425ms step_avg:145.78ms step:254/1480 train_time:35575ms step_avg:145.80ms step:255/1480 train_time:35725ms step_avg:145.82ms step:256/1480 train_time:35876ms step_avg:145.84ms step:257/1480 train_time:36026ms step_avg:145.86ms step:258/1480 train_time:36179ms step_avg:145.88ms step:259/1480 train_time:36331ms step_avg:145.91ms step:260/1480 train_time:36481ms step_avg:145.93ms step:261/1480 train_time:36632ms step_avg:145.94ms step:262/1480 train_time:36782ms step_avg:145.96ms step:263/1480 train_time:36932ms step_avg:145.98ms step:264/1480 train_time:37083ms step_avg:146.00ms step:265/1480 train_time:37235ms step_avg:146.02ms step:266/1480 train_time:37385ms step_avg:146.04ms step:267/1480 train_time:37536ms step_avg:146.06ms step:268/1480 train_time:37686ms step_avg:146.07ms step:269/1480 train_time:37837ms step_avg:146.09ms step:270/1480 train_time:37987ms step_avg:146.10ms step:271/1480 train_time:38138ms step_avg:146.12ms step:272/1480 train_time:38287ms step_avg:146.13ms step:273/1480 train_time:38439ms step_avg:146.16ms step:274/1480 train_time:38589ms step_avg:146.17ms step:275/1480 train_time:38739ms step_avg:146.18ms step:276/1480 train_time:38887ms step_avg:146.19ms step:277/1480 train_time:39038ms step_avg:146.21ms step:278/1480 train_time:39188ms step_avg:146.22ms step:279/1480 train_time:39339ms step_avg:146.24ms step:280/1480 train_time:39491ms step_avg:146.26ms step:281/1480 train_time:39641ms step_avg:146.28ms step:282/1480 train_time:39792ms step_avg:146.29ms step:283/1480 train_time:39942ms step_avg:146.31ms step:284/1480 train_time:40092ms step_avg:146.32ms step:285/1480 train_time:40242ms step_avg:146.33ms step:286/1480 train_time:40392ms step_avg:146.35ms step:287/1480 train_time:40543ms step_avg:146.37ms step:288/1480 train_time:40695ms step_avg:146.39ms step:289/1480 train_time:40845ms step_avg:146.40ms step:290/1480 train_time:40996ms step_avg:146.42ms step:291/1480 train_time:41146ms step_avg:146.43ms step:292/1480 train_time:41297ms step_avg:146.44ms step:293/1480 train_time:41448ms step_avg:146.46ms step:294/1480 train_time:41599ms step_avg:146.47ms step:295/1480 train_time:41750ms step_avg:146.49ms step:296/1480 train_time:41901ms step_avg:146.51ms step:297/1480 train_time:42051ms step_avg:146.52ms step:298/1480 train_time:42201ms step_avg:146.53ms step:299/1480 train_time:42350ms step_avg:146.54ms step:300/1480 train_time:42502ms step_avg:146.56ms step:301/1480 train_time:42651ms step_avg:146.57ms step:302/1480 train_time:42804ms step_avg:146.59ms step:303/1480 train_time:42954ms step_avg:146.60ms step:304/1480 train_time:43104ms step_avg:146.61ms step:305/1480 train_time:43256ms step_avg:146.63ms step:306/1480 train_time:43406ms step_avg:146.64ms step:307/1480 train_time:43557ms step_avg:146.66ms step:308/1480 train_time:43705ms step_avg:146.66ms step:309/1480 train_time:43856ms step_avg:146.68ms step:310/1480 train_time:44007ms step_avg:146.69ms step:311/1480 train_time:44157ms step_avg:146.70ms step:312/1480 train_time:44308ms step_avg:146.71ms step:313/1480 train_time:44458ms step_avg:146.73ms step:314/1480 train_time:44608ms step_avg:146.74ms step:315/1480 train_time:44759ms step_avg:146.75ms step:316/1480 train_time:44909ms step_avg:146.76ms step:317/1480 train_time:45059ms step_avg:146.77ms step:318/1480 train_time:45212ms step_avg:146.79ms step:319/1480 train_time:45362ms step_avg:146.80ms step:320/1480 train_time:45513ms step_avg:146.82ms step:321/1480 train_time:45664ms step_avg:146.83ms step:322/1480 train_time:45816ms step_avg:146.85ms step:323/1480 train_time:45965ms step_avg:146.85ms step:324/1480 train_time:46116ms step_avg:146.87ms step:325/1480 train_time:46266ms step_avg:146.88ms step:326/1480 train_time:46416ms step_avg:146.89ms step:327/1480 train_time:46566ms step_avg:146.90ms step:328/1480 train_time:46717ms step_avg:146.91ms step:329/1480 train_time:46867ms step_avg:146.92ms step:330/1480 train_time:47019ms step_avg:146.94ms step:331/1480 train_time:47172ms step_avg:146.95ms step:332/1480 train_time:47326ms step_avg:146.97ms step:333/1480 train_time:47479ms step_avg:146.99ms step:334/1480 train_time:47635ms step_avg:147.02ms step:335/1480 train_time:47788ms step_avg:147.04ms step:336/1480 train_time:47942ms step_avg:147.06ms step:337/1480 train_time:48096ms step_avg:147.08ms step:338/1480 train_time:48249ms step_avg:147.10ms step:339/1480 train_time:48403ms step_avg:147.12ms step:340/1480 train_time:48556ms step_avg:147.14ms step:341/1480 train_time:48710ms step_avg:147.16ms step:342/1480 train_time:48863ms step_avg:147.18ms step:343/1480 train_time:49017ms step_avg:147.20ms step:344/1480 train_time:49171ms step_avg:147.22ms step:345/1480 train_time:49326ms step_avg:147.24ms step:346/1480 train_time:49480ms step_avg:147.26ms step:347/1480 train_time:49635ms step_avg:147.28ms step:348/1480 train_time:49789ms step_avg:147.30ms step:349/1480 train_time:49942ms step_avg:147.32ms step:350/1480 train_time:50096ms step_avg:147.34ms step:351/1480 train_time:50250ms step_avg:147.36ms step:352/1480 train_time:50404ms step_avg:147.38ms step:353/1480 train_time:50557ms step_avg:147.40ms step:354/1480 train_time:50709ms step_avg:147.41ms step:355/1480 train_time:50863ms step_avg:147.43ms step:356/1480 train_time:51018ms step_avg:147.45ms step:357/1480 train_time:51173ms step_avg:147.47ms step:358/1480 train_time:51327ms step_avg:147.49ms step:359/1480 train_time:51481ms step_avg:147.51ms step:360/1480 train_time:51638ms step_avg:147.54ms step:361/1480 train_time:51794ms step_avg:147.56ms step:362/1480 train_time:51946ms step_avg:147.57ms step:363/1480 train_time:52101ms step_avg:147.60ms step:364/1480 train_time:52255ms step_avg:147.61ms step:365/1480 train_time:52408ms step_avg:147.63ms step:366/1480 train_time:52563ms step_avg:147.65ms step:367/1480 train_time:52718ms step_avg:147.67ms step:368/1480 train_time:52870ms step_avg:147.68ms step:369/1480 train_time:53024ms step_avg:147.70ms step:370/1480 train_time:53176ms step_avg:147.71ms step:371/1480 train_time:53331ms step_avg:147.73ms step:372/1480 train_time:53486ms step_avg:147.75ms step:373/1480 train_time:53641ms step_avg:147.77ms step:374/1480 train_time:53794ms step_avg:147.78ms step:375/1480 train_time:53946ms step_avg:147.80ms step:375/1480 val_loss:3.8060 train_time:54006ms step_avg:147.96ms step:376/1480 train_time:54104ms step_avg:147.83ms step:377/1480 train_time:54258ms step_avg:147.84ms step:378/1480 train_time:54410ms step_avg:147.85ms step:379/1480 train_time:54563ms step_avg:147.87ms step:380/1480 train_time:54715ms step_avg:147.88ms step:381/1480 train_time:54868ms step_avg:147.89ms step:382/1480 train_time:55024ms step_avg:147.91ms step:383/1480 train_time:55179ms step_avg:147.93ms step:384/1480 train_time:55333ms step_avg:147.95ms step:385/1480 train_time:55487ms step_avg:147.97ms step:386/1480 train_time:55639ms step_avg:147.98ms step:387/1480 train_time:55794ms step_avg:147.99ms step:388/1480 train_time:55946ms step_avg:148.01ms step:389/1480 train_time:56100ms step_avg:148.02ms step:390/1480 train_time:56254ms step_avg:148.04ms step:391/1480 train_time:56409ms step_avg:148.05ms step:392/1480 train_time:56561ms step_avg:148.07ms step:393/1480 train_time:56714ms step_avg:148.08ms step:394/1480 train_time:56867ms step_avg:148.09ms step:395/1480 train_time:57020ms step_avg:148.10ms step:396/1480 train_time:57173ms step_avg:148.12ms step:397/1480 train_time:57328ms step_avg:148.13ms step:398/1480 train_time:57482ms step_avg:148.15ms step:399/1480 train_time:57636ms step_avg:148.16ms step:400/1480 train_time:57790ms step_avg:148.18ms step:401/1480 train_time:57943ms step_avg:148.19ms step:402/1480 train_time:58096ms step_avg:148.20ms step:403/1480 train_time:58250ms step_avg:148.22ms step:404/1480 train_time:58404ms step_avg:148.23ms step:405/1480 train_time:58558ms step_avg:148.25ms step:406/1480 train_time:58712ms step_avg:148.26ms step:407/1480 train_time:58866ms step_avg:148.28ms step:408/1480 train_time:59019ms step_avg:148.29ms step:409/1480 train_time:59172ms step_avg:148.30ms step:410/1480 train_time:59326ms step_avg:148.31ms step:411/1480 train_time:59479ms step_avg:148.33ms step:412/1480 train_time:59633ms step_avg:148.34ms step:413/1480 train_time:59788ms step_avg:148.36ms step:414/1480 train_time:59943ms step_avg:148.37ms step:415/1480 train_time:60096ms step_avg:148.39ms step:416/1480 train_time:60250ms step_avg:148.40ms step:417/1480 train_time:60404ms step_avg:148.41ms step:418/1480 train_time:60558ms step_avg:148.43ms step:419/1480 train_time:60711ms step_avg:148.44ms step:420/1480 train_time:60864ms step_avg:148.45ms step:421/1480 train_time:61017ms step_avg:148.46ms step:422/1480 train_time:61170ms step_avg:148.47ms step:423/1480 train_time:61324ms step_avg:148.48ms step:424/1480 train_time:61477ms step_avg:148.50ms step:425/1480 train_time:61633ms step_avg:148.51ms step:426/1480 train_time:61787ms step_avg:148.53ms step:427/1480 train_time:61941ms step_avg:148.54ms step:428/1480 train_time:62094ms step_avg:148.55ms step:429/1480 train_time:62247ms step_avg:148.56ms step:430/1480 train_time:62402ms step_avg:148.58ms step:431/1480 train_time:62555ms step_avg:148.59ms step:432/1480 train_time:62709ms step_avg:148.60ms step:433/1480 train_time:62863ms step_avg:148.61ms step:434/1480 train_time:63017ms step_avg:148.62ms step:435/1480 train_time:63171ms step_avg:148.64ms step:436/1480 train_time:63327ms step_avg:148.66ms step:437/1480 train_time:63480ms step_avg:148.67ms step:438/1480 train_time:63633ms step_avg:148.68ms step:439/1480 train_time:63787ms step_avg:148.69ms step:440/1480 train_time:63941ms step_avg:148.70ms step:441/1480 train_time:64097ms step_avg:148.72ms step:442/1480 train_time:64253ms step_avg:148.73ms step:443/1480 train_time:64409ms step_avg:148.75ms step:444/1480 train_time:64565ms step_avg:148.77ms step:445/1480 train_time:64720ms step_avg:148.78ms step:446/1480 train_time:64874ms step_avg:148.79ms step:447/1480 train_time:65030ms step_avg:148.81ms step:448/1480 train_time:65188ms step_avg:148.83ms step:449/1480 train_time:65346ms step_avg:148.85ms step:450/1480 train_time:65504ms step_avg:148.87ms step:451/1480 train_time:65662ms step_avg:148.89ms step:452/1480 train_time:65817ms step_avg:148.91ms step:453/1480 train_time:65972ms step_avg:148.92ms step:454/1480 train_time:66130ms step_avg:148.94ms step:455/1480 train_time:66287ms step_avg:148.96ms step:456/1480 train_time:66444ms step_avg:148.98ms step:457/1480 train_time:66603ms step_avg:149.00ms step:458/1480 train_time:66760ms step_avg:149.02ms step:459/1480 train_time:66916ms step_avg:149.03ms step:460/1480 train_time:67072ms step_avg:149.05ms step:461/1480 train_time:67232ms step_avg:149.07ms step:462/1480 train_time:67389ms step_avg:149.09ms step:463/1480 train_time:67547ms step_avg:149.11ms step:464/1480 train_time:67704ms step_avg:149.13ms step:465/1480 train_time:67861ms step_avg:149.15ms step:466/1480 train_time:68017ms step_avg:149.16ms step:467/1480 train_time:68173ms step_avg:149.18ms step:468/1480 train_time:68330ms step_avg:149.19ms step:469/1480 train_time:68488ms step_avg:149.21ms step:470/1480 train_time:68645ms step_avg:149.23ms step:471/1480 train_time:68803ms step_avg:149.25ms step:472/1480 train_time:68962ms step_avg:149.27ms step:473/1480 train_time:69116ms step_avg:149.28ms step:474/1480 train_time:69272ms step_avg:149.29ms step:475/1480 train_time:69430ms step_avg:149.31ms step:476/1480 train_time:69587ms step_avg:149.33ms step:477/1480 train_time:69743ms step_avg:149.34ms step:478/1480 train_time:69901ms step_avg:149.36ms step:479/1480 train_time:70056ms step_avg:149.37ms step:480/1480 train_time:70213ms step_avg:149.39ms step:481/1480 train_time:70370ms step_avg:149.41ms step:482/1480 train_time:70529ms step_avg:149.42ms step:483/1480 train_time:70685ms step_avg:149.44ms step:484/1480 train_time:70842ms step_avg:149.46ms step:485/1480 train_time:70998ms step_avg:149.47ms step:486/1480 train_time:71155ms step_avg:149.48ms step:487/1480 train_time:71311ms step_avg:149.50ms step:488/1480 train_time:71469ms step_avg:149.52ms step:489/1480 train_time:71627ms step_avg:149.53ms step:490/1480 train_time:71784ms step_avg:149.55ms step:491/1480 train_time:71940ms step_avg:149.56ms step:492/1480 train_time:72096ms step_avg:149.58ms step:493/1480 train_time:72252ms step_avg:149.59ms step:494/1480 train_time:72409ms step_avg:149.61ms step:495/1480 train_time:72566ms step_avg:149.62ms step:496/1480 train_time:72724ms step_avg:149.64ms step:497/1480 train_time:72880ms step_avg:149.65ms step:498/1480 train_time:73037ms step_avg:149.67ms step:499/1480 train_time:73194ms step_avg:149.68ms step:500/1480 train_time:73351ms step_avg:149.70ms step:500/1480 val_loss:3.6892 train_time:73413ms step_avg:149.82ms step:501/1480 train_time:73512ms step_avg:149.72ms step:502/1480 train_time:73671ms step_avg:149.74ms step:503/1480 train_time:73826ms step_avg:149.75ms step:504/1480 train_time:73981ms step_avg:149.76ms step:505/1480 train_time:74137ms step_avg:149.77ms step:506/1480 train_time:74294ms step_avg:149.79ms step:507/1480 train_time:74449ms step_avg:149.80ms step:508/1480 train_time:74605ms step_avg:149.81ms step:509/1480 train_time:74762ms step_avg:149.82ms step:510/1480 train_time:74920ms step_avg:149.84ms step:511/1480 train_time:75078ms step_avg:149.86ms step:512/1480 train_time:75235ms step_avg:149.87ms step:513/1480 train_time:75391ms step_avg:149.88ms step:514/1480 train_time:75548ms step_avg:149.90ms step:515/1480 train_time:75704ms step_avg:149.91ms step:516/1480 train_time:75861ms step_avg:149.92ms step:517/1480 train_time:76018ms step_avg:149.94ms step:518/1480 train_time:76175ms step_avg:149.95ms step:519/1480 train_time:76332ms step_avg:149.97ms step:520/1480 train_time:76492ms step_avg:149.98ms step:521/1480 train_time:76648ms step_avg:150.00ms step:522/1480 train_time:76805ms step_avg:150.01ms step:523/1480 train_time:76962ms step_avg:150.02ms step:524/1480 train_time:77118ms step_avg:150.03ms step:525/1480 train_time:77274ms step_avg:150.05ms step:526/1480 train_time:77432ms step_avg:150.06ms step:527/1480 train_time:77588ms step_avg:150.07ms step:528/1480 train_time:77743ms step_avg:150.08ms step:529/1480 train_time:77902ms step_avg:150.10ms step:530/1480 train_time:78058ms step_avg:150.11ms step:531/1480 train_time:78217ms step_avg:150.13ms step:532/1480 train_time:78373ms step_avg:150.14ms step:533/1480 train_time:78528ms step_avg:150.15ms step:534/1480 train_time:78684ms step_avg:150.16ms step:535/1480 train_time:78842ms step_avg:150.17ms step:536/1480 train_time:78999ms step_avg:150.19ms step:537/1480 train_time:79156ms step_avg:150.20ms step:538/1480 train_time:79314ms step_avg:150.22ms step:539/1480 train_time:79472ms step_avg:150.23ms step:540/1480 train_time:79628ms step_avg:150.24ms step:541/1480 train_time:79784ms step_avg:150.25ms step:542/1480 train_time:79941ms step_avg:150.26ms step:543/1480 train_time:80097ms step_avg:150.28ms step:544/1480 train_time:80256ms step_avg:150.29ms step:545/1480 train_time:80414ms step_avg:150.31ms step:546/1480 train_time:80571ms step_avg:150.32ms step:547/1480 train_time:80728ms step_avg:150.33ms step:548/1480 train_time:80885ms step_avg:150.34ms step:549/1480 train_time:81040ms step_avg:150.35ms step:550/1480 train_time:81198ms step_avg:150.37ms step:551/1480 train_time:81357ms step_avg:150.38ms step:552/1480 train_time:81518ms step_avg:150.40ms step:553/1480 train_time:81679ms step_avg:150.42ms step:554/1480 train_time:81839ms step_avg:150.44ms step:555/1480 train_time:82000ms step_avg:150.46ms step:556/1480 train_time:82157ms step_avg:150.47ms step:557/1480 train_time:82318ms step_avg:150.49ms step:558/1480 train_time:82479ms step_avg:150.51ms step:559/1480 train_time:82638ms step_avg:150.53ms step:560/1480 train_time:82799ms step_avg:150.54ms step:561/1480 train_time:82958ms step_avg:150.56ms step:562/1480 train_time:83118ms step_avg:150.58ms step:563/1480 train_time:83278ms step_avg:150.59ms step:564/1480 train_time:83437ms step_avg:150.61ms step:565/1480 train_time:83597ms step_avg:150.63ms step:566/1480 train_time:83758ms step_avg:150.64ms step:567/1480 train_time:83919ms step_avg:150.66ms step:568/1480 train_time:84077ms step_avg:150.68ms step:569/1480 train_time:84236ms step_avg:150.69ms step:570/1480 train_time:84397ms step_avg:150.71ms step:571/1480 train_time:84556ms step_avg:150.72ms step:572/1480 train_time:84715ms step_avg:150.74ms step:573/1480 train_time:84876ms step_avg:150.76ms step:574/1480 train_time:85038ms step_avg:150.78ms step:575/1480 train_time:85200ms step_avg:150.80ms step:576/1480 train_time:85358ms step_avg:150.81ms step:577/1480 train_time:85519ms step_avg:150.83ms step:578/1480 train_time:85678ms step_avg:150.84ms step:579/1480 train_time:85837ms step_avg:150.86ms step:580/1480 train_time:85997ms step_avg:150.87ms step:581/1480 train_time:86158ms step_avg:150.89ms step:582/1480 train_time:86318ms step_avg:150.91ms step:583/1480 train_time:86478ms step_avg:150.92ms step:584/1480 train_time:86638ms step_avg:150.94ms step:585/1480 train_time:86797ms step_avg:150.95ms step:586/1480 train_time:86957ms step_avg:150.97ms step:587/1480 train_time:87118ms step_avg:150.98ms step:588/1480 train_time:87277ms step_avg:151.00ms step:589/1480 train_time:87437ms step_avg:151.01ms step:590/1480 train_time:87600ms step_avg:151.03ms step:591/1480 train_time:87757ms step_avg:151.04ms step:592/1480 train_time:87917ms step_avg:151.06ms step:593/1480 train_time:88077ms step_avg:151.08ms step:594/1480 train_time:88238ms step_avg:151.09ms step:595/1480 train_time:88401ms step_avg:151.11ms step:596/1480 train_time:88562ms step_avg:151.13ms step:597/1480 train_time:88720ms step_avg:151.14ms step:598/1480 train_time:88878ms step_avg:151.15ms step:599/1480 train_time:89037ms step_avg:151.17ms step:600/1480 train_time:89198ms step_avg:151.18ms step:601/1480 train_time:89357ms step_avg:151.20ms step:602/1480 train_time:89517ms step_avg:151.21ms step:603/1480 train_time:89677ms step_avg:151.23ms step:604/1480 train_time:89838ms step_avg:151.24ms step:605/1480 train_time:89998ms step_avg:151.26ms step:606/1480 train_time:90159ms step_avg:151.27ms step:607/1480 train_time:90321ms step_avg:151.29ms step:608/1480 train_time:90480ms step_avg:151.30ms step:609/1480 train_time:90640ms step_avg:151.32ms step:610/1480 train_time:90799ms step_avg:151.33ms step:611/1480 train_time:90959ms step_avg:151.35ms step:612/1480 train_time:91120ms step_avg:151.36ms step:613/1480 train_time:91281ms step_avg:151.38ms step:614/1480 train_time:91440ms step_avg:151.39ms step:615/1480 train_time:91600ms step_avg:151.40ms step:616/1480 train_time:91757ms step_avg:151.41ms step:617/1480 train_time:91918ms step_avg:151.43ms step:618/1480 train_time:92077ms step_avg:151.44ms step:619/1480 train_time:92237ms step_avg:151.46ms step:620/1480 train_time:92398ms step_avg:151.47ms step:621/1480 train_time:92559ms step_avg:151.49ms step:622/1480 train_time:92718ms step_avg:151.50ms step:623/1480 train_time:92879ms step_avg:151.52ms step:624/1480 train_time:93038ms step_avg:151.53ms step:625/1480 train_time:93199ms step_avg:151.54ms step:625/1480 val_loss:3.6059 train_time:93262ms step_avg:151.64ms step:626/1480 train_time:93361ms step_avg:151.56ms step:627/1480 train_time:93521ms step_avg:151.57ms step:628/1480 train_time:93679ms step_avg:151.58ms step:629/1480 train_time:93836ms step_avg:151.59ms step:630/1480 train_time:93993ms step_avg:151.60ms step:631/1480 train_time:94152ms step_avg:151.61ms step:632/1480 train_time:94311ms step_avg:151.63ms step:633/1480 train_time:94470ms step_avg:151.64ms step:634/1480 train_time:94630ms step_avg:151.65ms step:635/1480 train_time:94790ms step_avg:151.66ms step:636/1480 train_time:94950ms step_avg:151.68ms step:637/1480 train_time:95109ms step_avg:151.69ms step:638/1480 train_time:95268ms step_avg:151.70ms step:639/1480 train_time:95427ms step_avg:151.71ms step:640/1480 train_time:95588ms step_avg:151.73ms step:641/1480 train_time:95748ms step_avg:151.74ms step:642/1480 train_time:95908ms step_avg:151.75ms step:643/1480 train_time:96067ms step_avg:151.76ms step:644/1480 train_time:96227ms step_avg:151.78ms step:645/1480 train_time:96387ms step_avg:151.79ms step:646/1480 train_time:96547ms step_avg:151.80ms step:647/1480 train_time:96707ms step_avg:151.82ms step:648/1480 train_time:96869ms step_avg:151.83ms step:649/1480 train_time:97028ms step_avg:151.84ms step:650/1480 train_time:97189ms step_avg:151.86ms step:651/1480 train_time:97349ms step_avg:151.87ms step:652/1480 train_time:97509ms step_avg:151.88ms step:653/1480 train_time:97668ms step_avg:151.89ms step:654/1480 train_time:97828ms step_avg:151.91ms step:655/1480 train_time:97989ms step_avg:151.92ms step:656/1480 train_time:98148ms step_avg:151.93ms step:657/1480 train_time:98308ms step_avg:151.94ms step:658/1480 train_time:98468ms step_avg:151.96ms step:659/1480 train_time:98630ms step_avg:151.97ms step:660/1480 train_time:98792ms step_avg:151.99ms step:661/1480 train_time:98953ms step_avg:152.00ms step:662/1480 train_time:99113ms step_avg:152.01ms step:663/1480 train_time:99272ms step_avg:152.03ms step:664/1480 train_time:99433ms step_avg:152.04ms step:665/1480 train_time:99595ms step_avg:152.05ms step:666/1480 train_time:99756ms step_avg:152.07ms step:667/1480 train_time:99916ms step_avg:152.08ms step:668/1480 train_time:100077ms step_avg:152.09ms step:669/1480 train_time:100239ms step_avg:152.11ms step:670/1480 train_time:100398ms step_avg:152.12ms step:671/1480 train_time:100561ms step_avg:152.13ms step:672/1480 train_time:100724ms step_avg:152.15ms step:673/1480 train_time:100888ms step_avg:152.17ms step:674/1480 train_time:101051ms step_avg:152.18ms step:675/1480 train_time:101213ms step_avg:152.20ms step:676/1480 train_time:101374ms step_avg:152.21ms step:677/1480 train_time:101533ms step_avg:152.22ms step:678/1480 train_time:101694ms step_avg:152.24ms step:679/1480 train_time:101855ms step_avg:152.25ms step:680/1480 train_time:102017ms step_avg:152.26ms step:681/1480 train_time:102177ms step_avg:152.28ms step:682/1480 train_time:102342ms step_avg:152.29ms step:683/1480 train_time:102503ms step_avg:152.31ms step:684/1480 train_time:102664ms step_avg:152.32ms step:685/1480 train_time:102827ms step_avg:152.34ms step:686/1480 train_time:102990ms step_avg:152.35ms step:687/1480 train_time:103151ms step_avg:152.36ms step:688/1480 train_time:103313ms step_avg:152.38ms step:689/1480 train_time:103476ms step_avg:152.40ms step:690/1480 train_time:103639ms step_avg:152.41ms step:691/1480 train_time:103799ms step_avg:152.42ms step:692/1480 train_time:103961ms step_avg:152.43ms step:693/1480 train_time:104123ms step_avg:152.45ms step:694/1480 train_time:104288ms step_avg:152.47ms step:695/1480 train_time:104449ms step_avg:152.48ms step:696/1480 train_time:104610ms step_avg:152.49ms step:697/1480 train_time:104774ms step_avg:152.51ms step:698/1480 train_time:104934ms step_avg:152.52ms step:699/1480 train_time:105097ms step_avg:152.54ms step:700/1480 train_time:105259ms step_avg:152.55ms step:701/1480 train_time:105418ms step_avg:152.56ms step:702/1480 train_time:105578ms step_avg:152.57ms step:703/1480 train_time:105738ms step_avg:152.58ms step:704/1480 train_time:105897ms step_avg:152.59ms step:705/1480 train_time:106061ms step_avg:152.61ms step:706/1480 train_time:106227ms step_avg:152.63ms step:707/1480 train_time:106389ms step_avg:152.64ms step:708/1480 train_time:106551ms step_avg:152.65ms step:709/1480 train_time:106713ms step_avg:152.66ms step:710/1480 train_time:106873ms step_avg:152.68ms step:711/1480 train_time:107034ms step_avg:152.69ms step:712/1480 train_time:107199ms step_avg:152.70ms step:713/1480 train_time:107361ms step_avg:152.72ms step:714/1480 train_time:107523ms step_avg:152.73ms step:715/1480 train_time:107684ms step_avg:152.74ms step:716/1480 train_time:107846ms step_avg:152.76ms step:717/1480 train_time:108008ms step_avg:152.77ms step:718/1480 train_time:108168ms step_avg:152.78ms step:719/1480 train_time:108328ms step_avg:152.79ms step:720/1480 train_time:108492ms step_avg:152.81ms step:721/1480 train_time:108654ms step_avg:152.82ms step:722/1480 train_time:108815ms step_avg:152.83ms step:723/1480 train_time:108976ms step_avg:152.84ms step:724/1480 train_time:109137ms step_avg:152.85ms step:725/1480 train_time:109300ms step_avg:152.87ms step:726/1480 train_time:109464ms step_avg:152.88ms step:727/1480 train_time:109629ms step_avg:152.90ms step:728/1480 train_time:109790ms step_avg:152.91ms step:729/1480 train_time:109951ms step_avg:152.92ms step:730/1480 train_time:110114ms step_avg:152.94ms step:731/1480 train_time:110274ms step_avg:152.95ms step:732/1480 train_time:110434ms step_avg:152.96ms step:733/1480 train_time:110594ms step_avg:152.97ms step:734/1480 train_time:110754ms step_avg:152.98ms step:735/1480 train_time:110916ms step_avg:152.99ms step:736/1480 train_time:111077ms step_avg:153.00ms step:737/1480 train_time:111236ms step_avg:153.01ms step:738/1480 train_time:111396ms step_avg:153.02ms step:739/1480 train_time:111555ms step_avg:153.03ms step:740/1480 train_time:111719ms step_avg:153.04ms step:741/1480 train_time:111883ms step_avg:153.05ms step:742/1480 train_time:112045ms step_avg:153.07ms step:743/1480 train_time:112208ms step_avg:153.08ms step:744/1480 train_time:112372ms step_avg:153.10ms step:745/1480 train_time:112535ms step_avg:153.11ms step:746/1480 train_time:112695ms step_avg:153.12ms step:747/1480 train_time:112856ms step_avg:153.13ms step:748/1480 train_time:113023ms step_avg:153.15ms step:749/1480 train_time:113189ms step_avg:153.16ms step:750/1480 train_time:113350ms step_avg:153.18ms step:750/1480 val_loss:3.5496 train_time:113413ms step_avg:153.26ms step:751/1480 train_time:113514ms step_avg:153.19ms step:752/1480 train_time:113675ms step_avg:153.20ms step:753/1480 train_time:113836ms step_avg:153.21ms step:754/1480 train_time:113995ms step_avg:153.22ms step:755/1480 train_time:114155ms step_avg:153.23ms step:756/1480 train_time:114316ms step_avg:153.24ms step:757/1480 train_time:114482ms step_avg:153.26ms step:758/1480 train_time:114643ms step_avg:153.27ms step:759/1480 train_time:114806ms step_avg:153.28ms step:760/1480 train_time:114968ms step_avg:153.29ms step:761/1480 train_time:115131ms step_avg:153.30ms step:762/1480 train_time:115292ms step_avg:153.31ms step:763/1480 train_time:115453ms step_avg:153.32ms step:764/1480 train_time:115616ms step_avg:153.34ms step:765/1480 train_time:115776ms step_avg:153.35ms step:766/1480 train_time:115939ms step_avg:153.36ms step:767/1480 train_time:116099ms step_avg:153.37ms step:768/1480 train_time:116259ms step_avg:153.38ms step:769/1480 train_time:116422ms step_avg:153.39ms step:770/1480 train_time:116586ms step_avg:153.40ms step:771/1480 train_time:116751ms step_avg:153.42ms step:772/1480 train_time:116914ms step_avg:153.43ms step:773/1480 train_time:117076ms step_avg:153.44ms step:774/1480 train_time:117238ms step_avg:153.45ms step:775/1480 train_time:117399ms step_avg:153.46ms step:776/1480 train_time:117564ms step_avg:153.48ms step:777/1480 train_time:117731ms step_avg:153.50ms step:778/1480 train_time:117893ms step_avg:153.51ms step:779/1480 train_time:118055ms step_avg:153.52ms step:780/1480 train_time:118218ms step_avg:153.53ms step:781/1480 train_time:118381ms step_avg:153.54ms step:782/1480 train_time:118545ms step_avg:153.56ms step:783/1480 train_time:118707ms step_avg:153.57ms step:784/1480 train_time:118870ms step_avg:153.58ms step:785/1480 train_time:119033ms step_avg:153.59ms step:786/1480 train_time:119197ms step_avg:153.60ms step:787/1480 train_time:119359ms step_avg:153.61ms step:788/1480 train_time:119524ms step_avg:153.63ms step:789/1480 train_time:119686ms step_avg:153.64ms step:790/1480 train_time:119853ms step_avg:153.66ms step:791/1480 train_time:120017ms step_avg:153.67ms step:792/1480 train_time:120182ms step_avg:153.69ms step:793/1480 train_time:120345ms step_avg:153.70ms step:794/1480 train_time:120510ms step_avg:153.71ms step:795/1480 train_time:120674ms step_avg:153.73ms step:796/1480 train_time:120839ms step_avg:153.74ms step:797/1480 train_time:121003ms step_avg:153.75ms step:798/1480 train_time:121167ms step_avg:153.77ms step:799/1480 train_time:121335ms step_avg:153.78ms step:800/1480 train_time:121496ms step_avg:153.79ms step:801/1480 train_time:121660ms step_avg:153.80ms step:802/1480 train_time:121827ms step_avg:153.82ms step:803/1480 train_time:121990ms step_avg:153.83ms step:804/1480 train_time:122153ms step_avg:153.84ms step:805/1480 train_time:122317ms step_avg:153.86ms step:806/1480 train_time:122478ms step_avg:153.87ms step:807/1480 train_time:122639ms step_avg:153.88ms step:808/1480 train_time:122805ms step_avg:153.89ms step:809/1480 train_time:122968ms step_avg:153.90ms step:810/1480 train_time:123131ms step_avg:153.91ms step:811/1480 train_time:123293ms step_avg:153.92ms step:812/1480 train_time:123456ms step_avg:153.94ms step:813/1480 train_time:123617ms step_avg:153.94ms step:814/1480 train_time:123779ms step_avg:153.95ms step:815/1480 train_time:123940ms step_avg:153.96ms step:816/1480 train_time:124106ms step_avg:153.98ms step:817/1480 train_time:124268ms step_avg:153.99ms step:818/1480 train_time:124430ms step_avg:154.00ms step:819/1480 train_time:124593ms step_avg:154.01ms step:820/1480 train_time:124758ms step_avg:154.02ms step:821/1480 train_time:124918ms step_avg:154.03ms step:822/1480 train_time:125084ms step_avg:154.04ms step:823/1480 train_time:125247ms step_avg:154.05ms step:824/1480 train_time:125410ms step_avg:154.07ms step:825/1480 train_time:125575ms step_avg:154.08ms step:826/1480 train_time:125742ms step_avg:154.10ms step:827/1480 train_time:125907ms step_avg:154.11ms step:828/1480 train_time:126070ms step_avg:154.12ms step:829/1480 train_time:126234ms step_avg:154.13ms step:830/1480 train_time:126398ms step_avg:154.14ms step:831/1480 train_time:126562ms step_avg:154.16ms step:832/1480 train_time:126726ms step_avg:154.17ms step:833/1480 train_time:126890ms step_avg:154.18ms step:834/1480 train_time:127054ms step_avg:154.19ms step:835/1480 train_time:127216ms step_avg:154.20ms step:836/1480 train_time:127380ms step_avg:154.21ms step:837/1480 train_time:127542ms step_avg:154.22ms step:838/1480 train_time:127705ms step_avg:154.23ms step:839/1480 train_time:127868ms step_avg:154.24ms step:840/1480 train_time:128031ms step_avg:154.25ms step:841/1480 train_time:128192ms step_avg:154.26ms step:842/1480 train_time:128356ms step_avg:154.27ms step:843/1480 train_time:128517ms step_avg:154.28ms step:844/1480 train_time:128677ms step_avg:154.29ms step:845/1480 train_time:128840ms step_avg:154.30ms step:846/1480 train_time:129004ms step_avg:154.31ms step:847/1480 train_time:129168ms step_avg:154.32ms step:848/1480 train_time:129330ms step_avg:154.33ms step:849/1480 train_time:129491ms step_avg:154.34ms step:850/1480 train_time:129657ms step_avg:154.35ms step:851/1480 train_time:129824ms step_avg:154.37ms step:852/1480 train_time:129987ms step_avg:154.38ms step:853/1480 train_time:130151ms step_avg:154.39ms step:854/1480 train_time:130314ms step_avg:154.40ms step:855/1480 train_time:130477ms step_avg:154.41ms step:856/1480 train_time:130638ms step_avg:154.42ms step:857/1480 train_time:130804ms step_avg:154.43ms step:858/1480 train_time:130969ms step_avg:154.45ms step:859/1480 train_time:131133ms step_avg:154.46ms step:860/1480 train_time:131294ms step_avg:154.46ms step:861/1480 train_time:131460ms step_avg:154.48ms step:862/1480 train_time:131630ms step_avg:154.50ms step:863/1480 train_time:131797ms step_avg:154.51ms step:864/1480 train_time:131959ms step_avg:154.52ms step:865/1480 train_time:132119ms step_avg:154.53ms step:866/1480 train_time:132286ms step_avg:154.54ms step:867/1480 train_time:132449ms step_avg:154.55ms step:868/1480 train_time:132611ms step_avg:154.56ms step:869/1480 train_time:132773ms step_avg:154.57ms step:870/1480 train_time:132937ms step_avg:154.58ms step:871/1480 train_time:133101ms step_avg:154.59ms step:872/1480 train_time:133265ms step_avg:154.60ms step:873/1480 train_time:133428ms step_avg:154.61ms step:874/1480 train_time:133594ms step_avg:154.62ms step:875/1480 train_time:133758ms step_avg:154.63ms step:875/1480 val_loss:3.5037 train_time:133823ms step_avg:154.71ms step:876/1480 train_time:133923ms step_avg:154.65ms step:877/1480 train_time:134088ms step_avg:154.66ms step:878/1480 train_time:134251ms step_avg:154.67ms step:879/1480 train_time:134415ms step_avg:154.68ms step:880/1480 train_time:134579ms step_avg:154.69ms step:881/1480 train_time:134741ms step_avg:154.70ms step:882/1480 train_time:134906ms step_avg:154.71ms step:883/1480 train_time:135071ms step_avg:154.72ms step:884/1480 train_time:135239ms step_avg:154.74ms step:885/1480 train_time:135403ms step_avg:154.75ms step:886/1480 train_time:135569ms step_avg:154.76ms step:887/1480 train_time:135736ms step_avg:154.77ms step:888/1480 train_time:135910ms step_avg:154.79ms step:889/1480 train_time:136078ms step_avg:154.81ms step:890/1480 train_time:136240ms step_avg:154.82ms step:891/1480 train_time:136407ms step_avg:154.83ms step:892/1480 train_time:136571ms step_avg:154.84ms step:893/1480 train_time:136735ms step_avg:154.85ms step:894/1480 train_time:136903ms step_avg:154.87ms step:895/1480 train_time:137069ms step_avg:154.88ms step:896/1480 train_time:137233ms step_avg:154.89ms step:897/1480 train_time:137400ms step_avg:154.90ms step:898/1480 train_time:137566ms step_avg:154.92ms step:899/1480 train_time:137730ms step_avg:154.93ms step:900/1480 train_time:137893ms step_avg:154.94ms step:901/1480 train_time:138059ms step_avg:154.95ms step:902/1480 train_time:138222ms step_avg:154.96ms step:903/1480 train_time:138393ms step_avg:154.98ms step:904/1480 train_time:138560ms step_avg:154.99ms step:905/1480 train_time:138722ms step_avg:155.00ms step:906/1480 train_time:138887ms step_avg:155.01ms step:907/1480 train_time:139056ms step_avg:155.02ms step:908/1480 train_time:139219ms step_avg:155.03ms step:909/1480 train_time:139384ms step_avg:155.04ms step:910/1480 train_time:139555ms step_avg:155.06ms step:911/1480 train_time:139720ms step_avg:155.07ms step:912/1480 train_time:139885ms step_avg:155.08ms step:913/1480 train_time:140051ms step_avg:155.10ms step:914/1480 train_time:140219ms step_avg:155.11ms step:915/1480 train_time:140387ms step_avg:155.12ms step:916/1480 train_time:140552ms step_avg:155.13ms step:917/1480 train_time:140715ms step_avg:155.14ms step:918/1480 train_time:140883ms step_avg:155.16ms step:919/1480 train_time:141053ms step_avg:155.17ms step:920/1480 train_time:141219ms step_avg:155.19ms step:921/1480 train_time:141384ms step_avg:155.20ms step:922/1480 train_time:141552ms step_avg:155.21ms step:923/1480 train_time:141715ms step_avg:155.22ms step:924/1480 train_time:141881ms step_avg:155.23ms step:925/1480 train_time:142045ms step_avg:155.24ms step:926/1480 train_time:142206ms step_avg:155.25ms step:927/1480 train_time:142371ms step_avg:155.26ms step:928/1480 train_time:142537ms step_avg:155.27ms step:929/1480 train_time:142702ms step_avg:155.28ms step:930/1480 train_time:142868ms step_avg:155.29ms step:931/1480 train_time:143031ms step_avg:155.30ms step:932/1480 train_time:143198ms step_avg:155.31ms step:933/1480 train_time:143365ms step_avg:155.32ms step:934/1480 train_time:143530ms step_avg:155.34ms step:935/1480 train_time:143702ms step_avg:155.35ms step:936/1480 train_time:143869ms step_avg:155.37ms step:937/1480 train_time:144039ms step_avg:155.38ms step:938/1480 train_time:144203ms step_avg:155.39ms step:939/1480 train_time:144373ms step_avg:155.41ms step:940/1480 train_time:144540ms step_avg:155.42ms step:941/1480 train_time:144704ms step_avg:155.43ms step:942/1480 train_time:144868ms step_avg:155.44ms step:943/1480 train_time:145039ms step_avg:155.45ms step:944/1480 train_time:145211ms step_avg:155.47ms step:945/1480 train_time:145376ms step_avg:155.48ms step:946/1480 train_time:145543ms step_avg:155.50ms step:947/1480 train_time:145710ms step_avg:155.51ms step:948/1480 train_time:145878ms step_avg:155.52ms step:949/1480 train_time:146044ms step_avg:155.53ms step:950/1480 train_time:146206ms step_avg:155.54ms step:951/1480 train_time:146375ms step_avg:155.55ms step:952/1480 train_time:146542ms step_avg:155.56ms step:953/1480 train_time:146710ms step_avg:155.58ms step:954/1480 train_time:146880ms step_avg:155.59ms step:955/1480 train_time:147043ms step_avg:155.60ms step:956/1480 train_time:147209ms step_avg:155.61ms step:957/1480 train_time:147378ms step_avg:155.63ms step:958/1480 train_time:147547ms step_avg:155.64ms step:959/1480 train_time:147710ms step_avg:155.65ms step:960/1480 train_time:147878ms step_avg:155.66ms step:961/1480 train_time:148043ms step_avg:155.67ms step:962/1480 train_time:148207ms step_avg:155.68ms step:963/1480 train_time:148373ms step_avg:155.69ms step:964/1480 train_time:148542ms step_avg:155.70ms step:965/1480 train_time:148706ms step_avg:155.71ms step:966/1480 train_time:148871ms step_avg:155.72ms step:967/1480 train_time:149035ms step_avg:155.73ms step:968/1480 train_time:149201ms step_avg:155.74ms step:969/1480 train_time:149366ms step_avg:155.75ms step:970/1480 train_time:149527ms step_avg:155.76ms step:971/1480 train_time:149691ms step_avg:155.77ms step:972/1480 train_time:149856ms step_avg:155.78ms step:973/1480 train_time:150021ms step_avg:155.78ms step:974/1480 train_time:150189ms step_avg:155.80ms step:975/1480 train_time:150355ms step_avg:155.81ms step:976/1480 train_time:150521ms step_avg:155.82ms step:977/1480 train_time:150685ms step_avg:155.83ms step:978/1480 train_time:150850ms step_avg:155.84ms step:979/1480 train_time:151015ms step_avg:155.85ms step:980/1480 train_time:151181ms step_avg:155.86ms step:981/1480 train_time:151349ms step_avg:155.87ms step:982/1480 train_time:151512ms step_avg:155.88ms step:983/1480 train_time:151680ms step_avg:155.89ms step:984/1480 train_time:151844ms step_avg:155.90ms step:985/1480 train_time:152010ms step_avg:155.91ms step:986/1480 train_time:152175ms step_avg:155.92ms step:987/1480 train_time:152341ms step_avg:155.93ms step:988/1480 train_time:152507ms step_avg:155.94ms step:989/1480 train_time:152674ms step_avg:155.95ms step:990/1480 train_time:152843ms step_avg:155.96ms step:991/1480 train_time:153009ms step_avg:155.97ms step:992/1480 train_time:153186ms step_avg:155.99ms step:993/1480 train_time:153363ms step_avg:156.02ms step:994/1480 train_time:153527ms step_avg:156.02ms step:995/1480 train_time:153691ms step_avg:156.03ms step:996/1480 train_time:153855ms step_avg:156.04ms step:997/1480 train_time:154020ms step_avg:156.05ms step:998/1480 train_time:154183ms step_avg:156.06ms step:999/1480 train_time:154349ms step_avg:156.07ms step:1000/1480 train_time:154519ms step_avg:156.08ms step:1000/1480 val_loss:3.4403 train_time:154586ms step_avg:156.15ms step:1001/1480 train_time:154690ms step_avg:156.09ms step:1002/1480 train_time:154856ms step_avg:156.11ms step:1003/1480 train_time:155029ms step_avg:156.12ms step:1004/1480 train_time:155198ms step_avg:156.13ms step:1005/1480 train_time:155365ms step_avg:156.15ms step:1006/1480 train_time:155534ms step_avg:156.16ms step:1007/1480 train_time:155700ms step_avg:156.17ms step:1008/1480 train_time:155866ms step_avg:156.18ms step:1009/1480 train_time:156039ms step_avg:156.20ms step:1010/1480 train_time:156204ms step_avg:156.20ms step:1011/1480 train_time:156371ms step_avg:156.21ms step:1012/1480 train_time:156537ms step_avg:156.22ms step:1013/1480 train_time:156706ms step_avg:156.24ms step:1014/1480 train_time:156873ms step_avg:156.25ms step:1015/1480 train_time:157042ms step_avg:156.26ms step:1016/1480 train_time:157209ms step_avg:156.27ms step:1017/1480 train_time:157379ms step_avg:156.29ms step:1018/1480 train_time:157546ms step_avg:156.30ms step:1019/1480 train_time:157716ms step_avg:156.31ms step:1020/1480 train_time:157885ms step_avg:156.32ms step:1021/1480 train_time:158050ms step_avg:156.33ms step:1022/1480 train_time:158219ms step_avg:156.34ms step:1023/1480 train_time:158387ms step_avg:156.35ms step:1024/1480 train_time:158554ms step_avg:156.36ms step:1025/1480 train_time:158724ms step_avg:156.38ms step:1026/1480 train_time:158890ms step_avg:156.39ms step:1027/1480 train_time:159056ms step_avg:156.40ms step:1028/1480 train_time:159229ms step_avg:156.41ms step:1029/1480 train_time:159403ms step_avg:156.43ms step:1030/1480 train_time:159570ms step_avg:156.44ms step:1031/1480 train_time:159736ms step_avg:156.45ms step:1032/1480 train_time:159907ms step_avg:156.46ms step:1033/1480 train_time:160073ms step_avg:156.47ms step:1034/1480 train_time:160242ms step_avg:156.49ms step:1035/1480 train_time:160412ms step_avg:156.50ms step:1036/1480 train_time:160577ms step_avg:156.51ms step:1037/1480 train_time:160744ms step_avg:156.52ms step:1038/1480 train_time:160913ms step_avg:156.53ms step:1039/1480 train_time:161082ms step_avg:156.54ms step:1040/1480 train_time:161249ms step_avg:156.55ms step:1041/1480 train_time:161417ms step_avg:156.56ms step:1042/1480 train_time:161581ms step_avg:156.57ms step:1043/1480 train_time:161746ms step_avg:156.58ms step:1044/1480 train_time:161911ms step_avg:156.59ms step:1045/1480 train_time:162079ms step_avg:156.60ms step:1046/1480 train_time:162245ms step_avg:156.61ms step:1047/1480 train_time:162413ms step_avg:156.62ms step:1048/1480 train_time:162579ms step_avg:156.63ms step:1049/1480 train_time:162744ms step_avg:156.64ms step:1050/1480 train_time:162916ms step_avg:156.65ms step:1051/1480 train_time:163084ms step_avg:156.66ms step:1052/1480 train_time:163252ms step_avg:156.67ms step:1053/1480 train_time:163418ms step_avg:156.68ms step:1054/1480 train_time:163585ms step_avg:156.69ms step:1055/1480 train_time:163750ms step_avg:156.70ms step:1056/1480 train_time:163916ms step_avg:156.71ms step:1057/1480 train_time:164081ms step_avg:156.72ms step:1058/1480 train_time:164251ms step_avg:156.73ms step:1059/1480 train_time:164424ms step_avg:156.74ms step:1060/1480 train_time:164594ms step_avg:156.76ms step:1061/1480 train_time:164757ms step_avg:156.76ms step:1062/1480 train_time:164923ms step_avg:156.77ms step:1063/1480 train_time:165088ms step_avg:156.78ms step:1064/1480 train_time:165250ms step_avg:156.78ms step:1065/1480 train_time:165418ms step_avg:156.79ms step:1066/1480 train_time:165586ms step_avg:156.81ms step:1067/1480 train_time:165756ms step_avg:156.82ms step:1068/1480 train_time:165921ms step_avg:156.83ms step:1069/1480 train_time:166095ms step_avg:156.84ms step:1070/1480 train_time:166261ms step_avg:156.85ms step:1071/1480 train_time:166435ms step_avg:156.87ms step:1072/1480 train_time:166602ms step_avg:156.88ms step:1073/1480 train_time:166765ms step_avg:156.88ms step:1074/1480 train_time:166931ms step_avg:156.89ms step:1075/1480 train_time:167101ms step_avg:156.90ms step:1076/1480 train_time:167268ms step_avg:156.91ms step:1077/1480 train_time:167435ms step_avg:156.92ms step:1078/1480 train_time:167608ms step_avg:156.94ms step:1079/1480 train_time:167780ms step_avg:156.95ms step:1080/1480 train_time:167948ms step_avg:156.96ms step:1081/1480 train_time:168115ms step_avg:156.97ms step:1082/1480 train_time:168280ms step_avg:156.98ms step:1083/1480 train_time:168446ms step_avg:156.99ms step:1084/1480 train_time:168615ms step_avg:157.00ms step:1085/1480 train_time:168782ms step_avg:157.01ms step:1086/1480 train_time:168950ms step_avg:157.02ms step:1087/1480 train_time:169117ms step_avg:157.03ms step:1088/1480 train_time:169286ms step_avg:157.04ms step:1089/1480 train_time:169458ms step_avg:157.05ms step:1090/1480 train_time:169629ms step_avg:157.06ms step:1091/1480 train_time:169797ms step_avg:157.07ms step:1092/1480 train_time:169964ms step_avg:157.08ms step:1093/1480 train_time:170135ms step_avg:157.10ms step:1094/1480 train_time:170301ms step_avg:157.10ms step:1095/1480 train_time:170464ms step_avg:157.11ms step:1096/1480 train_time:170634ms step_avg:157.12ms step:1097/1480 train_time:170802ms step_avg:157.13ms step:1098/1480 train_time:170973ms step_avg:157.14ms step:1099/1480 train_time:171143ms step_avg:157.16ms step:1100/1480 train_time:171316ms step_avg:157.17ms step:1101/1480 train_time:171485ms step_avg:157.18ms step:1102/1480 train_time:171656ms step_avg:157.19ms step:1103/1480 train_time:171832ms step_avg:157.21ms step:1104/1480 train_time:172001ms step_avg:157.22ms step:1105/1480 train_time:172170ms step_avg:157.23ms step:1106/1480 train_time:172339ms step_avg:157.24ms step:1107/1480 train_time:172509ms step_avg:157.25ms step:1108/1480 train_time:172674ms step_avg:157.26ms step:1109/1480 train_time:172840ms step_avg:157.27ms step:1110/1480 train_time:173006ms step_avg:157.28ms step:1111/1480 train_time:173174ms step_avg:157.29ms step:1112/1480 train_time:173344ms step_avg:157.30ms step:1113/1480 train_time:173524ms step_avg:157.32ms step:1114/1480 train_time:173697ms step_avg:157.33ms step:1115/1480 train_time:173869ms step_avg:157.35ms step:1116/1480 train_time:174036ms step_avg:157.36ms step:1117/1480 train_time:174208ms step_avg:157.37ms step:1118/1480 train_time:174382ms step_avg:157.38ms step:1119/1480 train_time:174547ms step_avg:157.39ms step:1120/1480 train_time:174716ms step_avg:157.40ms step:1121/1480 train_time:174884ms step_avg:157.41ms step:1122/1480 train_time:175050ms step_avg:157.42ms step:1123/1480 train_time:175217ms step_avg:157.43ms step:1124/1480 train_time:175385ms step_avg:157.44ms step:1125/1480 train_time:175553ms step_avg:157.45ms step:1125/1480 val_loss:3.3853 train_time:175621ms step_avg:157.51ms step:1126/1480 train_time:175722ms step_avg:157.46ms step:1127/1480 train_time:175891ms step_avg:157.47ms step:1128/1480 train_time:176063ms step_avg:157.48ms step:1129/1480 train_time:176238ms step_avg:157.50ms step:1130/1480 train_time:176408ms step_avg:157.51ms step:1131/1480 train_time:176586ms step_avg:157.53ms step:1132/1480 train_time:176752ms step_avg:157.53ms step:1133/1480 train_time:176923ms step_avg:157.55ms step:1134/1480 train_time:177094ms step_avg:157.56ms step:1135/1480 train_time:177262ms step_avg:157.57ms step:1136/1480 train_time:177433ms step_avg:157.58ms step:1137/1480 train_time:177602ms step_avg:157.59ms step:1138/1480 train_time:177774ms step_avg:157.60ms step:1139/1480 train_time:177942ms step_avg:157.61ms step:1140/1480 train_time:178110ms step_avg:157.62ms step:1141/1480 train_time:178282ms step_avg:157.63ms step:1142/1480 train_time:178450ms step_avg:157.64ms step:1143/1480 train_time:178620ms step_avg:157.65ms step:1144/1480 train_time:178789ms step_avg:157.66ms step:1145/1480 train_time:178955ms step_avg:157.67ms step:1146/1480 train_time:179126ms step_avg:157.68ms step:1147/1480 train_time:179296ms step_avg:157.69ms step:1148/1480 train_time:179466ms step_avg:157.70ms step:1149/1480 train_time:179637ms step_avg:157.71ms step:1150/1480 train_time:179804ms step_avg:157.72ms step:1151/1480 train_time:179977ms step_avg:157.74ms step:1152/1480 train_time:180148ms step_avg:157.75ms step:1153/1480 train_time:180321ms step_avg:157.76ms step:1154/1480 train_time:180488ms step_avg:157.77ms step:1155/1480 train_time:180660ms step_avg:157.78ms step:1156/1480 train_time:180838ms step_avg:157.80ms step:1157/1480 train_time:181009ms step_avg:157.81ms step:1158/1480 train_time:181176ms step_avg:157.82ms step:1159/1480 train_time:181343ms step_avg:157.83ms step:1160/1480 train_time:181509ms step_avg:157.83ms step:1161/1480 train_time:181681ms step_avg:157.85ms step:1162/1480 train_time:181851ms step_avg:157.86ms step:1163/1480 train_time:182021ms step_avg:157.87ms step:1164/1480 train_time:182190ms step_avg:157.88ms step:1165/1480 train_time:182357ms step_avg:157.88ms step:1166/1480 train_time:182525ms step_avg:157.89ms step:1167/1480 train_time:182693ms step_avg:157.90ms step:1168/1480 train_time:182861ms step_avg:157.91ms step:1169/1480 train_time:183029ms step_avg:157.92ms step:1170/1480 train_time:183196ms step_avg:157.93ms step:1171/1480 train_time:183363ms step_avg:157.94ms step:1172/1480 train_time:183530ms step_avg:157.94ms step:1173/1480 train_time:183701ms step_avg:157.95ms step:1174/1480 train_time:183883ms step_avg:157.98ms step:1175/1480 train_time:184056ms step_avg:157.99ms step:1176/1480 train_time:184229ms step_avg:158.00ms step:1177/1480 train_time:184404ms step_avg:158.02ms step:1178/1480 train_time:184573ms step_avg:158.02ms step:1179/1480 train_time:184739ms step_avg:158.03ms step:1180/1480 train_time:184920ms step_avg:158.05ms step:1181/1480 train_time:185090ms step_avg:158.06ms step:1182/1480 train_time:185258ms step_avg:158.07ms step:1183/1480 train_time:185428ms step_avg:158.08ms step:1184/1480 train_time:185596ms step_avg:158.09ms step:1185/1480 train_time:185768ms step_avg:158.10ms step:1186/1480 train_time:185939ms step_avg:158.11ms step:1187/1480 train_time:186121ms step_avg:158.13ms step:1188/1480 train_time:186287ms step_avg:158.14ms step:1189/1480 train_time:186460ms step_avg:158.15ms step:1190/1480 train_time:186628ms step_avg:158.16ms step:1191/1480 train_time:186799ms step_avg:158.17ms step:1192/1480 train_time:186964ms step_avg:158.18ms step:1193/1480 train_time:187132ms step_avg:158.18ms step:1194/1480 train_time:187302ms step_avg:158.19ms step:1195/1480 train_time:187476ms step_avg:158.21ms step:1196/1480 train_time:187657ms step_avg:158.23ms step:1197/1480 train_time:187827ms step_avg:158.24ms step:1198/1480 train_time:188010ms step_avg:158.26ms step:1199/1480 train_time:188180ms step_avg:158.27ms step:1200/1480 train_time:188349ms step_avg:158.28ms step:1201/1480 train_time:188517ms step_avg:158.28ms step:1202/1480 train_time:188698ms step_avg:158.30ms step:1203/1480 train_time:188875ms step_avg:158.32ms step:1204/1480 train_time:189050ms step_avg:158.33ms step:1205/1480 train_time:189219ms step_avg:158.34ms step:1206/1480 train_time:189386ms step_avg:158.35ms step:1207/1480 train_time:189556ms step_avg:158.36ms step:1208/1480 train_time:189723ms step_avg:158.37ms step:1209/1480 train_time:189897ms step_avg:158.38ms step:1210/1480 train_time:190073ms step_avg:158.39ms step:1211/1480 train_time:190247ms step_avg:158.41ms step:1212/1480 train_time:190420ms step_avg:158.42ms step:1213/1480 train_time:190592ms step_avg:158.43ms step:1214/1480 train_time:190768ms step_avg:158.45ms step:1215/1480 train_time:190941ms step_avg:158.46ms step:1216/1480 train_time:191109ms step_avg:158.47ms step:1217/1480 train_time:191283ms step_avg:158.48ms step:1218/1480 train_time:191453ms step_avg:158.49ms step:1219/1480 train_time:191631ms step_avg:158.50ms step:1220/1480 train_time:191800ms step_avg:158.51ms step:1221/1480 train_time:191970ms step_avg:158.52ms step:1222/1480 train_time:192138ms step_avg:158.53ms step:1223/1480 train_time:192308ms step_avg:158.54ms step:1224/1480 train_time:192484ms step_avg:158.55ms step:1225/1480 train_time:192656ms step_avg:158.56ms step:1226/1480 train_time:192829ms step_avg:158.58ms step:1227/1480 train_time:193001ms step_avg:158.59ms step:1228/1480 train_time:193170ms step_avg:158.60ms step:1229/1480 train_time:193343ms step_avg:158.61ms step:1230/1480 train_time:193523ms step_avg:158.63ms step:1231/1480 train_time:193698ms step_avg:158.64ms step:1232/1480 train_time:193872ms step_avg:158.65ms step:1233/1480 train_time:194042ms step_avg:158.66ms step:1234/1480 train_time:194213ms step_avg:158.67ms step:1235/1480 train_time:194388ms step_avg:158.68ms step:1236/1480 train_time:194557ms step_avg:158.69ms step:1237/1480 train_time:194727ms step_avg:158.70ms step:1238/1480 train_time:194913ms step_avg:158.72ms step:1239/1480 train_time:195084ms step_avg:158.73ms step:1240/1480 train_time:195256ms step_avg:158.74ms step:1241/1480 train_time:195431ms step_avg:158.76ms step:1242/1480 train_time:195601ms step_avg:158.77ms step:1243/1480 train_time:195776ms step_avg:158.78ms step:1244/1480 train_time:195942ms step_avg:158.79ms step:1245/1480 train_time:196111ms step_avg:158.79ms step:1246/1480 train_time:196282ms step_avg:158.80ms step:1247/1480 train_time:196452ms step_avg:158.81ms step:1248/1480 train_time:196621ms step_avg:158.82ms step:1249/1480 train_time:196788ms step_avg:158.83ms step:1250/1480 train_time:196958ms step_avg:158.84ms step:1250/1480 val_loss:3.3347 train_time:197030ms step_avg:158.90ms step:1251/1480 train_time:197139ms step_avg:158.86ms step:1252/1480 train_time:197308ms step_avg:158.86ms step:1253/1480 train_time:197478ms step_avg:158.87ms step:1254/1480 train_time:197648ms step_avg:158.88ms step:1255/1480 train_time:197836ms step_avg:158.90ms step:1256/1480 train_time:198009ms step_avg:158.92ms step:1257/1480 train_time:198180ms step_avg:158.93ms step:1258/1480 train_time:198356ms step_avg:158.94ms step:1259/1480 train_time:198527ms step_avg:158.95ms step:1260/1480 train_time:198694ms step_avg:158.95ms step:1261/1480 train_time:198867ms step_avg:158.97ms step:1262/1480 train_time:199042ms step_avg:158.98ms step:1263/1480 train_time:199216ms step_avg:158.99ms step:1264/1480 train_time:199382ms step_avg:159.00ms step:1265/1480 train_time:199548ms step_avg:159.00ms step:1266/1480 train_time:199719ms step_avg:159.01ms step:1267/1480 train_time:199889ms step_avg:159.02ms step:1268/1480 train_time:200062ms step_avg:159.03ms step:1269/1480 train_time:200237ms step_avg:159.04ms step:1270/1480 train_time:200406ms step_avg:159.05ms step:1271/1480 train_time:200577ms step_avg:159.06ms step:1272/1480 train_time:200743ms step_avg:159.07ms step:1273/1480 train_time:200914ms step_avg:159.08ms step:1274/1480 train_time:201087ms step_avg:159.09ms step:1275/1480 train_time:201255ms step_avg:159.09ms step:1276/1480 train_time:201422ms step_avg:159.10ms step:1277/1480 train_time:201594ms step_avg:159.11ms step:1278/1480 train_time:201762ms step_avg:159.12ms step:1279/1480 train_time:201933ms step_avg:159.13ms step:1280/1480 train_time:202111ms step_avg:159.14ms step:1281/1480 train_time:202280ms step_avg:159.15ms step:1282/1480 train_time:202446ms step_avg:159.16ms step:1283/1480 train_time:202617ms step_avg:159.17ms step:1284/1480 train_time:202786ms step_avg:159.17ms step:1285/1480 train_time:202956ms step_avg:159.18ms step:1286/1480 train_time:203125ms step_avg:159.19ms step:1287/1480 train_time:203298ms step_avg:159.20ms step:1288/1480 train_time:203469ms step_avg:159.21ms step:1289/1480 train_time:203651ms step_avg:159.23ms step:1290/1480 train_time:203828ms step_avg:159.24ms step:1291/1480 train_time:204002ms step_avg:159.25ms step:1292/1480 train_time:204177ms step_avg:159.26ms step:1293/1480 train_time:204350ms step_avg:159.28ms step:1294/1480 train_time:204522ms step_avg:159.29ms step:1295/1480 train_time:204692ms step_avg:159.29ms step:1296/1480 train_time:204866ms step_avg:159.30ms step:1297/1480 train_time:205038ms step_avg:159.31ms step:1298/1480 train_time:205208ms step_avg:159.32ms step:1299/1480 train_time:205380ms step_avg:159.33ms step:1300/1480 train_time:205548ms step_avg:159.34ms step:1301/1480 train_time:205715ms step_avg:159.35ms step:1302/1480 train_time:205889ms step_avg:159.36ms step:1303/1480 train_time:206067ms step_avg:159.37ms step:1304/1480 train_time:206241ms step_avg:159.38ms step:1305/1480 train_time:206409ms step_avg:159.39ms step:1306/1480 train_time:206585ms step_avg:159.40ms step:1307/1480 train_time:206752ms step_avg:159.41ms step:1308/1480 train_time:206920ms step_avg:159.41ms step:1309/1480 train_time:207091ms step_avg:159.42ms step:1310/1480 train_time:207262ms step_avg:159.43ms step:1311/1480 train_time:207430ms step_avg:159.44ms step:1312/1480 train_time:207604ms step_avg:159.45ms step:1313/1480 train_time:207775ms step_avg:159.46ms step:1314/1480 train_time:207947ms step_avg:159.47ms step:1315/1480 train_time:208118ms step_avg:159.48ms step:1316/1480 train_time:208285ms step_avg:159.48ms step:1317/1480 train_time:208457ms step_avg:159.49ms step:1318/1480 train_time:208638ms step_avg:159.51ms step:1319/1480 train_time:208814ms step_avg:159.52ms step:1320/1480 train_time:208990ms step_avg:159.53ms step:1321/1480 train_time:209163ms step_avg:159.54ms step:1322/1480 train_time:209343ms step_avg:159.56ms step:1323/1480 train_time:209516ms step_avg:159.57ms step:1324/1480 train_time:209690ms step_avg:159.58ms step:1325/1480 train_time:209871ms step_avg:159.60ms step:1326/1480 train_time:210047ms step_avg:159.61ms step:1327/1480 train_time:210217ms step_avg:159.62ms step:1328/1480 train_time:210388ms step_avg:159.63ms step:1329/1480 train_time:210586ms step_avg:159.66ms step:1330/1480 train_time:210766ms step_avg:159.67ms step:1331/1480 train_time:210935ms step_avg:159.68ms step:1332/1480 train_time:211110ms step_avg:159.69ms step:1333/1480 train_time:211284ms step_avg:159.70ms step:1334/1480 train_time:211456ms step_avg:159.71ms step:1335/1480 train_time:211625ms step_avg:159.72ms step:1336/1480 train_time:211809ms step_avg:159.74ms step:1337/1480 train_time:211984ms step_avg:159.75ms step:1338/1480 train_time:212155ms step_avg:159.76ms step:1339/1480 train_time:212329ms step_avg:159.77ms step:1340/1480 train_time:212501ms step_avg:159.78ms step:1341/1480 train_time:212669ms step_avg:159.78ms step:1342/1480 train_time:212844ms step_avg:159.79ms step:1343/1480 train_time:213014ms step_avg:159.80ms step:1344/1480 train_time:213186ms step_avg:159.81ms step:1345/1480 train_time:213363ms step_avg:159.82ms step:1346/1480 train_time:213530ms step_avg:159.83ms step:1347/1480 train_time:213701ms step_avg:159.84ms step:1348/1480 train_time:213870ms step_avg:159.84ms step:1349/1480 train_time:214040ms step_avg:159.85ms step:1350/1480 train_time:214214ms step_avg:159.86ms step:1351/1480 train_time:214385ms step_avg:159.87ms step:1352/1480 train_time:214556ms step_avg:159.88ms step:1353/1480 train_time:214733ms step_avg:159.89ms step:1354/1480 train_time:214904ms step_avg:159.90ms step:1355/1480 train_time:215073ms step_avg:159.91ms step:1356/1480 train_time:215247ms step_avg:159.92ms step:1357/1480 train_time:215421ms step_avg:159.93ms step:1358/1480 train_time:215593ms step_avg:159.94ms step:1359/1480 train_time:215765ms step_avg:159.94ms step:1360/1480 train_time:215940ms step_avg:159.96ms step:1361/1480 train_time:216116ms step_avg:159.97ms step:1362/1480 train_time:216291ms step_avg:159.98ms step:1363/1480 train_time:216471ms step_avg:159.99ms step:1364/1480 train_time:216641ms step_avg:160.00ms step:1365/1480 train_time:216808ms step_avg:160.01ms step:1366/1480 train_time:216981ms step_avg:160.02ms step:1367/1480 train_time:217150ms step_avg:160.02ms step:1368/1480 train_time:217323ms step_avg:160.03ms step:1369/1480 train_time:217506ms step_avg:160.05ms step:1370/1480 train_time:217683ms step_avg:160.06ms step:1371/1480 train_time:217854ms step_avg:160.07ms step:1372/1480 train_time:218032ms step_avg:160.08ms step:1373/1480 train_time:218201ms step_avg:160.09ms step:1374/1480 train_time:218375ms step_avg:160.10ms step:1375/1480 train_time:218546ms step_avg:160.11ms step:1375/1480 val_loss:3.2960 train_time:218614ms step_avg:160.16ms step:1376/1480 train_time:218721ms step_avg:160.12ms step:1377/1480 train_time:218896ms step_avg:160.13ms step:1378/1480 train_time:219064ms step_avg:160.13ms step:1379/1480 train_time:219238ms step_avg:160.14ms step:1380/1480 train_time:219412ms step_avg:160.15ms step:1381/1480 train_time:219595ms step_avg:160.17ms step:1382/1480 train_time:219765ms step_avg:160.18ms step:1383/1480 train_time:219938ms step_avg:160.19ms step:1384/1480 train_time:220117ms step_avg:160.20ms step:1385/1480 train_time:220282ms step_avg:160.21ms step:1386/1480 train_time:220452ms step_avg:160.21ms step:1387/1480 train_time:220623ms step_avg:160.22ms step:1388/1480 train_time:220793ms step_avg:160.23ms step:1389/1480 train_time:220966ms step_avg:160.24ms step:1390/1480 train_time:221135ms step_avg:160.24ms step:1391/1480 train_time:221304ms step_avg:160.25ms step:1392/1480 train_time:221478ms step_avg:160.26ms step:1393/1480 train_time:221649ms step_avg:160.27ms step:1394/1480 train_time:221819ms step_avg:160.27ms step:1395/1480 train_time:221986ms step_avg:160.28ms step:1396/1480 train_time:222155ms step_avg:160.29ms step:1397/1480 train_time:222322ms step_avg:160.29ms step:1398/1480 train_time:222489ms step_avg:160.29ms step:1399/1480 train_time:222659ms step_avg:160.30ms step:1400/1480 train_time:222835ms step_avg:160.31ms step:1401/1480 train_time:223001ms step_avg:160.32ms step:1402/1480 train_time:223172ms step_avg:160.33ms step:1403/1480 train_time:223349ms step_avg:160.34ms step:1404/1480 train_time:223520ms step_avg:160.34ms step:1405/1480 train_time:223697ms step_avg:160.36ms step:1406/1480 train_time:223870ms step_avg:160.37ms step:1407/1480 train_time:224038ms step_avg:160.37ms step:1408/1480 train_time:224206ms step_avg:160.38ms step:1409/1480 train_time:224391ms step_avg:160.39ms step:1410/1480 train_time:224560ms step_avg:160.40ms step:1411/1480 train_time:224727ms step_avg:160.40ms step:1412/1480 train_time:224898ms step_avg:160.41ms step:1413/1480 train_time:225067ms step_avg:160.42ms step:1414/1480 train_time:225239ms step_avg:160.43ms step:1415/1480 train_time:225414ms step_avg:160.44ms step:1416/1480 train_time:225602ms step_avg:160.46ms step:1417/1480 train_time:225776ms step_avg:160.47ms step:1418/1480 train_time:225946ms step_avg:160.47ms step:1419/1480 train_time:226120ms step_avg:160.48ms step:1420/1480 train_time:226298ms step_avg:160.49ms step:1421/1480 train_time:226470ms step_avg:160.50ms step:1422/1480 train_time:226640ms step_avg:160.51ms step:1423/1480 train_time:226811ms step_avg:160.52ms step:1424/1480 train_time:226988ms step_avg:160.53ms step:1425/1480 train_time:227168ms step_avg:160.54ms step:1426/1480 train_time:227339ms step_avg:160.55ms step:1427/1480 train_time:227515ms step_avg:160.56ms step:1428/1480 train_time:227685ms step_avg:160.57ms step:1429/1480 train_time:227854ms step_avg:160.57ms step:1430/1480 train_time:228027ms step_avg:160.58ms step:1431/1480 train_time:228203ms step_avg:160.59ms step:1432/1480 train_time:228380ms step_avg:160.60ms step:1433/1480 train_time:228560ms step_avg:160.62ms step:1434/1480 train_time:228740ms step_avg:160.63ms step:1435/1480 train_time:228915ms step_avg:160.64ms step:1436/1480 train_time:229089ms step_avg:160.65ms step:1437/1480 train_time:229259ms step_avg:160.66ms step:1438/1480 train_time:229428ms step_avg:160.66ms step:1439/1480 train_time:229601ms step_avg:160.67ms step:1440/1480 train_time:229772ms step_avg:160.68ms step:1441/1480 train_time:229943ms step_avg:160.69ms step:1442/1480 train_time:230120ms step_avg:160.70ms step:1443/1480 train_time:230307ms step_avg:160.72ms step:1444/1480 train_time:230477ms step_avg:160.72ms step:1445/1480 train_time:230646ms step_avg:160.73ms step:1446/1480 train_time:230822ms step_avg:160.74ms step:1447/1480 train_time:231000ms step_avg:160.75ms step:1448/1480 train_time:231173ms step_avg:160.76ms step:1449/1480 train_time:231345ms step_avg:160.77ms step:1450/1480 train_time:231520ms step_avg:160.78ms step:1451/1480 train_time:231691ms step_avg:160.78ms step:1452/1480 train_time:231864ms step_avg:160.79ms step:1453/1480 train_time:232034ms step_avg:160.80ms step:1454/1480 train_time:232207ms step_avg:160.81ms step:1455/1480 train_time:232386ms step_avg:160.82ms step:1456/1480 train_time:232559ms step_avg:160.83ms step:1457/1480 train_time:232730ms step_avg:160.84ms step:1458/1480 train_time:232901ms step_avg:160.84ms step:1459/1480 train_time:233079ms step_avg:160.85ms step:1460/1480 train_time:233250ms step_avg:160.86ms step:1461/1480 train_time:233424ms step_avg:160.87ms step:1462/1480 train_time:233595ms step_avg:160.88ms step:1463/1480 train_time:233771ms step_avg:160.89ms step:1464/1480 train_time:233944ms step_avg:160.90ms step:1465/1480 train_time:234117ms step_avg:160.90ms step:1466/1480 train_time:234287ms step_avg:160.91ms step:1467/1480 train_time:234460ms step_avg:160.92ms step:1468/1480 train_time:234630ms step_avg:160.93ms step:1469/1480 train_time:234804ms step_avg:160.93ms step:1470/1480 train_time:234983ms step_avg:160.95ms step:1471/1480 train_time:235168ms step_avg:160.96ms step:1472/1480 train_time:235346ms step_avg:160.98ms step:1473/1480 train_time:235516ms step_avg:160.98ms step:1474/1480 train_time:235696ms step_avg:160.99ms step:1475/1480 train_time:235876ms step_avg:161.01ms step:1476/1480 train_time:236048ms step_avg:161.02ms step:1477/1480 train_time:236234ms step_avg:161.03ms step:1478/1480 train_time:236419ms step_avg:161.05ms step:1479/1480 train_time:236594ms step_avg:161.06ms step:1480/1480 train_time:236765ms step_avg:161.06ms step:1480/1480 val_loss:3.2773 train_time:236836ms step_avg:161.11ms