import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """Apply RMS normalization over the last dimension of ``x`` (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        # the stored weight keeps its own dtype; only the copy used in the matmul is cast
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        """Rotate ``x``; dim 1 is taken as the sequence axis and dim 3 is split in two halves."""
        seq_len = x.shape[1]
        # rebuild the tables only when the sequence length changes
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """Attend over ``x`` under ``block_mask``, mixing ``vi`` into the values.

        ``x`` must have batch size 1. ``vi`` is reshaped to match the per-head value tensor.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # learned blend of this layer's values with the token value embedding
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # transpose(1, 2) moves the head axis ahead of the sequence axis for flex_attention
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block with 4x expansion and a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix with the initial embedding, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to [1, 0]: the block starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Architecture hyperparameters consumed by ``GPT`` and ``Block``."""
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) with tanh

class GPT(nn.Module):
    """GPT model with U-net style skip connections and token value embeddings."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """Return the cross-entropy loss of predicting ``target`` from ``idx``.

        ``idx`` must be 1-D. Attention is restricted to be causal, within the same document,
        and within ``sliding_window`` positions. The reshape by BLOCK_SIZE below requires
        ``len(idx)`` to be a multiple of 128.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token 50256 gives each position a document id
        # NOTE(review): 50256 is presumably the GPT-2 end-of-text token - confirm against the tokenizer
        docs = (idx == 50256).cumsum(0)
        # first / last document id inside each block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: b and h are unused
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the mask above: keep a (q block, kv block) pair
            # if it is causal, the two blocks' document ranges overlap, and it is within the window
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the indices of kept kv blocks first, in increasing order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding slice per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() pairs decoder layer i with the most recent unused encoder output
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a data shard and return the token count it declares."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read ``ntok`` uint16 tokens from ``file`` into a pinned-memory tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header of 256 int32 values
        nbytes = f.readinto(tokens.numpy()) # fills the tensor's memory directly
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Iterates over token shards, giving each process its own slice of every batch."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        """Find the shards matching ``filename_pattern`` (relative to the cwd) and load the first.

        ``T`` is the number of tokens each process receives per batch.
        """
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full global batch plus one target token
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # wraps around to the first shard after the last one
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return ``(inputs, targets)`` on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Settings for one training run."""
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # open(..., "w") does not create missing directories, so make sure `logs/` exists
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """Append ``s`` to the log file on the master process; other processes do nothing.

    Args:
        s: Text to log; a newline is appended when writing to the file.
        logonly: When true, write to the log file only and skip the console.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop below runs a single forward/backward per step, so accumulation is not supported
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches each following one right after backward
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 08:17:05 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 44C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 101W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 38C P0 77W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 38C P0 112W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 96W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 45C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI 
PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22990ms step_avg:nanms step:2/1480 train_time:23077ms step_avg:nanms step:3/1480 train_time:23216ms step_avg:nanms step:4/1480 train_time:23359ms step_avg:nanms step:5/1480 train_time:23501ms step_avg:nanms step:6/1480 train_time:23642ms step_avg:nanms step:7/1480 train_time:23783ms step_avg:nanms step:8/1480 train_time:23926ms step_avg:nanms step:9/1480 train_time:24070ms step_avg:nanms step:10/1480 train_time:24214ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:286ms step_avg:nanms step:13/1480 train_time:428ms step_avg:142.54ms step:14/1480 train_time:569ms step_avg:142.19ms step:15/1480 train_time:710ms step_avg:141.92ms step:16/1480 train_time:853ms step_avg:142.09ms step:17/1480 train_time:994ms step_avg:141.95ms step:18/1480 train_time:1136ms step_avg:141.97ms step:19/1480 train_time:1279ms step_avg:142.07ms step:20/1480 train_time:1423ms step_avg:142.29ms step:21/1480 train_time:1565ms step_avg:142.28ms step:22/1480 train_time:1707ms step_avg:142.22ms step:23/1480 train_time:1849ms step_avg:142.24ms step:24/1480 train_time:1991ms step_avg:142.21ms step:25/1480 train_time:2133ms step_avg:142.22ms step:26/1480 train_time:2277ms step_avg:142.32ms step:27/1480 train_time:2421ms step_avg:142.40ms step:28/1480 train_time:2565ms step_avg:142.48ms step:29/1480 train_time:2708ms 
step_avg:142.51ms step:30/1480 train_time:2849ms step_avg:142.46ms step:31/1480 train_time:2991ms step_avg:142.45ms step:32/1480 train_time:3133ms step_avg:142.41ms step:33/1480 train_time:3276ms step_avg:142.45ms step:34/1480 train_time:3419ms step_avg:142.46ms step:35/1480 train_time:3565ms step_avg:142.62ms step:36/1480 train_time:3707ms step_avg:142.59ms step:37/1480 train_time:3848ms step_avg:142.53ms step:38/1480 train_time:3990ms step_avg:142.51ms step:39/1480 train_time:4132ms step_avg:142.48ms step:40/1480 train_time:4274ms step_avg:142.47ms step:41/1480 train_time:4418ms step_avg:142.51ms step:42/1480 train_time:4561ms step_avg:142.52ms step:43/1480 train_time:4704ms step_avg:142.55ms step:44/1480 train_time:4847ms step_avg:142.57ms step:45/1480 train_time:4989ms step_avg:142.54ms step:46/1480 train_time:5131ms step_avg:142.53ms step:47/1480 train_time:5273ms step_avg:142.51ms step:48/1480 train_time:5415ms step_avg:142.49ms step:49/1480 train_time:5558ms step_avg:142.52ms step:50/1480 train_time:5701ms step_avg:142.52ms step:51/1480 train_time:5845ms step_avg:142.56ms step:52/1480 train_time:5987ms step_avg:142.55ms step:53/1480 train_time:6129ms step_avg:142.53ms step:54/1480 train_time:6271ms step_avg:142.53ms step:55/1480 train_time:6413ms step_avg:142.50ms step:56/1480 train_time:6555ms step_avg:142.49ms step:57/1480 train_time:6699ms step_avg:142.53ms step:58/1480 train_time:6843ms step_avg:142.57ms step:59/1480 train_time:6988ms step_avg:142.61ms step:60/1480 train_time:7130ms step_avg:142.60ms step:61/1480 train_time:7273ms step_avg:142.61ms step:62/1480 train_time:7414ms step_avg:142.58ms step:63/1480 train_time:7556ms step_avg:142.57ms step:64/1480 train_time:7698ms step_avg:142.56ms step:65/1480 train_time:7843ms step_avg:142.61ms step:66/1480 train_time:7987ms step_avg:142.63ms step:67/1480 train_time:8130ms step_avg:142.63ms step:68/1480 train_time:8273ms step_avg:142.64ms step:69/1480 train_time:8415ms step_avg:142.62ms step:70/1480 
train_time:8556ms step_avg:142.60ms step:71/1480 train_time:8699ms step_avg:142.60ms step:72/1480 train_time:8842ms step_avg:142.61ms step:73/1480 train_time:8985ms step_avg:142.62ms step:74/1480 train_time:9128ms step_avg:142.63ms step:75/1480 train_time:9274ms step_avg:142.68ms step:76/1480 train_time:9415ms step_avg:142.65ms step:77/1480 train_time:9558ms step_avg:142.66ms step:78/1480 train_time:9701ms step_avg:142.66ms step:79/1480 train_time:9845ms step_avg:142.68ms step:80/1480 train_time:9989ms step_avg:142.70ms step:81/1480 train_time:10130ms step_avg:142.68ms step:82/1480 train_time:10272ms step_avg:142.67ms step:83/1480 train_time:10414ms step_avg:142.66ms step:84/1480 train_time:10558ms step_avg:142.67ms step:85/1480 train_time:10701ms step_avg:142.68ms step:86/1480 train_time:10845ms step_avg:142.70ms step:87/1480 train_time:10988ms step_avg:142.71ms step:88/1480 train_time:11132ms step_avg:142.71ms step:89/1480 train_time:11274ms step_avg:142.70ms step:90/1480 train_time:11415ms step_avg:142.68ms step:91/1480 train_time:11556ms step_avg:142.66ms step:92/1480 train_time:11697ms step_avg:142.65ms step:93/1480 train_time:11842ms step_avg:142.68ms step:94/1480 train_time:11985ms step_avg:142.68ms step:95/1480 train_time:12128ms step_avg:142.68ms step:96/1480 train_time:12272ms step_avg:142.69ms step:97/1480 train_time:12413ms step_avg:142.68ms step:98/1480 train_time:12555ms step_avg:142.67ms step:99/1480 train_time:12698ms step_avg:142.67ms step:100/1480 train_time:12843ms step_avg:142.70ms step:101/1480 train_time:12988ms step_avg:142.72ms step:102/1480 train_time:13130ms step_avg:142.72ms step:103/1480 train_time:13273ms step_avg:142.72ms step:104/1480 train_time:13415ms step_avg:142.71ms step:105/1480 train_time:13557ms step_avg:142.71ms step:106/1480 train_time:13699ms step_avg:142.70ms step:107/1480 train_time:13843ms step_avg:142.72ms step:108/1480 train_time:13986ms step_avg:142.72ms step:109/1480 train_time:14130ms step_avg:142.73ms step:110/1480 
train_time:14273ms step_avg:142.73ms step:111/1480 train_time:14417ms step_avg:142.75ms step:112/1480 train_time:14565ms step_avg:142.80ms step:113/1480 train_time:14712ms step_avg:142.84ms step:114/1480 train_time:14860ms step_avg:142.88ms step:115/1480 train_time:15006ms step_avg:142.91ms step:116/1480 train_time:15152ms step_avg:142.94ms step:117/1480 train_time:15298ms step_avg:142.97ms step:118/1480 train_time:15445ms step_avg:143.01ms step:119/1480 train_time:15592ms step_avg:143.05ms step:120/1480 train_time:15738ms step_avg:143.08ms step:121/1480 train_time:15886ms step_avg:143.12ms step:122/1480 train_time:16032ms step_avg:143.14ms step:123/1480 train_time:16177ms step_avg:143.16ms step:124/1480 train_time:16323ms step_avg:143.19ms step:125/1480 train_time:16470ms step_avg:143.22ms step:125/1480 val_loss:4.4201 train_time:16527ms step_avg:143.71ms step:126/1480 train_time:16622ms step_avg:143.29ms step:127/1480 train_time:16772ms step_avg:143.35ms step:128/1480 train_time:16918ms step_avg:143.38ms step:129/1480 train_time:17063ms step_avg:143.39ms step:130/1480 train_time:17210ms step_avg:143.42ms step:131/1480 train_time:17356ms step_avg:143.44ms step:132/1480 train_time:17502ms step_avg:143.46ms step:133/1480 train_time:17650ms step_avg:143.50ms step:134/1480 train_time:17799ms step_avg:143.54ms step:135/1480 train_time:17945ms step_avg:143.56ms step:136/1480 train_time:18093ms step_avg:143.60ms step:137/1480 train_time:18240ms step_avg:143.62ms step:138/1480 train_time:18385ms step_avg:143.64ms step:139/1480 train_time:18532ms step_avg:143.66ms step:140/1480 train_time:18680ms step_avg:143.69ms step:141/1480 train_time:18826ms step_avg:143.71ms step:142/1480 train_time:18975ms step_avg:143.75ms step:143/1480 train_time:19121ms step_avg:143.77ms step:144/1480 train_time:19270ms step_avg:143.80ms step:145/1480 train_time:19417ms step_avg:143.83ms step:146/1480 train_time:19562ms step_avg:143.84ms step:147/1480 train_time:19708ms step_avg:143.86ms 
step:148/1480 train_time:19855ms step_avg:143.88ms step:149/1480 train_time:20002ms step_avg:143.90ms step:150/1480 train_time:20149ms step_avg:143.92ms step:151/1480 train_time:20299ms step_avg:143.97ms step:152/1480 train_time:20443ms step_avg:143.96ms step:153/1480 train_time:20591ms step_avg:143.99ms step:154/1480 train_time:20737ms step_avg:144.01ms step:155/1480 train_time:20883ms step_avg:144.02ms step:156/1480 train_time:21030ms step_avg:144.04ms step:157/1480 train_time:21178ms step_avg:144.07ms step:158/1480 train_time:21325ms step_avg:144.09ms step:159/1480 train_time:21472ms step_avg:144.11ms step:160/1480 train_time:21619ms step_avg:144.13ms step:161/1480 train_time:21765ms step_avg:144.14ms step:162/1480 train_time:21914ms step_avg:144.17ms step:163/1480 train_time:22060ms step_avg:144.18ms step:164/1480 train_time:22208ms step_avg:144.21ms step:165/1480 train_time:22356ms step_avg:144.23ms step:166/1480 train_time:22501ms step_avg:144.24ms step:167/1480 train_time:22649ms step_avg:144.26ms step:168/1480 train_time:22796ms step_avg:144.28ms step:169/1480 train_time:22942ms step_avg:144.29ms step:170/1480 train_time:23091ms step_avg:144.32ms step:171/1480 train_time:23238ms step_avg:144.34ms step:172/1480 train_time:23385ms step_avg:144.35ms step:173/1480 train_time:23532ms step_avg:144.37ms step:174/1480 train_time:23679ms step_avg:144.39ms step:175/1480 train_time:23824ms step_avg:144.39ms step:176/1480 train_time:23972ms step_avg:144.41ms step:177/1480 train_time:24119ms step_avg:144.42ms step:178/1480 train_time:24266ms step_avg:144.44ms step:179/1480 train_time:24415ms step_avg:144.46ms step:180/1480 train_time:24561ms step_avg:144.48ms step:181/1480 train_time:24709ms step_avg:144.49ms step:182/1480 train_time:24856ms step_avg:144.51ms step:183/1480 train_time:25002ms step_avg:144.52ms step:184/1480 train_time:25149ms step_avg:144.54ms step:185/1480 train_time:25297ms step_avg:144.55ms step:186/1480 train_time:25442ms step_avg:144.56ms 
step:187/1480 train_time:25589ms step_avg:144.57ms step:188/1480 train_time:25737ms step_avg:144.59ms step:189/1480 train_time:25883ms step_avg:144.60ms step:190/1480 train_time:26031ms step_avg:144.62ms step:191/1480 train_time:26178ms step_avg:144.63ms step:192/1480 train_time:26324ms step_avg:144.64ms step:193/1480 train_time:26471ms step_avg:144.65ms step:194/1480 train_time:26618ms step_avg:144.66ms step:195/1480 train_time:26764ms step_avg:144.67ms step:196/1480 train_time:26913ms step_avg:144.69ms step:197/1480 train_time:27059ms step_avg:144.70ms step:198/1480 train_time:27207ms step_avg:144.72ms step:199/1480 train_time:27353ms step_avg:144.73ms step:200/1480 train_time:27501ms step_avg:144.74ms step:201/1480 train_time:27647ms step_avg:144.75ms step:202/1480 train_time:27795ms step_avg:144.76ms step:203/1480 train_time:27940ms step_avg:144.77ms step:204/1480 train_time:28086ms step_avg:144.77ms step:205/1480 train_time:28233ms step_avg:144.79ms step:206/1480 train_time:28380ms step_avg:144.79ms step:207/1480 train_time:28526ms step_avg:144.80ms step:208/1480 train_time:28674ms step_avg:144.82ms step:209/1480 train_time:28820ms step_avg:144.83ms step:210/1480 train_time:28967ms step_avg:144.84ms step:211/1480 train_time:29115ms step_avg:144.85ms step:212/1480 train_time:29261ms step_avg:144.86ms step:213/1480 train_time:29408ms step_avg:144.87ms step:214/1480 train_time:29554ms step_avg:144.87ms step:215/1480 train_time:29700ms step_avg:144.88ms step:216/1480 train_time:29845ms step_avg:144.88ms step:217/1480 train_time:29993ms step_avg:144.89ms step:218/1480 train_time:30139ms step_avg:144.90ms step:219/1480 train_time:30285ms step_avg:144.90ms step:220/1480 train_time:30433ms step_avg:144.92ms step:221/1480 train_time:30581ms step_avg:144.94ms step:222/1480 train_time:30731ms step_avg:144.96ms step:223/1480 train_time:30882ms step_avg:144.99ms step:224/1480 train_time:31033ms step_avg:145.01ms step:225/1480 train_time:31182ms step_avg:145.03ms 
step:226/1480 train_time:31333ms step_avg:145.06ms step:227/1480 train_time:31483ms step_avg:145.08ms step:228/1480 train_time:31634ms step_avg:145.11ms step:229/1480 train_time:31784ms step_avg:145.13ms step:230/1480 train_time:31934ms step_avg:145.16ms step:231/1480 train_time:32085ms step_avg:145.18ms step:232/1480 train_time:32235ms step_avg:145.20ms step:233/1480 train_time:32385ms step_avg:145.22ms step:234/1480 train_time:32536ms step_avg:145.25ms step:235/1480 train_time:32686ms step_avg:145.27ms step:236/1480 train_time:32837ms step_avg:145.29ms step:237/1480 train_time:32987ms step_avg:145.32ms step:238/1480 train_time:33137ms step_avg:145.34ms step:239/1480 train_time:33287ms step_avg:145.36ms step:240/1480 train_time:33439ms step_avg:145.39ms step:241/1480 train_time:33589ms step_avg:145.40ms step:242/1480 train_time:33740ms step_avg:145.43ms step:243/1480 train_time:33890ms step_avg:145.45ms step:244/1480 train_time:34040ms step_avg:145.47ms step:245/1480 train_time:34191ms step_avg:145.49ms step:246/1480 train_time:34341ms step_avg:145.51ms step:247/1480 train_time:34492ms step_avg:145.53ms step:248/1480 train_time:34643ms step_avg:145.56ms step:249/1480 train_time:34794ms step_avg:145.58ms step:250/1480 train_time:34944ms step_avg:145.60ms step:250/1480 val_loss:3.9934 train_time:35002ms step_avg:145.84ms step:251/1480 train_time:35097ms step_avg:145.63ms step:252/1480 train_time:35249ms step_avg:145.66ms step:253/1480 train_time:35398ms step_avg:145.67ms step:254/1480 train_time:35548ms step_avg:145.69ms step:255/1480 train_time:35696ms step_avg:145.70ms step:256/1480 train_time:35846ms step_avg:145.71ms step:257/1480 train_time:35995ms step_avg:145.73ms step:258/1480 train_time:36147ms step_avg:145.76ms step:259/1480 train_time:36300ms step_avg:145.78ms step:260/1480 train_time:36451ms step_avg:145.80ms step:261/1480 train_time:36600ms step_avg:145.82ms step:262/1480 train_time:36750ms step_avg:145.83ms step:263/1480 train_time:36898ms 
step_avg:145.84ms step:264/1480 train_time:37049ms step_avg:145.86ms step:265/1480 train_time:37200ms step_avg:145.88ms step:266/1480 train_time:37351ms step_avg:145.90ms step:267/1480 train_time:37501ms step_avg:145.92ms step:268/1480 train_time:37651ms step_avg:145.93ms step:269/1480 train_time:37800ms step_avg:145.95ms step:270/1480 train_time:37949ms step_avg:145.96ms step:271/1480 train_time:38098ms step_avg:145.97ms step:272/1480 train_time:38248ms step_avg:145.99ms step:273/1480 train_time:38399ms step_avg:146.00ms step:274/1480 train_time:38551ms step_avg:146.03ms step:275/1480 train_time:38701ms step_avg:146.04ms step:276/1480 train_time:38851ms step_avg:146.06ms step:277/1480 train_time:39001ms step_avg:146.07ms step:278/1480 train_time:39151ms step_avg:146.09ms step:279/1480 train_time:39301ms step_avg:146.10ms step:280/1480 train_time:39453ms step_avg:146.12ms step:281/1480 train_time:39603ms step_avg:146.14ms step:282/1480 train_time:39754ms step_avg:146.15ms step:283/1480 train_time:39905ms step_avg:146.17ms step:284/1480 train_time:40056ms step_avg:146.19ms step:285/1480 train_time:40207ms step_avg:146.21ms step:286/1480 train_time:40358ms step_avg:146.22ms step:287/1480 train_time:40510ms step_avg:146.24ms step:288/1480 train_time:40659ms step_avg:146.25ms step:289/1480 train_time:40810ms step_avg:146.27ms step:290/1480 train_time:40959ms step_avg:146.28ms step:291/1480 train_time:41110ms step_avg:146.30ms step:292/1480 train_time:41259ms step_avg:146.31ms step:293/1480 train_time:41410ms step_avg:146.33ms step:294/1480 train_time:41559ms step_avg:146.34ms step:295/1480 train_time:41710ms step_avg:146.35ms step:296/1480 train_time:41858ms step_avg:146.36ms step:297/1480 train_time:42010ms step_avg:146.38ms step:298/1480 train_time:42160ms step_avg:146.39ms step:299/1480 train_time:42310ms step_avg:146.40ms step:300/1480 train_time:42460ms step_avg:146.41ms step:301/1480 train_time:42611ms step_avg:146.43ms step:302/1480 train_time:42759ms 
step_avg:146.44ms step:303/1480 train_time:42911ms step_avg:146.45ms step:304/1480 train_time:43059ms step_avg:146.46ms step:305/1480 train_time:43210ms step_avg:146.47ms step:306/1480 train_time:43360ms step_avg:146.49ms step:307/1480 train_time:43511ms step_avg:146.50ms step:308/1480 train_time:43661ms step_avg:146.51ms step:309/1480 train_time:43812ms step_avg:146.53ms step:310/1480 train_time:43961ms step_avg:146.54ms step:311/1480 train_time:44113ms step_avg:146.55ms step:312/1480 train_time:44262ms step_avg:146.56ms step:313/1480 train_time:44413ms step_avg:146.58ms step:314/1480 train_time:44563ms step_avg:146.59ms step:315/1480 train_time:44714ms step_avg:146.60ms step:316/1480 train_time:44865ms step_avg:146.62ms step:317/1480 train_time:45016ms step_avg:146.63ms step:318/1480 train_time:45166ms step_avg:146.64ms step:319/1480 train_time:45317ms step_avg:146.66ms step:320/1480 train_time:45468ms step_avg:146.67ms step:321/1480 train_time:45618ms step_avg:146.68ms step:322/1480 train_time:45768ms step_avg:146.69ms step:323/1480 train_time:45917ms step_avg:146.70ms step:324/1480 train_time:46069ms step_avg:146.72ms step:325/1480 train_time:46219ms step_avg:146.73ms step:326/1480 train_time:46369ms step_avg:146.74ms step:327/1480 train_time:46519ms step_avg:146.75ms step:328/1480 train_time:46669ms step_avg:146.76ms step:329/1480 train_time:46819ms step_avg:146.77ms step:330/1480 train_time:46972ms step_avg:146.79ms step:331/1480 train_time:47127ms step_avg:146.81ms step:332/1480 train_time:47282ms step_avg:146.84ms step:333/1480 train_time:47436ms step_avg:146.86ms step:334/1480 train_time:47590ms step_avg:146.88ms step:335/1480 train_time:47743ms step_avg:146.90ms step:336/1480 train_time:47898ms step_avg:146.93ms step:337/1480 train_time:48052ms step_avg:146.95ms step:338/1480 train_time:48208ms step_avg:146.97ms step:339/1480 train_time:48362ms step_avg:147.00ms step:340/1480 train_time:48517ms step_avg:147.02ms step:341/1480 train_time:48670ms 
step_avg:147.04ms step:342/1480 train_time:48823ms step_avg:147.06ms step:343/1480 train_time:48977ms step_avg:147.08ms step:344/1480 train_time:49131ms step_avg:147.10ms step:345/1480 train_time:49285ms step_avg:147.12ms step:346/1480 train_time:49438ms step_avg:147.14ms step:347/1480 train_time:49593ms step_avg:147.16ms step:348/1480 train_time:49746ms step_avg:147.18ms step:349/1480 train_time:49901ms step_avg:147.20ms step:350/1480 train_time:50054ms step_avg:147.22ms step:351/1480 train_time:50209ms step_avg:147.24ms step:352/1480 train_time:50362ms step_avg:147.26ms step:353/1480 train_time:50515ms step_avg:147.27ms step:354/1480 train_time:50668ms step_avg:147.29ms step:355/1480 train_time:50822ms step_avg:147.31ms step:356/1480 train_time:50976ms step_avg:147.33ms step:357/1480 train_time:51130ms step_avg:147.35ms step:358/1480 train_time:51283ms step_avg:147.37ms step:359/1480 train_time:51437ms step_avg:147.38ms step:360/1480 train_time:51592ms step_avg:147.41ms step:361/1480 train_time:51746ms step_avg:147.43ms step:362/1480 train_time:51900ms step_avg:147.44ms step:363/1480 train_time:52054ms step_avg:147.46ms step:364/1480 train_time:52209ms step_avg:147.48ms step:365/1480 train_time:52361ms step_avg:147.50ms step:366/1480 train_time:52515ms step_avg:147.52ms step:367/1480 train_time:52669ms step_avg:147.53ms step:368/1480 train_time:52822ms step_avg:147.55ms step:369/1480 train_time:52976ms step_avg:147.56ms step:370/1480 train_time:53129ms step_avg:147.58ms step:371/1480 train_time:53284ms step_avg:147.60ms step:372/1480 train_time:53437ms step_avg:147.62ms step:373/1480 train_time:53591ms step_avg:147.63ms step:374/1480 train_time:53744ms step_avg:147.65ms step:375/1480 train_time:53896ms step_avg:147.66ms step:375/1480 val_loss:3.8055 train_time:53956ms step_avg:147.82ms step:376/1480 train_time:54054ms step_avg:147.69ms step:377/1480 train_time:54209ms step_avg:147.71ms step:378/1480 train_time:54361ms step_avg:147.72ms step:379/1480 
train_time:54514ms step_avg:147.73ms step:380/1480 train_time:54666ms step_avg:147.75ms step:381/1480 train_time:54819ms step_avg:147.76ms step:382/1480 train_time:54973ms step_avg:147.78ms step:383/1480 train_time:55128ms step_avg:147.80ms step:384/1480 train_time:55282ms step_avg:147.81ms step:385/1480 train_time:55436ms step_avg:147.83ms step:386/1480 train_time:55590ms step_avg:147.84ms step:387/1480 train_time:55742ms step_avg:147.86ms step:388/1480 train_time:55895ms step_avg:147.87ms step:389/1480 train_time:56049ms step_avg:147.89ms step:390/1480 train_time:56203ms step_avg:147.90ms step:391/1480 train_time:56356ms step_avg:147.92ms step:392/1480 train_time:56510ms step_avg:147.93ms step:393/1480 train_time:56663ms step_avg:147.94ms step:394/1480 train_time:56816ms step_avg:147.96ms step:395/1480 train_time:56970ms step_avg:147.97ms step:396/1480 train_time:57125ms step_avg:147.99ms step:397/1480 train_time:57280ms step_avg:148.01ms step:398/1480 train_time:57433ms step_avg:148.02ms step:399/1480 train_time:57588ms step_avg:148.04ms step:400/1480 train_time:57742ms step_avg:148.06ms step:401/1480 train_time:57894ms step_avg:148.07ms step:402/1480 train_time:58047ms step_avg:148.08ms step:403/1480 train_time:58202ms step_avg:148.10ms step:404/1480 train_time:58355ms step_avg:148.11ms step:405/1480 train_time:58509ms step_avg:148.12ms step:406/1480 train_time:58663ms step_avg:148.14ms step:407/1480 train_time:58818ms step_avg:148.16ms step:408/1480 train_time:58971ms step_avg:148.17ms step:409/1480 train_time:59125ms step_avg:148.18ms step:410/1480 train_time:59277ms step_avg:148.19ms step:411/1480 train_time:59431ms step_avg:148.21ms step:412/1480 train_time:59586ms step_avg:148.22ms step:413/1480 train_time:59739ms step_avg:148.24ms step:414/1480 train_time:59894ms step_avg:148.25ms step:415/1480 train_time:60049ms step_avg:148.27ms step:416/1480 train_time:60202ms step_avg:148.28ms step:417/1480 train_time:60355ms step_avg:148.29ms step:418/1480 
train_time:60508ms step_avg:148.30ms step:419/1480 train_time:60660ms step_avg:148.31ms step:420/1480 train_time:60813ms step_avg:148.32ms step:421/1480 train_time:60966ms step_avg:148.34ms step:422/1480 train_time:61121ms step_avg:148.35ms step:423/1480 train_time:61274ms step_avg:148.36ms step:424/1480 train_time:61428ms step_avg:148.38ms step:425/1480 train_time:61582ms step_avg:148.39ms step:426/1480 train_time:61735ms step_avg:148.40ms step:427/1480 train_time:61888ms step_avg:148.41ms step:428/1480 train_time:62041ms step_avg:148.42ms step:429/1480 train_time:62195ms step_avg:148.44ms step:430/1480 train_time:62349ms step_avg:148.45ms step:431/1480 train_time:62503ms step_avg:148.46ms step:432/1480 train_time:62657ms step_avg:148.48ms step:433/1480 train_time:62811ms step_avg:148.49ms step:434/1480 train_time:62964ms step_avg:148.50ms step:435/1480 train_time:63117ms step_avg:148.51ms step:436/1480 train_time:63270ms step_avg:148.52ms step:437/1480 train_time:63425ms step_avg:148.54ms step:438/1480 train_time:63580ms step_avg:148.55ms step:439/1480 train_time:63734ms step_avg:148.57ms step:440/1480 train_time:63889ms step_avg:148.58ms step:441/1480 train_time:64046ms step_avg:148.60ms step:442/1480 train_time:64203ms step_avg:148.62ms step:443/1480 train_time:64360ms step_avg:148.64ms step:444/1480 train_time:64517ms step_avg:148.66ms step:445/1480 train_time:64673ms step_avg:148.67ms step:446/1480 train_time:64829ms step_avg:148.69ms step:447/1480 train_time:64985ms step_avg:148.71ms step:448/1480 train_time:65142ms step_avg:148.73ms step:449/1480 train_time:65301ms step_avg:148.75ms step:450/1480 train_time:65457ms step_avg:148.77ms step:451/1480 train_time:65615ms step_avg:148.79ms step:452/1480 train_time:65771ms step_avg:148.80ms step:453/1480 train_time:65928ms step_avg:148.82ms step:454/1480 train_time:66084ms step_avg:148.84ms step:455/1480 train_time:66240ms step_avg:148.85ms step:456/1480 train_time:66396ms step_avg:148.87ms step:457/1480 
train_time:66552ms step_avg:148.89ms step:458/1480 train_time:66711ms step_avg:148.91ms step:459/1480 train_time:66866ms step_avg:148.92ms step:460/1480 train_time:67024ms step_avg:148.94ms step:461/1480 train_time:67184ms step_avg:148.97ms step:462/1480 train_time:67341ms step_avg:148.98ms step:463/1480 train_time:67496ms step_avg:149.00ms step:464/1480 train_time:67651ms step_avg:149.01ms step:465/1480 train_time:67807ms step_avg:149.03ms step:466/1480 train_time:67964ms step_avg:149.04ms step:467/1480 train_time:68124ms step_avg:149.07ms step:468/1480 train_time:68283ms step_avg:149.09ms step:469/1480 train_time:68439ms step_avg:149.11ms step:470/1480 train_time:68599ms step_avg:149.13ms step:471/1480 train_time:68753ms step_avg:149.14ms step:472/1480 train_time:68910ms step_avg:149.16ms step:473/1480 train_time:69066ms step_avg:149.17ms step:474/1480 train_time:69223ms step_avg:149.19ms step:475/1480 train_time:69380ms step_avg:149.20ms step:476/1480 train_time:69535ms step_avg:149.22ms step:477/1480 train_time:69691ms step_avg:149.23ms step:478/1480 train_time:69847ms step_avg:149.25ms step:479/1480 train_time:70006ms step_avg:149.27ms step:480/1480 train_time:70162ms step_avg:149.28ms step:481/1480 train_time:70320ms step_avg:149.30ms step:482/1480 train_time:70476ms step_avg:149.31ms step:483/1480 train_time:70632ms step_avg:149.33ms step:484/1480 train_time:70788ms step_avg:149.34ms step:485/1480 train_time:70945ms step_avg:149.36ms step:486/1480 train_time:71102ms step_avg:149.37ms step:487/1480 train_time:71260ms step_avg:149.39ms step:488/1480 train_time:71417ms step_avg:149.41ms step:489/1480 train_time:71573ms step_avg:149.42ms step:490/1480 train_time:71730ms step_avg:149.44ms step:491/1480 train_time:71888ms step_avg:149.46ms step:492/1480 train_time:72045ms step_avg:149.47ms step:493/1480 train_time:72201ms step_avg:149.49ms step:494/1480 train_time:72359ms step_avg:149.50ms step:495/1480 train_time:72515ms step_avg:149.52ms step:496/1480 
train_time:72673ms step_avg:149.53ms step:497/1480 train_time:72830ms step_avg:149.55ms step:498/1480 train_time:72987ms step_avg:149.56ms step:499/1480 train_time:73145ms step_avg:149.58ms step:500/1480 train_time:73303ms step_avg:149.60ms step:500/1480 val_loss:3.6833 train_time:73365ms step_avg:149.72ms step:501/1480 train_time:73464ms step_avg:149.62ms step:502/1480 train_time:73623ms step_avg:149.64ms step:503/1480 train_time:73779ms step_avg:149.65ms step:504/1480 train_time:73934ms step_avg:149.66ms step:505/1480 train_time:74091ms step_avg:149.68ms step:506/1480 train_time:74246ms step_avg:149.69ms step:507/1480 train_time:74403ms step_avg:149.70ms step:508/1480 train_time:74560ms step_avg:149.72ms step:509/1480 train_time:74716ms step_avg:149.73ms step:510/1480 train_time:74872ms step_avg:149.74ms step:511/1480 train_time:75028ms step_avg:149.76ms step:512/1480 train_time:75185ms step_avg:149.77ms step:513/1480 train_time:75343ms step_avg:149.79ms step:514/1480 train_time:75502ms step_avg:149.81ms step:515/1480 train_time:75660ms step_avg:149.82ms step:516/1480 train_time:75818ms step_avg:149.84ms step:517/1480 train_time:75975ms step_avg:149.85ms step:518/1480 train_time:76132ms step_avg:149.87ms step:519/1480 train_time:76290ms step_avg:149.88ms step:520/1480 train_time:76447ms step_avg:149.90ms step:521/1480 train_time:76607ms step_avg:149.92ms step:522/1480 train_time:76767ms step_avg:149.93ms step:523/1480 train_time:76925ms step_avg:149.95ms step:524/1480 train_time:77082ms step_avg:149.97ms step:525/1480 train_time:77239ms step_avg:149.98ms step:526/1480 train_time:77399ms step_avg:150.00ms step:527/1480 train_time:77555ms step_avg:150.01ms step:528/1480 train_time:77712ms step_avg:150.02ms step:529/1480 train_time:77870ms step_avg:150.04ms step:530/1480 train_time:78026ms step_avg:150.05ms step:531/1480 train_time:78183ms step_avg:150.06ms step:532/1480 train_time:78340ms step_avg:150.08ms step:533/1480 train_time:78496ms step_avg:150.09ms 
step:534/1480 train_time:78651ms step_avg:150.10ms step:535/1480 train_time:78810ms step_avg:150.11ms step:536/1480 train_time:78968ms step_avg:150.13ms step:537/1480 train_time:79125ms step_avg:150.14ms step:538/1480 train_time:79283ms step_avg:150.16ms step:539/1480 train_time:79441ms step_avg:150.17ms step:540/1480 train_time:79599ms step_avg:150.19ms step:541/1480 train_time:79756ms step_avg:150.20ms step:542/1480 train_time:79913ms step_avg:150.21ms step:543/1480 train_time:80069ms step_avg:150.22ms step:544/1480 train_time:80226ms step_avg:150.24ms step:545/1480 train_time:80384ms step_avg:150.25ms step:546/1480 train_time:80540ms step_avg:150.26ms step:547/1480 train_time:80698ms step_avg:150.28ms step:548/1480 train_time:80856ms step_avg:150.29ms step:549/1480 train_time:81013ms step_avg:150.30ms step:550/1480 train_time:81170ms step_avg:150.31ms step:551/1480 train_time:81328ms step_avg:150.33ms step:552/1480 train_time:81487ms step_avg:150.35ms step:553/1480 train_time:81648ms step_avg:150.36ms step:554/1480 train_time:81809ms step_avg:150.38ms step:555/1480 train_time:81969ms step_avg:150.40ms step:556/1480 train_time:82127ms step_avg:150.42ms step:557/1480 train_time:82288ms step_avg:150.44ms step:558/1480 train_time:82448ms step_avg:150.45ms step:559/1480 train_time:82608ms step_avg:150.47ms step:560/1480 train_time:82769ms step_avg:150.49ms step:561/1480 train_time:82928ms step_avg:150.51ms step:562/1480 train_time:83088ms step_avg:150.52ms step:563/1480 train_time:83248ms step_avg:150.54ms step:564/1480 train_time:83408ms step_avg:150.56ms step:565/1480 train_time:83568ms step_avg:150.57ms step:566/1480 train_time:83728ms step_avg:150.59ms step:567/1480 train_time:83888ms step_avg:150.61ms step:568/1480 train_time:84047ms step_avg:150.62ms step:569/1480 train_time:84207ms step_avg:150.64ms step:570/1480 train_time:84365ms step_avg:150.65ms step:571/1480 train_time:84526ms step_avg:150.67ms step:572/1480 train_time:84686ms step_avg:150.69ms 
step:573/1480 train_time:84847ms step_avg:150.71ms step:574/1480 train_time:85009ms step_avg:150.73ms step:575/1480 train_time:85170ms step_avg:150.74ms step:576/1480 train_time:85329ms step_avg:150.76ms step:577/1480 train_time:85489ms step_avg:150.77ms step:578/1480 train_time:85648ms step_avg:150.79ms step:579/1480 train_time:85809ms step_avg:150.81ms step:580/1480 train_time:85969ms step_avg:150.82ms step:581/1480 train_time:86129ms step_avg:150.84ms step:582/1480 train_time:86289ms step_avg:150.85ms step:583/1480 train_time:86447ms step_avg:150.87ms step:584/1480 train_time:86605ms step_avg:150.88ms step:585/1480 train_time:86764ms step_avg:150.89ms step:586/1480 train_time:86923ms step_avg:150.91ms step:587/1480 train_time:87083ms step_avg:150.92ms step:588/1480 train_time:87242ms step_avg:150.94ms step:589/1480 train_time:87403ms step_avg:150.96ms step:590/1480 train_time:87563ms step_avg:150.97ms step:591/1480 train_time:87721ms step_avg:150.98ms step:592/1480 train_time:87882ms step_avg:151.00ms step:593/1480 train_time:88042ms step_avg:151.02ms step:594/1480 train_time:88204ms step_avg:151.03ms step:595/1480 train_time:88366ms step_avg:151.05ms step:596/1480 train_time:88529ms step_avg:151.07ms step:597/1480 train_time:88689ms step_avg:151.09ms step:598/1480 train_time:88846ms step_avg:151.10ms step:599/1480 train_time:89005ms step_avg:151.11ms step:600/1480 train_time:89165ms step_avg:151.13ms step:601/1480 train_time:89324ms step_avg:151.14ms step:602/1480 train_time:89484ms step_avg:151.15ms step:603/1480 train_time:89645ms step_avg:151.17ms step:604/1480 train_time:89804ms step_avg:151.19ms step:605/1480 train_time:89963ms step_avg:151.20ms step:606/1480 train_time:90126ms step_avg:151.22ms step:607/1480 train_time:90290ms step_avg:151.24ms step:608/1480 train_time:90448ms step_avg:151.25ms step:609/1480 train_time:90608ms step_avg:151.27ms step:610/1480 train_time:90768ms step_avg:151.28ms step:611/1480 train_time:90928ms step_avg:151.29ms 
step:612/1480 train_time:91087ms step_avg:151.31ms step:613/1480 train_time:91248ms step_avg:151.32ms step:614/1480 train_time:91409ms step_avg:151.34ms step:615/1480 train_time:91569ms step_avg:151.35ms step:616/1480 train_time:91728ms step_avg:151.37ms step:617/1480 train_time:91888ms step_avg:151.38ms step:618/1480 train_time:92048ms step_avg:151.39ms step:619/1480 train_time:92208ms step_avg:151.41ms step:620/1480 train_time:92368ms step_avg:151.42ms step:621/1480 train_time:92528ms step_avg:151.44ms step:622/1480 train_time:92688ms step_avg:151.45ms step:623/1480 train_time:92848ms step_avg:151.47ms step:624/1480 train_time:93008ms step_avg:151.48ms step:625/1480 train_time:93168ms step_avg:151.49ms step:625/1480 val_loss:3.6030 train_time:93231ms step_avg:151.60ms step:626/1480 train_time:93332ms step_avg:151.51ms step:627/1480 train_time:93493ms step_avg:151.53ms step:628/1480 train_time:93652ms step_avg:151.54ms step:629/1480 train_time:93812ms step_avg:151.55ms step:630/1480 train_time:93971ms step_avg:151.57ms step:631/1480 train_time:94130ms step_avg:151.58ms step:632/1480 train_time:94289ms step_avg:151.59ms step:633/1480 train_time:94450ms step_avg:151.60ms step:634/1480 train_time:94611ms step_avg:151.62ms step:635/1480 train_time:94772ms step_avg:151.63ms step:636/1480 train_time:94932ms step_avg:151.65ms step:637/1480 train_time:95092ms step_avg:151.66ms step:638/1480 train_time:95253ms step_avg:151.68ms step:639/1480 train_time:95412ms step_avg:151.69ms step:640/1480 train_time:95573ms step_avg:151.70ms step:641/1480 train_time:95734ms step_avg:151.72ms step:642/1480 train_time:95893ms step_avg:151.73ms step:643/1480 train_time:96054ms step_avg:151.74ms step:644/1480 train_time:96213ms step_avg:151.76ms step:645/1480 train_time:96371ms step_avg:151.77ms step:646/1480 train_time:96530ms step_avg:151.78ms step:647/1480 train_time:96689ms step_avg:151.79ms step:648/1480 train_time:96851ms step_avg:151.80ms step:649/1480 train_time:97011ms 
step_avg:151.82ms step:650/1480 train_time:97172ms step_avg:151.83ms step:651/1480 train_time:97332ms step_avg:151.84ms step:652/1480 train_time:97492ms step_avg:151.86ms step:653/1480 train_time:97652ms step_avg:151.87ms step:654/1480 train_time:97813ms step_avg:151.88ms step:655/1480 train_time:97973ms step_avg:151.90ms step:656/1480 train_time:98133ms step_avg:151.91ms step:657/1480 train_time:98293ms step_avg:151.92ms step:658/1480 train_time:98453ms step_avg:151.93ms step:659/1480 train_time:98615ms step_avg:151.95ms step:660/1480 train_time:98777ms step_avg:151.96ms step:661/1480 train_time:98939ms step_avg:151.98ms step:662/1480 train_time:99099ms step_avg:151.99ms step:663/1480 train_time:99258ms step_avg:152.00ms step:664/1480 train_time:99419ms step_avg:152.02ms step:665/1480 train_time:99581ms step_avg:152.03ms step:666/1480 train_time:99741ms step_avg:152.04ms step:667/1480 train_time:99901ms step_avg:152.06ms step:668/1480 train_time:100063ms step_avg:152.07ms step:669/1480 train_time:100223ms step_avg:152.08ms step:670/1480 train_time:100382ms step_avg:152.09ms step:671/1480 train_time:100543ms step_avg:152.11ms step:672/1480 train_time:100705ms step_avg:152.12ms step:673/1480 train_time:100869ms step_avg:152.14ms step:674/1480 train_time:101034ms step_avg:152.16ms step:675/1480 train_time:101196ms step_avg:152.18ms step:676/1480 train_time:101358ms step_avg:152.19ms step:677/1480 train_time:101518ms step_avg:152.20ms step:678/1480 train_time:101678ms step_avg:152.21ms step:679/1480 train_time:101840ms step_avg:152.23ms step:680/1480 train_time:102001ms step_avg:152.24ms step:681/1480 train_time:102161ms step_avg:152.25ms step:682/1480 train_time:102323ms step_avg:152.27ms step:683/1480 train_time:102483ms step_avg:152.28ms step:684/1480 train_time:102643ms step_avg:152.29ms step:685/1480 train_time:102805ms step_avg:152.30ms step:686/1480 train_time:102966ms step_avg:152.32ms step:687/1480 train_time:103129ms step_avg:152.33ms step:688/1480 
train_time:103293ms step_avg:152.35ms step:689/1480 train_time:103457ms step_avg:152.37ms step:690/1480 train_time:103619ms step_avg:152.38ms step:691/1480 train_time:103780ms step_avg:152.39ms step:692/1480 train_time:103940ms step_avg:152.40ms step:693/1480 train_time:104100ms step_avg:152.42ms step:694/1480 train_time:104262ms step_avg:152.43ms step:695/1480 train_time:104422ms step_avg:152.44ms step:696/1480 train_time:104582ms step_avg:152.45ms step:697/1480 train_time:104744ms step_avg:152.47ms step:698/1480 train_time:104906ms step_avg:152.48ms step:699/1480 train_time:105071ms step_avg:152.50ms step:700/1480 train_time:105234ms step_avg:152.51ms step:701/1480 train_time:105396ms step_avg:152.53ms step:702/1480 train_time:105557ms step_avg:152.54ms step:703/1480 train_time:105716ms step_avg:152.55ms step:704/1480 train_time:105877ms step_avg:152.56ms step:705/1480 train_time:106039ms step_avg:152.57ms step:706/1480 train_time:106203ms step_avg:152.59ms step:707/1480 train_time:106364ms step_avg:152.60ms step:708/1480 train_time:106525ms step_avg:152.62ms step:709/1480 train_time:106687ms step_avg:152.63ms step:710/1480 train_time:106848ms step_avg:152.64ms step:711/1480 train_time:107010ms step_avg:152.65ms step:712/1480 train_time:107176ms step_avg:152.67ms step:713/1480 train_time:107339ms step_avg:152.69ms step:714/1480 train_time:107500ms step_avg:152.70ms step:715/1480 train_time:107659ms step_avg:152.71ms step:716/1480 train_time:107817ms step_avg:152.72ms step:717/1480 train_time:107980ms step_avg:152.73ms step:718/1480 train_time:108138ms step_avg:152.74ms step:719/1480 train_time:108298ms step_avg:152.75ms step:720/1480 train_time:108461ms step_avg:152.76ms step:721/1480 train_time:108622ms step_avg:152.77ms step:722/1480 train_time:108782ms step_avg:152.78ms step:723/1480 train_time:108943ms step_avg:152.79ms step:724/1480 train_time:109105ms step_avg:152.81ms step:725/1480 train_time:109271ms step_avg:152.83ms step:726/1480 train_time:109435ms 
step_avg:152.84ms step:727/1480 train_time:109597ms step_avg:152.86ms step:728/1480 train_time:109757ms step_avg:152.87ms step:729/1480 train_time:109917ms step_avg:152.88ms step:730/1480 train_time:110080ms step_avg:152.89ms step:731/1480 train_time:110241ms step_avg:152.90ms step:732/1480 train_time:110401ms step_avg:152.91ms step:733/1480 train_time:110563ms step_avg:152.92ms step:734/1480 train_time:110725ms step_avg:152.93ms step:735/1480 train_time:110887ms step_avg:152.95ms step:736/1480 train_time:111049ms step_avg:152.96ms step:737/1480 train_time:111211ms step_avg:152.97ms step:738/1480 train_time:111374ms step_avg:152.99ms step:739/1480 train_time:111537ms step_avg:153.00ms step:740/1480 train_time:111701ms step_avg:153.02ms step:741/1480 train_time:111864ms step_avg:153.03ms step:742/1480 train_time:112025ms step_avg:153.04ms step:743/1480 train_time:112187ms step_avg:153.05ms step:744/1480 train_time:112352ms step_avg:153.07ms step:745/1480 train_time:112518ms step_avg:153.09ms step:746/1480 train_time:112678ms step_avg:153.10ms step:747/1480 train_time:112839ms step_avg:153.11ms step:748/1480 train_time:113002ms step_avg:153.12ms step:749/1480 train_time:113165ms step_avg:153.13ms step:750/1480 train_time:113324ms step_avg:153.14ms step:750/1480 val_loss:3.5484 train_time:113390ms step_avg:153.23ms step:751/1480 train_time:113492ms step_avg:153.16ms step:752/1480 train_time:113652ms step_avg:153.17ms step:753/1480 train_time:113813ms step_avg:153.18ms step:754/1480 train_time:113973ms step_avg:153.19ms step:755/1480 train_time:114133ms step_avg:153.20ms step:756/1480 train_time:114294ms step_avg:153.21ms step:757/1480 train_time:114457ms step_avg:153.22ms step:758/1480 train_time:114618ms step_avg:153.23ms step:759/1480 train_time:114779ms step_avg:153.24ms step:760/1480 train_time:114939ms step_avg:153.25ms step:761/1480 train_time:115103ms step_avg:153.27ms step:762/1480 train_time:115265ms step_avg:153.28ms step:763/1480 train_time:115429ms 
step_avg:153.29ms step:764/1480 train_time:115592ms step_avg:153.30ms step:765/1480 train_time:115753ms step_avg:153.32ms step:766/1480 train_time:115917ms step_avg:153.33ms step:767/1480 train_time:116077ms step_avg:153.34ms step:768/1480 train_time:116239ms step_avg:153.35ms step:769/1480 train_time:116402ms step_avg:153.36ms step:770/1480 train_time:116565ms step_avg:153.38ms step:771/1480 train_time:116730ms step_avg:153.39ms step:772/1480 train_time:116892ms step_avg:153.40ms step:773/1480 train_time:117054ms step_avg:153.41ms step:774/1480 train_time:117216ms step_avg:153.42ms step:775/1480 train_time:117377ms step_avg:153.43ms step:776/1480 train_time:117540ms step_avg:153.45ms step:777/1480 train_time:117706ms step_avg:153.46ms step:778/1480 train_time:117870ms step_avg:153.48ms step:779/1480 train_time:118033ms step_avg:153.49ms step:780/1480 train_time:118196ms step_avg:153.50ms step:781/1480 train_time:118359ms step_avg:153.51ms step:782/1480 train_time:118523ms step_avg:153.53ms step:783/1480 train_time:118686ms step_avg:153.54ms step:784/1480 train_time:118851ms step_avg:153.55ms step:785/1480 train_time:119013ms step_avg:153.57ms step:786/1480 train_time:119177ms step_avg:153.58ms step:787/1480 train_time:119342ms step_avg:153.59ms step:788/1480 train_time:119508ms step_avg:153.61ms step:789/1480 train_time:119670ms step_avg:153.62ms step:790/1480 train_time:119834ms step_avg:153.63ms step:791/1480 train_time:119999ms step_avg:153.65ms step:792/1480 train_time:120164ms step_avg:153.66ms step:793/1480 train_time:120328ms step_avg:153.68ms step:794/1480 train_time:120492ms step_avg:153.69ms step:795/1480 train_time:120656ms step_avg:153.70ms step:796/1480 train_time:120821ms step_avg:153.72ms step:797/1480 train_time:120986ms step_avg:153.73ms step:798/1480 train_time:121151ms step_avg:153.74ms step:799/1480 train_time:121317ms step_avg:153.76ms step:800/1480 train_time:121480ms step_avg:153.77ms step:801/1480 train_time:121643ms step_avg:153.78ms 
step:802/1480 train_time:121812ms step_avg:153.80ms step:803/1480 train_time:121974ms step_avg:153.81ms step:804/1480 train_time:122135ms step_avg:153.82ms step:805/1480 train_time:122301ms step_avg:153.84ms step:806/1480 train_time:122463ms step_avg:153.85ms step:807/1480 train_time:122625ms step_avg:153.86ms step:808/1480 train_time:122789ms step_avg:153.87ms step:809/1480 train_time:122951ms step_avg:153.88ms step:810/1480 train_time:123113ms step_avg:153.89ms step:811/1480 train_time:123275ms step_avg:153.90ms step:812/1480 train_time:123437ms step_avg:153.91ms step:813/1480 train_time:123596ms step_avg:153.92ms step:814/1480 train_time:123760ms step_avg:153.93ms step:815/1480 train_time:123925ms step_avg:153.94ms step:816/1480 train_time:124091ms step_avg:153.96ms step:817/1480 train_time:124252ms step_avg:153.97ms step:818/1480 train_time:124413ms step_avg:153.98ms step:819/1480 train_time:124576ms step_avg:153.99ms step:820/1480 train_time:124739ms step_avg:154.00ms step:821/1480 train_time:124901ms step_avg:154.01ms step:822/1480 train_time:125066ms step_avg:154.02ms step:823/1480 train_time:125231ms step_avg:154.04ms step:824/1480 train_time:125392ms step_avg:154.04ms step:825/1480 train_time:125557ms step_avg:154.06ms step:826/1480 train_time:125723ms step_avg:154.07ms step:827/1480 train_time:125889ms step_avg:154.09ms step:828/1480 train_time:126051ms step_avg:154.10ms step:829/1480 train_time:126215ms step_avg:154.11ms step:830/1480 train_time:126378ms step_avg:154.12ms step:831/1480 train_time:126543ms step_avg:154.13ms step:832/1480 train_time:126708ms step_avg:154.15ms step:833/1480 train_time:126871ms step_avg:154.16ms step:834/1480 train_time:127035ms step_avg:154.17ms step:835/1480 train_time:127198ms step_avg:154.18ms step:836/1480 train_time:127363ms step_avg:154.19ms step:837/1480 train_time:127528ms step_avg:154.21ms step:838/1480 train_time:127693ms step_avg:154.22ms step:839/1480 train_time:127854ms step_avg:154.23ms step:840/1480 
train_time:128015ms step_avg:154.24ms step:841/1480 train_time:128176ms step_avg:154.24ms step:842/1480 train_time:128339ms step_avg:154.25ms step:843/1480 train_time:128501ms step_avg:154.26ms step:844/1480 train_time:128664ms step_avg:154.27ms step:845/1480 train_time:128831ms step_avg:154.29ms step:846/1480 train_time:128995ms step_avg:154.30ms step:847/1480 train_time:129158ms step_avg:154.31ms step:848/1480 train_time:129318ms step_avg:154.32ms step:849/1480 train_time:129481ms step_avg:154.33ms step:850/1480 train_time:129643ms step_avg:154.34ms step:851/1480 train_time:129810ms step_avg:154.35ms step:852/1480 train_time:129972ms step_avg:154.36ms step:853/1480 train_time:130135ms step_avg:154.37ms step:854/1480 train_time:130299ms step_avg:154.38ms step:855/1480 train_time:130462ms step_avg:154.39ms step:856/1480 train_time:130626ms step_avg:154.40ms step:857/1480 train_time:130792ms step_avg:154.42ms step:858/1480 train_time:130956ms step_avg:154.43ms step:859/1480 train_time:131119ms step_avg:154.44ms step:860/1480 train_time:131280ms step_avg:154.45ms step:861/1480 train_time:131450ms step_avg:154.47ms step:862/1480 train_time:131619ms step_avg:154.48ms step:863/1480 train_time:131788ms step_avg:154.50ms step:864/1480 train_time:131952ms step_avg:154.51ms step:865/1480 train_time:132112ms step_avg:154.52ms step:866/1480 train_time:132278ms step_avg:154.53ms step:867/1480 train_time:132442ms step_avg:154.54ms step:868/1480 train_time:132604ms step_avg:154.55ms step:869/1480 train_time:132766ms step_avg:154.56ms step:870/1480 train_time:132931ms step_avg:154.57ms step:871/1480 train_time:133094ms step_avg:154.58ms step:872/1480 train_time:133256ms step_avg:154.59ms step:873/1480 train_time:133417ms step_avg:154.60ms step:874/1480 train_time:133582ms step_avg:154.61ms step:875/1480 train_time:133748ms step_avg:154.62ms step:875/1480 val_loss:3.5048 train_time:133814ms step_avg:154.70ms step:876/1480 train_time:133915ms step_avg:154.64ms step:877/1480 
train_time:134079ms step_avg:154.65ms step:878/1480 train_time:134242ms step_avg:154.66ms step:879/1480 train_time:134405ms step_avg:154.67ms step:880/1480 train_time:134569ms step_avg:154.68ms step:881/1480 train_time:134733ms step_avg:154.69ms step:882/1480 train_time:134898ms step_avg:154.70ms step:883/1480 train_time:135064ms step_avg:154.71ms step:884/1480 train_time:135231ms step_avg:154.73ms step:885/1480 train_time:135397ms step_avg:154.74ms step:886/1480 train_time:135565ms step_avg:154.75ms step:887/1480 train_time:135734ms step_avg:154.77ms step:888/1480 train_time:135905ms step_avg:154.79ms step:889/1480 train_time:136073ms step_avg:154.80ms step:890/1480 train_time:136237ms step_avg:154.81ms step:891/1480 train_time:136402ms step_avg:154.83ms step:892/1480 train_time:136568ms step_avg:154.84ms step:893/1480 train_time:136733ms step_avg:154.85ms step:894/1480 train_time:136899ms step_avg:154.86ms step:895/1480 train_time:137067ms step_avg:154.88ms step:896/1480 train_time:137232ms step_avg:154.89ms step:897/1480 train_time:137398ms step_avg:154.90ms step:898/1480 train_time:137564ms step_avg:154.91ms step:899/1480 train_time:137729ms step_avg:154.93ms step:900/1480 train_time:137895ms step_avg:154.94ms step:901/1480 train_time:138059ms step_avg:154.95ms step:902/1480 train_time:138221ms step_avg:154.96ms step:903/1480 train_time:138393ms step_avg:154.98ms step:904/1480 train_time:138558ms step_avg:154.99ms step:905/1480 train_time:138719ms step_avg:154.99ms step:906/1480 train_time:138885ms step_avg:155.01ms step:907/1480 train_time:139054ms step_avg:155.02ms step:908/1480 train_time:139217ms step_avg:155.03ms step:909/1480 train_time:139382ms step_avg:155.04ms step:910/1480 train_time:139555ms step_avg:155.06ms step:911/1480 train_time:139720ms step_avg:155.07ms step:912/1480 train_time:139884ms step_avg:155.08ms step:913/1480 train_time:140054ms step_avg:155.10ms step:914/1480 train_time:140220ms step_avg:155.11ms step:915/1480 train_time:140392ms 
step_avg:155.13ms step:916/1480 train_time:140556ms step_avg:155.14ms step:917/1480 train_time:140719ms step_avg:155.15ms step:918/1480 train_time:140886ms step_avg:155.16ms step:919/1480 train_time:141057ms step_avg:155.18ms step:920/1480 train_time:141222ms step_avg:155.19ms step:921/1480 train_time:141388ms step_avg:155.20ms step:922/1480 train_time:141555ms step_avg:155.21ms step:923/1480 train_time:141717ms step_avg:155.22ms step:924/1480 train_time:141882ms step_avg:155.23ms step:925/1480 train_time:142049ms step_avg:155.25ms step:926/1480 train_time:142213ms step_avg:155.25ms step:927/1480 train_time:142378ms step_avg:155.26ms step:928/1480 train_time:142543ms step_avg:155.28ms step:929/1480 train_time:142709ms step_avg:155.29ms step:930/1480 train_time:142877ms step_avg:155.30ms step:931/1480 train_time:143040ms step_avg:155.31ms step:932/1480 train_time:143204ms step_avg:155.32ms step:933/1480 train_time:143374ms step_avg:155.33ms step:934/1480 train_time:143539ms step_avg:155.35ms step:935/1480 train_time:143709ms step_avg:155.36ms step:936/1480 train_time:143879ms step_avg:155.38ms step:937/1480 train_time:144049ms step_avg:155.39ms step:938/1480 train_time:144212ms step_avg:155.40ms step:939/1480 train_time:144381ms step_avg:155.42ms step:940/1480 train_time:144548ms step_avg:155.43ms step:941/1480 train_time:144713ms step_avg:155.44ms step:942/1480 train_time:144879ms step_avg:155.45ms step:943/1480 train_time:145049ms step_avg:155.47ms step:944/1480 train_time:145222ms step_avg:155.48ms step:945/1480 train_time:145387ms step_avg:155.49ms step:946/1480 train_time:145557ms step_avg:155.51ms step:947/1480 train_time:145725ms step_avg:155.52ms step:948/1480 train_time:145892ms step_avg:155.54ms step:949/1480 train_time:146060ms step_avg:155.55ms step:950/1480 train_time:146225ms step_avg:155.56ms step:951/1480 train_time:146394ms step_avg:155.57ms step:952/1480 train_time:146559ms step_avg:155.58ms step:953/1480 train_time:146729ms step_avg:155.60ms 
step:954/1480 train_time:146898ms step_avg:155.61ms step:955/1480 train_time:147061ms step_avg:155.62ms step:956/1480 train_time:147228ms step_avg:155.63ms step:957/1480 train_time:147396ms step_avg:155.65ms step:958/1480 train_time:147565ms step_avg:155.66ms step:959/1480 train_time:147730ms step_avg:155.67ms step:960/1480 train_time:147897ms step_avg:155.68ms step:961/1480 train_time:148061ms step_avg:155.69ms step:962/1480 train_time:148227ms step_avg:155.70ms step:963/1480 train_time:148394ms step_avg:155.71ms step:964/1480 train_time:148562ms step_avg:155.72ms step:965/1480 train_time:148725ms step_avg:155.73ms step:966/1480 train_time:148891ms step_avg:155.74ms step:967/1480 train_time:149055ms step_avg:155.75ms step:968/1480 train_time:149219ms step_avg:155.76ms step:969/1480 train_time:149385ms step_avg:155.77ms step:970/1480 train_time:149551ms step_avg:155.78ms step:971/1480 train_time:149715ms step_avg:155.79ms step:972/1480 train_time:149879ms step_avg:155.80ms step:973/1480 train_time:150042ms step_avg:155.81ms step:974/1480 train_time:150212ms step_avg:155.82ms step:975/1480 train_time:150379ms step_avg:155.83ms step:976/1480 train_time:150542ms step_avg:155.84ms step:977/1480 train_time:150706ms step_avg:155.85ms step:978/1480 train_time:150874ms step_avg:155.86ms step:979/1480 train_time:151040ms step_avg:155.87ms step:980/1480 train_time:151205ms step_avg:155.88ms step:981/1480 train_time:151374ms step_avg:155.90ms step:982/1480 train_time:151536ms step_avg:155.90ms step:983/1480 train_time:151701ms step_avg:155.91ms step:984/1480 train_time:151864ms step_avg:155.92ms step:985/1480 train_time:152033ms step_avg:155.93ms step:986/1480 train_time:152198ms step_avg:155.94ms step:987/1480 train_time:152360ms step_avg:155.95ms step:988/1480 train_time:152528ms step_avg:155.96ms step:989/1480 train_time:152695ms step_avg:155.97ms step:990/1480 train_time:152864ms step_avg:155.98ms step:991/1480 train_time:153031ms step_avg:156.00ms step:992/1480 
train_time:153204ms step_avg:156.01ms step:993/1480 train_time:153381ms step_avg:156.03ms step:994/1480 train_time:153546ms step_avg:156.04ms step:995/1480 train_time:153711ms step_avg:156.05ms step:996/1480 train_time:153874ms step_avg:156.06ms step:997/1480 train_time:154039ms step_avg:156.07ms step:998/1480 train_time:154202ms step_avg:156.07ms step:999/1480 train_time:154370ms step_avg:156.09ms step:1000/1480 train_time:154538ms step_avg:156.10ms step:1000/1480 val_loss:3.4383 train_time:154605ms step_avg:156.17ms step:1001/1480 train_time:154708ms step_avg:156.11ms step:1002/1480 train_time:154874ms step_avg:156.12ms step:1003/1480 train_time:155046ms step_avg:156.14ms step:1004/1480 train_time:155213ms step_avg:156.15ms step:1005/1480 train_time:155381ms step_avg:156.16ms step:1006/1480 train_time:155548ms step_avg:156.17ms step:1007/1480 train_time:155713ms step_avg:156.18ms step:1008/1480 train_time:155880ms step_avg:156.19ms step:1009/1480 train_time:156052ms step_avg:156.21ms step:1010/1480 train_time:156217ms step_avg:156.22ms step:1011/1480 train_time:156384ms step_avg:156.23ms step:1012/1480 train_time:156549ms step_avg:156.24ms step:1013/1480 train_time:156718ms step_avg:156.25ms step:1014/1480 train_time:156886ms step_avg:156.26ms step:1015/1480 train_time:157055ms step_avg:156.27ms step:1016/1480 train_time:157224ms step_avg:156.29ms step:1017/1480 train_time:157396ms step_avg:156.30ms step:1018/1480 train_time:157564ms step_avg:156.31ms step:1019/1480 train_time:157731ms step_avg:156.32ms step:1020/1480 train_time:157901ms step_avg:156.34ms step:1021/1480 train_time:158067ms step_avg:156.35ms step:1022/1480 train_time:158233ms step_avg:156.36ms step:1023/1480 train_time:158401ms step_avg:156.37ms step:1024/1480 train_time:158569ms step_avg:156.38ms step:1025/1480 train_time:158742ms step_avg:156.40ms step:1026/1480 train_time:158908ms step_avg:156.41ms step:1027/1480 train_time:159073ms step_avg:156.41ms step:1028/1480 train_time:159247ms 
step_avg:156.43ms step:1029/1480 train_time:159421ms step_avg:156.45ms step:1030/1480 train_time:159589ms step_avg:156.46ms step:1031/1480 train_time:159753ms step_avg:156.47ms step:1032/1480 train_time:159927ms step_avg:156.48ms step:1033/1480 train_time:160093ms step_avg:156.49ms step:1034/1480 train_time:160261ms step_avg:156.50ms step:1035/1480 train_time:160428ms step_avg:156.52ms step:1036/1480 train_time:160592ms step_avg:156.52ms step:1037/1480 train_time:160759ms step_avg:156.53ms step:1038/1480 train_time:160927ms step_avg:156.54ms step:1039/1480 train_time:161096ms step_avg:156.56ms step:1040/1480 train_time:161263ms step_avg:156.57ms step:1041/1480 train_time:161431ms step_avg:156.58ms step:1042/1480 train_time:161594ms step_avg:156.58ms step:1043/1480 train_time:161760ms step_avg:156.59ms step:1044/1480 train_time:161925ms step_avg:156.60ms step:1045/1480 train_time:162094ms step_avg:156.61ms step:1046/1480 train_time:162263ms step_avg:156.62ms step:1047/1480 train_time:162431ms step_avg:156.64ms step:1048/1480 train_time:162598ms step_avg:156.65ms step:1049/1480 train_time:162764ms step_avg:156.65ms step:1050/1480 train_time:162932ms step_avg:156.67ms step:1051/1480 train_time:163103ms step_avg:156.68ms step:1052/1480 train_time:163270ms step_avg:156.69ms step:1053/1480 train_time:163436ms step_avg:156.70ms step:1054/1480 train_time:163607ms step_avg:156.71ms step:1055/1480 train_time:163772ms step_avg:156.72ms step:1056/1480 train_time:163937ms step_avg:156.73ms step:1057/1480 train_time:164105ms step_avg:156.74ms step:1058/1480 train_time:164273ms step_avg:156.75ms step:1059/1480 train_time:164446ms step_avg:156.76ms step:1060/1480 train_time:164614ms step_avg:156.78ms step:1061/1480 train_time:164777ms step_avg:156.78ms step:1062/1480 train_time:164944ms step_avg:156.79ms step:1063/1480 train_time:165111ms step_avg:156.80ms step:1064/1480 train_time:165273ms step_avg:156.81ms step:1065/1480 train_time:165440ms step_avg:156.82ms step:1066/1480 
train_time:165608ms step_avg:156.83ms step:1067/1480 train_time:165775ms step_avg:156.84ms step:1068/1480 train_time:165940ms step_avg:156.84ms step:1069/1480 train_time:166111ms step_avg:156.86ms step:1070/1480 train_time:166276ms step_avg:156.86ms step:1071/1480 train_time:166449ms step_avg:156.88ms step:1072/1480 train_time:166614ms step_avg:156.89ms step:1073/1480 train_time:166779ms step_avg:156.89ms step:1074/1480 train_time:166948ms step_avg:156.91ms step:1075/1480 train_time:167117ms step_avg:156.92ms step:1076/1480 train_time:167287ms step_avg:156.93ms step:1077/1480 train_time:167452ms step_avg:156.94ms step:1078/1480 train_time:167626ms step_avg:156.95ms step:1079/1480 train_time:167799ms step_avg:156.97ms step:1080/1480 train_time:167969ms step_avg:156.98ms step:1081/1480 train_time:168135ms step_avg:156.99ms step:1082/1480 train_time:168302ms step_avg:157.00ms step:1083/1480 train_time:168469ms step_avg:157.01ms step:1084/1480 train_time:168634ms step_avg:157.02ms step:1085/1480 train_time:168803ms step_avg:157.03ms step:1086/1480 train_time:168971ms step_avg:157.04ms step:1087/1480 train_time:169138ms step_avg:157.05ms step:1088/1480 train_time:169309ms step_avg:157.06ms step:1089/1480 train_time:169481ms step_avg:157.07ms step:1090/1480 train_time:169650ms step_avg:157.08ms step:1091/1480 train_time:169818ms step_avg:157.09ms step:1092/1480 train_time:169986ms step_avg:157.10ms step:1093/1480 train_time:170153ms step_avg:157.11ms step:1094/1480 train_time:170318ms step_avg:157.12ms step:1095/1480 train_time:170482ms step_avg:157.13ms step:1096/1480 train_time:170651ms step_avg:157.14ms step:1097/1480 train_time:170819ms step_avg:157.15ms step:1098/1480 train_time:170990ms step_avg:157.16ms step:1099/1480 train_time:171161ms step_avg:157.17ms step:1100/1480 train_time:171334ms step_avg:157.19ms step:1101/1480 train_time:171506ms step_avg:157.20ms step:1102/1480 train_time:171676ms step_avg:157.21ms step:1103/1480 train_time:171852ms step_avg:157.23ms 
step:1104/1480 train_time:172020ms step_avg:157.24ms step:1105/1480 train_time:172191ms step_avg:157.25ms step:1106/1480 train_time:172360ms step_avg:157.26ms step:1107/1480 train_time:172530ms step_avg:157.27ms step:1108/1480 train_time:172694ms step_avg:157.28ms step:1109/1480 train_time:172861ms step_avg:157.29ms step:1110/1480 train_time:173027ms step_avg:157.30ms step:1111/1480 train_time:173193ms step_avg:157.31ms step:1112/1480 train_time:173363ms step_avg:157.32ms step:1113/1480 train_time:173543ms step_avg:157.34ms step:1114/1480 train_time:173715ms step_avg:157.35ms step:1115/1480 train_time:173888ms step_avg:157.36ms step:1116/1480 train_time:174055ms step_avg:157.37ms step:1117/1480 train_time:174228ms step_avg:157.39ms step:1118/1480 train_time:174403ms step_avg:157.40ms step:1119/1480 train_time:174568ms step_avg:157.41ms step:1120/1480 train_time:174737ms step_avg:157.42ms step:1121/1480 train_time:174908ms step_avg:157.43ms step:1122/1480 train_time:175073ms step_avg:157.44ms step:1123/1480 train_time:175239ms step_avg:157.45ms step:1124/1480 train_time:175408ms step_avg:157.46ms step:1125/1480 train_time:175575ms step_avg:157.47ms step:1125/1480 val_loss:3.3841 train_time:175643ms step_avg:157.53ms step:1126/1480 train_time:175745ms step_avg:157.48ms step:1127/1480 train_time:175917ms step_avg:157.49ms step:1128/1480 train_time:176088ms step_avg:157.50ms step:1129/1480 train_time:176261ms step_avg:157.52ms step:1130/1480 train_time:176430ms step_avg:157.53ms step:1131/1480 train_time:176607ms step_avg:157.54ms step:1132/1480 train_time:176774ms step_avg:157.55ms step:1133/1480 train_time:176945ms step_avg:157.56ms step:1134/1480 train_time:177116ms step_avg:157.58ms step:1135/1480 train_time:177282ms step_avg:157.58ms step:1136/1480 train_time:177454ms step_avg:157.60ms step:1137/1480 train_time:177623ms step_avg:157.61ms step:1138/1480 train_time:177796ms step_avg:157.62ms step:1139/1480 train_time:177964ms step_avg:157.63ms step:1140/1480 
train_time:178134ms step_avg:157.64ms step:1141/1480 train_time:178305ms step_avg:157.65ms step:1142/1480 train_time:178473ms step_avg:157.66ms step:1143/1480 train_time:178642ms step_avg:157.67ms step:1144/1480 train_time:178812ms step_avg:157.68ms step:1145/1480 train_time:178977ms step_avg:157.69ms step:1146/1480 train_time:179146ms step_avg:157.70ms step:1147/1480 train_time:179317ms step_avg:157.71ms step:1148/1480 train_time:179485ms step_avg:157.72ms step:1149/1480 train_time:179656ms step_avg:157.73ms step:1150/1480 train_time:179824ms step_avg:157.74ms step:1151/1480 train_time:179997ms step_avg:157.75ms step:1152/1480 train_time:180168ms step_avg:157.77ms step:1153/1480 train_time:180341ms step_avg:157.78ms step:1154/1480 train_time:180507ms step_avg:157.79ms step:1155/1480 train_time:180679ms step_avg:157.80ms step:1156/1480 train_time:180858ms step_avg:157.82ms step:1157/1480 train_time:181028ms step_avg:157.83ms step:1158/1480 train_time:181196ms step_avg:157.84ms step:1159/1480 train_time:181362ms step_avg:157.84ms step:1160/1480 train_time:181528ms step_avg:157.85ms step:1161/1480 train_time:181698ms step_avg:157.86ms step:1162/1480 train_time:181868ms step_avg:157.87ms step:1163/1480 train_time:182039ms step_avg:157.88ms step:1164/1480 train_time:182207ms step_avg:157.89ms step:1165/1480 train_time:182374ms step_avg:157.90ms step:1166/1480 train_time:182542ms step_avg:157.91ms step:1167/1480 train_time:182712ms step_avg:157.92ms step:1168/1480 train_time:182879ms step_avg:157.93ms step:1169/1480 train_time:183048ms step_avg:157.94ms step:1170/1480 train_time:183218ms step_avg:157.95ms step:1171/1480 train_time:183386ms step_avg:157.96ms step:1172/1480 train_time:183553ms step_avg:157.96ms step:1173/1480 train_time:183724ms step_avg:157.97ms step:1174/1480 train_time:183905ms step_avg:157.99ms step:1175/1480 train_time:184076ms step_avg:158.01ms step:1176/1480 train_time:184248ms step_avg:158.02ms step:1177/1480 train_time:184424ms step_avg:158.03ms 
step:1178/1480 train_time:184592ms step_avg:158.04ms step:1179/1480 train_time:184757ms step_avg:158.05ms step:1180/1480 train_time:184939ms step_avg:158.07ms step:1181/1480 train_time:185109ms step_avg:158.08ms step:1182/1480 train_time:185277ms step_avg:158.09ms step:1183/1480 train_time:185447ms step_avg:158.10ms step:1184/1480 train_time:185616ms step_avg:158.11ms step:1185/1480 train_time:185789ms step_avg:158.12ms step:1186/1480 train_time:185960ms step_avg:158.13ms step:1187/1480 train_time:186141ms step_avg:158.15ms step:1188/1480 train_time:186309ms step_avg:158.16ms step:1189/1480 train_time:186481ms step_avg:158.17ms step:1190/1480 train_time:186648ms step_avg:158.18ms step:1191/1480 train_time:186821ms step_avg:158.19ms step:1192/1480 train_time:186988ms step_avg:158.20ms step:1193/1480 train_time:187154ms step_avg:158.20ms step:1194/1480 train_time:187320ms step_avg:158.21ms step:1195/1480 train_time:187493ms step_avg:158.22ms step:1196/1480 train_time:187674ms step_avg:158.24ms step:1197/1480 train_time:187845ms step_avg:158.25ms step:1198/1480 train_time:188028ms step_avg:158.27ms step:1199/1480 train_time:188198ms step_avg:158.28ms step:1200/1480 train_time:188366ms step_avg:158.29ms step:1201/1480 train_time:188535ms step_avg:158.30ms step:1202/1480 train_time:188716ms step_avg:158.32ms step:1203/1480 train_time:188893ms step_avg:158.33ms step:1204/1480 train_time:189067ms step_avg:158.35ms step:1205/1480 train_time:189235ms step_avg:158.36ms step:1206/1480 train_time:189401ms step_avg:158.36ms step:1207/1480 train_time:189571ms step_avg:158.37ms step:1208/1480 train_time:189739ms step_avg:158.38ms step:1209/1480 train_time:189912ms step_avg:158.39ms step:1210/1480 train_time:190084ms step_avg:158.40ms step:1211/1480 train_time:190258ms step_avg:158.42ms step:1212/1480 train_time:190429ms step_avg:158.43ms step:1213/1480 train_time:190603ms step_avg:158.44ms step:1214/1480 train_time:190779ms step_avg:158.45ms step:1215/1480 train_time:190953ms 
step_avg:158.47ms step:1216/1480 train_time:191121ms step_avg:158.48ms step:1217/1480 train_time:191294ms step_avg:158.49ms step:1218/1480 train_time:191463ms step_avg:158.50ms step:1219/1480 train_time:191643ms step_avg:158.51ms step:1220/1480 train_time:191813ms step_avg:158.52ms step:1221/1480 train_time:191981ms step_avg:158.53ms step:1222/1480 train_time:192149ms step_avg:158.54ms step:1223/1480 train_time:192319ms step_avg:158.55ms step:1224/1480 train_time:192497ms step_avg:158.56ms step:1225/1480 train_time:192669ms step_avg:158.58ms step:1226/1480 train_time:192843ms step_avg:158.59ms step:1227/1480 train_time:193016ms step_avg:158.60ms step:1228/1480 train_time:193185ms step_avg:158.61ms step:1229/1480 train_time:193359ms step_avg:158.62ms step:1230/1480 train_time:193539ms step_avg:158.64ms step:1231/1480 train_time:193715ms step_avg:158.65ms step:1232/1480 train_time:193889ms step_avg:158.67ms step:1233/1480 train_time:194059ms step_avg:158.67ms step:1234/1480 train_time:194229ms step_avg:158.68ms step:1235/1480 train_time:194403ms step_avg:158.70ms step:1236/1480 train_time:194572ms step_avg:158.70ms step:1237/1480 train_time:194744ms step_avg:158.72ms step:1238/1480 train_time:194931ms step_avg:158.74ms step:1239/1480 train_time:195102ms step_avg:158.75ms step:1240/1480 train_time:195273ms step_avg:158.76ms step:1241/1480 train_time:195443ms step_avg:158.77ms step:1242/1480 train_time:195613ms step_avg:158.78ms step:1243/1480 train_time:195786ms step_avg:158.79ms step:1244/1480 train_time:195952ms step_avg:158.79ms step:1245/1480 train_time:196121ms step_avg:158.80ms step:1246/1480 train_time:196292ms step_avg:158.81ms step:1247/1480 train_time:196460ms step_avg:158.82ms step:1248/1480 train_time:196629ms step_avg:158.83ms step:1249/1480 train_time:196797ms step_avg:158.84ms step:1250/1480 train_time:196966ms step_avg:158.84ms step:1250/1480 val_loss:3.3344 train_time:197036ms step_avg:158.90ms step:1251/1480 train_time:197144ms step_avg:158.86ms 
step:1252/1480 train_time:197314ms step_avg:158.87ms step:1253/1480 train_time:197482ms step_avg:158.88ms step:1254/1480 train_time:197654ms step_avg:158.89ms step:1255/1480 train_time:197840ms step_avg:158.91ms step:1256/1480 train_time:198015ms step_avg:158.92ms step:1257/1480 train_time:198185ms step_avg:158.93ms step:1258/1480 train_time:198360ms step_avg:158.94ms step:1259/1480 train_time:198531ms step_avg:158.95ms step:1260/1480 train_time:198698ms step_avg:158.96ms step:1261/1480 train_time:198872ms step_avg:158.97ms step:1262/1480 train_time:199048ms step_avg:158.98ms step:1263/1480 train_time:199222ms step_avg:159.00ms step:1264/1480 train_time:199389ms step_avg:159.00ms step:1265/1480 train_time:199556ms step_avg:159.01ms step:1266/1480 train_time:199726ms step_avg:159.02ms step:1267/1480 train_time:199896ms step_avg:159.03ms step:1268/1480 train_time:200066ms step_avg:159.03ms step:1269/1480 train_time:200243ms step_avg:159.05ms step:1270/1480 train_time:200413ms step_avg:159.06ms step:1271/1480 train_time:200582ms step_avg:159.07ms step:1272/1480 train_time:200749ms step_avg:159.07ms step:1273/1480 train_time:200920ms step_avg:159.08ms step:1274/1480 train_time:201092ms step_avg:159.09ms step:1275/1480 train_time:201260ms step_avg:159.10ms step:1276/1480 train_time:201424ms step_avg:159.10ms step:1277/1480 train_time:201598ms step_avg:159.11ms step:1278/1480 train_time:201766ms step_avg:159.12ms step:1279/1480 train_time:201939ms step_avg:159.13ms step:1280/1480 train_time:202120ms step_avg:159.15ms step:1281/1480 train_time:202288ms step_avg:159.16ms step:1282/1480 train_time:202454ms step_avg:159.16ms step:1283/1480 train_time:202623ms step_avg:159.17ms step:1284/1480 train_time:202794ms step_avg:159.18ms step:1285/1480 train_time:202963ms step_avg:159.19ms step:1286/1480 train_time:203135ms step_avg:159.20ms step:1287/1480 train_time:203306ms step_avg:159.21ms step:1288/1480 train_time:203478ms step_avg:159.22ms step:1289/1480 train_time:203661ms 
step_avg:159.23ms step:1290/1480 train_time:203842ms step_avg:159.25ms step:1291/1480 train_time:204016ms step_avg:159.26ms step:1292/1480 train_time:204192ms step_avg:159.28ms step:1293/1480 train_time:204368ms step_avg:159.29ms step:1294/1480 train_time:204538ms step_avg:159.30ms step:1295/1480 train_time:204709ms step_avg:159.31ms step:1296/1480 train_time:204882ms step_avg:159.32ms step:1297/1480 train_time:205055ms step_avg:159.33ms step:1298/1480 train_time:205224ms step_avg:159.34ms step:1299/1480 train_time:205395ms step_avg:159.34ms step:1300/1480 train_time:205561ms step_avg:159.35ms step:1301/1480 train_time:205731ms step_avg:159.36ms step:1302/1480 train_time:205905ms step_avg:159.37ms step:1303/1480 train_time:206081ms step_avg:159.38ms step:1304/1480 train_time:206255ms step_avg:159.39ms step:1305/1480 train_time:206424ms step_avg:159.40ms step:1306/1480 train_time:206598ms step_avg:159.41ms step:1307/1480 train_time:206765ms step_avg:159.42ms step:1308/1480 train_time:206935ms step_avg:159.43ms step:1309/1480 train_time:207105ms step_avg:159.43ms step:1310/1480 train_time:207274ms step_avg:159.44ms step:1311/1480 train_time:207442ms step_avg:159.45ms step:1312/1480 train_time:207615ms step_avg:159.46ms step:1313/1480 train_time:207783ms step_avg:159.46ms step:1314/1480 train_time:207958ms step_avg:159.48ms step:1315/1480 train_time:208128ms step_avg:159.48ms step:1316/1480 train_time:208295ms step_avg:159.49ms step:1317/1480 train_time:208464ms step_avg:159.50ms step:1318/1480 train_time:208644ms step_avg:159.51ms step:1319/1480 train_time:208819ms step_avg:159.53ms step:1320/1480 train_time:208996ms step_avg:159.54ms step:1321/1480 train_time:209167ms step_avg:159.55ms step:1322/1480 train_time:209351ms step_avg:159.57ms step:1323/1480 train_time:209521ms step_avg:159.57ms step:1324/1480 train_time:209696ms step_avg:159.59ms step:1325/1480 train_time:209876ms step_avg:159.60ms step:1326/1480 train_time:210051ms step_avg:159.61ms step:1327/1480 
train_time:210222ms step_avg:159.62ms step:1328/1480 train_time:210393ms step_avg:159.63ms step:1329/1480 train_time:210587ms step_avg:159.66ms step:1330/1480 train_time:210765ms step_avg:159.67ms step:1331/1480 train_time:210936ms step_avg:159.68ms step:1332/1480 train_time:211111ms step_avg:159.69ms step:1333/1480 train_time:211287ms step_avg:159.70ms step:1334/1480 train_time:211459ms step_avg:159.71ms step:1335/1480 train_time:211626ms step_avg:159.72ms step:1336/1480 train_time:211813ms step_avg:159.74ms step:1337/1480 train_time:211987ms step_avg:159.75ms step:1338/1480 train_time:212159ms step_avg:159.76ms step:1339/1480 train_time:212334ms step_avg:159.77ms step:1340/1480 train_time:212504ms step_avg:159.78ms step:1341/1480 train_time:212674ms step_avg:159.78ms step:1342/1480 train_time:212846ms step_avg:159.79ms step:1343/1480 train_time:213016ms step_avg:159.80ms step:1344/1480 train_time:213188ms step_avg:159.81ms step:1345/1480 train_time:213368ms step_avg:159.83ms step:1346/1480 train_time:213536ms step_avg:159.83ms step:1347/1480 train_time:213704ms step_avg:159.84ms step:1348/1480 train_time:213874ms step_avg:159.85ms step:1349/1480 train_time:214043ms step_avg:159.85ms step:1350/1480 train_time:214220ms step_avg:159.87ms step:1351/1480 train_time:214390ms step_avg:159.87ms step:1352/1480 train_time:214560ms step_avg:159.88ms step:1353/1480 train_time:214736ms step_avg:159.89ms step:1354/1480 train_time:214907ms step_avg:159.90ms step:1355/1480 train_time:215075ms step_avg:159.91ms step:1356/1480 train_time:215247ms step_avg:159.92ms step:1357/1480 train_time:215423ms step_avg:159.93ms step:1358/1480 train_time:215595ms step_avg:159.94ms step:1359/1480 train_time:215767ms step_avg:159.95ms step:1360/1480 train_time:215941ms step_avg:159.96ms step:1361/1480 train_time:216119ms step_avg:159.97ms step:1362/1480 train_time:216295ms step_avg:159.98ms step:1363/1480 train_time:216476ms step_avg:160.00ms step:1364/1480 train_time:216645ms step_avg:160.00ms 
step:1365/1480 train_time:216813ms step_avg:160.01ms step:1366/1480 train_time:216984ms step_avg:160.02ms step:1367/1480 train_time:217156ms step_avg:160.03ms step:1368/1480 train_time:217329ms step_avg:160.04ms step:1369/1480 train_time:217510ms step_avg:160.05ms step:1370/1480 train_time:217689ms step_avg:160.07ms step:1371/1480 train_time:217861ms step_avg:160.07ms step:1372/1480 train_time:218040ms step_avg:160.09ms step:1373/1480 train_time:218208ms step_avg:160.09ms step:1374/1480 train_time:218386ms step_avg:160.11ms step:1375/1480 train_time:218557ms step_avg:160.12ms step:1375/1480 val_loss:3.2963 train_time:218625ms step_avg:160.16ms step:1376/1480 train_time:218729ms step_avg:160.12ms step:1377/1480 train_time:218902ms step_avg:160.13ms step:1378/1480 train_time:219070ms step_avg:160.14ms step:1379/1480 train_time:219247ms step_avg:160.15ms step:1380/1480 train_time:219422ms step_avg:160.16ms step:1381/1480 train_time:219603ms step_avg:160.18ms step:1382/1480 train_time:219775ms step_avg:160.19ms step:1383/1480 train_time:219948ms step_avg:160.20ms step:1384/1480 train_time:220125ms step_avg:160.21ms step:1385/1480 train_time:220290ms step_avg:160.21ms step:1386/1480 train_time:220461ms step_avg:160.22ms step:1387/1480 train_time:220631ms step_avg:160.23ms step:1388/1480 train_time:220800ms step_avg:160.23ms step:1389/1480 train_time:220972ms step_avg:160.24ms step:1390/1480 train_time:221141ms step_avg:160.25ms step:1391/1480 train_time:221310ms step_avg:160.25ms step:1392/1480 train_time:221483ms step_avg:160.26ms step:1393/1480 train_time:221652ms step_avg:160.27ms step:1394/1480 train_time:221823ms step_avg:160.28ms step:1395/1480 train_time:221991ms step_avg:160.28ms step:1396/1480 train_time:222161ms step_avg:160.29ms step:1397/1480 train_time:222328ms step_avg:160.29ms step:1398/1480 train_time:222495ms step_avg:160.30ms step:1399/1480 train_time:222665ms step_avg:160.31ms step:1400/1480 train_time:222842ms step_avg:160.32ms step:1401/1480 
train_time:223008ms step_avg:160.32ms step:1402/1480 train_time:223180ms step_avg:160.33ms step:1403/1480 train_time:223358ms step_avg:160.34ms step:1404/1480 train_time:223529ms step_avg:160.35ms step:1405/1480 train_time:223703ms step_avg:160.36ms step:1406/1480 train_time:223877ms step_avg:160.37ms step:1407/1480 train_time:224045ms step_avg:160.38ms step:1408/1480 train_time:224214ms step_avg:160.38ms step:1409/1480 train_time:224396ms step_avg:160.40ms step:1410/1480 train_time:224564ms step_avg:160.40ms step:1411/1480 train_time:224730ms step_avg:160.41ms step:1412/1480 train_time:224901ms step_avg:160.41ms step:1413/1480 train_time:225069ms step_avg:160.42ms step:1414/1480 train_time:225242ms step_avg:160.43ms step:1415/1480 train_time:225416ms step_avg:160.44ms step:1416/1480 train_time:225602ms step_avg:160.46ms step:1417/1480 train_time:225775ms step_avg:160.47ms step:1418/1480 train_time:225946ms step_avg:160.47ms step:1419/1480 train_time:226120ms step_avg:160.48ms step:1420/1480 train_time:226295ms step_avg:160.49ms step:1421/1480 train_time:226467ms step_avg:160.50ms step:1422/1480 train_time:226641ms step_avg:160.51ms step:1423/1480 train_time:226811ms step_avg:160.52ms step:1424/1480 train_time:226990ms step_avg:160.53ms step:1425/1480 train_time:227169ms step_avg:160.54ms step:1426/1480 train_time:227341ms step_avg:160.55ms step:1427/1480 train_time:227515ms step_avg:160.56ms step:1428/1480 train_time:227685ms step_avg:160.57ms step:1429/1480 train_time:227852ms step_avg:160.57ms step:1430/1480 train_time:228026ms step_avg:160.58ms step:1431/1480 train_time:228201ms step_avg:160.59ms step:1432/1480 train_time:228377ms step_avg:160.60ms step:1433/1480 train_time:228556ms step_avg:160.62ms step:1434/1480 train_time:228737ms step_avg:160.63ms step:1435/1480 train_time:228912ms step_avg:160.64ms step:1436/1480 train_time:229087ms step_avg:160.65ms step:1437/1480 train_time:229258ms step_avg:160.66ms step:1438/1480 train_time:229426ms step_avg:160.66ms 
step:1439/1480 train_time:229600ms step_avg:160.67ms step:1440/1480 train_time:229770ms step_avg:160.68ms step:1441/1480 train_time:229942ms step_avg:160.69ms step:1442/1480 train_time:230119ms step_avg:160.70ms step:1443/1480 train_time:230309ms step_avg:160.72ms step:1444/1480 train_time:230481ms step_avg:160.73ms step:1445/1480 train_time:230652ms step_avg:160.73ms step:1446/1480 train_time:230827ms step_avg:160.74ms step:1447/1480 train_time:231004ms step_avg:160.75ms step:1448/1480 train_time:231177ms step_avg:160.76ms step:1449/1480 train_time:231351ms step_avg:160.77ms step:1450/1480 train_time:231525ms step_avg:160.78ms step:1451/1480 train_time:231694ms step_avg:160.79ms step:1452/1480 train_time:231868ms step_avg:160.80ms step:1453/1480 train_time:232038ms step_avg:160.80ms step:1454/1480 train_time:232210ms step_avg:160.81ms step:1455/1480 train_time:232388ms step_avg:160.82ms step:1456/1480 train_time:232562ms step_avg:160.83ms step:1457/1480 train_time:232732ms step_avg:160.84ms step:1458/1480 train_time:232903ms step_avg:160.84ms step:1459/1480 train_time:233079ms step_avg:160.85ms step:1460/1480 train_time:233250ms step_avg:160.86ms step:1461/1480 train_time:233425ms step_avg:160.87ms step:1462/1480 train_time:233595ms step_avg:160.88ms step:1463/1480 train_time:233771ms step_avg:160.89ms step:1464/1480 train_time:233948ms step_avg:160.90ms step:1465/1480 train_time:234120ms step_avg:160.91ms step:1466/1480 train_time:234291ms step_avg:160.91ms step:1467/1480 train_time:234465ms step_avg:160.92ms step:1468/1480 train_time:234634ms step_avg:160.93ms step:1469/1480 train_time:234806ms step_avg:160.94ms step:1470/1480 train_time:234988ms step_avg:160.95ms step:1471/1480 train_time:235173ms step_avg:160.97ms step:1472/1480 train_time:235352ms step_avg:160.98ms step:1473/1480 train_time:235524ms step_avg:160.99ms step:1474/1480 train_time:235703ms step_avg:161.00ms step:1475/1480 train_time:235883ms step_avg:161.01ms step:1476/1480 train_time:236055ms 
step_avg:161.02ms step:1477/1480 train_time:236239ms step_avg:161.04ms step:1478/1480 train_time:236422ms step_avg:161.05ms step:1479/1480 train_time:236596ms step_avg:161.06ms step:1480/1480 train_time:236768ms step_avg:161.07ms step:1480/1480 val_loss:3.2775 train_time:236839ms step_avg:161.11ms