import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable scale is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        # the stored weight may be kept in a different dtype than the activations
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of channels: base ** (-2i / dim)
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        """Rotate x; dim 1 is taken as the sequence axis and dim 3 as the channel axis."""
        seq_len = x.shape[1]
        # NOTE(review): the cache is keyed on seq_len only, so it is not refreshed if the
        # device or the dtype of inv_freq changes between calls - confirm this cannot happen.
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # add broadcast axes at dims 0 and 2 (batch and head, given the caller's layout)
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # the two halves of the channel axis form the (x1, x2) pairs that get rotated
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK-norm, rotary embeddings, a value residual and FlexAttention."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Attend over x.

        Arguments:
            x: Input activations; dims 0 and 1 are read as batch and sequence length.
            vi: Token value embedding mixed into the values; must be viewable as v.
            block_mask: The FlexAttention block mask to apply.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        # split the channel axis into (n_head, head_dim)
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # learned mix of this layer's values with the token value embedding
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim), hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: learned mix with the initial embedding, then attention and MLP."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to (1, 0): the block starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Arguments:
            x: The residual stream entering this layer.
            vi: Token value embedding forwarded to the attention.
            x0: The tensor mixed back into x before the sublayers.
            block_mask: The FlexAttention block mask forwarded to the attention.
        """
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        # pre-norm residual sublayers
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Model hyperparameters consumed by GPT and Block."""
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to +-this value with tanh

class GPT(nn.Module):
    """GPT-2 style language model with U-net skip connections and token value embeddings."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # NOTE(review): for an odd n_layer there is one more decoder layer than encoder layers, so
        # forward() would pop from an empty skip list - presumably n_layer is always even; confirm.
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss of one token sequence.

        Arguments:
            idx: 1-D tensor of input token ids (asserted); its length has to be a
                multiple of 128 for the block reshape below.
            target: Target token ids, flattened before the loss.
            sliding_window: Attention window size in tokens.

        Returns:
            The scalar cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token id 50256 gives a document id per position
        # (presumably 50256 is the end-of-text token - confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        # lowest / highest document id that occurs inside each block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the rule above: a kv block is kept if any token pair
            # in (q block, kv block) could be allowed
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # the document-id ranges of the two blocks overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window size converted to blocks, rounded up
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            # per q block: how many kv blocks are kept ...
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # ... and their indices first (stable descending sort of the 0/1 mask)
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton axes (presumably batch and head - confirm)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value embedding of width n_embd per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() pairs decoder layer i with the mirrored encoder layer (last in, first out)
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it declares."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read the ntok uint16 tokens that follow the header of a shard into a pinned tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        # skip the 256 * int32 header
        f.seek(256 * 4)
        # read straight into the tensor's memory through its numpy view
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (input, target) token sequences of length T from a set of .bin shards.

    Within a shard, process r starts at offset r * T and every call to next_batch()
    moves it forward by T * num_processes tokens.
    """
    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one batch for all processes (+1 for the shifted targets)
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # wraps around to the first shard after the last one
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.T * self.num_processes
        # T + 1 tokens, so that inputs and shifted targets both have length T
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Run configuration; instantiated once below as `args`."""
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
# each process drives the GPU that matches its local rank
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # Only the parent directory is needed for the text log. Create it up front so that
    # a fresh checkout without a logs/ directory does not fail with FileNotFoundError.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append s to the log file on the master process; a no-op on every other process.

    Arguments:
        s: The text to log (a newline is appended in the file).
        logonly: If False, s is also printed to stdout.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not implemented: one sequence per process per step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch ahead of the loop
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# the CastedLinear layers go back to float32 after the blanket bfloat16 cast; they cast
# their weight to the activation dtype on each forward call
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# embeddings and the output head get Adam, each with its own learning rate
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
# inside the transformer blocks: 2D matrices go to Muon, everything below 2D to Adam
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the learning-rate multiplier for scheduler step `it`.

    The schedule is a linear warmup over args.warmup_iters steps, then a constant 1.0,
    then a linear decay to 0 over the final args.cooldown_iters steps. If
    args.cooldown_iters is 0 the multiplier stays at 1.0 to the end.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # no cooldown configured: the scheduler still calls this with it == num_iterations
    # after the last training step, which must not divide by zero
    elif args.cooldown_iters <= 0:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size lives in a CUDA tensor that is updated in place (see copy_ below);
# sw_size_prev mirrors its value on the host so unchanged sizes skip the copy
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    # divisor for the step_avg printouts; nan until enough timed steps exist
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # linear ramp from 64 to 1792 tokens over the run, rounded down to a multiple of 64
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average over processes, then over the validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE(review): the snapshot is assembled, but the torch.save call below is commented out,
        # so nothing is written to disk
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 11:09:11 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 130W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 123W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 114W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 38C P0 97W / 700W | 31MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 105W / 700W | 37MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI 
CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23018ms step_avg:nanms step:2/1480 train_time:23141ms step_avg:nanms step:3/1480 train_time:23280ms step_avg:nanms step:4/1480 train_time:23420ms step_avg:nanms step:5/1480 train_time:23562ms step_avg:nanms step:6/1480 train_time:23703ms step_avg:nanms step:7/1480 train_time:23844ms step_avg:nanms step:8/1480 train_time:23986ms step_avg:nanms step:9/1480 train_time:24130ms step_avg:nanms step:10/1480 train_time:24273ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:427ms step_avg:142.49ms step:14/1480 train_time:569ms step_avg:142.15ms step:15/1480 train_time:711ms step_avg:142.13ms step:16/1480 train_time:854ms step_avg:142.38ms step:17/1480 train_time:999ms step_avg:142.68ms step:18/1480 train_time:1143ms step_avg:142.88ms step:19/1480 train_time:1287ms step_avg:142.96ms step:20/1480 train_time:1429ms step_avg:142.93ms step:21/1480 train_time:1571ms step_avg:142.81ms step:22/1480 train_time:1712ms step_avg:142.69ms step:23/1480 train_time:1854ms step_avg:142.64ms step:24/1480 train_time:1999ms step_avg:142.76ms step:25/1480 train_time:2142ms step_avg:142.82ms step:26/1480 train_time:2285ms step_avg:142.80ms step:27/1480 train_time:2428ms step_avg:142.81ms step:28/1480 train_time:2569ms step_avg:142.75ms step:29/1480 train_time:2711ms 
step_avg:142.66ms step:30/1480 train_time:2853ms step_avg:142.66ms step:31/1480 train_time:2997ms step_avg:142.72ms step:32/1480 train_time:3142ms step_avg:142.81ms step:33/1480 train_time:3286ms step_avg:142.86ms step:34/1480 train_time:3429ms step_avg:142.86ms step:35/1480 train_time:3571ms step_avg:142.84ms step:36/1480 train_time:3712ms step_avg:142.78ms step:37/1480 train_time:3855ms step_avg:142.77ms step:38/1480 train_time:3998ms step_avg:142.79ms step:39/1480 train_time:4141ms step_avg:142.80ms step:40/1480 train_time:4283ms step_avg:142.78ms step:41/1480 train_time:4428ms step_avg:142.83ms step:42/1480 train_time:4570ms step_avg:142.80ms step:43/1480 train_time:4711ms step_avg:142.76ms step:44/1480 train_time:4852ms step_avg:142.72ms step:45/1480 train_time:4996ms step_avg:142.75ms step:46/1480 train_time:5141ms step_avg:142.82ms step:47/1480 train_time:5284ms step_avg:142.82ms step:48/1480 train_time:5427ms step_avg:142.83ms step:49/1480 train_time:5570ms step_avg:142.82ms step:50/1480 train_time:5712ms step_avg:142.79ms step:51/1480 train_time:5854ms step_avg:142.77ms step:52/1480 train_time:5996ms step_avg:142.75ms step:53/1480 train_time:6140ms step_avg:142.79ms step:54/1480 train_time:6284ms step_avg:142.82ms step:55/1480 train_time:6428ms step_avg:142.85ms step:56/1480 train_time:6571ms step_avg:142.84ms step:57/1480 train_time:6711ms step_avg:142.80ms step:58/1480 train_time:6853ms step_avg:142.76ms step:59/1480 train_time:6996ms step_avg:142.78ms step:60/1480 train_time:7141ms step_avg:142.82ms step:61/1480 train_time:7284ms step_avg:142.82ms step:62/1480 train_time:7428ms step_avg:142.85ms step:63/1480 train_time:7570ms step_avg:142.84ms step:64/1480 train_time:7712ms step_avg:142.82ms step:65/1480 train_time:7854ms step_avg:142.79ms step:66/1480 train_time:7997ms step_avg:142.80ms step:67/1480 train_time:8141ms step_avg:142.83ms step:68/1480 train_time:8286ms step_avg:142.86ms step:69/1480 train_time:8429ms step_avg:142.87ms step:70/1480 
train_time:8571ms step_avg:142.84ms step:71/1480 train_time:8711ms step_avg:142.81ms step:72/1480 train_time:8853ms step_avg:142.80ms step:73/1480 train_time:8996ms step_avg:142.79ms step:74/1480 train_time:9138ms step_avg:142.79ms step:75/1480 train_time:9281ms step_avg:142.78ms step:76/1480 train_time:9426ms step_avg:142.81ms step:77/1480 train_time:9570ms step_avg:142.83ms step:78/1480 train_time:9711ms step_avg:142.80ms step:79/1480 train_time:9854ms step_avg:142.81ms step:80/1480 train_time:9995ms step_avg:142.79ms step:81/1480 train_time:10139ms step_avg:142.81ms step:82/1480 train_time:10283ms step_avg:142.82ms step:83/1480 train_time:10427ms step_avg:142.83ms step:84/1480 train_time:10570ms step_avg:142.83ms step:85/1480 train_time:10712ms step_avg:142.83ms step:86/1480 train_time:10853ms step_avg:142.80ms step:87/1480 train_time:10996ms step_avg:142.80ms step:88/1480 train_time:11139ms step_avg:142.81ms step:89/1480 train_time:11282ms step_avg:142.81ms step:90/1480 train_time:11426ms step_avg:142.83ms step:91/1480 train_time:11569ms step_avg:142.82ms step:92/1480 train_time:11710ms step_avg:142.81ms step:93/1480 train_time:11852ms step_avg:142.79ms step:94/1480 train_time:11995ms step_avg:142.80ms step:95/1480 train_time:12138ms step_avg:142.80ms step:96/1480 train_time:12280ms step_avg:142.79ms step:97/1480 train_time:12425ms step_avg:142.81ms step:98/1480 train_time:12568ms step_avg:142.82ms step:99/1480 train_time:12710ms step_avg:142.81ms step:100/1480 train_time:12852ms step_avg:142.80ms step:101/1480 train_time:12993ms step_avg:142.78ms step:102/1480 train_time:13137ms step_avg:142.79ms step:103/1480 train_time:13278ms step_avg:142.78ms step:104/1480 train_time:13421ms step_avg:142.78ms step:105/1480 train_time:13565ms step_avg:142.79ms step:106/1480 train_time:13709ms step_avg:142.80ms step:107/1480 train_time:13852ms step_avg:142.80ms step:108/1480 train_time:13993ms step_avg:142.79ms step:109/1480 train_time:14134ms step_avg:142.77ms step:110/1480 
train_time:14275ms step_avg:142.75ms step:111/1480 train_time:14422ms step_avg:142.79ms step:112/1480 train_time:14570ms step_avg:142.84ms step:113/1480 train_time:14717ms step_avg:142.88ms step:114/1480 train_time:14866ms step_avg:142.94ms step:115/1480 train_time:15012ms step_avg:142.97ms step:116/1480 train_time:15159ms step_avg:143.01ms step:117/1480 train_time:15307ms step_avg:143.06ms step:118/1480 train_time:15454ms step_avg:143.09ms step:119/1480 train_time:15601ms step_avg:143.13ms step:120/1480 train_time:15749ms step_avg:143.17ms step:121/1480 train_time:15895ms step_avg:143.20ms step:122/1480 train_time:16042ms step_avg:143.23ms step:123/1480 train_time:16189ms step_avg:143.26ms step:124/1480 train_time:16335ms step_avg:143.29ms step:125/1480 train_time:16482ms step_avg:143.32ms step:125/1480 val_loss:4.4255 train_time:16541ms step_avg:143.83ms step:126/1480 train_time:16637ms step_avg:143.42ms step:127/1480 train_time:16786ms step_avg:143.47ms step:128/1480 train_time:16930ms step_avg:143.48ms step:129/1480 train_time:17078ms step_avg:143.51ms step:130/1480 train_time:17224ms step_avg:143.53ms step:131/1480 train_time:17369ms step_avg:143.55ms step:132/1480 train_time:17515ms step_avg:143.57ms step:133/1480 train_time:17666ms step_avg:143.63ms step:134/1480 train_time:17813ms step_avg:143.66ms step:135/1480 train_time:17962ms step_avg:143.70ms step:136/1480 train_time:18108ms step_avg:143.71ms step:137/1480 train_time:18255ms step_avg:143.74ms step:138/1480 train_time:18402ms step_avg:143.77ms step:139/1480 train_time:18549ms step_avg:143.79ms step:140/1480 train_time:18698ms step_avg:143.83ms step:141/1480 train_time:18845ms step_avg:143.85ms step:142/1480 train_time:18994ms step_avg:143.90ms step:143/1480 train_time:19141ms step_avg:143.92ms step:144/1480 train_time:19288ms step_avg:143.94ms step:145/1480 train_time:19435ms step_avg:143.96ms step:146/1480 train_time:19583ms step_avg:143.99ms step:147/1480 train_time:19729ms step_avg:144.01ms 
step:148/1480 train_time:19877ms step_avg:144.04ms step:149/1480 train_time:20024ms step_avg:144.06ms step:150/1480 train_time:20171ms step_avg:144.08ms step:151/1480 train_time:20319ms step_avg:144.11ms step:152/1480 train_time:20466ms step_avg:144.13ms step:153/1480 train_time:20613ms step_avg:144.15ms step:154/1480 train_time:20760ms step_avg:144.17ms step:155/1480 train_time:20907ms step_avg:144.19ms step:156/1480 train_time:21056ms step_avg:144.22ms step:157/1480 train_time:21203ms step_avg:144.24ms step:158/1480 train_time:21349ms step_avg:144.25ms step:159/1480 train_time:21497ms step_avg:144.28ms step:160/1480 train_time:21644ms step_avg:144.29ms step:161/1480 train_time:21790ms step_avg:144.30ms step:162/1480 train_time:21937ms step_avg:144.32ms step:163/1480 train_time:22084ms step_avg:144.34ms step:164/1480 train_time:22230ms step_avg:144.35ms step:165/1480 train_time:22378ms step_avg:144.37ms step:166/1480 train_time:22526ms step_avg:144.39ms step:167/1480 train_time:22672ms step_avg:144.41ms step:168/1480 train_time:22820ms step_avg:144.43ms step:169/1480 train_time:22967ms step_avg:144.45ms step:170/1480 train_time:23114ms step_avg:144.46ms step:171/1480 train_time:23262ms step_avg:144.48ms step:172/1480 train_time:23408ms step_avg:144.49ms step:173/1480 train_time:23555ms step_avg:144.51ms step:174/1480 train_time:23703ms step_avg:144.53ms step:175/1480 train_time:23849ms step_avg:144.54ms step:176/1480 train_time:23996ms step_avg:144.55ms step:177/1480 train_time:24143ms step_avg:144.57ms step:178/1480 train_time:24289ms step_avg:144.58ms step:179/1480 train_time:24436ms step_avg:144.59ms step:180/1480 train_time:24584ms step_avg:144.61ms step:181/1480 train_time:24731ms step_avg:144.63ms step:182/1480 train_time:24880ms step_avg:144.65ms step:183/1480 train_time:25027ms step_avg:144.67ms step:184/1480 train_time:25174ms step_avg:144.68ms step:185/1480 train_time:25320ms step_avg:144.69ms step:186/1480 train_time:25467ms step_avg:144.70ms 
step:187/1480 train_time:25614ms step_avg:144.71ms step:188/1480 train_time:25763ms step_avg:144.74ms step:189/1480 train_time:25909ms step_avg:144.74ms step:190/1480 train_time:26055ms step_avg:144.75ms step:191/1480 train_time:26203ms step_avg:144.77ms step:192/1480 train_time:26349ms step_avg:144.77ms step:193/1480 train_time:26496ms step_avg:144.79ms step:194/1480 train_time:26643ms step_avg:144.80ms step:195/1480 train_time:26791ms step_avg:144.82ms step:196/1480 train_time:26940ms step_avg:144.84ms step:197/1480 train_time:27087ms step_avg:144.85ms step:198/1480 train_time:27232ms step_avg:144.85ms step:199/1480 train_time:27380ms step_avg:144.87ms step:200/1480 train_time:27528ms step_avg:144.88ms step:201/1480 train_time:27674ms step_avg:144.89ms step:202/1480 train_time:27821ms step_avg:144.90ms step:203/1480 train_time:27967ms step_avg:144.91ms step:204/1480 train_time:28113ms step_avg:144.91ms step:205/1480 train_time:28262ms step_avg:144.93ms step:206/1480 train_time:28408ms step_avg:144.94ms step:207/1480 train_time:28555ms step_avg:144.95ms step:208/1480 train_time:28703ms step_avg:144.96ms step:209/1480 train_time:28849ms step_avg:144.97ms step:210/1480 train_time:28997ms step_avg:144.98ms step:211/1480 train_time:29144ms step_avg:145.00ms step:212/1480 train_time:29290ms step_avg:145.00ms step:213/1480 train_time:29438ms step_avg:145.02ms step:214/1480 train_time:29586ms step_avg:145.03ms step:215/1480 train_time:29731ms step_avg:145.03ms step:216/1480 train_time:29878ms step_avg:145.04ms step:217/1480 train_time:30025ms step_avg:145.05ms step:218/1480 train_time:30171ms step_avg:145.05ms step:219/1480 train_time:30318ms step_avg:145.06ms step:220/1480 train_time:30466ms step_avg:145.08ms step:221/1480 train_time:30613ms step_avg:145.09ms step:222/1480 train_time:30763ms step_avg:145.11ms step:223/1480 train_time:30913ms step_avg:145.13ms step:224/1480 train_time:31064ms step_avg:145.16ms step:225/1480 train_time:31213ms step_avg:145.18ms 
step:226/1480 train_time:31364ms step_avg:145.20ms step:227/1480 train_time:31516ms step_avg:145.23ms step:228/1480 train_time:31666ms step_avg:145.26ms step:229/1480 train_time:31816ms step_avg:145.28ms step:230/1480 train_time:31968ms step_avg:145.31ms step:231/1480 train_time:32118ms step_avg:145.33ms step:232/1480 train_time:32270ms step_avg:145.36ms step:233/1480 train_time:32421ms step_avg:145.39ms step:234/1480 train_time:32573ms step_avg:145.41ms step:235/1480 train_time:32723ms step_avg:145.43ms step:236/1480 train_time:32873ms step_avg:145.45ms step:237/1480 train_time:33023ms step_avg:145.48ms step:238/1480 train_time:33174ms step_avg:145.50ms step:239/1480 train_time:33325ms step_avg:145.52ms step:240/1480 train_time:33475ms step_avg:145.54ms step:241/1480 train_time:33626ms step_avg:145.57ms step:242/1480 train_time:33777ms step_avg:145.59ms step:243/1480 train_time:33928ms step_avg:145.61ms step:244/1480 train_time:34079ms step_avg:145.64ms step:245/1480 train_time:34230ms step_avg:145.66ms step:246/1480 train_time:34382ms step_avg:145.69ms step:247/1480 train_time:34532ms step_avg:145.70ms step:248/1480 train_time:34682ms step_avg:145.72ms step:249/1480 train_time:34832ms step_avg:145.74ms step:250/1480 train_time:34984ms step_avg:145.77ms step:250/1480 val_loss:3.9930 train_time:35042ms step_avg:146.01ms step:251/1480 train_time:35140ms step_avg:145.81ms step:252/1480 train_time:35292ms step_avg:145.83ms step:253/1480 train_time:35442ms step_avg:145.85ms step:254/1480 train_time:35591ms step_avg:145.86ms step:255/1480 train_time:35741ms step_avg:145.88ms step:256/1480 train_time:35890ms step_avg:145.89ms step:257/1480 train_time:36040ms step_avg:145.91ms step:258/1480 train_time:36192ms step_avg:145.93ms step:259/1480 train_time:36343ms step_avg:145.96ms step:260/1480 train_time:36494ms step_avg:145.98ms step:261/1480 train_time:36645ms step_avg:145.99ms step:262/1480 train_time:36794ms step_avg:146.01ms step:263/1480 train_time:36945ms 
step_avg:146.03ms step:264/1480 train_time:37096ms step_avg:146.05ms step:265/1480 train_time:37247ms step_avg:146.07ms step:266/1480 train_time:37399ms step_avg:146.09ms step:267/1480 train_time:37549ms step_avg:146.10ms step:268/1480 train_time:37700ms step_avg:146.12ms step:269/1480 train_time:37848ms step_avg:146.13ms step:270/1480 train_time:37998ms step_avg:146.15ms step:271/1480 train_time:38148ms step_avg:146.16ms step:272/1480 train_time:38299ms step_avg:146.18ms step:273/1480 train_time:38449ms step_avg:146.19ms step:274/1480 train_time:38600ms step_avg:146.21ms step:275/1480 train_time:38750ms step_avg:146.23ms step:276/1480 train_time:38901ms step_avg:146.24ms step:277/1480 train_time:39049ms step_avg:146.25ms step:278/1480 train_time:39200ms step_avg:146.27ms step:279/1480 train_time:39351ms step_avg:146.29ms step:280/1480 train_time:39503ms step_avg:146.31ms step:281/1480 train_time:39652ms step_avg:146.32ms step:282/1480 train_time:39803ms step_avg:146.33ms step:283/1480 train_time:39953ms step_avg:146.35ms step:284/1480 train_time:40103ms step_avg:146.36ms step:285/1480 train_time:40252ms step_avg:146.37ms step:286/1480 train_time:40403ms step_avg:146.39ms step:287/1480 train_time:40553ms step_avg:146.40ms step:288/1480 train_time:40704ms step_avg:146.42ms step:289/1480 train_time:40854ms step_avg:146.43ms step:290/1480 train_time:41005ms step_avg:146.45ms step:291/1480 train_time:41155ms step_avg:146.46ms step:292/1480 train_time:41306ms step_avg:146.47ms step:293/1480 train_time:41456ms step_avg:146.49ms step:294/1480 train_time:41607ms step_avg:146.50ms step:295/1480 train_time:41758ms step_avg:146.52ms step:296/1480 train_time:41909ms step_avg:146.53ms step:297/1480 train_time:42060ms step_avg:146.55ms step:298/1480 train_time:42211ms step_avg:146.57ms step:299/1480 train_time:42361ms step_avg:146.58ms step:300/1480 train_time:42512ms step_avg:146.59ms step:301/1480 train_time:42662ms step_avg:146.60ms step:302/1480 train_time:42812ms 
step_avg:146.62ms step:303/1480 train_time:42962ms step_avg:146.63ms step:304/1480 train_time:43112ms step_avg:146.64ms step:305/1480 train_time:43263ms step_avg:146.65ms step:306/1480 train_time:43413ms step_avg:146.67ms step:307/1480 train_time:43564ms step_avg:146.68ms step:308/1480 train_time:43715ms step_avg:146.69ms step:309/1480 train_time:43865ms step_avg:146.70ms step:310/1480 train_time:44015ms step_avg:146.72ms step:311/1480 train_time:44165ms step_avg:146.73ms step:312/1480 train_time:44317ms step_avg:146.74ms step:313/1480 train_time:44467ms step_avg:146.76ms step:314/1480 train_time:44619ms step_avg:146.77ms step:315/1480 train_time:44768ms step_avg:146.78ms step:316/1480 train_time:44920ms step_avg:146.80ms step:317/1480 train_time:45071ms step_avg:146.81ms step:318/1480 train_time:45222ms step_avg:146.82ms step:319/1480 train_time:45372ms step_avg:146.83ms step:320/1480 train_time:45522ms step_avg:146.85ms step:321/1480 train_time:45672ms step_avg:146.85ms step:322/1480 train_time:45823ms step_avg:146.87ms step:323/1480 train_time:45973ms step_avg:146.88ms step:324/1480 train_time:46124ms step_avg:146.89ms step:325/1480 train_time:46275ms step_avg:146.91ms step:326/1480 train_time:46425ms step_avg:146.92ms step:327/1480 train_time:46575ms step_avg:146.92ms step:328/1480 train_time:46726ms step_avg:146.94ms step:329/1480 train_time:46876ms step_avg:146.95ms step:330/1480 train_time:47029ms step_avg:146.97ms step:331/1480 train_time:47184ms step_avg:146.99ms step:332/1480 train_time:47340ms step_avg:147.02ms step:333/1480 train_time:47493ms step_avg:147.04ms step:334/1480 train_time:47647ms step_avg:147.06ms step:335/1480 train_time:47799ms step_avg:147.07ms step:336/1480 train_time:47953ms step_avg:147.10ms step:337/1480 train_time:48107ms step_avg:147.11ms step:338/1480 train_time:48261ms step_avg:147.14ms step:339/1480 train_time:48416ms step_avg:147.16ms step:340/1480 train_time:48570ms step_avg:147.18ms step:341/1480 train_time:48724ms 
step_avg:147.20ms step:342/1480 train_time:48878ms step_avg:147.22ms step:343/1480 train_time:49031ms step_avg:147.24ms step:344/1480 train_time:49186ms step_avg:147.26ms step:345/1480 train_time:49341ms step_avg:147.29ms step:346/1480 train_time:49496ms step_avg:147.31ms step:347/1480 train_time:49650ms step_avg:147.33ms step:348/1480 train_time:49803ms step_avg:147.35ms step:349/1480 train_time:49956ms step_avg:147.36ms step:350/1480 train_time:50109ms step_avg:147.38ms step:351/1480 train_time:50262ms step_avg:147.40ms step:352/1480 train_time:50418ms step_avg:147.42ms step:353/1480 train_time:50571ms step_avg:147.44ms step:354/1480 train_time:50724ms step_avg:147.45ms step:355/1480 train_time:50879ms step_avg:147.48ms step:356/1480 train_time:51032ms step_avg:147.49ms step:357/1480 train_time:51187ms step_avg:147.51ms step:358/1480 train_time:51341ms step_avg:147.53ms step:359/1480 train_time:51497ms step_avg:147.56ms step:360/1480 train_time:51652ms step_avg:147.58ms step:361/1480 train_time:51807ms step_avg:147.60ms step:362/1480 train_time:51961ms step_avg:147.62ms step:363/1480 train_time:52114ms step_avg:147.63ms step:364/1480 train_time:52268ms step_avg:147.65ms step:365/1480 train_time:52421ms step_avg:147.67ms step:366/1480 train_time:52575ms step_avg:147.68ms step:367/1480 train_time:52729ms step_avg:147.70ms step:368/1480 train_time:52883ms step_avg:147.72ms step:369/1480 train_time:53036ms step_avg:147.73ms step:370/1480 train_time:53189ms step_avg:147.75ms step:371/1480 train_time:53343ms step_avg:147.76ms step:372/1480 train_time:53498ms step_avg:147.78ms step:373/1480 train_time:53651ms step_avg:147.80ms step:374/1480 train_time:53804ms step_avg:147.81ms step:375/1480 train_time:53959ms step_avg:147.83ms step:375/1480 val_loss:3.8092 train_time:54020ms step_avg:148.00ms step:376/1480 train_time:54119ms step_avg:147.87ms step:377/1480 train_time:54275ms step_avg:147.89ms step:378/1480 train_time:54427ms step_avg:147.90ms step:379/1480 
train_time:54580ms step_avg:147.91ms step:380/1480 train_time:54732ms step_avg:147.92ms step:381/1480 train_time:54885ms step_avg:147.94ms step:382/1480 train_time:55038ms step_avg:147.95ms step:383/1480 train_time:55195ms step_avg:147.98ms step:384/1480 train_time:55348ms step_avg:147.99ms step:385/1480 train_time:55503ms step_avg:148.01ms step:386/1480 train_time:55657ms step_avg:148.02ms step:387/1480 train_time:55811ms step_avg:148.04ms step:388/1480 train_time:55963ms step_avg:148.05ms step:389/1480 train_time:56116ms step_avg:148.06ms step:390/1480 train_time:56271ms step_avg:148.08ms step:391/1480 train_time:56426ms step_avg:148.10ms step:392/1480 train_time:56578ms step_avg:148.11ms step:393/1480 train_time:56731ms step_avg:148.12ms step:394/1480 train_time:56886ms step_avg:148.14ms step:395/1480 train_time:57038ms step_avg:148.15ms step:396/1480 train_time:57191ms step_avg:148.16ms step:397/1480 train_time:57346ms step_avg:148.18ms step:398/1480 train_time:57500ms step_avg:148.20ms step:399/1480 train_time:57654ms step_avg:148.21ms step:400/1480 train_time:57808ms step_avg:148.23ms step:401/1480 train_time:57961ms step_avg:148.24ms step:402/1480 train_time:58115ms step_avg:148.25ms step:403/1480 train_time:58269ms step_avg:148.27ms step:404/1480 train_time:58422ms step_avg:148.28ms step:405/1480 train_time:58578ms step_avg:148.30ms step:406/1480 train_time:58731ms step_avg:148.31ms step:407/1480 train_time:58887ms step_avg:148.33ms step:408/1480 train_time:59041ms step_avg:148.34ms step:409/1480 train_time:59195ms step_avg:148.36ms step:410/1480 train_time:59348ms step_avg:148.37ms step:411/1480 train_time:59501ms step_avg:148.38ms step:412/1480 train_time:59655ms step_avg:148.40ms step:413/1480 train_time:59808ms step_avg:148.41ms step:414/1480 train_time:59962ms step_avg:148.42ms step:415/1480 train_time:60117ms step_avg:148.44ms step:416/1480 train_time:60270ms step_avg:148.45ms step:417/1480 train_time:60424ms step_avg:148.46ms step:418/1480 
train_time:60577ms step_avg:148.47ms step:419/1480 train_time:60730ms step_avg:148.48ms step:420/1480 train_time:60884ms step_avg:148.50ms step:421/1480 train_time:61038ms step_avg:148.51ms step:422/1480 train_time:61191ms step_avg:148.52ms step:423/1480 train_time:61344ms step_avg:148.53ms step:424/1480 train_time:61497ms step_avg:148.54ms step:425/1480 train_time:61651ms step_avg:148.56ms step:426/1480 train_time:61806ms step_avg:148.57ms step:427/1480 train_time:61960ms step_avg:148.58ms step:428/1480 train_time:62113ms step_avg:148.60ms step:429/1480 train_time:62266ms step_avg:148.61ms step:430/1480 train_time:62421ms step_avg:148.62ms step:431/1480 train_time:62574ms step_avg:148.63ms step:432/1480 train_time:62727ms step_avg:148.64ms step:433/1480 train_time:62881ms step_avg:148.66ms step:434/1480 train_time:63034ms step_avg:148.66ms step:435/1480 train_time:63187ms step_avg:148.68ms step:436/1480 train_time:63342ms step_avg:148.69ms step:437/1480 train_time:63496ms step_avg:148.70ms step:438/1480 train_time:63649ms step_avg:148.71ms step:439/1480 train_time:63805ms step_avg:148.73ms step:440/1480 train_time:63960ms step_avg:148.74ms step:441/1480 train_time:64117ms step_avg:148.76ms step:442/1480 train_time:64273ms step_avg:148.78ms step:443/1480 train_time:64429ms step_avg:148.80ms step:444/1480 train_time:64586ms step_avg:148.82ms step:445/1480 train_time:64742ms step_avg:148.83ms step:446/1480 train_time:64899ms step_avg:148.85ms step:447/1480 train_time:65055ms step_avg:148.87ms step:448/1480 train_time:65210ms step_avg:148.88ms step:449/1480 train_time:65367ms step_avg:148.90ms step:450/1480 train_time:65525ms step_avg:148.92ms step:451/1480 train_time:65682ms step_avg:148.94ms step:452/1480 train_time:65840ms step_avg:148.96ms step:453/1480 train_time:65996ms step_avg:148.97ms step:454/1480 train_time:66150ms step_avg:148.99ms step:455/1480 train_time:66307ms step_avg:149.00ms step:456/1480 train_time:66463ms step_avg:149.02ms step:457/1480 
train_time:66620ms step_avg:149.04ms step:458/1480 train_time:66775ms step_avg:149.05ms step:459/1480 train_time:66933ms step_avg:149.07ms step:460/1480 train_time:67089ms step_avg:149.09ms step:461/1480 train_time:67247ms step_avg:149.11ms step:462/1480 train_time:67404ms step_avg:149.12ms step:463/1480 train_time:67561ms step_avg:149.14ms step:464/1480 train_time:67718ms step_avg:149.16ms step:465/1480 train_time:67876ms step_avg:149.18ms step:466/1480 train_time:68032ms step_avg:149.19ms step:467/1480 train_time:68188ms step_avg:149.21ms step:468/1480 train_time:68344ms step_avg:149.22ms step:469/1480 train_time:68502ms step_avg:149.24ms step:470/1480 train_time:68659ms step_avg:149.26ms step:471/1480 train_time:68815ms step_avg:149.27ms step:472/1480 train_time:68972ms step_avg:149.29ms step:473/1480 train_time:69128ms step_avg:149.30ms step:474/1480 train_time:69283ms step_avg:149.32ms step:475/1480 train_time:69439ms step_avg:149.33ms step:476/1480 train_time:69597ms step_avg:149.35ms step:477/1480 train_time:69753ms step_avg:149.36ms step:478/1480 train_time:69909ms step_avg:149.38ms step:479/1480 train_time:70065ms step_avg:149.39ms step:480/1480 train_time:70223ms step_avg:149.41ms step:481/1480 train_time:70380ms step_avg:149.43ms step:482/1480 train_time:70537ms step_avg:149.44ms step:483/1480 train_time:70693ms step_avg:149.46ms step:484/1480 train_time:70850ms step_avg:149.47ms step:485/1480 train_time:71007ms step_avg:149.49ms step:486/1480 train_time:71164ms step_avg:149.50ms step:487/1480 train_time:71322ms step_avg:149.52ms step:488/1480 train_time:71480ms step_avg:149.54ms step:489/1480 train_time:71636ms step_avg:149.55ms step:490/1480 train_time:71792ms step_avg:149.57ms step:491/1480 train_time:71948ms step_avg:149.58ms step:492/1480 train_time:72104ms step_avg:149.59ms step:493/1480 train_time:72262ms step_avg:149.61ms step:494/1480 train_time:72421ms step_avg:149.63ms step:495/1480 train_time:72580ms step_avg:149.65ms step:496/1480 
train_time:72736ms step_avg:149.66ms step:497/1480 train_time:72893ms step_avg:149.68ms step:498/1480 train_time:73049ms step_avg:149.69ms step:499/1480 train_time:73208ms step_avg:149.71ms step:500/1480 train_time:73365ms step_avg:149.73ms step:500/1480 val_loss:3.6864 train_time:73427ms step_avg:149.85ms step:501/1480 train_time:73525ms step_avg:149.74ms step:502/1480 train_time:73683ms step_avg:149.76ms step:503/1480 train_time:73840ms step_avg:149.78ms step:504/1480 train_time:73997ms step_avg:149.79ms step:505/1480 train_time:74152ms step_avg:149.80ms step:506/1480 train_time:74308ms step_avg:149.81ms step:507/1480 train_time:74463ms step_avg:149.83ms step:508/1480 train_time:74621ms step_avg:149.84ms step:509/1480 train_time:74780ms step_avg:149.86ms step:510/1480 train_time:74937ms step_avg:149.87ms step:511/1480 train_time:75094ms step_avg:149.89ms step:512/1480 train_time:75251ms step_avg:149.90ms step:513/1480 train_time:75406ms step_avg:149.91ms step:514/1480 train_time:75563ms step_avg:149.93ms step:515/1480 train_time:75720ms step_avg:149.94ms step:516/1480 train_time:75881ms step_avg:149.96ms step:517/1480 train_time:76039ms step_avg:149.98ms step:518/1480 train_time:76196ms step_avg:149.99ms step:519/1480 train_time:76352ms step_avg:150.00ms step:520/1480 train_time:76509ms step_avg:150.02ms step:521/1480 train_time:76666ms step_avg:150.03ms step:522/1480 train_time:76823ms step_avg:150.04ms step:523/1480 train_time:76981ms step_avg:150.06ms step:524/1480 train_time:77138ms step_avg:150.07ms step:525/1480 train_time:77297ms step_avg:150.09ms step:526/1480 train_time:77454ms step_avg:150.10ms step:527/1480 train_time:77610ms step_avg:150.12ms step:528/1480 train_time:77767ms step_avg:150.13ms step:529/1480 train_time:77925ms step_avg:150.14ms step:530/1480 train_time:78081ms step_avg:150.16ms step:531/1480 train_time:78239ms step_avg:150.17ms step:532/1480 train_time:78397ms step_avg:150.19ms step:533/1480 train_time:78555ms step_avg:150.20ms 
step:534/1480 train_time:78711ms step_avg:150.21ms step:535/1480 train_time:78869ms step_avg:150.23ms step:536/1480 train_time:79027ms step_avg:150.24ms step:537/1480 train_time:79182ms step_avg:150.25ms step:538/1480 train_time:79339ms step_avg:150.26ms step:539/1480 train_time:79497ms step_avg:150.28ms step:540/1480 train_time:79654ms step_avg:150.29ms step:541/1480 train_time:79810ms step_avg:150.30ms step:542/1480 train_time:79968ms step_avg:150.32ms step:543/1480 train_time:80125ms step_avg:150.33ms step:544/1480 train_time:80283ms step_avg:150.34ms step:545/1480 train_time:80439ms step_avg:150.35ms step:546/1480 train_time:80596ms step_avg:150.37ms step:547/1480 train_time:80751ms step_avg:150.38ms step:548/1480 train_time:80909ms step_avg:150.39ms step:549/1480 train_time:81067ms step_avg:150.40ms step:550/1480 train_time:81224ms step_avg:150.41ms step:551/1480 train_time:81383ms step_avg:150.43ms step:552/1480 train_time:81542ms step_avg:150.45ms step:553/1480 train_time:81703ms step_avg:150.47ms step:554/1480 train_time:81863ms step_avg:150.48ms step:555/1480 train_time:82024ms step_avg:150.50ms step:556/1480 train_time:82183ms step_avg:150.52ms step:557/1480 train_time:82343ms step_avg:150.54ms step:558/1480 train_time:82503ms step_avg:150.55ms step:559/1480 train_time:82663ms step_avg:150.57ms step:560/1480 train_time:82822ms step_avg:150.59ms step:561/1480 train_time:82982ms step_avg:150.60ms step:562/1480 train_time:83141ms step_avg:150.62ms step:563/1480 train_time:83301ms step_avg:150.64ms step:564/1480 train_time:83460ms step_avg:150.65ms step:565/1480 train_time:83620ms step_avg:150.67ms step:566/1480 train_time:83781ms step_avg:150.69ms step:567/1480 train_time:83941ms step_avg:150.70ms step:568/1480 train_time:84102ms step_avg:150.72ms step:569/1480 train_time:84262ms step_avg:150.74ms step:570/1480 train_time:84422ms step_avg:150.75ms step:571/1480 train_time:84582ms step_avg:150.77ms step:572/1480 train_time:84741ms step_avg:150.79ms 
step:573/1480 train_time:84902ms step_avg:150.80ms step:574/1480 train_time:85063ms step_avg:150.82ms step:575/1480 train_time:85224ms step_avg:150.84ms step:576/1480 train_time:85384ms step_avg:150.86ms step:577/1480 train_time:85543ms step_avg:150.87ms step:578/1480 train_time:85703ms step_avg:150.89ms step:579/1480 train_time:85862ms step_avg:150.90ms step:580/1480 train_time:86021ms step_avg:150.91ms step:581/1480 train_time:86181ms step_avg:150.93ms step:582/1480 train_time:86342ms step_avg:150.95ms step:583/1480 train_time:86503ms step_avg:150.96ms step:584/1480 train_time:86664ms step_avg:150.98ms step:585/1480 train_time:86823ms step_avg:151.00ms step:586/1480 train_time:86984ms step_avg:151.01ms step:587/1480 train_time:87143ms step_avg:151.03ms step:588/1480 train_time:87302ms step_avg:151.04ms step:589/1480 train_time:87462ms step_avg:151.06ms step:590/1480 train_time:87622ms step_avg:151.07ms step:591/1480 train_time:87782ms step_avg:151.09ms step:592/1480 train_time:87942ms step_avg:151.10ms step:593/1480 train_time:88104ms step_avg:151.12ms step:594/1480 train_time:88264ms step_avg:151.14ms step:595/1480 train_time:88424ms step_avg:151.15ms step:596/1480 train_time:88585ms step_avg:151.17ms step:597/1480 train_time:88743ms step_avg:151.18ms step:598/1480 train_time:88901ms step_avg:151.19ms step:599/1480 train_time:89061ms step_avg:151.21ms step:600/1480 train_time:89221ms step_avg:151.22ms step:601/1480 train_time:89381ms step_avg:151.24ms step:602/1480 train_time:89541ms step_avg:151.25ms step:603/1480 train_time:89703ms step_avg:151.27ms step:604/1480 train_time:89863ms step_avg:151.28ms step:605/1480 train_time:90022ms step_avg:151.30ms step:606/1480 train_time:90184ms step_avg:151.32ms step:607/1480 train_time:90344ms step_avg:151.33ms step:608/1480 train_time:90504ms step_avg:151.34ms step:609/1480 train_time:90663ms step_avg:151.36ms step:610/1480 train_time:90822ms step_avg:151.37ms step:611/1480 train_time:90984ms step_avg:151.39ms 
step:612/1480 train_time:91143ms step_avg:151.40ms step:613/1480 train_time:91304ms step_avg:151.42ms step:614/1480 train_time:91464ms step_avg:151.43ms step:615/1480 train_time:91623ms step_avg:151.44ms step:616/1480 train_time:91781ms step_avg:151.45ms step:617/1480 train_time:91941ms step_avg:151.47ms step:618/1480 train_time:92102ms step_avg:151.48ms step:619/1480 train_time:92261ms step_avg:151.50ms step:620/1480 train_time:92421ms step_avg:151.51ms step:621/1480 train_time:92582ms step_avg:151.52ms step:622/1480 train_time:92742ms step_avg:151.54ms step:623/1480 train_time:92904ms step_avg:151.56ms step:624/1480 train_time:93064ms step_avg:151.57ms step:625/1480 train_time:93224ms step_avg:151.58ms step:625/1480 val_loss:3.6060 train_time:93288ms step_avg:151.69ms step:626/1480 train_time:93386ms step_avg:151.60ms step:627/1480 train_time:93545ms step_avg:151.61ms step:628/1480 train_time:93702ms step_avg:151.62ms step:629/1480 train_time:93859ms step_avg:151.63ms step:630/1480 train_time:94017ms step_avg:151.64ms step:631/1480 train_time:94176ms step_avg:151.65ms step:632/1480 train_time:94337ms step_avg:151.67ms step:633/1480 train_time:94497ms step_avg:151.68ms step:634/1480 train_time:94657ms step_avg:151.69ms step:635/1480 train_time:94817ms step_avg:151.71ms step:636/1480 train_time:94977ms step_avg:151.72ms step:637/1480 train_time:95137ms step_avg:151.73ms step:638/1480 train_time:95297ms step_avg:151.75ms step:639/1480 train_time:95457ms step_avg:151.76ms step:640/1480 train_time:95617ms step_avg:151.77ms step:641/1480 train_time:95778ms step_avg:151.79ms step:642/1480 train_time:95937ms step_avg:151.80ms step:643/1480 train_time:96097ms step_avg:151.81ms step:644/1480 train_time:96256ms step_avg:151.82ms step:645/1480 train_time:96415ms step_avg:151.83ms step:646/1480 train_time:96575ms step_avg:151.85ms step:647/1480 train_time:96736ms step_avg:151.86ms step:648/1480 train_time:96897ms step_avg:151.88ms step:649/1480 train_time:97056ms 
step_avg:151.89ms step:650/1480 train_time:97216ms step_avg:151.90ms step:651/1480 train_time:97376ms step_avg:151.91ms step:652/1480 train_time:97537ms step_avg:151.93ms step:653/1480 train_time:97696ms step_avg:151.94ms step:654/1480 train_time:97857ms step_avg:151.95ms step:655/1480 train_time:98017ms step_avg:151.96ms step:656/1480 train_time:98178ms step_avg:151.98ms step:657/1480 train_time:98338ms step_avg:151.99ms step:658/1480 train_time:98498ms step_avg:152.00ms step:659/1480 train_time:98660ms step_avg:152.02ms step:660/1480 train_time:98821ms step_avg:152.03ms step:661/1480 train_time:98983ms step_avg:152.05ms step:662/1480 train_time:99142ms step_avg:152.06ms step:663/1480 train_time:99302ms step_avg:152.07ms step:664/1480 train_time:99464ms step_avg:152.09ms step:665/1480 train_time:99625ms step_avg:152.10ms step:666/1480 train_time:99785ms step_avg:152.11ms step:667/1480 train_time:99949ms step_avg:152.13ms step:668/1480 train_time:100109ms step_avg:152.14ms step:669/1480 train_time:100273ms step_avg:152.16ms step:670/1480 train_time:100433ms step_avg:152.17ms step:671/1480 train_time:100595ms step_avg:152.19ms step:672/1480 train_time:100758ms step_avg:152.20ms step:673/1480 train_time:100919ms step_avg:152.22ms step:674/1480 train_time:101082ms step_avg:152.23ms step:675/1480 train_time:101243ms step_avg:152.25ms step:676/1480 train_time:101404ms step_avg:152.26ms step:677/1480 train_time:101565ms step_avg:152.27ms step:678/1480 train_time:101724ms step_avg:152.28ms step:679/1480 train_time:101886ms step_avg:152.30ms step:680/1480 train_time:102048ms step_avg:152.31ms step:681/1480 train_time:102208ms step_avg:152.32ms step:682/1480 train_time:102371ms step_avg:152.34ms step:683/1480 train_time:102535ms step_avg:152.35ms step:684/1480 train_time:102696ms step_avg:152.37ms step:685/1480 train_time:102860ms step_avg:152.39ms step:686/1480 train_time:103020ms step_avg:152.40ms step:687/1480 train_time:103180ms step_avg:152.41ms step:688/1480 
train_time:103342ms step_avg:152.42ms step:689/1480 train_time:103505ms step_avg:152.44ms step:690/1480 train_time:103668ms step_avg:152.45ms step:691/1480 train_time:103830ms step_avg:152.47ms step:692/1480 train_time:103992ms step_avg:152.48ms step:693/1480 train_time:104155ms step_avg:152.50ms step:694/1480 train_time:104318ms step_avg:152.51ms step:695/1480 train_time:104479ms step_avg:152.52ms step:696/1480 train_time:104640ms step_avg:152.54ms step:697/1480 train_time:104802ms step_avg:152.55ms step:698/1480 train_time:104962ms step_avg:152.56ms step:699/1480 train_time:105124ms step_avg:152.58ms step:700/1480 train_time:105285ms step_avg:152.59ms step:701/1480 train_time:105445ms step_avg:152.60ms step:702/1480 train_time:105605ms step_avg:152.61ms step:703/1480 train_time:105766ms step_avg:152.62ms step:704/1480 train_time:105926ms step_avg:152.63ms step:705/1480 train_time:106088ms step_avg:152.64ms step:706/1480 train_time:106253ms step_avg:152.66ms step:707/1480 train_time:106415ms step_avg:152.68ms step:708/1480 train_time:106577ms step_avg:152.69ms step:709/1480 train_time:106739ms step_avg:152.70ms step:710/1480 train_time:106900ms step_avg:152.71ms step:711/1480 train_time:107062ms step_avg:152.73ms step:712/1480 train_time:107226ms step_avg:152.74ms step:713/1480 train_time:107388ms step_avg:152.76ms step:714/1480 train_time:107548ms step_avg:152.77ms step:715/1480 train_time:107708ms step_avg:152.78ms step:716/1480 train_time:107869ms step_avg:152.79ms step:717/1480 train_time:108033ms step_avg:152.80ms step:718/1480 train_time:108194ms step_avg:152.82ms step:719/1480 train_time:108356ms step_avg:152.83ms step:720/1480 train_time:108518ms step_avg:152.84ms step:721/1480 train_time:108680ms step_avg:152.85ms step:722/1480 train_time:108841ms step_avg:152.87ms step:723/1480 train_time:109001ms step_avg:152.88ms step:724/1480 train_time:109162ms step_avg:152.89ms step:725/1480 train_time:109324ms step_avg:152.90ms step:726/1480 train_time:109486ms 
step_avg:152.91ms step:727/1480 train_time:109650ms step_avg:152.93ms step:728/1480 train_time:109810ms step_avg:152.94ms step:729/1480 train_time:109972ms step_avg:152.95ms step:730/1480 train_time:110137ms step_avg:152.97ms step:731/1480 train_time:110298ms step_avg:152.98ms step:732/1480 train_time:110459ms step_avg:152.99ms step:733/1480 train_time:110619ms step_avg:153.00ms step:734/1480 train_time:110780ms step_avg:153.01ms step:735/1480 train_time:110942ms step_avg:153.02ms step:736/1480 train_time:111103ms step_avg:153.03ms step:737/1480 train_time:111264ms step_avg:153.05ms step:738/1480 train_time:111424ms step_avg:153.06ms step:739/1480 train_time:111584ms step_avg:153.06ms step:740/1480 train_time:111748ms step_avg:153.08ms step:741/1480 train_time:111913ms step_avg:153.10ms step:742/1480 train_time:112075ms step_avg:153.11ms step:743/1480 train_time:112237ms step_avg:153.12ms step:744/1480 train_time:112401ms step_avg:153.14ms step:745/1480 train_time:112565ms step_avg:153.15ms step:746/1480 train_time:112723ms step_avg:153.16ms step:747/1480 train_time:112885ms step_avg:153.17ms step:748/1480 train_time:113051ms step_avg:153.19ms step:749/1480 train_time:113214ms step_avg:153.20ms step:750/1480 train_time:113376ms step_avg:153.21ms step:750/1480 val_loss:3.5508 train_time:113440ms step_avg:153.30ms step:751/1480 train_time:113540ms step_avg:153.23ms step:752/1480 train_time:113704ms step_avg:153.24ms step:753/1480 train_time:113865ms step_avg:153.25ms step:754/1480 train_time:114026ms step_avg:153.26ms step:755/1480 train_time:114188ms step_avg:153.27ms step:756/1480 train_time:114349ms step_avg:153.28ms step:757/1480 train_time:114513ms step_avg:153.30ms step:758/1480 train_time:114673ms step_avg:153.31ms step:759/1480 train_time:114836ms step_avg:153.32ms step:760/1480 train_time:114998ms step_avg:153.33ms step:761/1480 train_time:115159ms step_avg:153.34ms step:762/1480 train_time:115319ms step_avg:153.35ms step:763/1480 train_time:115480ms 
step_avg:153.36ms step:764/1480 train_time:115641ms step_avg:153.37ms step:765/1480 train_time:115803ms step_avg:153.38ms step:766/1480 train_time:115967ms step_avg:153.40ms step:767/1480 train_time:116130ms step_avg:153.41ms step:768/1480 train_time:116292ms step_avg:153.42ms step:769/1480 train_time:116455ms step_avg:153.43ms step:770/1480 train_time:116617ms step_avg:153.44ms step:771/1480 train_time:116779ms step_avg:153.46ms step:772/1480 train_time:116940ms step_avg:153.46ms step:773/1480 train_time:117104ms step_avg:153.48ms step:774/1480 train_time:117267ms step_avg:153.49ms step:775/1480 train_time:117431ms step_avg:153.50ms step:776/1480 train_time:117596ms step_avg:153.52ms step:777/1480 train_time:117760ms step_avg:153.53ms step:778/1480 train_time:117922ms step_avg:153.54ms step:779/1480 train_time:118085ms step_avg:153.56ms step:780/1480 train_time:118249ms step_avg:153.57ms step:781/1480 train_time:118413ms step_avg:153.58ms step:782/1480 train_time:118576ms step_avg:153.60ms step:783/1480 train_time:118737ms step_avg:153.61ms step:784/1480 train_time:118900ms step_avg:153.62ms step:785/1480 train_time:119061ms step_avg:153.63ms step:786/1480 train_time:119228ms step_avg:153.64ms step:787/1480 train_time:119392ms step_avg:153.66ms step:788/1480 train_time:119556ms step_avg:153.67ms step:789/1480 train_time:119717ms step_avg:153.68ms step:790/1480 train_time:119881ms step_avg:153.69ms step:791/1480 train_time:120047ms step_avg:153.71ms step:792/1480 train_time:120213ms step_avg:153.73ms step:793/1480 train_time:120374ms step_avg:153.73ms step:794/1480 train_time:120538ms step_avg:153.75ms step:795/1480 train_time:120705ms step_avg:153.76ms step:796/1480 train_time:120872ms step_avg:153.78ms step:797/1480 train_time:121036ms step_avg:153.79ms step:798/1480 train_time:121198ms step_avg:153.80ms step:799/1480 train_time:121364ms step_avg:153.82ms step:800/1480 train_time:121528ms step_avg:153.83ms step:801/1480 train_time:121692ms step_avg:153.85ms 
step:802/1480 train_time:121861ms step_avg:153.86ms step:803/1480 train_time:122024ms step_avg:153.88ms step:804/1480 train_time:122187ms step_avg:153.89ms step:805/1480 train_time:122352ms step_avg:153.90ms step:806/1480 train_time:122513ms step_avg:153.91ms step:807/1480 train_time:122674ms step_avg:153.92ms step:808/1480 train_time:122838ms step_avg:153.93ms step:809/1480 train_time:123000ms step_avg:153.94ms step:810/1480 train_time:123161ms step_avg:153.95ms step:811/1480 train_time:123324ms step_avg:153.96ms step:812/1480 train_time:123490ms step_avg:153.98ms step:813/1480 train_time:123651ms step_avg:153.99ms step:814/1480 train_time:123815ms step_avg:154.00ms step:815/1480 train_time:123977ms step_avg:154.01ms step:816/1480 train_time:124141ms step_avg:154.02ms step:817/1480 train_time:124303ms step_avg:154.03ms step:818/1480 train_time:124465ms step_avg:154.04ms step:819/1480 train_time:124630ms step_avg:154.05ms step:820/1480 train_time:124794ms step_avg:154.07ms step:821/1480 train_time:124955ms step_avg:154.07ms step:822/1480 train_time:125119ms step_avg:154.09ms step:823/1480 train_time:125281ms step_avg:154.10ms step:824/1480 train_time:125441ms step_avg:154.10ms step:825/1480 train_time:125605ms step_avg:154.12ms step:826/1480 train_time:125772ms step_avg:154.13ms step:827/1480 train_time:125937ms step_avg:154.15ms step:828/1480 train_time:126099ms step_avg:154.15ms step:829/1480 train_time:126262ms step_avg:154.17ms step:830/1480 train_time:126426ms step_avg:154.18ms step:831/1480 train_time:126592ms step_avg:154.19ms step:832/1480 train_time:126755ms step_avg:154.20ms step:833/1480 train_time:126919ms step_avg:154.22ms step:834/1480 train_time:127084ms step_avg:154.23ms step:835/1480 train_time:127247ms step_avg:154.24ms step:836/1480 train_time:127413ms step_avg:154.25ms step:837/1480 train_time:127576ms step_avg:154.26ms step:838/1480 train_time:127739ms step_avg:154.27ms step:839/1480 train_time:127901ms step_avg:154.28ms step:840/1480 
train_time:128061ms step_avg:154.29ms step:841/1480 train_time:128220ms step_avg:154.30ms step:842/1480 train_time:128385ms step_avg:154.31ms step:843/1480 train_time:128548ms step_avg:154.32ms step:844/1480 train_time:128711ms step_avg:154.33ms step:845/1480 train_time:128873ms step_avg:154.34ms step:846/1480 train_time:129039ms step_avg:154.35ms step:847/1480 train_time:129203ms step_avg:154.36ms step:848/1480 train_time:129365ms step_avg:154.37ms step:849/1480 train_time:129530ms step_avg:154.39ms step:850/1480 train_time:129693ms step_avg:154.40ms step:851/1480 train_time:129857ms step_avg:154.41ms step:852/1480 train_time:130018ms step_avg:154.42ms step:853/1480 train_time:130179ms step_avg:154.42ms step:854/1480 train_time:130343ms step_avg:154.44ms step:855/1480 train_time:130507ms step_avg:154.45ms step:856/1480 train_time:130672ms step_avg:154.46ms step:857/1480 train_time:130837ms step_avg:154.47ms step:858/1480 train_time:131003ms step_avg:154.48ms step:859/1480 train_time:131168ms step_avg:154.50ms step:860/1480 train_time:131330ms step_avg:154.51ms step:861/1480 train_time:131495ms step_avg:154.52ms step:862/1480 train_time:131666ms step_avg:154.54ms step:863/1480 train_time:131835ms step_avg:154.55ms step:864/1480 train_time:131998ms step_avg:154.56ms step:865/1480 train_time:132158ms step_avg:154.57ms step:866/1480 train_time:132325ms step_avg:154.59ms step:867/1480 train_time:132489ms step_avg:154.60ms step:868/1480 train_time:132651ms step_avg:154.60ms step:869/1480 train_time:132814ms step_avg:154.61ms step:870/1480 train_time:132977ms step_avg:154.62ms step:871/1480 train_time:133139ms step_avg:154.63ms step:872/1480 train_time:133303ms step_avg:154.64ms step:873/1480 train_time:133465ms step_avg:154.65ms step:874/1480 train_time:133632ms step_avg:154.67ms step:875/1480 train_time:133797ms step_avg:154.68ms step:875/1480 val_loss:3.5041 train_time:133863ms step_avg:154.75ms step:876/1480 train_time:133963ms step_avg:154.69ms step:877/1480 
train_time:134126ms step_avg:154.70ms step:878/1480 train_time:134288ms step_avg:154.71ms step:879/1480 train_time:134452ms step_avg:154.72ms step:880/1480 train_time:134615ms step_avg:154.73ms step:881/1480 train_time:134777ms step_avg:154.74ms step:882/1480 train_time:134943ms step_avg:154.75ms step:883/1480 train_time:135109ms step_avg:154.76ms step:884/1480 train_time:135277ms step_avg:154.78ms step:885/1480 train_time:135442ms step_avg:154.79ms step:886/1480 train_time:135608ms step_avg:154.80ms step:887/1480 train_time:135776ms step_avg:154.82ms step:888/1480 train_time:135949ms step_avg:154.84ms step:889/1480 train_time:136117ms step_avg:154.85ms step:890/1480 train_time:136281ms step_avg:154.87ms step:891/1480 train_time:136447ms step_avg:154.88ms step:892/1480 train_time:136612ms step_avg:154.89ms step:893/1480 train_time:136775ms step_avg:154.90ms step:894/1480 train_time:136940ms step_avg:154.91ms step:895/1480 train_time:137106ms step_avg:154.92ms step:896/1480 train_time:137271ms step_avg:154.93ms step:897/1480 train_time:137437ms step_avg:154.95ms step:898/1480 train_time:137602ms step_avg:154.96ms step:899/1480 train_time:137767ms step_avg:154.97ms step:900/1480 train_time:137929ms step_avg:154.98ms step:901/1480 train_time:138094ms step_avg:154.99ms step:902/1480 train_time:138259ms step_avg:155.00ms step:903/1480 train_time:138429ms step_avg:155.02ms step:904/1480 train_time:138593ms step_avg:155.03ms step:905/1480 train_time:138756ms step_avg:155.03ms step:906/1480 train_time:138922ms step_avg:155.05ms step:907/1480 train_time:139089ms step_avg:155.06ms step:908/1480 train_time:139253ms step_avg:155.07ms step:909/1480 train_time:139418ms step_avg:155.08ms step:910/1480 train_time:139587ms step_avg:155.10ms step:911/1480 train_time:139750ms step_avg:155.11ms step:912/1480 train_time:139919ms step_avg:155.12ms step:913/1480 train_time:140087ms step_avg:155.13ms step:914/1480 train_time:140253ms step_avg:155.15ms step:915/1480 train_time:140422ms 
step_avg:155.16ms step:916/1480 train_time:140587ms step_avg:155.17ms step:917/1480 train_time:140751ms step_avg:155.18ms step:918/1480 train_time:140920ms step_avg:155.20ms step:919/1480 train_time:141089ms step_avg:155.21ms step:920/1480 train_time:141254ms step_avg:155.22ms step:921/1480 train_time:141421ms step_avg:155.24ms step:922/1480 train_time:141587ms step_avg:155.25ms step:923/1480 train_time:141750ms step_avg:155.26ms step:924/1480 train_time:141915ms step_avg:155.27ms step:925/1480 train_time:142081ms step_avg:155.28ms step:926/1480 train_time:142245ms step_avg:155.29ms step:927/1480 train_time:142407ms step_avg:155.30ms step:928/1480 train_time:142575ms step_avg:155.31ms step:929/1480 train_time:142741ms step_avg:155.32ms step:930/1480 train_time:142906ms step_avg:155.33ms step:931/1480 train_time:143068ms step_avg:155.34ms step:932/1480 train_time:143235ms step_avg:155.35ms step:933/1480 train_time:143404ms step_avg:155.37ms step:934/1480 train_time:143571ms step_avg:155.38ms step:935/1480 train_time:143742ms step_avg:155.40ms step:936/1480 train_time:143909ms step_avg:155.41ms step:937/1480 train_time:144081ms step_avg:155.43ms step:938/1480 train_time:144243ms step_avg:155.43ms step:939/1480 train_time:144412ms step_avg:155.45ms step:940/1480 train_time:144578ms step_avg:155.46ms step:941/1480 train_time:144744ms step_avg:155.47ms step:942/1480 train_time:144909ms step_avg:155.48ms step:943/1480 train_time:145079ms step_avg:155.50ms step:944/1480 train_time:145250ms step_avg:155.51ms step:945/1480 train_time:145415ms step_avg:155.52ms step:946/1480 train_time:145584ms step_avg:155.54ms step:947/1480 train_time:145751ms step_avg:155.55ms step:948/1480 train_time:145917ms step_avg:155.56ms step:949/1480 train_time:146083ms step_avg:155.57ms step:950/1480 train_time:146247ms step_avg:155.58ms step:951/1480 train_time:146416ms step_avg:155.60ms step:952/1480 train_time:146583ms step_avg:155.61ms step:953/1480 train_time:146750ms step_avg:155.62ms 
step:954/1480 train_time:146920ms step_avg:155.64ms step:955/1480 train_time:147083ms step_avg:155.64ms step:956/1480 train_time:147249ms step_avg:155.65ms step:957/1480 train_time:147417ms step_avg:155.67ms step:958/1480 train_time:147585ms step_avg:155.68ms step:959/1480 train_time:147749ms step_avg:155.69ms step:960/1480 train_time:147916ms step_avg:155.70ms step:961/1480 train_time:148082ms step_avg:155.71ms step:962/1480 train_time:148245ms step_avg:155.72ms step:963/1480 train_time:148410ms step_avg:155.73ms step:964/1480 train_time:148578ms step_avg:155.74ms step:965/1480 train_time:148742ms step_avg:155.75ms step:966/1480 train_time:148905ms step_avg:155.76ms step:967/1480 train_time:149069ms step_avg:155.77ms step:968/1480 train_time:149234ms step_avg:155.78ms step:969/1480 train_time:149401ms step_avg:155.79ms step:970/1480 train_time:149566ms step_avg:155.80ms step:971/1480 train_time:149730ms step_avg:155.81ms step:972/1480 train_time:149895ms step_avg:155.82ms step:973/1480 train_time:150060ms step_avg:155.83ms step:974/1480 train_time:150228ms step_avg:155.84ms step:975/1480 train_time:150391ms step_avg:155.85ms step:976/1480 train_time:150557ms step_avg:155.86ms step:977/1480 train_time:150721ms step_avg:155.86ms step:978/1480 train_time:150887ms step_avg:155.87ms step:979/1480 train_time:151052ms step_avg:155.88ms step:980/1480 train_time:151218ms step_avg:155.89ms step:981/1480 train_time:151385ms step_avg:155.91ms step:982/1480 train_time:151548ms step_avg:155.91ms step:983/1480 train_time:151715ms step_avg:155.92ms step:984/1480 train_time:151880ms step_avg:155.93ms step:985/1480 train_time:152046ms step_avg:155.94ms step:986/1480 train_time:152210ms step_avg:155.95ms step:987/1480 train_time:152376ms step_avg:155.96ms step:988/1480 train_time:152543ms step_avg:155.97ms step:989/1480 train_time:152709ms step_avg:155.98ms step:990/1480 train_time:152880ms step_avg:156.00ms step:991/1480 train_time:153047ms step_avg:156.01ms step:992/1480 
train_time:153222ms step_avg:156.03ms step:993/1480 train_time:153398ms step_avg:156.05ms step:994/1480 train_time:153563ms step_avg:156.06ms step:995/1480 train_time:153727ms step_avg:156.07ms step:996/1480 train_time:153888ms step_avg:156.07ms step:997/1480 train_time:154053ms step_avg:156.08ms step:998/1480 train_time:154217ms step_avg:156.09ms step:999/1480 train_time:154383ms step_avg:156.10ms step:1000/1480 train_time:154552ms step_avg:156.11ms step:1000/1480 val_loss:3.4392 train_time:154621ms step_avg:156.18ms step:1001/1480 train_time:154721ms step_avg:156.13ms step:1002/1480 train_time:154888ms step_avg:156.14ms step:1003/1480 train_time:155059ms step_avg:156.15ms step:1004/1480 train_time:155228ms step_avg:156.16ms step:1005/1480 train_time:155397ms step_avg:156.18ms step:1006/1480 train_time:155566ms step_avg:156.19ms step:1007/1480 train_time:155732ms step_avg:156.20ms step:1008/1480 train_time:155899ms step_avg:156.21ms step:1009/1480 train_time:156074ms step_avg:156.23ms step:1010/1480 train_time:156238ms step_avg:156.24ms step:1011/1480 train_time:156404ms step_avg:156.25ms step:1012/1480 train_time:156569ms step_avg:156.26ms step:1013/1480 train_time:156739ms step_avg:156.27ms step:1014/1480 train_time:156906ms step_avg:156.28ms step:1015/1480 train_time:157075ms step_avg:156.29ms step:1016/1480 train_time:157245ms step_avg:156.31ms step:1017/1480 train_time:157417ms step_avg:156.32ms step:1018/1480 train_time:157584ms step_avg:156.33ms step:1019/1480 train_time:157752ms step_avg:156.34ms step:1020/1480 train_time:157921ms step_avg:156.36ms step:1021/1480 train_time:158086ms step_avg:156.37ms step:1022/1480 train_time:158253ms step_avg:156.38ms step:1023/1480 train_time:158420ms step_avg:156.39ms step:1024/1480 train_time:158586ms step_avg:156.40ms step:1025/1480 train_time:158757ms step_avg:156.41ms step:1026/1480 train_time:158921ms step_avg:156.42ms step:1027/1480 train_time:159088ms step_avg:156.43ms step:1028/1480 train_time:159259ms 
step_avg:156.44ms step:1029/1480 train_time:159433ms step_avg:156.46ms step:1030/1480 train_time:159600ms step_avg:156.47ms step:1031/1480 train_time:159765ms step_avg:156.48ms step:1032/1480 train_time:159937ms step_avg:156.49ms step:1033/1480 train_time:160103ms step_avg:156.50ms step:1034/1480 train_time:160273ms step_avg:156.52ms step:1035/1480 train_time:160440ms step_avg:156.53ms step:1036/1480 train_time:160605ms step_avg:156.53ms step:1037/1480 train_time:160773ms step_avg:156.55ms step:1038/1480 train_time:160940ms step_avg:156.56ms step:1039/1480 train_time:161110ms step_avg:156.57ms step:1040/1480 train_time:161277ms step_avg:156.58ms step:1041/1480 train_time:161443ms step_avg:156.59ms step:1042/1480 train_time:161608ms step_avg:156.60ms step:1043/1480 train_time:161773ms step_avg:156.61ms step:1044/1480 train_time:161938ms step_avg:156.61ms step:1045/1480 train_time:162108ms step_avg:156.63ms step:1046/1480 train_time:162276ms step_avg:156.64ms step:1047/1480 train_time:162442ms step_avg:156.65ms step:1048/1480 train_time:162608ms step_avg:156.66ms step:1049/1480 train_time:162774ms step_avg:156.66ms step:1050/1480 train_time:162943ms step_avg:156.68ms step:1051/1480 train_time:163113ms step_avg:156.69ms step:1052/1480 train_time:163280ms step_avg:156.70ms step:1053/1480 train_time:163448ms step_avg:156.71ms step:1054/1480 train_time:163616ms step_avg:156.72ms step:1055/1480 train_time:163781ms step_avg:156.73ms step:1056/1480 train_time:163946ms step_avg:156.74ms step:1057/1480 train_time:164112ms step_avg:156.75ms step:1058/1480 train_time:164280ms step_avg:156.76ms step:1059/1480 train_time:164452ms step_avg:156.77ms step:1060/1480 train_time:164620ms step_avg:156.78ms step:1061/1480 train_time:164783ms step_avg:156.79ms step:1062/1480 train_time:164951ms step_avg:156.80ms step:1063/1480 train_time:165116ms step_avg:156.81ms step:1064/1480 train_time:165278ms step_avg:156.81ms step:1065/1480 train_time:165446ms step_avg:156.82ms step:1066/1480 
train_time:165614ms step_avg:156.83ms step:1067/1480 train_time:165781ms step_avg:156.84ms step:1068/1480 train_time:165947ms step_avg:156.85ms step:1069/1480 train_time:166117ms step_avg:156.86ms step:1070/1480 train_time:166281ms step_avg:156.87ms step:1071/1480 train_time:166455ms step_avg:156.88ms step:1072/1480 train_time:166621ms step_avg:156.89ms step:1073/1480 train_time:166784ms step_avg:156.90ms step:1074/1480 train_time:166951ms step_avg:156.91ms step:1075/1480 train_time:167123ms step_avg:156.92ms step:1076/1480 train_time:167292ms step_avg:156.93ms step:1077/1480 train_time:167456ms step_avg:156.94ms step:1078/1480 train_time:167630ms step_avg:156.96ms step:1079/1480 train_time:167802ms step_avg:156.97ms step:1080/1480 train_time:167972ms step_avg:156.98ms step:1081/1480 train_time:168140ms step_avg:156.99ms step:1082/1480 train_time:168306ms step_avg:157.00ms step:1083/1480 train_time:168473ms step_avg:157.01ms step:1084/1480 train_time:168640ms step_avg:157.02ms step:1085/1480 train_time:168808ms step_avg:157.03ms step:1086/1480 train_time:168975ms step_avg:157.04ms step:1087/1480 train_time:169141ms step_avg:157.05ms step:1088/1480 train_time:169310ms step_avg:157.06ms step:1089/1480 train_time:169482ms step_avg:157.07ms step:1090/1480 train_time:169653ms step_avg:157.09ms step:1091/1480 train_time:169820ms step_avg:157.10ms step:1092/1480 train_time:169989ms step_avg:157.11ms step:1093/1480 train_time:170157ms step_avg:157.12ms step:1094/1480 train_time:170323ms step_avg:157.12ms step:1095/1480 train_time:170487ms step_avg:157.13ms step:1096/1480 train_time:170655ms step_avg:157.14ms step:1097/1480 train_time:170825ms step_avg:157.15ms step:1098/1480 train_time:170997ms step_avg:157.17ms step:1099/1480 train_time:171167ms step_avg:157.18ms step:1100/1480 train_time:171338ms step_avg:157.19ms step:1101/1480 train_time:171509ms step_avg:157.20ms step:1102/1480 train_time:171681ms step_avg:157.22ms step:1103/1480 train_time:171857ms step_avg:157.23ms 
step:1104/1480 train_time:172023ms step_avg:157.24ms step:1105/1480 train_time:172194ms step_avg:157.26ms step:1106/1480 train_time:172362ms step_avg:157.26ms step:1107/1480 train_time:172531ms step_avg:157.28ms step:1108/1480 train_time:172696ms step_avg:157.28ms step:1109/1480 train_time:172863ms step_avg:157.29ms step:1110/1480 train_time:173029ms step_avg:157.30ms step:1111/1480 train_time:173197ms step_avg:157.31ms step:1112/1480 train_time:173367ms step_avg:157.32ms step:1113/1480 train_time:173545ms step_avg:157.34ms step:1114/1480 train_time:173720ms step_avg:157.35ms step:1115/1480 train_time:173892ms step_avg:157.37ms step:1116/1480 train_time:174059ms step_avg:157.38ms step:1117/1480 train_time:174231ms step_avg:157.39ms step:1118/1480 train_time:174407ms step_avg:157.41ms step:1119/1480 train_time:174574ms step_avg:157.42ms step:1120/1480 train_time:174742ms step_avg:157.42ms step:1121/1480 train_time:174913ms step_avg:157.44ms step:1122/1480 train_time:175079ms step_avg:157.44ms step:1123/1480 train_time:175246ms step_avg:157.45ms step:1124/1480 train_time:175415ms step_avg:157.46ms step:1125/1480 train_time:175581ms step_avg:157.47ms step:1125/1480 val_loss:3.3840 train_time:175649ms step_avg:157.53ms step:1126/1480 train_time:175751ms step_avg:157.48ms step:1127/1480 train_time:175921ms step_avg:157.49ms step:1128/1480 train_time:176090ms step_avg:157.50ms step:1129/1480 train_time:176265ms step_avg:157.52ms step:1130/1480 train_time:176434ms step_avg:157.53ms step:1131/1480 train_time:176612ms step_avg:157.55ms step:1132/1480 train_time:176779ms step_avg:157.56ms step:1133/1480 train_time:176951ms step_avg:157.57ms step:1134/1480 train_time:177120ms step_avg:157.58ms step:1135/1480 train_time:177288ms step_avg:157.59ms step:1136/1480 train_time:177458ms step_avg:157.60ms step:1137/1480 train_time:177629ms step_avg:157.61ms step:1138/1480 train_time:177801ms step_avg:157.62ms step:1139/1480 train_time:177969ms step_avg:157.63ms step:1140/1480 
train_time:178136ms step_avg:157.64ms step:1141/1480 train_time:178309ms step_avg:157.66ms step:1142/1480 train_time:178475ms step_avg:157.66ms step:1143/1480 train_time:178646ms step_avg:157.68ms step:1144/1480 train_time:178814ms step_avg:157.68ms step:1145/1480 train_time:178978ms step_avg:157.69ms step:1146/1480 train_time:179149ms step_avg:157.70ms step:1147/1480 train_time:179317ms step_avg:157.71ms step:1148/1480 train_time:179486ms step_avg:157.72ms step:1149/1480 train_time:179656ms step_avg:157.73ms step:1150/1480 train_time:179826ms step_avg:157.74ms step:1151/1480 train_time:179998ms step_avg:157.76ms step:1152/1480 train_time:180170ms step_avg:157.77ms step:1153/1480 train_time:180344ms step_avg:157.78ms step:1154/1480 train_time:180512ms step_avg:157.79ms step:1155/1480 train_time:180685ms step_avg:157.80ms step:1156/1480 train_time:180864ms step_avg:157.82ms step:1157/1480 train_time:181033ms step_avg:157.83ms step:1158/1480 train_time:181200ms step_avg:157.84ms step:1159/1480 train_time:181368ms step_avg:157.85ms step:1160/1480 train_time:181533ms step_avg:157.85ms step:1161/1480 train_time:181705ms step_avg:157.87ms step:1162/1480 train_time:181874ms step_avg:157.88ms step:1163/1480 train_time:182046ms step_avg:157.89ms step:1164/1480 train_time:182214ms step_avg:157.90ms step:1165/1480 train_time:182380ms step_avg:157.90ms step:1166/1480 train_time:182548ms step_avg:157.91ms step:1167/1480 train_time:182716ms step_avg:157.92ms step:1168/1480 train_time:182884ms step_avg:157.93ms step:1169/1480 train_time:183052ms step_avg:157.94ms step:1170/1480 train_time:183220ms step_avg:157.95ms step:1171/1480 train_time:183386ms step_avg:157.96ms step:1172/1480 train_time:183552ms step_avg:157.96ms step:1173/1480 train_time:183724ms step_avg:157.97ms step:1174/1480 train_time:183907ms step_avg:158.00ms step:1175/1480 train_time:184078ms step_avg:158.01ms step:1176/1480 train_time:184250ms step_avg:158.02ms step:1177/1480 train_time:184426ms step_avg:158.03ms 
step:1178/1480 train_time:184592ms step_avg:158.04ms step:1179/1480 train_time:184758ms step_avg:158.05ms step:1180/1480 train_time:184939ms step_avg:158.07ms step:1181/1480 train_time:185110ms step_avg:158.08ms step:1182/1480 train_time:185279ms step_avg:158.09ms step:1183/1480 train_time:185450ms step_avg:158.10ms step:1184/1480 train_time:185618ms step_avg:158.11ms step:1185/1480 train_time:185790ms step_avg:158.12ms step:1186/1480 train_time:185960ms step_avg:158.13ms step:1187/1480 train_time:186144ms step_avg:158.15ms step:1188/1480 train_time:186311ms step_avg:158.16ms step:1189/1480 train_time:186484ms step_avg:158.17ms step:1190/1480 train_time:186652ms step_avg:158.18ms step:1191/1480 train_time:186824ms step_avg:158.19ms step:1192/1480 train_time:186990ms step_avg:158.20ms step:1193/1480 train_time:187156ms step_avg:158.20ms step:1194/1480 train_time:187327ms step_avg:158.22ms step:1195/1480 train_time:187500ms step_avg:158.23ms step:1196/1480 train_time:187683ms step_avg:158.25ms step:1197/1480 train_time:187855ms step_avg:158.26ms step:1198/1480 train_time:188037ms step_avg:158.28ms step:1199/1480 train_time:188208ms step_avg:158.29ms step:1200/1480 train_time:188376ms step_avg:158.30ms step:1201/1480 train_time:188544ms step_avg:158.31ms step:1202/1480 train_time:188724ms step_avg:158.33ms step:1203/1480 train_time:188901ms step_avg:158.34ms step:1204/1480 train_time:189075ms step_avg:158.35ms step:1205/1480 train_time:189243ms step_avg:158.36ms step:1206/1480 train_time:189411ms step_avg:158.37ms step:1207/1480 train_time:189581ms step_avg:158.38ms step:1208/1480 train_time:189749ms step_avg:158.39ms step:1209/1480 train_time:189921ms step_avg:158.40ms step:1210/1480 train_time:190096ms step_avg:158.41ms step:1211/1480 train_time:190269ms step_avg:158.43ms step:1212/1480 train_time:190441ms step_avg:158.44ms step:1213/1480 train_time:190613ms step_avg:158.45ms step:1214/1480 train_time:190790ms step_avg:158.46ms step:1215/1480 train_time:190963ms 
step_avg:158.48ms step:1216/1480 train_time:191130ms step_avg:158.48ms step:1217/1480 train_time:191304ms step_avg:158.50ms step:1218/1480 train_time:191474ms step_avg:158.50ms step:1219/1480 train_time:191652ms step_avg:158.52ms step:1220/1480 train_time:191822ms step_avg:158.53ms step:1221/1480 train_time:191990ms step_avg:158.54ms step:1222/1480 train_time:192157ms step_avg:158.55ms step:1223/1480 train_time:192327ms step_avg:158.56ms step:1224/1480 train_time:192506ms step_avg:158.57ms step:1225/1480 train_time:192677ms step_avg:158.58ms step:1226/1480 train_time:192850ms step_avg:158.59ms step:1227/1480 train_time:193024ms step_avg:158.61ms step:1228/1480 train_time:193194ms step_avg:158.62ms step:1229/1480 train_time:193367ms step_avg:158.63ms step:1230/1480 train_time:193549ms step_avg:158.65ms step:1231/1480 train_time:193725ms step_avg:158.66ms step:1232/1480 train_time:193900ms step_avg:158.67ms step:1233/1480 train_time:194070ms step_avg:158.68ms step:1234/1480 train_time:194240ms step_avg:158.69ms step:1235/1480 train_time:194414ms step_avg:158.71ms step:1236/1480 train_time:194582ms step_avg:158.71ms step:1237/1480 train_time:194753ms step_avg:158.72ms step:1238/1480 train_time:194939ms step_avg:158.75ms step:1239/1480 train_time:195110ms step_avg:158.76ms step:1240/1480 train_time:195280ms step_avg:158.76ms step:1241/1480 train_time:195451ms step_avg:158.77ms step:1242/1480 train_time:195621ms step_avg:158.78ms step:1243/1480 train_time:195793ms step_avg:158.79ms step:1244/1480 train_time:195962ms step_avg:158.80ms step:1245/1480 train_time:196131ms step_avg:158.81ms step:1246/1480 train_time:196302ms step_avg:158.82ms step:1247/1480 train_time:196471ms step_avg:158.83ms step:1248/1480 train_time:196641ms step_avg:158.84ms step:1249/1480 train_time:196809ms step_avg:158.84ms step:1250/1480 train_time:196977ms step_avg:158.85ms step:1250/1480 val_loss:3.3349 train_time:197050ms step_avg:158.91ms step:1251/1480 train_time:197157ms step_avg:158.87ms 
step:1252/1480 train_time:197327ms step_avg:158.88ms step:1253/1480 train_time:197495ms step_avg:158.89ms step:1254/1480 train_time:197666ms step_avg:158.90ms step:1255/1480 train_time:197854ms step_avg:158.92ms step:1256/1480 train_time:198030ms step_avg:158.93ms step:1257/1480 train_time:198200ms step_avg:158.94ms step:1258/1480 train_time:198374ms step_avg:158.95ms step:1259/1480 train_time:198546ms step_avg:158.96ms step:1260/1480 train_time:198712ms step_avg:158.97ms step:1261/1480 train_time:198885ms step_avg:158.98ms step:1262/1480 train_time:199060ms step_avg:158.99ms step:1263/1480 train_time:199234ms step_avg:159.01ms step:1264/1480 train_time:199399ms step_avg:159.01ms step:1265/1480 train_time:199567ms step_avg:159.02ms step:1266/1480 train_time:199738ms step_avg:159.03ms step:1267/1480 train_time:199909ms step_avg:159.04ms step:1268/1480 train_time:200080ms step_avg:159.05ms step:1269/1480 train_time:200256ms step_avg:159.06ms step:1270/1480 train_time:200426ms step_avg:159.07ms step:1271/1480 train_time:200595ms step_avg:159.08ms step:1272/1480 train_time:200761ms step_avg:159.08ms step:1273/1480 train_time:200931ms step_avg:159.09ms step:1274/1480 train_time:201102ms step_avg:159.10ms step:1275/1480 train_time:201271ms step_avg:159.11ms step:1276/1480 train_time:201436ms step_avg:159.11ms step:1277/1480 train_time:201609ms step_avg:159.12ms step:1278/1480 train_time:201776ms step_avg:159.13ms step:1279/1480 train_time:201949ms step_avg:159.14ms step:1280/1480 train_time:202128ms step_avg:159.16ms step:1281/1480 train_time:202297ms step_avg:159.16ms step:1282/1480 train_time:202463ms step_avg:159.17ms step:1283/1480 train_time:202633ms step_avg:159.18ms step:1284/1480 train_time:202804ms step_avg:159.19ms step:1285/1480 train_time:202974ms step_avg:159.19ms step:1286/1480 train_time:203142ms step_avg:159.20ms step:1287/1480 train_time:203313ms step_avg:159.21ms step:1288/1480 train_time:203485ms step_avg:159.22ms step:1289/1480 train_time:203669ms 
step_avg:159.24ms step:1290/1480 train_time:203847ms step_avg:159.26ms step:1291/1480 train_time:204021ms step_avg:159.27ms step:1292/1480 train_time:204196ms step_avg:159.28ms step:1293/1480 train_time:204373ms step_avg:159.29ms step:1294/1480 train_time:204544ms step_avg:159.30ms step:1295/1480 train_time:204715ms step_avg:159.31ms step:1296/1480 train_time:204889ms step_avg:159.32ms step:1297/1480 train_time:205060ms step_avg:159.33ms step:1298/1480 train_time:205232ms step_avg:159.34ms step:1299/1480 train_time:205403ms step_avg:159.35ms step:1300/1480 train_time:205571ms step_avg:159.36ms step:1301/1480 train_time:205738ms step_avg:159.36ms step:1302/1480 train_time:205912ms step_avg:159.37ms step:1303/1480 train_time:206090ms step_avg:159.39ms step:1304/1480 train_time:206264ms step_avg:159.40ms step:1305/1480 train_time:206432ms step_avg:159.41ms step:1306/1480 train_time:206607ms step_avg:159.42ms step:1307/1480 train_time:206775ms step_avg:159.43ms step:1308/1480 train_time:206945ms step_avg:159.43ms step:1309/1480 train_time:207116ms step_avg:159.44ms step:1310/1480 train_time:207284ms step_avg:159.45ms step:1311/1480 train_time:207454ms step_avg:159.46ms step:1312/1480 train_time:207629ms step_avg:159.47ms step:1313/1480 train_time:207796ms step_avg:159.48ms step:1314/1480 train_time:207972ms step_avg:159.49ms step:1315/1480 train_time:208143ms step_avg:159.50ms step:1316/1480 train_time:208311ms step_avg:159.50ms step:1317/1480 train_time:208483ms step_avg:159.51ms step:1318/1480 train_time:208663ms step_avg:159.53ms step:1319/1480 train_time:208839ms step_avg:159.54ms step:1320/1480 train_time:209015ms step_avg:159.55ms step:1321/1480 train_time:209187ms step_avg:159.56ms step:1322/1480 train_time:209368ms step_avg:159.58ms step:1323/1480 train_time:209539ms step_avg:159.59ms step:1324/1480 train_time:209713ms step_avg:159.60ms step:1325/1480 train_time:209896ms step_avg:159.62ms step:1326/1480 train_time:210072ms step_avg:159.63ms step:1327/1480 
train_time:210242ms step_avg:159.64ms step:1328/1480 train_time:210413ms step_avg:159.65ms step:1329/1480 train_time:210608ms step_avg:159.67ms step:1330/1480 train_time:210786ms step_avg:159.69ms step:1331/1480 train_time:210956ms step_avg:159.69ms step:1332/1480 train_time:211133ms step_avg:159.71ms step:1333/1480 train_time:211308ms step_avg:159.72ms step:1334/1480 train_time:211478ms step_avg:159.73ms step:1335/1480 train_time:211646ms step_avg:159.73ms step:1336/1480 train_time:211829ms step_avg:159.75ms step:1337/1480 train_time:212004ms step_avg:159.76ms step:1338/1480 train_time:212176ms step_avg:159.77ms step:1339/1480 train_time:212351ms step_avg:159.78ms step:1340/1480 train_time:212521ms step_avg:159.79ms step:1341/1480 train_time:212690ms step_avg:159.80ms step:1342/1480 train_time:212864ms step_avg:159.81ms step:1343/1480 train_time:213034ms step_avg:159.82ms step:1344/1480 train_time:213207ms step_avg:159.83ms step:1345/1480 train_time:213385ms step_avg:159.84ms step:1346/1480 train_time:213554ms step_avg:159.85ms step:1347/1480 train_time:213724ms step_avg:159.85ms step:1348/1480 train_time:213894ms step_avg:159.86ms step:1349/1480 train_time:214064ms step_avg:159.87ms step:1350/1480 train_time:214237ms step_avg:159.88ms step:1351/1480 train_time:214408ms step_avg:159.89ms step:1352/1480 train_time:214579ms step_avg:159.90ms step:1353/1480 train_time:214755ms step_avg:159.91ms step:1354/1480 train_time:214928ms step_avg:159.92ms step:1355/1480 train_time:215096ms step_avg:159.92ms step:1356/1480 train_time:215270ms step_avg:159.93ms step:1357/1480 train_time:215443ms step_avg:159.94ms step:1358/1480 train_time:215615ms step_avg:159.95ms step:1359/1480 train_time:215788ms step_avg:159.96ms step:1360/1480 train_time:215964ms step_avg:159.97ms step:1361/1480 train_time:216141ms step_avg:159.99ms step:1362/1480 train_time:216315ms step_avg:160.00ms step:1363/1480 train_time:216496ms step_avg:160.01ms step:1364/1480 train_time:216666ms step_avg:160.02ms 
step:1365/1480 train_time:216831ms step_avg:160.02ms step:1366/1480 train_time:217003ms step_avg:160.03ms step:1367/1480 train_time:217175ms step_avg:160.04ms step:1368/1480 train_time:217349ms step_avg:160.05ms step:1369/1480 train_time:217531ms step_avg:160.07ms step:1370/1480 train_time:217709ms step_avg:160.08ms step:1371/1480 train_time:217879ms step_avg:160.09ms step:1372/1480 train_time:218055ms step_avg:160.10ms step:1373/1480 train_time:218226ms step_avg:160.11ms step:1374/1480 train_time:218400ms step_avg:160.12ms step:1375/1480 train_time:218572ms step_avg:160.13ms step:1375/1480 val_loss:3.2962 train_time:218639ms step_avg:160.18ms step:1376/1480 train_time:218743ms step_avg:160.13ms step:1377/1480 train_time:218916ms step_avg:160.14ms step:1378/1480 train_time:219084ms step_avg:160.15ms step:1379/1480 train_time:219258ms step_avg:160.16ms step:1380/1480 train_time:219432ms step_avg:160.17ms step:1381/1480 train_time:219614ms step_avg:160.19ms step:1382/1480 train_time:219785ms step_avg:160.19ms step:1383/1480 train_time:219956ms step_avg:160.20ms step:1384/1480 train_time:220133ms step_avg:160.21ms step:1385/1480 train_time:220298ms step_avg:160.22ms step:1386/1480 train_time:220469ms step_avg:160.22ms step:1387/1480 train_time:220640ms step_avg:160.23ms step:1388/1480 train_time:220809ms step_avg:160.24ms step:1389/1480 train_time:220982ms step_avg:160.25ms step:1390/1480 train_time:221151ms step_avg:160.25ms step:1391/1480 train_time:221319ms step_avg:160.26ms step:1392/1480 train_time:221492ms step_avg:160.27ms step:1393/1480 train_time:221662ms step_avg:160.28ms step:1394/1480 train_time:221834ms step_avg:160.28ms step:1395/1480 train_time:222001ms step_avg:160.29ms step:1396/1480 train_time:222169ms step_avg:160.30ms step:1397/1480 train_time:222336ms step_avg:160.30ms step:1398/1480 train_time:222503ms step_avg:160.30ms step:1399/1480 train_time:222673ms step_avg:160.31ms step:1400/1480 train_time:222850ms step_avg:160.32ms step:1401/1480 
train_time:223016ms step_avg:160.33ms step:1402/1480 train_time:223188ms step_avg:160.34ms step:1403/1480 train_time:223364ms step_avg:160.35ms step:1404/1480 train_time:223535ms step_avg:160.35ms step:1405/1480 train_time:223711ms step_avg:160.37ms step:1406/1480 train_time:223885ms step_avg:160.38ms step:1407/1480 train_time:224053ms step_avg:160.38ms step:1408/1480 train_time:224222ms step_avg:160.39ms step:1409/1480 train_time:224404ms step_avg:160.40ms step:1410/1480 train_time:224574ms step_avg:160.41ms step:1411/1480 train_time:224740ms step_avg:160.41ms step:1412/1480 train_time:224911ms step_avg:160.42ms step:1413/1480 train_time:225082ms step_avg:160.43ms step:1414/1480 train_time:225254ms step_avg:160.44ms step:1415/1480 train_time:225426ms step_avg:160.45ms step:1416/1480 train_time:225613ms step_avg:160.46ms step:1417/1480 train_time:225788ms step_avg:160.47ms step:1418/1480 train_time:225957ms step_avg:160.48ms step:1419/1480 train_time:226132ms step_avg:160.49ms step:1420/1480 train_time:226305ms step_avg:160.50ms step:1421/1480 train_time:226479ms step_avg:160.51ms step:1422/1480 train_time:226653ms step_avg:160.52ms step:1423/1480 train_time:226822ms step_avg:160.52ms step:1424/1480 train_time:226999ms step_avg:160.54ms step:1425/1480 train_time:227179ms step_avg:160.55ms step:1426/1480 train_time:227350ms step_avg:160.56ms step:1427/1480 train_time:227525ms step_avg:160.57ms step:1428/1480 train_time:227697ms step_avg:160.58ms step:1429/1480 train_time:227866ms step_avg:160.58ms step:1430/1480 train_time:228039ms step_avg:160.59ms step:1431/1480 train_time:228215ms step_avg:160.60ms step:1432/1480 train_time:228392ms step_avg:160.61ms step:1433/1480 train_time:228569ms step_avg:160.62ms step:1434/1480 train_time:228750ms step_avg:160.64ms step:1435/1480 train_time:228925ms step_avg:160.65ms step:1436/1480 train_time:229097ms step_avg:160.66ms step:1437/1480 train_time:229267ms step_avg:160.66ms step:1438/1480 train_time:229436ms step_avg:160.67ms 
step:1439/1480 train_time:229611ms step_avg:160.68ms step:1440/1480 train_time:229780ms step_avg:160.69ms step:1441/1480 train_time:229951ms step_avg:160.69ms step:1442/1480 train_time:230126ms step_avg:160.70ms step:1443/1480 train_time:230317ms step_avg:160.72ms step:1444/1480 train_time:230488ms step_avg:160.73ms step:1445/1480 train_time:230657ms step_avg:160.74ms step:1446/1480 train_time:230833ms step_avg:160.75ms step:1447/1480 train_time:231011ms step_avg:160.76ms step:1448/1480 train_time:231182ms step_avg:160.77ms step:1449/1480 train_time:231355ms step_avg:160.78ms step:1450/1480 train_time:231527ms step_avg:160.78ms step:1451/1480 train_time:231698ms step_avg:160.79ms step:1452/1480 train_time:231872ms step_avg:160.80ms step:1453/1480 train_time:232041ms step_avg:160.80ms step:1454/1480 train_time:232213ms step_avg:160.81ms step:1455/1480 train_time:232392ms step_avg:160.82ms step:1456/1480 train_time:232565ms step_avg:160.83ms step:1457/1480 train_time:232735ms step_avg:160.84ms step:1458/1480 train_time:232907ms step_avg:160.85ms step:1459/1480 train_time:233082ms step_avg:160.86ms step:1460/1480 train_time:233254ms step_avg:160.86ms step:1461/1480 train_time:233428ms step_avg:160.87ms step:1462/1480 train_time:233598ms step_avg:160.88ms step:1463/1480 train_time:233776ms step_avg:160.89ms step:1464/1480 train_time:233951ms step_avg:160.90ms step:1465/1480 train_time:234122ms step_avg:160.91ms step:1466/1480 train_time:234293ms step_avg:160.92ms step:1467/1480 train_time:234466ms step_avg:160.92ms step:1468/1480 train_time:234635ms step_avg:160.93ms step:1469/1480 train_time:234809ms step_avg:160.94ms step:1470/1480 train_time:234990ms step_avg:160.95ms step:1471/1480 train_time:235178ms step_avg:160.97ms step:1472/1480 train_time:235359ms step_avg:160.98ms step:1473/1480 train_time:235530ms step_avg:160.99ms step:1474/1480 train_time:235708ms step_avg:161.00ms step:1475/1480 train_time:235887ms step_avg:161.02ms step:1476/1480 train_time:236059ms 
step_avg:161.02ms step:1477/1480 train_time:236240ms step_avg:161.04ms step:1478/1480 train_time:236423ms step_avg:161.05ms step:1479/1480 train_time:236597ms step_avg:161.06ms step:1480/1480 train_time:236768ms step_avg:161.07ms step:1480/1480 val_loss:3.2771 train_time:236839ms step_avg:161.12ms