import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))


class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        # Only the copy used in the matmul is cast; the stored weight keeps its own dtype.
        return F.linear(x, self.weight.to(x.dtype))


class Rotary(torch.nn.Module):
    """Rotary position embedding; the cos/sin tables are cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of channels: base ** (-i / dim) for i = 0, 2, 4, ...
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        """Rotate `x`, a 4D tensor with positions on dim 1 and channels on dim 3."""
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # (re)build the tables only when the sequence length changes
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Attend over `x` of shape (1, T, dim) and return a tensor of the same shape.

        `vi` is mixed into the values with the learned `lambdas` weights and must be
        viewable as (1, T, n_head, head_dim); `block_mask` is passed to flex_attention.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y


class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    """Transformer block: pre-norm attention and MLP, each added back residually."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # learned mix of the running activation `x` and the initial embedding `x0`
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Model size settings consumed by GPT and Block."""
    vocab_size: int = 50304
    n_layer: int = 12
    n_head: int = 6 # head dim 128 suggested by @Grad62304977
    n_embd: int = 768
    lm_head_softcap: int = 30


class GPT(nn.Module):
    """GPT-2 style model with U-net skip connections; `forward` returns the training loss."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss of `idx` against `target`.

        Arguments:
            idx: 1D tensor of token ids (asserted); its length must be a multiple of 128
                because it is reshaped into blocks of BLOCK_SIZE.
            target: target token ids, flattened before the loss.
            sliding_window: tensor holding the attention window length in tokens.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Document id of every position: a running count of token 50256, which this code
        # treats as the document delimiter (presumably GPT-2's end-of-text id - confirm).
        docs = (idx == 50256).cumsum(0)
        # lowest / highest document id found inside each block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal, same document, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)

        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the mask above: a (query block, kv block) pair is kept
            # when it can contain at least one allowed token pair.
            # The indices live on the input's device rather than a hard-coded "cuda".
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device=idx.device)
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can share a document only if their document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort moves the indices of kept kv blocks to the front of each row
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)

        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value embedding per encoder layer; reused in reverse order by the decoder below
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        # soft-cap: tanh bounds the logits to (-lm_head_softcap, lm_head_softcap)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)


def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256-int32 header into pinned memory."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens


class DistributedDataLoader:
    """
    Serves (inputs, targets) pairs of T tokens from a sorted list of .bin shards.

    Each process reads the slice that starts `process_rank * T` tokens into the current
    window and moves forward by `T * num_processes` tokens per batch.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1, "a shard is too small to hold one batch for every process"
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Start over from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (x, y) on the "cuda" device: T int32 inputs and the T int64 targets shifted by one."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Settings of the training run; instantiated once as `args`."""
    # data hyperparams
    input_bin: str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin: str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size: int = 8 # batch size, in sequences, across all devices
    sequence_length: int = 64*1024 # sequence length, in tokens
    num_iterations: int = 1480 # number of iterations to run
    warmup_iters: int = 0
    cooldown_iters: int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay: float = 0
    # evaluation and logging hyperparams
    val_loss_every: int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every: int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = str(uuid.uuid4()) logdir = 'logs/%s/' % run_id # os.makedirs(logdir, exist_ok=True) logfile = 'logs/%s.txt' % run_id # create the log file with open(logfile, "w") as f: # begin the log by printing this file (the Python code) f.write(code) f.write('='*100 + '\n') def print0(s, logonly=False): if master_process: with open(logfile, "a") as f: if not logonly: print(s) f.write(s+'\n') # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # convenience variables T = args.sequence_length # calculate the number of steps to take in the val loop. assert args.val_tokens % (T * ddp_world_size) == 0 val_steps = args.val_tokens // (T * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size assert train_accumulation_steps == 1 # load tokens train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files") print0('='*100, logonly=True) x, y = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. 
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 12:51:18 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 123W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 77W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 116W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 128W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI 
CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22975ms step_avg:nanms step:2/1480 train_time:23061ms step_avg:nanms step:3/1480 train_time:23201ms step_avg:nanms step:4/1480 train_time:23341ms step_avg:nanms step:5/1480 train_time:23482ms step_avg:nanms step:6/1480 train_time:23623ms step_avg:nanms step:7/1480 train_time:23764ms step_avg:nanms step:8/1480 train_time:23906ms step_avg:nanms step:9/1480 train_time:24051ms step_avg:nanms step:10/1480 train_time:24196ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.78ms step:14/1480 train_time:567ms step_avg:141.69ms step:15/1480 train_time:709ms step_avg:141.74ms step:16/1480 train_time:852ms step_avg:142.04ms step:17/1480 train_time:996ms step_avg:142.22ms step:18/1480 train_time:1139ms step_avg:142.41ms step:19/1480 train_time:1282ms step_avg:142.50ms step:20/1480 train_time:1424ms step_avg:142.40ms step:21/1480 train_time:1566ms step_avg:142.37ms step:22/1480 train_time:1709ms step_avg:142.39ms step:23/1480 train_time:1851ms step_avg:142.36ms step:24/1480 train_time:1994ms step_avg:142.43ms step:25/1480 train_time:2138ms step_avg:142.56ms step:26/1480 train_time:2282ms step_avg:142.63ms step:27/1480 train_time:2425ms step_avg:142.64ms step:28/1480 train_time:2568ms step_avg:142.68ms step:29/1480 train_time:2709ms 
step_avg:142.57ms step:30/1480 train_time:2850ms step_avg:142.50ms step:31/1480 train_time:2992ms step_avg:142.46ms step:32/1480 train_time:3135ms step_avg:142.52ms step:33/1480 train_time:3280ms step_avg:142.61ms step:34/1480 train_time:3423ms step_avg:142.64ms step:35/1480 train_time:3566ms step_avg:142.65ms step:36/1480 train_time:3709ms step_avg:142.66ms step:37/1480 train_time:3851ms step_avg:142.62ms step:38/1480 train_time:3995ms step_avg:142.67ms step:39/1480 train_time:4139ms step_avg:142.71ms step:40/1480 train_time:4283ms step_avg:142.77ms step:41/1480 train_time:4426ms step_avg:142.77ms step:42/1480 train_time:4568ms step_avg:142.75ms step:43/1480 train_time:4709ms step_avg:142.71ms step:44/1480 train_time:4851ms step_avg:142.67ms step:45/1480 train_time:4995ms step_avg:142.71ms step:46/1480 train_time:5139ms step_avg:142.75ms step:47/1480 train_time:5283ms step_avg:142.78ms step:48/1480 train_time:5426ms step_avg:142.79ms step:49/1480 train_time:5568ms step_avg:142.77ms step:50/1480 train_time:5709ms step_avg:142.71ms step:51/1480 train_time:5850ms step_avg:142.68ms step:52/1480 train_time:5995ms step_avg:142.73ms step:53/1480 train_time:6138ms step_avg:142.75ms step:54/1480 train_time:6282ms step_avg:142.78ms step:55/1480 train_time:6426ms step_avg:142.81ms step:56/1480 train_time:6569ms step_avg:142.79ms step:57/1480 train_time:6709ms step_avg:142.75ms step:58/1480 train_time:6851ms step_avg:142.72ms step:59/1480 train_time:6993ms step_avg:142.71ms step:60/1480 train_time:7136ms step_avg:142.72ms step:61/1480 train_time:7281ms step_avg:142.77ms step:62/1480 train_time:7425ms step_avg:142.78ms step:63/1480 train_time:7567ms step_avg:142.78ms step:64/1480 train_time:7709ms step_avg:142.76ms step:65/1480 train_time:7851ms step_avg:142.74ms step:66/1480 train_time:7993ms step_avg:142.73ms step:67/1480 train_time:8135ms step_avg:142.73ms step:68/1480 train_time:8279ms step_avg:142.73ms step:69/1480 train_time:8423ms step_avg:142.76ms step:70/1480 
train_time:8566ms step_avg:142.76ms step:71/1480 train_time:8707ms step_avg:142.74ms step:72/1480 train_time:8849ms step_avg:142.72ms step:73/1480 train_time:8991ms step_avg:142.72ms step:74/1480 train_time:9136ms step_avg:142.74ms step:75/1480 train_time:9279ms step_avg:142.75ms step:76/1480 train_time:9423ms step_avg:142.77ms step:77/1480 train_time:9567ms step_avg:142.79ms step:78/1480 train_time:9709ms step_avg:142.78ms step:79/1480 train_time:9850ms step_avg:142.76ms step:80/1480 train_time:9991ms step_avg:142.72ms step:81/1480 train_time:10132ms step_avg:142.71ms step:82/1480 train_time:10276ms step_avg:142.72ms step:83/1480 train_time:10419ms step_avg:142.73ms step:84/1480 train_time:10563ms step_avg:142.74ms step:85/1480 train_time:10706ms step_avg:142.75ms step:86/1480 train_time:10848ms step_avg:142.74ms step:87/1480 train_time:10990ms step_avg:142.73ms step:88/1480 train_time:11131ms step_avg:142.70ms step:89/1480 train_time:11275ms step_avg:142.72ms step:90/1480 train_time:11419ms step_avg:142.74ms step:91/1480 train_time:11562ms step_avg:142.75ms step:92/1480 train_time:11709ms step_avg:142.79ms step:93/1480 train_time:11848ms step_avg:142.74ms step:94/1480 train_time:11990ms step_avg:142.74ms step:95/1480 train_time:12131ms step_avg:142.72ms step:96/1480 train_time:12273ms step_avg:142.70ms step:97/1480 train_time:12415ms step_avg:142.71ms step:98/1480 train_time:12559ms step_avg:142.71ms step:99/1480 train_time:12705ms step_avg:142.75ms step:100/1480 train_time:12845ms step_avg:142.72ms step:101/1480 train_time:12987ms step_avg:142.72ms step:102/1480 train_time:13130ms step_avg:142.72ms step:103/1480 train_time:13273ms step_avg:142.72ms step:104/1480 train_time:13416ms step_avg:142.72ms step:105/1480 train_time:13560ms step_avg:142.74ms step:106/1480 train_time:13705ms step_avg:142.76ms step:107/1480 train_time:13847ms step_avg:142.75ms step:108/1480 train_time:13990ms step_avg:142.76ms step:109/1480 train_time:14132ms step_avg:142.75ms step:110/1480 
train_time:14274ms step_avg:142.74ms step:111/1480 train_time:14418ms step_avg:142.75ms step:112/1480 train_time:14567ms step_avg:142.81ms step:113/1480 train_time:14714ms step_avg:142.85ms step:114/1480 train_time:14863ms step_avg:142.91ms step:115/1480 train_time:15010ms step_avg:142.96ms step:116/1480 train_time:15158ms step_avg:143.00ms step:117/1480 train_time:15306ms step_avg:143.04ms step:118/1480 train_time:15452ms step_avg:143.07ms step:119/1480 train_time:15600ms step_avg:143.12ms step:120/1480 train_time:15747ms step_avg:143.16ms step:121/1480 train_time:15895ms step_avg:143.20ms step:122/1480 train_time:16043ms step_avg:143.25ms step:123/1480 train_time:16190ms step_avg:143.28ms step:124/1480 train_time:16337ms step_avg:143.31ms step:125/1480 train_time:16484ms step_avg:143.34ms step:125/1480 val_loss:4.4242 train_time:16541ms step_avg:143.84ms step:126/1480 train_time:16637ms step_avg:143.42ms step:127/1480 train_time:16785ms step_avg:143.46ms step:128/1480 train_time:16931ms step_avg:143.49ms step:129/1480 train_time:17078ms step_avg:143.51ms step:130/1480 train_time:17225ms step_avg:143.54ms step:131/1480 train_time:17370ms step_avg:143.55ms step:132/1480 train_time:17517ms step_avg:143.58ms step:133/1480 train_time:17666ms step_avg:143.63ms step:134/1480 train_time:17815ms step_avg:143.67ms step:135/1480 train_time:17962ms step_avg:143.70ms step:136/1480 train_time:18108ms step_avg:143.71ms step:137/1480 train_time:18254ms step_avg:143.73ms step:138/1480 train_time:18401ms step_avg:143.76ms step:139/1480 train_time:18548ms step_avg:143.78ms step:140/1480 train_time:18695ms step_avg:143.81ms step:141/1480 train_time:18845ms step_avg:143.86ms step:142/1480 train_time:18991ms step_avg:143.87ms step:143/1480 train_time:19141ms step_avg:143.92ms step:144/1480 train_time:19287ms step_avg:143.93ms step:145/1480 train_time:19432ms step_avg:143.94ms step:146/1480 train_time:19579ms step_avg:143.96ms step:147/1480 train_time:19726ms step_avg:143.98ms 
step:148/1480 train_time:19871ms step_avg:144.00ms step:149/1480 train_time:20019ms step_avg:144.02ms step:150/1480 train_time:20166ms step_avg:144.04ms step:151/1480 train_time:20312ms step_avg:144.06ms step:152/1480 train_time:20459ms step_avg:144.07ms step:153/1480 train_time:20605ms step_avg:144.09ms step:154/1480 train_time:20752ms step_avg:144.11ms step:155/1480 train_time:20900ms step_avg:144.14ms step:156/1480 train_time:21047ms step_avg:144.16ms step:157/1480 train_time:21194ms step_avg:144.18ms step:158/1480 train_time:21342ms step_avg:144.20ms step:159/1480 train_time:21488ms step_avg:144.22ms step:160/1480 train_time:21635ms step_avg:144.23ms step:161/1480 train_time:21781ms step_avg:144.25ms step:162/1480 train_time:21929ms step_avg:144.27ms step:163/1480 train_time:22077ms step_avg:144.29ms step:164/1480 train_time:22224ms step_avg:144.31ms step:165/1480 train_time:22370ms step_avg:144.32ms step:166/1480 train_time:22518ms step_avg:144.34ms step:167/1480 train_time:22664ms step_avg:144.36ms step:168/1480 train_time:22809ms step_avg:144.36ms step:169/1480 train_time:22956ms step_avg:144.38ms step:170/1480 train_time:23104ms step_avg:144.40ms step:171/1480 train_time:23249ms step_avg:144.41ms step:172/1480 train_time:23397ms step_avg:144.43ms step:173/1480 train_time:23544ms step_avg:144.44ms step:174/1480 train_time:23691ms step_avg:144.46ms step:175/1480 train_time:23839ms step_avg:144.48ms step:176/1480 train_time:23985ms step_avg:144.49ms step:177/1480 train_time:24131ms step_avg:144.50ms step:178/1480 train_time:24277ms step_avg:144.51ms step:179/1480 train_time:24424ms step_avg:144.52ms step:180/1480 train_time:24570ms step_avg:144.53ms step:181/1480 train_time:24717ms step_avg:144.55ms step:182/1480 train_time:24865ms step_avg:144.56ms step:183/1480 train_time:25013ms step_avg:144.59ms step:184/1480 train_time:25161ms step_avg:144.60ms step:185/1480 train_time:25309ms step_avg:144.62ms step:186/1480 train_time:25455ms step_avg:144.63ms 
step:187/1480 train_time:25603ms step_avg:144.65ms step:188/1480 train_time:25749ms step_avg:144.66ms step:189/1480 train_time:25896ms step_avg:144.67ms step:190/1480 train_time:26044ms step_avg:144.69ms step:191/1480 train_time:26190ms step_avg:144.70ms step:192/1480 train_time:26340ms step_avg:144.73ms step:193/1480 train_time:26487ms step_avg:144.74ms step:194/1480 train_time:26634ms step_avg:144.75ms step:195/1480 train_time:26781ms step_avg:144.76ms step:196/1480 train_time:26928ms step_avg:144.78ms step:197/1480 train_time:27075ms step_avg:144.79ms step:198/1480 train_time:27221ms step_avg:144.79ms step:199/1480 train_time:27368ms step_avg:144.81ms step:200/1480 train_time:27515ms step_avg:144.82ms step:201/1480 train_time:27661ms step_avg:144.82ms step:202/1480 train_time:27809ms step_avg:144.84ms step:203/1480 train_time:27954ms step_avg:144.84ms step:204/1480 train_time:28102ms step_avg:144.86ms step:205/1480 train_time:28249ms step_avg:144.87ms step:206/1480 train_time:28395ms step_avg:144.87ms step:207/1480 train_time:28543ms step_avg:144.89ms step:208/1480 train_time:28688ms step_avg:144.89ms step:209/1480 train_time:28836ms step_avg:144.90ms step:210/1480 train_time:28983ms step_avg:144.91ms step:211/1480 train_time:29130ms step_avg:144.93ms step:212/1480 train_time:29277ms step_avg:144.94ms step:213/1480 train_time:29425ms step_avg:144.95ms step:214/1480 train_time:29570ms step_avg:144.95ms step:215/1480 train_time:29718ms step_avg:144.96ms step:216/1480 train_time:29865ms step_avg:144.98ms step:217/1480 train_time:30012ms step_avg:144.98ms step:218/1480 train_time:30159ms step_avg:144.99ms step:219/1480 train_time:30307ms step_avg:145.01ms step:220/1480 train_time:30453ms step_avg:145.01ms step:221/1480 train_time:30601ms step_avg:145.03ms step:222/1480 train_time:30751ms step_avg:145.05ms step:223/1480 train_time:30903ms step_avg:145.09ms step:224/1480 train_time:31054ms step_avg:145.11ms step:225/1480 train_time:31205ms step_avg:145.14ms 
step:226/1480 train_time:31354ms step_avg:145.16ms step:227/1480 train_time:31505ms step_avg:145.19ms step:228/1480 train_time:31657ms step_avg:145.21ms step:229/1480 train_time:31808ms step_avg:145.24ms step:230/1480 train_time:31959ms step_avg:145.27ms step:231/1480 train_time:32109ms step_avg:145.29ms step:232/1480 train_time:32258ms step_avg:145.31ms step:233/1480 train_time:32408ms step_avg:145.33ms step:234/1480 train_time:32558ms step_avg:145.35ms step:235/1480 train_time:32710ms step_avg:145.38ms step:236/1480 train_time:32861ms step_avg:145.40ms step:237/1480 train_time:33011ms step_avg:145.42ms step:238/1480 train_time:33162ms step_avg:145.45ms step:239/1480 train_time:33312ms step_avg:145.47ms step:240/1480 train_time:33462ms step_avg:145.49ms step:241/1480 train_time:33612ms step_avg:145.51ms step:242/1480 train_time:33762ms step_avg:145.53ms step:243/1480 train_time:33913ms step_avg:145.55ms step:244/1480 train_time:34064ms step_avg:145.57ms step:245/1480 train_time:34216ms step_avg:145.60ms step:246/1480 train_time:34366ms step_avg:145.62ms step:247/1480 train_time:34517ms step_avg:145.64ms step:248/1480 train_time:34667ms step_avg:145.66ms step:249/1480 train_time:34818ms step_avg:145.68ms step:250/1480 train_time:34968ms step_avg:145.70ms step:250/1480 val_loss:3.9899 train_time:35026ms step_avg:145.94ms step:251/1480 train_time:35124ms step_avg:145.74ms step:252/1480 train_time:35275ms step_avg:145.77ms step:253/1480 train_time:35426ms step_avg:145.79ms step:254/1480 train_time:35575ms step_avg:145.80ms step:255/1480 train_time:35725ms step_avg:145.81ms step:256/1480 train_time:35874ms step_avg:145.83ms step:257/1480 train_time:36025ms step_avg:145.85ms step:258/1480 train_time:36178ms step_avg:145.88ms step:259/1480 train_time:36329ms step_avg:145.90ms step:260/1480 train_time:36479ms step_avg:145.92ms step:261/1480 train_time:36630ms step_avg:145.94ms step:262/1480 train_time:36779ms step_avg:145.95ms step:263/1480 train_time:36929ms 
step_avg:145.96ms step:264/1480 train_time:37079ms step_avg:145.98ms step:265/1480 train_time:37232ms step_avg:146.01ms step:266/1480 train_time:37383ms step_avg:146.03ms step:267/1480 train_time:37534ms step_avg:146.05ms step:268/1480 train_time:37685ms step_avg:146.07ms step:269/1480 train_time:37835ms step_avg:146.08ms step:270/1480 train_time:37985ms step_avg:146.10ms step:271/1480 train_time:38134ms step_avg:146.11ms step:272/1480 train_time:38285ms step_avg:146.13ms step:273/1480 train_time:38438ms step_avg:146.15ms step:274/1480 train_time:38588ms step_avg:146.17ms step:275/1480 train_time:38739ms step_avg:146.19ms step:276/1480 train_time:38889ms step_avg:146.20ms step:277/1480 train_time:39040ms step_avg:146.22ms step:278/1480 train_time:39190ms step_avg:146.23ms step:279/1480 train_time:39341ms step_avg:146.25ms step:280/1480 train_time:39492ms step_avg:146.27ms step:281/1480 train_time:39642ms step_avg:146.28ms step:282/1480 train_time:39792ms step_avg:146.29ms step:283/1480 train_time:39942ms step_avg:146.31ms step:284/1480 train_time:40093ms step_avg:146.32ms step:285/1480 train_time:40243ms step_avg:146.34ms step:286/1480 train_time:40394ms step_avg:146.35ms step:287/1480 train_time:40546ms step_avg:146.37ms step:288/1480 train_time:40696ms step_avg:146.39ms step:289/1480 train_time:40847ms step_avg:146.40ms step:290/1480 train_time:40996ms step_avg:146.41ms step:291/1480 train_time:41147ms step_avg:146.43ms step:292/1480 train_time:41297ms step_avg:146.44ms step:293/1480 train_time:41448ms step_avg:146.46ms step:294/1480 train_time:41599ms step_avg:146.48ms step:295/1480 train_time:41751ms step_avg:146.49ms step:296/1480 train_time:41902ms step_avg:146.51ms step:297/1480 train_time:42051ms step_avg:146.52ms step:298/1480 train_time:42202ms step_avg:146.54ms step:299/1480 train_time:42351ms step_avg:146.54ms step:300/1480 train_time:42503ms step_avg:146.56ms step:301/1480 train_time:42653ms step_avg:146.57ms step:302/1480 train_time:42803ms 
step_avg:146.59ms step:303/1480 train_time:42955ms step_avg:146.60ms step:304/1480 train_time:43106ms step_avg:146.62ms step:305/1480 train_time:43257ms step_avg:146.63ms step:306/1480 train_time:43407ms step_avg:146.64ms step:307/1480 train_time:43558ms step_avg:146.66ms step:308/1480 train_time:43709ms step_avg:146.67ms step:309/1480 train_time:43860ms step_avg:146.69ms step:310/1480 train_time:44010ms step_avg:146.70ms step:311/1480 train_time:44160ms step_avg:146.71ms step:312/1480 train_time:44310ms step_avg:146.72ms step:313/1480 train_time:44461ms step_avg:146.74ms step:314/1480 train_time:44610ms step_avg:146.74ms step:315/1480 train_time:44761ms step_avg:146.76ms step:316/1480 train_time:44912ms step_avg:146.77ms step:317/1480 train_time:45065ms step_avg:146.79ms step:318/1480 train_time:45216ms step_avg:146.80ms step:319/1480 train_time:45367ms step_avg:146.82ms step:320/1480 train_time:45519ms step_avg:146.83ms step:321/1480 train_time:45670ms step_avg:146.85ms step:322/1480 train_time:45819ms step_avg:146.86ms step:323/1480 train_time:45970ms step_avg:146.87ms step:324/1480 train_time:46121ms step_avg:146.88ms step:325/1480 train_time:46271ms step_avg:146.89ms step:326/1480 train_time:46423ms step_avg:146.91ms step:327/1480 train_time:46573ms step_avg:146.92ms step:328/1480 train_time:46725ms step_avg:146.93ms step:329/1480 train_time:46876ms step_avg:146.95ms step:330/1480 train_time:47029ms step_avg:146.96ms step:331/1480 train_time:47183ms step_avg:146.99ms step:332/1480 train_time:47336ms step_avg:147.01ms step:333/1480 train_time:47490ms step_avg:147.03ms step:334/1480 train_time:47645ms step_avg:147.05ms step:335/1480 train_time:47799ms step_avg:147.07ms step:336/1480 train_time:47952ms step_avg:147.09ms step:337/1480 train_time:48106ms step_avg:147.11ms step:338/1480 train_time:48263ms step_avg:147.14ms step:339/1480 train_time:48415ms step_avg:147.16ms step:340/1480 train_time:48568ms step_avg:147.17ms step:341/1480 train_time:48722ms 
step_avg:147.20ms step:342/1480 train_time:48875ms step_avg:147.21ms step:343/1480 train_time:49030ms step_avg:147.24ms step:344/1480 train_time:49185ms step_avg:147.26ms step:345/1480 train_time:49339ms step_avg:147.28ms step:346/1480 train_time:49493ms step_avg:147.30ms step:347/1480 train_time:49646ms step_avg:147.32ms step:348/1480 train_time:49801ms step_avg:147.34ms step:349/1480 train_time:49955ms step_avg:147.36ms step:350/1480 train_time:50108ms step_avg:147.38ms step:351/1480 train_time:50265ms step_avg:147.40ms step:352/1480 train_time:50419ms step_avg:147.42ms step:353/1480 train_time:50572ms step_avg:147.44ms step:354/1480 train_time:50726ms step_avg:147.46ms step:355/1480 train_time:50880ms step_avg:147.48ms step:356/1480 train_time:51033ms step_avg:147.49ms step:357/1480 train_time:51189ms step_avg:147.52ms step:358/1480 train_time:51341ms step_avg:147.53ms step:359/1480 train_time:51495ms step_avg:147.55ms step:360/1480 train_time:51650ms step_avg:147.57ms step:361/1480 train_time:51804ms step_avg:147.59ms step:362/1480 train_time:51956ms step_avg:147.60ms step:363/1480 train_time:52109ms step_avg:147.62ms step:364/1480 train_time:52265ms step_avg:147.64ms step:365/1480 train_time:52419ms step_avg:147.66ms step:366/1480 train_time:52571ms step_avg:147.67ms step:367/1480 train_time:52725ms step_avg:147.69ms step:368/1480 train_time:52878ms step_avg:147.71ms step:369/1480 train_time:53032ms step_avg:147.72ms step:370/1480 train_time:53185ms step_avg:147.73ms step:371/1480 train_time:53339ms step_avg:147.75ms step:372/1480 train_time:53493ms step_avg:147.77ms step:373/1480 train_time:53648ms step_avg:147.79ms step:374/1480 train_time:53803ms step_avg:147.81ms step:375/1480 train_time:53957ms step_avg:147.83ms step:375/1480 val_loss:3.8064 train_time:54017ms step_avg:147.99ms step:376/1480 train_time:54114ms step_avg:147.85ms step:377/1480 train_time:54269ms step_avg:147.87ms step:378/1480 train_time:54422ms step_avg:147.89ms step:379/1480 
train_time:54574ms step_avg:147.90ms step:380/1480 train_time:54728ms step_avg:147.91ms step:381/1480 train_time:54881ms step_avg:147.93ms step:382/1480 train_time:55035ms step_avg:147.94ms step:383/1480 train_time:55190ms step_avg:147.96ms step:384/1480 train_time:55345ms step_avg:147.98ms step:385/1480 train_time:55499ms step_avg:148.00ms step:386/1480 train_time:55652ms step_avg:148.01ms step:387/1480 train_time:55805ms step_avg:148.02ms step:388/1480 train_time:55958ms step_avg:148.04ms step:389/1480 train_time:56110ms step_avg:148.05ms step:390/1480 train_time:56265ms step_avg:148.07ms step:391/1480 train_time:56418ms step_avg:148.08ms step:392/1480 train_time:56571ms step_avg:148.09ms step:393/1480 train_time:56725ms step_avg:148.11ms step:394/1480 train_time:56879ms step_avg:148.12ms step:395/1480 train_time:57032ms step_avg:148.14ms step:396/1480 train_time:57186ms step_avg:148.15ms step:397/1480 train_time:57340ms step_avg:148.16ms step:398/1480 train_time:57494ms step_avg:148.18ms step:399/1480 train_time:57649ms step_avg:148.20ms step:400/1480 train_time:57803ms step_avg:148.21ms step:401/1480 train_time:57957ms step_avg:148.23ms step:402/1480 train_time:58109ms step_avg:148.24ms step:403/1480 train_time:58264ms step_avg:148.25ms step:404/1480 train_time:58417ms step_avg:148.27ms step:405/1480 train_time:58570ms step_avg:148.28ms step:406/1480 train_time:58724ms step_avg:148.29ms step:407/1480 train_time:58879ms step_avg:148.31ms step:408/1480 train_time:59033ms step_avg:148.32ms step:409/1480 train_time:59186ms step_avg:148.34ms step:410/1480 train_time:59340ms step_avg:148.35ms step:411/1480 train_time:59494ms step_avg:148.36ms step:412/1480 train_time:59647ms step_avg:148.37ms step:413/1480 train_time:59799ms step_avg:148.39ms step:414/1480 train_time:59953ms step_avg:148.40ms step:415/1480 train_time:60108ms step_avg:148.42ms step:416/1480 train_time:60263ms step_avg:148.43ms step:417/1480 train_time:60418ms step_avg:148.45ms step:418/1480 
train_time:60571ms step_avg:148.46ms step:419/1480 train_time:60725ms step_avg:148.47ms step:420/1480 train_time:60879ms step_avg:148.48ms step:421/1480 train_time:61033ms step_avg:148.50ms step:422/1480 train_time:61186ms step_avg:148.51ms step:423/1480 train_time:61341ms step_avg:148.52ms step:424/1480 train_time:61494ms step_avg:148.54ms step:425/1480 train_time:61648ms step_avg:148.55ms step:426/1480 train_time:61802ms step_avg:148.56ms step:427/1480 train_time:61957ms step_avg:148.58ms step:428/1480 train_time:62110ms step_avg:148.59ms step:429/1480 train_time:62264ms step_avg:148.60ms step:430/1480 train_time:62417ms step_avg:148.61ms step:431/1480 train_time:62570ms step_avg:148.62ms step:432/1480 train_time:62724ms step_avg:148.63ms step:433/1480 train_time:62877ms step_avg:148.65ms step:434/1480 train_time:63031ms step_avg:148.66ms step:435/1480 train_time:63185ms step_avg:148.67ms step:436/1480 train_time:63340ms step_avg:148.69ms step:437/1480 train_time:63494ms step_avg:148.70ms step:438/1480 train_time:63647ms step_avg:148.71ms step:439/1480 train_time:63803ms step_avg:148.72ms step:440/1480 train_time:63958ms step_avg:148.74ms step:441/1480 train_time:64113ms step_avg:148.75ms step:442/1480 train_time:64270ms step_avg:148.77ms step:443/1480 train_time:64427ms step_avg:148.79ms step:444/1480 train_time:64584ms step_avg:148.81ms step:445/1480 train_time:64741ms step_avg:148.83ms step:446/1480 train_time:64896ms step_avg:148.84ms step:447/1480 train_time:65052ms step_avg:148.86ms step:448/1480 train_time:65208ms step_avg:148.88ms step:449/1480 train_time:65367ms step_avg:148.90ms step:450/1480 train_time:65526ms step_avg:148.92ms step:451/1480 train_time:65684ms step_avg:148.94ms step:452/1480 train_time:65840ms step_avg:148.96ms step:453/1480 train_time:65995ms step_avg:148.97ms step:454/1480 train_time:66151ms step_avg:148.99ms step:455/1480 train_time:66306ms step_avg:149.00ms step:456/1480 train_time:66466ms step_avg:149.03ms step:457/1480 
train_time:66623ms step_avg:149.04ms step:458/1480 train_time:66777ms step_avg:149.06ms step:459/1480 train_time:66934ms step_avg:149.07ms step:460/1480 train_time:67090ms step_avg:149.09ms step:461/1480 train_time:67249ms step_avg:149.11ms step:462/1480 train_time:67406ms step_avg:149.13ms step:463/1480 train_time:67564ms step_avg:149.15ms step:464/1480 train_time:67722ms step_avg:149.17ms step:465/1480 train_time:67877ms step_avg:149.18ms step:466/1480 train_time:68034ms step_avg:149.20ms step:467/1480 train_time:68191ms step_avg:149.21ms step:468/1480 train_time:68347ms step_avg:149.23ms step:469/1480 train_time:68503ms step_avg:149.24ms step:470/1480 train_time:68662ms step_avg:149.27ms step:471/1480 train_time:68819ms step_avg:149.28ms step:472/1480 train_time:68976ms step_avg:149.30ms step:473/1480 train_time:69132ms step_avg:149.31ms step:474/1480 train_time:69288ms step_avg:149.33ms step:475/1480 train_time:69445ms step_avg:149.34ms step:476/1480 train_time:69601ms step_avg:149.36ms step:477/1480 train_time:69759ms step_avg:149.38ms step:478/1480 train_time:69914ms step_avg:149.39ms step:479/1480 train_time:70070ms step_avg:149.40ms step:480/1480 train_time:70228ms step_avg:149.42ms step:481/1480 train_time:70385ms step_avg:149.44ms step:482/1480 train_time:70543ms step_avg:149.45ms step:483/1480 train_time:70699ms step_avg:149.47ms step:484/1480 train_time:70856ms step_avg:149.49ms step:485/1480 train_time:71013ms step_avg:149.50ms step:486/1480 train_time:71170ms step_avg:149.52ms step:487/1480 train_time:71327ms step_avg:149.53ms step:488/1480 train_time:71485ms step_avg:149.55ms step:489/1480 train_time:71641ms step_avg:149.56ms step:490/1480 train_time:71797ms step_avg:149.58ms step:491/1480 train_time:71953ms step_avg:149.59ms step:492/1480 train_time:72110ms step_avg:149.60ms step:493/1480 train_time:72268ms step_avg:149.62ms step:494/1480 train_time:72427ms step_avg:149.64ms step:495/1480 train_time:72585ms step_avg:149.66ms step:496/1480 
train_time:72742ms step_avg:149.68ms step:497/1480 train_time:72899ms step_avg:149.69ms step:498/1480 train_time:73056ms step_avg:149.70ms step:499/1480 train_time:73213ms step_avg:149.72ms step:500/1480 train_time:73371ms step_avg:149.74ms step:500/1480 val_loss:3.6850 train_time:73433ms step_avg:149.86ms step:501/1480 train_time:73533ms step_avg:149.76ms step:502/1480 train_time:73691ms step_avg:149.78ms step:503/1480 train_time:73849ms step_avg:149.80ms step:504/1480 train_time:74005ms step_avg:149.81ms step:505/1480 train_time:74158ms step_avg:149.81ms step:506/1480 train_time:74314ms step_avg:149.83ms step:507/1480 train_time:74471ms step_avg:149.84ms step:508/1480 train_time:74630ms step_avg:149.86ms step:509/1480 train_time:74787ms step_avg:149.87ms step:510/1480 train_time:74943ms step_avg:149.89ms step:511/1480 train_time:75098ms step_avg:149.90ms step:512/1480 train_time:75256ms step_avg:149.91ms step:513/1480 train_time:75412ms step_avg:149.92ms step:514/1480 train_time:75569ms step_avg:149.94ms step:515/1480 train_time:75726ms step_avg:149.95ms step:516/1480 train_time:75883ms step_avg:149.97ms step:517/1480 train_time:76039ms step_avg:149.98ms step:518/1480 train_time:76196ms step_avg:149.99ms step:519/1480 train_time:76352ms step_avg:150.00ms step:520/1480 train_time:76511ms step_avg:150.02ms step:521/1480 train_time:76669ms step_avg:150.04ms step:522/1480 train_time:76826ms step_avg:150.05ms step:523/1480 train_time:76983ms step_avg:150.06ms step:524/1480 train_time:77139ms step_avg:150.08ms step:525/1480 train_time:77295ms step_avg:150.09ms step:526/1480 train_time:77454ms step_avg:150.10ms step:527/1480 train_time:77612ms step_avg:150.12ms step:528/1480 train_time:77770ms step_avg:150.14ms step:529/1480 train_time:77929ms step_avg:150.15ms step:530/1480 train_time:78087ms step_avg:150.17ms step:531/1480 train_time:78244ms step_avg:150.18ms step:532/1480 train_time:78399ms step_avg:150.19ms step:533/1480 train_time:78555ms step_avg:150.20ms 
step:534/1480 train_time:78712ms step_avg:150.21ms step:535/1480 train_time:78872ms step_avg:150.23ms step:536/1480 train_time:79031ms step_avg:150.25ms step:537/1480 train_time:79189ms step_avg:150.26ms step:538/1480 train_time:79348ms step_avg:150.28ms step:539/1480 train_time:79507ms step_avg:150.30ms step:540/1480 train_time:79663ms step_avg:150.31ms step:541/1480 train_time:79818ms step_avg:150.32ms step:542/1480 train_time:79974ms step_avg:150.33ms step:543/1480 train_time:80130ms step_avg:150.34ms step:544/1480 train_time:80288ms step_avg:150.35ms step:545/1480 train_time:80446ms step_avg:150.37ms step:546/1480 train_time:80602ms step_avg:150.38ms step:547/1480 train_time:80758ms step_avg:150.39ms step:548/1480 train_time:80916ms step_avg:150.40ms step:549/1480 train_time:81071ms step_avg:150.41ms step:550/1480 train_time:81230ms step_avg:150.43ms step:551/1480 train_time:81390ms step_avg:150.44ms step:552/1480 train_time:81551ms step_avg:150.46ms step:553/1480 train_time:81712ms step_avg:150.48ms step:554/1480 train_time:81873ms step_avg:150.50ms step:555/1480 train_time:82034ms step_avg:150.52ms step:556/1480 train_time:82192ms step_avg:150.53ms step:557/1480 train_time:82353ms step_avg:150.55ms step:558/1480 train_time:82512ms step_avg:150.57ms step:559/1480 train_time:82672ms step_avg:150.59ms step:560/1480 train_time:82833ms step_avg:150.61ms step:561/1480 train_time:82992ms step_avg:150.62ms step:562/1480 train_time:83152ms step_avg:150.64ms step:563/1480 train_time:83311ms step_avg:150.65ms step:564/1480 train_time:83471ms step_avg:150.67ms step:565/1480 train_time:83631ms step_avg:150.69ms step:566/1480 train_time:83792ms step_avg:150.70ms step:567/1480 train_time:83952ms step_avg:150.72ms step:568/1480 train_time:84111ms step_avg:150.74ms step:569/1480 train_time:84270ms step_avg:150.75ms step:570/1480 train_time:84430ms step_avg:150.77ms step:571/1480 train_time:84590ms step_avg:150.78ms step:572/1480 train_time:84750ms step_avg:150.80ms 
step:573/1480 train_time:84910ms step_avg:150.82ms step:574/1480 train_time:85072ms step_avg:150.84ms step:575/1480 train_time:85233ms step_avg:150.85ms step:576/1480 train_time:85393ms step_avg:150.87ms step:577/1480 train_time:85554ms step_avg:150.89ms step:578/1480 train_time:85713ms step_avg:150.90ms step:579/1480 train_time:85872ms step_avg:150.92ms step:580/1480 train_time:86032ms step_avg:150.93ms step:581/1480 train_time:86193ms step_avg:150.95ms step:582/1480 train_time:86354ms step_avg:150.97ms step:583/1480 train_time:86515ms step_avg:150.99ms step:584/1480 train_time:86674ms step_avg:151.00ms step:585/1480 train_time:86833ms step_avg:151.01ms step:586/1480 train_time:86993ms step_avg:151.03ms step:587/1480 train_time:87153ms step_avg:151.05ms step:588/1480 train_time:87312ms step_avg:151.06ms step:589/1480 train_time:87473ms step_avg:151.08ms step:590/1480 train_time:87634ms step_avg:151.09ms step:591/1480 train_time:87792ms step_avg:151.10ms step:592/1480 train_time:87953ms step_avg:151.12ms step:593/1480 train_time:88114ms step_avg:151.14ms step:594/1480 train_time:88274ms step_avg:151.15ms step:595/1480 train_time:88435ms step_avg:151.17ms step:596/1480 train_time:88596ms step_avg:151.19ms step:597/1480 train_time:88755ms step_avg:151.20ms step:598/1480 train_time:88912ms step_avg:151.21ms step:599/1480 train_time:89071ms step_avg:151.22ms step:600/1480 train_time:89230ms step_avg:151.24ms step:601/1480 train_time:89390ms step_avg:151.25ms step:602/1480 train_time:89549ms step_avg:151.27ms step:603/1480 train_time:89710ms step_avg:151.28ms step:604/1480 train_time:89870ms step_avg:151.30ms step:605/1480 train_time:90029ms step_avg:151.31ms step:606/1480 train_time:90190ms step_avg:151.33ms step:607/1480 train_time:90352ms step_avg:151.34ms step:608/1480 train_time:90512ms step_avg:151.36ms step:609/1480 train_time:90672ms step_avg:151.37ms step:610/1480 train_time:90832ms step_avg:151.39ms step:611/1480 train_time:90993ms step_avg:151.40ms 
step:612/1480 train_time:91154ms step_avg:151.42ms step:613/1480 train_time:91315ms step_avg:151.43ms step:614/1480 train_time:91474ms step_avg:151.45ms step:615/1480 train_time:91632ms step_avg:151.46ms step:616/1480 train_time:91791ms step_avg:151.47ms step:617/1480 train_time:91952ms step_avg:151.49ms step:618/1480 train_time:92112ms step_avg:151.50ms step:619/1480 train_time:92272ms step_avg:151.51ms step:620/1480 train_time:92433ms step_avg:151.53ms step:621/1480 train_time:92592ms step_avg:151.54ms step:622/1480 train_time:92754ms step_avg:151.56ms step:623/1480 train_time:92914ms step_avg:151.57ms step:624/1480 train_time:93074ms step_avg:151.59ms step:625/1480 train_time:93233ms step_avg:151.60ms step:625/1480 val_loss:3.6038 train_time:93296ms step_avg:151.70ms step:626/1480 train_time:93396ms step_avg:151.62ms step:627/1480 train_time:93556ms step_avg:151.63ms step:628/1480 train_time:93714ms step_avg:151.64ms step:629/1480 train_time:93873ms step_avg:151.65ms step:630/1480 train_time:94030ms step_avg:151.66ms step:631/1480 train_time:94187ms step_avg:151.67ms step:632/1480 train_time:94346ms step_avg:151.68ms step:633/1480 train_time:94507ms step_avg:151.70ms step:634/1480 train_time:94667ms step_avg:151.71ms step:635/1480 train_time:94826ms step_avg:151.72ms step:636/1480 train_time:94984ms step_avg:151.73ms step:637/1480 train_time:95145ms step_avg:151.75ms step:638/1480 train_time:95303ms step_avg:151.76ms step:639/1480 train_time:95462ms step_avg:151.77ms step:640/1480 train_time:95622ms step_avg:151.78ms step:641/1480 train_time:95782ms step_avg:151.79ms step:642/1480 train_time:95942ms step_avg:151.81ms step:643/1480 train_time:96102ms step_avg:151.82ms step:644/1480 train_time:96260ms step_avg:151.83ms step:645/1480 train_time:96421ms step_avg:151.84ms step:646/1480 train_time:96581ms step_avg:151.86ms step:647/1480 train_time:96741ms step_avg:151.87ms step:648/1480 train_time:96903ms step_avg:151.89ms step:649/1480 train_time:97062ms 
step_avg:151.90ms step:650/1480 train_time:97223ms step_avg:151.91ms step:651/1480 train_time:97383ms step_avg:151.92ms step:652/1480 train_time:97542ms step_avg:151.94ms step:653/1480 train_time:97701ms step_avg:151.95ms step:654/1480 train_time:97860ms step_avg:151.96ms step:655/1480 train_time:98021ms step_avg:151.97ms step:656/1480 train_time:98181ms step_avg:151.98ms step:657/1480 train_time:98342ms step_avg:152.00ms step:658/1480 train_time:98501ms step_avg:152.01ms step:659/1480 train_time:98662ms step_avg:152.02ms step:660/1480 train_time:98825ms step_avg:152.04ms step:661/1480 train_time:98987ms step_avg:152.05ms step:662/1480 train_time:99146ms step_avg:152.06ms step:663/1480 train_time:99304ms step_avg:152.07ms step:664/1480 train_time:99465ms step_avg:152.09ms step:665/1480 train_time:99628ms step_avg:152.10ms step:666/1480 train_time:99787ms step_avg:152.11ms step:667/1480 train_time:99948ms step_avg:152.13ms step:668/1480 train_time:100109ms step_avg:152.14ms step:669/1480 train_time:100270ms step_avg:152.15ms step:670/1480 train_time:100429ms step_avg:152.17ms step:671/1480 train_time:100592ms step_avg:152.18ms step:672/1480 train_time:100753ms step_avg:152.20ms step:673/1480 train_time:100915ms step_avg:152.21ms step:674/1480 train_time:101079ms step_avg:152.23ms step:675/1480 train_time:101243ms step_avg:152.24ms step:676/1480 train_time:101405ms step_avg:152.26ms step:677/1480 train_time:101565ms step_avg:152.27ms step:678/1480 train_time:101727ms step_avg:152.29ms step:679/1480 train_time:101889ms step_avg:152.30ms step:680/1480 train_time:102051ms step_avg:152.32ms step:681/1480 train_time:102212ms step_avg:152.33ms step:682/1480 train_time:102375ms step_avg:152.34ms step:683/1480 train_time:102536ms step_avg:152.36ms step:684/1480 train_time:102698ms step_avg:152.37ms step:685/1480 train_time:102860ms step_avg:152.39ms step:686/1480 train_time:103023ms step_avg:152.40ms step:687/1480 train_time:103184ms step_avg:152.41ms step:688/1480 
train_time:103347ms step_avg:152.43ms step:689/1480 train_time:103510ms step_avg:152.44ms step:690/1480 train_time:103672ms step_avg:152.46ms step:691/1480 train_time:103832ms step_avg:152.47ms step:692/1480 train_time:103992ms step_avg:152.48ms step:693/1480 train_time:104154ms step_avg:152.49ms step:694/1480 train_time:104316ms step_avg:152.51ms step:695/1480 train_time:104480ms step_avg:152.52ms step:696/1480 train_time:104642ms step_avg:152.54ms step:697/1480 train_time:104806ms step_avg:152.56ms step:698/1480 train_time:104966ms step_avg:152.57ms step:699/1480 train_time:105128ms step_avg:152.58ms step:700/1480 train_time:105290ms step_avg:152.59ms step:701/1480 train_time:105448ms step_avg:152.60ms step:702/1480 train_time:105609ms step_avg:152.61ms step:703/1480 train_time:105768ms step_avg:152.62ms step:704/1480 train_time:105929ms step_avg:152.63ms step:705/1480 train_time:106092ms step_avg:152.65ms step:706/1480 train_time:106258ms step_avg:152.67ms step:707/1480 train_time:106420ms step_avg:152.68ms step:708/1480 train_time:106582ms step_avg:152.70ms step:709/1480 train_time:106744ms step_avg:152.71ms step:710/1480 train_time:106905ms step_avg:152.72ms step:711/1480 train_time:107066ms step_avg:152.73ms step:712/1480 train_time:107230ms step_avg:152.75ms step:713/1480 train_time:107393ms step_avg:152.76ms step:714/1480 train_time:107555ms step_avg:152.78ms step:715/1480 train_time:107715ms step_avg:152.79ms step:716/1480 train_time:107877ms step_avg:152.80ms step:717/1480 train_time:108040ms step_avg:152.81ms step:718/1480 train_time:108200ms step_avg:152.82ms step:719/1480 train_time:108360ms step_avg:152.84ms step:720/1480 train_time:108524ms step_avg:152.85ms step:721/1480 train_time:108685ms step_avg:152.86ms step:722/1480 train_time:108848ms step_avg:152.88ms step:723/1480 train_time:109008ms step_avg:152.89ms step:724/1480 train_time:109169ms step_avg:152.90ms step:725/1480 train_time:109331ms step_avg:152.91ms step:726/1480 train_time:109495ms 
step_avg:152.93ms step:727/1480 train_time:109659ms step_avg:152.94ms step:728/1480 train_time:109820ms step_avg:152.95ms step:729/1480 train_time:109983ms step_avg:152.97ms step:730/1480 train_time:110146ms step_avg:152.98ms step:731/1480 train_time:110306ms step_avg:152.99ms step:732/1480 train_time:110466ms step_avg:153.00ms step:733/1480 train_time:110627ms step_avg:153.01ms step:734/1480 train_time:110788ms step_avg:153.02ms step:735/1480 train_time:110949ms step_avg:153.03ms step:736/1480 train_time:111113ms step_avg:153.05ms step:737/1480 train_time:111275ms step_avg:153.06ms step:738/1480 train_time:111438ms step_avg:153.07ms step:739/1480 train_time:111600ms step_avg:153.09ms step:740/1480 train_time:111765ms step_avg:153.10ms step:741/1480 train_time:111926ms step_avg:153.11ms step:742/1480 train_time:112089ms step_avg:153.13ms step:743/1480 train_time:112250ms step_avg:153.14ms step:744/1480 train_time:112414ms step_avg:153.15ms step:745/1480 train_time:112579ms step_avg:153.17ms step:746/1480 train_time:112740ms step_avg:153.18ms step:747/1480 train_time:112902ms step_avg:153.19ms step:748/1480 train_time:113068ms step_avg:153.21ms step:749/1480 train_time:113230ms step_avg:153.22ms step:750/1480 train_time:113389ms step_avg:153.23ms step:750/1480 val_loss:3.5479 train_time:113452ms step_avg:153.31ms step:751/1480 train_time:113555ms step_avg:153.25ms step:752/1480 train_time:113717ms step_avg:153.26ms step:753/1480 train_time:113880ms step_avg:153.27ms step:754/1480 train_time:114041ms step_avg:153.28ms step:755/1480 train_time:114203ms step_avg:153.29ms step:756/1480 train_time:114364ms step_avg:153.30ms step:757/1480 train_time:114528ms step_avg:153.32ms step:758/1480 train_time:114688ms step_avg:153.33ms step:759/1480 train_time:114851ms step_avg:153.34ms step:760/1480 train_time:115012ms step_avg:153.35ms step:761/1480 train_time:115177ms step_avg:153.36ms step:762/1480 train_time:115339ms step_avg:153.38ms step:763/1480 train_time:115502ms 
step_avg:153.39ms step:764/1480 train_time:115664ms step_avg:153.40ms step:765/1480 train_time:115825ms step_avg:153.41ms step:766/1480 train_time:115986ms step_avg:153.42ms step:767/1480 train_time:116147ms step_avg:153.43ms step:768/1480 train_time:116309ms step_avg:153.44ms step:769/1480 train_time:116473ms step_avg:153.46ms step:770/1480 train_time:116637ms step_avg:153.47ms step:771/1480 train_time:116800ms step_avg:153.48ms step:772/1480 train_time:116962ms step_avg:153.49ms step:773/1480 train_time:117125ms step_avg:153.51ms step:774/1480 train_time:117287ms step_avg:153.52ms step:775/1480 train_time:117450ms step_avg:153.53ms step:776/1480 train_time:117615ms step_avg:153.54ms step:777/1480 train_time:117782ms step_avg:153.56ms step:778/1480 train_time:117946ms step_avg:153.58ms step:779/1480 train_time:118108ms step_avg:153.59ms step:780/1480 train_time:118271ms step_avg:153.60ms step:781/1480 train_time:118435ms step_avg:153.61ms step:782/1480 train_time:118600ms step_avg:153.63ms step:783/1480 train_time:118761ms step_avg:153.64ms step:784/1480 train_time:118925ms step_avg:153.65ms step:785/1480 train_time:119086ms step_avg:153.66ms step:786/1480 train_time:119252ms step_avg:153.68ms step:787/1480 train_time:119416ms step_avg:153.69ms step:788/1480 train_time:119580ms step_avg:153.70ms step:789/1480 train_time:119743ms step_avg:153.71ms step:790/1480 train_time:119908ms step_avg:153.73ms step:791/1480 train_time:120076ms step_avg:153.75ms step:792/1480 train_time:120240ms step_avg:153.76ms step:793/1480 train_time:120402ms step_avg:153.77ms step:794/1480 train_time:120567ms step_avg:153.78ms step:795/1480 train_time:120733ms step_avg:153.80ms step:796/1480 train_time:120899ms step_avg:153.82ms step:797/1480 train_time:121063ms step_avg:153.83ms step:798/1480 train_time:121226ms step_avg:153.84ms step:799/1480 train_time:121391ms step_avg:153.85ms step:800/1480 train_time:121555ms step_avg:153.87ms step:801/1480 train_time:121719ms step_avg:153.88ms 
step:802/1480 train_time:121887ms step_avg:153.90ms step:803/1480 train_time:122050ms step_avg:153.91ms step:804/1480 train_time:122214ms step_avg:153.92ms step:805/1480 train_time:122379ms step_avg:153.94ms step:806/1480 train_time:122540ms step_avg:153.94ms step:807/1480 train_time:122701ms step_avg:153.95ms step:808/1480 train_time:122865ms step_avg:153.97ms step:809/1480 train_time:123026ms step_avg:153.97ms step:810/1480 train_time:123186ms step_avg:153.98ms step:811/1480 train_time:123349ms step_avg:153.99ms step:812/1480 train_time:123512ms step_avg:154.01ms step:813/1480 train_time:123674ms step_avg:154.01ms step:814/1480 train_time:123837ms step_avg:154.03ms step:815/1480 train_time:124000ms step_avg:154.04ms step:816/1480 train_time:124165ms step_avg:154.05ms step:817/1480 train_time:124326ms step_avg:154.06ms step:818/1480 train_time:124486ms step_avg:154.07ms step:819/1480 train_time:124651ms step_avg:154.08ms step:820/1480 train_time:124815ms step_avg:154.09ms step:821/1480 train_time:124978ms step_avg:154.10ms step:822/1480 train_time:125142ms step_avg:154.12ms step:823/1480 train_time:125305ms step_avg:154.13ms step:824/1480 train_time:125467ms step_avg:154.14ms step:825/1480 train_time:125632ms step_avg:154.15ms step:826/1480 train_time:125798ms step_avg:154.16ms step:827/1480 train_time:125963ms step_avg:154.18ms step:828/1480 train_time:126125ms step_avg:154.19ms step:829/1480 train_time:126287ms step_avg:154.20ms step:830/1480 train_time:126452ms step_avg:154.21ms step:831/1480 train_time:126615ms step_avg:154.22ms step:832/1480 train_time:126779ms step_avg:154.23ms step:833/1480 train_time:126944ms step_avg:154.25ms step:834/1480 train_time:127108ms step_avg:154.26ms step:835/1480 train_time:127272ms step_avg:154.27ms step:836/1480 train_time:127438ms step_avg:154.28ms step:837/1480 train_time:127600ms step_avg:154.29ms step:838/1480 train_time:127763ms step_avg:154.30ms step:839/1480 train_time:127925ms step_avg:154.31ms step:840/1480 
train_time:128085ms step_avg:154.32ms step:841/1480 train_time:128245ms step_avg:154.33ms step:842/1480 train_time:128408ms step_avg:154.34ms step:843/1480 train_time:128570ms step_avg:154.35ms step:844/1480 train_time:128733ms step_avg:154.36ms step:845/1480 train_time:128898ms step_avg:154.37ms step:846/1480 train_time:129063ms step_avg:154.38ms step:847/1480 train_time:129225ms step_avg:154.39ms step:848/1480 train_time:129387ms step_avg:154.40ms step:849/1480 train_time:129550ms step_avg:154.41ms step:850/1480 train_time:129714ms step_avg:154.42ms step:851/1480 train_time:129878ms step_avg:154.43ms step:852/1480 train_time:130042ms step_avg:154.44ms step:853/1480 train_time:130204ms step_avg:154.45ms step:854/1480 train_time:130367ms step_avg:154.46ms step:855/1480 train_time:130530ms step_avg:154.47ms step:856/1480 train_time:130692ms step_avg:154.48ms step:857/1480 train_time:130858ms step_avg:154.50ms step:858/1480 train_time:131023ms step_avg:154.51ms step:859/1480 train_time:131185ms step_avg:154.52ms step:860/1480 train_time:131346ms step_avg:154.52ms step:861/1480 train_time:131511ms step_avg:154.54ms step:862/1480 train_time:131680ms step_avg:154.55ms step:863/1480 train_time:131847ms step_avg:154.57ms step:864/1480 train_time:132010ms step_avg:154.58ms step:865/1480 train_time:132172ms step_avg:154.59ms step:866/1480 train_time:132340ms step_avg:154.60ms step:867/1480 train_time:132503ms step_avg:154.61ms step:868/1480 train_time:132663ms step_avg:154.62ms step:869/1480 train_time:132825ms step_avg:154.63ms step:870/1480 train_time:132989ms step_avg:154.64ms step:871/1480 train_time:133154ms step_avg:154.65ms step:872/1480 train_time:133318ms step_avg:154.66ms step:873/1480 train_time:133482ms step_avg:154.67ms step:874/1480 train_time:133649ms step_avg:154.69ms step:875/1480 train_time:133813ms step_avg:154.70ms step:875/1480 val_loss:3.5027 train_time:133878ms step_avg:154.77ms step:876/1480 train_time:133979ms step_avg:154.71ms step:877/1480 
train_time:134143ms step_avg:154.72ms step:878/1480 train_time:134307ms step_avg:154.73ms step:879/1480 train_time:134470ms step_avg:154.74ms step:880/1480 train_time:134634ms step_avg:154.75ms step:881/1480 train_time:134796ms step_avg:154.76ms step:882/1480 train_time:134960ms step_avg:154.77ms step:883/1480 train_time:135126ms step_avg:154.78ms step:884/1480 train_time:135293ms step_avg:154.80ms step:885/1480 train_time:135456ms step_avg:154.81ms step:886/1480 train_time:135623ms step_avg:154.82ms step:887/1480 train_time:135789ms step_avg:154.83ms step:888/1480 train_time:135964ms step_avg:154.86ms step:889/1480 train_time:136132ms step_avg:154.87ms step:890/1480 train_time:136294ms step_avg:154.88ms step:891/1480 train_time:136459ms step_avg:154.89ms step:892/1480 train_time:136624ms step_avg:154.90ms step:893/1480 train_time:136788ms step_avg:154.91ms step:894/1480 train_time:136957ms step_avg:154.93ms step:895/1480 train_time:137123ms step_avg:154.94ms step:896/1480 train_time:137290ms step_avg:154.95ms step:897/1480 train_time:137456ms step_avg:154.97ms step:898/1480 train_time:137624ms step_avg:154.98ms step:899/1480 train_time:137789ms step_avg:154.99ms step:900/1480 train_time:137953ms step_avg:155.00ms step:901/1480 train_time:138117ms step_avg:155.01ms step:902/1480 train_time:138280ms step_avg:155.02ms step:903/1480 train_time:138453ms step_avg:155.04ms step:904/1480 train_time:138618ms step_avg:155.05ms step:905/1480 train_time:138780ms step_avg:155.06ms step:906/1480 train_time:138948ms step_avg:155.08ms step:907/1480 train_time:139115ms step_avg:155.09ms step:908/1480 train_time:139278ms step_avg:155.10ms step:909/1480 train_time:139442ms step_avg:155.11ms step:910/1480 train_time:139613ms step_avg:155.13ms step:911/1480 train_time:139777ms step_avg:155.14ms step:912/1480 train_time:139942ms step_avg:155.15ms step:913/1480 train_time:140109ms step_avg:155.16ms step:914/1480 train_time:140276ms step_avg:155.17ms step:915/1480 train_time:140447ms 
step_avg:155.19ms step:916/1480 train_time:140611ms step_avg:155.20ms step:917/1480 train_time:140775ms step_avg:155.21ms step:918/1480 train_time:140941ms step_avg:155.22ms step:919/1480 train_time:141111ms step_avg:155.24ms step:920/1480 train_time:141275ms step_avg:155.25ms step:921/1480 train_time:141440ms step_avg:155.26ms step:922/1480 train_time:141609ms step_avg:155.27ms step:923/1480 train_time:141772ms step_avg:155.28ms step:924/1480 train_time:141936ms step_avg:155.29ms step:925/1480 train_time:142101ms step_avg:155.30ms step:926/1480 train_time:142264ms step_avg:155.31ms step:927/1480 train_time:142429ms step_avg:155.32ms step:928/1480 train_time:142593ms step_avg:155.33ms step:929/1480 train_time:142757ms step_avg:155.34ms step:930/1480 train_time:142923ms step_avg:155.35ms step:931/1480 train_time:143087ms step_avg:155.36ms step:932/1480 train_time:143252ms step_avg:155.37ms step:933/1480 train_time:143420ms step_avg:155.38ms step:934/1480 train_time:143587ms step_avg:155.40ms step:935/1480 train_time:143759ms step_avg:155.41ms step:936/1480 train_time:143926ms step_avg:155.43ms step:937/1480 train_time:144095ms step_avg:155.44ms step:938/1480 train_time:144256ms step_avg:155.45ms step:939/1480 train_time:144426ms step_avg:155.46ms step:940/1480 train_time:144592ms step_avg:155.48ms step:941/1480 train_time:144756ms step_avg:155.48ms step:942/1480 train_time:144921ms step_avg:155.49ms step:943/1480 train_time:145090ms step_avg:155.51ms step:944/1480 train_time:145262ms step_avg:155.53ms step:945/1480 train_time:145426ms step_avg:155.54ms step:946/1480 train_time:145595ms step_avg:155.55ms step:947/1480 train_time:145762ms step_avg:155.56ms step:948/1480 train_time:145929ms step_avg:155.57ms step:949/1480 train_time:146094ms step_avg:155.58ms step:950/1480 train_time:146257ms step_avg:155.59ms step:951/1480 train_time:146427ms step_avg:155.61ms step:952/1480 train_time:146592ms step_avg:155.62ms step:953/1480 train_time:146760ms step_avg:155.63ms 
step:954/1480 train_time:146929ms step_avg:155.65ms step:955/1480 train_time:147092ms step_avg:155.65ms step:956/1480 train_time:147257ms step_avg:155.66ms step:957/1480 train_time:147425ms step_avg:155.68ms step:958/1480 train_time:147594ms step_avg:155.69ms step:959/1480 train_time:147758ms step_avg:155.70ms step:960/1480 train_time:147926ms step_avg:155.71ms step:961/1480 train_time:148092ms step_avg:155.72ms step:962/1480 train_time:148256ms step_avg:155.73ms step:963/1480 train_time:148423ms step_avg:155.74ms step:964/1480 train_time:148592ms step_avg:155.76ms step:965/1480 train_time:148756ms step_avg:155.77ms step:966/1480 train_time:148922ms step_avg:155.78ms step:967/1480 train_time:149086ms step_avg:155.78ms step:968/1480 train_time:149250ms step_avg:155.79ms step:969/1480 train_time:149417ms step_avg:155.81ms step:970/1480 train_time:149579ms step_avg:155.81ms step:971/1480 train_time:149744ms step_avg:155.82ms step:972/1480 train_time:149910ms step_avg:155.83ms step:973/1480 train_time:150073ms step_avg:155.84ms step:974/1480 train_time:150242ms step_avg:155.85ms step:975/1480 train_time:150409ms step_avg:155.86ms step:976/1480 train_time:150574ms step_avg:155.87ms step:977/1480 train_time:150737ms step_avg:155.88ms step:978/1480 train_time:150904ms step_avg:155.89ms step:979/1480 train_time:151070ms step_avg:155.90ms step:980/1480 train_time:151236ms step_avg:155.91ms step:981/1480 train_time:151407ms step_avg:155.93ms step:982/1480 train_time:151569ms step_avg:155.94ms step:983/1480 train_time:151736ms step_avg:155.95ms step:984/1480 train_time:151900ms step_avg:155.95ms step:985/1480 train_time:152067ms step_avg:155.97ms step:986/1480 train_time:152233ms step_avg:155.98ms step:987/1480 train_time:152396ms step_avg:155.98ms step:988/1480 train_time:152562ms step_avg:155.99ms step:989/1480 train_time:152728ms step_avg:156.00ms step:990/1480 train_time:152897ms step_avg:156.02ms step:991/1480 train_time:153065ms step_avg:156.03ms step:992/1480 
train_time:153239ms step_avg:156.05ms step:993/1480 train_time:153417ms step_avg:156.07ms step:994/1480 train_time:153582ms step_avg:156.08ms step:995/1480 train_time:153746ms step_avg:156.09ms step:996/1480 train_time:153910ms step_avg:156.10ms step:997/1480 train_time:154075ms step_avg:156.10ms step:998/1480 train_time:154237ms step_avg:156.11ms step:999/1480 train_time:154405ms step_avg:156.12ms step:1000/1480 train_time:154573ms step_avg:156.13ms step:1000/1480 val_loss:3.4395 train_time:154642ms step_avg:156.20ms step:1001/1480 train_time:154744ms step_avg:156.15ms step:1002/1480 train_time:154909ms step_avg:156.16ms step:1003/1480 train_time:155081ms step_avg:156.17ms step:1004/1480 train_time:155249ms step_avg:156.19ms step:1005/1480 train_time:155416ms step_avg:156.20ms step:1006/1480 train_time:155584ms step_avg:156.21ms step:1007/1480 train_time:155751ms step_avg:156.22ms step:1008/1480 train_time:155919ms step_avg:156.23ms step:1009/1480 train_time:156093ms step_avg:156.25ms step:1010/1480 train_time:156259ms step_avg:156.26ms step:1011/1480 train_time:156422ms step_avg:156.27ms step:1012/1480 train_time:156587ms step_avg:156.27ms step:1013/1480 train_time:156758ms step_avg:156.29ms step:1014/1480 train_time:156924ms step_avg:156.30ms step:1015/1480 train_time:157095ms step_avg:156.31ms step:1016/1480 train_time:157263ms step_avg:156.33ms step:1017/1480 train_time:157433ms step_avg:156.34ms step:1018/1480 train_time:157600ms step_avg:156.35ms step:1019/1480 train_time:157768ms step_avg:156.36ms step:1020/1480 train_time:157939ms step_avg:156.38ms step:1021/1480 train_time:158105ms step_avg:156.38ms step:1022/1480 train_time:158271ms step_avg:156.39ms step:1023/1480 train_time:158439ms step_avg:156.41ms step:1024/1480 train_time:158606ms step_avg:156.42ms step:1025/1480 train_time:158777ms step_avg:156.43ms step:1026/1480 train_time:158943ms step_avg:156.44ms step:1027/1480 train_time:159108ms step_avg:156.45ms step:1028/1480 train_time:159281ms 
step_avg:156.46ms step:1029/1480 train_time:159457ms step_avg:156.48ms step:1030/1480 train_time:159624ms step_avg:156.49ms step:1031/1480 train_time:159787ms step_avg:156.50ms step:1032/1480 train_time:159960ms step_avg:156.52ms step:1033/1480 train_time:160126ms step_avg:156.53ms step:1034/1480 train_time:160294ms step_avg:156.54ms step:1035/1480 train_time:160462ms step_avg:156.55ms step:1036/1480 train_time:160628ms step_avg:156.56ms step:1037/1480 train_time:160796ms step_avg:156.57ms step:1038/1480 train_time:160964ms step_avg:156.58ms step:1039/1480 train_time:161134ms step_avg:156.59ms step:1040/1480 train_time:161301ms step_avg:156.60ms step:1041/1480 train_time:161468ms step_avg:156.61ms step:1042/1480 train_time:161632ms step_avg:156.62ms step:1043/1480 train_time:161797ms step_avg:156.63ms step:1044/1480 train_time:161962ms step_avg:156.64ms step:1045/1480 train_time:162131ms step_avg:156.65ms step:1046/1480 train_time:162298ms step_avg:156.66ms step:1047/1480 train_time:162466ms step_avg:156.67ms step:1048/1480 train_time:162631ms step_avg:156.68ms step:1049/1480 train_time:162798ms step_avg:156.69ms step:1050/1480 train_time:162966ms step_avg:156.70ms step:1051/1480 train_time:163135ms step_avg:156.71ms step:1052/1480 train_time:163302ms step_avg:156.72ms step:1053/1480 train_time:163468ms step_avg:156.73ms step:1054/1480 train_time:163638ms step_avg:156.74ms step:1055/1480 train_time:163805ms step_avg:156.75ms step:1056/1480 train_time:163971ms step_avg:156.76ms step:1057/1480 train_time:164138ms step_avg:156.77ms step:1058/1480 train_time:164307ms step_avg:156.78ms step:1059/1480 train_time:164481ms step_avg:156.80ms step:1060/1480 train_time:164650ms step_avg:156.81ms step:1061/1480 train_time:164814ms step_avg:156.82ms step:1062/1480 train_time:164981ms step_avg:156.83ms step:1063/1480 train_time:165146ms step_avg:156.83ms step:1064/1480 train_time:165309ms step_avg:156.84ms step:1065/1480 train_time:165476ms step_avg:156.85ms step:1066/1480 
train_time:165644ms step_avg:156.86ms step:1067/1480 train_time:165815ms step_avg:156.87ms step:1068/1480 train_time:165982ms step_avg:156.88ms step:1069/1480 train_time:166154ms step_avg:156.90ms step:1070/1480 train_time:166321ms step_avg:156.91ms step:1071/1480 train_time:166495ms step_avg:156.92ms step:1072/1480 train_time:166661ms step_avg:156.93ms step:1073/1480 train_time:166823ms step_avg:156.94ms step:1074/1480 train_time:166989ms step_avg:156.94ms step:1075/1480 train_time:167160ms step_avg:156.96ms step:1076/1480 train_time:167327ms step_avg:156.97ms step:1077/1480 train_time:167494ms step_avg:156.98ms step:1078/1480 train_time:167669ms step_avg:156.99ms step:1079/1480 train_time:167840ms step_avg:157.01ms step:1080/1480 train_time:168010ms step_avg:157.02ms step:1081/1480 train_time:168177ms step_avg:157.03ms step:1082/1480 train_time:168343ms step_avg:157.04ms step:1083/1480 train_time:168509ms step_avg:157.04ms step:1084/1480 train_time:168675ms step_avg:157.05ms step:1085/1480 train_time:168844ms step_avg:157.06ms step:1086/1480 train_time:169011ms step_avg:157.07ms step:1087/1480 train_time:169177ms step_avg:157.08ms step:1088/1480 train_time:169347ms step_avg:157.09ms step:1089/1480 train_time:169520ms step_avg:157.11ms step:1090/1480 train_time:169692ms step_avg:157.12ms step:1091/1480 train_time:169861ms step_avg:157.13ms step:1092/1480 train_time:170029ms step_avg:157.14ms step:1093/1480 train_time:170198ms step_avg:157.15ms step:1094/1480 train_time:170365ms step_avg:157.16ms step:1095/1480 train_time:170529ms step_avg:157.17ms step:1096/1480 train_time:170698ms step_avg:157.18ms step:1097/1480 train_time:170867ms step_avg:157.19ms step:1098/1480 train_time:171037ms step_avg:157.20ms step:1099/1480 train_time:171207ms step_avg:157.22ms step:1100/1480 train_time:171379ms step_avg:157.23ms step:1101/1480 train_time:171549ms step_avg:157.24ms step:1102/1480 train_time:171721ms step_avg:157.25ms step:1103/1480 train_time:171895ms step_avg:157.27ms 
step:1104/1480 train_time:172064ms step_avg:157.28ms step:1105/1480 train_time:172236ms step_avg:157.29ms step:1106/1480 train_time:172405ms step_avg:157.30ms step:1107/1480 train_time:172574ms step_avg:157.31ms step:1108/1480 train_time:172739ms step_avg:157.32ms step:1109/1480 train_time:172906ms step_avg:157.33ms step:1110/1480 train_time:173072ms step_avg:157.34ms step:1111/1480 train_time:173239ms step_avg:157.35ms step:1112/1480 train_time:173409ms step_avg:157.36ms step:1113/1480 train_time:173590ms step_avg:157.38ms step:1114/1480 train_time:173765ms step_avg:157.40ms step:1115/1480 train_time:173937ms step_avg:157.41ms step:1116/1480 train_time:174102ms step_avg:157.42ms step:1117/1480 train_time:174274ms step_avg:157.43ms step:1118/1480 train_time:174450ms step_avg:157.45ms step:1119/1480 train_time:174617ms step_avg:157.45ms step:1120/1480 train_time:174784ms step_avg:157.46ms step:1121/1480 train_time:174956ms step_avg:157.48ms step:1122/1480 train_time:175122ms step_avg:157.48ms step:1123/1480 train_time:175288ms step_avg:157.49ms step:1124/1480 train_time:175457ms step_avg:157.50ms step:1125/1480 train_time:175625ms step_avg:157.51ms step:1125/1480 val_loss:3.3840 train_time:175693ms step_avg:157.57ms step:1126/1480 train_time:175793ms step_avg:157.52ms step:1127/1480 train_time:175964ms step_avg:157.53ms step:1128/1480 train_time:176132ms step_avg:157.54ms step:1129/1480 train_time:176308ms step_avg:157.56ms step:1130/1480 train_time:176477ms step_avg:157.57ms step:1131/1480 train_time:176654ms step_avg:157.59ms step:1132/1480 train_time:176820ms step_avg:157.59ms step:1133/1480 train_time:176992ms step_avg:157.61ms step:1134/1480 train_time:177162ms step_avg:157.62ms step:1135/1480 train_time:177329ms step_avg:157.63ms step:1136/1480 train_time:177502ms step_avg:157.64ms step:1137/1480 train_time:177672ms step_avg:157.65ms step:1138/1480 train_time:177843ms step_avg:157.66ms step:1139/1480 train_time:178011ms step_avg:157.67ms step:1140/1480 
train_time:178178ms step_avg:157.68ms step:1141/1480 train_time:178349ms step_avg:157.69ms step:1142/1480 train_time:178515ms step_avg:157.70ms step:1143/1480 train_time:178686ms step_avg:157.71ms step:1144/1480 train_time:178854ms step_avg:157.72ms step:1145/1480 train_time:179019ms step_avg:157.73ms step:1146/1480 train_time:179189ms step_avg:157.74ms step:1147/1480 train_time:179361ms step_avg:157.75ms step:1148/1480 train_time:179528ms step_avg:157.76ms step:1149/1480 train_time:179699ms step_avg:157.77ms step:1150/1480 train_time:179869ms step_avg:157.78ms step:1151/1480 train_time:180041ms step_avg:157.79ms step:1152/1480 train_time:180212ms step_avg:157.80ms step:1153/1480 train_time:180385ms step_avg:157.82ms step:1154/1480 train_time:180553ms step_avg:157.83ms step:1155/1480 train_time:180726ms step_avg:157.84ms step:1156/1480 train_time:180907ms step_avg:157.86ms step:1157/1480 train_time:181076ms step_avg:157.87ms step:1158/1480 train_time:181243ms step_avg:157.88ms step:1159/1480 train_time:181411ms step_avg:157.89ms step:1160/1480 train_time:181577ms step_avg:157.89ms step:1161/1480 train_time:181749ms step_avg:157.90ms step:1162/1480 train_time:181918ms step_avg:157.91ms step:1163/1480 train_time:182088ms step_avg:157.93ms step:1164/1480 train_time:182257ms step_avg:157.93ms step:1165/1480 train_time:182423ms step_avg:157.94ms step:1166/1480 train_time:182592ms step_avg:157.95ms step:1167/1480 train_time:182760ms step_avg:157.96ms step:1168/1480 train_time:182926ms step_avg:157.97ms step:1169/1480 train_time:183095ms step_avg:157.98ms step:1170/1480 train_time:183264ms step_avg:157.99ms step:1171/1480 train_time:183431ms step_avg:157.99ms step:1172/1480 train_time:183598ms step_avg:158.00ms step:1173/1480 train_time:183771ms step_avg:158.01ms step:1174/1480 train_time:183952ms step_avg:158.03ms step:1175/1480 train_time:184124ms step_avg:158.05ms step:1176/1480 train_time:184295ms step_avg:158.06ms step:1177/1480 train_time:184472ms step_avg:158.07ms 
step:1178/1480 train_time:184640ms step_avg:158.08ms step:1179/1480 train_time:184806ms step_avg:158.09ms step:1180/1480 train_time:184987ms step_avg:158.11ms step:1181/1480 train_time:185157ms step_avg:158.12ms step:1182/1480 train_time:185326ms step_avg:158.13ms step:1183/1480 train_time:185497ms step_avg:158.14ms step:1184/1480 train_time:185665ms step_avg:158.15ms step:1185/1480 train_time:185835ms step_avg:158.16ms step:1186/1480 train_time:186006ms step_avg:158.17ms step:1187/1480 train_time:186189ms step_avg:158.19ms step:1188/1480 train_time:186354ms step_avg:158.20ms step:1189/1480 train_time:186526ms step_avg:158.21ms step:1190/1480 train_time:186694ms step_avg:158.22ms step:1191/1480 train_time:186866ms step_avg:158.23ms step:1192/1480 train_time:187031ms step_avg:158.23ms step:1193/1480 train_time:187196ms step_avg:158.24ms step:1194/1480 train_time:187366ms step_avg:158.25ms step:1195/1480 train_time:187539ms step_avg:158.26ms step:1196/1480 train_time:187722ms step_avg:158.28ms step:1197/1480 train_time:187893ms step_avg:158.29ms step:1198/1480 train_time:188075ms step_avg:158.31ms step:1199/1480 train_time:188245ms step_avg:158.32ms step:1200/1480 train_time:188412ms step_avg:158.33ms step:1201/1480 train_time:188579ms step_avg:158.34ms step:1202/1480 train_time:188760ms step_avg:158.36ms step:1203/1480 train_time:188935ms step_avg:158.37ms step:1204/1480 train_time:189109ms step_avg:158.38ms step:1205/1480 train_time:189277ms step_avg:158.39ms step:1206/1480 train_time:189446ms step_avg:158.40ms step:1207/1480 train_time:189614ms step_avg:158.41ms step:1208/1480 train_time:189784ms step_avg:158.42ms step:1209/1480 train_time:189956ms step_avg:158.43ms step:1210/1480 train_time:190131ms step_avg:158.44ms step:1211/1480 train_time:190306ms step_avg:158.46ms step:1212/1480 train_time:190479ms step_avg:158.47ms step:1213/1480 train_time:190652ms step_avg:158.48ms step:1214/1480 train_time:190829ms step_avg:158.50ms step:1215/1480 train_time:191003ms 
step_avg:158.51ms step:1216/1480 train_time:191172ms step_avg:158.52ms step:1217/1480 train_time:191346ms step_avg:158.53ms step:1218/1480 train_time:191516ms step_avg:158.54ms step:1219/1480 train_time:191693ms step_avg:158.56ms step:1220/1480 train_time:191864ms step_avg:158.57ms step:1221/1480 train_time:192033ms step_avg:158.57ms step:1222/1480 train_time:192199ms step_avg:158.58ms step:1223/1480 train_time:192370ms step_avg:158.59ms step:1224/1480 train_time:192549ms step_avg:158.61ms step:1225/1480 train_time:192720ms step_avg:158.62ms step:1226/1480 train_time:192892ms step_avg:158.63ms step:1227/1480 train_time:193065ms step_avg:158.64ms step:1228/1480 train_time:193236ms step_avg:158.65ms step:1229/1480 train_time:193409ms step_avg:158.66ms step:1230/1480 train_time:193590ms step_avg:158.68ms step:1231/1480 train_time:193765ms step_avg:158.69ms step:1232/1480 train_time:193940ms step_avg:158.71ms step:1233/1480 train_time:194110ms step_avg:158.72ms step:1234/1480 train_time:194281ms step_avg:158.73ms step:1235/1480 train_time:194455ms step_avg:158.74ms step:1236/1480 train_time:194624ms step_avg:158.75ms step:1237/1480 train_time:194795ms step_avg:158.76ms step:1238/1480 train_time:194979ms step_avg:158.78ms step:1239/1480 train_time:195150ms step_avg:158.79ms step:1240/1480 train_time:195321ms step_avg:158.80ms step:1241/1480 train_time:195492ms step_avg:158.81ms step:1242/1480 train_time:195662ms step_avg:158.82ms step:1243/1480 train_time:195835ms step_avg:158.83ms step:1244/1480 train_time:196001ms step_avg:158.83ms step:1245/1480 train_time:196171ms step_avg:158.84ms step:1246/1480 train_time:196343ms step_avg:158.85ms step:1247/1480 train_time:196510ms step_avg:158.86ms step:1248/1480 train_time:196681ms step_avg:158.87ms step:1249/1480 train_time:196849ms step_avg:158.88ms step:1250/1480 train_time:197019ms step_avg:158.89ms step:1250/1480 val_loss:3.3344 train_time:197091ms step_avg:158.94ms step:1251/1480 train_time:197201ms step_avg:158.90ms 
step:1252/1480 train_time:197371ms step_avg:158.91ms step:1253/1480 train_time:197539ms step_avg:158.92ms step:1254/1480 train_time:197709ms step_avg:158.93ms step:1255/1480 train_time:197896ms step_avg:158.95ms step:1256/1480 train_time:198071ms step_avg:158.97ms step:1257/1480 train_time:198240ms step_avg:158.97ms step:1258/1480 train_time:198416ms step_avg:158.99ms step:1259/1480 train_time:198588ms step_avg:159.00ms step:1260/1480 train_time:198755ms step_avg:159.00ms step:1261/1480 train_time:198927ms step_avg:159.01ms step:1262/1480 train_time:199104ms step_avg:159.03ms step:1263/1480 train_time:199278ms step_avg:159.04ms step:1264/1480 train_time:199445ms step_avg:159.05ms step:1265/1480 train_time:199613ms step_avg:159.05ms step:1266/1480 train_time:199785ms step_avg:159.06ms step:1267/1480 train_time:199955ms step_avg:159.07ms step:1268/1480 train_time:200127ms step_avg:159.08ms step:1269/1480 train_time:200302ms step_avg:159.10ms step:1270/1480 train_time:200473ms step_avg:159.11ms step:1271/1480 train_time:200642ms step_avg:159.11ms step:1272/1480 train_time:200808ms step_avg:159.12ms step:1273/1480 train_time:200978ms step_avg:159.13ms step:1274/1480 train_time:201152ms step_avg:159.14ms step:1275/1480 train_time:201319ms step_avg:159.15ms step:1276/1480 train_time:201485ms step_avg:159.15ms step:1277/1480 train_time:201656ms step_avg:159.16ms step:1278/1480 train_time:201824ms step_avg:159.17ms step:1279/1480 train_time:201997ms step_avg:159.18ms step:1280/1480 train_time:202175ms step_avg:159.19ms step:1281/1480 train_time:202343ms step_avg:159.20ms step:1282/1480 train_time:202510ms step_avg:159.21ms step:1283/1480 train_time:202680ms step_avg:159.21ms step:1284/1480 train_time:202849ms step_avg:159.22ms step:1285/1480 train_time:203019ms step_avg:159.23ms step:1286/1480 train_time:203189ms step_avg:159.24ms step:1287/1480 train_time:203359ms step_avg:159.25ms step:1288/1480 train_time:203530ms step_avg:159.26ms step:1289/1480 train_time:203715ms 
step_avg:159.28ms step:1290/1480 train_time:203895ms step_avg:159.29ms step:1291/1480 train_time:204068ms step_avg:159.30ms step:1292/1480 train_time:204242ms step_avg:159.32ms step:1293/1480 train_time:204417ms step_avg:159.33ms step:1294/1480 train_time:204590ms step_avg:159.34ms step:1295/1480 train_time:204762ms step_avg:159.35ms step:1296/1480 train_time:204936ms step_avg:159.36ms step:1297/1480 train_time:205108ms step_avg:159.37ms step:1298/1480 train_time:205279ms step_avg:159.38ms step:1299/1480 train_time:205449ms step_avg:159.39ms step:1300/1480 train_time:205617ms step_avg:159.39ms step:1301/1480 train_time:205786ms step_avg:159.40ms step:1302/1480 train_time:205960ms step_avg:159.41ms step:1303/1480 train_time:206136ms step_avg:159.42ms step:1304/1480 train_time:206310ms step_avg:159.44ms step:1305/1480 train_time:206479ms step_avg:159.44ms step:1306/1480 train_time:206653ms step_avg:159.45ms step:1307/1480 train_time:206821ms step_avg:159.46ms step:1308/1480 train_time:206990ms step_avg:159.47ms step:1309/1480 train_time:207162ms step_avg:159.48ms step:1310/1480 train_time:207331ms step_avg:159.49ms step:1311/1480 train_time:207499ms step_avg:159.49ms step:1312/1480 train_time:207674ms step_avg:159.50ms step:1313/1480 train_time:207842ms step_avg:159.51ms step:1314/1480 train_time:208017ms step_avg:159.52ms step:1315/1480 train_time:208187ms step_avg:159.53ms step:1316/1480 train_time:208355ms step_avg:159.54ms step:1317/1480 train_time:208527ms step_avg:159.55ms step:1318/1480 train_time:208707ms step_avg:159.56ms step:1319/1480 train_time:208882ms step_avg:159.57ms step:1320/1480 train_time:209059ms step_avg:159.59ms step:1321/1480 train_time:209230ms step_avg:159.60ms step:1322/1480 train_time:209412ms step_avg:159.61ms step:1323/1480 train_time:209584ms step_avg:159.62ms step:1324/1480 train_time:209758ms step_avg:159.63ms step:1325/1480 train_time:209939ms step_avg:159.65ms step:1326/1480 train_time:210115ms step_avg:159.66ms step:1327/1480 
train_time:210286ms step_avg:159.67ms step:1328/1480 train_time:210456ms step_avg:159.68ms step:1329/1480 train_time:210650ms step_avg:159.70ms step:1330/1480 train_time:210828ms step_avg:159.72ms step:1331/1480 train_time:210998ms step_avg:159.73ms step:1332/1480 train_time:211172ms step_avg:159.74ms step:1333/1480 train_time:211348ms step_avg:159.75ms step:1334/1480 train_time:211519ms step_avg:159.76ms step:1335/1480 train_time:211687ms step_avg:159.76ms step:1336/1480 train_time:211872ms step_avg:159.78ms step:1337/1480 train_time:212048ms step_avg:159.79ms step:1338/1480 train_time:212218ms step_avg:159.80ms step:1339/1480 train_time:212392ms step_avg:159.81ms step:1340/1480 train_time:212563ms step_avg:159.82ms step:1341/1480 train_time:212732ms step_avg:159.83ms step:1342/1480 train_time:212905ms step_avg:159.84ms step:1343/1480 train_time:213077ms step_avg:159.85ms step:1344/1480 train_time:213250ms step_avg:159.86ms step:1345/1480 train_time:213427ms step_avg:159.87ms step:1346/1480 train_time:213597ms step_avg:159.88ms step:1347/1480 train_time:213767ms step_avg:159.89ms step:1348/1480 train_time:213937ms step_avg:159.89ms step:1349/1480 train_time:214107ms step_avg:159.90ms step:1350/1480 train_time:214282ms step_avg:159.91ms step:1351/1480 train_time:214452ms step_avg:159.92ms step:1352/1480 train_time:214621ms step_avg:159.93ms step:1353/1480 train_time:214798ms step_avg:159.94ms step:1354/1480 train_time:214969ms step_avg:159.95ms step:1355/1480 train_time:215138ms step_avg:159.95ms step:1356/1480 train_time:215311ms step_avg:159.96ms step:1357/1480 train_time:215484ms step_avg:159.97ms step:1358/1480 train_time:215657ms step_avg:159.98ms step:1359/1480 train_time:215830ms step_avg:159.99ms step:1360/1480 train_time:216005ms step_avg:160.00ms step:1361/1480 train_time:216182ms step_avg:160.02ms step:1362/1480 train_time:216357ms step_avg:160.03ms step:1363/1480 train_time:216540ms step_avg:160.04ms step:1364/1480 train_time:216710ms step_avg:160.05ms 
step:1365/1480 train_time:216876ms step_avg:160.06ms step:1366/1480 train_time:217049ms step_avg:160.07ms step:1367/1480 train_time:217220ms step_avg:160.07ms step:1368/1480 train_time:217394ms step_avg:160.08ms step:1369/1480 train_time:217575ms step_avg:160.10ms step:1370/1480 train_time:217753ms step_avg:160.11ms step:1371/1480 train_time:217925ms step_avg:160.12ms step:1372/1480 train_time:218105ms step_avg:160.14ms step:1373/1480 train_time:218275ms step_avg:160.14ms step:1374/1480 train_time:218449ms step_avg:160.15ms step:1375/1480 train_time:218620ms step_avg:160.16ms step:1375/1480 val_loss:3.2956 train_time:218688ms step_avg:160.21ms step:1376/1480 train_time:218792ms step_avg:160.17ms step:1377/1480 train_time:218965ms step_avg:160.18ms step:1378/1480 train_time:219132ms step_avg:160.18ms step:1379/1480 train_time:219310ms step_avg:160.20ms step:1380/1480 train_time:219485ms step_avg:160.21ms step:1381/1480 train_time:219666ms step_avg:160.22ms step:1382/1480 train_time:219837ms step_avg:160.23ms step:1383/1480 train_time:220009ms step_avg:160.24ms step:1384/1480 train_time:220185ms step_avg:160.25ms step:1385/1480 train_time:220350ms step_avg:160.25ms step:1386/1480 train_time:220521ms step_avg:160.26ms step:1387/1480 train_time:220692ms step_avg:160.27ms step:1388/1480 train_time:220861ms step_avg:160.28ms step:1389/1480 train_time:221034ms step_avg:160.29ms step:1390/1480 train_time:221203ms step_avg:160.29ms step:1391/1480 train_time:221371ms step_avg:160.30ms step:1392/1480 train_time:221543ms step_avg:160.31ms step:1393/1480 train_time:221715ms step_avg:160.31ms step:1394/1480 train_time:221885ms step_avg:160.32ms step:1395/1480 train_time:222053ms step_avg:160.33ms step:1396/1480 train_time:222223ms step_avg:160.33ms step:1397/1480 train_time:222390ms step_avg:160.34ms step:1398/1480 train_time:222556ms step_avg:160.34ms step:1399/1480 train_time:222726ms step_avg:160.35ms step:1400/1480 train_time:222904ms step_avg:160.36ms step:1401/1480 
train_time:223069ms step_avg:160.37ms step:1402/1480 train_time:223243ms step_avg:160.38ms step:1403/1480 train_time:223421ms step_avg:160.39ms step:1404/1480 train_time:223592ms step_avg:160.40ms step:1405/1480 train_time:223765ms step_avg:160.41ms step:1406/1480 train_time:223941ms step_avg:160.42ms step:1407/1480 train_time:224109ms step_avg:160.42ms step:1408/1480 train_time:224277ms step_avg:160.43ms step:1409/1480 train_time:224461ms step_avg:160.44ms step:1410/1480 train_time:224630ms step_avg:160.45ms step:1411/1480 train_time:224799ms step_avg:160.46ms step:1412/1480 train_time:224968ms step_avg:160.46ms step:1413/1480 train_time:225137ms step_avg:160.47ms step:1414/1480 train_time:225308ms step_avg:160.48ms step:1415/1480 train_time:225481ms step_avg:160.48ms step:1416/1480 train_time:225669ms step_avg:160.50ms step:1417/1480 train_time:225845ms step_avg:160.52ms step:1418/1480 train_time:226015ms step_avg:160.52ms step:1419/1480 train_time:226188ms step_avg:160.53ms step:1420/1480 train_time:226363ms step_avg:160.54ms step:1421/1480 train_time:226536ms step_avg:160.55ms step:1422/1480 train_time:226708ms step_avg:160.56ms step:1423/1480 train_time:226877ms step_avg:160.56ms step:1424/1480 train_time:227053ms step_avg:160.58ms step:1425/1480 train_time:227233ms step_avg:160.59ms step:1426/1480 train_time:227405ms step_avg:160.60ms step:1427/1480 train_time:227581ms step_avg:160.61ms step:1428/1480 train_time:227753ms step_avg:160.62ms step:1429/1480 train_time:227921ms step_avg:160.62ms step:1430/1480 train_time:228094ms step_avg:160.63ms step:1431/1480 train_time:228269ms step_avg:160.64ms step:1432/1480 train_time:228446ms step_avg:160.65ms step:1433/1480 train_time:228628ms step_avg:160.67ms step:1434/1480 train_time:228809ms step_avg:160.68ms step:1435/1480 train_time:228986ms step_avg:160.69ms step:1436/1480 train_time:229159ms step_avg:160.70ms step:1437/1480 train_time:229330ms step_avg:160.71ms step:1438/1480 train_time:229500ms step_avg:160.71ms 
step:1439/1480 train_time:229673ms step_avg:160.72ms step:1440/1480 train_time:229844ms step_avg:160.73ms step:1441/1480 train_time:230015ms step_avg:160.74ms step:1442/1480 train_time:230192ms step_avg:160.75ms step:1443/1480 train_time:230381ms step_avg:160.77ms step:1444/1480 train_time:230551ms step_avg:160.78ms step:1445/1480 train_time:230723ms step_avg:160.78ms step:1446/1480 train_time:230900ms step_avg:160.79ms step:1447/1480 train_time:231077ms step_avg:160.81ms step:1448/1480 train_time:231250ms step_avg:160.81ms step:1449/1480 train_time:231424ms step_avg:160.82ms step:1450/1480 train_time:231596ms step_avg:160.83ms step:1451/1480 train_time:231766ms step_avg:160.84ms step:1452/1480 train_time:231939ms step_avg:160.85ms step:1453/1480 train_time:232109ms step_avg:160.85ms step:1454/1480 train_time:232282ms step_avg:160.86ms step:1455/1480 train_time:232460ms step_avg:160.87ms step:1456/1480 train_time:232632ms step_avg:160.88ms step:1457/1480 train_time:232805ms step_avg:160.89ms step:1458/1480 train_time:232976ms step_avg:160.90ms step:1459/1480 train_time:233154ms step_avg:160.91ms step:1460/1480 train_time:233326ms step_avg:160.91ms step:1461/1480 train_time:233501ms step_avg:160.92ms step:1462/1480 train_time:233671ms step_avg:160.93ms step:1463/1480 train_time:233848ms step_avg:160.94ms step:1464/1480 train_time:234023ms step_avg:160.95ms step:1465/1480 train_time:234195ms step_avg:160.96ms step:1466/1480 train_time:234366ms step_avg:160.97ms step:1467/1480 train_time:234540ms step_avg:160.97ms step:1468/1480 train_time:234709ms step_avg:160.98ms step:1469/1480 train_time:234882ms step_avg:160.99ms step:1470/1480 train_time:235061ms step_avg:161.00ms step:1471/1480 train_time:235247ms step_avg:161.02ms step:1472/1480 train_time:235429ms step_avg:161.03ms step:1473/1480 train_time:235601ms step_avg:161.04ms step:1474/1480 train_time:235780ms step_avg:161.05ms step:1475/1480 train_time:235959ms step_avg:161.06ms step:1476/1480 train_time:236131ms 
step_avg:161.07ms step:1477/1480 train_time:236312ms step_avg:161.09ms step:1478/1480 train_time:236494ms step_avg:161.10ms step:1479/1480 train_time:236669ms step_avg:161.11ms step:1480/1480 train_time:236842ms step_avg:161.12ms step:1480/1480 val_loss:3.2762 train_time:236912ms step_avg:161.16ms