import os
import sys
# Snapshot this script's own source text at startup; it is later written to the log file.
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G exactly via SVD: returns U @ V^T where G = U S V^T.

    `steps` is accepted only so that this function shares a call signature with
    the iterative backend; it is ignored.
    """
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: a 2D tensor (asserted).
        steps: number of quintic iterations to run.
        eps: added to the norm before dividing, to avoid division by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # Work on the orientation with rows <= cols so that A = X @ X.T is the smaller Gram matrix.
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # Undo the transpose applied above so the result matches G's shape.
    if G.size(0) > G.size(1):
        X = X.T
    return X

# Lookup table from the `backend` option of Muon to the orthogonalization function.
zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.

    This implementation is distributed: it reads WORLD_SIZE and RANK from the environment, and
    each rank orthogonalizes only one parameter out of every WORLD_SIZE consecutive parameters,
    exchanging the results with torch.distributed.all_gather.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        """
        Build one param group per distinct parameter element count.

        Each group carries an "update_buffer": a list of WORLD_SIZE flat bfloat16 CUDA tensors of
        that element count, used as the all_gather output in step().
        """
        self.num_process = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ["RANK"])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        params: "list[torch.Tensor]" = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # Group by numel (not shape): all_gather needs equally sized flat tensors from every rank.
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                "params": [p for p in params if p.numel() == size],
                "update_buffer": [
                    torch.empty(size, device="cuda", dtype=torch.bfloat16)
                    for _ in range(self.num_process)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group, parameters are processed in chunks of WORLD_SIZE. For each chunk this
        rank updates the momentum buffer of, and orthogonalizes the update for, the parameter at
        index `base_i + rank`; the flattened updates of all ranks are then all-gathered
        asynchronously. The gathered updates of a chunk are applied (by `update_prev`) only after
        the next chunk's local update has been computed, so communication overlaps with compute.

        Requires every parameter handled by this rank to have a gradient, and the number of
        parameters in each group to be a multiple of WORLD_SIZE (both asserted).
        """
        for group in self.param_groups:
            lr: float = group["lr"]
            momentum: float = group["momentum"]
            nesterov: bool = group["nesterov"]
            zeropower_backend = zeropower_backends[group["backend"]]
            backend_steps: int = group["backend_steps"]
            update_buffers: "list[torch.Tensor]" = group["update_buffer"]
            # generate weight updates in distributed fashion
            params: "list[torch.Tensor]" = group["params"]
            assert len(params) % self.num_process == 0
            # State of the in-flight all_gather: its work handle and the chunk of parameters
            # whose updates it is collecting. Both stay None until the first gather is launched.
            handle = None
            params_world = None
            def update_prev():
                # Wait for the in-flight gather (if any) and apply its updates to that chunk.
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    # Step size is scaled by sqrt(max(1, rows / cols)) of the parameter.
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.num_process]:
                # This rank's parameter within the current chunk.
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if "momentum_buffer" not in state:
                    state["momentum_buffer"] = torch.zeros_like(g)
                buf: torch.Tensor = state["momentum_buffer"]
                # buf <- momentum * buf + (1 - momentum) * g
                buf.lerp_(g, 1 - momentum)
                # Nesterov: overwrite the gradient in place with g + momentum * (buf - g).
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_backend(g, steps=backend_steps).flatten()
                # The buffers are reused by every gather, so the previous chunk must be consumed
                # before the next all_gather is launched into them.
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.num_process]
            # Apply the last chunk of this group.
            update_prev()

#
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free nn.Linear whose weight is cast to the input's dtype on every forward pass."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied to a 4D tensor.

    The sequence length is read from dimension 1 and the rotated features from dimension 3.
    The cos/sin tables are cached and recomputed only when the sequence length changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # inv_freq[j] = base ** (-2j / dim), one frequency per pair of features.
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        # The cache is keyed on sequence length only.
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # Insert singleton axes so the tables broadcast over dimensions 0 and 2 of x.
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # Features are paired as (first half, second half) of the last dimension.
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention using FlexAttention, with QK-norm, rotary embeddings and a
    learned mix between the layer's own values and an externally supplied value embedding.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Arguments:
            x: input of shape (1, T, dim); the batch size must be 1 (asserted).
            vi: value embedding, reshaped with view_as to match the per-head values.
            block_mask: FlexAttention BlockMask restricting which keys each query may attend to.

        Returns a tensor with the same shape as x.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head axis moved in front of the sequence axis.
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: a learned mix of the running activation with the initial embedding
    x0, followed by pre-normed attention and pre-normed MLP, each added residually.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # Initialized to (1, 0): at init the block input is x alone, with no x0 contribution.
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) with tanh in GPT.forward

class GPT(nn.Module):
    """
    GPT model with a U-net layout: the first n_layer // 2 blocks ("encoder") store their
    outputs, and the remaining blocks ("decoder") add them back in reverse order, scaled by
    learned skip weights. forward() returns the cross-entropy loss directly.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            idx: 1D tensor of input token ids (asserted); its length is reshaped into rows of
                BLOCK_SIZE = 128, so it must be a multiple of 128.
            target: target token ids, flattened for the cross-entropy loss.
            sliding_window: tensor holding the attention window size, in tokens.

        Returns the cross-entropy loss between the soft-capped logits and target.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Token id 50256 is treated as a document separator: the running count of separators
        # seen so far serves as a per-token document id.
        docs = (idx == 50256).cumsum(0)
        # Lowest / highest document id inside each block of BLOCK_SIZE tokens.
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # Token-level mask: causal, same document, and within the sliding window.
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Build the block-level sparsity pattern by hand and wrap it in a BlockMask, using
            # the token-level function above as mask_mod.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # Two blocks may share a document iff their document-id ranges overlap.
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # Window size expressed in blocks, rounded up.
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            # Per query block: how many kv blocks are kept ...
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # ... and their indices; a stable descending sort of the 0/1 mask lists the kept
            # kv blocks first, in increasing index order.
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # Add two leading singleton axes.
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # One value-embedding slice per encoder layer.
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() yields encoder outputs last-in-first-out, i.e. in reverse layer order.
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it declares."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read ntok uint16 tokens that follow the header of `file` into a pinned-memory tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        # Skip the header: 256 values of 4 bytes each.
        f.seek(256 * 4)
        # Read straight into the tensor's storage through its numpy view.
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over token shards matching a glob pattern (relative to the current working
    directory), handing each process a different T-token slice of every global batch of
    T * num_processes tokens. Shards are visited in sorted order and wrap around.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # Every shard must hold at least one full global batch plus one token for the targets.
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        # -1 so that advance() moves to shard 0.
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        # Each process starts at its own offset within the shard.
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """
        Return (x, y): T input tokens as int32 and the same tokens shifted by one as int64
        targets, both copied to the CUDA device with non_blocking=True.
        """
        batch_size = self.T * self.num_processes
        # T + 1 tokens: inputs are buf[:-1], targets are buf[1:].
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
# Only the master process owns a log file; on every other rank `logfile` stays None and
# print0() is a no-op.
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # The per-run directory above is intentionally not created, but the log file itself lives
    # directly under 'logs/', so make sure that directory exists before opening the file
    # (otherwise a fresh checkout fails here with FileNotFoundError).
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """
    Append `s` plus a newline to the log file, on the master process only.

    Unless `logonly` is true, `s` is also printed to stdout. Non-master processes do nothing.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# The training loop does exactly one forward/backward per optimizer step, so gradient
# accumulation is not supported.
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# Fetch the first training batch up front; the loop fetches each following batch right
# after the backward pass.
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 07:28:38 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 115W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 118W / 700W | 39MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 40C P0 118W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 96W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI 
CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23956ms step_avg:nanms step:2/1480 train_time:24062ms step_avg:nanms step:3/1480 train_time:24201ms step_avg:nanms step:4/1480 train_time:24342ms step_avg:nanms step:5/1480 train_time:24483ms step_avg:nanms step:6/1480 train_time:24624ms step_avg:nanms step:7/1480 train_time:24764ms step_avg:nanms step:8/1480 train_time:24906ms step_avg:nanms step:9/1480 train_time:25053ms step_avg:nanms step:10/1480 train_time:25198ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:426ms step_avg:142.04ms step:14/1480 train_time:566ms step_avg:141.53ms step:15/1480 train_time:708ms step_avg:141.66ms step:16/1480 train_time:854ms step_avg:142.31ms step:17/1480 train_time:999ms step_avg:142.73ms step:18/1480 train_time:1142ms step_avg:142.79ms step:19/1480 train_time:1284ms step_avg:142.64ms step:20/1480 train_time:1426ms step_avg:142.56ms step:21/1480 train_time:1566ms step_avg:142.33ms step:22/1480 train_time:1707ms step_avg:142.24ms step:23/1480 train_time:1850ms step_avg:142.34ms step:24/1480 train_time:1997ms step_avg:142.63ms step:25/1480 train_time:2141ms step_avg:142.72ms step:26/1480 train_time:2284ms step_avg:142.76ms step:27/1480 train_time:2426ms step_avg:142.72ms step:28/1480 train_time:2567ms step_avg:142.61ms step:29/1480 train_time:2709ms 
step_avg:142.56ms step:30/1480 train_time:2851ms step_avg:142.53ms step:31/1480 train_time:2996ms step_avg:142.67ms step:32/1480 train_time:3142ms step_avg:142.80ms step:33/1480 train_time:3285ms step_avg:142.82ms step:34/1480 train_time:3426ms step_avg:142.75ms step:35/1480 train_time:3566ms step_avg:142.66ms step:36/1480 train_time:3708ms step_avg:142.60ms step:37/1480 train_time:3851ms step_avg:142.64ms step:38/1480 train_time:3997ms step_avg:142.74ms step:39/1480 train_time:4143ms step_avg:142.87ms step:40/1480 train_time:4287ms step_avg:142.90ms step:41/1480 train_time:4430ms step_avg:142.89ms step:42/1480 train_time:4570ms step_avg:142.82ms step:43/1480 train_time:4713ms step_avg:142.82ms step:44/1480 train_time:4856ms step_avg:142.81ms step:45/1480 train_time:5001ms step_avg:142.88ms step:46/1480 train_time:5144ms step_avg:142.89ms step:47/1480 train_time:5287ms step_avg:142.90ms step:48/1480 train_time:5430ms step_avg:142.89ms step:49/1480 train_time:5571ms step_avg:142.86ms step:50/1480 train_time:5715ms step_avg:142.88ms step:51/1480 train_time:5858ms step_avg:142.89ms step:52/1480 train_time:6002ms step_avg:142.91ms step:53/1480 train_time:6146ms step_avg:142.92ms step:54/1480 train_time:6289ms step_avg:142.94ms step:55/1480 train_time:6434ms step_avg:142.97ms step:56/1480 train_time:6576ms step_avg:142.95ms step:57/1480 train_time:6719ms step_avg:142.96ms step:58/1480 train_time:6862ms step_avg:142.96ms step:59/1480 train_time:7004ms step_avg:142.94ms step:60/1480 train_time:7146ms step_avg:142.93ms step:61/1480 train_time:7287ms step_avg:142.89ms step:62/1480 train_time:7430ms step_avg:142.89ms step:63/1480 train_time:7573ms step_avg:142.88ms step:64/1480 train_time:7716ms step_avg:142.88ms step:65/1480 train_time:7859ms step_avg:142.89ms step:66/1480 train_time:8002ms step_avg:142.89ms step:67/1480 train_time:8144ms step_avg:142.88ms step:68/1480 train_time:8285ms step_avg:142.85ms step:69/1480 train_time:8428ms step_avg:142.85ms step:70/1480 
train_time:8571ms step_avg:142.85ms step:71/1480 train_time:8714ms step_avg:142.86ms step:72/1480 train_time:8858ms step_avg:142.88ms step:73/1480 train_time:9001ms step_avg:142.88ms step:74/1480 train_time:9145ms step_avg:142.89ms step:75/1480 train_time:9288ms step_avg:142.89ms step:76/1480 train_time:9430ms step_avg:142.88ms step:77/1480 train_time:9571ms step_avg:142.85ms step:78/1480 train_time:9715ms step_avg:142.86ms step:79/1480 train_time:9858ms step_avg:142.87ms step:80/1480 train_time:10001ms step_avg:142.88ms step:81/1480 train_time:10146ms step_avg:142.90ms step:82/1480 train_time:10287ms step_avg:142.87ms step:83/1480 train_time:10428ms step_avg:142.85ms step:84/1480 train_time:10570ms step_avg:142.84ms step:85/1480 train_time:10714ms step_avg:142.85ms step:86/1480 train_time:10859ms step_avg:142.88ms step:87/1480 train_time:11002ms step_avg:142.88ms step:88/1480 train_time:11144ms step_avg:142.87ms step:89/1480 train_time:11285ms step_avg:142.85ms step:90/1480 train_time:11427ms step_avg:142.84ms step:91/1480 train_time:11569ms step_avg:142.83ms step:92/1480 train_time:11713ms step_avg:142.84ms step:93/1480 train_time:11857ms step_avg:142.85ms step:94/1480 train_time:12000ms step_avg:142.85ms step:95/1480 train_time:12142ms step_avg:142.85ms step:96/1480 train_time:12285ms step_avg:142.85ms step:97/1480 train_time:12426ms step_avg:142.82ms step:98/1480 train_time:12567ms step_avg:142.81ms step:99/1480 train_time:12709ms step_avg:142.80ms step:100/1480 train_time:12855ms step_avg:142.83ms step:101/1480 train_time:12999ms step_avg:142.85ms step:102/1480 train_time:13141ms step_avg:142.84ms step:103/1480 train_time:13283ms step_avg:142.82ms step:104/1480 train_time:13425ms step_avg:142.82ms step:105/1480 train_time:13566ms step_avg:142.80ms step:106/1480 train_time:13707ms step_avg:142.78ms step:107/1480 train_time:13850ms step_avg:142.79ms step:108/1480 train_time:13995ms step_avg:142.81ms step:109/1480 train_time:14139ms step_avg:142.82ms 
step:110/1480 train_time:14282ms step_avg:142.82ms step:111/1480 train_time:14426ms step_avg:142.83ms step:112/1480 train_time:14571ms step_avg:142.85ms step:113/1480 train_time:14720ms step_avg:142.91ms step:114/1480 train_time:14867ms step_avg:142.95ms step:115/1480 train_time:15013ms step_avg:142.99ms step:116/1480 train_time:15162ms step_avg:143.04ms step:117/1480 train_time:15308ms step_avg:143.07ms step:118/1480 train_time:15456ms step_avg:143.11ms step:119/1480 train_time:15603ms step_avg:143.15ms step:120/1480 train_time:15749ms step_avg:143.17ms step:121/1480 train_time:15897ms step_avg:143.21ms step:122/1480 train_time:16046ms step_avg:143.27ms step:123/1480 train_time:16191ms step_avg:143.28ms step:124/1480 train_time:16340ms step_avg:143.33ms step:125/1480 train_time:16487ms step_avg:143.36ms step:125/1480 val_loss:4.4228 train_time:16543ms step_avg:143.85ms step:126/1480 train_time:16638ms step_avg:143.43ms step:127/1480 train_time:16789ms step_avg:143.49ms step:128/1480 train_time:16935ms step_avg:143.52ms step:129/1480 train_time:17082ms step_avg:143.54ms step:130/1480 train_time:17228ms step_avg:143.57ms step:131/1480 train_time:17375ms step_avg:143.60ms step:132/1480 train_time:17520ms step_avg:143.60ms step:133/1480 train_time:17671ms step_avg:143.66ms step:134/1480 train_time:17819ms step_avg:143.70ms step:135/1480 train_time:17967ms step_avg:143.73ms step:136/1480 train_time:18114ms step_avg:143.76ms step:137/1480 train_time:18260ms step_avg:143.78ms step:138/1480 train_time:18407ms step_avg:143.81ms step:139/1480 train_time:18555ms step_avg:143.84ms step:140/1480 train_time:18702ms step_avg:143.86ms step:141/1480 train_time:18852ms step_avg:143.91ms step:142/1480 train_time:18999ms step_avg:143.93ms step:143/1480 train_time:19147ms step_avg:143.96ms step:144/1480 train_time:19294ms step_avg:143.99ms step:145/1480 train_time:19439ms step_avg:143.99ms step:146/1480 train_time:19586ms step_avg:144.02ms step:147/1480 train_time:19734ms 
step_avg:144.04ms step:148/1480 train_time:19881ms step_avg:144.06ms step:149/1480 train_time:20030ms step_avg:144.10ms step:150/1480 train_time:20177ms step_avg:144.12ms step:151/1480 train_time:20324ms step_avg:144.14ms step:152/1480 train_time:20472ms step_avg:144.17ms step:153/1480 train_time:20618ms step_avg:144.18ms step:154/1480 train_time:20763ms step_avg:144.19ms step:155/1480 train_time:20911ms step_avg:144.22ms step:156/1480 train_time:21058ms step_avg:144.23ms step:157/1480 train_time:21205ms step_avg:144.25ms step:158/1480 train_time:21355ms step_avg:144.29ms step:159/1480 train_time:21501ms step_avg:144.30ms step:160/1480 train_time:21650ms step_avg:144.33ms step:161/1480 train_time:21797ms step_avg:144.35ms step:162/1480 train_time:21942ms step_avg:144.36ms step:163/1480 train_time:22091ms step_avg:144.39ms step:164/1480 train_time:22238ms step_avg:144.40ms step:165/1480 train_time:22386ms step_avg:144.42ms step:166/1480 train_time:22533ms step_avg:144.44ms step:167/1480 train_time:22679ms step_avg:144.45ms step:168/1480 train_time:22826ms step_avg:144.47ms step:169/1480 train_time:22974ms step_avg:144.49ms step:170/1480 train_time:23121ms step_avg:144.50ms step:171/1480 train_time:23269ms step_avg:144.53ms step:172/1480 train_time:23416ms step_avg:144.54ms step:173/1480 train_time:23562ms step_avg:144.55ms step:174/1480 train_time:23709ms step_avg:144.57ms step:175/1480 train_time:23856ms step_avg:144.58ms step:176/1480 train_time:24002ms step_avg:144.59ms step:177/1480 train_time:24151ms step_avg:144.62ms step:178/1480 train_time:24297ms step_avg:144.62ms step:179/1480 train_time:24444ms step_avg:144.64ms step:180/1480 train_time:24593ms step_avg:144.66ms step:181/1480 train_time:24739ms step_avg:144.67ms step:182/1480 train_time:24888ms step_avg:144.70ms step:183/1480 train_time:25035ms step_avg:144.71ms step:184/1480 train_time:25182ms step_avg:144.72ms step:185/1480 train_time:25329ms step_avg:144.74ms step:186/1480 train_time:25476ms 
step_avg:144.75ms step:187/1480 train_time:25622ms step_avg:144.76ms step:188/1480 train_time:25769ms step_avg:144.77ms step:189/1480 train_time:25916ms step_avg:144.78ms step:190/1480 train_time:26063ms step_avg:144.79ms step:191/1480 train_time:26210ms step_avg:144.81ms step:192/1480 train_time:26356ms step_avg:144.81ms step:193/1480 train_time:26501ms step_avg:144.82ms step:194/1480 train_time:26649ms step_avg:144.83ms step:195/1480 train_time:26795ms step_avg:144.84ms step:196/1480 train_time:26943ms step_avg:144.86ms step:197/1480 train_time:27092ms step_avg:144.88ms step:198/1480 train_time:27238ms step_avg:144.88ms step:199/1480 train_time:27385ms step_avg:144.89ms step:200/1480 train_time:27534ms step_avg:144.92ms step:201/1480 train_time:27680ms step_avg:144.92ms step:202/1480 train_time:27829ms step_avg:144.94ms step:203/1480 train_time:27976ms step_avg:144.95ms step:204/1480 train_time:28123ms step_avg:144.96ms step:205/1480 train_time:28271ms step_avg:144.98ms step:206/1480 train_time:28419ms step_avg:144.99ms step:207/1480 train_time:28566ms step_avg:145.01ms step:208/1480 train_time:28713ms step_avg:145.02ms step:209/1480 train_time:28859ms step_avg:145.02ms step:210/1480 train_time:29006ms step_avg:145.03ms step:211/1480 train_time:29154ms step_avg:145.05ms step:212/1480 train_time:29299ms step_avg:145.05ms step:213/1480 train_time:29448ms step_avg:145.06ms step:214/1480 train_time:29595ms step_avg:145.08ms step:215/1480 train_time:29741ms step_avg:145.08ms step:216/1480 train_time:29889ms step_avg:145.09ms step:217/1480 train_time:30036ms step_avg:145.10ms step:218/1480 train_time:30181ms step_avg:145.10ms step:219/1480 train_time:30331ms step_avg:145.12ms step:220/1480 train_time:30477ms step_avg:145.13ms step:221/1480 train_time:30626ms step_avg:145.15ms step:222/1480 train_time:30777ms step_avg:145.18ms step:223/1480 train_time:30928ms step_avg:145.20ms step:224/1480 train_time:31079ms step_avg:145.23ms step:225/1480 train_time:31229ms 
step_avg:145.25ms step:226/1480 train_time:31379ms step_avg:145.27ms step:227/1480 train_time:31530ms step_avg:145.30ms step:228/1480 train_time:31680ms step_avg:145.32ms step:229/1480 train_time:31830ms step_avg:145.34ms step:230/1480 train_time:31982ms step_avg:145.37ms step:231/1480 train_time:32132ms step_avg:145.40ms step:232/1480 train_time:32282ms step_avg:145.41ms step:233/1480 train_time:32433ms step_avg:145.44ms step:234/1480 train_time:32583ms step_avg:145.46ms step:235/1480 train_time:32734ms step_avg:145.48ms step:236/1480 train_time:32884ms step_avg:145.50ms step:237/1480 train_time:33035ms step_avg:145.53ms step:238/1480 train_time:33184ms step_avg:145.54ms step:239/1480 train_time:33335ms step_avg:145.57ms step:240/1480 train_time:33486ms step_avg:145.59ms step:241/1480 train_time:33636ms step_avg:145.61ms step:242/1480 train_time:33788ms step_avg:145.64ms step:243/1480 train_time:33938ms step_avg:145.66ms step:244/1480 train_time:34090ms step_avg:145.68ms step:245/1480 train_time:34239ms step_avg:145.70ms step:246/1480 train_time:34390ms step_avg:145.72ms step:247/1480 train_time:34540ms step_avg:145.74ms step:248/1480 train_time:34692ms step_avg:145.76ms step:249/1480 train_time:34842ms step_avg:145.78ms step:250/1480 train_time:34993ms step_avg:145.81ms step:250/1480 val_loss:3.9978 train_time:35051ms step_avg:146.05ms step:251/1480 train_time:35150ms step_avg:145.85ms step:252/1480 train_time:35301ms step_avg:145.87ms step:253/1480 train_time:35450ms step_avg:145.88ms step:254/1480 train_time:35599ms step_avg:145.90ms step:255/1480 train_time:35749ms step_avg:145.91ms step:256/1480 train_time:35898ms step_avg:145.93ms step:257/1480 train_time:36049ms step_avg:145.95ms step:258/1480 train_time:36200ms step_avg:145.97ms step:259/1480 train_time:36352ms step_avg:145.99ms step:260/1480 train_time:36504ms step_avg:146.01ms step:261/1480 train_time:36653ms step_avg:146.03ms step:262/1480 train_time:36804ms step_avg:146.05ms step:263/1480 
train_time:36953ms step_avg:146.06ms step:264/1480 train_time:37106ms step_avg:146.09ms step:265/1480 train_time:37257ms step_avg:146.11ms step:266/1480 train_time:37409ms step_avg:146.13ms step:267/1480 train_time:37558ms step_avg:146.14ms step:268/1480 train_time:37709ms step_avg:146.16ms step:269/1480 train_time:37857ms step_avg:146.17ms step:270/1480 train_time:38008ms step_avg:146.19ms step:271/1480 train_time:38158ms step_avg:146.20ms step:272/1480 train_time:38309ms step_avg:146.22ms step:273/1480 train_time:38460ms step_avg:146.24ms step:274/1480 train_time:38612ms step_avg:146.26ms step:275/1480 train_time:38763ms step_avg:146.28ms step:276/1480 train_time:38914ms step_avg:146.29ms step:277/1480 train_time:39064ms step_avg:146.31ms step:278/1480 train_time:39215ms step_avg:146.33ms step:279/1480 train_time:39368ms step_avg:146.35ms step:280/1480 train_time:39519ms step_avg:146.37ms step:281/1480 train_time:39670ms step_avg:146.38ms step:282/1480 train_time:39820ms step_avg:146.40ms step:283/1480 train_time:39972ms step_avg:146.42ms step:284/1480 train_time:40122ms step_avg:146.43ms step:285/1480 train_time:40273ms step_avg:146.45ms step:286/1480 train_time:40423ms step_avg:146.46ms step:287/1480 train_time:40575ms step_avg:146.48ms step:288/1480 train_time:40726ms step_avg:146.50ms step:289/1480 train_time:40877ms step_avg:146.51ms step:290/1480 train_time:41028ms step_avg:146.53ms step:291/1480 train_time:41178ms step_avg:146.54ms step:292/1480 train_time:41328ms step_avg:146.55ms step:293/1480 train_time:41478ms step_avg:146.57ms step:294/1480 train_time:41629ms step_avg:146.58ms step:295/1480 train_time:41778ms step_avg:146.59ms step:296/1480 train_time:41929ms step_avg:146.61ms step:297/1480 train_time:42080ms step_avg:146.62ms step:298/1480 train_time:42231ms step_avg:146.64ms step:299/1480 train_time:42380ms step_avg:146.64ms step:300/1480 train_time:42532ms step_avg:146.66ms step:301/1480 train_time:42680ms step_avg:146.67ms step:302/1480 
train_time:42831ms step_avg:146.68ms step:303/1480 train_time:42980ms step_avg:146.69ms step:304/1480 train_time:43131ms step_avg:146.71ms step:305/1480 train_time:43282ms step_avg:146.72ms step:306/1480 train_time:43432ms step_avg:146.73ms step:307/1480 train_time:43582ms step_avg:146.74ms step:308/1480 train_time:43734ms step_avg:146.76ms step:309/1480 train_time:43885ms step_avg:146.77ms step:310/1480 train_time:44035ms step_avg:146.78ms step:311/1480 train_time:44187ms step_avg:146.80ms step:312/1480 train_time:44337ms step_avg:146.81ms step:313/1480 train_time:44489ms step_avg:146.83ms step:314/1480 train_time:44639ms step_avg:146.84ms step:315/1480 train_time:44789ms step_avg:146.85ms step:316/1480 train_time:44938ms step_avg:146.86ms step:317/1480 train_time:45088ms step_avg:146.87ms step:318/1480 train_time:45239ms step_avg:146.88ms step:319/1480 train_time:45391ms step_avg:146.90ms step:320/1480 train_time:45542ms step_avg:146.91ms step:321/1480 train_time:45693ms step_avg:146.92ms step:322/1480 train_time:45842ms step_avg:146.93ms step:323/1480 train_time:45992ms step_avg:146.94ms step:324/1480 train_time:46141ms step_avg:146.95ms step:325/1480 train_time:46293ms step_avg:146.96ms step:326/1480 train_time:46444ms step_avg:146.97ms step:327/1480 train_time:46595ms step_avg:146.99ms step:328/1480 train_time:46747ms step_avg:147.00ms step:329/1480 train_time:46898ms step_avg:147.01ms step:330/1480 train_time:47049ms step_avg:147.03ms step:331/1480 train_time:47203ms step_avg:147.05ms step:332/1480 train_time:47357ms step_avg:147.07ms step:333/1480 train_time:47511ms step_avg:147.09ms step:334/1480 train_time:47664ms step_avg:147.11ms step:335/1480 train_time:47818ms step_avg:147.13ms step:336/1480 train_time:47972ms step_avg:147.15ms step:337/1480 train_time:48127ms step_avg:147.18ms step:338/1480 train_time:48281ms step_avg:147.20ms step:339/1480 train_time:48434ms step_avg:147.22ms step:340/1480 train_time:48589ms step_avg:147.24ms step:341/1480 
train_time:48743ms step_avg:147.26ms step:342/1480 train_time:48897ms step_avg:147.28ms step:343/1480 train_time:49050ms step_avg:147.30ms step:344/1480 train_time:49203ms step_avg:147.32ms step:345/1480 train_time:49357ms step_avg:147.34ms step:346/1480 train_time:49511ms step_avg:147.35ms step:347/1480 train_time:49665ms step_avg:147.37ms step:348/1480 train_time:49819ms step_avg:147.39ms step:349/1480 train_time:49974ms step_avg:147.42ms step:350/1480 train_time:50127ms step_avg:147.43ms step:351/1480 train_time:50282ms step_avg:147.45ms step:352/1480 train_time:50436ms step_avg:147.47ms step:353/1480 train_time:50592ms step_avg:147.50ms step:354/1480 train_time:50744ms step_avg:147.51ms step:355/1480 train_time:50898ms step_avg:147.53ms step:356/1480 train_time:51053ms step_avg:147.55ms step:357/1480 train_time:51208ms step_avg:147.57ms step:358/1480 train_time:51362ms step_avg:147.59ms step:359/1480 train_time:51516ms step_avg:147.61ms step:360/1480 train_time:51671ms step_avg:147.63ms step:361/1480 train_time:51827ms step_avg:147.65ms step:362/1480 train_time:51980ms step_avg:147.67ms step:363/1480 train_time:52134ms step_avg:147.69ms step:364/1480 train_time:52290ms step_avg:147.71ms step:365/1480 train_time:52444ms step_avg:147.73ms step:366/1480 train_time:52597ms step_avg:147.74ms step:367/1480 train_time:52751ms step_avg:147.76ms step:368/1480 train_time:52905ms step_avg:147.78ms step:369/1480 train_time:53057ms step_avg:147.79ms step:370/1480 train_time:53212ms step_avg:147.81ms step:371/1480 train_time:53365ms step_avg:147.83ms step:372/1480 train_time:53519ms step_avg:147.84ms step:373/1480 train_time:53673ms step_avg:147.86ms step:374/1480 train_time:53825ms step_avg:147.87ms step:375/1480 train_time:53977ms step_avg:147.88ms step:375/1480 val_loss:3.8073 train_time:54038ms step_avg:148.05ms step:376/1480 train_time:54136ms step_avg:147.91ms step:377/1480 train_time:54292ms step_avg:147.94ms step:378/1480 train_time:54445ms step_avg:147.95ms 
step:379/1480 train_time:54598ms step_avg:147.96ms step:380/1480 train_time:54751ms step_avg:147.98ms step:381/1480 train_time:54904ms step_avg:147.99ms step:382/1480 train_time:55057ms step_avg:148.00ms step:383/1480 train_time:55214ms step_avg:148.03ms step:384/1480 train_time:55369ms step_avg:148.05ms step:385/1480 train_time:55523ms step_avg:148.06ms step:386/1480 train_time:55677ms step_avg:148.08ms step:387/1480 train_time:55830ms step_avg:148.09ms step:388/1480 train_time:55983ms step_avg:148.10ms step:389/1480 train_time:56136ms step_avg:148.12ms step:390/1480 train_time:56291ms step_avg:148.13ms step:391/1480 train_time:56445ms step_avg:148.15ms step:392/1480 train_time:56599ms step_avg:148.16ms step:393/1480 train_time:56752ms step_avg:148.18ms step:394/1480 train_time:56905ms step_avg:148.19ms step:395/1480 train_time:57058ms step_avg:148.20ms step:396/1480 train_time:57212ms step_avg:148.22ms step:397/1480 train_time:57367ms step_avg:148.23ms step:398/1480 train_time:57521ms step_avg:148.25ms step:399/1480 train_time:57676ms step_avg:148.27ms step:400/1480 train_time:57830ms step_avg:148.28ms step:401/1480 train_time:57985ms step_avg:148.30ms step:402/1480 train_time:58139ms step_avg:148.31ms step:403/1480 train_time:58293ms step_avg:148.33ms step:404/1480 train_time:58448ms step_avg:148.34ms step:405/1480 train_time:58602ms step_avg:148.36ms step:406/1480 train_time:58755ms step_avg:148.37ms step:407/1480 train_time:58911ms step_avg:148.39ms step:408/1480 train_time:59064ms step_avg:148.40ms step:409/1480 train_time:59218ms step_avg:148.42ms step:410/1480 train_time:59372ms step_avg:148.43ms step:411/1480 train_time:59526ms step_avg:148.44ms step:412/1480 train_time:59680ms step_avg:148.46ms step:413/1480 train_time:59834ms step_avg:148.47ms step:414/1480 train_time:59988ms step_avg:148.48ms step:415/1480 train_time:60142ms step_avg:148.50ms step:416/1480 train_time:60297ms step_avg:148.51ms step:417/1480 train_time:60450ms step_avg:148.53ms 
step:418/1480 train_time:60604ms step_avg:148.54ms step:419/1480 train_time:60757ms step_avg:148.55ms step:420/1480 train_time:60911ms step_avg:148.56ms step:421/1480 train_time:61064ms step_avg:148.57ms step:422/1480 train_time:61218ms step_avg:148.59ms step:423/1480 train_time:61373ms step_avg:148.60ms step:424/1480 train_time:61527ms step_avg:148.62ms step:425/1480 train_time:61682ms step_avg:148.63ms step:426/1480 train_time:61836ms step_avg:148.64ms step:427/1480 train_time:61991ms step_avg:148.66ms step:428/1480 train_time:62145ms step_avg:148.67ms step:429/1480 train_time:62299ms step_avg:148.68ms step:430/1480 train_time:62452ms step_avg:148.70ms step:431/1480 train_time:62608ms step_avg:148.71ms step:432/1480 train_time:62761ms step_avg:148.72ms step:433/1480 train_time:62915ms step_avg:148.73ms step:434/1480 train_time:63068ms step_avg:148.74ms step:435/1480 train_time:63221ms step_avg:148.75ms step:436/1480 train_time:63375ms step_avg:148.77ms step:437/1480 train_time:63529ms step_avg:148.78ms step:438/1480 train_time:63682ms step_avg:148.79ms step:439/1480 train_time:63836ms step_avg:148.80ms step:440/1480 train_time:63991ms step_avg:148.82ms step:441/1480 train_time:64149ms step_avg:148.84ms step:442/1480 train_time:64307ms step_avg:148.86ms step:443/1480 train_time:64464ms step_avg:148.88ms step:444/1480 train_time:64619ms step_avg:148.89ms step:445/1480 train_time:64775ms step_avg:148.91ms step:446/1480 train_time:64931ms step_avg:148.92ms step:447/1480 train_time:65087ms step_avg:148.94ms step:448/1480 train_time:65244ms step_avg:148.96ms step:449/1480 train_time:65401ms step_avg:148.98ms step:450/1480 train_time:65558ms step_avg:149.00ms step:451/1480 train_time:65715ms step_avg:149.01ms step:452/1480 train_time:65872ms step_avg:149.03ms step:453/1480 train_time:66027ms step_avg:149.05ms step:454/1480 train_time:66183ms step_avg:149.06ms step:455/1480 train_time:66339ms step_avg:149.08ms step:456/1480 train_time:66495ms step_avg:149.09ms 
step:457/1480 train_time:66651ms step_avg:149.11ms step:458/1480 train_time:66808ms step_avg:149.12ms step:459/1480 train_time:66965ms step_avg:149.14ms step:460/1480 train_time:67121ms step_avg:149.16ms step:461/1480 train_time:67280ms step_avg:149.18ms step:462/1480 train_time:67437ms step_avg:149.20ms step:463/1480 train_time:67594ms step_avg:149.21ms step:464/1480 train_time:67751ms step_avg:149.23ms step:465/1480 train_time:67907ms step_avg:149.25ms step:466/1480 train_time:68064ms step_avg:149.26ms step:467/1480 train_time:68221ms step_avg:149.28ms step:468/1480 train_time:68377ms step_avg:149.29ms step:469/1480 train_time:68535ms step_avg:149.31ms step:470/1480 train_time:68693ms step_avg:149.33ms step:471/1480 train_time:68849ms step_avg:149.35ms step:472/1480 train_time:69007ms step_avg:149.37ms step:473/1480 train_time:69164ms step_avg:149.38ms step:474/1480 train_time:69319ms step_avg:149.39ms step:475/1480 train_time:69475ms step_avg:149.41ms step:476/1480 train_time:69633ms step_avg:149.43ms step:477/1480 train_time:69791ms step_avg:149.44ms step:478/1480 train_time:69947ms step_avg:149.46ms step:479/1480 train_time:70106ms step_avg:149.48ms step:480/1480 train_time:70265ms step_avg:149.50ms step:481/1480 train_time:70421ms step_avg:149.51ms step:482/1480 train_time:70577ms step_avg:149.53ms step:483/1480 train_time:70733ms step_avg:149.54ms step:484/1480 train_time:70892ms step_avg:149.56ms step:485/1480 train_time:71049ms step_avg:149.58ms step:486/1480 train_time:71207ms step_avg:149.60ms step:487/1480 train_time:71365ms step_avg:149.61ms step:488/1480 train_time:71523ms step_avg:149.63ms step:489/1480 train_time:71679ms step_avg:149.64ms step:490/1480 train_time:71835ms step_avg:149.66ms step:491/1480 train_time:71992ms step_avg:149.67ms step:492/1480 train_time:72149ms step_avg:149.69ms step:493/1480 train_time:72305ms step_avg:149.70ms step:494/1480 train_time:72462ms step_avg:149.72ms step:495/1480 train_time:72619ms step_avg:149.73ms 
step:496/1480 train_time:72778ms step_avg:149.75ms step:497/1480 train_time:72934ms step_avg:149.76ms step:498/1480 train_time:73091ms step_avg:149.78ms step:499/1480 train_time:73250ms step_avg:149.79ms step:500/1480 train_time:73409ms step_avg:149.81ms step:500/1480 val_loss:3.6859 train_time:73471ms step_avg:149.94ms step:501/1480 train_time:73569ms step_avg:149.84ms step:502/1480 train_time:73727ms step_avg:149.85ms step:503/1480 train_time:73883ms step_avg:149.86ms step:504/1480 train_time:74038ms step_avg:149.87ms step:505/1480 train_time:74193ms step_avg:149.88ms step:506/1480 train_time:74351ms step_avg:149.90ms step:507/1480 train_time:74508ms step_avg:149.91ms step:508/1480 train_time:74667ms step_avg:149.93ms step:509/1480 train_time:74823ms step_avg:149.95ms step:510/1480 train_time:74979ms step_avg:149.96ms step:511/1480 train_time:75136ms step_avg:149.97ms step:512/1480 train_time:75293ms step_avg:149.99ms step:513/1480 train_time:75450ms step_avg:150.00ms step:514/1480 train_time:75606ms step_avg:150.01ms step:515/1480 train_time:75763ms step_avg:150.03ms step:516/1480 train_time:75921ms step_avg:150.04ms step:517/1480 train_time:76079ms step_avg:150.06ms step:518/1480 train_time:76237ms step_avg:150.07ms step:519/1480 train_time:76395ms step_avg:150.09ms step:520/1480 train_time:76554ms step_avg:150.11ms step:521/1480 train_time:76712ms step_avg:150.12ms step:522/1480 train_time:76868ms step_avg:150.13ms step:523/1480 train_time:77025ms step_avg:150.15ms step:524/1480 train_time:77181ms step_avg:150.16ms step:525/1480 train_time:77337ms step_avg:150.17ms step:526/1480 train_time:77494ms step_avg:150.18ms step:527/1480 train_time:77651ms step_avg:150.19ms step:528/1480 train_time:77808ms step_avg:150.21ms step:529/1480 train_time:77966ms step_avg:150.22ms step:530/1480 train_time:78123ms step_avg:150.24ms step:531/1480 train_time:78281ms step_avg:150.25ms step:532/1480 train_time:78437ms step_avg:150.26ms step:533/1480 train_time:78594ms 
step_avg:150.27ms step:534/1480 train_time:78750ms step_avg:150.29ms step:535/1480 train_time:78906ms step_avg:150.30ms step:536/1480 train_time:79064ms step_avg:150.31ms step:537/1480 train_time:79222ms step_avg:150.33ms step:538/1480 train_time:79380ms step_avg:150.34ms step:539/1480 train_time:79538ms step_avg:150.35ms step:540/1480 train_time:79695ms step_avg:150.37ms step:541/1480 train_time:79852ms step_avg:150.38ms step:542/1480 train_time:80009ms step_avg:150.39ms step:543/1480 train_time:80165ms step_avg:150.40ms step:544/1480 train_time:80322ms step_avg:150.42ms step:545/1480 train_time:80478ms step_avg:150.43ms step:546/1480 train_time:80635ms step_avg:150.44ms step:547/1480 train_time:80793ms step_avg:150.45ms step:548/1480 train_time:80952ms step_avg:150.47ms step:549/1480 train_time:81108ms step_avg:150.48ms step:550/1480 train_time:81267ms step_avg:150.50ms step:551/1480 train_time:81425ms step_avg:150.51ms step:552/1480 train_time:81585ms step_avg:150.52ms step:553/1480 train_time:81743ms step_avg:150.54ms step:554/1480 train_time:81903ms step_avg:150.56ms step:555/1480 train_time:82063ms step_avg:150.57ms step:556/1480 train_time:82221ms step_avg:150.59ms step:557/1480 train_time:82382ms step_avg:150.61ms step:558/1480 train_time:82541ms step_avg:150.62ms step:559/1480 train_time:82700ms step_avg:150.64ms step:560/1480 train_time:82860ms step_avg:150.65ms step:561/1480 train_time:83019ms step_avg:150.67ms step:562/1480 train_time:83179ms step_avg:150.69ms step:563/1480 train_time:83340ms step_avg:150.70ms step:564/1480 train_time:83500ms step_avg:150.72ms step:565/1480 train_time:83659ms step_avg:150.74ms step:566/1480 train_time:83819ms step_avg:150.75ms step:567/1480 train_time:83979ms step_avg:150.77ms step:568/1480 train_time:84138ms step_avg:150.78ms step:569/1480 train_time:84298ms step_avg:150.80ms step:570/1480 train_time:84458ms step_avg:150.82ms step:571/1480 train_time:84617ms step_avg:150.83ms step:572/1480 train_time:84778ms 
step_avg:150.85ms step:573/1480 train_time:84938ms step_avg:150.87ms step:574/1480 train_time:85099ms step_avg:150.88ms step:575/1480 train_time:85259ms step_avg:150.90ms step:576/1480 train_time:85419ms step_avg:150.92ms step:577/1480 train_time:85579ms step_avg:150.93ms step:578/1480 train_time:85738ms step_avg:150.95ms step:579/1480 train_time:85898ms step_avg:150.96ms step:580/1480 train_time:86059ms step_avg:150.98ms step:581/1480 train_time:86219ms step_avg:151.00ms step:582/1480 train_time:86378ms step_avg:151.01ms step:583/1480 train_time:86538ms step_avg:151.03ms step:584/1480 train_time:86698ms step_avg:151.04ms step:585/1480 train_time:86858ms step_avg:151.06ms step:586/1480 train_time:87017ms step_avg:151.07ms step:587/1480 train_time:87178ms step_avg:151.09ms step:588/1480 train_time:87336ms step_avg:151.10ms step:589/1480 train_time:87497ms step_avg:151.12ms step:590/1480 train_time:87658ms step_avg:151.13ms step:591/1480 train_time:87817ms step_avg:151.15ms step:592/1480 train_time:87977ms step_avg:151.16ms step:593/1480 train_time:88138ms step_avg:151.18ms step:594/1480 train_time:88299ms step_avg:151.20ms step:595/1480 train_time:88461ms step_avg:151.22ms step:596/1480 train_time:88623ms step_avg:151.23ms step:597/1480 train_time:88782ms step_avg:151.25ms step:598/1480 train_time:88940ms step_avg:151.26ms step:599/1480 train_time:89100ms step_avg:151.27ms step:600/1480 train_time:89260ms step_avg:151.29ms step:601/1480 train_time:89419ms step_avg:151.30ms step:602/1480 train_time:89579ms step_avg:151.32ms step:603/1480 train_time:89739ms step_avg:151.33ms step:604/1480 train_time:89899ms step_avg:151.35ms step:605/1480 train_time:90059ms step_avg:151.36ms step:606/1480 train_time:90221ms step_avg:151.38ms step:607/1480 train_time:90382ms step_avg:151.39ms step:608/1480 train_time:90541ms step_avg:151.41ms step:609/1480 train_time:90701ms step_avg:151.42ms step:610/1480 train_time:90860ms step_avg:151.43ms step:611/1480 train_time:91020ms 
step_avg:151.45ms step:612/1480 train_time:91179ms step_avg:151.46ms step:613/1480 train_time:91339ms step_avg:151.47ms step:614/1480 train_time:91500ms step_avg:151.49ms step:615/1480 train_time:91660ms step_avg:151.50ms step:616/1480 train_time:91818ms step_avg:151.51ms step:617/1480 train_time:91979ms step_avg:151.53ms step:618/1480 train_time:92138ms step_avg:151.54ms step:619/1480 train_time:92299ms step_avg:151.56ms step:620/1480 train_time:92458ms step_avg:151.57ms step:621/1480 train_time:92617ms step_avg:151.58ms step:622/1480 train_time:92778ms step_avg:151.60ms step:623/1480 train_time:92939ms step_avg:151.61ms step:624/1480 train_time:93100ms step_avg:151.63ms step:625/1480 train_time:93259ms step_avg:151.64ms step:625/1480 val_loss:3.6065 train_time:93321ms step_avg:151.74ms step:626/1480 train_time:93422ms step_avg:151.66ms step:627/1480 train_time:93583ms step_avg:151.67ms step:628/1480 train_time:93742ms step_avg:151.69ms step:629/1480 train_time:93901ms step_avg:151.70ms step:630/1480 train_time:94060ms step_avg:151.71ms step:631/1480 train_time:94217ms step_avg:151.72ms step:632/1480 train_time:94374ms step_avg:151.73ms step:633/1480 train_time:94534ms step_avg:151.74ms step:634/1480 train_time:94693ms step_avg:151.75ms step:635/1480 train_time:94852ms step_avg:151.76ms step:636/1480 train_time:95011ms step_avg:151.78ms step:637/1480 train_time:95171ms step_avg:151.79ms step:638/1480 train_time:95329ms step_avg:151.80ms step:639/1480 train_time:95488ms step_avg:151.81ms step:640/1480 train_time:95647ms step_avg:151.82ms step:641/1480 train_time:95805ms step_avg:151.83ms step:642/1480 train_time:95964ms step_avg:151.84ms step:643/1480 train_time:96124ms step_avg:151.85ms step:644/1480 train_time:96282ms step_avg:151.87ms step:645/1480 train_time:96441ms step_avg:151.88ms step:646/1480 train_time:96602ms step_avg:151.89ms step:647/1480 train_time:96762ms step_avg:151.90ms step:648/1480 train_time:96924ms step_avg:151.92ms step:649/1480 
train_time:97084ms step_avg:151.93ms step:650/1480 train_time:97244ms step_avg:151.94ms step:651/1480 train_time:97403ms step_avg:151.96ms step:652/1480 train_time:97563ms step_avg:151.97ms step:653/1480 train_time:97723ms step_avg:151.98ms step:654/1480 train_time:97884ms step_avg:151.99ms step:655/1480 train_time:98044ms step_avg:152.01ms step:656/1480 train_time:98204ms step_avg:152.02ms step:657/1480 train_time:98366ms step_avg:152.03ms step:658/1480 train_time:98526ms step_avg:152.05ms step:659/1480 train_time:98687ms step_avg:152.06ms step:660/1480 train_time:98848ms step_avg:152.07ms step:661/1480 train_time:99009ms step_avg:152.09ms step:662/1480 train_time:99170ms step_avg:152.10ms step:663/1480 train_time:99329ms step_avg:152.11ms step:664/1480 train_time:99490ms step_avg:152.13ms step:665/1480 train_time:99652ms step_avg:152.14ms step:666/1480 train_time:99812ms step_avg:152.15ms step:667/1480 train_time:99973ms step_avg:152.17ms step:668/1480 train_time:100134ms step_avg:152.18ms step:669/1480 train_time:100297ms step_avg:152.20ms step:670/1480 train_time:100458ms step_avg:152.21ms step:671/1480 train_time:100618ms step_avg:152.22ms step:672/1480 train_time:100781ms step_avg:152.24ms step:673/1480 train_time:100947ms step_avg:152.26ms step:674/1480 train_time:101107ms step_avg:152.27ms step:675/1480 train_time:101270ms step_avg:152.29ms step:676/1480 train_time:101434ms step_avg:152.30ms step:677/1480 train_time:101592ms step_avg:152.31ms step:678/1480 train_time:101753ms step_avg:152.33ms step:679/1480 train_time:101915ms step_avg:152.34ms step:680/1480 train_time:102077ms step_avg:152.35ms step:681/1480 train_time:102238ms step_avg:152.37ms step:682/1480 train_time:102400ms step_avg:152.38ms step:683/1480 train_time:102564ms step_avg:152.40ms step:684/1480 train_time:102727ms step_avg:152.41ms step:685/1480 train_time:102889ms step_avg:152.43ms step:686/1480 train_time:103049ms step_avg:152.44ms step:687/1480 train_time:103208ms step_avg:152.45ms 
step:688/1480 train_time:103371ms step_avg:152.46ms step:689/1480 train_time:103533ms step_avg:152.48ms step:690/1480 train_time:103698ms step_avg:152.50ms step:691/1480 train_time:103860ms step_avg:152.51ms step:692/1480 train_time:104022ms step_avg:152.52ms step:693/1480 train_time:104184ms step_avg:152.54ms step:694/1480 train_time:104347ms step_avg:152.55ms step:695/1480 train_time:104507ms step_avg:152.56ms step:696/1480 train_time:104668ms step_avg:152.58ms step:697/1480 train_time:104830ms step_avg:152.59ms step:698/1480 train_time:104991ms step_avg:152.60ms step:699/1480 train_time:105153ms step_avg:152.62ms step:700/1480 train_time:105315ms step_avg:152.63ms step:701/1480 train_time:105476ms step_avg:152.64ms step:702/1480 train_time:105637ms step_avg:152.65ms step:703/1480 train_time:105798ms step_avg:152.67ms step:704/1480 train_time:105960ms step_avg:152.68ms step:705/1480 train_time:106125ms step_avg:152.70ms step:706/1480 train_time:106288ms step_avg:152.71ms step:707/1480 train_time:106452ms step_avg:152.73ms step:708/1480 train_time:106611ms step_avg:152.74ms step:709/1480 train_time:106773ms step_avg:152.75ms step:710/1480 train_time:106932ms step_avg:152.76ms step:711/1480 train_time:107094ms step_avg:152.77ms step:712/1480 train_time:107261ms step_avg:152.79ms step:713/1480 train_time:107426ms step_avg:152.81ms step:714/1480 train_time:107587ms step_avg:152.82ms step:715/1480 train_time:107747ms step_avg:152.83ms step:716/1480 train_time:107906ms step_avg:152.84ms step:717/1480 train_time:108068ms step_avg:152.85ms step:718/1480 train_time:108228ms step_avg:152.86ms step:719/1480 train_time:108387ms step_avg:152.87ms step:720/1480 train_time:108550ms step_avg:152.89ms step:721/1480 train_time:108712ms step_avg:152.90ms step:722/1480 train_time:108874ms step_avg:152.91ms step:723/1480 train_time:109033ms step_avg:152.92ms step:724/1480 train_time:109194ms step_avg:152.93ms step:725/1480 train_time:109361ms step_avg:152.95ms step:726/1480 
train_time:109525ms step_avg:152.97ms step:727/1480 train_time:109688ms step_avg:152.98ms step:728/1480 train_time:109848ms step_avg:152.99ms step:729/1480 train_time:110009ms step_avg:153.00ms step:730/1480 train_time:110172ms step_avg:153.02ms step:731/1480 train_time:110332ms step_avg:153.03ms step:732/1480 train_time:110493ms step_avg:153.04ms step:733/1480 train_time:110654ms step_avg:153.05ms step:734/1480 train_time:110815ms step_avg:153.06ms step:735/1480 train_time:110977ms step_avg:153.07ms step:736/1480 train_time:111141ms step_avg:153.09ms step:737/1480 train_time:111304ms step_avg:153.10ms step:738/1480 train_time:111466ms step_avg:153.11ms step:739/1480 train_time:111627ms step_avg:153.12ms step:740/1480 train_time:111791ms step_avg:153.14ms step:741/1480 train_time:111954ms step_avg:153.15ms step:742/1480 train_time:112115ms step_avg:153.16ms step:743/1480 train_time:112279ms step_avg:153.18ms step:744/1480 train_time:112445ms step_avg:153.20ms step:745/1480 train_time:112607ms step_avg:153.21ms step:746/1480 train_time:112768ms step_avg:153.22ms step:747/1480 train_time:112930ms step_avg:153.23ms step:748/1480 train_time:113095ms step_avg:153.25ms step:749/1480 train_time:113260ms step_avg:153.26ms step:750/1480 train_time:113420ms step_avg:153.27ms step:750/1480 val_loss:3.5499 train_time:113486ms step_avg:153.36ms step:751/1480 train_time:113589ms step_avg:153.29ms step:752/1480 train_time:113749ms step_avg:153.30ms step:753/1480 train_time:113909ms step_avg:153.31ms step:754/1480 train_time:114070ms step_avg:153.32ms step:755/1480 train_time:114231ms step_avg:153.33ms step:756/1480 train_time:114393ms step_avg:153.34ms step:757/1480 train_time:114557ms step_avg:153.36ms step:758/1480 train_time:114718ms step_avg:153.37ms step:759/1480 train_time:114883ms step_avg:153.38ms step:760/1480 train_time:115045ms step_avg:153.39ms step:761/1480 train_time:115207ms step_avg:153.40ms step:762/1480 train_time:115367ms step_avg:153.41ms step:763/1480 
train_time:115529ms step_avg:153.43ms step:764/1480 train_time:115691ms step_avg:153.44ms step:765/1480 train_time:115853ms step_avg:153.45ms step:766/1480 train_time:116016ms step_avg:153.46ms step:767/1480 train_time:116179ms step_avg:153.47ms step:768/1480 train_time:116342ms step_avg:153.49ms step:769/1480 train_time:116505ms step_avg:153.50ms step:770/1480 train_time:116667ms step_avg:153.51ms step:771/1480 train_time:116830ms step_avg:153.52ms step:772/1480 train_time:116991ms step_avg:153.53ms step:773/1480 train_time:117155ms step_avg:153.54ms step:774/1480 train_time:117318ms step_avg:153.56ms step:775/1480 train_time:117481ms step_avg:153.57ms step:776/1480 train_time:117646ms step_avg:153.58ms step:777/1480 train_time:117811ms step_avg:153.60ms step:778/1480 train_time:117973ms step_avg:153.61ms step:779/1480 train_time:118135ms step_avg:153.62ms step:780/1480 train_time:118301ms step_avg:153.64ms step:781/1480 train_time:118465ms step_avg:153.65ms step:782/1480 train_time:118629ms step_avg:153.66ms step:783/1480 train_time:118791ms step_avg:153.67ms step:784/1480 train_time:118953ms step_avg:153.69ms step:785/1480 train_time:119115ms step_avg:153.70ms step:786/1480 train_time:119282ms step_avg:153.71ms step:787/1480 train_time:119446ms step_avg:153.73ms step:788/1480 train_time:119610ms step_avg:153.74ms step:789/1480 train_time:119771ms step_avg:153.75ms step:790/1480 train_time:119936ms step_avg:153.76ms step:791/1480 train_time:120102ms step_avg:153.78ms step:792/1480 train_time:120267ms step_avg:153.79ms step:793/1480 train_time:120429ms step_avg:153.80ms step:794/1480 train_time:120592ms step_avg:153.82ms step:795/1480 train_time:120758ms step_avg:153.83ms step:796/1480 train_time:120926ms step_avg:153.85ms step:797/1480 train_time:121089ms step_avg:153.86ms step:798/1480 train_time:121252ms step_avg:153.87ms step:799/1480 train_time:121421ms step_avg:153.89ms step:800/1480 train_time:121586ms step_avg:153.91ms step:801/1480 train_time:121748ms 
step_avg:153.92ms step:802/1480 train_time:121914ms step_avg:153.93ms step:803/1480 train_time:122077ms step_avg:153.94ms step:804/1480 train_time:122239ms step_avg:153.95ms step:805/1480 train_time:122405ms step_avg:153.97ms step:806/1480 train_time:122565ms step_avg:153.98ms step:807/1480 train_time:122727ms step_avg:153.99ms step:808/1480 train_time:122891ms step_avg:154.00ms step:809/1480 train_time:123051ms step_avg:154.01ms step:810/1480 train_time:123215ms step_avg:154.02ms step:811/1480 train_time:123378ms step_avg:154.03ms step:812/1480 train_time:123543ms step_avg:154.04ms step:813/1480 train_time:123705ms step_avg:154.05ms step:814/1480 train_time:123867ms step_avg:154.06ms step:815/1480 train_time:124029ms step_avg:154.07ms step:816/1480 train_time:124194ms step_avg:154.09ms step:817/1480 train_time:124355ms step_avg:154.10ms step:818/1480 train_time:124517ms step_avg:154.11ms step:819/1480 train_time:124682ms step_avg:154.12ms step:820/1480 train_time:124846ms step_avg:154.13ms step:821/1480 train_time:125007ms step_avg:154.14ms step:822/1480 train_time:125171ms step_avg:154.15ms step:823/1480 train_time:125334ms step_avg:154.16ms step:824/1480 train_time:125498ms step_avg:154.17ms step:825/1480 train_time:125662ms step_avg:154.19ms step:826/1480 train_time:125830ms step_avg:154.20ms step:827/1480 train_time:125993ms step_avg:154.21ms step:828/1480 train_time:126156ms step_avg:154.22ms step:829/1480 train_time:126321ms step_avg:154.24ms step:830/1480 train_time:126487ms step_avg:154.25ms step:831/1480 train_time:126651ms step_avg:154.26ms step:832/1480 train_time:126814ms step_avg:154.28ms step:833/1480 train_time:126981ms step_avg:154.29ms step:834/1480 train_time:127146ms step_avg:154.30ms step:835/1480 train_time:127309ms step_avg:154.31ms step:836/1480 train_time:127474ms step_avg:154.33ms step:837/1480 train_time:127636ms step_avg:154.34ms step:838/1480 train_time:127800ms step_avg:154.35ms step:839/1480 train_time:127964ms step_avg:154.36ms 
step:840/1480 train_time:128126ms step_avg:154.37ms step:841/1480 train_time:128287ms step_avg:154.38ms step:842/1480 train_time:128450ms step_avg:154.39ms step:843/1480 train_time:128612ms step_avg:154.40ms step:844/1480 train_time:128775ms step_avg:154.41ms step:845/1480 train_time:128941ms step_avg:154.42ms step:846/1480 train_time:129106ms step_avg:154.43ms step:847/1480 train_time:129269ms step_avg:154.44ms step:848/1480 train_time:129431ms step_avg:154.45ms step:849/1480 train_time:129596ms step_avg:154.46ms step:850/1480 train_time:129759ms step_avg:154.48ms step:851/1480 train_time:129924ms step_avg:154.49ms step:852/1480 train_time:130087ms step_avg:154.50ms step:853/1480 train_time:130249ms step_avg:154.51ms step:854/1480 train_time:130414ms step_avg:154.52ms step:855/1480 train_time:130578ms step_avg:154.53ms step:856/1480 train_time:130740ms step_avg:154.54ms step:857/1480 train_time:130906ms step_avg:154.55ms step:858/1480 train_time:131070ms step_avg:154.56ms step:859/1480 train_time:131232ms step_avg:154.57ms step:860/1480 train_time:131394ms step_avg:154.58ms step:861/1480 train_time:131560ms step_avg:154.59ms step:862/1480 train_time:131727ms step_avg:154.61ms step:863/1480 train_time:131894ms step_avg:154.62ms step:864/1480 train_time:132058ms step_avg:154.63ms step:865/1480 train_time:132221ms step_avg:154.64ms step:866/1480 train_time:132388ms step_avg:154.66ms step:867/1480 train_time:132551ms step_avg:154.67ms step:868/1480 train_time:132711ms step_avg:154.68ms step:869/1480 train_time:132873ms step_avg:154.68ms step:870/1480 train_time:133039ms step_avg:154.70ms step:871/1480 train_time:133203ms step_avg:154.71ms step:872/1480 train_time:133366ms step_avg:154.72ms step:873/1480 train_time:133530ms step_avg:154.73ms step:874/1480 train_time:133696ms step_avg:154.74ms step:875/1480 train_time:133863ms step_avg:154.75ms step:875/1480 val_loss:3.5041 train_time:133928ms step_avg:154.83ms step:876/1480 train_time:134030ms step_avg:154.77ms 
step:877/1480 train_time:134195ms step_avg:154.78ms step:878/1480 train_time:134358ms step_avg:154.79ms step:879/1480 train_time:134521ms step_avg:154.80ms step:880/1480 train_time:134686ms step_avg:154.81ms step:881/1480 train_time:134849ms step_avg:154.82ms step:882/1480 train_time:135015ms step_avg:154.83ms step:883/1480 train_time:135180ms step_avg:154.85ms step:884/1480 train_time:135348ms step_avg:154.86ms step:885/1480 train_time:135513ms step_avg:154.87ms step:886/1480 train_time:135679ms step_avg:154.88ms step:887/1480 train_time:135847ms step_avg:154.90ms step:888/1480 train_time:136020ms step_avg:154.92ms step:889/1480 train_time:136188ms step_avg:154.94ms step:890/1480 train_time:136351ms step_avg:154.94ms step:891/1480 train_time:136516ms step_avg:154.96ms step:892/1480 train_time:136680ms step_avg:154.97ms step:893/1480 train_time:136842ms step_avg:154.97ms step:894/1480 train_time:137010ms step_avg:154.99ms step:895/1480 train_time:137175ms step_avg:155.00ms step:896/1480 train_time:137340ms step_avg:155.01ms step:897/1480 train_time:137507ms step_avg:155.03ms step:898/1480 train_time:137675ms step_avg:155.04ms step:899/1480 train_time:137839ms step_avg:155.05ms step:900/1480 train_time:138002ms step_avg:155.06ms step:901/1480 train_time:138166ms step_avg:155.07ms step:902/1480 train_time:138332ms step_avg:155.08ms step:903/1480 train_time:138502ms step_avg:155.10ms step:904/1480 train_time:138672ms step_avg:155.11ms step:905/1480 train_time:138835ms step_avg:155.12ms step:906/1480 train_time:139000ms step_avg:155.13ms step:907/1480 train_time:139170ms step_avg:155.15ms step:908/1480 train_time:139333ms step_avg:155.16ms step:909/1480 train_time:139497ms step_avg:155.17ms step:910/1480 train_time:139671ms step_avg:155.19ms step:911/1480 train_time:139836ms step_avg:155.20ms step:912/1480 train_time:140002ms step_avg:155.21ms step:913/1480 train_time:140170ms step_avg:155.23ms step:914/1480 train_time:140338ms step_avg:155.24ms step:915/1480 
train_time:140508ms step_avg:155.26ms step:916/1480 train_time:140673ms step_avg:155.27ms step:917/1480 train_time:140835ms step_avg:155.28ms step:918/1480 train_time:141002ms step_avg:155.29ms step:919/1480 train_time:141175ms step_avg:155.31ms step:920/1480 train_time:141340ms step_avg:155.32ms step:921/1480 train_time:141508ms step_avg:155.33ms step:922/1480 train_time:141675ms step_avg:155.35ms step:923/1480 train_time:141838ms step_avg:155.35ms step:924/1480 train_time:142002ms step_avg:155.36ms step:925/1480 train_time:142170ms step_avg:155.38ms step:926/1480 train_time:142333ms step_avg:155.39ms step:927/1480 train_time:142496ms step_avg:155.39ms step:928/1480 train_time:142663ms step_avg:155.41ms step:929/1480 train_time:142830ms step_avg:155.42ms step:930/1480 train_time:142996ms step_avg:155.43ms step:931/1480 train_time:143158ms step_avg:155.44ms step:932/1480 train_time:143324ms step_avg:155.45ms step:933/1480 train_time:143493ms step_avg:155.46ms step:934/1480 train_time:143660ms step_avg:155.48ms step:935/1480 train_time:143832ms step_avg:155.49ms step:936/1480 train_time:143999ms step_avg:155.51ms step:937/1480 train_time:144171ms step_avg:155.52ms step:938/1480 train_time:144333ms step_avg:155.53ms step:939/1480 train_time:144502ms step_avg:155.55ms step:940/1480 train_time:144670ms step_avg:155.56ms step:941/1480 train_time:144834ms step_avg:155.57ms step:942/1480 train_time:144998ms step_avg:155.58ms step:943/1480 train_time:145169ms step_avg:155.59ms step:944/1480 train_time:145340ms step_avg:155.61ms step:945/1480 train_time:145503ms step_avg:155.62ms step:946/1480 train_time:145675ms step_avg:155.64ms step:947/1480 train_time:145842ms step_avg:155.65ms step:948/1480 train_time:146008ms step_avg:155.66ms step:949/1480 train_time:146175ms step_avg:155.67ms step:950/1480 train_time:146338ms step_avg:155.68ms step:951/1480 train_time:146506ms step_avg:155.69ms step:952/1480 train_time:146672ms step_avg:155.70ms step:953/1480 train_time:146839ms 
step_avg:155.72ms step:954/1480 train_time:147008ms step_avg:155.73ms step:955/1480 train_time:147173ms step_avg:155.74ms step:956/1480 train_time:147337ms step_avg:155.75ms step:957/1480 train_time:147506ms step_avg:155.76ms step:958/1480 train_time:147675ms step_avg:155.78ms step:959/1480 train_time:147840ms step_avg:155.78ms step:960/1480 train_time:148009ms step_avg:155.80ms step:961/1480 train_time:148175ms step_avg:155.81ms step:962/1480 train_time:148339ms step_avg:155.82ms step:963/1480 train_time:148503ms step_avg:155.83ms step:964/1480 train_time:148672ms step_avg:155.84ms step:965/1480 train_time:148836ms step_avg:155.85ms step:966/1480 train_time:149000ms step_avg:155.86ms step:967/1480 train_time:149165ms step_avg:155.87ms step:968/1480 train_time:149330ms step_avg:155.88ms step:969/1480 train_time:149497ms step_avg:155.89ms step:970/1480 train_time:149660ms step_avg:155.90ms step:971/1480 train_time:149827ms step_avg:155.91ms step:972/1480 train_time:149992ms step_avg:155.92ms step:973/1480 train_time:150155ms step_avg:155.92ms step:974/1480 train_time:150323ms step_avg:155.94ms step:975/1480 train_time:150489ms step_avg:155.95ms step:976/1480 train_time:150654ms step_avg:155.96ms step:977/1480 train_time:150818ms step_avg:155.96ms step:978/1480 train_time:150982ms step_avg:155.97ms step:979/1480 train_time:151149ms step_avg:155.98ms step:980/1480 train_time:151316ms step_avg:156.00ms step:981/1480 train_time:151485ms step_avg:156.01ms step:982/1480 train_time:151649ms step_avg:156.02ms step:983/1480 train_time:151814ms step_avg:156.03ms step:984/1480 train_time:151978ms step_avg:156.04ms step:985/1480 train_time:152148ms step_avg:156.05ms step:986/1480 train_time:152315ms step_avg:156.06ms step:987/1480 train_time:152478ms step_avg:156.07ms step:988/1480 train_time:152645ms step_avg:156.08ms step:989/1480 train_time:152812ms step_avg:156.09ms step:990/1480 train_time:152981ms step_avg:156.10ms step:991/1480 train_time:153148ms step_avg:156.11ms 
step:992/1480 train_time:153322ms step_avg:156.13ms step:993/1480 train_time:153497ms step_avg:156.15ms step:994/1480 train_time:153661ms step_avg:156.16ms step:995/1480 train_time:153827ms step_avg:156.17ms step:996/1480 train_time:153991ms step_avg:156.18ms step:997/1480 train_time:154156ms step_avg:156.19ms step:998/1480 train_time:154319ms step_avg:156.19ms step:999/1480 train_time:154485ms step_avg:156.20ms step:1000/1480 train_time:154654ms step_avg:156.22ms step:1000/1480 val_loss:3.4395 train_time:154722ms step_avg:156.28ms step:1001/1480 train_time:154824ms step_avg:156.23ms step:1002/1480 train_time:154990ms step_avg:156.24ms step:1003/1480 train_time:155159ms step_avg:156.25ms step:1004/1480 train_time:155329ms step_avg:156.27ms step:1005/1480 train_time:155497ms step_avg:156.28ms step:1006/1480 train_time:155665ms step_avg:156.29ms step:1007/1480 train_time:155830ms step_avg:156.30ms step:1008/1480 train_time:155999ms step_avg:156.31ms step:1009/1480 train_time:156173ms step_avg:156.33ms step:1010/1480 train_time:156339ms step_avg:156.34ms step:1011/1480 train_time:156506ms step_avg:156.35ms step:1012/1480 train_time:156673ms step_avg:156.36ms step:1013/1480 train_time:156842ms step_avg:156.37ms step:1014/1480 train_time:157011ms step_avg:156.39ms step:1015/1480 train_time:157179ms step_avg:156.40ms step:1016/1480 train_time:157348ms step_avg:156.41ms step:1017/1480 train_time:157519ms step_avg:156.42ms step:1018/1480 train_time:157689ms step_avg:156.44ms step:1019/1480 train_time:157856ms step_avg:156.45ms step:1020/1480 train_time:158026ms step_avg:156.46ms step:1021/1480 train_time:158191ms step_avg:156.47ms step:1022/1480 train_time:158356ms step_avg:156.48ms step:1023/1480 train_time:158523ms step_avg:156.49ms step:1024/1480 train_time:158691ms step_avg:156.50ms step:1025/1480 train_time:158861ms step_avg:156.51ms step:1026/1480 train_time:159027ms step_avg:156.52ms step:1027/1480 train_time:159193ms step_avg:156.53ms step:1028/1480 
train_time:159365ms step_avg:156.55ms step:1029/1480 train_time:159541ms step_avg:156.57ms step:1030/1480 train_time:159710ms step_avg:156.58ms step:1031/1480 train_time:159875ms step_avg:156.59ms step:1032/1480 train_time:160048ms step_avg:156.60ms step:1033/1480 train_time:160215ms step_avg:156.61ms step:1034/1480 train_time:160383ms step_avg:156.62ms step:1035/1480 train_time:160551ms step_avg:156.63ms step:1036/1480 train_time:160715ms step_avg:156.64ms step:1037/1480 train_time:160883ms step_avg:156.65ms step:1038/1480 train_time:161051ms step_avg:156.66ms step:1039/1480 train_time:161223ms step_avg:156.68ms step:1040/1480 train_time:161392ms step_avg:156.69ms step:1041/1480 train_time:161557ms step_avg:156.70ms step:1042/1480 train_time:161721ms step_avg:156.71ms step:1043/1480 train_time:161887ms step_avg:156.72ms step:1044/1480 train_time:162052ms step_avg:156.72ms step:1045/1480 train_time:162221ms step_avg:156.74ms step:1046/1480 train_time:162390ms step_avg:156.75ms step:1047/1480 train_time:162556ms step_avg:156.76ms step:1048/1480 train_time:162722ms step_avg:156.76ms step:1049/1480 train_time:162888ms step_avg:156.77ms step:1050/1480 train_time:163056ms step_avg:156.78ms step:1051/1480 train_time:163227ms step_avg:156.80ms step:1052/1480 train_time:163396ms step_avg:156.81ms step:1053/1480 train_time:163562ms step_avg:156.82ms step:1054/1480 train_time:163730ms step_avg:156.83ms step:1055/1480 train_time:163896ms step_avg:156.84ms step:1056/1480 train_time:164062ms step_avg:156.85ms step:1057/1480 train_time:164229ms step_avg:156.86ms step:1058/1480 train_time:164397ms step_avg:156.87ms step:1059/1480 train_time:164570ms step_avg:156.88ms step:1060/1480 train_time:164738ms step_avg:156.89ms step:1061/1480 train_time:164902ms step_avg:156.90ms step:1062/1480 train_time:165069ms step_avg:156.91ms step:1063/1480 train_time:165233ms step_avg:156.92ms step:1064/1480 train_time:165395ms step_avg:156.92ms step:1065/1480 train_time:165562ms step_avg:156.93ms 
step:1066/1480 train_time:165730ms step_avg:156.94ms step:1067/1480 train_time:165898ms step_avg:156.95ms step:1068/1480 train_time:166064ms step_avg:156.96ms step:1069/1480 train_time:166234ms step_avg:156.97ms step:1070/1480 train_time:166400ms step_avg:156.98ms step:1071/1480 train_time:166572ms step_avg:157.00ms step:1072/1480 train_time:166738ms step_avg:157.00ms step:1073/1480 train_time:166902ms step_avg:157.01ms step:1074/1480 train_time:167070ms step_avg:157.02ms step:1075/1480 train_time:167240ms step_avg:157.03ms step:1076/1480 train_time:167408ms step_avg:157.04ms step:1077/1480 train_time:167574ms step_avg:157.05ms step:1078/1480 train_time:167749ms step_avg:157.07ms step:1079/1480 train_time:167922ms step_avg:157.08ms step:1080/1480 train_time:168092ms step_avg:157.10ms step:1081/1480 train_time:168259ms step_avg:157.10ms step:1082/1480 train_time:168427ms step_avg:157.11ms step:1083/1480 train_time:168594ms step_avg:157.12ms step:1084/1480 train_time:168761ms step_avg:157.13ms step:1085/1480 train_time:168932ms step_avg:157.15ms step:1086/1480 train_time:169099ms step_avg:157.16ms step:1087/1480 train_time:169267ms step_avg:157.16ms step:1088/1480 train_time:169437ms step_avg:157.18ms step:1089/1480 train_time:169610ms step_avg:157.19ms step:1090/1480 train_time:169780ms step_avg:157.20ms step:1091/1480 train_time:169948ms step_avg:157.21ms step:1092/1480 train_time:170115ms step_avg:157.22ms step:1093/1480 train_time:170283ms step_avg:157.23ms step:1094/1480 train_time:170450ms step_avg:157.24ms step:1095/1480 train_time:170614ms step_avg:157.25ms step:1096/1480 train_time:170783ms step_avg:157.26ms step:1097/1480 train_time:170950ms step_avg:157.27ms step:1098/1480 train_time:171120ms step_avg:157.28ms step:1099/1480 train_time:171294ms step_avg:157.29ms step:1100/1480 train_time:171465ms step_avg:157.31ms step:1101/1480 train_time:171634ms step_avg:157.32ms step:1102/1480 train_time:171806ms step_avg:157.33ms step:1103/1480 train_time:171982ms 
step_avg:157.35ms step:1104/1480 train_time:172151ms step_avg:157.36ms step:1105/1480 train_time:172320ms step_avg:157.37ms step:1106/1480 train_time:172489ms step_avg:157.38ms step:1107/1480 train_time:172658ms step_avg:157.39ms step:1108/1480 train_time:172822ms step_avg:157.40ms step:1109/1480 train_time:172990ms step_avg:157.41ms step:1110/1480 train_time:173155ms step_avg:157.41ms step:1111/1480 train_time:173322ms step_avg:157.42ms step:1112/1480 train_time:173494ms step_avg:157.44ms step:1113/1480 train_time:173673ms step_avg:157.46ms step:1114/1480 train_time:173846ms step_avg:157.47ms step:1115/1480 train_time:174017ms step_avg:157.48ms step:1116/1480 train_time:174185ms step_avg:157.49ms step:1117/1480 train_time:174357ms step_avg:157.50ms step:1118/1480 train_time:174532ms step_avg:157.52ms step:1119/1480 train_time:174700ms step_avg:157.53ms step:1120/1480 train_time:174870ms step_avg:157.54ms step:1121/1480 train_time:175037ms step_avg:157.55ms step:1122/1480 train_time:175206ms step_avg:157.56ms step:1123/1480 train_time:175371ms step_avg:157.57ms step:1124/1480 train_time:175540ms step_avg:157.58ms step:1125/1480 train_time:175709ms step_avg:157.59ms step:1125/1480 val_loss:3.3842 train_time:175777ms step_avg:157.65ms step:1126/1480 train_time:175880ms step_avg:157.60ms step:1127/1480 train_time:176048ms step_avg:157.61ms step:1128/1480 train_time:176220ms step_avg:157.62ms step:1129/1480 train_time:176393ms step_avg:157.63ms step:1130/1480 train_time:176563ms step_avg:157.65ms step:1131/1480 train_time:176743ms step_avg:157.67ms step:1132/1480 train_time:176908ms step_avg:157.67ms step:1133/1480 train_time:177080ms step_avg:157.68ms step:1134/1480 train_time:177250ms step_avg:157.70ms step:1135/1480 train_time:177419ms step_avg:157.71ms step:1136/1480 train_time:177589ms step_avg:157.72ms step:1137/1480 train_time:177760ms step_avg:157.73ms step:1138/1480 train_time:177931ms step_avg:157.74ms step:1139/1480 train_time:178100ms step_avg:157.75ms 
step:1140/1480 train_time:178269ms step_avg:157.76ms step:1141/1480 train_time:178443ms step_avg:157.77ms step:1142/1480 train_time:178609ms step_avg:157.78ms step:1143/1480 train_time:178781ms step_avg:157.79ms step:1144/1480 train_time:178948ms step_avg:157.80ms step:1145/1480 train_time:179113ms step_avg:157.81ms step:1146/1480 train_time:179283ms step_avg:157.82ms step:1147/1480 train_time:179452ms step_avg:157.83ms step:1148/1480 train_time:179621ms step_avg:157.84ms step:1149/1480 train_time:179791ms step_avg:157.85ms step:1150/1480 train_time:179960ms step_avg:157.86ms step:1151/1480 train_time:180132ms step_avg:157.87ms step:1152/1480 train_time:180304ms step_avg:157.88ms step:1153/1480 train_time:180478ms step_avg:157.90ms step:1154/1480 train_time:180645ms step_avg:157.91ms step:1155/1480 train_time:180817ms step_avg:157.92ms step:1156/1480 train_time:180997ms step_avg:157.94ms step:1157/1480 train_time:181167ms step_avg:157.95ms step:1158/1480 train_time:181335ms step_avg:157.96ms step:1159/1480 train_time:181503ms step_avg:157.97ms step:1160/1480 train_time:181668ms step_avg:157.97ms step:1161/1480 train_time:181839ms step_avg:157.98ms step:1162/1480 train_time:182008ms step_avg:157.99ms step:1163/1480 train_time:182178ms step_avg:158.00ms step:1164/1480 train_time:182347ms step_avg:158.01ms step:1165/1480 train_time:182511ms step_avg:158.02ms step:1166/1480 train_time:182681ms step_avg:158.03ms step:1167/1480 train_time:182849ms step_avg:158.04ms step:1168/1480 train_time:183018ms step_avg:158.05ms step:1169/1480 train_time:183185ms step_avg:158.05ms step:1170/1480 train_time:183352ms step_avg:158.06ms step:1171/1480 train_time:183521ms step_avg:158.07ms step:1172/1480 train_time:183685ms step_avg:158.08ms step:1173/1480 train_time:183859ms step_avg:158.09ms step:1174/1480 train_time:184042ms step_avg:158.11ms step:1175/1480 train_time:184212ms step_avg:158.12ms step:1176/1480 train_time:184383ms step_avg:158.13ms step:1177/1480 train_time:184559ms 
step_avg:158.15ms step:1178/1480 train_time:184727ms step_avg:158.16ms step:1179/1480 train_time:184894ms step_avg:158.16ms step:1180/1480 train_time:185074ms step_avg:158.18ms step:1181/1480 train_time:185244ms step_avg:158.19ms step:1182/1480 train_time:185412ms step_avg:158.20ms step:1183/1480 train_time:185583ms step_avg:158.21ms step:1184/1480 train_time:185751ms step_avg:158.22ms step:1185/1480 train_time:185925ms step_avg:158.23ms step:1186/1480 train_time:186096ms step_avg:158.24ms step:1187/1480 train_time:186281ms step_avg:158.27ms step:1188/1480 train_time:186448ms step_avg:158.27ms step:1189/1480 train_time:186620ms step_avg:158.29ms step:1190/1480 train_time:186786ms step_avg:158.29ms step:1191/1480 train_time:186959ms step_avg:158.31ms step:1192/1480 train_time:187126ms step_avg:158.31ms step:1193/1480 train_time:187294ms step_avg:158.32ms step:1194/1480 train_time:187463ms step_avg:158.33ms step:1195/1480 train_time:187636ms step_avg:158.34ms step:1196/1480 train_time:187820ms step_avg:158.36ms step:1197/1480 train_time:187991ms step_avg:158.37ms step:1198/1480 train_time:188174ms step_avg:158.40ms step:1199/1480 train_time:188345ms step_avg:158.41ms step:1200/1480 train_time:188515ms step_avg:158.42ms step:1201/1480 train_time:188682ms step_avg:158.42ms step:1202/1480 train_time:188862ms step_avg:158.44ms step:1203/1480 train_time:189039ms step_avg:158.46ms step:1204/1480 train_time:189214ms step_avg:158.47ms step:1205/1480 train_time:189382ms step_avg:158.48ms step:1206/1480 train_time:189549ms step_avg:158.49ms step:1207/1480 train_time:189719ms step_avg:158.50ms step:1208/1480 train_time:189886ms step_avg:158.50ms step:1209/1480 train_time:190060ms step_avg:158.52ms step:1210/1480 train_time:190235ms step_avg:158.53ms step:1211/1480 train_time:190409ms step_avg:158.54ms step:1212/1480 train_time:190582ms step_avg:158.55ms step:1213/1480 train_time:190753ms step_avg:158.56ms step:1214/1480 train_time:190929ms step_avg:158.58ms step:1215/1480 
train_time:191101ms step_avg:158.59ms step:1216/1480 train_time:191269ms step_avg:158.60ms step:1217/1480 train_time:191443ms step_avg:158.61ms step:1218/1480 train_time:191611ms step_avg:158.62ms step:1219/1480 train_time:191789ms step_avg:158.63ms step:1220/1480 train_time:191959ms step_avg:158.64ms step:1221/1480 train_time:192128ms step_avg:158.65ms step:1222/1480 train_time:192296ms step_avg:158.66ms step:1223/1480 train_time:192465ms step_avg:158.67ms step:1224/1480 train_time:192645ms step_avg:158.69ms step:1225/1480 train_time:192816ms step_avg:158.70ms step:1226/1480 train_time:192988ms step_avg:158.71ms step:1227/1480 train_time:193161ms step_avg:158.72ms step:1228/1480 train_time:193331ms step_avg:158.73ms step:1229/1480 train_time:193503ms step_avg:158.74ms step:1230/1480 train_time:193683ms step_avg:158.76ms step:1231/1480 train_time:193858ms step_avg:158.77ms step:1232/1480 train_time:194033ms step_avg:158.78ms step:1233/1480 train_time:194204ms step_avg:158.79ms step:1234/1480 train_time:194373ms step_avg:158.80ms step:1235/1480 train_time:194546ms step_avg:158.81ms step:1236/1480 train_time:194715ms step_avg:158.82ms step:1237/1480 train_time:194885ms step_avg:158.83ms step:1238/1480 train_time:195070ms step_avg:158.85ms step:1239/1480 train_time:195243ms step_avg:158.86ms step:1240/1480 train_time:195412ms step_avg:158.87ms step:1241/1480 train_time:195584ms step_avg:158.88ms step:1242/1480 train_time:195753ms step_avg:158.89ms step:1243/1480 train_time:195928ms step_avg:158.90ms step:1244/1480 train_time:196094ms step_avg:158.91ms step:1245/1480 train_time:196264ms step_avg:158.92ms step:1246/1480 train_time:196435ms step_avg:158.93ms step:1247/1480 train_time:196605ms step_avg:158.94ms step:1248/1480 train_time:196774ms step_avg:158.95ms step:1249/1480 train_time:196943ms step_avg:158.95ms step:1250/1480 train_time:197113ms step_avg:158.96ms step:1250/1480 val_loss:3.3352 train_time:197186ms step_avg:159.02ms step:1251/1480 train_time:197293ms 
step_avg:158.98ms step:1252/1480 train_time:197462ms step_avg:158.99ms step:1253/1480 train_time:197629ms step_avg:158.99ms step:1254/1480 train_time:197799ms step_avg:159.00ms step:1255/1480 train_time:197986ms step_avg:159.02ms step:1256/1480 train_time:198160ms step_avg:159.04ms step:1257/1480 train_time:198330ms step_avg:159.05ms step:1258/1480 train_time:198506ms step_avg:159.06ms step:1259/1480 train_time:198677ms step_avg:159.07ms step:1260/1480 train_time:198845ms step_avg:159.08ms step:1261/1480 train_time:199017ms step_avg:159.09ms step:1262/1480 train_time:199193ms step_avg:159.10ms step:1263/1480 train_time:199367ms step_avg:159.11ms step:1264/1480 train_time:199533ms step_avg:159.12ms step:1265/1480 train_time:199702ms step_avg:159.12ms step:1266/1480 train_time:199873ms step_avg:159.13ms step:1267/1480 train_time:200045ms step_avg:159.14ms step:1268/1480 train_time:200215ms step_avg:159.15ms step:1269/1480 train_time:200392ms step_avg:159.17ms step:1270/1480 train_time:200562ms step_avg:159.18ms step:1271/1480 train_time:200731ms step_avg:159.18ms step:1272/1480 train_time:200897ms step_avg:159.19ms step:1273/1480 train_time:201068ms step_avg:159.20ms step:1274/1480 train_time:201242ms step_avg:159.21ms step:1275/1480 train_time:201408ms step_avg:159.22ms step:1276/1480 train_time:201573ms step_avg:159.22ms step:1277/1480 train_time:201747ms step_avg:159.23ms step:1278/1480 train_time:201914ms step_avg:159.24ms step:1279/1480 train_time:202087ms step_avg:159.25ms step:1280/1480 train_time:202267ms step_avg:159.27ms step:1281/1480 train_time:202435ms step_avg:159.27ms step:1282/1480 train_time:202601ms step_avg:159.28ms step:1283/1480 train_time:202771ms step_avg:159.29ms step:1284/1480 train_time:202943ms step_avg:159.30ms step:1285/1480 train_time:203112ms step_avg:159.30ms step:1286/1480 train_time:203283ms step_avg:159.31ms step:1287/1480 train_time:203455ms step_avg:159.32ms step:1288/1480 train_time:203627ms step_avg:159.33ms step:1289/1480 
train_time:203808ms step_avg:159.35ms step:1290/1480 train_time:203988ms step_avg:159.37ms step:1291/1480 train_time:204161ms step_avg:159.38ms step:1292/1480 train_time:204334ms step_avg:159.39ms step:1293/1480 train_time:204509ms step_avg:159.40ms step:1294/1480 train_time:204681ms step_avg:159.41ms step:1295/1480 train_time:204852ms step_avg:159.42ms step:1296/1480 train_time:205027ms step_avg:159.43ms step:1297/1480 train_time:205199ms step_avg:159.44ms step:1298/1480 train_time:205370ms step_avg:159.45ms step:1299/1480 train_time:205540ms step_avg:159.46ms step:1300/1480 train_time:205708ms step_avg:159.46ms step:1301/1480 train_time:205877ms step_avg:159.47ms step:1302/1480 train_time:206051ms step_avg:159.48ms step:1303/1480 train_time:206227ms step_avg:159.50ms step:1304/1480 train_time:206401ms step_avg:159.51ms step:1305/1480 train_time:206570ms step_avg:159.51ms step:1306/1480 train_time:206745ms step_avg:159.53ms step:1307/1480 train_time:206912ms step_avg:159.53ms step:1308/1480 train_time:207083ms step_avg:159.54ms step:1309/1480 train_time:207254ms step_avg:159.55ms step:1310/1480 train_time:207424ms step_avg:159.56ms step:1311/1480 train_time:207593ms step_avg:159.56ms step:1312/1480 train_time:207767ms step_avg:159.58ms step:1313/1480 train_time:207935ms step_avg:159.58ms step:1314/1480 train_time:208107ms step_avg:159.59ms step:1315/1480 train_time:208277ms step_avg:159.60ms step:1316/1480 train_time:208444ms step_avg:159.61ms step:1317/1480 train_time:208614ms step_avg:159.61ms step:1318/1480 train_time:208795ms step_avg:159.63ms step:1319/1480 train_time:208971ms step_avg:159.64ms step:1320/1480 train_time:209149ms step_avg:159.66ms step:1321/1480 train_time:209322ms step_avg:159.67ms step:1322/1480 train_time:209503ms step_avg:159.68ms step:1323/1480 train_time:209675ms step_avg:159.69ms step:1324/1480 train_time:209850ms step_avg:159.70ms step:1325/1480 train_time:210030ms step_avg:159.72ms step:1326/1480 train_time:210207ms step_avg:159.73ms 
step:1327/1480 train_time:210376ms step_avg:159.74ms step:1328/1480 train_time:210548ms step_avg:159.75ms step:1329/1480 train_time:210745ms step_avg:159.78ms step:1330/1480 train_time:210926ms step_avg:159.79ms step:1331/1480 train_time:211096ms step_avg:159.80ms step:1332/1480 train_time:211271ms step_avg:159.81ms step:1333/1480 train_time:211446ms step_avg:159.82ms step:1334/1480 train_time:211617ms step_avg:159.83ms step:1335/1480 train_time:211787ms step_avg:159.84ms step:1336/1480 train_time:211970ms step_avg:159.86ms step:1337/1480 train_time:212146ms step_avg:159.87ms step:1338/1480 train_time:212319ms step_avg:159.88ms step:1339/1480 train_time:212493ms step_avg:159.89ms step:1340/1480 train_time:212666ms step_avg:159.90ms step:1341/1480 train_time:212834ms step_avg:159.91ms step:1342/1480 train_time:213007ms step_avg:159.91ms step:1343/1480 train_time:213176ms step_avg:159.92ms step:1344/1480 train_time:213349ms step_avg:159.93ms step:1345/1480 train_time:213528ms step_avg:159.95ms step:1346/1480 train_time:213697ms step_avg:159.95ms step:1347/1480 train_time:213868ms step_avg:159.96ms step:1348/1480 train_time:214038ms step_avg:159.97ms step:1349/1480 train_time:214208ms step_avg:159.98ms step:1350/1480 train_time:214385ms step_avg:159.99ms step:1351/1480 train_time:214556ms step_avg:160.00ms step:1352/1480 train_time:214726ms step_avg:160.00ms step:1353/1480 train_time:214902ms step_avg:160.02ms step:1354/1480 train_time:215073ms step_avg:160.02ms step:1355/1480 train_time:215242ms step_avg:160.03ms step:1356/1480 train_time:215414ms step_avg:160.04ms step:1357/1480 train_time:215587ms step_avg:160.05ms step:1358/1480 train_time:215760ms step_avg:160.06ms step:1359/1480 train_time:215930ms step_avg:160.07ms step:1360/1480 train_time:216104ms step_avg:160.08ms step:1361/1480 train_time:216281ms step_avg:160.09ms step:1362/1480 train_time:216455ms step_avg:160.10ms step:1363/1480 train_time:216637ms step_avg:160.12ms step:1364/1480 train_time:216805ms 
step_avg:160.12ms step:1365/1480 train_time:216972ms step_avg:160.13ms step:1366/1480 train_time:217145ms step_avg:160.14ms step:1367/1480 train_time:217316ms step_avg:160.14ms step:1368/1480 train_time:217490ms step_avg:160.15ms step:1369/1480 train_time:217673ms step_avg:160.17ms step:1370/1480 train_time:217850ms step_avg:160.18ms step:1371/1480 train_time:218022ms step_avg:160.19ms step:1372/1480 train_time:218198ms step_avg:160.20ms step:1373/1480 train_time:218367ms step_avg:160.21ms step:1374/1480 train_time:218542ms step_avg:160.22ms step:1375/1480 train_time:218711ms step_avg:160.23ms step:1375/1480 val_loss:3.2961 train_time:218778ms step_avg:160.28ms step:1376/1480 train_time:218884ms step_avg:160.24ms step:1377/1480 train_time:219057ms step_avg:160.25ms step:1378/1480 train_time:219226ms step_avg:160.25ms step:1379/1480 train_time:219402ms step_avg:160.26ms step:1380/1480 train_time:219577ms step_avg:160.28ms step:1381/1480 train_time:219756ms step_avg:160.29ms step:1382/1480 train_time:219929ms step_avg:160.30ms step:1383/1480 train_time:220100ms step_avg:160.31ms step:1384/1480 train_time:220277ms step_avg:160.32ms step:1385/1480 train_time:220442ms step_avg:160.32ms step:1386/1480 train_time:220614ms step_avg:160.33ms step:1387/1480 train_time:220785ms step_avg:160.34ms step:1388/1480 train_time:220955ms step_avg:160.34ms step:1389/1480 train_time:221130ms step_avg:160.36ms step:1390/1480 train_time:221299ms step_avg:160.36ms step:1391/1480 train_time:221468ms step_avg:160.37ms step:1392/1480 train_time:221640ms step_avg:160.38ms step:1393/1480 train_time:221811ms step_avg:160.38ms step:1394/1480 train_time:221981ms step_avg:160.39ms step:1395/1480 train_time:222152ms step_avg:160.40ms step:1396/1480 train_time:222320ms step_avg:160.40ms step:1397/1480 train_time:222487ms step_avg:160.41ms step:1398/1480 train_time:222656ms step_avg:160.42ms step:1399/1480 train_time:222823ms step_avg:160.42ms step:1400/1480 train_time:223001ms step_avg:160.43ms 
step:1401/1480 train_time:223168ms step_avg:160.44ms step:1402/1480 train_time:223339ms step_avg:160.44ms step:1403/1480 train_time:223517ms step_avg:160.46ms step:1404/1480 train_time:223688ms step_avg:160.47ms step:1405/1480 train_time:223863ms step_avg:160.48ms step:1406/1480 train_time:224040ms step_avg:160.49ms step:1407/1480 train_time:224205ms step_avg:160.49ms step:1408/1480 train_time:224376ms step_avg:160.50ms step:1409/1480 train_time:224557ms step_avg:160.51ms step:1410/1480 train_time:224728ms step_avg:160.52ms step:1411/1480 train_time:224897ms step_avg:160.53ms step:1412/1480 train_time:225068ms step_avg:160.53ms step:1413/1480 train_time:225237ms step_avg:160.54ms step:1414/1480 train_time:225409ms step_avg:160.55ms step:1415/1480 train_time:225584ms step_avg:160.56ms step:1416/1480 train_time:225772ms step_avg:160.58ms step:1417/1480 train_time:225944ms step_avg:160.59ms step:1418/1480 train_time:226116ms step_avg:160.59ms step:1419/1480 train_time:226290ms step_avg:160.60ms step:1420/1480 train_time:226466ms step_avg:160.61ms step:1421/1480 train_time:226640ms step_avg:160.62ms step:1422/1480 train_time:226813ms step_avg:160.63ms step:1423/1480 train_time:226982ms step_avg:160.64ms step:1424/1480 train_time:227159ms step_avg:160.65ms step:1425/1480 train_time:227341ms step_avg:160.66ms step:1426/1480 train_time:227513ms step_avg:160.67ms step:1427/1480 train_time:227688ms step_avg:160.68ms step:1428/1480 train_time:227859ms step_avg:160.69ms step:1429/1480 train_time:228029ms step_avg:160.70ms step:1430/1480 train_time:228202ms step_avg:160.71ms step:1431/1480 train_time:228380ms step_avg:160.72ms step:1432/1480 train_time:228558ms step_avg:160.73ms step:1433/1480 train_time:228737ms step_avg:160.74ms step:1434/1480 train_time:228917ms step_avg:160.76ms step:1435/1480 train_time:229091ms step_avg:160.77ms step:1436/1480 train_time:229265ms step_avg:160.78ms step:1437/1480 train_time:229437ms step_avg:160.78ms step:1438/1480 train_time:229606ms 
step_avg:160.79ms step:1439/1480 train_time:229779ms step_avg:160.80ms step:1440/1480 train_time:229949ms step_avg:160.80ms step:1441/1480 train_time:230118ms step_avg:160.81ms step:1442/1480 train_time:230297ms step_avg:160.82ms step:1443/1480 train_time:230484ms step_avg:160.84ms step:1444/1480 train_time:230655ms step_avg:160.85ms step:1445/1480 train_time:230826ms step_avg:160.85ms step:1446/1480 train_time:231001ms step_avg:160.86ms step:1447/1480 train_time:231181ms step_avg:160.88ms step:1448/1480 train_time:231353ms step_avg:160.89ms step:1449/1480 train_time:231526ms step_avg:160.89ms step:1450/1480 train_time:231699ms step_avg:160.90ms step:1451/1480 train_time:231872ms step_avg:160.91ms step:1452/1480 train_time:232044ms step_avg:160.92ms step:1453/1480 train_time:232214ms step_avg:160.92ms step:1454/1480 train_time:232386ms step_avg:160.93ms step:1455/1480 train_time:232566ms step_avg:160.95ms step:1456/1480 train_time:232741ms step_avg:160.95ms step:1457/1480 train_time:232912ms step_avg:160.96ms step:1458/1480 train_time:233082ms step_avg:160.97ms step:1459/1480 train_time:233258ms step_avg:160.98ms step:1460/1480 train_time:233431ms step_avg:160.99ms step:1461/1480 train_time:233603ms step_avg:160.99ms step:1462/1480 train_time:233775ms step_avg:161.00ms step:1463/1480 train_time:233951ms step_avg:161.01ms step:1464/1480 train_time:234126ms step_avg:161.02ms step:1465/1480 train_time:234298ms step_avg:161.03ms step:1466/1480 train_time:234468ms step_avg:161.04ms step:1467/1480 train_time:234643ms step_avg:161.05ms step:1468/1480 train_time:234814ms step_avg:161.05ms step:1469/1480 train_time:234987ms step_avg:161.06ms step:1470/1480 train_time:235168ms step_avg:161.07ms step:1471/1480 train_time:235356ms step_avg:161.09ms step:1472/1480 train_time:235538ms step_avg:161.11ms step:1473/1480 train_time:235709ms step_avg:161.11ms step:1474/1480 train_time:235886ms step_avg:161.12ms step:1475/1480 train_time:236067ms step_avg:161.14ms step:1476/1480 
train_time:236239ms step_avg:161.15ms step:1477/1480 train_time:236419ms step_avg:161.16ms step:1478/1480 train_time:236603ms step_avg:161.17ms step:1479/1480 train_time:236777ms step_avg:161.18ms step:1480/1480 train_time:236949ms step_avg:161.19ms step:1480/1480 val_loss:3.2772 train_time:237019ms step_avg:161.24ms