import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding applied to a (batch, seq, head, head_dim) tensor.

    The cos/sin tables are cached and rebuilt only when the sequence length
    (dimension 1 of the input) changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # inv_freq[i] = base ** (-2i / dim)
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, QK-norm, rotary embeddings
    and a learned mix of the layer's values with an external value embedding."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block with a 4x hidden width and ReLU^2 activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of `x` with the initial embedding `x0`,
    followed by pre-norm attention and pre-norm MLP residual branches."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """GPT model with a U-net style layer layout and document-aware
    sliding-window attention; `forward` returns the cross-entropy loss."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """Compute the loss for one flat token sequence.

        Arguments:
            idx: 1D tensor of input token ids; its length must be a multiple of 128.
            target: Target token ids, flattened before the cross-entropy.
            sliding_window: Attention window size in tokens (compared against
                `q_idx - kv_idx`).
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        assert len(idx) % BLOCK_SIZE == 0, "sequence length must be a multiple of BLOCK_SIZE"
        # running count of token id 50256, used as a per-token document id
        docs = (idx == 50256).cumsum(0)
        # document id of the first / last token of every BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal, same document, inside the window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)

        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the mask above. The indices are created on
            # the device of `idx` rather than a hard-coded "cuda", matching how
            # Rotary follows its input's device.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device=idx.device)
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can share a document only if their document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort moves the kept kv-block indices to the front of each row
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)

        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a data shard and return its claimed token count."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256-int32 header into a pinned tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Serves (inputs, targets) windows of `T` tokens from a sorted list of shards.

    Each process starts at offset `process_rank * T` inside a shard and moves
    forward by `T * num_processes` tokens per batch.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Go back to the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return the next (inputs, targets) pair as int32 / int64 CUDA tensors."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        # NOTE(review): current_position includes the rank offset, so higher
        # ranks can cross this threshold one batch earlier than lower ranks.
        # Confirm this is intended.
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # Only the flat log file below is written, so create its parent directory;
    # without this the open() call fails when `logs/` does not exist yet.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless
    `logonly` is set, also print it to stdout. A no-op on other processes."""
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# the CastedLinear modules are converted back to float32 after the bfloat16 cast
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]

# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the learning-rate multiplier for iteration `it`.

    Linear warmup over `args.warmup_iters` steps, then constant 1.0, then a
    linear cooldown over the last `args.cooldown_iters` steps that reaches 0.0
    at `args.num_iterations`.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        if args.cooldown_iters <= 0:
            # With no cooldown this branch is only reached at
            # it == num_iterations (the call made after the final optimizer
            # step); avoid dividing by zero there.
            return 1.0
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64

# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        # update the existing tensor in place instead of creating a new one
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # (the torch.save call below is disabled, so nothing is written to disk)
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()

# Captured output of the logged run. It shared the original line with the end
# of the script and is kept verbatim:
# ====================================================================================================
# Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
# nvidia-smi:
# Sun Dec 8 12:13:37 2024
# +---------------------------------------------------------------------------------------+
# | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 |
# |-----------------------------------------+----------------------+----------------------+
# | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
# | | | MIG M.
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 44C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 114W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 38C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 91W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 112W / 700W | 119MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 82W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID 
Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23836ms step_avg:nanms step:2/1480 train_time:23922ms step_avg:nanms step:3/1480 train_time:24060ms step_avg:nanms step:4/1480 train_time:24202ms step_avg:nanms step:5/1480 train_time:24344ms step_avg:nanms step:6/1480 train_time:24487ms step_avg:nanms step:7/1480 train_time:24629ms step_avg:nanms step:8/1480 train_time:24771ms step_avg:nanms step:9/1480 train_time:24915ms step_avg:nanms step:10/1480 train_time:25057ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:282ms step_avg:nanms step:13/1480 train_time:426ms step_avg:141.99ms step:14/1480 train_time:568ms step_avg:142.04ms step:15/1480 train_time:711ms step_avg:142.26ms step:16/1480 train_time:855ms step_avg:142.48ms step:17/1480 train_time:998ms step_avg:142.60ms step:18/1480 train_time:1141ms step_avg:142.65ms step:19/1480 train_time:1283ms step_avg:142.55ms step:20/1480 train_time:1425ms step_avg:142.55ms step:21/1480 train_time:1569ms step_avg:142.68ms step:22/1480 train_time:1713ms step_avg:142.72ms step:23/1480 train_time:1857ms step_avg:142.81ms step:24/1480 train_time:1998ms step_avg:142.72ms step:25/1480 train_time:2142ms step_avg:142.79ms step:26/1480 train_time:2283ms step_avg:142.71ms step:27/1480 train_time:2427ms step_avg:142.76ms step:28/1480 train_time:2571ms step_avg:142.86ms step:29/1480 train_time:2715ms 
step_avg:142.87ms step:30/1480 train_time:2857ms step_avg:142.85ms step:31/1480 train_time:3000ms step_avg:142.84ms step:32/1480 train_time:3143ms step_avg:142.88ms step:33/1480 train_time:3287ms step_avg:142.90ms step:34/1480 train_time:3431ms step_avg:142.96ms step:35/1480 train_time:3574ms step_avg:142.96ms step:36/1480 train_time:3717ms step_avg:142.97ms step:37/1480 train_time:3860ms step_avg:142.97ms step:38/1480 train_time:4003ms step_avg:142.97ms step:39/1480 train_time:4147ms step_avg:143.00ms step:40/1480 train_time:4291ms step_avg:143.02ms step:41/1480 train_time:4434ms step_avg:143.02ms step:42/1480 train_time:4577ms step_avg:143.04ms step:43/1480 train_time:4720ms step_avg:143.02ms step:44/1480 train_time:4862ms step_avg:143.00ms step:45/1480 train_time:5007ms step_avg:143.05ms step:46/1480 train_time:5152ms step_avg:143.10ms step:47/1480 train_time:5294ms step_avg:143.09ms step:48/1480 train_time:5436ms step_avg:143.06ms step:49/1480 train_time:5577ms step_avg:142.99ms step:50/1480 train_time:5718ms step_avg:142.96ms step:51/1480 train_time:5861ms step_avg:142.95ms step:52/1480 train_time:6005ms step_avg:142.97ms step:53/1480 train_time:6150ms step_avg:143.01ms step:54/1480 train_time:6294ms step_avg:143.04ms step:55/1480 train_time:6437ms step_avg:143.04ms step:56/1480 train_time:6578ms step_avg:143.01ms step:57/1480 train_time:6719ms step_avg:142.97ms step:58/1480 train_time:6861ms step_avg:142.93ms step:59/1480 train_time:7004ms step_avg:142.93ms step:60/1480 train_time:7147ms step_avg:142.93ms step:61/1480 train_time:7289ms step_avg:142.92ms step:62/1480 train_time:7433ms step_avg:142.94ms step:63/1480 train_time:7575ms step_avg:142.93ms step:64/1480 train_time:7717ms step_avg:142.90ms step:65/1480 train_time:7858ms step_avg:142.88ms step:66/1480 train_time:7999ms step_avg:142.84ms step:67/1480 train_time:8142ms step_avg:142.84ms step:68/1480 train_time:8286ms step_avg:142.87ms step:69/1480 train_time:8430ms step_avg:142.89ms step:70/1480 
train_time:8574ms step_avg:142.90ms step:71/1480 train_time:8716ms step_avg:142.88ms step:72/1480 train_time:8858ms step_avg:142.87ms step:73/1480 train_time:8999ms step_avg:142.85ms step:74/1480 train_time:9141ms step_avg:142.83ms step:75/1480 train_time:9284ms step_avg:142.83ms step:76/1480 train_time:9429ms step_avg:142.87ms step:77/1480 train_time:9573ms step_avg:142.88ms step:78/1480 train_time:9715ms step_avg:142.86ms step:79/1480 train_time:9857ms step_avg:142.85ms step:80/1480 train_time:9999ms step_avg:142.84ms step:81/1480 train_time:10140ms step_avg:142.82ms step:82/1480 train_time:10282ms step_avg:142.80ms step:83/1480 train_time:10426ms step_avg:142.82ms step:84/1480 train_time:10571ms step_avg:142.86ms step:85/1480 train_time:10714ms step_avg:142.86ms step:86/1480 train_time:10857ms step_avg:142.86ms step:87/1480 train_time:10998ms step_avg:142.83ms step:88/1480 train_time:11139ms step_avg:142.81ms step:89/1480 train_time:11280ms step_avg:142.78ms step:90/1480 train_time:11422ms step_avg:142.78ms step:91/1480 train_time:11567ms step_avg:142.80ms step:92/1480 train_time:11711ms step_avg:142.82ms step:93/1480 train_time:11854ms step_avg:142.82ms step:94/1480 train_time:11995ms step_avg:142.80ms step:95/1480 train_time:12136ms step_avg:142.78ms step:96/1480 train_time:12278ms step_avg:142.76ms step:97/1480 train_time:12419ms step_avg:142.75ms step:98/1480 train_time:12563ms step_avg:142.76ms step:99/1480 train_time:12706ms step_avg:142.76ms step:100/1480 train_time:12850ms step_avg:142.78ms step:101/1480 train_time:12993ms step_avg:142.78ms step:102/1480 train_time:13135ms step_avg:142.77ms step:103/1480 train_time:13276ms step_avg:142.76ms step:104/1480 train_time:13418ms step_avg:142.74ms step:105/1480 train_time:13561ms step_avg:142.75ms step:106/1480 train_time:13704ms step_avg:142.75ms step:107/1480 train_time:13849ms step_avg:142.77ms step:108/1480 train_time:13992ms step_avg:142.78ms step:109/1480 train_time:14136ms step_avg:142.79ms step:110/1480 
train_time:14277ms step_avg:142.77ms step:111/1480 train_time:14423ms step_avg:142.80ms step:112/1480 train_time:14572ms step_avg:142.86ms step:113/1480 train_time:14719ms step_avg:142.90ms step:114/1480 train_time:14869ms step_avg:142.97ms step:115/1480 train_time:15016ms step_avg:143.01ms step:116/1480 train_time:15161ms step_avg:143.03ms step:117/1480 train_time:15307ms step_avg:143.06ms step:118/1480 train_time:15454ms step_avg:143.09ms step:119/1480 train_time:15600ms step_avg:143.12ms step:120/1480 train_time:15748ms step_avg:143.17ms step:121/1480 train_time:15895ms step_avg:143.20ms step:122/1480 train_time:16042ms step_avg:143.24ms step:123/1480 train_time:16189ms step_avg:143.26ms step:124/1480 train_time:16337ms step_avg:143.30ms step:125/1480 train_time:16482ms step_avg:143.33ms step:125/1480 val_loss:4.4375 train_time:16539ms step_avg:143.82ms step:126/1480 train_time:16636ms step_avg:143.41ms step:127/1480 train_time:16784ms step_avg:143.45ms step:128/1480 train_time:16930ms step_avg:143.47ms step:129/1480 train_time:17076ms step_avg:143.50ms step:130/1480 train_time:17222ms step_avg:143.52ms step:131/1480 train_time:17368ms step_avg:143.54ms step:132/1480 train_time:17514ms step_avg:143.56ms step:133/1480 train_time:17663ms step_avg:143.60ms step:134/1480 train_time:17811ms step_avg:143.63ms step:135/1480 train_time:17960ms step_avg:143.68ms step:136/1480 train_time:18107ms step_avg:143.70ms step:137/1480 train_time:18253ms step_avg:143.72ms step:138/1480 train_time:18400ms step_avg:143.75ms step:139/1480 train_time:18546ms step_avg:143.77ms step:140/1480 train_time:18694ms step_avg:143.80ms step:141/1480 train_time:18841ms step_avg:143.83ms step:142/1480 train_time:18990ms step_avg:143.86ms step:143/1480 train_time:19137ms step_avg:143.89ms step:144/1480 train_time:19284ms step_avg:143.91ms step:145/1480 train_time:19428ms step_avg:143.91ms step:146/1480 train_time:19574ms step_avg:143.93ms step:147/1480 train_time:19722ms step_avg:143.96ms 
step:148/1480 train_time:19869ms step_avg:143.98ms step:149/1480 train_time:20017ms step_avg:144.01ms step:150/1480 train_time:20165ms step_avg:144.03ms step:151/1480 train_time:20311ms step_avg:144.05ms step:152/1480 train_time:20459ms step_avg:144.08ms step:153/1480 train_time:20607ms step_avg:144.10ms step:154/1480 train_time:20754ms step_avg:144.13ms step:155/1480 train_time:20902ms step_avg:144.15ms step:156/1480 train_time:21048ms step_avg:144.16ms step:157/1480 train_time:21196ms step_avg:144.19ms step:158/1480 train_time:21343ms step_avg:144.21ms step:159/1480 train_time:21490ms step_avg:144.23ms step:160/1480 train_time:21637ms step_avg:144.24ms step:161/1480 train_time:21784ms step_avg:144.27ms step:162/1480 train_time:21929ms step_avg:144.27ms step:163/1480 train_time:22076ms step_avg:144.29ms step:164/1480 train_time:22223ms step_avg:144.31ms step:165/1480 train_time:22370ms step_avg:144.32ms step:166/1480 train_time:22518ms step_avg:144.34ms step:167/1480 train_time:22665ms step_avg:144.37ms step:168/1480 train_time:22812ms step_avg:144.38ms step:169/1480 train_time:22960ms step_avg:144.41ms step:170/1480 train_time:23107ms step_avg:144.42ms step:171/1480 train_time:23256ms step_avg:144.45ms step:172/1480 train_time:23404ms step_avg:144.47ms step:173/1480 train_time:23551ms step_avg:144.49ms step:174/1480 train_time:23699ms step_avg:144.51ms step:175/1480 train_time:23846ms step_avg:144.52ms step:176/1480 train_time:23992ms step_avg:144.53ms step:177/1480 train_time:24141ms step_avg:144.55ms step:178/1480 train_time:24287ms step_avg:144.57ms step:179/1480 train_time:24435ms step_avg:144.59ms step:180/1480 train_time:24582ms step_avg:144.60ms step:181/1480 train_time:24728ms step_avg:144.61ms step:182/1480 train_time:24875ms step_avg:144.62ms step:183/1480 train_time:25023ms step_avg:144.64ms step:184/1480 train_time:25170ms step_avg:144.65ms step:185/1480 train_time:25317ms step_avg:144.67ms step:186/1480 train_time:25465ms step_avg:144.69ms 
step:187/1480 train_time:25611ms step_avg:144.70ms step:188/1480 train_time:25759ms step_avg:144.71ms step:189/1480 train_time:25907ms step_avg:144.73ms step:190/1480 train_time:26054ms step_avg:144.75ms step:191/1480 train_time:26203ms step_avg:144.77ms step:192/1480 train_time:26348ms step_avg:144.77ms step:193/1480 train_time:26495ms step_avg:144.78ms step:194/1480 train_time:26642ms step_avg:144.80ms step:195/1480 train_time:26788ms step_avg:144.80ms step:196/1480 train_time:26937ms step_avg:144.82ms step:197/1480 train_time:27084ms step_avg:144.83ms step:198/1480 train_time:27230ms step_avg:144.84ms step:199/1480 train_time:27378ms step_avg:144.86ms step:200/1480 train_time:27526ms step_avg:144.87ms step:201/1480 train_time:27673ms step_avg:144.89ms step:202/1480 train_time:27820ms step_avg:144.90ms step:203/1480 train_time:27967ms step_avg:144.91ms step:204/1480 train_time:28112ms step_avg:144.91ms step:205/1480 train_time:28258ms step_avg:144.91ms step:206/1480 train_time:28406ms step_avg:144.93ms step:207/1480 train_time:28551ms step_avg:144.93ms step:208/1480 train_time:28699ms step_avg:144.94ms step:209/1480 train_time:28845ms step_avg:144.95ms step:210/1480 train_time:28991ms step_avg:144.96ms step:211/1480 train_time:29139ms step_avg:144.97ms step:212/1480 train_time:29286ms step_avg:144.98ms step:213/1480 train_time:29431ms step_avg:144.98ms step:214/1480 train_time:29578ms step_avg:144.99ms step:215/1480 train_time:29725ms step_avg:145.00ms step:216/1480 train_time:29871ms step_avg:145.00ms step:217/1480 train_time:30017ms step_avg:145.01ms step:218/1480 train_time:30164ms step_avg:145.02ms step:219/1480 train_time:30310ms step_avg:145.03ms step:220/1480 train_time:30458ms step_avg:145.04ms step:221/1480 train_time:30607ms step_avg:145.06ms step:222/1480 train_time:30758ms step_avg:145.09ms step:223/1480 train_time:30908ms step_avg:145.11ms step:224/1480 train_time:31058ms step_avg:145.13ms step:225/1480 train_time:31208ms step_avg:145.15ms 
step:226/1480 train_time:31358ms step_avg:145.18ms step:227/1480 train_time:31509ms step_avg:145.20ms step:228/1480 train_time:31660ms step_avg:145.23ms step:229/1480 train_time:31810ms step_avg:145.25ms step:230/1480 train_time:31960ms step_avg:145.27ms step:231/1480 train_time:32110ms step_avg:145.29ms step:232/1480 train_time:32260ms step_avg:145.31ms step:233/1480 train_time:32410ms step_avg:145.34ms step:234/1480 train_time:32561ms step_avg:145.36ms step:235/1480 train_time:32711ms step_avg:145.38ms step:236/1480 train_time:32861ms step_avg:145.40ms step:237/1480 train_time:33011ms step_avg:145.42ms step:238/1480 train_time:33162ms step_avg:145.45ms step:239/1480 train_time:33311ms step_avg:145.46ms step:240/1480 train_time:33461ms step_avg:145.48ms step:241/1480 train_time:33612ms step_avg:145.51ms step:242/1480 train_time:33762ms step_avg:145.52ms step:243/1480 train_time:33911ms step_avg:145.54ms step:244/1480 train_time:34062ms step_avg:145.56ms step:245/1480 train_time:34213ms step_avg:145.59ms step:246/1480 train_time:34364ms step_avg:145.61ms step:247/1480 train_time:34515ms step_avg:145.63ms step:248/1480 train_time:34666ms step_avg:145.66ms step:249/1480 train_time:34815ms step_avg:145.67ms step:250/1480 train_time:34965ms step_avg:145.69ms step:250/1480 val_loss:4.0032 train_time:35023ms step_avg:145.93ms step:251/1480 train_time:35120ms step_avg:145.72ms step:252/1480 train_time:35271ms step_avg:145.75ms step:253/1480 train_time:35422ms step_avg:145.77ms step:254/1480 train_time:35572ms step_avg:145.79ms step:255/1480 train_time:35721ms step_avg:145.80ms step:256/1480 train_time:35870ms step_avg:145.81ms step:257/1480 train_time:36021ms step_avg:145.83ms step:258/1480 train_time:36173ms step_avg:145.86ms step:259/1480 train_time:36325ms step_avg:145.88ms step:260/1480 train_time:36476ms step_avg:145.90ms step:261/1480 train_time:36625ms step_avg:145.92ms step:262/1480 train_time:36775ms step_avg:145.93ms step:263/1480 train_time:36924ms 
step_avg:145.95ms step:264/1480 train_time:37077ms step_avg:145.97ms step:265/1480 train_time:37228ms step_avg:145.99ms step:266/1480 train_time:37380ms step_avg:146.01ms step:267/1480 train_time:37529ms step_avg:146.03ms step:268/1480 train_time:37680ms step_avg:146.05ms step:269/1480 train_time:37830ms step_avg:146.06ms step:270/1480 train_time:37980ms step_avg:146.08ms step:271/1480 train_time:38129ms step_avg:146.09ms step:272/1480 train_time:38281ms step_avg:146.11ms step:273/1480 train_time:38431ms step_avg:146.13ms step:274/1480 train_time:38581ms step_avg:146.14ms step:275/1480 train_time:38731ms step_avg:146.16ms step:276/1480 train_time:38881ms step_avg:146.17ms step:277/1480 train_time:39031ms step_avg:146.18ms step:278/1480 train_time:39181ms step_avg:146.20ms step:279/1480 train_time:39332ms step_avg:146.21ms step:280/1480 train_time:39483ms step_avg:146.23ms step:281/1480 train_time:39635ms step_avg:146.25ms step:282/1480 train_time:39786ms step_avg:146.27ms step:283/1480 train_time:39937ms step_avg:146.29ms step:284/1480 train_time:40088ms step_avg:146.31ms step:285/1480 train_time:40238ms step_avg:146.32ms step:286/1480 train_time:40389ms step_avg:146.34ms step:287/1480 train_time:40540ms step_avg:146.35ms step:288/1480 train_time:40690ms step_avg:146.37ms step:289/1480 train_time:40840ms step_avg:146.38ms step:290/1480 train_time:40991ms step_avg:146.40ms step:291/1480 train_time:41141ms step_avg:146.41ms step:292/1480 train_time:41291ms step_avg:146.42ms step:293/1480 train_time:41442ms step_avg:146.44ms step:294/1480 train_time:41593ms step_avg:146.45ms step:295/1480 train_time:41744ms step_avg:146.47ms step:296/1480 train_time:41895ms step_avg:146.48ms step:297/1480 train_time:42045ms step_avg:146.50ms step:298/1480 train_time:42197ms step_avg:146.52ms step:299/1480 train_time:42346ms step_avg:146.53ms step:300/1480 train_time:42497ms step_avg:146.54ms step:301/1480 train_time:42647ms step_avg:146.55ms step:302/1480 train_time:42798ms 
step_avg:146.57ms step:303/1480 train_time:42948ms step_avg:146.58ms step:304/1480 train_time:43099ms step_avg:146.60ms step:305/1480 train_time:43249ms step_avg:146.61ms step:306/1480 train_time:43400ms step_avg:146.62ms step:307/1480 train_time:43549ms step_avg:146.63ms step:308/1480 train_time:43701ms step_avg:146.65ms step:309/1480 train_time:43851ms step_avg:146.66ms step:310/1480 train_time:44001ms step_avg:146.67ms step:311/1480 train_time:44151ms step_avg:146.68ms step:312/1480 train_time:44302ms step_avg:146.69ms step:313/1480 train_time:44453ms step_avg:146.71ms step:314/1480 train_time:44603ms step_avg:146.72ms step:315/1480 train_time:44754ms step_avg:146.73ms step:316/1480 train_time:44905ms step_avg:146.75ms step:317/1480 train_time:45057ms step_avg:146.76ms step:318/1480 train_time:45207ms step_avg:146.78ms step:319/1480 train_time:45358ms step_avg:146.79ms step:320/1480 train_time:45508ms step_avg:146.80ms step:321/1480 train_time:45658ms step_avg:146.81ms step:322/1480 train_time:45809ms step_avg:146.82ms step:323/1480 train_time:45961ms step_avg:146.84ms step:324/1480 train_time:46111ms step_avg:146.85ms step:325/1480 train_time:46261ms step_avg:146.86ms step:326/1480 train_time:46411ms step_avg:146.87ms step:327/1480 train_time:46561ms step_avg:146.88ms step:328/1480 train_time:46712ms step_avg:146.89ms step:329/1480 train_time:46863ms step_avg:146.90ms step:330/1480 train_time:47016ms step_avg:146.92ms step:331/1480 train_time:47170ms step_avg:146.95ms step:332/1480 train_time:47323ms step_avg:146.97ms step:333/1480 train_time:47478ms step_avg:146.99ms step:334/1480 train_time:47633ms step_avg:147.02ms step:335/1480 train_time:47787ms step_avg:147.04ms step:336/1480 train_time:47940ms step_avg:147.06ms step:337/1480 train_time:48095ms step_avg:147.08ms step:338/1480 train_time:48249ms step_avg:147.10ms step:339/1480 train_time:48403ms step_avg:147.12ms step:340/1480 train_time:48557ms step_avg:147.14ms step:341/1480 train_time:48711ms 
step_avg:147.16ms step:342/1480 train_time:48864ms step_avg:147.18ms step:343/1480 train_time:49018ms step_avg:147.20ms step:344/1480 train_time:49172ms step_avg:147.22ms step:345/1480 train_time:49326ms step_avg:147.24ms step:346/1480 train_time:49481ms step_avg:147.26ms step:347/1480 train_time:49634ms step_avg:147.28ms step:348/1480 train_time:49788ms step_avg:147.30ms step:349/1480 train_time:49941ms step_avg:147.32ms step:350/1480 train_time:50097ms step_avg:147.34ms step:351/1480 train_time:50250ms step_avg:147.36ms step:352/1480 train_time:50403ms step_avg:147.38ms step:353/1480 train_time:50557ms step_avg:147.40ms step:354/1480 train_time:50712ms step_avg:147.42ms step:355/1480 train_time:50866ms step_avg:147.44ms step:356/1480 train_time:51023ms step_avg:147.46ms step:357/1480 train_time:51175ms step_avg:147.48ms step:358/1480 train_time:51329ms step_avg:147.50ms step:359/1480 train_time:51484ms step_avg:147.52ms step:360/1480 train_time:51638ms step_avg:147.54ms step:361/1480 train_time:51795ms step_avg:147.56ms step:362/1480 train_time:51950ms step_avg:147.59ms step:363/1480 train_time:52103ms step_avg:147.60ms step:364/1480 train_time:52256ms step_avg:147.62ms step:365/1480 train_time:52413ms step_avg:147.64ms step:366/1480 train_time:52566ms step_avg:147.66ms step:367/1480 train_time:52718ms step_avg:147.67ms step:368/1480 train_time:52871ms step_avg:147.68ms step:369/1480 train_time:53025ms step_avg:147.70ms step:370/1480 train_time:53179ms step_avg:147.72ms step:371/1480 train_time:53334ms step_avg:147.74ms step:372/1480 train_time:53489ms step_avg:147.76ms step:373/1480 train_time:53643ms step_avg:147.78ms step:374/1480 train_time:53796ms step_avg:147.79ms step:375/1480 train_time:53949ms step_avg:147.81ms step:375/1480 val_loss:3.8129 train_time:54009ms step_avg:147.97ms step:376/1480 train_time:54108ms step_avg:147.84ms step:377/1480 train_time:54262ms step_avg:147.85ms step:378/1480 train_time:54414ms step_avg:147.86ms step:379/1480 
train_time:54566ms step_avg:147.88ms step:380/1480 train_time:54717ms step_avg:147.88ms step:381/1480 train_time:54870ms step_avg:147.90ms step:382/1480 train_time:55024ms step_avg:147.91ms step:383/1480 train_time:55179ms step_avg:147.93ms step:384/1480 train_time:55333ms step_avg:147.95ms step:385/1480 train_time:55487ms step_avg:147.97ms step:386/1480 train_time:55640ms step_avg:147.98ms step:387/1480 train_time:55794ms step_avg:147.99ms step:388/1480 train_time:55947ms step_avg:148.01ms step:389/1480 train_time:56102ms step_avg:148.03ms step:390/1480 train_time:56256ms step_avg:148.04ms step:391/1480 train_time:56411ms step_avg:148.06ms step:392/1480 train_time:56565ms step_avg:148.08ms step:393/1480 train_time:56718ms step_avg:148.09ms step:394/1480 train_time:56872ms step_avg:148.10ms step:395/1480 train_time:57025ms step_avg:148.12ms step:396/1480 train_time:57179ms step_avg:148.13ms step:397/1480 train_time:57333ms step_avg:148.15ms step:398/1480 train_time:57487ms step_avg:148.16ms step:399/1480 train_time:57640ms step_avg:148.17ms step:400/1480 train_time:57794ms step_avg:148.19ms step:401/1480 train_time:57947ms step_avg:148.20ms step:402/1480 train_time:58101ms step_avg:148.22ms step:403/1480 train_time:58254ms step_avg:148.23ms step:404/1480 train_time:58409ms step_avg:148.25ms step:405/1480 train_time:58564ms step_avg:148.26ms step:406/1480 train_time:58717ms step_avg:148.28ms step:407/1480 train_time:58872ms step_avg:148.29ms step:408/1480 train_time:59027ms step_avg:148.31ms step:409/1480 train_time:59181ms step_avg:148.32ms step:410/1480 train_time:59334ms step_avg:148.33ms step:411/1480 train_time:59489ms step_avg:148.35ms step:412/1480 train_time:59642ms step_avg:148.36ms step:413/1480 train_time:59795ms step_avg:148.38ms step:414/1480 train_time:59949ms step_avg:148.39ms step:415/1480 train_time:60105ms step_avg:148.41ms step:416/1480 train_time:60258ms step_avg:148.42ms step:417/1480 train_time:60411ms step_avg:148.43ms step:418/1480 
train_time:60566ms step_avg:148.45ms step:419/1480 train_time:60720ms step_avg:148.46ms step:420/1480 train_time:60874ms step_avg:148.47ms step:421/1480 train_time:61028ms step_avg:148.49ms step:422/1480 train_time:61181ms step_avg:148.50ms step:423/1480 train_time:61334ms step_avg:148.51ms step:424/1480 train_time:61488ms step_avg:148.52ms step:425/1480 train_time:61643ms step_avg:148.54ms step:426/1480 train_time:61796ms step_avg:148.55ms step:427/1480 train_time:61949ms step_avg:148.56ms step:428/1480 train_time:62104ms step_avg:148.57ms step:429/1480 train_time:62256ms step_avg:148.58ms step:430/1480 train_time:62410ms step_avg:148.60ms step:431/1480 train_time:62564ms step_avg:148.61ms step:432/1480 train_time:62717ms step_avg:148.62ms step:433/1480 train_time:62872ms step_avg:148.63ms step:434/1480 train_time:63026ms step_avg:148.65ms step:435/1480 train_time:63180ms step_avg:148.66ms step:436/1480 train_time:63334ms step_avg:148.67ms step:437/1480 train_time:63488ms step_avg:148.68ms step:438/1480 train_time:63642ms step_avg:148.70ms step:439/1480 train_time:63797ms step_avg:148.71ms step:440/1480 train_time:63952ms step_avg:148.73ms step:441/1480 train_time:64111ms step_avg:148.75ms step:442/1480 train_time:64268ms step_avg:148.77ms step:443/1480 train_time:64424ms step_avg:148.79ms step:444/1480 train_time:64580ms step_avg:148.80ms step:445/1480 train_time:64735ms step_avg:148.82ms step:446/1480 train_time:64892ms step_avg:148.83ms step:447/1480 train_time:65048ms step_avg:148.85ms step:448/1480 train_time:65206ms step_avg:148.87ms step:449/1480 train_time:65364ms step_avg:148.89ms step:450/1480 train_time:65521ms step_avg:148.91ms step:451/1480 train_time:65679ms step_avg:148.93ms step:452/1480 train_time:65835ms step_avg:148.95ms step:453/1480 train_time:65991ms step_avg:148.96ms step:454/1480 train_time:66148ms step_avg:148.98ms step:455/1480 train_time:66305ms step_avg:149.00ms step:456/1480 train_time:66461ms step_avg:149.01ms step:457/1480 
train_time:66617ms step_avg:149.03ms step:458/1480 train_time:66774ms step_avg:149.05ms step:459/1480 train_time:66932ms step_avg:149.07ms step:460/1480 train_time:67089ms step_avg:149.09ms step:461/1480 train_time:67246ms step_avg:149.11ms step:462/1480 train_time:67405ms step_avg:149.13ms step:463/1480 train_time:67561ms step_avg:149.14ms step:464/1480 train_time:67717ms step_avg:149.16ms step:465/1480 train_time:67874ms step_avg:149.17ms step:466/1480 train_time:68029ms step_avg:149.19ms step:467/1480 train_time:68189ms step_avg:149.21ms step:468/1480 train_time:68345ms step_avg:149.22ms step:469/1480 train_time:68501ms step_avg:149.24ms step:470/1480 train_time:68656ms step_avg:149.25ms step:471/1480 train_time:68813ms step_avg:149.27ms step:472/1480 train_time:68971ms step_avg:149.29ms step:473/1480 train_time:69128ms step_avg:149.30ms step:474/1480 train_time:69284ms step_avg:149.32ms step:475/1480 train_time:69440ms step_avg:149.33ms step:476/1480 train_time:69596ms step_avg:149.35ms step:477/1480 train_time:69753ms step_avg:149.36ms step:478/1480 train_time:69911ms step_avg:149.38ms step:479/1480 train_time:70069ms step_avg:149.40ms step:480/1480 train_time:70226ms step_avg:149.42ms step:481/1480 train_time:70384ms step_avg:149.44ms step:482/1480 train_time:70540ms step_avg:149.45ms step:483/1480 train_time:70696ms step_avg:149.46ms step:484/1480 train_time:70852ms step_avg:149.48ms step:485/1480 train_time:71011ms step_avg:149.50ms step:486/1480 train_time:71169ms step_avg:149.52ms step:487/1480 train_time:71326ms step_avg:149.53ms step:488/1480 train_time:71485ms step_avg:149.55ms step:489/1480 train_time:71642ms step_avg:149.56ms step:490/1480 train_time:71797ms step_avg:149.58ms step:491/1480 train_time:71954ms step_avg:149.59ms step:492/1480 train_time:72112ms step_avg:149.61ms step:493/1480 train_time:72270ms step_avg:149.63ms step:494/1480 train_time:72428ms step_avg:149.65ms step:495/1480 train_time:72587ms step_avg:149.66ms step:496/1480 
train_time:72745ms step_avg:149.68ms step:497/1480 train_time:72901ms step_avg:149.70ms step:498/1480 train_time:73058ms step_avg:149.71ms step:499/1480 train_time:73216ms step_avg:149.73ms step:500/1480 train_time:73373ms step_avg:149.74ms step:500/1480 val_loss:3.6898 train_time:73435ms step_avg:149.87ms step:501/1480 train_time:73534ms step_avg:149.76ms step:502/1480 train_time:73691ms step_avg:149.78ms step:503/1480 train_time:73846ms step_avg:149.79ms step:504/1480 train_time:74002ms step_avg:149.80ms step:505/1480 train_time:74158ms step_avg:149.81ms step:506/1480 train_time:74315ms step_avg:149.83ms step:507/1480 train_time:74473ms step_avg:149.84ms step:508/1480 train_time:74631ms step_avg:149.86ms step:509/1480 train_time:74788ms step_avg:149.88ms step:510/1480 train_time:74945ms step_avg:149.89ms step:511/1480 train_time:75101ms step_avg:149.90ms step:512/1480 train_time:75259ms step_avg:149.92ms step:513/1480 train_time:75416ms step_avg:149.93ms step:514/1480 train_time:75574ms step_avg:149.95ms step:515/1480 train_time:75732ms step_avg:149.96ms step:516/1480 train_time:75891ms step_avg:149.98ms step:517/1480 train_time:76049ms step_avg:150.00ms step:518/1480 train_time:76204ms step_avg:150.01ms step:519/1480 train_time:76361ms step_avg:150.02ms step:520/1480 train_time:76517ms step_avg:150.03ms step:521/1480 train_time:76676ms step_avg:150.05ms step:522/1480 train_time:76834ms step_avg:150.07ms step:523/1480 train_time:76993ms step_avg:150.08ms step:524/1480 train_time:77150ms step_avg:150.10ms step:525/1480 train_time:77307ms step_avg:150.11ms step:526/1480 train_time:77463ms step_avg:150.12ms step:527/1480 train_time:77620ms step_avg:150.14ms step:528/1480 train_time:77777ms step_avg:150.15ms step:529/1480 train_time:77934ms step_avg:150.16ms step:530/1480 train_time:78092ms step_avg:150.18ms step:531/1480 train_time:78248ms step_avg:150.19ms step:532/1480 train_time:78404ms step_avg:150.20ms step:533/1480 train_time:78560ms step_avg:150.21ms 
step:534/1480 train_time:78717ms step_avg:150.22ms step:535/1480 train_time:78875ms step_avg:150.24ms step:536/1480 train_time:79032ms step_avg:150.25ms step:537/1480 train_time:79189ms step_avg:150.26ms step:538/1480 train_time:79346ms step_avg:150.28ms step:539/1480 train_time:79503ms step_avg:150.29ms step:540/1480 train_time:79659ms step_avg:150.30ms step:541/1480 train_time:79817ms step_avg:150.31ms step:542/1480 train_time:79974ms step_avg:150.33ms step:543/1480 train_time:80132ms step_avg:150.34ms step:544/1480 train_time:80287ms step_avg:150.35ms step:545/1480 train_time:80443ms step_avg:150.36ms step:546/1480 train_time:80600ms step_avg:150.37ms step:547/1480 train_time:80758ms step_avg:150.39ms step:548/1480 train_time:80917ms step_avg:150.40ms step:549/1480 train_time:81074ms step_avg:150.42ms step:550/1480 train_time:81231ms step_avg:150.43ms step:551/1480 train_time:81390ms step_avg:150.44ms step:552/1480 train_time:81549ms step_avg:150.46ms step:553/1480 train_time:81708ms step_avg:150.47ms step:554/1480 train_time:81866ms step_avg:150.49ms step:555/1480 train_time:82025ms step_avg:150.50ms step:556/1480 train_time:82184ms step_avg:150.52ms step:557/1480 train_time:82342ms step_avg:150.53ms step:558/1480 train_time:82501ms step_avg:150.55ms step:559/1480 train_time:82659ms step_avg:150.56ms step:560/1480 train_time:82818ms step_avg:150.58ms step:561/1480 train_time:82977ms step_avg:150.59ms step:562/1480 train_time:83137ms step_avg:150.61ms step:563/1480 train_time:83297ms step_avg:150.63ms step:564/1480 train_time:83458ms step_avg:150.65ms step:565/1480 train_time:83617ms step_avg:150.66ms step:566/1480 train_time:83778ms step_avg:150.68ms step:567/1480 train_time:83938ms step_avg:150.70ms step:568/1480 train_time:84097ms step_avg:150.71ms step:569/1480 train_time:84257ms step_avg:150.73ms step:570/1480 train_time:84416ms step_avg:150.74ms step:571/1480 train_time:84578ms step_avg:150.76ms step:572/1480 train_time:84738ms step_avg:150.78ms 
step:573/1480 train_time:84899ms step_avg:150.80ms step:574/1480 train_time:85059ms step_avg:150.81ms step:575/1480 train_time:85220ms step_avg:150.83ms step:576/1480 train_time:85379ms step_avg:150.85ms step:577/1480 train_time:85538ms step_avg:150.86ms step:578/1480 train_time:85698ms step_avg:150.88ms step:579/1480 train_time:85859ms step_avg:150.89ms step:580/1480 train_time:86018ms step_avg:150.91ms step:581/1480 train_time:86180ms step_avg:150.93ms step:582/1480 train_time:86339ms step_avg:150.94ms step:583/1480 train_time:86499ms step_avg:150.96ms step:584/1480 train_time:86658ms step_avg:150.97ms step:585/1480 train_time:86817ms step_avg:150.99ms step:586/1480 train_time:86979ms step_avg:151.00ms step:587/1480 train_time:87138ms step_avg:151.02ms step:588/1480 train_time:87298ms step_avg:151.03ms step:589/1480 train_time:87460ms step_avg:151.05ms step:590/1480 train_time:87620ms step_avg:151.07ms step:591/1480 train_time:87779ms step_avg:151.08ms step:592/1480 train_time:87937ms step_avg:151.10ms step:593/1480 train_time:88099ms step_avg:151.11ms step:594/1480 train_time:88259ms step_avg:151.13ms step:595/1480 train_time:88420ms step_avg:151.15ms step:596/1480 train_time:88581ms step_avg:151.16ms step:597/1480 train_time:88739ms step_avg:151.17ms step:598/1480 train_time:88897ms step_avg:151.19ms step:599/1480 train_time:89056ms step_avg:151.20ms step:600/1480 train_time:89216ms step_avg:151.21ms step:601/1480 train_time:89376ms step_avg:151.23ms step:602/1480 train_time:89537ms step_avg:151.25ms step:603/1480 train_time:89699ms step_avg:151.26ms step:604/1480 train_time:89858ms step_avg:151.28ms step:605/1480 train_time:90018ms step_avg:151.29ms step:606/1480 train_time:90180ms step_avg:151.31ms step:607/1480 train_time:90340ms step_avg:151.32ms step:608/1480 train_time:90499ms step_avg:151.34ms step:609/1480 train_time:90659ms step_avg:151.35ms step:610/1480 train_time:90818ms step_avg:151.36ms step:611/1480 train_time:90979ms step_avg:151.38ms 
step:612/1480 train_time:91139ms step_avg:151.39ms step:613/1480 train_time:91301ms step_avg:151.41ms step:614/1480 train_time:91460ms step_avg:151.42ms step:615/1480 train_time:91619ms step_avg:151.44ms step:616/1480 train_time:91779ms step_avg:151.45ms step:617/1480 train_time:91938ms step_avg:151.46ms step:618/1480 train_time:92097ms step_avg:151.48ms step:619/1480 train_time:92257ms step_avg:151.49ms step:620/1480 train_time:92417ms step_avg:151.50ms step:621/1480 train_time:92577ms step_avg:151.52ms step:622/1480 train_time:92738ms step_avg:151.53ms step:623/1480 train_time:92899ms step_avg:151.55ms step:624/1480 train_time:93059ms step_avg:151.56ms step:625/1480 train_time:93218ms step_avg:151.57ms step:625/1480 val_loss:3.6094 train_time:93282ms step_avg:151.68ms step:626/1480 train_time:93383ms step_avg:151.60ms step:627/1480 train_time:93543ms step_avg:151.61ms step:628/1480 train_time:93702ms step_avg:151.62ms step:629/1480 train_time:93862ms step_avg:151.64ms step:630/1480 train_time:94021ms step_avg:151.65ms step:631/1480 train_time:94180ms step_avg:151.66ms step:632/1480 train_time:94340ms step_avg:151.67ms step:633/1480 train_time:94500ms step_avg:151.69ms step:634/1480 train_time:94660ms step_avg:151.70ms step:635/1480 train_time:94819ms step_avg:151.71ms step:636/1480 train_time:94977ms step_avg:151.72ms step:637/1480 train_time:95138ms step_avg:151.74ms step:638/1480 train_time:95297ms step_avg:151.75ms step:639/1480 train_time:95457ms step_avg:151.76ms step:640/1480 train_time:95616ms step_avg:151.77ms step:641/1480 train_time:95776ms step_avg:151.78ms step:642/1480 train_time:95934ms step_avg:151.79ms step:643/1480 train_time:96092ms step_avg:151.80ms step:644/1480 train_time:96248ms step_avg:151.81ms step:645/1480 train_time:96405ms step_avg:151.82ms step:646/1480 train_time:96565ms step_avg:151.83ms step:647/1480 train_time:96723ms step_avg:151.84ms step:648/1480 train_time:96885ms step_avg:151.86ms step:649/1480 train_time:97045ms 
step_avg:151.87ms step:650/1480 train_time:97205ms step_avg:151.88ms step:651/1480 train_time:97366ms step_avg:151.90ms step:652/1480 train_time:97525ms step_avg:151.91ms step:653/1480 train_time:97684ms step_avg:151.92ms step:654/1480 train_time:97844ms step_avg:151.93ms step:655/1480 train_time:98002ms step_avg:151.94ms step:656/1480 train_time:98164ms step_avg:151.96ms step:657/1480 train_time:98325ms step_avg:151.97ms step:658/1480 train_time:98485ms step_avg:151.98ms step:659/1480 train_time:98647ms step_avg:152.00ms step:660/1480 train_time:98808ms step_avg:152.01ms step:661/1480 train_time:98969ms step_avg:152.03ms step:662/1480 train_time:99129ms step_avg:152.04ms step:663/1480 train_time:99289ms step_avg:152.05ms step:664/1480 train_time:99451ms step_avg:152.07ms step:665/1480 train_time:99612ms step_avg:152.08ms step:666/1480 train_time:99773ms step_avg:152.09ms step:667/1480 train_time:99935ms step_avg:152.11ms step:668/1480 train_time:100095ms step_avg:152.12ms step:669/1480 train_time:100257ms step_avg:152.14ms step:670/1480 train_time:100418ms step_avg:152.15ms step:671/1480 train_time:100578ms step_avg:152.16ms step:672/1480 train_time:100741ms step_avg:152.18ms step:673/1480 train_time:100903ms step_avg:152.19ms step:674/1480 train_time:101067ms step_avg:152.21ms step:675/1480 train_time:101228ms step_avg:152.22ms step:676/1480 train_time:101392ms step_avg:152.24ms step:677/1480 train_time:101552ms step_avg:152.25ms step:678/1480 train_time:101711ms step_avg:152.26ms step:679/1480 train_time:101873ms step_avg:152.28ms step:680/1480 train_time:102035ms step_avg:152.29ms step:681/1480 train_time:102195ms step_avg:152.30ms step:682/1480 train_time:102360ms step_avg:152.32ms step:683/1480 train_time:102523ms step_avg:152.34ms step:684/1480 train_time:102685ms step_avg:152.35ms step:685/1480 train_time:102848ms step_avg:152.37ms step:686/1480 train_time:103008ms step_avg:152.38ms step:687/1480 train_time:103169ms step_avg:152.39ms step:688/1480 
train_time:103331ms step_avg:152.41ms step:689/1480 train_time:103494ms step_avg:152.42ms step:690/1480 train_time:103660ms step_avg:152.44ms step:691/1480 train_time:103823ms step_avg:152.46ms step:692/1480 train_time:103984ms step_avg:152.47ms step:693/1480 train_time:104147ms step_avg:152.48ms step:694/1480 train_time:104307ms step_avg:152.50ms step:695/1480 train_time:104469ms step_avg:152.51ms step:696/1480 train_time:104630ms step_avg:152.52ms step:697/1480 train_time:104791ms step_avg:152.53ms step:698/1480 train_time:104951ms step_avg:152.55ms step:699/1480 train_time:105112ms step_avg:152.56ms step:700/1480 train_time:105274ms step_avg:152.57ms step:701/1480 train_time:105435ms step_avg:152.58ms step:702/1480 train_time:105594ms step_avg:152.59ms step:703/1480 train_time:105754ms step_avg:152.60ms step:704/1480 train_time:105912ms step_avg:152.61ms step:705/1480 train_time:106077ms step_avg:152.63ms step:706/1480 train_time:106244ms step_avg:152.65ms step:707/1480 train_time:106405ms step_avg:152.66ms step:708/1480 train_time:106566ms step_avg:152.67ms step:709/1480 train_time:106729ms step_avg:152.69ms step:710/1480 train_time:106889ms step_avg:152.70ms step:711/1480 train_time:107050ms step_avg:152.71ms step:712/1480 train_time:107213ms step_avg:152.73ms step:713/1480 train_time:107376ms step_avg:152.74ms step:714/1480 train_time:107537ms step_avg:152.75ms step:715/1480 train_time:107699ms step_avg:152.76ms step:716/1480 train_time:107860ms step_avg:152.78ms step:717/1480 train_time:108024ms step_avg:152.79ms step:718/1480 train_time:108185ms step_avg:152.80ms step:719/1480 train_time:108344ms step_avg:152.81ms step:720/1480 train_time:108506ms step_avg:152.83ms step:721/1480 train_time:108667ms step_avg:152.84ms step:722/1480 train_time:108829ms step_avg:152.85ms step:723/1480 train_time:108990ms step_avg:152.86ms step:724/1480 train_time:109151ms step_avg:152.87ms step:725/1480 train_time:109314ms step_avg:152.89ms step:726/1480 train_time:109478ms 
step_avg:152.90ms step:727/1480 train_time:109643ms step_avg:152.92ms step:728/1480 train_time:109804ms step_avg:152.93ms step:729/1480 train_time:109966ms step_avg:152.94ms step:730/1480 train_time:110129ms step_avg:152.96ms step:731/1480 train_time:110289ms step_avg:152.97ms step:732/1480 train_time:110449ms step_avg:152.98ms step:733/1480 train_time:110609ms step_avg:152.99ms step:734/1480 train_time:110770ms step_avg:153.00ms step:735/1480 train_time:110930ms step_avg:153.01ms step:736/1480 train_time:111092ms step_avg:153.02ms step:737/1480 train_time:111255ms step_avg:153.03ms step:738/1480 train_time:111417ms step_avg:153.05ms step:739/1480 train_time:111579ms step_avg:153.06ms step:740/1480 train_time:111745ms step_avg:153.08ms step:741/1480 train_time:111908ms step_avg:153.09ms step:742/1480 train_time:112070ms step_avg:153.10ms step:743/1480 train_time:112231ms step_avg:153.11ms step:744/1480 train_time:112394ms step_avg:153.13ms step:745/1480 train_time:112559ms step_avg:153.14ms step:746/1480 train_time:112721ms step_avg:153.15ms step:747/1480 train_time:112882ms step_avg:153.16ms step:748/1480 train_time:113048ms step_avg:153.18ms step:749/1480 train_time:113210ms step_avg:153.19ms step:750/1480 train_time:113369ms step_avg:153.20ms step:750/1480 val_loss:3.5553 train_time:113435ms step_avg:153.29ms step:751/1480 train_time:113535ms step_avg:153.22ms step:752/1480 train_time:113698ms step_avg:153.23ms step:753/1480 train_time:113860ms step_avg:153.24ms step:754/1480 train_time:114021ms step_avg:153.25ms step:755/1480 train_time:114184ms step_avg:153.27ms step:756/1480 train_time:114346ms step_avg:153.28ms step:757/1480 train_time:114511ms step_avg:153.29ms step:758/1480 train_time:114670ms step_avg:153.30ms step:759/1480 train_time:114831ms step_avg:153.31ms step:760/1480 train_time:114992ms step_avg:153.32ms step:761/1480 train_time:115154ms step_avg:153.33ms step:762/1480 train_time:115315ms step_avg:153.34ms step:763/1480 train_time:115476ms 
step_avg:153.35ms step:764/1480 train_time:115637ms step_avg:153.37ms step:765/1480 train_time:115798ms step_avg:153.38ms step:766/1480 train_time:115962ms step_avg:153.39ms step:767/1480 train_time:116126ms step_avg:153.40ms step:768/1480 train_time:116289ms step_avg:153.42ms step:769/1480 train_time:116452ms step_avg:153.43ms step:770/1480 train_time:116615ms step_avg:153.44ms step:771/1480 train_time:116779ms step_avg:153.45ms step:772/1480 train_time:116940ms step_avg:153.46ms step:773/1480 train_time:117104ms step_avg:153.48ms step:774/1480 train_time:117267ms step_avg:153.49ms step:775/1480 train_time:117429ms step_avg:153.50ms step:776/1480 train_time:117592ms step_avg:153.51ms step:777/1480 train_time:117759ms step_avg:153.53ms step:778/1480 train_time:117922ms step_avg:153.54ms step:779/1480 train_time:118086ms step_avg:153.56ms step:780/1480 train_time:118250ms step_avg:153.57ms step:781/1480 train_time:118414ms step_avg:153.59ms step:782/1480 train_time:118576ms step_avg:153.60ms step:783/1480 train_time:118736ms step_avg:153.60ms step:784/1480 train_time:118900ms step_avg:153.62ms step:785/1480 train_time:119064ms step_avg:153.63ms step:786/1480 train_time:119230ms step_avg:153.65ms step:787/1480 train_time:119393ms step_avg:153.66ms step:788/1480 train_time:119556ms step_avg:153.67ms step:789/1480 train_time:119717ms step_avg:153.68ms step:790/1480 train_time:119883ms step_avg:153.70ms step:791/1480 train_time:120050ms step_avg:153.71ms step:792/1480 train_time:120214ms step_avg:153.73ms step:793/1480 train_time:120374ms step_avg:153.73ms step:794/1480 train_time:120539ms step_avg:153.75ms step:795/1480 train_time:120705ms step_avg:153.76ms step:796/1480 train_time:120870ms step_avg:153.78ms step:797/1480 train_time:121033ms step_avg:153.79ms step:798/1480 train_time:121198ms step_avg:153.80ms step:799/1480 train_time:121365ms step_avg:153.82ms step:800/1480 train_time:121529ms step_avg:153.83ms step:801/1480 train_time:121692ms step_avg:153.85ms 
step:802/1480 train_time:121858ms step_avg:153.86ms step:803/1480 train_time:122022ms step_avg:153.87ms step:804/1480 train_time:122184ms step_avg:153.88ms step:805/1480 train_time:122349ms step_avg:153.90ms step:806/1480 train_time:122511ms step_avg:153.91ms step:807/1480 train_time:122672ms step_avg:153.92ms step:808/1480 train_time:122835ms step_avg:153.93ms step:809/1480 train_time:122998ms step_avg:153.94ms step:810/1480 train_time:123159ms step_avg:153.95ms step:811/1480 train_time:123322ms step_avg:153.96ms step:812/1480 train_time:123486ms step_avg:153.97ms step:813/1480 train_time:123647ms step_avg:153.98ms step:814/1480 train_time:123810ms step_avg:153.99ms step:815/1480 train_time:123971ms step_avg:154.00ms step:816/1480 train_time:124134ms step_avg:154.01ms step:817/1480 train_time:124296ms step_avg:154.02ms step:818/1480 train_time:124458ms step_avg:154.03ms step:819/1480 train_time:124624ms step_avg:154.05ms step:820/1480 train_time:124788ms step_avg:154.06ms step:821/1480 train_time:124949ms step_avg:154.07ms step:822/1480 train_time:125111ms step_avg:154.08ms step:823/1480 train_time:125273ms step_avg:154.09ms step:824/1480 train_time:125434ms step_avg:154.10ms step:825/1480 train_time:125600ms step_avg:154.11ms step:826/1480 train_time:125766ms step_avg:154.13ms step:827/1480 train_time:125932ms step_avg:154.14ms step:828/1480 train_time:126094ms step_avg:154.15ms step:829/1480 train_time:126256ms step_avg:154.16ms step:830/1480 train_time:126421ms step_avg:154.17ms step:831/1480 train_time:126586ms step_avg:154.19ms step:832/1480 train_time:126750ms step_avg:154.20ms step:833/1480 train_time:126913ms step_avg:154.21ms step:834/1480 train_time:127076ms step_avg:154.22ms step:835/1480 train_time:127239ms step_avg:154.23ms step:836/1480 train_time:127406ms step_avg:154.24ms step:837/1480 train_time:127568ms step_avg:154.25ms step:838/1480 train_time:127732ms step_avg:154.27ms step:839/1480 train_time:127894ms step_avg:154.27ms step:840/1480 
train_time:128055ms step_avg:154.28ms step:841/1480 train_time:128215ms step_avg:154.29ms step:842/1480 train_time:128379ms step_avg:154.30ms step:843/1480 train_time:128542ms step_avg:154.31ms step:844/1480 train_time:128706ms step_avg:154.32ms step:845/1480 train_time:128870ms step_avg:154.34ms step:846/1480 train_time:129035ms step_avg:154.35ms step:847/1480 train_time:129200ms step_avg:154.36ms step:848/1480 train_time:129362ms step_avg:154.37ms step:849/1480 train_time:129525ms step_avg:154.38ms step:850/1480 train_time:129689ms step_avg:154.39ms step:851/1480 train_time:129854ms step_avg:154.40ms step:852/1480 train_time:130016ms step_avg:154.41ms step:853/1480 train_time:130178ms step_avg:154.42ms step:854/1480 train_time:130344ms step_avg:154.44ms step:855/1480 train_time:130508ms step_avg:154.45ms step:856/1480 train_time:130670ms step_avg:154.46ms step:857/1480 train_time:130834ms step_avg:154.47ms step:858/1480 train_time:131001ms step_avg:154.48ms step:859/1480 train_time:131166ms step_avg:154.49ms step:860/1480 train_time:131328ms step_avg:154.50ms step:861/1480 train_time:131496ms step_avg:154.52ms step:862/1480 train_time:131666ms step_avg:154.54ms step:863/1480 train_time:131834ms step_avg:154.55ms step:864/1480 train_time:131998ms step_avg:154.56ms step:865/1480 train_time:132160ms step_avg:154.57ms step:866/1480 train_time:132328ms step_avg:154.59ms step:867/1480 train_time:132492ms step_avg:154.60ms step:868/1480 train_time:132652ms step_avg:154.61ms step:869/1480 train_time:132813ms step_avg:154.61ms step:870/1480 train_time:132977ms step_avg:154.62ms step:871/1480 train_time:133140ms step_avg:154.63ms step:872/1480 train_time:133305ms step_avg:154.65ms step:873/1480 train_time:133467ms step_avg:154.65ms step:874/1480 train_time:133633ms step_avg:154.67ms step:875/1480 train_time:133797ms step_avg:154.68ms step:875/1480 val_loss:3.5068 train_time:133862ms step_avg:154.75ms step:876/1480 train_time:133962ms step_avg:154.69ms step:877/1480 
train_time:134130ms step_avg:154.71ms step:878/1480 train_time:134294ms step_avg:154.72ms step:879/1480 train_time:134458ms step_avg:154.73ms step:880/1480 train_time:134621ms step_avg:154.74ms step:881/1480 train_time:134782ms step_avg:154.74ms step:882/1480 train_time:134949ms step_avg:154.76ms step:883/1480 train_time:135114ms step_avg:154.77ms step:884/1480 train_time:135280ms step_avg:154.78ms step:885/1480 train_time:135445ms step_avg:154.79ms step:886/1480 train_time:135611ms step_avg:154.81ms step:887/1480 train_time:135778ms step_avg:154.82ms step:888/1480 train_time:135952ms step_avg:154.84ms step:889/1480 train_time:136121ms step_avg:154.86ms step:890/1480 train_time:136282ms step_avg:154.87ms step:891/1480 train_time:136450ms step_avg:154.88ms step:892/1480 train_time:136614ms step_avg:154.89ms step:893/1480 train_time:136776ms step_avg:154.90ms step:894/1480 train_time:136941ms step_avg:154.91ms step:895/1480 train_time:137109ms step_avg:154.92ms step:896/1480 train_time:137275ms step_avg:154.94ms step:897/1480 train_time:137445ms step_avg:154.95ms step:898/1480 train_time:137612ms step_avg:154.97ms step:899/1480 train_time:137776ms step_avg:154.98ms step:900/1480 train_time:137939ms step_avg:154.99ms step:901/1480 train_time:138103ms step_avg:155.00ms step:902/1480 train_time:138267ms step_avg:155.01ms step:903/1480 train_time:138438ms step_avg:155.03ms step:904/1480 train_time:138603ms step_avg:155.04ms step:905/1480 train_time:138765ms step_avg:155.04ms step:906/1480 train_time:138932ms step_avg:155.06ms step:907/1480 train_time:139098ms step_avg:155.07ms step:908/1480 train_time:139261ms step_avg:155.08ms step:909/1480 train_time:139425ms step_avg:155.09ms step:910/1480 train_time:139594ms step_avg:155.10ms step:911/1480 train_time:139758ms step_avg:155.11ms step:912/1480 train_time:139923ms step_avg:155.12ms step:913/1480 train_time:140091ms step_avg:155.14ms step:914/1480 train_time:140258ms step_avg:155.15ms step:915/1480 train_time:140429ms 
step_avg:155.17ms step:916/1480 train_time:140593ms step_avg:155.18ms step:917/1480 train_time:140757ms step_avg:155.19ms step:918/1480 train_time:140925ms step_avg:155.20ms step:919/1480 train_time:141095ms step_avg:155.22ms step:920/1480 train_time:141259ms step_avg:155.23ms step:921/1480 train_time:141425ms step_avg:155.24ms step:922/1480 train_time:141592ms step_avg:155.25ms step:923/1480 train_time:141754ms step_avg:155.26ms step:924/1480 train_time:141918ms step_avg:155.27ms step:925/1480 train_time:142083ms step_avg:155.28ms step:926/1480 train_time:142248ms step_avg:155.29ms step:927/1480 train_time:142412ms step_avg:155.30ms step:928/1480 train_time:142577ms step_avg:155.31ms step:929/1480 train_time:142742ms step_avg:155.32ms step:930/1480 train_time:142909ms step_avg:155.34ms step:931/1480 train_time:143072ms step_avg:155.34ms step:932/1480 train_time:143238ms step_avg:155.36ms step:933/1480 train_time:143405ms step_avg:155.37ms step:934/1480 train_time:143573ms step_avg:155.38ms step:935/1480 train_time:143742ms step_avg:155.40ms step:936/1480 train_time:143908ms step_avg:155.41ms step:937/1480 train_time:144077ms step_avg:155.42ms step:938/1480 train_time:144239ms step_avg:155.43ms step:939/1480 train_time:144408ms step_avg:155.44ms step:940/1480 train_time:144575ms step_avg:155.46ms step:941/1480 train_time:144738ms step_avg:155.46ms step:942/1480 train_time:144903ms step_avg:155.48ms step:943/1480 train_time:145073ms step_avg:155.49ms step:944/1480 train_time:145246ms step_avg:155.51ms step:945/1480 train_time:145410ms step_avg:155.52ms step:946/1480 train_time:145580ms step_avg:155.53ms step:947/1480 train_time:145747ms step_avg:155.55ms step:948/1480 train_time:145912ms step_avg:155.56ms step:949/1480 train_time:146078ms step_avg:155.57ms step:950/1480 train_time:146240ms step_avg:155.57ms step:951/1480 train_time:146410ms step_avg:155.59ms step:952/1480 train_time:146577ms step_avg:155.60ms step:953/1480 train_time:146746ms step_avg:155.62ms 
step:954/1480 train_time:146915ms step_avg:155.63ms step:955/1480 train_time:147079ms step_avg:155.64ms step:956/1480 train_time:147244ms step_avg:155.65ms step:957/1480 train_time:147412ms step_avg:155.66ms step:958/1480 train_time:147581ms step_avg:155.68ms step:959/1480 train_time:147747ms step_avg:155.69ms step:960/1480 train_time:147912ms step_avg:155.70ms step:961/1480 train_time:148076ms step_avg:155.71ms step:962/1480 train_time:148240ms step_avg:155.71ms step:963/1480 train_time:148405ms step_avg:155.72ms step:964/1480 train_time:148574ms step_avg:155.74ms step:965/1480 train_time:148738ms step_avg:155.75ms step:966/1480 train_time:148902ms step_avg:155.75ms step:967/1480 train_time:149065ms step_avg:155.76ms step:968/1480 train_time:149230ms step_avg:155.77ms step:969/1480 train_time:149396ms step_avg:155.78ms step:970/1480 train_time:149559ms step_avg:155.79ms step:971/1480 train_time:149722ms step_avg:155.80ms step:972/1480 train_time:149887ms step_avg:155.81ms step:973/1480 train_time:150052ms step_avg:155.82ms step:974/1480 train_time:150220ms step_avg:155.83ms step:975/1480 train_time:150384ms step_avg:155.84ms step:976/1480 train_time:150550ms step_avg:155.85ms step:977/1480 train_time:150714ms step_avg:155.86ms step:978/1480 train_time:150879ms step_avg:155.87ms step:979/1480 train_time:151045ms step_avg:155.88ms step:980/1480 train_time:151212ms step_avg:155.89ms step:981/1480 train_time:151378ms step_avg:155.90ms step:982/1480 train_time:151542ms step_avg:155.91ms step:983/1480 train_time:151708ms step_avg:155.92ms step:984/1480 train_time:151873ms step_avg:155.93ms step:985/1480 train_time:152040ms step_avg:155.94ms step:986/1480 train_time:152205ms step_avg:155.95ms step:987/1480 train_time:152370ms step_avg:155.96ms step:988/1480 train_time:152537ms step_avg:155.97ms step:989/1480 train_time:152702ms step_avg:155.98ms step:990/1480 train_time:152872ms step_avg:155.99ms step:991/1480 train_time:153040ms step_avg:156.00ms step:992/1480 
train_time:153216ms step_avg:156.02ms step:993/1480 train_time:153393ms step_avg:156.05ms step:994/1480 train_time:153557ms step_avg:156.05ms step:995/1480 train_time:153720ms step_avg:156.06ms step:996/1480 train_time:153883ms step_avg:156.07ms step:997/1480 train_time:154049ms step_avg:156.08ms step:998/1480 train_time:154212ms step_avg:156.09ms step:999/1480 train_time:154377ms step_avg:156.09ms step:1000/1480 train_time:154545ms step_avg:156.11ms step:1000/1480 val_loss:3.4429 train_time:154611ms step_avg:156.17ms step:1001/1480 train_time:154712ms step_avg:156.12ms step:1002/1480 train_time:154881ms step_avg:156.13ms step:1003/1480 train_time:155052ms step_avg:156.14ms step:1004/1480 train_time:155220ms step_avg:156.16ms step:1005/1480 train_time:155388ms step_avg:156.17ms step:1006/1480 train_time:155556ms step_avg:156.18ms step:1007/1480 train_time:155721ms step_avg:156.19ms step:1008/1480 train_time:155889ms step_avg:156.20ms step:1009/1480 train_time:156063ms step_avg:156.22ms step:1010/1480 train_time:156227ms step_avg:156.23ms step:1011/1480 train_time:156393ms step_avg:156.24ms step:1012/1480 train_time:156559ms step_avg:156.25ms step:1013/1480 train_time:156729ms step_avg:156.26ms step:1014/1480 train_time:156896ms step_avg:156.27ms step:1015/1480 train_time:157067ms step_avg:156.29ms step:1016/1480 train_time:157236ms step_avg:156.30ms step:1017/1480 train_time:157407ms step_avg:156.31ms step:1018/1480 train_time:157576ms step_avg:156.33ms step:1019/1480 train_time:157744ms step_avg:156.34ms step:1020/1480 train_time:157913ms step_avg:156.35ms step:1021/1480 train_time:158078ms step_avg:156.36ms step:1022/1480 train_time:158246ms step_avg:156.37ms step:1023/1480 train_time:158414ms step_avg:156.38ms step:1024/1480 train_time:158582ms step_avg:156.39ms step:1025/1480 train_time:158752ms step_avg:156.41ms step:1026/1480 train_time:158917ms step_avg:156.41ms step:1027/1480 train_time:159084ms step_avg:156.42ms step:1028/1480 train_time:159255ms 
step_avg:156.44ms step:1029/1480 train_time:159430ms step_avg:156.46ms step:1030/1480 train_time:159598ms step_avg:156.47ms step:1031/1480 train_time:159763ms step_avg:156.48ms step:1032/1480 train_time:159934ms step_avg:156.49ms step:1033/1480 train_time:160102ms step_avg:156.50ms step:1034/1480 train_time:160270ms step_avg:156.51ms step:1035/1480 train_time:160437ms step_avg:156.52ms step:1036/1480 train_time:160603ms step_avg:156.53ms step:1037/1480 train_time:160770ms step_avg:156.54ms step:1038/1480 train_time:160937ms step_avg:156.55ms step:1039/1480 train_time:161107ms step_avg:156.57ms step:1040/1480 train_time:161272ms step_avg:156.58ms step:1041/1480 train_time:161440ms step_avg:156.59ms step:1042/1480 train_time:161605ms step_avg:156.59ms step:1043/1480 train_time:161769ms step_avg:156.60ms step:1044/1480 train_time:161933ms step_avg:156.61ms step:1045/1480 train_time:162104ms step_avg:156.62ms step:1046/1480 train_time:162271ms step_avg:156.63ms step:1047/1480 train_time:162437ms step_avg:156.64ms step:1048/1480 train_time:162605ms step_avg:156.65ms step:1049/1480 train_time:162770ms step_avg:156.66ms step:1050/1480 train_time:162940ms step_avg:156.67ms step:1051/1480 train_time:163109ms step_avg:156.69ms step:1052/1480 train_time:163276ms step_avg:156.70ms step:1053/1480 train_time:163442ms step_avg:156.70ms step:1054/1480 train_time:163610ms step_avg:156.71ms step:1055/1480 train_time:163774ms step_avg:156.72ms step:1056/1480 train_time:163939ms step_avg:156.73ms step:1057/1480 train_time:164107ms step_avg:156.74ms step:1058/1480 train_time:164276ms step_avg:156.75ms step:1059/1480 train_time:164448ms step_avg:156.77ms step:1060/1480 train_time:164616ms step_avg:156.78ms step:1061/1480 train_time:164779ms step_avg:156.78ms step:1062/1480 train_time:164945ms step_avg:156.79ms step:1063/1480 train_time:165110ms step_avg:156.80ms step:1064/1480 train_time:165275ms step_avg:156.81ms step:1065/1480 train_time:165442ms step_avg:156.82ms step:1066/1480 
train_time:165609ms step_avg:156.83ms step:1067/1480 train_time:165782ms step_avg:156.84ms step:1068/1480 train_time:165949ms step_avg:156.85ms step:1069/1480 train_time:166120ms step_avg:156.87ms step:1070/1480 train_time:166286ms step_avg:156.87ms step:1071/1480 train_time:166461ms step_avg:156.89ms step:1072/1480 train_time:166628ms step_avg:156.90ms step:1073/1480 train_time:166790ms step_avg:156.91ms step:1074/1480 train_time:166958ms step_avg:156.92ms step:1075/1480 train_time:167129ms step_avg:156.93ms step:1076/1480 train_time:167297ms step_avg:156.94ms step:1077/1480 train_time:167463ms step_avg:156.95ms step:1078/1480 train_time:167637ms step_avg:156.96ms step:1079/1480 train_time:167809ms step_avg:156.98ms step:1080/1480 train_time:167979ms step_avg:156.99ms step:1081/1480 train_time:168145ms step_avg:157.00ms step:1082/1480 train_time:168310ms step_avg:157.01ms step:1083/1480 train_time:168477ms step_avg:157.01ms step:1084/1480 train_time:168645ms step_avg:157.02ms step:1085/1480 train_time:168812ms step_avg:157.03ms step:1086/1480 train_time:168981ms step_avg:157.05ms step:1087/1480 train_time:169147ms step_avg:157.05ms step:1088/1480 train_time:169317ms step_avg:157.07ms step:1089/1480 train_time:169489ms step_avg:157.08ms step:1090/1480 train_time:169662ms step_avg:157.09ms step:1091/1480 train_time:169830ms step_avg:157.10ms step:1092/1480 train_time:169998ms step_avg:157.11ms step:1093/1480 train_time:170166ms step_avg:157.13ms step:1094/1480 train_time:170332ms step_avg:157.13ms step:1095/1480 train_time:170496ms step_avg:157.14ms step:1096/1480 train_time:170664ms step_avg:157.15ms step:1097/1480 train_time:170833ms step_avg:157.16ms step:1098/1480 train_time:171004ms step_avg:157.17ms step:1099/1480 train_time:171175ms step_avg:157.19ms step:1100/1480 train_time:171347ms step_avg:157.20ms step:1101/1480 train_time:171517ms step_avg:157.21ms step:1102/1480 train_time:171689ms step_avg:157.22ms step:1103/1480 train_time:171865ms step_avg:157.24ms 
step:1104/1480 train_time:172033ms step_avg:157.25ms step:1105/1480 train_time:172204ms step_avg:157.26ms step:1106/1480 train_time:172372ms step_avg:157.27ms step:1107/1480 train_time:172541ms step_avg:157.28ms step:1108/1480 train_time:172706ms step_avg:157.29ms step:1109/1480 train_time:172873ms step_avg:157.30ms step:1110/1480 train_time:173038ms step_avg:157.31ms step:1111/1480 train_time:173205ms step_avg:157.32ms step:1112/1480 train_time:173373ms step_avg:157.33ms step:1113/1480 train_time:173553ms step_avg:157.35ms step:1114/1480 train_time:173726ms step_avg:157.36ms step:1115/1480 train_time:173899ms step_avg:157.37ms step:1116/1480 train_time:174066ms step_avg:157.38ms step:1117/1480 train_time:174239ms step_avg:157.40ms step:1118/1480 train_time:174415ms step_avg:157.41ms step:1119/1480 train_time:174582ms step_avg:157.42ms step:1120/1480 train_time:174749ms step_avg:157.43ms step:1121/1480 train_time:174919ms step_avg:157.44ms step:1122/1480 train_time:175087ms step_avg:157.45ms step:1123/1480 train_time:175254ms step_avg:157.46ms step:1124/1480 train_time:175422ms step_avg:157.47ms step:1125/1480 train_time:175590ms step_avg:157.48ms step:1125/1480 val_loss:3.3877 train_time:175658ms step_avg:157.54ms step:1126/1480 train_time:175760ms step_avg:157.49ms step:1127/1480 train_time:175931ms step_avg:157.50ms step:1128/1480 train_time:176103ms step_avg:157.52ms step:1129/1480 train_time:176277ms step_avg:157.53ms step:1130/1480 train_time:176446ms step_avg:157.54ms step:1131/1480 train_time:176624ms step_avg:157.56ms step:1132/1480 train_time:176790ms step_avg:157.57ms step:1133/1480 train_time:176963ms step_avg:157.58ms step:1134/1480 train_time:177134ms step_avg:157.59ms step:1135/1480 train_time:177301ms step_avg:157.60ms step:1136/1480 train_time:177471ms step_avg:157.61ms step:1137/1480 train_time:177640ms step_avg:157.62ms step:1138/1480 train_time:177812ms step_avg:157.63ms step:1139/1480 train_time:177980ms step_avg:157.64ms step:1140/1480 
train_time:178147ms step_avg:157.65ms step:1141/1480 train_time:178319ms step_avg:157.67ms step:1142/1480 train_time:178487ms step_avg:157.67ms step:1143/1480 train_time:178658ms step_avg:157.69ms step:1144/1480 train_time:178827ms step_avg:157.70ms step:1145/1480 train_time:178993ms step_avg:157.70ms step:1146/1480 train_time:179164ms step_avg:157.71ms step:1147/1480 train_time:179332ms step_avg:157.72ms step:1148/1480 train_time:179499ms step_avg:157.73ms step:1149/1480 train_time:179670ms step_avg:157.74ms step:1150/1480 train_time:179839ms step_avg:157.75ms step:1151/1480 train_time:180012ms step_avg:157.77ms step:1152/1480 train_time:180183ms step_avg:157.78ms step:1153/1480 train_time:180356ms step_avg:157.79ms step:1154/1480 train_time:180523ms step_avg:157.80ms step:1155/1480 train_time:180697ms step_avg:157.81ms step:1156/1480 train_time:180875ms step_avg:157.83ms step:1157/1480 train_time:181044ms step_avg:157.84ms step:1158/1480 train_time:181211ms step_avg:157.85ms step:1159/1480 train_time:181378ms step_avg:157.86ms step:1160/1480 train_time:181543ms step_avg:157.86ms step:1161/1480 train_time:181715ms step_avg:157.88ms step:1162/1480 train_time:181885ms step_avg:157.89ms step:1163/1480 train_time:182054ms step_avg:157.90ms step:1164/1480 train_time:182223ms step_avg:157.91ms step:1165/1480 train_time:182390ms step_avg:157.91ms step:1166/1480 train_time:182558ms step_avg:157.92ms step:1167/1480 train_time:182728ms step_avg:157.93ms step:1168/1480 train_time:182895ms step_avg:157.94ms step:1169/1480 train_time:183064ms step_avg:157.95ms step:1170/1480 train_time:183233ms step_avg:157.96ms step:1171/1480 train_time:183399ms step_avg:157.97ms step:1172/1480 train_time:183567ms step_avg:157.97ms step:1173/1480 train_time:183738ms step_avg:157.99ms step:1174/1480 train_time:183920ms step_avg:158.01ms step:1175/1480 train_time:184092ms step_avg:158.02ms step:1176/1480 train_time:184263ms step_avg:158.03ms step:1177/1480 train_time:184440ms step_avg:158.05ms 
step:1178/1480 train_time:184609ms step_avg:158.06ms step:1179/1480 train_time:184774ms step_avg:158.06ms step:1180/1480 train_time:184954ms step_avg:158.08ms step:1181/1480 train_time:185124ms step_avg:158.09ms step:1182/1480 train_time:185293ms step_avg:158.10ms step:1183/1480 train_time:185463ms step_avg:158.11ms step:1184/1480 train_time:185632ms step_avg:158.12ms step:1185/1480 train_time:185803ms step_avg:158.13ms step:1186/1480 train_time:185974ms step_avg:158.14ms step:1187/1480 train_time:186157ms step_avg:158.16ms step:1188/1480 train_time:186324ms step_avg:158.17ms step:1189/1480 train_time:186495ms step_avg:158.18ms step:1190/1480 train_time:186663ms step_avg:158.19ms step:1191/1480 train_time:186834ms step_avg:158.20ms step:1192/1480 train_time:187000ms step_avg:158.21ms step:1193/1480 train_time:187167ms step_avg:158.21ms step:1194/1480 train_time:187336ms step_avg:158.22ms step:1195/1480 train_time:187509ms step_avg:158.24ms step:1196/1480 train_time:187692ms step_avg:158.26ms step:1197/1480 train_time:187863ms step_avg:158.27ms step:1198/1480 train_time:188046ms step_avg:158.29ms step:1199/1480 train_time:188217ms step_avg:158.30ms step:1200/1480 train_time:188383ms step_avg:158.31ms step:1201/1480 train_time:188550ms step_avg:158.31ms step:1202/1480 train_time:188732ms step_avg:158.33ms step:1203/1480 train_time:188909ms step_avg:158.35ms step:1204/1480 train_time:189083ms step_avg:158.36ms step:1205/1480 train_time:189251ms step_avg:158.37ms step:1206/1480 train_time:189419ms step_avg:158.38ms step:1207/1480 train_time:189589ms step_avg:158.39ms step:1208/1480 train_time:189757ms step_avg:158.39ms step:1209/1480 train_time:189932ms step_avg:158.41ms step:1210/1480 train_time:190107ms step_avg:158.42ms step:1211/1480 train_time:190280ms step_avg:158.43ms step:1212/1480 train_time:190452ms step_avg:158.45ms step:1213/1480 train_time:190624ms step_avg:158.46ms step:1214/1480 train_time:190801ms step_avg:158.47ms step:1215/1480 train_time:190972ms 
step_avg:158.48ms step:1216/1480 train_time:191139ms step_avg:158.49ms step:1217/1480 train_time:191313ms step_avg:158.50ms step:1218/1480 train_time:191482ms step_avg:158.51ms step:1219/1480 train_time:191661ms step_avg:158.53ms step:1220/1480 train_time:191832ms step_avg:158.54ms step:1221/1480 train_time:192000ms step_avg:158.55ms step:1222/1480 train_time:192167ms step_avg:158.55ms step:1223/1480 train_time:192337ms step_avg:158.56ms step:1224/1480 train_time:192515ms step_avg:158.58ms step:1225/1480 train_time:192687ms step_avg:158.59ms step:1226/1480 train_time:192860ms step_avg:158.60ms step:1227/1480 train_time:193032ms step_avg:158.61ms step:1228/1480 train_time:193200ms step_avg:158.62ms step:1229/1480 train_time:193372ms step_avg:158.63ms step:1230/1480 train_time:193554ms step_avg:158.65ms step:1231/1480 train_time:193730ms step_avg:158.67ms step:1232/1480 train_time:193906ms step_avg:158.68ms step:1233/1480 train_time:194074ms step_avg:158.69ms step:1234/1480 train_time:194246ms step_avg:158.70ms step:1235/1480 train_time:194420ms step_avg:158.71ms step:1236/1480 train_time:194588ms step_avg:158.72ms step:1237/1480 train_time:194759ms step_avg:158.73ms step:1238/1480 train_time:194944ms step_avg:158.75ms step:1239/1480 train_time:195114ms step_avg:158.76ms step:1240/1480 train_time:195286ms step_avg:158.77ms step:1241/1480 train_time:195458ms step_avg:158.78ms step:1242/1480 train_time:195628ms step_avg:158.79ms step:1243/1480 train_time:195799ms step_avg:158.80ms step:1244/1480 train_time:195966ms step_avg:158.81ms step:1245/1480 train_time:196135ms step_avg:158.81ms step:1246/1480 train_time:196306ms step_avg:158.82ms step:1247/1480 train_time:196475ms step_avg:158.83ms step:1248/1480 train_time:196644ms step_avg:158.84ms step:1249/1480 train_time:196813ms step_avg:158.85ms step:1250/1480 train_time:196982ms step_avg:158.86ms step:1250/1480 val_loss:3.3371 train_time:197053ms step_avg:158.91ms step:1251/1480 train_time:197162ms step_avg:158.87ms 
step:1252/1480 train_time:197331ms step_avg:158.88ms step:1253/1480 train_time:197498ms step_avg:158.89ms step:1254/1480 train_time:197670ms step_avg:158.90ms step:1255/1480 train_time:197857ms step_avg:158.92ms step:1256/1480 train_time:198030ms step_avg:158.93ms step:1257/1480 train_time:198200ms step_avg:158.94ms step:1258/1480 train_time:198376ms step_avg:158.95ms step:1259/1480 train_time:198546ms step_avg:158.96ms step:1260/1480 train_time:198715ms step_avg:158.97ms step:1261/1480 train_time:198886ms step_avg:158.98ms step:1262/1480 train_time:199062ms step_avg:158.99ms step:1263/1480 train_time:199235ms step_avg:159.01ms step:1264/1480 train_time:199401ms step_avg:159.01ms step:1265/1480 train_time:199566ms step_avg:159.02ms step:1266/1480 train_time:199739ms step_avg:159.03ms step:1267/1480 train_time:199909ms step_avg:159.04ms step:1268/1480 train_time:200080ms step_avg:159.05ms step:1269/1480 train_time:200257ms step_avg:159.06ms step:1270/1480 train_time:200426ms step_avg:159.07ms step:1271/1480 train_time:200596ms step_avg:159.08ms step:1272/1480 train_time:200761ms step_avg:159.08ms step:1273/1480 train_time:200934ms step_avg:159.09ms step:1274/1480 train_time:201105ms step_avg:159.10ms step:1275/1480 train_time:201273ms step_avg:159.11ms step:1276/1480 train_time:201438ms step_avg:159.11ms step:1277/1480 train_time:201611ms step_avg:159.12ms step:1278/1480 train_time:201781ms step_avg:159.13ms step:1279/1480 train_time:201954ms step_avg:159.14ms step:1280/1480 train_time:202133ms step_avg:159.16ms step:1281/1480 train_time:202301ms step_avg:159.17ms step:1282/1480 train_time:202468ms step_avg:159.17ms step:1283/1480 train_time:202639ms step_avg:159.18ms step:1284/1480 train_time:202810ms step_avg:159.19ms step:1285/1480 train_time:202979ms step_avg:159.20ms step:1286/1480 train_time:203149ms step_avg:159.21ms step:1287/1480 train_time:203321ms step_avg:159.22ms step:1288/1480 train_time:203493ms step_avg:159.23ms step:1289/1480 train_time:203677ms 
step_avg:159.25ms step:1290/1480 train_time:203858ms step_avg:159.26ms step:1291/1480 train_time:204031ms step_avg:159.27ms step:1292/1480 train_time:204204ms step_avg:159.29ms step:1293/1480 train_time:204379ms step_avg:159.30ms step:1294/1480 train_time:204552ms step_avg:159.31ms step:1295/1480 train_time:204724ms step_avg:159.32ms step:1296/1480 train_time:204898ms step_avg:159.33ms step:1297/1480 train_time:205070ms step_avg:159.34ms step:1298/1480 train_time:205242ms step_avg:159.35ms step:1299/1480 train_time:205413ms step_avg:159.36ms step:1300/1480 train_time:205581ms step_avg:159.37ms step:1301/1480 train_time:205751ms step_avg:159.37ms step:1302/1480 train_time:205925ms step_avg:159.38ms step:1303/1480 train_time:206099ms step_avg:159.40ms step:1304/1480 train_time:206271ms step_avg:159.41ms step:1305/1480 train_time:206440ms step_avg:159.41ms step:1306/1480 train_time:206614ms step_avg:159.42ms step:1307/1480 train_time:206782ms step_avg:159.43ms step:1308/1480 train_time:206952ms step_avg:159.44ms step:1309/1480 train_time:207123ms step_avg:159.45ms step:1310/1480 train_time:207291ms step_avg:159.45ms step:1311/1480 train_time:207460ms step_avg:159.46ms step:1312/1480 train_time:207633ms step_avg:159.47ms step:1313/1480 train_time:207799ms step_avg:159.48ms step:1314/1480 train_time:207973ms step_avg:159.49ms step:1315/1480 train_time:208142ms step_avg:159.50ms step:1316/1480 train_time:208309ms step_avg:159.50ms step:1317/1480 train_time:208482ms step_avg:159.51ms step:1318/1480 train_time:208662ms step_avg:159.53ms step:1319/1480 train_time:208838ms step_avg:159.54ms step:1320/1480 train_time:209015ms step_avg:159.55ms step:1321/1480 train_time:209186ms step_avg:159.56ms step:1322/1480 train_time:209368ms step_avg:159.58ms step:1323/1480 train_time:209539ms step_avg:159.59ms step:1324/1480 train_time:209716ms step_avg:159.60ms step:1325/1480 train_time:209896ms step_avg:159.62ms step:1326/1480 train_time:210071ms step_avg:159.63ms step:1327/1480 
train_time:210241ms step_avg:159.64ms step:1328/1480 train_time:210412ms step_avg:159.64ms step:1329/1480 train_time:210607ms step_avg:159.67ms step:1330/1480 train_time:210785ms step_avg:159.69ms step:1331/1480 train_time:210955ms step_avg:159.69ms step:1332/1480 train_time:211127ms step_avg:159.70ms step:1333/1480 train_time:211303ms step_avg:159.71ms step:1334/1480 train_time:211473ms step_avg:159.72ms step:1335/1480 train_time:211642ms step_avg:159.73ms step:1336/1480 train_time:211825ms step_avg:159.75ms step:1337/1480 train_time:212000ms step_avg:159.76ms step:1338/1480 train_time:212172ms step_avg:159.77ms step:1339/1480 train_time:212346ms step_avg:159.78ms step:1340/1480 train_time:212518ms step_avg:159.79ms step:1341/1480 train_time:212686ms step_avg:159.79ms step:1342/1480 train_time:212860ms step_avg:159.81ms step:1343/1480 train_time:213032ms step_avg:159.81ms step:1344/1480 train_time:213203ms step_avg:159.82ms step:1345/1480 train_time:213381ms step_avg:159.84ms step:1346/1480 train_time:213550ms step_avg:159.84ms step:1347/1480 train_time:213720ms step_avg:159.85ms step:1348/1480 train_time:213891ms step_avg:159.86ms step:1349/1480 train_time:214060ms step_avg:159.87ms step:1350/1480 train_time:214235ms step_avg:159.88ms step:1351/1480 train_time:214406ms step_avg:159.89ms step:1352/1480 train_time:214577ms step_avg:159.89ms step:1353/1480 train_time:214753ms step_avg:159.91ms step:1354/1480 train_time:214923ms step_avg:159.91ms step:1355/1480 train_time:215091ms step_avg:159.92ms step:1356/1480 train_time:215263ms step_avg:159.93ms step:1357/1480 train_time:215437ms step_avg:159.94ms step:1358/1480 train_time:215609ms step_avg:159.95ms step:1359/1480 train_time:215780ms step_avg:159.96ms step:1360/1480 train_time:215955ms step_avg:159.97ms step:1361/1480 train_time:216133ms step_avg:159.98ms step:1362/1480 train_time:216310ms step_avg:159.99ms step:1363/1480 train_time:216489ms step_avg:160.01ms step:1364/1480 train_time:216658ms step_avg:160.01ms 
step:1365/1480 train_time:216824ms step_avg:160.02ms step:1366/1480 train_time:216996ms step_avg:160.03ms step:1367/1480 train_time:217166ms step_avg:160.03ms step:1368/1480 train_time:217339ms step_avg:160.04ms step:1369/1480 train_time:217521ms step_avg:160.06ms step:1370/1480 train_time:217698ms step_avg:160.07ms step:1371/1480 train_time:217870ms step_avg:160.08ms step:1372/1480 train_time:218049ms step_avg:160.09ms step:1373/1480 train_time:218219ms step_avg:160.10ms step:1374/1480 train_time:218395ms step_avg:160.11ms step:1375/1480 train_time:218566ms step_avg:160.12ms step:1375/1480 val_loss:3.2987 train_time:218633ms step_avg:160.17ms step:1376/1480 train_time:218738ms step_avg:160.13ms step:1377/1480 train_time:218911ms step_avg:160.14ms step:1378/1480 train_time:219079ms step_avg:160.15ms step:1379/1480 train_time:219255ms step_avg:160.16ms step:1380/1480 train_time:219429ms step_avg:160.17ms step:1381/1480 train_time:219611ms step_avg:160.18ms step:1382/1480 train_time:219781ms step_avg:160.19ms step:1383/1480 train_time:219954ms step_avg:160.20ms step:1384/1480 train_time:220131ms step_avg:160.21ms step:1385/1480 train_time:220297ms step_avg:160.22ms step:1386/1480 train_time:220468ms step_avg:160.22ms step:1387/1480 train_time:220640ms step_avg:160.23ms step:1388/1480 train_time:220808ms step_avg:160.24ms step:1389/1480 train_time:220981ms step_avg:160.25ms step:1390/1480 train_time:221149ms step_avg:160.25ms step:1391/1480 train_time:221318ms step_avg:160.26ms step:1392/1480 train_time:221491ms step_avg:160.27ms step:1393/1480 train_time:221662ms step_avg:160.28ms step:1394/1480 train_time:221833ms step_avg:160.28ms step:1395/1480 train_time:222001ms step_avg:160.29ms step:1396/1480 train_time:222170ms step_avg:160.30ms step:1397/1480 train_time:222337ms step_avg:160.30ms step:1398/1480 train_time:222504ms step_avg:160.31ms step:1399/1480 train_time:222674ms step_avg:160.31ms step:1400/1480 train_time:222852ms step_avg:160.33ms step:1401/1480 
train_time:223019ms step_avg:160.33ms step:1402/1480 train_time:223191ms step_avg:160.34ms step:1403/1480 train_time:223369ms step_avg:160.35ms step:1404/1480 train_time:223540ms step_avg:160.36ms step:1405/1480 train_time:223714ms step_avg:160.37ms step:1406/1480 train_time:223890ms step_avg:160.38ms step:1407/1480 train_time:224059ms step_avg:160.39ms step:1408/1480 train_time:224227ms step_avg:160.39ms step:1409/1480 train_time:224408ms step_avg:160.41ms step:1410/1480 train_time:224578ms step_avg:160.41ms step:1411/1480 train_time:224749ms step_avg:160.42ms step:1412/1480 train_time:224918ms step_avg:160.43ms step:1413/1480 train_time:225087ms step_avg:160.43ms step:1414/1480 train_time:225259ms step_avg:160.44ms step:1415/1480 train_time:225434ms step_avg:160.45ms step:1416/1480 train_time:225619ms step_avg:160.47ms step:1417/1480 train_time:225793ms step_avg:160.48ms step:1418/1480 train_time:225964ms step_avg:160.49ms step:1419/1480 train_time:226138ms step_avg:160.50ms step:1420/1480 train_time:226313ms step_avg:160.51ms step:1421/1480 train_time:226485ms step_avg:160.51ms step:1422/1480 train_time:226658ms step_avg:160.52ms step:1423/1480 train_time:226828ms step_avg:160.53ms step:1424/1480 train_time:227003ms step_avg:160.54ms step:1425/1480 train_time:227183ms step_avg:160.55ms step:1426/1480 train_time:227355ms step_avg:160.56ms step:1427/1480 train_time:227529ms step_avg:160.57ms step:1428/1480 train_time:227698ms step_avg:160.58ms step:1429/1480 train_time:227867ms step_avg:160.58ms step:1430/1480 train_time:228041ms step_avg:160.59ms step:1431/1480 train_time:228216ms step_avg:160.60ms step:1432/1480 train_time:228393ms step_avg:160.61ms step:1433/1480 train_time:228575ms step_avg:160.63ms step:1434/1480 train_time:228756ms step_avg:160.64ms step:1435/1480 train_time:228931ms step_avg:160.65ms step:1436/1480 train_time:229106ms step_avg:160.66ms step:1437/1480 train_time:229276ms step_avg:160.67ms step:1438/1480 train_time:229443ms step_avg:160.67ms 
step:1439/1480 train_time:229618ms step_avg:160.68ms step:1440/1480 train_time:229788ms step_avg:160.69ms step:1441/1480 train_time:229959ms step_avg:160.70ms step:1442/1480 train_time:230137ms step_avg:160.71ms step:1443/1480 train_time:230326ms step_avg:160.73ms step:1444/1480 train_time:230497ms step_avg:160.74ms step:1445/1480 train_time:230668ms step_avg:160.74ms step:1446/1480 train_time:230843ms step_avg:160.75ms step:1447/1480 train_time:231020ms step_avg:160.77ms step:1448/1480 train_time:231193ms step_avg:160.77ms step:1449/1480 train_time:231369ms step_avg:160.78ms step:1450/1480 train_time:231540ms step_avg:160.79ms step:1451/1480 train_time:231711ms step_avg:160.80ms step:1452/1480 train_time:231884ms step_avg:160.81ms step:1453/1480 train_time:232054ms step_avg:160.81ms step:1454/1480 train_time:232226ms step_avg:160.82ms step:1455/1480 train_time:232404ms step_avg:160.83ms step:1456/1480 train_time:232577ms step_avg:160.84ms step:1457/1480 train_time:232748ms step_avg:160.85ms step:1458/1480 train_time:232919ms step_avg:160.86ms step:1459/1480 train_time:233095ms step_avg:160.87ms step:1460/1480 train_time:233268ms step_avg:160.87ms step:1461/1480 train_time:233441ms step_avg:160.88ms step:1462/1480 train_time:233613ms step_avg:160.89ms step:1463/1480 train_time:233789ms step_avg:160.90ms step:1464/1480 train_time:233965ms step_avg:160.91ms step:1465/1480 train_time:234138ms step_avg:160.92ms step:1466/1480 train_time:234309ms step_avg:160.93ms step:1467/1480 train_time:234483ms step_avg:160.94ms step:1468/1480 train_time:234654ms step_avg:160.94ms step:1469/1480 train_time:234827ms step_avg:160.95ms step:1470/1480 train_time:235006ms step_avg:160.96ms step:1471/1480 train_time:235191ms step_avg:160.98ms step:1472/1480 train_time:235375ms step_avg:161.00ms step:1473/1480 train_time:235547ms step_avg:161.00ms step:1474/1480 train_time:235725ms step_avg:161.01ms step:1475/1480 train_time:235905ms step_avg:161.03ms step:1476/1480 train_time:236077ms 
step_avg:161.03ms step:1477/1480 train_time:236259ms step_avg:161.05ms step:1478/1480 train_time:236440ms step_avg:161.06ms step:1479/1480 train_time:236615ms step_avg:161.07ms step:1480/1480 train_time:236788ms step_avg:161.08ms step:1480/1480 val_loss:3.2797 train_time:236859ms step_avg:161.13ms