import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied along dim 1 of a 4-D input.

    The cos/sin tables are cached and rebuilt only when the sequence length
    (x.shape[1]) differs from the cached one; the cache key does not include
    device or dtype.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        """Rotate the two halves of x's last dimension (dim 3) by position-dependent angles."""
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # add broadcast axes at dims 0 and 2 (batch and head in the caller's (B, T, n_head, head_dim) layout)
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head attention using FlexAttention with QK-norm, rotary embeddings,
    and a learned mix of the layer's values with an external value embedding `vi`.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Arguments:
            x: input of shape (1, T, dim); batch size 1 is asserted.
            vi: value embedding, viewed to v's (B, T, n_head, head_dim) shape and mixed into v.
            block_mask: FlexAttention BlockMask restricting which keys each query may attend to.

        Returns a tensor with the same shape as x.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim), hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc   = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: a learned blend of the running stream `x` with the
    initial embedding `x0`, followed by pre-norm attention and pre-norm MLP residuals.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to [1, 0]: at init the blend passes x through and ignores x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12 # must be even and >= 2 (checked in GPT.__init__)
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) via tanh

class GPT(nn.Module):
    """
    GPT model with a U-net layout: the first half of the layers ("encoder") save
    their outputs, and the second half ("decoder") add them back in reverse order,
    each scaled by a learned skip weight. `forward` returns the cross-entropy loss.
    """

    def __init__(self, config: GPTConfig):
        """
        Raises:
            ValueError: if config.n_layer is odd or smaller than 2. The forward pass
                pairs every decoder layer with one saved encoder output and splits the
                value embedding into n_layer // 2 chunks, so such configs cannot run.
        """
        super().__init__()
        # Fail fast: with an odd n_layer the decoder would have one more layer than
        # there are saved skip connections, and forward() would only crash at pop().
        if config.n_layer < 2 or config.n_layer % 2 != 0:
            raise ValueError(f"n_layer must be an even number >= 2, got {config.n_layer}")
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            idx: 1-D tensor of token ids (asserted); its length must be a multiple of
                BLOCK_SIZE (128) because it is reshaped into blocks of that size.
            target: token ids to predict, flattened before the loss.
            sliding_window: tensor holding the attention window length in tokens.

        Returns the mean cross-entropy loss, computed on float32 logits.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running document index per token: increments at every occurrence of token 50256
        # (presumably the GPT-2 end-of-text id -- confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        # document index at the first / last token of each BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Build the block-level sparsity pattern by hand, then attach the token-level mask_mod.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # keep a block pair only if the document ranges of the two blocks overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window expressed in blocks, rounded up
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort moves the indices of kept kv blocks to the front of each row
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one n_embd-wide value embedding per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate a shard's header (magic number, version) and return its claimed token count."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256-int32 header into a pinned host tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Streams (input, target) token windows of length T from sharded .bin files.

    Each process starts at offset process_rank * T within a shard and moves forward
    by T * num_processes tokens per batch; shards are visited cyclically.
    """
    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (x, y) on "cuda": x is int32 inputs, y is int64 targets shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        # note: current_position includes this rank's offset, so the threshold is rank-specific
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # make sure the parent 'logs' directory exists; without it the open() below
    # raises FileNotFoundError on a fresh checkout
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Log `s` from the master process only: append it to the log file and, unless
    `logonly` is set, also print it to stdout. A no-op on every other rank.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not implemented: each rank must process exactly one sequence per step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# prefetch the first training batch; the training loop fetches each following one
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 12:19:01 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 125W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 123W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 110W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 45C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23347ms step_avg:nanms step:2/1480 train_time:23434ms step_avg:nanms step:3/1480 train_time:23573ms step_avg:nanms step:4/1480 train_time:23715ms step_avg:nanms step:5/1480 train_time:23857ms step_avg:nanms step:6/1480 train_time:23998ms step_avg:nanms step:7/1480 train_time:24139ms step_avg:nanms step:8/1480 train_time:24281ms step_avg:nanms step:9/1480 train_time:24424ms step_avg:nanms step:10/1480 train_time:24568ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:286ms step_avg:nanms step:13/1480 train_time:428ms step_avg:142.75ms step:14/1480 train_time:571ms step_avg:142.64ms step:15/1480 train_time:712ms step_avg:142.48ms step:16/1480 train_time:855ms step_avg:142.44ms step:17/1480 train_time:996ms step_avg:142.33ms step:18/1480 train_time:1138ms step_avg:142.28ms step:19/1480 train_time:1282ms step_avg:142.50ms step:20/1480 train_time:1425ms step_avg:142.52ms step:21/1480 train_time:1568ms step_avg:142.51ms step:22/1480 train_time:1710ms step_avg:142.50ms step:23/1480 train_time:1852ms step_avg:142.47ms step:24/1480 train_time:1995ms step_avg:142.49ms step:25/1480 train_time:2137ms step_avg:142.44ms step:26/1480 train_time:2280ms step_avg:142.49ms step:27/1480 train_time:2421ms step_avg:142.42ms step:28/1480 train_time:2566ms step_avg:142.55ms step:29/1480 
train_time:2710ms step_avg:142.62ms step:30/1480 train_time:2853ms step_avg:142.64ms step:31/1480 train_time:2995ms step_avg:142.63ms step:32/1480 train_time:3136ms step_avg:142.56ms step:33/1480 train_time:3278ms step_avg:142.52ms step:34/1480 train_time:3419ms step_avg:142.46ms step:35/1480 train_time:3563ms step_avg:142.51ms step:36/1480 train_time:3706ms step_avg:142.55ms step:37/1480 train_time:3850ms step_avg:142.58ms step:38/1480 train_time:3993ms step_avg:142.61ms step:39/1480 train_time:4136ms step_avg:142.63ms step:40/1480 train_time:4278ms step_avg:142.61ms step:41/1480 train_time:4419ms step_avg:142.55ms step:42/1480 train_time:4564ms step_avg:142.63ms step:43/1480 train_time:4709ms step_avg:142.69ms step:44/1480 train_time:4852ms step_avg:142.70ms step:45/1480 train_time:4995ms step_avg:142.71ms step:46/1480 train_time:5138ms step_avg:142.73ms step:47/1480 train_time:5281ms step_avg:142.72ms step:48/1480 train_time:5424ms step_avg:142.75ms step:49/1480 train_time:5569ms step_avg:142.79ms step:50/1480 train_time:5714ms step_avg:142.85ms step:51/1480 train_time:5856ms step_avg:142.82ms step:52/1480 train_time:5999ms step_avg:142.83ms step:53/1480 train_time:6141ms step_avg:142.81ms step:54/1480 train_time:6285ms step_avg:142.83ms step:55/1480 train_time:6428ms step_avg:142.84ms step:56/1480 train_time:6572ms step_avg:142.87ms step:57/1480 train_time:6716ms step_avg:142.89ms step:58/1480 train_time:6857ms step_avg:142.85ms step:59/1480 train_time:7001ms step_avg:142.87ms step:60/1480 train_time:7144ms step_avg:142.89ms step:61/1480 train_time:7288ms step_avg:142.90ms step:62/1480 train_time:7431ms step_avg:142.91ms step:63/1480 train_time:7575ms step_avg:142.92ms step:64/1480 train_time:7719ms step_avg:142.94ms step:65/1480 train_time:7858ms step_avg:142.87ms step:66/1480 train_time:8000ms step_avg:142.85ms step:67/1480 train_time:8143ms step_avg:142.86ms step:68/1480 train_time:8287ms step_avg:142.88ms step:69/1480 train_time:8430ms step_avg:142.89ms 
step:70/1480 train_time:8573ms step_avg:142.89ms step:71/1480 train_time:8715ms step_avg:142.87ms step:72/1480 train_time:8857ms step_avg:142.85ms step:73/1480 train_time:8999ms step_avg:142.84ms step:74/1480 train_time:9141ms step_avg:142.83ms step:75/1480 train_time:9285ms step_avg:142.85ms step:76/1480 train_time:9428ms step_avg:142.85ms step:77/1480 train_time:9571ms step_avg:142.85ms step:78/1480 train_time:9713ms step_avg:142.84ms step:79/1480 train_time:9855ms step_avg:142.82ms step:80/1480 train_time:9997ms step_avg:142.82ms step:81/1480 train_time:10139ms step_avg:142.81ms step:82/1480 train_time:10283ms step_avg:142.82ms step:83/1480 train_time:10425ms step_avg:142.81ms step:84/1480 train_time:10570ms step_avg:142.83ms step:85/1480 train_time:10712ms step_avg:142.82ms step:86/1480 train_time:10852ms step_avg:142.79ms step:87/1480 train_time:10995ms step_avg:142.79ms step:88/1480 train_time:11136ms step_avg:142.77ms step:89/1480 train_time:11281ms step_avg:142.79ms step:90/1480 train_time:11424ms step_avg:142.79ms step:91/1480 train_time:11569ms step_avg:142.82ms step:92/1480 train_time:11712ms step_avg:142.83ms step:93/1480 train_time:11854ms step_avg:142.81ms step:94/1480 train_time:11997ms step_avg:142.82ms step:95/1480 train_time:12138ms step_avg:142.80ms step:96/1480 train_time:12281ms step_avg:142.80ms step:97/1480 train_time:12423ms step_avg:142.79ms step:98/1480 train_time:12567ms step_avg:142.81ms step:99/1480 train_time:12711ms step_avg:142.82ms step:100/1480 train_time:12854ms step_avg:142.82ms step:101/1480 train_time:12997ms step_avg:142.83ms step:102/1480 train_time:13141ms step_avg:142.83ms step:103/1480 train_time:13283ms step_avg:142.83ms step:104/1480 train_time:13426ms step_avg:142.83ms step:105/1480 train_time:13570ms step_avg:142.84ms step:106/1480 train_time:13714ms step_avg:142.85ms step:107/1480 train_time:13857ms step_avg:142.85ms step:108/1480 train_time:13999ms step_avg:142.85ms step:109/1480 train_time:14142ms step_avg:142.85ms 
step:110/1480 train_time:14286ms step_avg:142.86ms step:111/1480 train_time:14430ms step_avg:142.88ms step:112/1480 train_time:14577ms step_avg:142.91ms step:113/1480 train_time:14723ms step_avg:142.95ms step:114/1480 train_time:14871ms step_avg:142.99ms step:115/1480 train_time:15017ms step_avg:143.02ms step:116/1480 train_time:15166ms step_avg:143.07ms step:117/1480 train_time:15315ms step_avg:143.14ms step:118/1480 train_time:15459ms step_avg:143.14ms step:119/1480 train_time:15607ms step_avg:143.18ms step:120/1480 train_time:15754ms step_avg:143.22ms step:121/1480 train_time:15901ms step_avg:143.25ms step:122/1480 train_time:16048ms step_avg:143.29ms step:123/1480 train_time:16196ms step_avg:143.33ms step:124/1480 train_time:16342ms step_avg:143.35ms step:125/1480 train_time:16491ms step_avg:143.40ms step:125/1480 val_loss:4.4056 train_time:16547ms step_avg:143.89ms step:126/1480 train_time:16642ms step_avg:143.46ms step:127/1480 train_time:16790ms step_avg:143.51ms step:128/1480 train_time:16937ms step_avg:143.54ms step:129/1480 train_time:17083ms step_avg:143.55ms step:130/1480 train_time:17228ms step_avg:143.57ms step:131/1480 train_time:17375ms step_avg:143.60ms step:132/1480 train_time:17522ms step_avg:143.62ms step:133/1480 train_time:17671ms step_avg:143.67ms step:134/1480 train_time:17819ms step_avg:143.70ms step:135/1480 train_time:17965ms step_avg:143.72ms step:136/1480 train_time:18113ms step_avg:143.76ms step:137/1480 train_time:18260ms step_avg:143.78ms step:138/1480 train_time:18405ms step_avg:143.79ms step:139/1480 train_time:18551ms step_avg:143.81ms step:140/1480 train_time:18700ms step_avg:143.84ms step:141/1480 train_time:18846ms step_avg:143.86ms step:142/1480 train_time:18994ms step_avg:143.89ms step:143/1480 train_time:19141ms step_avg:143.92ms step:144/1480 train_time:19288ms step_avg:143.94ms step:145/1480 train_time:19435ms step_avg:143.97ms step:146/1480 train_time:19581ms step_avg:143.98ms step:147/1480 train_time:19728ms 
step_avg:144.00ms step:148/1480 train_time:19876ms step_avg:144.03ms step:149/1480 train_time:20023ms step_avg:144.05ms step:150/1480 train_time:20170ms step_avg:144.07ms step:151/1480 train_time:20319ms step_avg:144.10ms step:152/1480 train_time:20464ms step_avg:144.11ms step:153/1480 train_time:20611ms step_avg:144.13ms step:154/1480 train_time:20758ms step_avg:144.16ms step:155/1480 train_time:20905ms step_avg:144.17ms step:156/1480 train_time:21051ms step_avg:144.19ms step:157/1480 train_time:21199ms step_avg:144.21ms step:158/1480 train_time:21345ms step_avg:144.22ms step:159/1480 train_time:21494ms step_avg:144.26ms step:160/1480 train_time:21641ms step_avg:144.28ms step:161/1480 train_time:21789ms step_avg:144.30ms step:162/1480 train_time:21936ms step_avg:144.32ms step:163/1480 train_time:22082ms step_avg:144.33ms step:164/1480 train_time:22230ms step_avg:144.35ms step:165/1480 train_time:22377ms step_avg:144.37ms step:166/1480 train_time:22522ms step_avg:144.37ms step:167/1480 train_time:22667ms step_avg:144.38ms step:168/1480 train_time:22815ms step_avg:144.40ms step:169/1480 train_time:22961ms step_avg:144.41ms step:170/1480 train_time:23107ms step_avg:144.42ms step:171/1480 train_time:23255ms step_avg:144.44ms step:172/1480 train_time:23402ms step_avg:144.46ms step:173/1480 train_time:23548ms step_avg:144.47ms step:174/1480 train_time:23693ms step_avg:144.47ms step:175/1480 train_time:23841ms step_avg:144.49ms step:176/1480 train_time:23987ms step_avg:144.50ms step:177/1480 train_time:24134ms step_avg:144.52ms step:178/1480 train_time:24280ms step_avg:144.53ms step:179/1480 train_time:24427ms step_avg:144.54ms step:180/1480 train_time:24574ms step_avg:144.55ms step:181/1480 train_time:24722ms step_avg:144.57ms step:182/1480 train_time:24868ms step_avg:144.58ms step:183/1480 train_time:25015ms step_avg:144.60ms step:184/1480 train_time:25162ms step_avg:144.61ms step:185/1480 train_time:25309ms step_avg:144.62ms step:186/1480 train_time:25456ms 
step_avg:144.64ms step:187/1480 train_time:25603ms step_avg:144.65ms step:188/1480 train_time:25751ms step_avg:144.67ms step:189/1480 train_time:25899ms step_avg:144.69ms step:190/1480 train_time:26044ms step_avg:144.69ms step:191/1480 train_time:26192ms step_avg:144.70ms step:192/1480 train_time:26339ms step_avg:144.72ms step:193/1480 train_time:26485ms step_avg:144.73ms step:194/1480 train_time:26633ms step_avg:144.74ms step:195/1480 train_time:26780ms step_avg:144.76ms step:196/1480 train_time:26928ms step_avg:144.77ms step:197/1480 train_time:27075ms step_avg:144.78ms step:198/1480 train_time:27222ms step_avg:144.80ms step:199/1480 train_time:27370ms step_avg:144.81ms step:200/1480 train_time:27518ms step_avg:144.83ms step:201/1480 train_time:27664ms step_avg:144.84ms step:202/1480 train_time:27811ms step_avg:144.85ms step:203/1480 train_time:27959ms step_avg:144.86ms step:204/1480 train_time:28106ms step_avg:144.88ms step:205/1480 train_time:28254ms step_avg:144.89ms step:206/1480 train_time:28402ms step_avg:144.91ms step:207/1480 train_time:28547ms step_avg:144.91ms step:208/1480 train_time:28696ms step_avg:144.93ms step:209/1480 train_time:28843ms step_avg:144.94ms step:210/1480 train_time:28990ms step_avg:144.95ms step:211/1480 train_time:29137ms step_avg:144.96ms step:212/1480 train_time:29283ms step_avg:144.96ms step:213/1480 train_time:29431ms step_avg:144.98ms step:214/1480 train_time:29578ms step_avg:144.99ms step:215/1480 train_time:29725ms step_avg:145.00ms step:216/1480 train_time:29873ms step_avg:145.01ms step:217/1480 train_time:30020ms step_avg:145.02ms step:218/1480 train_time:30166ms step_avg:145.03ms step:219/1480 train_time:30314ms step_avg:145.04ms step:220/1480 train_time:30461ms step_avg:145.05ms step:221/1480 train_time:30610ms step_avg:145.07ms step:222/1480 train_time:30760ms step_avg:145.09ms step:223/1480 train_time:30910ms step_avg:145.12ms step:224/1480 train_time:31061ms step_avg:145.14ms step:225/1480 train_time:31213ms 
step_avg:145.18ms step:226/1480 train_time:31364ms step_avg:145.20ms step:227/1480 train_time:31516ms step_avg:145.23ms step:228/1480 train_time:31666ms step_avg:145.26ms step:229/1480 train_time:31818ms step_avg:145.29ms step:230/1480 train_time:31967ms step_avg:145.30ms step:231/1480 train_time:32118ms step_avg:145.33ms step:232/1480 train_time:32267ms step_avg:145.35ms step:233/1480 train_time:32418ms step_avg:145.37ms step:234/1480 train_time:32568ms step_avg:145.39ms step:235/1480 train_time:32720ms step_avg:145.42ms step:236/1480 train_time:32870ms step_avg:145.44ms step:237/1480 train_time:33021ms step_avg:145.47ms step:238/1480 train_time:33170ms step_avg:145.48ms step:239/1480 train_time:33320ms step_avg:145.50ms step:240/1480 train_time:33470ms step_avg:145.52ms step:241/1480 train_time:33619ms step_avg:145.54ms step:242/1480 train_time:33770ms step_avg:145.56ms step:243/1480 train_time:33921ms step_avg:145.58ms step:244/1480 train_time:34072ms step_avg:145.61ms step:245/1480 train_time:34222ms step_avg:145.63ms step:246/1480 train_time:34373ms step_avg:145.65ms step:247/1480 train_time:34523ms step_avg:145.67ms step:248/1480 train_time:34675ms step_avg:145.69ms step:249/1480 train_time:34825ms step_avg:145.71ms step:250/1480 train_time:34977ms step_avg:145.74ms step:250/1480 val_loss:3.9909 train_time:35035ms step_avg:145.98ms step:251/1480 train_time:35131ms step_avg:145.77ms step:252/1480 train_time:35283ms step_avg:145.80ms step:253/1480 train_time:35434ms step_avg:145.82ms step:254/1480 train_time:35583ms step_avg:145.83ms step:255/1480 train_time:35733ms step_avg:145.85ms step:256/1480 train_time:35882ms step_avg:145.86ms step:257/1480 train_time:36034ms step_avg:145.88ms step:258/1480 train_time:36186ms step_avg:145.91ms step:259/1480 train_time:36338ms step_avg:145.94ms step:260/1480 train_time:36488ms step_avg:145.95ms step:261/1480 train_time:36638ms step_avg:145.97ms step:262/1480 train_time:36787ms step_avg:145.98ms step:263/1480 
train_time:36938ms step_avg:146.00ms step:264/1480 train_time:37089ms step_avg:146.02ms step:265/1480 train_time:37241ms step_avg:146.04ms step:266/1480 train_time:37392ms step_avg:146.06ms step:267/1480 train_time:37543ms step_avg:146.08ms step:268/1480 train_time:37695ms step_avg:146.10ms step:269/1480 train_time:37843ms step_avg:146.11ms step:270/1480 train_time:37992ms step_avg:146.12ms step:271/1480 train_time:38143ms step_avg:146.14ms step:272/1480 train_time:38293ms step_avg:146.16ms step:273/1480 train_time:38444ms step_avg:146.18ms step:274/1480 train_time:38593ms step_avg:146.19ms step:275/1480 train_time:38745ms step_avg:146.21ms step:276/1480 train_time:38895ms step_avg:146.22ms step:277/1480 train_time:39046ms step_avg:146.24ms step:278/1480 train_time:39194ms step_avg:146.25ms step:279/1480 train_time:39345ms step_avg:146.27ms step:280/1480 train_time:39496ms step_avg:146.28ms step:281/1480 train_time:39646ms step_avg:146.30ms step:282/1480 train_time:39797ms step_avg:146.31ms step:283/1480 train_time:39947ms step_avg:146.33ms step:284/1480 train_time:40098ms step_avg:146.34ms step:285/1480 train_time:40248ms step_avg:146.36ms step:286/1480 train_time:40398ms step_avg:146.37ms step:287/1480 train_time:40549ms step_avg:146.39ms step:288/1480 train_time:40700ms step_avg:146.40ms step:289/1480 train_time:40850ms step_avg:146.41ms step:290/1480 train_time:41000ms step_avg:146.43ms step:291/1480 train_time:41150ms step_avg:146.44ms step:292/1480 train_time:41300ms step_avg:146.45ms step:293/1480 train_time:41450ms step_avg:146.47ms step:294/1480 train_time:41601ms step_avg:146.48ms step:295/1480 train_time:41752ms step_avg:146.50ms step:296/1480 train_time:41903ms step_avg:146.51ms step:297/1480 train_time:42052ms step_avg:146.52ms step:298/1480 train_time:42203ms step_avg:146.54ms step:299/1480 train_time:42352ms step_avg:146.55ms step:300/1480 train_time:42504ms step_avg:146.57ms step:301/1480 train_time:42653ms step_avg:146.57ms step:302/1480 
train_time:42805ms step_avg:146.59ms step:303/1480 train_time:42955ms step_avg:146.60ms step:304/1480 train_time:43104ms step_avg:146.61ms step:305/1480 train_time:43256ms step_avg:146.63ms step:306/1480 train_time:43406ms step_avg:146.64ms step:307/1480 train_time:43557ms step_avg:146.66ms step:308/1480 train_time:43707ms step_avg:146.67ms step:309/1480 train_time:43856ms step_avg:146.68ms step:310/1480 train_time:44007ms step_avg:146.69ms step:311/1480 train_time:44158ms step_avg:146.70ms step:312/1480 train_time:44308ms step_avg:146.72ms step:313/1480 train_time:44458ms step_avg:146.73ms step:314/1480 train_time:44608ms step_avg:146.74ms step:315/1480 train_time:44758ms step_avg:146.75ms step:316/1480 train_time:44909ms step_avg:146.76ms step:317/1480 train_time:45059ms step_avg:146.77ms step:318/1480 train_time:45209ms step_avg:146.78ms step:319/1480 train_time:45359ms step_avg:146.79ms step:320/1480 train_time:45510ms step_avg:146.81ms step:321/1480 train_time:45660ms step_avg:146.82ms step:322/1480 train_time:45810ms step_avg:146.83ms step:323/1480 train_time:45960ms step_avg:146.84ms step:324/1480 train_time:46110ms step_avg:146.85ms step:325/1480 train_time:46261ms step_avg:146.86ms step:326/1480 train_time:46410ms step_avg:146.87ms step:327/1480 train_time:46562ms step_avg:146.88ms step:328/1480 train_time:46712ms step_avg:146.89ms step:329/1480 train_time:46862ms step_avg:146.90ms step:330/1480 train_time:47015ms step_avg:146.92ms step:331/1480 train_time:47169ms step_avg:146.94ms step:332/1480 train_time:47323ms step_avg:146.97ms step:333/1480 train_time:47478ms step_avg:146.99ms step:334/1480 train_time:47631ms step_avg:147.01ms step:335/1480 train_time:47785ms step_avg:147.03ms step:336/1480 train_time:47941ms step_avg:147.06ms step:337/1480 train_time:48096ms step_avg:147.08ms step:338/1480 train_time:48249ms step_avg:147.10ms step:339/1480 train_time:48403ms step_avg:147.12ms step:340/1480 train_time:48556ms step_avg:147.14ms step:341/1480 
train_time:48709ms step_avg:147.16ms step:342/1480 train_time:48862ms step_avg:147.17ms step:343/1480 train_time:49017ms step_avg:147.20ms step:344/1480 train_time:49170ms step_avg:147.22ms step:345/1480 train_time:49326ms step_avg:147.24ms step:346/1480 train_time:49480ms step_avg:147.26ms step:347/1480 train_time:49635ms step_avg:147.29ms step:348/1480 train_time:49788ms step_avg:147.30ms step:349/1480 train_time:49943ms step_avg:147.32ms step:350/1480 train_time:50098ms step_avg:147.35ms step:351/1480 train_time:50252ms step_avg:147.37ms step:352/1480 train_time:50405ms step_avg:147.38ms step:353/1480 train_time:50559ms step_avg:147.40ms step:354/1480 train_time:50711ms step_avg:147.42ms step:355/1480 train_time:50864ms step_avg:147.43ms step:356/1480 train_time:51019ms step_avg:147.45ms step:357/1480 train_time:51173ms step_avg:147.47ms step:358/1480 train_time:51327ms step_avg:147.49ms step:359/1480 train_time:51481ms step_avg:147.51ms step:360/1480 train_time:51637ms step_avg:147.53ms step:361/1480 train_time:51791ms step_avg:147.55ms step:362/1480 train_time:51944ms step_avg:147.57ms step:363/1480 train_time:52097ms step_avg:147.58ms step:364/1480 train_time:52250ms step_avg:147.60ms step:365/1480 train_time:52405ms step_avg:147.62ms step:366/1480 train_time:52559ms step_avg:147.64ms step:367/1480 train_time:52712ms step_avg:147.65ms step:368/1480 train_time:52865ms step_avg:147.67ms step:369/1480 train_time:53018ms step_avg:147.68ms step:370/1480 train_time:53171ms step_avg:147.70ms step:371/1480 train_time:53324ms step_avg:147.71ms step:372/1480 train_time:53479ms step_avg:147.73ms step:373/1480 train_time:53632ms step_avg:147.75ms step:374/1480 train_time:53785ms step_avg:147.76ms step:375/1480 train_time:53941ms step_avg:147.78ms step:375/1480 val_loss:3.8049 train_time:54001ms step_avg:147.95ms step:376/1480 train_time:54099ms step_avg:147.81ms step:377/1480 train_time:54253ms step_avg:147.83ms step:378/1480 train_time:54406ms step_avg:147.84ms 
step:379/1480 train_time:54559ms step_avg:147.86ms step:380/1480 train_time:54711ms step_avg:147.87ms step:381/1480 train_time:54864ms step_avg:147.88ms step:382/1480 train_time:55018ms step_avg:147.90ms step:383/1480 train_time:55172ms step_avg:147.92ms step:384/1480 train_time:55326ms step_avg:147.93ms step:385/1480 train_time:55478ms step_avg:147.94ms step:386/1480 train_time:55631ms step_avg:147.96ms step:387/1480 train_time:55784ms step_avg:147.97ms step:388/1480 train_time:55938ms step_avg:147.99ms step:389/1480 train_time:56092ms step_avg:148.00ms step:390/1480 train_time:56246ms step_avg:148.02ms step:391/1480 train_time:56399ms step_avg:148.03ms step:392/1480 train_time:56552ms step_avg:148.04ms step:393/1480 train_time:56706ms step_avg:148.06ms step:394/1480 train_time:56860ms step_avg:148.07ms step:395/1480 train_time:57012ms step_avg:148.08ms step:396/1480 train_time:57166ms step_avg:148.10ms step:397/1480 train_time:57320ms step_avg:148.11ms step:398/1480 train_time:57474ms step_avg:148.13ms step:399/1480 train_time:57627ms step_avg:148.14ms step:400/1480 train_time:57781ms step_avg:148.16ms step:401/1480 train_time:57935ms step_avg:148.17ms step:402/1480 train_time:58089ms step_avg:148.19ms step:403/1480 train_time:58245ms step_avg:148.20ms step:404/1480 train_time:58398ms step_avg:148.22ms step:405/1480 train_time:58552ms step_avg:148.23ms step:406/1480 train_time:58706ms step_avg:148.25ms step:407/1480 train_time:58858ms step_avg:148.26ms step:408/1480 train_time:59012ms step_avg:148.27ms step:409/1480 train_time:59166ms step_avg:148.28ms step:410/1480 train_time:59319ms step_avg:148.30ms step:411/1480 train_time:59473ms step_avg:148.31ms step:412/1480 train_time:59627ms step_avg:148.33ms step:413/1480 train_time:59780ms step_avg:148.34ms step:414/1480 train_time:59935ms step_avg:148.35ms step:415/1480 train_time:60089ms step_avg:148.37ms step:416/1480 train_time:60244ms step_avg:148.38ms step:417/1480 train_time:60398ms step_avg:148.40ms 
step:418/1480 train_time:60551ms step_avg:148.41ms step:419/1480 train_time:60705ms step_avg:148.42ms step:420/1480 train_time:60859ms step_avg:148.44ms step:421/1480 train_time:61012ms step_avg:148.45ms step:422/1480 train_time:61165ms step_avg:148.46ms step:423/1480 train_time:61319ms step_avg:148.47ms step:424/1480 train_time:61472ms step_avg:148.48ms step:425/1480 train_time:61626ms step_avg:148.50ms step:426/1480 train_time:61779ms step_avg:148.51ms step:427/1480 train_time:61933ms step_avg:148.52ms step:428/1480 train_time:62085ms step_avg:148.53ms step:429/1480 train_time:62239ms step_avg:148.54ms step:430/1480 train_time:62393ms step_avg:148.55ms step:431/1480 train_time:62546ms step_avg:148.57ms step:432/1480 train_time:62699ms step_avg:148.58ms step:433/1480 train_time:62852ms step_avg:148.59ms step:434/1480 train_time:63007ms step_avg:148.60ms step:435/1480 train_time:63160ms step_avg:148.61ms step:436/1480 train_time:63314ms step_avg:148.62ms step:437/1480 train_time:63468ms step_avg:148.64ms step:438/1480 train_time:63622ms step_avg:148.65ms step:439/1480 train_time:63775ms step_avg:148.66ms step:440/1480 train_time:63929ms step_avg:148.67ms step:441/1480 train_time:64085ms step_avg:148.69ms step:442/1480 train_time:64245ms step_avg:148.71ms step:443/1480 train_time:64401ms step_avg:148.73ms step:444/1480 train_time:64558ms step_avg:148.75ms step:445/1480 train_time:64714ms step_avg:148.77ms step:446/1480 train_time:64870ms step_avg:148.78ms step:447/1480 train_time:65027ms step_avg:148.80ms step:448/1480 train_time:65183ms step_avg:148.82ms step:449/1480 train_time:65343ms step_avg:148.84ms step:450/1480 train_time:65500ms step_avg:148.86ms step:451/1480 train_time:65658ms step_avg:148.88ms step:452/1480 train_time:65813ms step_avg:148.90ms step:453/1480 train_time:65968ms step_avg:148.91ms step:454/1480 train_time:66123ms step_avg:148.93ms step:455/1480 train_time:66281ms step_avg:148.95ms step:456/1480 train_time:66437ms step_avg:148.96ms 
step:457/1480 train_time:66593ms step_avg:148.98ms step:458/1480 train_time:66748ms step_avg:148.99ms step:459/1480 train_time:66905ms step_avg:149.01ms step:460/1480 train_time:67063ms step_avg:149.03ms step:461/1480 train_time:67223ms step_avg:149.05ms step:462/1480 train_time:67380ms step_avg:149.07ms step:463/1480 train_time:67539ms step_avg:149.09ms step:464/1480 train_time:67694ms step_avg:149.11ms step:465/1480 train_time:67850ms step_avg:149.12ms step:466/1480 train_time:68007ms step_avg:149.14ms step:467/1480 train_time:68165ms step_avg:149.16ms step:468/1480 train_time:68322ms step_avg:149.17ms step:469/1480 train_time:68480ms step_avg:149.19ms step:470/1480 train_time:68637ms step_avg:149.21ms step:471/1480 train_time:68794ms step_avg:149.23ms step:472/1480 train_time:68949ms step_avg:149.24ms step:473/1480 train_time:69105ms step_avg:149.26ms step:474/1480 train_time:69261ms step_avg:149.27ms step:475/1480 train_time:69418ms step_avg:149.29ms step:476/1480 train_time:69573ms step_avg:149.30ms step:477/1480 train_time:69731ms step_avg:149.32ms step:478/1480 train_time:69888ms step_avg:149.33ms step:479/1480 train_time:70046ms step_avg:149.35ms step:480/1480 train_time:70204ms step_avg:149.37ms step:481/1480 train_time:70362ms step_avg:149.39ms step:482/1480 train_time:70519ms step_avg:149.40ms step:483/1480 train_time:70675ms step_avg:149.42ms step:484/1480 train_time:70831ms step_avg:149.43ms step:485/1480 train_time:70987ms step_avg:149.45ms step:486/1480 train_time:71146ms step_avg:149.47ms step:487/1480 train_time:71302ms step_avg:149.48ms step:488/1480 train_time:71457ms step_avg:149.49ms step:489/1480 train_time:71612ms step_avg:149.50ms step:490/1480 train_time:71769ms step_avg:149.52ms step:491/1480 train_time:71926ms step_avg:149.53ms step:492/1480 train_time:72082ms step_avg:149.55ms step:493/1480 train_time:72240ms step_avg:149.57ms step:494/1480 train_time:72399ms step_avg:149.58ms step:495/1480 train_time:72555ms step_avg:149.60ms 
step:496/1480 train_time:72711ms step_avg:149.61ms step:497/1480 train_time:72867ms step_avg:149.63ms step:498/1480 train_time:73024ms step_avg:149.64ms step:499/1480 train_time:73181ms step_avg:149.65ms step:500/1480 train_time:73341ms step_avg:149.68ms step:500/1480 val_loss:3.6832 train_time:73403ms step_avg:149.80ms step:501/1480 train_time:73503ms step_avg:149.70ms step:502/1480 train_time:73659ms step_avg:149.71ms step:503/1480 train_time:73815ms step_avg:149.73ms step:504/1480 train_time:73971ms step_avg:149.74ms step:505/1480 train_time:74127ms step_avg:149.75ms step:506/1480 train_time:74286ms step_avg:149.77ms step:507/1480 train_time:74443ms step_avg:149.78ms step:508/1480 train_time:74602ms step_avg:149.80ms step:509/1480 train_time:74759ms step_avg:149.82ms step:510/1480 train_time:74915ms step_avg:149.83ms step:511/1480 train_time:75071ms step_avg:149.84ms step:512/1480 train_time:75231ms step_avg:149.86ms step:513/1480 train_time:75387ms step_avg:149.87ms step:514/1480 train_time:75544ms step_avg:149.89ms step:515/1480 train_time:75702ms step_avg:149.91ms step:516/1480 train_time:75862ms step_avg:149.92ms step:517/1480 train_time:76020ms step_avg:149.94ms step:518/1480 train_time:76175ms step_avg:149.95ms step:519/1480 train_time:76331ms step_avg:149.96ms step:520/1480 train_time:76488ms step_avg:149.98ms step:521/1480 train_time:76646ms step_avg:149.99ms step:522/1480 train_time:76803ms step_avg:150.01ms step:523/1480 train_time:76959ms step_avg:150.02ms step:524/1480 train_time:77115ms step_avg:150.03ms step:525/1480 train_time:77271ms step_avg:150.04ms step:526/1480 train_time:77429ms step_avg:150.06ms step:527/1480 train_time:77587ms step_avg:150.07ms step:528/1480 train_time:77744ms step_avg:150.08ms step:529/1480 train_time:77902ms step_avg:150.10ms step:530/1480 train_time:78060ms step_avg:150.12ms step:531/1480 train_time:78216ms step_avg:150.13ms step:532/1480 train_time:78371ms step_avg:150.14ms step:533/1480 train_time:78528ms 
step_avg:150.15ms step:534/1480 train_time:78684ms step_avg:150.16ms step:535/1480 train_time:78840ms step_avg:150.17ms step:536/1480 train_time:78997ms step_avg:150.18ms step:537/1480 train_time:79154ms step_avg:150.20ms step:538/1480 train_time:79311ms step_avg:150.21ms step:539/1480 train_time:79468ms step_avg:150.22ms step:540/1480 train_time:79626ms step_avg:150.24ms step:541/1480 train_time:79782ms step_avg:150.25ms step:542/1480 train_time:79938ms step_avg:150.26ms step:543/1480 train_time:80094ms step_avg:150.27ms step:544/1480 train_time:80251ms step_avg:150.28ms step:545/1480 train_time:80408ms step_avg:150.30ms step:546/1480 train_time:80566ms step_avg:150.31ms step:547/1480 train_time:80724ms step_avg:150.32ms step:548/1480 train_time:80881ms step_avg:150.34ms step:549/1480 train_time:81038ms step_avg:150.35ms step:550/1480 train_time:81196ms step_avg:150.36ms step:551/1480 train_time:81354ms step_avg:150.38ms step:552/1480 train_time:81513ms step_avg:150.39ms step:553/1480 train_time:81673ms step_avg:150.41ms step:554/1480 train_time:81832ms step_avg:150.43ms step:555/1480 train_time:81991ms step_avg:150.44ms step:556/1480 train_time:82149ms step_avg:150.46ms step:557/1480 train_time:82310ms step_avg:150.48ms step:558/1480 train_time:82470ms step_avg:150.49ms step:559/1480 train_time:82630ms step_avg:150.51ms step:560/1480 train_time:82790ms step_avg:150.53ms step:561/1480 train_time:82948ms step_avg:150.54ms step:562/1480 train_time:83109ms step_avg:150.56ms step:563/1480 train_time:83268ms step_avg:150.58ms step:564/1480 train_time:83429ms step_avg:150.59ms step:565/1480 train_time:83589ms step_avg:150.61ms step:566/1480 train_time:83749ms step_avg:150.63ms step:567/1480 train_time:83909ms step_avg:150.64ms step:568/1480 train_time:84068ms step_avg:150.66ms step:569/1480 train_time:84228ms step_avg:150.68ms step:570/1480 train_time:84388ms step_avg:150.69ms step:571/1480 train_time:84548ms step_avg:150.71ms step:572/1480 train_time:84709ms 
step_avg:150.73ms step:573/1480 train_time:84869ms step_avg:150.74ms step:574/1480 train_time:85030ms step_avg:150.76ms step:575/1480 train_time:85192ms step_avg:150.78ms step:576/1480 train_time:85351ms step_avg:150.80ms step:577/1480 train_time:85510ms step_avg:150.81ms step:578/1480 train_time:85669ms step_avg:150.83ms step:579/1480 train_time:85829ms step_avg:150.84ms step:580/1480 train_time:85989ms step_avg:150.86ms step:581/1480 train_time:86150ms step_avg:150.87ms step:582/1480 train_time:86310ms step_avg:150.89ms step:583/1480 train_time:86469ms step_avg:150.91ms step:584/1480 train_time:86630ms step_avg:150.92ms step:585/1480 train_time:86789ms step_avg:150.94ms step:586/1480 train_time:86950ms step_avg:150.95ms step:587/1480 train_time:87110ms step_avg:150.97ms step:588/1480 train_time:87268ms step_avg:150.98ms step:589/1480 train_time:87429ms step_avg:151.00ms step:590/1480 train_time:87589ms step_avg:151.02ms step:591/1480 train_time:87748ms step_avg:151.03ms step:592/1480 train_time:87908ms step_avg:151.04ms step:593/1480 train_time:88070ms step_avg:151.06ms step:594/1480 train_time:88232ms step_avg:151.08ms step:595/1480 train_time:88393ms step_avg:151.10ms step:596/1480 train_time:88553ms step_avg:151.11ms step:597/1480 train_time:88711ms step_avg:151.13ms step:598/1480 train_time:88869ms step_avg:151.14ms step:599/1480 train_time:89029ms step_avg:151.15ms step:600/1480 train_time:89189ms step_avg:151.17ms step:601/1480 train_time:89349ms step_avg:151.18ms step:602/1480 train_time:89509ms step_avg:151.20ms step:603/1480 train_time:89669ms step_avg:151.21ms step:604/1480 train_time:89829ms step_avg:151.23ms step:605/1480 train_time:89989ms step_avg:151.24ms step:606/1480 train_time:90151ms step_avg:151.26ms step:607/1480 train_time:90313ms step_avg:151.28ms step:608/1480 train_time:90470ms step_avg:151.29ms step:609/1480 train_time:90630ms step_avg:151.30ms step:610/1480 train_time:90789ms step_avg:151.32ms step:611/1480 train_time:90950ms 
step_avg:151.33ms step:612/1480 train_time:91110ms step_avg:151.34ms step:613/1480 train_time:91270ms step_avg:151.36ms step:614/1480 train_time:91430ms step_avg:151.37ms step:615/1480 train_time:91589ms step_avg:151.39ms step:616/1480 train_time:91749ms step_avg:151.40ms step:617/1480 train_time:91910ms step_avg:151.42ms step:618/1480 train_time:92068ms step_avg:151.43ms step:619/1480 train_time:92229ms step_avg:151.44ms step:620/1480 train_time:92388ms step_avg:151.46ms step:621/1480 train_time:92549ms step_avg:151.47ms step:622/1480 train_time:92709ms step_avg:151.48ms step:623/1480 train_time:92869ms step_avg:151.50ms step:624/1480 train_time:93030ms step_avg:151.51ms step:625/1480 train_time:93189ms step_avg:151.53ms step:625/1480 val_loss:3.6053 train_time:93253ms step_avg:151.63ms step:626/1480 train_time:93353ms step_avg:151.55ms step:627/1480 train_time:93515ms step_avg:151.56ms step:628/1480 train_time:93674ms step_avg:151.58ms step:629/1480 train_time:93832ms step_avg:151.59ms step:630/1480 train_time:93991ms step_avg:151.60ms step:631/1480 train_time:94148ms step_avg:151.61ms step:632/1480 train_time:94307ms step_avg:151.62ms step:633/1480 train_time:94465ms step_avg:151.63ms step:634/1480 train_time:94623ms step_avg:151.64ms step:635/1480 train_time:94782ms step_avg:151.65ms step:636/1480 train_time:94941ms step_avg:151.66ms step:637/1480 train_time:95100ms step_avg:151.67ms step:638/1480 train_time:95257ms step_avg:151.68ms step:639/1480 train_time:95416ms step_avg:151.69ms step:640/1480 train_time:95576ms step_avg:151.71ms step:641/1480 train_time:95737ms step_avg:151.72ms step:642/1480 train_time:95896ms step_avg:151.73ms step:643/1480 train_time:96057ms step_avg:151.75ms step:644/1480 train_time:96215ms step_avg:151.76ms step:645/1480 train_time:96375ms step_avg:151.77ms step:646/1480 train_time:96534ms step_avg:151.78ms step:647/1480 train_time:96694ms step_avg:151.80ms step:648/1480 train_time:96856ms step_avg:151.81ms step:649/1480 
train_time:97015ms step_avg:151.82ms step:650/1480 train_time:97176ms step_avg:151.84ms step:651/1480 train_time:97336ms step_avg:151.85ms step:652/1480 train_time:97497ms step_avg:151.86ms step:653/1480 train_time:97655ms step_avg:151.87ms step:654/1480 train_time:97815ms step_avg:151.89ms step:655/1480 train_time:97975ms step_avg:151.90ms step:656/1480 train_time:98135ms step_avg:151.91ms step:657/1480 train_time:98296ms step_avg:151.93ms step:658/1480 train_time:98456ms step_avg:151.94ms step:659/1480 train_time:98618ms step_avg:151.95ms step:660/1480 train_time:98781ms step_avg:151.97ms step:661/1480 train_time:98942ms step_avg:151.99ms step:662/1480 train_time:99102ms step_avg:152.00ms step:663/1480 train_time:99261ms step_avg:152.01ms step:664/1480 train_time:99423ms step_avg:152.02ms step:665/1480 train_time:99584ms step_avg:152.04ms step:666/1480 train_time:99744ms step_avg:152.05ms step:667/1480 train_time:99905ms step_avg:152.06ms step:668/1480 train_time:100068ms step_avg:152.08ms step:669/1480 train_time:100232ms step_avg:152.10ms step:670/1480 train_time:100394ms step_avg:152.11ms step:671/1480 train_time:100556ms step_avg:152.13ms step:672/1480 train_time:100719ms step_avg:152.14ms step:673/1480 train_time:100880ms step_avg:152.16ms step:674/1480 train_time:101041ms step_avg:152.17ms step:675/1480 train_time:101203ms step_avg:152.18ms step:676/1480 train_time:101364ms step_avg:152.20ms step:677/1480 train_time:101525ms step_avg:152.21ms step:678/1480 train_time:101685ms step_avg:152.22ms step:679/1480 train_time:101847ms step_avg:152.24ms step:680/1480 train_time:102007ms step_avg:152.25ms step:681/1480 train_time:102167ms step_avg:152.26ms step:682/1480 train_time:102332ms step_avg:152.28ms step:683/1480 train_time:102495ms step_avg:152.30ms step:684/1480 train_time:102657ms step_avg:152.31ms step:685/1480 train_time:102821ms step_avg:152.33ms step:686/1480 train_time:102983ms step_avg:152.34ms step:687/1480 train_time:103143ms step_avg:152.35ms 
step:688/1480 train_time:103305ms step_avg:152.37ms step:689/1480 train_time:103467ms step_avg:152.38ms step:690/1480 train_time:103633ms step_avg:152.40ms step:691/1480 train_time:103795ms step_avg:152.42ms step:692/1480 train_time:103958ms step_avg:152.43ms step:693/1480 train_time:104119ms step_avg:152.44ms step:694/1480 train_time:104280ms step_avg:152.46ms step:695/1480 train_time:104439ms step_avg:152.47ms step:696/1480 train_time:104600ms step_avg:152.48ms step:697/1480 train_time:104762ms step_avg:152.49ms step:698/1480 train_time:104922ms step_avg:152.50ms step:699/1480 train_time:105085ms step_avg:152.52ms step:700/1480 train_time:105246ms step_avg:152.53ms step:701/1480 train_time:105404ms step_avg:152.54ms step:702/1480 train_time:105565ms step_avg:152.55ms step:703/1480 train_time:105724ms step_avg:152.56ms step:704/1480 train_time:105883ms step_avg:152.57ms step:705/1480 train_time:106046ms step_avg:152.58ms step:706/1480 train_time:106212ms step_avg:152.60ms step:707/1480 train_time:106374ms step_avg:152.62ms step:708/1480 train_time:106534ms step_avg:152.63ms step:709/1480 train_time:106696ms step_avg:152.64ms step:710/1480 train_time:106858ms step_avg:152.65ms step:711/1480 train_time:107020ms step_avg:152.67ms step:712/1480 train_time:107184ms step_avg:152.68ms step:713/1480 train_time:107345ms step_avg:152.70ms step:714/1480 train_time:107504ms step_avg:152.70ms step:715/1480 train_time:107664ms step_avg:152.71ms step:716/1480 train_time:107823ms step_avg:152.72ms step:717/1480 train_time:107985ms step_avg:152.74ms step:718/1480 train_time:108143ms step_avg:152.74ms step:719/1480 train_time:108302ms step_avg:152.75ms step:720/1480 train_time:108465ms step_avg:152.77ms step:721/1480 train_time:108625ms step_avg:152.78ms step:722/1480 train_time:108787ms step_avg:152.79ms step:723/1480 train_time:108948ms step_avg:152.80ms step:724/1480 train_time:109108ms step_avg:152.81ms step:725/1480 train_time:109273ms step_avg:152.83ms step:726/1480 
train_time:109436ms step_avg:152.84ms step:727/1480 train_time:109599ms step_avg:152.86ms step:728/1480 train_time:109759ms step_avg:152.87ms step:729/1480 train_time:109919ms step_avg:152.88ms step:730/1480 train_time:110082ms step_avg:152.89ms step:731/1480 train_time:110243ms step_avg:152.90ms step:732/1480 train_time:110402ms step_avg:152.91ms step:733/1480 train_time:110564ms step_avg:152.92ms step:734/1480 train_time:110724ms step_avg:152.93ms step:735/1480 train_time:110885ms step_avg:152.94ms step:736/1480 train_time:111047ms step_avg:152.96ms step:737/1480 train_time:111207ms step_avg:152.97ms step:738/1480 train_time:111369ms step_avg:152.98ms step:739/1480 train_time:111530ms step_avg:152.99ms step:740/1480 train_time:111697ms step_avg:153.01ms step:741/1480 train_time:111861ms step_avg:153.02ms step:742/1480 train_time:112023ms step_avg:153.04ms step:743/1480 train_time:112184ms step_avg:153.05ms step:744/1480 train_time:112345ms step_avg:153.06ms step:745/1480 train_time:112508ms step_avg:153.07ms step:746/1480 train_time:112668ms step_avg:153.08ms step:747/1480 train_time:112831ms step_avg:153.09ms step:748/1480 train_time:112998ms step_avg:153.11ms step:749/1480 train_time:113160ms step_avg:153.13ms step:750/1480 train_time:113320ms step_avg:153.13ms step:750/1480 val_loss:3.5500 train_time:113385ms step_avg:153.22ms step:751/1480 train_time:113485ms step_avg:153.15ms step:752/1480 train_time:113646ms step_avg:153.16ms step:753/1480 train_time:113807ms step_avg:153.17ms step:754/1480 train_time:113968ms step_avg:153.18ms step:755/1480 train_time:114128ms step_avg:153.19ms step:756/1480 train_time:114289ms step_avg:153.20ms step:757/1480 train_time:114455ms step_avg:153.22ms step:758/1480 train_time:114619ms step_avg:153.23ms step:759/1480 train_time:114782ms step_avg:153.25ms step:760/1480 train_time:114943ms step_avg:153.26ms step:761/1480 train_time:115104ms step_avg:153.27ms step:762/1480 train_time:115265ms step_avg:153.28ms step:763/1480 
train_time:115426ms step_avg:153.29ms step:764/1480 train_time:115587ms step_avg:153.30ms step:765/1480 train_time:115749ms step_avg:153.31ms step:766/1480 train_time:115911ms step_avg:153.32ms step:767/1480 train_time:116072ms step_avg:153.33ms step:768/1480 train_time:116234ms step_avg:153.34ms step:769/1480 train_time:116398ms step_avg:153.36ms step:770/1480 train_time:116562ms step_avg:153.37ms step:771/1480 train_time:116725ms step_avg:153.38ms step:772/1480 train_time:116887ms step_avg:153.40ms step:773/1480 train_time:117049ms step_avg:153.41ms step:774/1480 train_time:117211ms step_avg:153.42ms step:775/1480 train_time:117373ms step_avg:153.43ms step:776/1480 train_time:117539ms step_avg:153.44ms step:777/1480 train_time:117705ms step_avg:153.46ms step:778/1480 train_time:117867ms step_avg:153.47ms step:779/1480 train_time:118029ms step_avg:153.48ms step:780/1480 train_time:118192ms step_avg:153.50ms step:781/1480 train_time:118357ms step_avg:153.51ms step:782/1480 train_time:118522ms step_avg:153.53ms step:783/1480 train_time:118683ms step_avg:153.54ms step:784/1480 train_time:118845ms step_avg:153.55ms step:785/1480 train_time:119007ms step_avg:153.56ms step:786/1480 train_time:119171ms step_avg:153.57ms step:787/1480 train_time:119335ms step_avg:153.58ms step:788/1480 train_time:119500ms step_avg:153.60ms step:789/1480 train_time:119663ms step_avg:153.61ms step:790/1480 train_time:119828ms step_avg:153.63ms step:791/1480 train_time:119995ms step_avg:153.64ms step:792/1480 train_time:120160ms step_avg:153.66ms step:793/1480 train_time:120322ms step_avg:153.67ms step:794/1480 train_time:120487ms step_avg:153.68ms step:795/1480 train_time:120653ms step_avg:153.70ms step:796/1480 train_time:120820ms step_avg:153.72ms step:797/1480 train_time:120985ms step_avg:153.73ms step:798/1480 train_time:121148ms step_avg:153.74ms step:799/1480 train_time:121314ms step_avg:153.76ms step:800/1480 train_time:121479ms step_avg:153.77ms step:801/1480 train_time:121643ms 
step_avg:153.78ms step:802/1480 train_time:121809ms step_avg:153.80ms step:803/1480 train_time:121971ms step_avg:153.81ms step:804/1480 train_time:122133ms step_avg:153.82ms step:805/1480 train_time:122299ms step_avg:153.83ms step:806/1480 train_time:122461ms step_avg:153.85ms step:807/1480 train_time:122623ms step_avg:153.86ms step:808/1480 train_time:122786ms step_avg:153.87ms step:809/1480 train_time:122948ms step_avg:153.88ms step:810/1480 train_time:123109ms step_avg:153.89ms step:811/1480 train_time:123270ms step_avg:153.90ms step:812/1480 train_time:123434ms step_avg:153.91ms step:813/1480 train_time:123596ms step_avg:153.92ms step:814/1480 train_time:123761ms step_avg:153.93ms step:815/1480 train_time:123924ms step_avg:153.94ms step:816/1480 train_time:124088ms step_avg:153.95ms step:817/1480 train_time:124249ms step_avg:153.96ms step:818/1480 train_time:124409ms step_avg:153.97ms step:819/1480 train_time:124573ms step_avg:153.98ms step:820/1480 train_time:124737ms step_avg:154.00ms step:821/1480 train_time:124900ms step_avg:154.01ms step:822/1480 train_time:125064ms step_avg:154.02ms step:823/1480 train_time:125226ms step_avg:154.03ms step:824/1480 train_time:125387ms step_avg:154.04ms step:825/1480 train_time:125551ms step_avg:154.05ms step:826/1480 train_time:125718ms step_avg:154.07ms step:827/1480 train_time:125883ms step_avg:154.08ms step:828/1480 train_time:126046ms step_avg:154.09ms step:829/1480 train_time:126209ms step_avg:154.10ms step:830/1480 train_time:126372ms step_avg:154.11ms step:831/1480 train_time:126537ms step_avg:154.13ms step:832/1480 train_time:126700ms step_avg:154.14ms step:833/1480 train_time:126865ms step_avg:154.15ms step:834/1480 train_time:127029ms step_avg:154.16ms step:835/1480 train_time:127190ms step_avg:154.17ms step:836/1480 train_time:127357ms step_avg:154.19ms step:837/1480 train_time:127521ms step_avg:154.20ms step:838/1480 train_time:127686ms step_avg:154.21ms step:839/1480 train_time:127848ms step_avg:154.22ms 
step:840/1480 train_time:128008ms step_avg:154.23ms step:841/1480 train_time:128167ms step_avg:154.23ms step:842/1480 train_time:128330ms step_avg:154.24ms step:843/1480 train_time:128491ms step_avg:154.25ms step:844/1480 train_time:128655ms step_avg:154.26ms step:845/1480 train_time:128819ms step_avg:154.27ms step:846/1480 train_time:128983ms step_avg:154.29ms step:847/1480 train_time:129146ms step_avg:154.30ms step:848/1480 train_time:129307ms step_avg:154.30ms step:849/1480 train_time:129470ms step_avg:154.31ms step:850/1480 train_time:129633ms step_avg:154.33ms step:851/1480 train_time:129799ms step_avg:154.34ms step:852/1480 train_time:129962ms step_avg:154.35ms step:853/1480 train_time:130124ms step_avg:154.36ms step:854/1480 train_time:130288ms step_avg:154.37ms step:855/1480 train_time:130451ms step_avg:154.38ms step:856/1480 train_time:130612ms step_avg:154.39ms step:857/1480 train_time:130778ms step_avg:154.40ms step:858/1480 train_time:130945ms step_avg:154.42ms step:859/1480 train_time:131108ms step_avg:154.43ms step:860/1480 train_time:131268ms step_avg:154.43ms step:861/1480 train_time:131435ms step_avg:154.45ms step:862/1480 train_time:131605ms step_avg:154.47ms step:863/1480 train_time:131772ms step_avg:154.48ms step:864/1480 train_time:131937ms step_avg:154.49ms step:865/1480 train_time:132099ms step_avg:154.50ms step:866/1480 train_time:132267ms step_avg:154.52ms step:867/1480 train_time:132430ms step_avg:154.53ms step:868/1480 train_time:132589ms step_avg:154.53ms step:869/1480 train_time:132751ms step_avg:154.54ms step:870/1480 train_time:132917ms step_avg:154.55ms step:871/1480 train_time:133079ms step_avg:154.56ms step:872/1480 train_time:133244ms step_avg:154.58ms step:873/1480 train_time:133406ms step_avg:154.58ms step:874/1480 train_time:133573ms step_avg:154.60ms step:875/1480 train_time:133740ms step_avg:154.61ms step:875/1480 val_loss:3.5018 train_time:133805ms step_avg:154.69ms step:876/1480 train_time:133907ms step_avg:154.63ms 
step:877/1480 train_time:134073ms step_avg:154.64ms step:878/1480 train_time:134236ms step_avg:154.65ms step:879/1480 train_time:134399ms step_avg:154.66ms step:880/1480 train_time:134561ms step_avg:154.67ms step:881/1480 train_time:134722ms step_avg:154.67ms step:882/1480 train_time:134887ms step_avg:154.69ms step:883/1480 train_time:135053ms step_avg:154.70ms step:884/1480 train_time:135220ms step_avg:154.71ms step:885/1480 train_time:135385ms step_avg:154.73ms step:886/1480 train_time:135553ms step_avg:154.74ms step:887/1480 train_time:135720ms step_avg:154.75ms step:888/1480 train_time:135893ms step_avg:154.78ms step:889/1480 train_time:136061ms step_avg:154.79ms step:890/1480 train_time:136223ms step_avg:154.80ms step:891/1480 train_time:136389ms step_avg:154.81ms step:892/1480 train_time:136555ms step_avg:154.82ms step:893/1480 train_time:136718ms step_avg:154.83ms step:894/1480 train_time:136883ms step_avg:154.85ms step:895/1480 train_time:137050ms step_avg:154.86ms step:896/1480 train_time:137215ms step_avg:154.87ms step:897/1480 train_time:137382ms step_avg:154.88ms step:898/1480 train_time:137549ms step_avg:154.90ms step:899/1480 train_time:137714ms step_avg:154.91ms step:900/1480 train_time:137877ms step_avg:154.92ms step:901/1480 train_time:138040ms step_avg:154.93ms step:902/1480 train_time:138203ms step_avg:154.94ms step:903/1480 train_time:138375ms step_avg:154.96ms step:904/1480 train_time:138540ms step_avg:154.97ms step:905/1480 train_time:138701ms step_avg:154.97ms step:906/1480 train_time:138867ms step_avg:154.99ms step:907/1480 train_time:139034ms step_avg:155.00ms step:908/1480 train_time:139197ms step_avg:155.01ms step:909/1480 train_time:139362ms step_avg:155.02ms step:910/1480 train_time:139533ms step_avg:155.04ms step:911/1480 train_time:139697ms step_avg:155.05ms step:912/1480 train_time:139864ms step_avg:155.06ms step:913/1480 train_time:140031ms step_avg:155.07ms step:914/1480 train_time:140198ms step_avg:155.09ms step:915/1480 
train_time:140368ms step_avg:155.10ms step:916/1480 train_time:140534ms step_avg:155.11ms step:917/1480 train_time:140698ms step_avg:155.12ms step:918/1480 train_time:140867ms step_avg:155.14ms step:919/1480 train_time:141037ms step_avg:155.16ms step:920/1480 train_time:141202ms step_avg:155.17ms step:921/1480 train_time:141367ms step_avg:155.18ms step:922/1480 train_time:141534ms step_avg:155.19ms step:923/1480 train_time:141696ms step_avg:155.20ms step:924/1480 train_time:141860ms step_avg:155.21ms step:925/1480 train_time:142026ms step_avg:155.22ms step:926/1480 train_time:142190ms step_avg:155.23ms step:927/1480 train_time:142353ms step_avg:155.24ms step:928/1480 train_time:142518ms step_avg:155.25ms step:929/1480 train_time:142682ms step_avg:155.26ms step:930/1480 train_time:142849ms step_avg:155.27ms step:931/1480 train_time:143013ms step_avg:155.28ms step:932/1480 train_time:143178ms step_avg:155.29ms step:933/1480 train_time:143345ms step_avg:155.30ms step:934/1480 train_time:143514ms step_avg:155.32ms step:935/1480 train_time:143682ms step_avg:155.33ms step:936/1480 train_time:143849ms step_avg:155.34ms step:937/1480 train_time:144018ms step_avg:155.36ms step:938/1480 train_time:144181ms step_avg:155.37ms step:939/1480 train_time:144350ms step_avg:155.38ms step:940/1480 train_time:144517ms step_avg:155.39ms step:941/1480 train_time:144680ms step_avg:155.40ms step:942/1480 train_time:144845ms step_avg:155.41ms step:943/1480 train_time:145016ms step_avg:155.43ms step:944/1480 train_time:145190ms step_avg:155.45ms step:945/1480 train_time:145354ms step_avg:155.46ms step:946/1480 train_time:145522ms step_avg:155.47ms step:947/1480 train_time:145691ms step_avg:155.49ms step:948/1480 train_time:145856ms step_avg:155.50ms step:949/1480 train_time:146020ms step_avg:155.51ms step:950/1480 train_time:146183ms step_avg:155.51ms step:951/1480 train_time:146351ms step_avg:155.53ms step:952/1480 train_time:146517ms step_avg:155.54ms step:953/1480 train_time:146685ms 
step_avg:155.55ms step:954/1480 train_time:146854ms step_avg:155.57ms step:955/1480 train_time:147018ms step_avg:155.57ms step:956/1480 train_time:147183ms step_avg:155.59ms step:957/1480 train_time:147351ms step_avg:155.60ms step:958/1480 train_time:147520ms step_avg:155.61ms step:959/1480 train_time:147684ms step_avg:155.62ms step:960/1480 train_time:147852ms step_avg:155.63ms step:961/1480 train_time:148017ms step_avg:155.64ms step:962/1480 train_time:148180ms step_avg:155.65ms step:963/1480 train_time:148345ms step_avg:155.66ms step:964/1480 train_time:148515ms step_avg:155.68ms step:965/1480 train_time:148678ms step_avg:155.68ms step:966/1480 train_time:148843ms step_avg:155.69ms step:967/1480 train_time:149007ms step_avg:155.70ms step:968/1480 train_time:149172ms step_avg:155.71ms step:969/1480 train_time:149338ms step_avg:155.72ms step:970/1480 train_time:149501ms step_avg:155.73ms step:971/1480 train_time:149664ms step_avg:155.74ms step:972/1480 train_time:149828ms step_avg:155.75ms step:973/1480 train_time:149992ms step_avg:155.76ms step:974/1480 train_time:150160ms step_avg:155.77ms step:975/1480 train_time:150325ms step_avg:155.78ms step:976/1480 train_time:150491ms step_avg:155.79ms step:977/1480 train_time:150655ms step_avg:155.80ms step:978/1480 train_time:150820ms step_avg:155.81ms step:979/1480 train_time:150987ms step_avg:155.82ms step:980/1480 train_time:151153ms step_avg:155.83ms step:981/1480 train_time:151321ms step_avg:155.84ms step:982/1480 train_time:151487ms step_avg:155.85ms step:983/1480 train_time:151653ms step_avg:155.86ms step:984/1480 train_time:151818ms step_avg:155.87ms step:985/1480 train_time:151984ms step_avg:155.88ms step:986/1480 train_time:152149ms step_avg:155.89ms step:987/1480 train_time:152314ms step_avg:155.90ms step:988/1480 train_time:152481ms step_avg:155.91ms step:989/1480 train_time:152646ms step_avg:155.92ms step:990/1480 train_time:152816ms step_avg:155.93ms step:991/1480 train_time:152982ms step_avg:155.95ms 
step:992/1480 train_time:153158ms step_avg:155.97ms step:993/1480 train_time:153334ms step_avg:155.99ms step:994/1480 train_time:153499ms step_avg:155.99ms step:995/1480 train_time:153663ms step_avg:156.00ms step:996/1480 train_time:153825ms step_avg:156.01ms step:997/1480 train_time:153991ms step_avg:156.02ms step:998/1480 train_time:154155ms step_avg:156.03ms step:999/1480 train_time:154319ms step_avg:156.04ms step:1000/1480 train_time:154488ms step_avg:156.05ms step:1000/1480 val_loss:3.4387 train_time:154554ms step_avg:156.12ms step:1001/1480 train_time:154654ms step_avg:156.06ms step:1002/1480 train_time:154822ms step_avg:156.07ms step:1003/1480 train_time:154992ms step_avg:156.08ms step:1004/1480 train_time:155160ms step_avg:156.10ms step:1005/1480 train_time:155328ms step_avg:156.11ms step:1006/1480 train_time:155495ms step_avg:156.12ms step:1007/1480 train_time:155661ms step_avg:156.13ms step:1008/1480 train_time:155830ms step_avg:156.14ms step:1009/1480 train_time:156003ms step_avg:156.16ms step:1010/1480 train_time:156170ms step_avg:156.17ms step:1011/1480 train_time:156335ms step_avg:156.18ms step:1012/1480 train_time:156499ms step_avg:156.19ms step:1013/1480 train_time:156669ms step_avg:156.20ms step:1014/1480 train_time:156835ms step_avg:156.21ms step:1015/1480 train_time:157005ms step_avg:156.22ms step:1016/1480 train_time:157174ms step_avg:156.24ms step:1017/1480 train_time:157346ms step_avg:156.25ms step:1018/1480 train_time:157514ms step_avg:156.26ms step:1019/1480 train_time:157682ms step_avg:156.28ms step:1020/1480 train_time:157852ms step_avg:156.29ms step:1021/1480 train_time:158017ms step_avg:156.30ms step:1022/1480 train_time:158185ms step_avg:156.31ms step:1023/1480 train_time:158351ms step_avg:156.32ms step:1024/1480 train_time:158517ms step_avg:156.33ms step:1025/1480 train_time:158687ms step_avg:156.34ms step:1026/1480 train_time:158853ms step_avg:156.35ms step:1027/1480 train_time:159020ms step_avg:156.36ms step:1028/1480 
train_time:159192ms step_avg:156.38ms step:1029/1480 train_time:159367ms step_avg:156.40ms step:1030/1480 train_time:159535ms step_avg:156.41ms step:1031/1480 train_time:159698ms step_avg:156.41ms step:1032/1480 train_time:159870ms step_avg:156.43ms step:1033/1480 train_time:160036ms step_avg:156.44ms step:1034/1480 train_time:160203ms step_avg:156.45ms step:1035/1480 train_time:160370ms step_avg:156.46ms step:1036/1480 train_time:160534ms step_avg:156.47ms step:1037/1480 train_time:160702ms step_avg:156.48ms step:1038/1480 train_time:160869ms step_avg:156.49ms step:1039/1480 train_time:161039ms step_avg:156.50ms step:1040/1480 train_time:161206ms step_avg:156.51ms step:1041/1480 train_time:161374ms step_avg:156.52ms step:1042/1480 train_time:161537ms step_avg:156.53ms step:1043/1480 train_time:161702ms step_avg:156.54ms step:1044/1480 train_time:161867ms step_avg:156.54ms step:1045/1480 train_time:162036ms step_avg:156.56ms step:1046/1480 train_time:162205ms step_avg:156.57ms step:1047/1480 train_time:162371ms step_avg:156.58ms step:1048/1480 train_time:162536ms step_avg:156.59ms step:1049/1480 train_time:162703ms step_avg:156.60ms step:1050/1480 train_time:162872ms step_avg:156.61ms step:1051/1480 train_time:163041ms step_avg:156.62ms step:1052/1480 train_time:163209ms step_avg:156.63ms step:1053/1480 train_time:163375ms step_avg:156.64ms step:1054/1480 train_time:163543ms step_avg:156.65ms step:1055/1480 train_time:163709ms step_avg:156.66ms step:1056/1480 train_time:163873ms step_avg:156.67ms step:1057/1480 train_time:164041ms step_avg:156.68ms step:1058/1480 train_time:164209ms step_avg:156.69ms step:1059/1480 train_time:164383ms step_avg:156.70ms step:1060/1480 train_time:164551ms step_avg:156.72ms step:1061/1480 train_time:164715ms step_avg:156.72ms step:1062/1480 train_time:164881ms step_avg:156.73ms step:1063/1480 train_time:165047ms step_avg:156.74ms step:1064/1480 train_time:165210ms step_avg:156.75ms step:1065/1480 train_time:165376ms step_avg:156.75ms 
step:1066/1480 train_time:165545ms step_avg:156.77ms step:1067/1480 train_time:165714ms step_avg:156.78ms step:1068/1480 train_time:165880ms step_avg:156.79ms step:1069/1480 train_time:166050ms step_avg:156.80ms step:1070/1480 train_time:166215ms step_avg:156.81ms step:1071/1480 train_time:166388ms step_avg:156.82ms step:1072/1480 train_time:166554ms step_avg:156.83ms step:1073/1480 train_time:166716ms step_avg:156.84ms step:1074/1480 train_time:166882ms step_avg:156.84ms step:1075/1480 train_time:167053ms step_avg:156.86ms step:1076/1480 train_time:167219ms step_avg:156.87ms step:1077/1480 train_time:167384ms step_avg:156.87ms step:1078/1480 train_time:167558ms step_avg:156.89ms step:1079/1480 train_time:167730ms step_avg:156.90ms step:1080/1480 train_time:167900ms step_avg:156.92ms step:1081/1480 train_time:168067ms step_avg:156.92ms step:1082/1480 train_time:168233ms step_avg:156.93ms step:1083/1480 train_time:168399ms step_avg:156.94ms step:1084/1480 train_time:168566ms step_avg:156.95ms step:1085/1480 train_time:168735ms step_avg:156.96ms step:1086/1480 train_time:168901ms step_avg:156.97ms step:1087/1480 train_time:169068ms step_avg:156.98ms step:1088/1480 train_time:169237ms step_avg:156.99ms step:1089/1480 train_time:169409ms step_avg:157.01ms step:1090/1480 train_time:169581ms step_avg:157.02ms step:1091/1480 train_time:169750ms step_avg:157.03ms step:1092/1480 train_time:169918ms step_avg:157.04ms step:1093/1480 train_time:170087ms step_avg:157.05ms step:1094/1480 train_time:170253ms step_avg:157.06ms step:1095/1480 train_time:170418ms step_avg:157.07ms step:1096/1480 train_time:170585ms step_avg:157.08ms step:1097/1480 train_time:170754ms step_avg:157.09ms step:1098/1480 train_time:170926ms step_avg:157.10ms step:1099/1480 train_time:171097ms step_avg:157.11ms step:1100/1480 train_time:171270ms step_avg:157.13ms step:1101/1480 train_time:171440ms step_avg:157.14ms step:1102/1480 train_time:171611ms step_avg:157.15ms step:1103/1480 train_time:171787ms 
step_avg:157.17ms step:1104/1480 train_time:171955ms step_avg:157.18ms step:1105/1480 train_time:172125ms step_avg:157.19ms step:1106/1480 train_time:172294ms step_avg:157.20ms step:1107/1480 train_time:172462ms step_avg:157.21ms step:1108/1480 train_time:172628ms step_avg:157.22ms step:1109/1480 train_time:172794ms step_avg:157.23ms step:1110/1480 train_time:172960ms step_avg:157.24ms step:1111/1480 train_time:173129ms step_avg:157.25ms step:1112/1480 train_time:173298ms step_avg:157.26ms step:1113/1480 train_time:173479ms step_avg:157.28ms step:1114/1480 train_time:173653ms step_avg:157.29ms step:1115/1480 train_time:173827ms step_avg:157.31ms step:1116/1480 train_time:173993ms step_avg:157.32ms step:1117/1480 train_time:174164ms step_avg:157.33ms step:1118/1480 train_time:174341ms step_avg:157.35ms step:1119/1480 train_time:174507ms step_avg:157.36ms step:1120/1480 train_time:174675ms step_avg:157.36ms step:1121/1480 train_time:174845ms step_avg:157.38ms step:1122/1480 train_time:175011ms step_avg:157.38ms step:1123/1480 train_time:175177ms step_avg:157.39ms step:1124/1480 train_time:175347ms step_avg:157.40ms step:1125/1480 train_time:175514ms step_avg:157.41ms step:1125/1480 val_loss:3.3832 train_time:175583ms step_avg:157.47ms step:1126/1480 train_time:175685ms step_avg:157.42ms step:1127/1480 train_time:175854ms step_avg:157.43ms step:1128/1480 train_time:176026ms step_avg:157.45ms step:1129/1480 train_time:176199ms step_avg:157.46ms step:1130/1480 train_time:176371ms step_avg:157.47ms step:1131/1480 train_time:176549ms step_avg:157.49ms step:1132/1480 train_time:176715ms step_avg:157.50ms step:1133/1480 train_time:176887ms step_avg:157.51ms step:1134/1480 train_time:177058ms step_avg:157.52ms step:1135/1480 train_time:177227ms step_avg:157.53ms step:1136/1480 train_time:177396ms step_avg:157.55ms step:1137/1480 train_time:177566ms step_avg:157.56ms step:1138/1480 train_time:177738ms step_avg:157.57ms step:1139/1480 train_time:177905ms step_avg:157.58ms 
step:1140/1480 train_time:178075ms step_avg:157.59ms step:1141/1480 train_time:178247ms step_avg:157.60ms step:1142/1480 train_time:178415ms step_avg:157.61ms step:1143/1480 train_time:178585ms step_avg:157.62ms step:1144/1480 train_time:178754ms step_avg:157.63ms step:1145/1480 train_time:178919ms step_avg:157.64ms step:1146/1480 train_time:179089ms step_avg:157.65ms step:1147/1480 train_time:179257ms step_avg:157.66ms step:1148/1480 train_time:179426ms step_avg:157.67ms step:1149/1480 train_time:179596ms step_avg:157.68ms step:1150/1480 train_time:179765ms step_avg:157.69ms step:1151/1480 train_time:179937ms step_avg:157.70ms step:1152/1480 train_time:180109ms step_avg:157.71ms step:1153/1480 train_time:180283ms step_avg:157.73ms step:1154/1480 train_time:180450ms step_avg:157.74ms step:1155/1480 train_time:180621ms step_avg:157.75ms step:1156/1480 train_time:180800ms step_avg:157.77ms step:1157/1480 train_time:180971ms step_avg:157.78ms step:1158/1480 train_time:181138ms step_avg:157.79ms step:1159/1480 train_time:181305ms step_avg:157.79ms step:1160/1480 train_time:181471ms step_avg:157.80ms step:1161/1480 train_time:181641ms step_avg:157.81ms step:1162/1480 train_time:181812ms step_avg:157.82ms step:1163/1480 train_time:181980ms step_avg:157.83ms step:1164/1480 train_time:182150ms step_avg:157.84ms step:1165/1480 train_time:182315ms step_avg:157.85ms step:1166/1480 train_time:182484ms step_avg:157.86ms step:1167/1480 train_time:182652ms step_avg:157.87ms step:1168/1480 train_time:182818ms step_avg:157.87ms step:1169/1480 train_time:182987ms step_avg:157.88ms step:1170/1480 train_time:183157ms step_avg:157.89ms step:1171/1480 train_time:183324ms step_avg:157.90ms step:1172/1480 train_time:183490ms step_avg:157.91ms step:1173/1480 train_time:183661ms step_avg:157.92ms step:1174/1480 train_time:183843ms step_avg:157.94ms step:1175/1480 train_time:184015ms step_avg:157.95ms step:1176/1480 train_time:184186ms step_avg:157.96ms step:1177/1480 train_time:184364ms 
step_avg:157.98ms step:1178/1480 train_time:184532ms step_avg:157.99ms step:1179/1480 train_time:184697ms step_avg:158.00ms step:1180/1480 train_time:184879ms step_avg:158.02ms step:1181/1480 train_time:185049ms step_avg:158.03ms step:1182/1480 train_time:185218ms step_avg:158.04ms step:1183/1480 train_time:185390ms step_avg:158.05ms step:1184/1480 train_time:185558ms step_avg:158.06ms step:1185/1480 train_time:185731ms step_avg:158.07ms step:1186/1480 train_time:185903ms step_avg:158.08ms step:1187/1480 train_time:186086ms step_avg:158.10ms step:1188/1480 train_time:186253ms step_avg:158.11ms step:1189/1480 train_time:186424ms step_avg:158.12ms step:1190/1480 train_time:186592ms step_avg:158.13ms step:1191/1480 train_time:186762ms step_avg:158.14ms step:1192/1480 train_time:186929ms step_avg:158.15ms step:1193/1480 train_time:187095ms step_avg:158.15ms step:1194/1480 train_time:187263ms step_avg:158.16ms step:1195/1480 train_time:187436ms step_avg:158.17ms step:1196/1480 train_time:187617ms step_avg:158.19ms step:1197/1480 train_time:187788ms step_avg:158.20ms step:1198/1480 train_time:187972ms step_avg:158.23ms step:1199/1480 train_time:188142ms step_avg:158.24ms step:1200/1480 train_time:188311ms step_avg:158.24ms step:1201/1480 train_time:188478ms step_avg:158.25ms step:1202/1480 train_time:188660ms step_avg:158.27ms step:1203/1480 train_time:188836ms step_avg:158.29ms step:1204/1480 train_time:189011ms step_avg:158.30ms step:1205/1480 train_time:189178ms step_avg:158.31ms step:1206/1480 train_time:189347ms step_avg:158.32ms step:1207/1480 train_time:189514ms step_avg:158.32ms step:1208/1480 train_time:189682ms step_avg:158.33ms step:1209/1480 train_time:189856ms step_avg:158.35ms step:1210/1480 train_time:190033ms step_avg:158.36ms step:1211/1480 train_time:190208ms step_avg:158.37ms step:1212/1480 train_time:190379ms step_avg:158.39ms step:1213/1480 train_time:190553ms step_avg:158.40ms step:1214/1480 train_time:190729ms step_avg:158.41ms step:1215/1480 
train_time:190902ms step_avg:158.43ms step:1216/1480 train_time:191073ms step_avg:158.44ms step:1217/1480 train_time:191247ms step_avg:158.45ms step:1218/1480 train_time:191416ms step_avg:158.46ms step:1219/1480 train_time:191595ms step_avg:158.47ms step:1220/1480 train_time:191763ms step_avg:158.48ms step:1221/1480 train_time:191932ms step_avg:158.49ms step:1222/1480 train_time:192098ms step_avg:158.50ms step:1223/1480 train_time:192269ms step_avg:158.51ms step:1224/1480 train_time:192447ms step_avg:158.52ms step:1225/1480 train_time:192618ms step_avg:158.53ms step:1226/1480 train_time:192791ms step_avg:158.55ms step:1227/1480 train_time:192963ms step_avg:158.56ms step:1228/1480 train_time:193133ms step_avg:158.57ms step:1229/1480 train_time:193306ms step_avg:158.58ms step:1230/1480 train_time:193485ms step_avg:158.59ms step:1231/1480 train_time:193659ms step_avg:158.61ms step:1232/1480 train_time:193834ms step_avg:158.62ms step:1233/1480 train_time:194003ms step_avg:158.63ms step:1234/1480 train_time:194174ms step_avg:158.64ms step:1235/1480 train_time:194348ms step_avg:158.65ms step:1236/1480 train_time:194516ms step_avg:158.66ms step:1237/1480 train_time:194686ms step_avg:158.67ms step:1238/1480 train_time:194872ms step_avg:158.69ms step:1239/1480 train_time:195041ms step_avg:158.70ms step:1240/1480 train_time:195212ms step_avg:158.71ms step:1241/1480 train_time:195384ms step_avg:158.72ms step:1242/1480 train_time:195554ms step_avg:158.73ms step:1243/1480 train_time:195727ms step_avg:158.74ms step:1244/1480 train_time:195893ms step_avg:158.75ms step:1245/1480 train_time:196061ms step_avg:158.75ms step:1246/1480 train_time:196233ms step_avg:158.76ms step:1247/1480 train_time:196401ms step_avg:158.77ms step:1248/1480 train_time:196571ms step_avg:158.78ms step:1249/1480 train_time:196739ms step_avg:158.79ms step:1250/1480 train_time:196908ms step_avg:158.80ms step:1250/1480 val_loss:3.3341 train_time:196979ms step_avg:158.85ms step:1251/1480 train_time:197088ms 
step_avg:158.81ms step:1252/1480 train_time:197257ms step_avg:158.82ms step:1253/1480 train_time:197426ms step_avg:158.83ms step:1254/1480 train_time:197598ms step_avg:158.84ms step:1255/1480 train_time:197783ms step_avg:158.86ms step:1256/1480 train_time:197958ms step_avg:158.87ms step:1257/1480 train_time:198128ms step_avg:158.88ms step:1258/1480 train_time:198304ms step_avg:158.90ms step:1259/1480 train_time:198475ms step_avg:158.91ms step:1260/1480 train_time:198642ms step_avg:158.91ms step:1261/1480 train_time:198814ms step_avg:158.92ms step:1262/1480 train_time:198990ms step_avg:158.94ms step:1263/1480 train_time:199163ms step_avg:158.95ms step:1264/1480 train_time:199329ms step_avg:158.95ms step:1265/1480 train_time:199498ms step_avg:158.96ms step:1266/1480 train_time:199670ms step_avg:158.97ms step:1267/1480 train_time:199839ms step_avg:158.98ms step:1268/1480 train_time:200010ms step_avg:158.99ms step:1269/1480 train_time:200185ms step_avg:159.00ms step:1270/1480 train_time:200355ms step_avg:159.01ms step:1271/1480 train_time:200525ms step_avg:159.02ms step:1272/1480 train_time:200690ms step_avg:159.03ms step:1273/1480 train_time:200862ms step_avg:159.04ms step:1274/1480 train_time:201033ms step_avg:159.05ms step:1275/1480 train_time:201201ms step_avg:159.05ms step:1276/1480 train_time:201367ms step_avg:159.06ms step:1277/1480 train_time:201539ms step_avg:159.07ms step:1278/1480 train_time:201708ms step_avg:159.08ms step:1279/1480 train_time:201880ms step_avg:159.09ms step:1280/1480 train_time:202060ms step_avg:159.10ms step:1281/1480 train_time:202228ms step_avg:159.11ms step:1282/1480 train_time:202394ms step_avg:159.11ms step:1283/1480 train_time:202565ms step_avg:159.12ms step:1284/1480 train_time:202734ms step_avg:159.13ms step:1285/1480 train_time:202904ms step_avg:159.14ms step:1286/1480 train_time:203072ms step_avg:159.15ms step:1287/1480 train_time:203243ms step_avg:159.16ms step:1288/1480 train_time:203413ms step_avg:159.17ms step:1289/1480 
train_time:203598ms step_avg:159.19ms step:1290/1480 train_time:203776ms step_avg:159.20ms step:1291/1480 train_time:203951ms step_avg:159.21ms step:1292/1480 train_time:204125ms step_avg:159.22ms step:1293/1480 train_time:204302ms step_avg:159.24ms step:1294/1480 train_time:204473ms step_avg:159.25ms step:1295/1480 train_time:204645ms step_avg:159.26ms step:1296/1480 train_time:204821ms step_avg:159.27ms step:1297/1480 train_time:204993ms step_avg:159.28ms step:1298/1480 train_time:205164ms step_avg:159.29ms step:1299/1480 train_time:205335ms step_avg:159.30ms step:1300/1480 train_time:205503ms step_avg:159.30ms step:1301/1480 train_time:205671ms step_avg:159.31ms step:1302/1480 train_time:205847ms step_avg:159.32ms step:1303/1480 train_time:206025ms step_avg:159.34ms step:1304/1480 train_time:206200ms step_avg:159.35ms step:1305/1480 train_time:206368ms step_avg:159.36ms step:1306/1480 train_time:206542ms step_avg:159.37ms step:1307/1480 train_time:206710ms step_avg:159.38ms step:1308/1480 train_time:206879ms step_avg:159.38ms step:1309/1480 train_time:207050ms step_avg:159.39ms step:1310/1480 train_time:207219ms step_avg:159.40ms step:1311/1480 train_time:207387ms step_avg:159.41ms step:1312/1480 train_time:207560ms step_avg:159.42ms step:1313/1480 train_time:207728ms step_avg:159.42ms step:1314/1480 train_time:207902ms step_avg:159.43ms step:1315/1480 train_time:208072ms step_avg:159.44ms step:1316/1480 train_time:208240ms step_avg:159.45ms step:1317/1480 train_time:208411ms step_avg:159.46ms step:1318/1480 train_time:208591ms step_avg:159.47ms step:1319/1480 train_time:208766ms step_avg:159.49ms step:1320/1480 train_time:208943ms step_avg:159.50ms step:1321/1480 train_time:209115ms step_avg:159.51ms step:1322/1480 train_time:209296ms step_avg:159.52ms step:1323/1480 train_time:209469ms step_avg:159.53ms step:1324/1480 train_time:209643ms step_avg:159.55ms step:1325/1480 train_time:209824ms step_avg:159.56ms step:1326/1480 train_time:210001ms step_avg:159.58ms 
step:1327/1480 train_time:210170ms step_avg:159.58ms step:1328/1480 train_time:210341ms step_avg:159.59ms step:1329/1480 train_time:210537ms step_avg:159.62ms step:1330/1480 train_time:210717ms step_avg:159.63ms step:1331/1480 train_time:210886ms step_avg:159.64ms step:1332/1480 train_time:211063ms step_avg:159.65ms step:1333/1480 train_time:211238ms step_avg:159.67ms step:1334/1480 train_time:211408ms step_avg:159.67ms step:1335/1480 train_time:211576ms step_avg:159.68ms step:1336/1480 train_time:211760ms step_avg:159.70ms step:1337/1480 train_time:211936ms step_avg:159.71ms step:1338/1480 train_time:212107ms step_avg:159.72ms step:1339/1480 train_time:212281ms step_avg:159.73ms step:1340/1480 train_time:212452ms step_avg:159.74ms step:1341/1480 train_time:212621ms step_avg:159.75ms step:1342/1480 train_time:212794ms step_avg:159.76ms step:1343/1480 train_time:212964ms step_avg:159.76ms step:1344/1480 train_time:213136ms step_avg:159.77ms step:1345/1480 train_time:213315ms step_avg:159.79ms step:1346/1480 train_time:213485ms step_avg:159.79ms step:1347/1480 train_time:213654ms step_avg:159.80ms step:1348/1480 train_time:213825ms step_avg:159.81ms step:1349/1480 train_time:213997ms step_avg:159.82ms step:1350/1480 train_time:214172ms step_avg:159.83ms step:1351/1480 train_time:214342ms step_avg:159.84ms step:1352/1480 train_time:214513ms step_avg:159.85ms step:1353/1480 train_time:214689ms step_avg:159.86ms step:1354/1480 train_time:214861ms step_avg:159.87ms step:1355/1480 train_time:215027ms step_avg:159.87ms step:1356/1480 train_time:215202ms step_avg:159.88ms step:1357/1480 train_time:215376ms step_avg:159.89ms step:1358/1480 train_time:215548ms step_avg:159.90ms step:1359/1480 train_time:215721ms step_avg:159.91ms step:1360/1480 train_time:215896ms step_avg:159.92ms step:1361/1480 train_time:216073ms step_avg:159.94ms step:1362/1480 train_time:216248ms step_avg:159.95ms step:1363/1480 train_time:216429ms step_avg:159.96ms step:1364/1480 train_time:216600ms 
step_avg:159.97ms step:1365/1480 train_time:216766ms step_avg:159.98ms step:1366/1480 train_time:216939ms step_avg:159.98ms step:1367/1480 train_time:217109ms step_avg:159.99ms step:1368/1480 train_time:217283ms step_avg:160.00ms step:1369/1480 train_time:217465ms step_avg:160.02ms step:1370/1480 train_time:217642ms step_avg:160.03ms step:1371/1480 train_time:217813ms step_avg:160.04ms step:1372/1480 train_time:217993ms step_avg:160.05ms step:1373/1480 train_time:218164ms step_avg:160.06ms step:1374/1480 train_time:218340ms step_avg:160.07ms step:1375/1480 train_time:218512ms step_avg:160.08ms step:1375/1480 val_loss:3.2954 train_time:218579ms step_avg:160.13ms step:1376/1480 train_time:218684ms step_avg:160.09ms step:1377/1480 train_time:218855ms step_avg:160.10ms step:1378/1480 train_time:219023ms step_avg:160.10ms step:1379/1480 train_time:219200ms step_avg:160.12ms step:1380/1480 train_time:219373ms step_avg:160.13ms step:1381/1480 train_time:219556ms step_avg:160.14ms step:1382/1480 train_time:219726ms step_avg:160.15ms step:1383/1480 train_time:219898ms step_avg:160.16ms step:1384/1480 train_time:220076ms step_avg:160.17ms step:1385/1480 train_time:220242ms step_avg:160.18ms step:1386/1480 train_time:220413ms step_avg:160.18ms step:1387/1480 train_time:220583ms step_avg:160.19ms step:1388/1480 train_time:220751ms step_avg:160.20ms step:1389/1480 train_time:220925ms step_avg:160.21ms step:1390/1480 train_time:221093ms step_avg:160.21ms step:1391/1480 train_time:221261ms step_avg:160.22ms step:1392/1480 train_time:221434ms step_avg:160.23ms step:1393/1480 train_time:221605ms step_avg:160.23ms step:1394/1480 train_time:221775ms step_avg:160.24ms step:1395/1480 train_time:221944ms step_avg:160.25ms step:1396/1480 train_time:222114ms step_avg:160.26ms step:1397/1480 train_time:222281ms step_avg:160.26ms step:1398/1480 train_time:222447ms step_avg:160.26ms step:1399/1480 train_time:222616ms step_avg:160.27ms step:1400/1480 train_time:222792ms step_avg:160.28ms 
step:1401/1480 train_time:222957ms step_avg:160.29ms step:1402/1480 train_time:223131ms step_avg:160.30ms step:1403/1480 train_time:223309ms step_avg:160.31ms step:1404/1480 train_time:223480ms step_avg:160.32ms step:1405/1480 train_time:223655ms step_avg:160.33ms step:1406/1480 train_time:223830ms step_avg:160.34ms step:1407/1480 train_time:223999ms step_avg:160.34ms step:1408/1480 train_time:224168ms step_avg:160.35ms step:1409/1480 train_time:224353ms step_avg:160.37ms step:1410/1480 train_time:224522ms step_avg:160.37ms step:1411/1480 train_time:224691ms step_avg:160.38ms step:1412/1480 train_time:224861ms step_avg:160.39ms step:1413/1480 train_time:225030ms step_avg:160.39ms step:1414/1480 train_time:225202ms step_avg:160.40ms step:1415/1480 train_time:225378ms step_avg:160.41ms step:1416/1480 train_time:225562ms step_avg:160.43ms step:1417/1480 train_time:225737ms step_avg:160.44ms step:1418/1480 train_time:225909ms step_avg:160.45ms step:1419/1480 train_time:226082ms step_avg:160.46ms step:1420/1480 train_time:226257ms step_avg:160.47ms step:1421/1480 train_time:226429ms step_avg:160.47ms step:1422/1480 train_time:226601ms step_avg:160.48ms step:1423/1480 train_time:226771ms step_avg:160.49ms step:1424/1480 train_time:226947ms step_avg:160.50ms step:1425/1480 train_time:227125ms step_avg:160.51ms step:1426/1480 train_time:227296ms step_avg:160.52ms step:1427/1480 train_time:227471ms step_avg:160.53ms step:1428/1480 train_time:227641ms step_avg:160.54ms step:1429/1480 train_time:227809ms step_avg:160.54ms step:1430/1480 train_time:227982ms step_avg:160.55ms step:1431/1480 train_time:228157ms step_avg:160.56ms step:1432/1480 train_time:228334ms step_avg:160.57ms step:1433/1480 train_time:228516ms step_avg:160.59ms step:1434/1480 train_time:228698ms step_avg:160.60ms step:1435/1480 train_time:228873ms step_avg:160.61ms step:1436/1480 train_time:229049ms step_avg:160.62ms step:1437/1480 train_time:229220ms step_avg:160.63ms step:1438/1480 train_time:229389ms 
step_avg:160.64ms step:1439/1480 train_time:229563ms step_avg:160.65ms step:1440/1480 train_time:229734ms step_avg:160.65ms step:1441/1480 train_time:229904ms step_avg:160.66ms step:1442/1480 train_time:230081ms step_avg:160.67ms step:1443/1480 train_time:230269ms step_avg:160.69ms step:1444/1480 train_time:230441ms step_avg:160.70ms step:1445/1480 train_time:230612ms step_avg:160.71ms step:1446/1480 train_time:230788ms step_avg:160.72ms step:1447/1480 train_time:230967ms step_avg:160.73ms step:1448/1480 train_time:231138ms step_avg:160.74ms step:1449/1480 train_time:231313ms step_avg:160.75ms step:1450/1480 train_time:231485ms step_avg:160.75ms step:1451/1480 train_time:231655ms step_avg:160.76ms step:1452/1480 train_time:231828ms step_avg:160.77ms step:1453/1480 train_time:231997ms step_avg:160.77ms step:1454/1480 train_time:232168ms step_avg:160.78ms step:1455/1480 train_time:232345ms step_avg:160.79ms step:1456/1480 train_time:232519ms step_avg:160.80ms step:1457/1480 train_time:232689ms step_avg:160.81ms step:1458/1480 train_time:232859ms step_avg:160.81ms step:1459/1480 train_time:233037ms step_avg:160.83ms step:1460/1480 train_time:233208ms step_avg:160.83ms step:1461/1480 train_time:233383ms step_avg:160.84ms step:1462/1480 train_time:233554ms step_avg:160.85ms step:1463/1480 train_time:233730ms step_avg:160.86ms step:1464/1480 train_time:233904ms step_avg:160.87ms step:1465/1480 train_time:234077ms step_avg:160.88ms step:1466/1480 train_time:234247ms step_avg:160.88ms step:1467/1480 train_time:234422ms step_avg:160.89ms step:1468/1480 train_time:234592ms step_avg:160.90ms step:1469/1480 train_time:234763ms step_avg:160.91ms step:1470/1480 train_time:234943ms step_avg:160.92ms step:1471/1480 train_time:235128ms step_avg:160.94ms step:1472/1480 train_time:235309ms step_avg:160.95ms step:1473/1480 train_time:235480ms step_avg:160.96ms step:1474/1480 train_time:235657ms step_avg:160.97ms step:1475/1480 train_time:235836ms step_avg:160.98ms step:1476/1480 
train_time:236009ms step_avg:160.99ms step:1477/1480 train_time:236192ms step_avg:161.00ms step:1478/1480 train_time:236376ms step_avg:161.02ms step:1479/1480 train_time:236549ms step_avg:161.03ms step:1480/1480 train_time:236722ms step_avg:161.04ms step:1480/1480 val_loss:3.2762 train_time:236795ms step_avg:161.08ms