import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; the cos/sin tables are cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): seq length is dim 1, rotation acts on dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK-norm, rotary embeddings and a learned value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # mix the layer's own values with the token value embedding vi
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc   = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of x with the initial embedding x0, then attention and MLP."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """GPT with U-net style skip connections; forward() returns the cross-entropy loss."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the loss for a 1-D token sequence `idx` against `target`.

        Attention is causal, confined to the current document (documents are delimited by
        token id 50256) and to the last `sliding_window` positions. len(idx) must be a
        multiple of BLOCK_SIZE (the reshape below requires it).
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # document id of every token, plus the first/last document id inside every block
        docs = (idx == 50256).cumsum(0)
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level mask: keep a (q block, kv block) pair if any token pair in it may attend.
            # The indices are created on idx's device so they match docs_low / docs_high.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device=idx.device)
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the indices of the kept kv blocks first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256-int32 header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Cycles through token shards; each process reads its own T-token slice of every batch."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) of T tokens each on the GPU; targets are shifted by one."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # the log file lives directly under logs/, so make sure that directory exists;
    # the per-run logdir is only needed if checkpoint saving is enabled
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """On the master process, append `s` to the log file and (unless logonly) print it."""
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 13:07:26 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 78W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 92W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 38C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 47MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 44C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 97W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID 
Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22729ms step_avg:nanms step:2/1480 train_time:22924ms step_avg:nanms step:3/1480 train_time:23062ms step_avg:nanms step:4/1480 train_time:23203ms step_avg:nanms step:5/1480 train_time:23343ms step_avg:nanms step:6/1480 train_time:23483ms step_avg:nanms step:7/1480 train_time:23624ms step_avg:nanms step:8/1480 train_time:23767ms step_avg:nanms step:9/1480 train_time:23913ms step_avg:nanms step:10/1480 train_time:24059ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:424ms step_avg:141.38ms step:14/1480 train_time:565ms step_avg:141.28ms step:15/1480 train_time:707ms step_avg:141.38ms step:16/1480 train_time:852ms step_avg:141.95ms step:17/1480 train_time:996ms step_avg:142.25ms step:18/1480 train_time:1140ms step_avg:142.48ms step:19/1480 train_time:1282ms step_avg:142.48ms step:20/1480 train_time:1424ms step_avg:142.36ms step:21/1480 train_time:1566ms step_avg:142.36ms step:22/1480 train_time:1708ms step_avg:142.35ms step:23/1480 train_time:1853ms step_avg:142.52ms step:24/1480 train_time:1997ms step_avg:142.62ms step:25/1480 train_time:2141ms step_avg:142.73ms step:26/1480 train_time:2283ms step_avg:142.71ms step:27/1480 train_time:2425ms step_avg:142.67ms step:28/1480 train_time:2567ms step_avg:142.61ms step:29/1480 train_time:2711ms 
step_avg:142.67ms step:30/1480 train_time:2855ms step_avg:142.73ms step:31/1480 train_time:2999ms step_avg:142.82ms step:32/1480 train_time:3143ms step_avg:142.86ms step:33/1480 train_time:3285ms step_avg:142.83ms step:34/1480 train_time:3427ms step_avg:142.78ms step:35/1480 train_time:3569ms step_avg:142.75ms step:36/1480 train_time:3714ms step_avg:142.84ms step:37/1480 train_time:3859ms step_avg:142.91ms step:38/1480 train_time:4003ms step_avg:142.97ms step:39/1480 train_time:4146ms step_avg:142.97ms step:40/1480 train_time:4288ms step_avg:142.95ms step:41/1480 train_time:4431ms step_avg:142.93ms step:42/1480 train_time:4573ms step_avg:142.92ms step:43/1480 train_time:4717ms step_avg:142.95ms step:44/1480 train_time:4861ms step_avg:142.96ms step:45/1480 train_time:5003ms step_avg:142.95ms step:46/1480 train_time:5146ms step_avg:142.95ms step:47/1480 train_time:5288ms step_avg:142.91ms step:48/1480 train_time:5430ms step_avg:142.89ms step:49/1480 train_time:5571ms step_avg:142.86ms step:50/1480 train_time:5716ms step_avg:142.89ms step:51/1480 train_time:5860ms step_avg:142.92ms step:52/1480 train_time:6002ms step_avg:142.91ms step:53/1480 train_time:6145ms step_avg:142.90ms step:54/1480 train_time:6288ms step_avg:142.91ms step:55/1480 train_time:6432ms step_avg:142.94ms step:56/1480 train_time:6576ms step_avg:142.95ms step:57/1480 train_time:6719ms step_avg:142.96ms step:58/1480 train_time:6863ms step_avg:142.98ms step:59/1480 train_time:7005ms step_avg:142.96ms step:60/1480 train_time:7146ms step_avg:142.93ms step:61/1480 train_time:7290ms step_avg:142.94ms step:62/1480 train_time:7433ms step_avg:142.95ms step:63/1480 train_time:7578ms step_avg:142.97ms step:64/1480 train_time:7720ms step_avg:142.97ms step:65/1480 train_time:7864ms step_avg:142.99ms step:66/1480 train_time:8006ms step_avg:142.97ms step:67/1480 train_time:8147ms step_avg:142.94ms step:68/1480 train_time:8288ms step_avg:142.90ms step:69/1480 train_time:8431ms step_avg:142.89ms step:70/1480 
train_time:8576ms step_avg:142.93ms step:71/1480 train_time:8720ms step_avg:142.95ms step:72/1480 train_time:8862ms step_avg:142.94ms step:73/1480 train_time:9004ms step_avg:142.92ms step:74/1480 train_time:9145ms step_avg:142.90ms step:75/1480 train_time:9287ms step_avg:142.87ms step:76/1480 train_time:9429ms step_avg:142.86ms step:77/1480 train_time:9573ms step_avg:142.88ms step:78/1480 train_time:9717ms step_avg:142.89ms step:79/1480 train_time:9860ms step_avg:142.89ms step:80/1480 train_time:10002ms step_avg:142.88ms step:81/1480 train_time:10144ms step_avg:142.87ms step:82/1480 train_time:10286ms step_avg:142.85ms step:83/1480 train_time:10427ms step_avg:142.83ms step:84/1480 train_time:10569ms step_avg:142.83ms step:85/1480 train_time:10714ms step_avg:142.86ms step:86/1480 train_time:10859ms step_avg:142.89ms step:87/1480 train_time:11003ms step_avg:142.89ms step:88/1480 train_time:11144ms step_avg:142.87ms step:89/1480 train_time:11285ms step_avg:142.85ms step:90/1480 train_time:11426ms step_avg:142.83ms step:91/1480 train_time:11570ms step_avg:142.84ms step:92/1480 train_time:11713ms step_avg:142.84ms step:93/1480 train_time:11858ms step_avg:142.86ms step:94/1480 train_time:12001ms step_avg:142.87ms step:95/1480 train_time:12143ms step_avg:142.86ms step:96/1480 train_time:12284ms step_avg:142.84ms step:97/1480 train_time:12425ms step_avg:142.82ms step:98/1480 train_time:12567ms step_avg:142.81ms step:99/1480 train_time:12711ms step_avg:142.82ms step:100/1480 train_time:12856ms step_avg:142.84ms step:101/1480 train_time:12998ms step_avg:142.84ms step:102/1480 train_time:13141ms step_avg:142.84ms step:103/1480 train_time:13282ms step_avg:142.82ms step:104/1480 train_time:13423ms step_avg:142.80ms step:105/1480 train_time:13566ms step_avg:142.80ms step:106/1480 train_time:13710ms step_avg:142.82ms step:107/1480 train_time:13855ms step_avg:142.84ms step:108/1480 train_time:13999ms step_avg:142.85ms step:109/1480 train_time:14142ms step_avg:142.85ms 
step:110/1480 train_time:14283ms step_avg:142.83ms step:111/1480 train_time:14428ms step_avg:142.85ms step:112/1480 train_time:14577ms step_avg:142.91ms step:113/1480 train_time:14724ms step_avg:142.95ms step:114/1480 train_time:14872ms step_avg:143.00ms step:115/1480 train_time:15020ms step_avg:143.05ms step:116/1480 train_time:15167ms step_avg:143.09ms step:117/1480 train_time:15314ms step_avg:143.12ms step:118/1480 train_time:15461ms step_avg:143.16ms step:119/1480 train_time:15606ms step_avg:143.18ms step:120/1480 train_time:15754ms step_avg:143.22ms step:121/1480 train_time:15902ms step_avg:143.26ms step:122/1480 train_time:16050ms step_avg:143.30ms step:123/1480 train_time:16197ms step_avg:143.34ms step:124/1480 train_time:16345ms step_avg:143.37ms step:125/1480 train_time:16491ms step_avg:143.40ms step:125/1480 val_loss:4.4081 train_time:16549ms step_avg:143.90ms step:126/1480 train_time:16645ms step_avg:143.50ms step:127/1480 train_time:16794ms step_avg:143.54ms step:128/1480 train_time:16939ms step_avg:143.55ms step:129/1480 train_time:17085ms step_avg:143.57ms step:130/1480 train_time:17232ms step_avg:143.60ms step:131/1480 train_time:17377ms step_avg:143.61ms step:132/1480 train_time:17524ms step_avg:143.64ms step:133/1480 train_time:17673ms step_avg:143.69ms step:134/1480 train_time:17822ms step_avg:143.72ms step:135/1480 train_time:17970ms step_avg:143.76ms step:136/1480 train_time:18117ms step_avg:143.78ms step:137/1480 train_time:18264ms step_avg:143.81ms step:138/1480 train_time:18410ms step_avg:143.83ms step:139/1480 train_time:18557ms step_avg:143.85ms step:140/1480 train_time:18706ms step_avg:143.89ms step:141/1480 train_time:18854ms step_avg:143.92ms step:142/1480 train_time:19001ms step_avg:143.94ms step:143/1480 train_time:19148ms step_avg:143.97ms step:144/1480 train_time:19295ms step_avg:143.99ms step:145/1480 train_time:19442ms step_avg:144.01ms step:146/1480 train_time:19589ms step_avg:144.04ms step:147/1480 train_time:19736ms 
step_avg:144.06ms step:148/1480 train_time:19884ms step_avg:144.08ms step:149/1480 train_time:20032ms step_avg:144.11ms step:150/1480 train_time:20178ms step_avg:144.13ms step:151/1480 train_time:20325ms step_avg:144.15ms step:152/1480 train_time:20473ms step_avg:144.17ms step:153/1480 train_time:20619ms step_avg:144.19ms step:154/1480 train_time:20766ms step_avg:144.21ms step:155/1480 train_time:20914ms step_avg:144.23ms step:156/1480 train_time:21061ms step_avg:144.25ms step:157/1480 train_time:21210ms step_avg:144.28ms step:158/1480 train_time:21355ms step_avg:144.29ms step:159/1480 train_time:21502ms step_avg:144.31ms step:160/1480 train_time:21650ms step_avg:144.33ms step:161/1480 train_time:21798ms step_avg:144.36ms step:162/1480 train_time:21947ms step_avg:144.39ms step:163/1480 train_time:22094ms step_avg:144.41ms step:164/1480 train_time:22241ms step_avg:144.42ms step:165/1480 train_time:22389ms step_avg:144.44ms step:166/1480 train_time:22535ms step_avg:144.45ms step:167/1480 train_time:22681ms step_avg:144.46ms step:168/1480 train_time:22828ms step_avg:144.48ms step:169/1480 train_time:22975ms step_avg:144.50ms step:170/1480 train_time:23123ms step_avg:144.52ms step:171/1480 train_time:23272ms step_avg:144.55ms step:172/1480 train_time:23418ms step_avg:144.56ms step:173/1480 train_time:23567ms step_avg:144.58ms step:174/1480 train_time:23714ms step_avg:144.60ms step:175/1480 train_time:23861ms step_avg:144.61ms step:176/1480 train_time:24009ms step_avg:144.63ms step:177/1480 train_time:24155ms step_avg:144.64ms step:178/1480 train_time:24303ms step_avg:144.66ms step:179/1480 train_time:24450ms step_avg:144.67ms step:180/1480 train_time:24596ms step_avg:144.68ms step:181/1480 train_time:24743ms step_avg:144.70ms step:182/1480 train_time:24891ms step_avg:144.72ms step:183/1480 train_time:25037ms step_avg:144.72ms step:184/1480 train_time:25184ms step_avg:144.74ms step:185/1480 train_time:25331ms step_avg:144.75ms step:186/1480 train_time:25479ms 
step_avg:144.77ms step:187/1480 train_time:25626ms step_avg:144.78ms step:188/1480 train_time:25775ms step_avg:144.80ms step:189/1480 train_time:25923ms step_avg:144.82ms step:190/1480 train_time:26070ms step_avg:144.84ms step:191/1480 train_time:26216ms step_avg:144.84ms step:192/1480 train_time:26363ms step_avg:144.85ms step:193/1480 train_time:26510ms step_avg:144.86ms step:194/1480 train_time:26656ms step_avg:144.87ms step:195/1480 train_time:26803ms step_avg:144.88ms step:196/1480 train_time:26951ms step_avg:144.90ms step:197/1480 train_time:27097ms step_avg:144.91ms step:198/1480 train_time:27246ms step_avg:144.92ms step:199/1480 train_time:27394ms step_avg:144.94ms step:200/1480 train_time:27543ms step_avg:144.96ms step:201/1480 train_time:27690ms step_avg:144.97ms step:202/1480 train_time:27836ms step_avg:144.98ms step:203/1480 train_time:27984ms step_avg:145.00ms step:204/1480 train_time:28132ms step_avg:145.01ms step:205/1480 train_time:28278ms step_avg:145.01ms step:206/1480 train_time:28424ms step_avg:145.02ms step:207/1480 train_time:28571ms step_avg:145.03ms step:208/1480 train_time:28717ms step_avg:145.04ms step:209/1480 train_time:28865ms step_avg:145.05ms step:210/1480 train_time:29013ms step_avg:145.06ms step:211/1480 train_time:29159ms step_avg:145.07ms step:212/1480 train_time:29307ms step_avg:145.09ms step:213/1480 train_time:29455ms step_avg:145.10ms step:214/1480 train_time:29602ms step_avg:145.11ms step:215/1480 train_time:29750ms step_avg:145.12ms step:216/1480 train_time:29896ms step_avg:145.13ms step:217/1480 train_time:30042ms step_avg:145.13ms step:218/1480 train_time:30190ms step_avg:145.14ms step:219/1480 train_time:30336ms step_avg:145.15ms step:220/1480 train_time:30483ms step_avg:145.16ms step:221/1480 train_time:30631ms step_avg:145.17ms step:222/1480 train_time:30783ms step_avg:145.20ms step:223/1480 train_time:30933ms step_avg:145.22ms step:224/1480 train_time:31083ms step_avg:145.25ms step:225/1480 train_time:31236ms 
step_avg:145.28ms step:226/1480 train_time:31387ms step_avg:145.31ms step:227/1480 train_time:31537ms step_avg:145.33ms step:228/1480 train_time:31689ms step_avg:145.36ms step:229/1480 train_time:31840ms step_avg:145.39ms step:230/1480 train_time:31991ms step_avg:145.41ms step:231/1480 train_time:32142ms step_avg:145.44ms step:232/1480 train_time:32293ms step_avg:145.46ms step:233/1480 train_time:32443ms step_avg:145.48ms step:234/1480 train_time:32593ms step_avg:145.51ms step:235/1480 train_time:32744ms step_avg:145.53ms step:236/1480 train_time:32895ms step_avg:145.55ms step:237/1480 train_time:33044ms step_avg:145.57ms step:238/1480 train_time:33197ms step_avg:145.60ms step:239/1480 train_time:33348ms step_avg:145.63ms step:240/1480 train_time:33500ms step_avg:145.65ms step:241/1480 train_time:33650ms step_avg:145.67ms step:242/1480 train_time:33801ms step_avg:145.69ms step:243/1480 train_time:33951ms step_avg:145.71ms step:244/1480 train_time:34101ms step_avg:145.73ms step:245/1480 train_time:34251ms step_avg:145.75ms step:246/1480 train_time:34402ms step_avg:145.77ms step:247/1480 train_time:34553ms step_avg:145.79ms step:248/1480 train_time:34703ms step_avg:145.81ms step:249/1480 train_time:34854ms step_avg:145.83ms step:250/1480 train_time:35005ms step_avg:145.86ms step:250/1480 val_loss:3.9927 train_time:35065ms step_avg:146.10ms step:251/1480 train_time:35164ms step_avg:145.91ms step:252/1480 train_time:35316ms step_avg:145.93ms step:253/1480 train_time:35466ms step_avg:145.95ms step:254/1480 train_time:35616ms step_avg:145.97ms step:255/1480 train_time:35766ms step_avg:145.98ms step:256/1480 train_time:35916ms step_avg:146.00ms step:257/1480 train_time:36065ms step_avg:146.01ms step:258/1480 train_time:36218ms step_avg:146.04ms step:259/1480 train_time:36370ms step_avg:146.06ms step:260/1480 train_time:36520ms step_avg:146.08ms step:261/1480 train_time:36671ms step_avg:146.10ms step:262/1480 train_time:36822ms step_avg:146.12ms step:263/1480 
train_time:36973ms step_avg:146.14ms step:264/1480 train_time:37124ms step_avg:146.16ms step:265/1480 train_time:37274ms step_avg:146.17ms step:266/1480 train_time:37425ms step_avg:146.19ms step:267/1480 train_time:37575ms step_avg:146.21ms step:268/1480 train_time:37726ms step_avg:146.23ms step:269/1480 train_time:37876ms step_avg:146.24ms step:270/1480 train_time:38027ms step_avg:146.26ms step:271/1480 train_time:38177ms step_avg:146.27ms step:272/1480 train_time:38328ms step_avg:146.29ms step:273/1480 train_time:38479ms step_avg:146.31ms step:274/1480 train_time:38630ms step_avg:146.32ms step:275/1480 train_time:38781ms step_avg:146.34ms step:276/1480 train_time:38932ms step_avg:146.36ms step:277/1480 train_time:39084ms step_avg:146.38ms step:278/1480 train_time:39234ms step_avg:146.39ms step:279/1480 train_time:39386ms step_avg:146.42ms step:280/1480 train_time:39537ms step_avg:146.43ms step:281/1480 train_time:39687ms step_avg:146.45ms step:282/1480 train_time:39838ms step_avg:146.46ms step:283/1480 train_time:39989ms step_avg:146.48ms step:284/1480 train_time:40139ms step_avg:146.49ms step:285/1480 train_time:40290ms step_avg:146.51ms step:286/1480 train_time:40442ms step_avg:146.53ms step:287/1480 train_time:40593ms step_avg:146.54ms step:288/1480 train_time:40743ms step_avg:146.56ms step:289/1480 train_time:40893ms step_avg:146.57ms step:290/1480 train_time:41044ms step_avg:146.59ms step:291/1480 train_time:41194ms step_avg:146.60ms step:292/1480 train_time:41345ms step_avg:146.61ms step:293/1480 train_time:41496ms step_avg:146.63ms step:294/1480 train_time:41647ms step_avg:146.64ms step:295/1480 train_time:41797ms step_avg:146.66ms step:296/1480 train_time:41948ms step_avg:146.67ms step:297/1480 train_time:42100ms step_avg:146.69ms step:298/1480 train_time:42250ms step_avg:146.70ms step:299/1480 train_time:42401ms step_avg:146.71ms step:300/1480 train_time:42552ms step_avg:146.73ms step:301/1480 train_time:42701ms step_avg:146.74ms step:302/1480 
train_time:42851ms step_avg:146.75ms step:303/1480 train_time:43001ms step_avg:146.76ms step:304/1480 train_time:43151ms step_avg:146.77ms step:305/1480 train_time:43302ms step_avg:146.79ms step:306/1480 train_time:43453ms step_avg:146.80ms step:307/1480 train_time:43604ms step_avg:146.82ms step:308/1480 train_time:43755ms step_avg:146.83ms step:309/1480 train_time:43906ms step_avg:146.84ms step:310/1480 train_time:44056ms step_avg:146.85ms step:311/1480 train_time:44207ms step_avg:146.87ms step:312/1480 train_time:44356ms step_avg:146.87ms step:313/1480 train_time:44507ms step_avg:146.89ms step:314/1480 train_time:44656ms step_avg:146.90ms step:315/1480 train_time:44807ms step_avg:146.91ms step:316/1480 train_time:44956ms step_avg:146.92ms step:317/1480 train_time:45108ms step_avg:146.93ms step:318/1480 train_time:45257ms step_avg:146.94ms step:319/1480 train_time:45408ms step_avg:146.95ms step:320/1480 train_time:45558ms step_avg:146.96ms step:321/1480 train_time:45708ms step_avg:146.97ms step:322/1480 train_time:45859ms step_avg:146.98ms step:323/1480 train_time:46010ms step_avg:147.00ms step:324/1480 train_time:46161ms step_avg:147.01ms step:325/1480 train_time:46311ms step_avg:147.02ms step:326/1480 train_time:46461ms step_avg:147.03ms step:327/1480 train_time:46614ms step_avg:147.05ms step:328/1480 train_time:46763ms step_avg:147.05ms step:329/1480 train_time:46913ms step_avg:147.06ms step:330/1480 train_time:47066ms step_avg:147.08ms step:331/1480 train_time:47221ms step_avg:147.11ms step:332/1480 train_time:47375ms step_avg:147.13ms step:333/1480 train_time:47528ms step_avg:147.15ms step:334/1480 train_time:47682ms step_avg:147.17ms step:335/1480 train_time:47835ms step_avg:147.19ms step:336/1480 train_time:47989ms step_avg:147.21ms step:337/1480 train_time:48144ms step_avg:147.23ms step:338/1480 train_time:48298ms step_avg:147.25ms step:339/1480 train_time:48452ms step_avg:147.27ms step:340/1480 train_time:48606ms step_avg:147.29ms step:341/1480 
train_time:48761ms step_avg:147.31ms step:342/1480 train_time:48914ms step_avg:147.33ms step:343/1480 train_time:49067ms step_avg:147.35ms step:344/1480 train_time:49223ms step_avg:147.38ms step:345/1480 train_time:49378ms step_avg:147.40ms step:346/1480 train_time:49531ms step_avg:147.41ms step:347/1480 train_time:49687ms step_avg:147.44ms step:348/1480 train_time:49840ms step_avg:147.46ms step:349/1480 train_time:49993ms step_avg:147.47ms step:350/1480 train_time:50148ms step_avg:147.50ms step:351/1480 train_time:50304ms step_avg:147.52ms step:352/1480 train_time:50457ms step_avg:147.53ms step:353/1480 train_time:50611ms step_avg:147.55ms step:354/1480 train_time:50764ms step_avg:147.57ms step:355/1480 train_time:50919ms step_avg:147.59ms step:356/1480 train_time:51071ms step_avg:147.60ms step:357/1480 train_time:51226ms step_avg:147.63ms step:358/1480 train_time:51381ms step_avg:147.65ms step:359/1480 train_time:51534ms step_avg:147.66ms step:360/1480 train_time:51689ms step_avg:147.68ms step:361/1480 train_time:51843ms step_avg:147.70ms step:362/1480 train_time:51997ms step_avg:147.72ms step:363/1480 train_time:52150ms step_avg:147.73ms step:364/1480 train_time:52306ms step_avg:147.76ms step:365/1480 train_time:52460ms step_avg:147.77ms step:366/1480 train_time:52613ms step_avg:147.79ms step:367/1480 train_time:52766ms step_avg:147.80ms step:368/1480 train_time:52922ms step_avg:147.83ms step:369/1480 train_time:53075ms step_avg:147.84ms step:370/1480 train_time:53229ms step_avg:147.86ms step:371/1480 train_time:53381ms step_avg:147.87ms step:372/1480 train_time:53534ms step_avg:147.88ms step:373/1480 train_time:53688ms step_avg:147.90ms step:374/1480 train_time:53843ms step_avg:147.92ms step:375/1480 train_time:53997ms step_avg:147.94ms step:375/1480 val_loss:3.8053 train_time:54058ms step_avg:148.10ms step:376/1480 train_time:54156ms step_avg:147.97ms step:377/1480 train_time:54310ms step_avg:147.98ms step:378/1480 train_time:54463ms step_avg:148.00ms 
step:379/1480 train_time:54617ms step_avg:148.01ms step:380/1480 train_time:54769ms step_avg:148.03ms step:381/1480 train_time:54922ms step_avg:148.04ms step:382/1480 train_time:55077ms step_avg:148.06ms step:383/1480 train_time:55232ms step_avg:148.07ms step:384/1480 train_time:55385ms step_avg:148.09ms step:385/1480 train_time:55541ms step_avg:148.11ms step:386/1480 train_time:55693ms step_avg:148.12ms step:387/1480 train_time:55847ms step_avg:148.14ms step:388/1480 train_time:56001ms step_avg:148.15ms step:389/1480 train_time:56155ms step_avg:148.17ms step:390/1480 train_time:56309ms step_avg:148.18ms step:391/1480 train_time:56463ms step_avg:148.20ms step:392/1480 train_time:56619ms step_avg:148.22ms step:393/1480 train_time:56772ms step_avg:148.23ms step:394/1480 train_time:56926ms step_avg:148.24ms step:395/1480 train_time:57079ms step_avg:148.26ms step:396/1480 train_time:57231ms step_avg:148.27ms step:397/1480 train_time:57386ms step_avg:148.28ms step:398/1480 train_time:57541ms step_avg:148.30ms step:399/1480 train_time:57695ms step_avg:148.32ms step:400/1480 train_time:57849ms step_avg:148.33ms step:401/1480 train_time:58003ms step_avg:148.34ms step:402/1480 train_time:58157ms step_avg:148.36ms step:403/1480 train_time:58311ms step_avg:148.37ms step:404/1480 train_time:58465ms step_avg:148.39ms step:405/1480 train_time:58623ms step_avg:148.41ms step:406/1480 train_time:58777ms step_avg:148.43ms step:407/1480 train_time:58930ms step_avg:148.44ms step:408/1480 train_time:59084ms step_avg:148.45ms step:409/1480 train_time:59238ms step_avg:148.47ms step:410/1480 train_time:59391ms step_avg:148.48ms step:411/1480 train_time:59545ms step_avg:148.49ms step:412/1480 train_time:59701ms step_avg:148.51ms step:413/1480 train_time:59855ms step_avg:148.52ms step:414/1480 train_time:60008ms step_avg:148.53ms step:415/1480 train_time:60162ms step_avg:148.55ms step:416/1480 train_time:60317ms step_avg:148.56ms step:417/1480 train_time:60470ms step_avg:148.58ms 
step:418/1480 train_time:60624ms step_avg:148.59ms step:419/1480 train_time:60778ms step_avg:148.60ms step:420/1480 train_time:60932ms step_avg:148.61ms step:421/1480 train_time:61086ms step_avg:148.63ms step:422/1480 train_time:61241ms step_avg:148.64ms step:423/1480 train_time:61394ms step_avg:148.65ms step:424/1480 train_time:61547ms step_avg:148.67ms step:425/1480 train_time:61702ms step_avg:148.68ms step:426/1480 train_time:61857ms step_avg:148.69ms step:427/1480 train_time:62009ms step_avg:148.70ms step:428/1480 train_time:62163ms step_avg:148.71ms step:429/1480 train_time:62316ms step_avg:148.73ms step:430/1480 train_time:62469ms step_avg:148.73ms step:431/1480 train_time:62624ms step_avg:148.75ms step:432/1480 train_time:62778ms step_avg:148.76ms step:433/1480 train_time:62932ms step_avg:148.77ms step:434/1480 train_time:63087ms step_avg:148.79ms step:435/1480 train_time:63241ms step_avg:148.80ms step:436/1480 train_time:63395ms step_avg:148.81ms step:437/1480 train_time:63548ms step_avg:148.82ms step:438/1480 train_time:63703ms step_avg:148.84ms step:439/1480 train_time:63856ms step_avg:148.85ms step:440/1480 train_time:64011ms step_avg:148.86ms step:441/1480 train_time:64168ms step_avg:148.88ms step:442/1480 train_time:64327ms step_avg:148.90ms step:443/1480 train_time:64483ms step_avg:148.92ms step:444/1480 train_time:64639ms step_avg:148.94ms step:445/1480 train_time:64795ms step_avg:148.95ms step:446/1480 train_time:64951ms step_avg:148.97ms step:447/1480 train_time:65106ms step_avg:148.98ms step:448/1480 train_time:65263ms step_avg:149.00ms step:449/1480 train_time:65423ms step_avg:149.03ms step:450/1480 train_time:65581ms step_avg:149.05ms step:451/1480 train_time:65739ms step_avg:149.07ms step:452/1480 train_time:65894ms step_avg:149.08ms step:453/1480 train_time:66050ms step_avg:149.10ms step:454/1480 train_time:66205ms step_avg:149.11ms step:455/1480 train_time:66363ms step_avg:149.13ms step:456/1480 train_time:66521ms step_avg:149.15ms 
step:457/1480 train_time:66676ms step_avg:149.16ms step:458/1480 train_time:66832ms step_avg:149.18ms step:459/1480 train_time:66991ms step_avg:149.20ms step:460/1480 train_time:67147ms step_avg:149.22ms step:461/1480 train_time:67305ms step_avg:149.23ms step:462/1480 train_time:67462ms step_avg:149.25ms step:463/1480 train_time:67619ms step_avg:149.27ms step:464/1480 train_time:67776ms step_avg:149.29ms step:465/1480 train_time:67932ms step_avg:149.30ms step:466/1480 train_time:68090ms step_avg:149.32ms step:467/1480 train_time:68247ms step_avg:149.34ms step:468/1480 train_time:68404ms step_avg:149.35ms step:469/1480 train_time:68561ms step_avg:149.37ms step:470/1480 train_time:68719ms step_avg:149.39ms step:471/1480 train_time:68876ms step_avg:149.41ms step:472/1480 train_time:69033ms step_avg:149.42ms step:473/1480 train_time:69189ms step_avg:149.44ms step:474/1480 train_time:69346ms step_avg:149.45ms step:475/1480 train_time:69503ms step_avg:149.47ms step:476/1480 train_time:69660ms step_avg:149.49ms step:477/1480 train_time:69820ms step_avg:149.51ms step:478/1480 train_time:69976ms step_avg:149.52ms step:479/1480 train_time:70133ms step_avg:149.54ms step:480/1480 train_time:70291ms step_avg:149.56ms step:481/1480 train_time:70447ms step_avg:149.57ms step:482/1480 train_time:70605ms step_avg:149.59ms step:483/1480 train_time:70761ms step_avg:149.60ms step:484/1480 train_time:70918ms step_avg:149.62ms step:485/1480 train_time:71076ms step_avg:149.63ms step:486/1480 train_time:71232ms step_avg:149.65ms step:487/1480 train_time:71390ms step_avg:149.66ms step:488/1480 train_time:71547ms step_avg:149.68ms step:489/1480 train_time:71704ms step_avg:149.69ms step:490/1480 train_time:71860ms step_avg:149.71ms step:491/1480 train_time:72018ms step_avg:149.73ms step:492/1480 train_time:72174ms step_avg:149.74ms step:493/1480 train_time:72331ms step_avg:149.75ms step:494/1480 train_time:72488ms step_avg:149.77ms step:495/1480 train_time:72646ms step_avg:149.79ms 
step:496/1480 train_time:72804ms step_avg:149.80ms step:497/1480 train_time:72960ms step_avg:149.82ms step:498/1480 train_time:73117ms step_avg:149.83ms step:499/1480 train_time:73274ms step_avg:149.84ms step:500/1480 train_time:73430ms step_avg:149.86ms step:500/1480 val_loss:3.6824 train_time:73492ms step_avg:149.98ms step:501/1480 train_time:73590ms step_avg:149.88ms step:502/1480 train_time:73748ms step_avg:149.89ms step:503/1480 train_time:73905ms step_avg:149.91ms step:504/1480 train_time:74060ms step_avg:149.92ms step:505/1480 train_time:74214ms step_avg:149.93ms step:506/1480 train_time:74372ms step_avg:149.94ms step:507/1480 train_time:74529ms step_avg:149.96ms step:508/1480 train_time:74690ms step_avg:149.98ms step:509/1480 train_time:74848ms step_avg:150.00ms step:510/1480 train_time:75006ms step_avg:150.01ms step:511/1480 train_time:75163ms step_avg:150.03ms step:512/1480 train_time:75320ms step_avg:150.04ms step:513/1480 train_time:75475ms step_avg:150.05ms step:514/1480 train_time:75632ms step_avg:150.06ms step:515/1480 train_time:75789ms step_avg:150.08ms step:516/1480 train_time:75948ms step_avg:150.10ms step:517/1480 train_time:76107ms step_avg:150.11ms step:518/1480 train_time:76265ms step_avg:150.13ms step:519/1480 train_time:76422ms step_avg:150.14ms step:520/1480 train_time:76580ms step_avg:150.16ms step:521/1480 train_time:76736ms step_avg:150.17ms step:522/1480 train_time:76893ms step_avg:150.18ms step:523/1480 train_time:77049ms step_avg:150.19ms step:524/1480 train_time:77208ms step_avg:150.21ms step:525/1480 train_time:77367ms step_avg:150.23ms step:526/1480 train_time:77527ms step_avg:150.25ms step:527/1480 train_time:77685ms step_avg:150.26ms step:528/1480 train_time:77842ms step_avg:150.27ms step:529/1480 train_time:77998ms step_avg:150.29ms step:530/1480 train_time:78155ms step_avg:150.30ms step:531/1480 train_time:78312ms step_avg:150.31ms step:532/1480 train_time:78469ms step_avg:150.32ms step:533/1480 train_time:78628ms 
step_avg:150.34ms step:534/1480 train_time:78786ms step_avg:150.35ms step:535/1480 train_time:78944ms step_avg:150.37ms step:536/1480 train_time:79102ms step_avg:150.38ms step:537/1480 train_time:79260ms step_avg:150.40ms step:538/1480 train_time:79415ms step_avg:150.41ms step:539/1480 train_time:79573ms step_avg:150.42ms step:540/1480 train_time:79729ms step_avg:150.43ms step:541/1480 train_time:79887ms step_avg:150.45ms step:542/1480 train_time:80044ms step_avg:150.46ms step:543/1480 train_time:80202ms step_avg:150.47ms step:544/1480 train_time:80358ms step_avg:150.48ms step:545/1480 train_time:80515ms step_avg:150.49ms step:546/1480 train_time:80672ms step_avg:150.51ms step:547/1480 train_time:80830ms step_avg:150.52ms step:548/1480 train_time:80989ms step_avg:150.54ms step:549/1480 train_time:81146ms step_avg:150.55ms step:550/1480 train_time:81304ms step_avg:150.56ms step:551/1480 train_time:81463ms step_avg:150.58ms step:552/1480 train_time:81622ms step_avg:150.59ms step:553/1480 train_time:81783ms step_avg:150.61ms step:554/1480 train_time:81943ms step_avg:150.63ms step:555/1480 train_time:82105ms step_avg:150.65ms step:556/1480 train_time:82264ms step_avg:150.67ms step:557/1480 train_time:82425ms step_avg:150.69ms step:558/1480 train_time:82584ms step_avg:150.70ms step:559/1480 train_time:82743ms step_avg:150.72ms step:560/1480 train_time:82903ms step_avg:150.73ms step:561/1480 train_time:83063ms step_avg:150.75ms step:562/1480 train_time:83222ms step_avg:150.76ms step:563/1480 train_time:83380ms step_avg:150.78ms step:564/1480 train_time:83538ms step_avg:150.79ms step:565/1480 train_time:83696ms step_avg:150.80ms step:566/1480 train_time:83854ms step_avg:150.82ms step:567/1480 train_time:84013ms step_avg:150.83ms step:568/1480 train_time:84171ms step_avg:150.84ms step:569/1480 train_time:84332ms step_avg:150.86ms step:570/1480 train_time:84491ms step_avg:150.88ms step:571/1480 train_time:84650ms step_avg:150.89ms step:572/1480 train_time:84810ms 
step_avg:150.91ms step:573/1480 train_time:84969ms step_avg:150.92ms step:574/1480 train_time:85131ms step_avg:150.94ms step:575/1480 train_time:85291ms step_avg:150.96ms step:576/1480 train_time:85449ms step_avg:150.97ms step:577/1480 train_time:85610ms step_avg:150.99ms step:578/1480 train_time:85769ms step_avg:151.00ms step:579/1480 train_time:85929ms step_avg:151.02ms step:580/1480 train_time:86089ms step_avg:151.03ms step:581/1480 train_time:86250ms step_avg:151.05ms step:582/1480 train_time:86410ms step_avg:151.07ms step:583/1480 train_time:86570ms step_avg:151.08ms step:584/1480 train_time:86730ms step_avg:151.10ms step:585/1480 train_time:86890ms step_avg:151.11ms step:586/1480 train_time:87049ms step_avg:151.13ms step:587/1480 train_time:87210ms step_avg:151.14ms step:588/1480 train_time:87369ms step_avg:151.16ms step:589/1480 train_time:87530ms step_avg:151.18ms step:590/1480 train_time:87691ms step_avg:151.19ms step:591/1480 train_time:87849ms step_avg:151.20ms step:592/1480 train_time:88010ms step_avg:151.22ms step:593/1480 train_time:88171ms step_avg:151.24ms step:594/1480 train_time:88331ms step_avg:151.25ms step:595/1480 train_time:88492ms step_avg:151.27ms step:596/1480 train_time:88651ms step_avg:151.28ms step:597/1480 train_time:88811ms step_avg:151.30ms step:598/1480 train_time:88969ms step_avg:151.31ms step:599/1480 train_time:89129ms step_avg:151.32ms step:600/1480 train_time:89290ms step_avg:151.34ms step:601/1480 train_time:89450ms step_avg:151.35ms step:602/1480 train_time:89610ms step_avg:151.37ms step:603/1480 train_time:89770ms step_avg:151.38ms step:604/1480 train_time:89930ms step_avg:151.40ms step:605/1480 train_time:90089ms step_avg:151.41ms step:606/1480 train_time:90251ms step_avg:151.43ms step:607/1480 train_time:90413ms step_avg:151.45ms step:608/1480 train_time:90572ms step_avg:151.46ms step:609/1480 train_time:90731ms step_avg:151.47ms step:610/1480 train_time:90890ms step_avg:151.48ms step:611/1480 train_time:91050ms 
step_avg:151.50ms step:612/1480 train_time:91211ms step_avg:151.51ms step:613/1480 train_time:91371ms step_avg:151.53ms step:614/1480 train_time:91531ms step_avg:151.54ms step:615/1480 train_time:91691ms step_avg:151.55ms step:616/1480 train_time:91850ms step_avg:151.57ms step:617/1480 train_time:92010ms step_avg:151.58ms step:618/1480 train_time:92170ms step_avg:151.59ms step:619/1480 train_time:92329ms step_avg:151.61ms step:620/1480 train_time:92489ms step_avg:151.62ms step:621/1480 train_time:92648ms step_avg:151.63ms step:622/1480 train_time:92809ms step_avg:151.65ms step:623/1480 train_time:92970ms step_avg:151.66ms step:624/1480 train_time:93129ms step_avg:151.68ms step:625/1480 train_time:93289ms step_avg:151.69ms step:625/1480 val_loss:3.6029 train_time:93353ms step_avg:151.79ms step:626/1480 train_time:93453ms step_avg:151.71ms step:627/1480 train_time:93613ms step_avg:151.72ms step:628/1480 train_time:93771ms step_avg:151.73ms step:629/1480 train_time:93928ms step_avg:151.74ms step:630/1480 train_time:94086ms step_avg:151.75ms step:631/1480 train_time:94244ms step_avg:151.76ms step:632/1480 train_time:94404ms step_avg:151.77ms step:633/1480 train_time:94563ms step_avg:151.79ms step:634/1480 train_time:94721ms step_avg:151.80ms step:635/1480 train_time:94880ms step_avg:151.81ms step:636/1480 train_time:95040ms step_avg:151.82ms step:637/1480 train_time:95200ms step_avg:151.83ms step:638/1480 train_time:95360ms step_avg:151.85ms step:639/1480 train_time:95519ms step_avg:151.86ms step:640/1480 train_time:95680ms step_avg:151.87ms step:641/1480 train_time:95840ms step_avg:151.89ms step:642/1480 train_time:95999ms step_avg:151.90ms step:643/1480 train_time:96159ms step_avg:151.91ms step:644/1480 train_time:96318ms step_avg:151.92ms step:645/1480 train_time:96478ms step_avg:151.93ms step:646/1480 train_time:96638ms step_avg:151.95ms step:647/1480 train_time:96797ms step_avg:151.96ms step:648/1480 train_time:96960ms step_avg:151.97ms step:649/1480 
train_time:97120ms step_avg:151.99ms step:650/1480 train_time:97280ms step_avg:152.00ms step:651/1480 train_time:97440ms step_avg:152.01ms step:652/1480 train_time:97599ms step_avg:152.02ms step:653/1480 train_time:97759ms step_avg:152.04ms step:654/1480 train_time:97919ms step_avg:152.05ms step:655/1480 train_time:98079ms step_avg:152.06ms step:656/1480 train_time:98240ms step_avg:152.07ms step:657/1480 train_time:98401ms step_avg:152.09ms step:658/1480 train_time:98560ms step_avg:152.10ms step:659/1480 train_time:98722ms step_avg:152.11ms step:660/1480 train_time:98884ms step_avg:152.13ms step:661/1480 train_time:99045ms step_avg:152.14ms step:662/1480 train_time:99205ms step_avg:152.15ms step:663/1480 train_time:99364ms step_avg:152.17ms step:664/1480 train_time:99524ms step_avg:152.18ms step:665/1480 train_time:99686ms step_avg:152.19ms step:666/1480 train_time:99846ms step_avg:152.20ms step:667/1480 train_time:100006ms step_avg:152.22ms step:668/1480 train_time:100170ms step_avg:152.23ms step:669/1480 train_time:100332ms step_avg:152.25ms step:670/1480 train_time:100491ms step_avg:152.26ms step:671/1480 train_time:100655ms step_avg:152.28ms step:672/1480 train_time:100816ms step_avg:152.29ms step:673/1480 train_time:100979ms step_avg:152.31ms step:674/1480 train_time:101143ms step_avg:152.32ms step:675/1480 train_time:101305ms step_avg:152.34ms step:676/1480 train_time:101467ms step_avg:152.35ms step:677/1480 train_time:101629ms step_avg:152.37ms step:678/1480 train_time:101787ms step_avg:152.38ms step:679/1480 train_time:101950ms step_avg:152.39ms step:680/1480 train_time:102109ms step_avg:152.40ms step:681/1480 train_time:102272ms step_avg:152.42ms step:682/1480 train_time:102434ms step_avg:152.43ms step:683/1480 train_time:102597ms step_avg:152.45ms step:684/1480 train_time:102760ms step_avg:152.46ms step:685/1480 train_time:102923ms step_avg:152.48ms step:686/1480 train_time:103085ms step_avg:152.49ms step:687/1480 train_time:103246ms step_avg:152.51ms 
step:688/1480 train_time:103409ms step_avg:152.52ms step:689/1480 train_time:103571ms step_avg:152.53ms step:690/1480 train_time:103738ms step_avg:152.56ms step:691/1480 train_time:103900ms step_avg:152.57ms step:692/1480 train_time:104061ms step_avg:152.58ms step:693/1480 train_time:104222ms step_avg:152.59ms step:694/1480 train_time:104384ms step_avg:152.61ms step:695/1480 train_time:104544ms step_avg:152.62ms step:696/1480 train_time:104705ms step_avg:152.63ms step:697/1480 train_time:104869ms step_avg:152.65ms step:698/1480 train_time:105029ms step_avg:152.66ms step:699/1480 train_time:105191ms step_avg:152.67ms step:700/1480 train_time:105354ms step_avg:152.69ms step:701/1480 train_time:105513ms step_avg:152.70ms step:702/1480 train_time:105674ms step_avg:152.71ms step:703/1480 train_time:105838ms step_avg:152.72ms step:704/1480 train_time:106000ms step_avg:152.74ms step:705/1480 train_time:106162ms step_avg:152.75ms step:706/1480 train_time:106325ms step_avg:152.77ms step:707/1480 train_time:106487ms step_avg:152.78ms step:708/1480 train_time:106647ms step_avg:152.79ms step:709/1480 train_time:106809ms step_avg:152.80ms step:710/1480 train_time:106968ms step_avg:152.81ms step:711/1480 train_time:107131ms step_avg:152.83ms step:712/1480 train_time:107298ms step_avg:152.85ms step:713/1480 train_time:107462ms step_avg:152.86ms step:714/1480 train_time:107623ms step_avg:152.87ms step:715/1480 train_time:107784ms step_avg:152.89ms step:716/1480 train_time:107943ms step_avg:152.89ms step:717/1480 train_time:108104ms step_avg:152.91ms step:718/1480 train_time:108263ms step_avg:152.91ms step:719/1480 train_time:108424ms step_avg:152.93ms step:720/1480 train_time:108587ms step_avg:152.94ms step:721/1480 train_time:108748ms step_avg:152.95ms step:722/1480 train_time:108910ms step_avg:152.96ms step:723/1480 train_time:109071ms step_avg:152.97ms step:724/1480 train_time:109232ms step_avg:152.99ms step:725/1480 train_time:109398ms step_avg:153.00ms step:726/1480 
train_time:109562ms step_avg:153.02ms step:727/1480 train_time:109724ms step_avg:153.03ms step:728/1480 train_time:109885ms step_avg:153.04ms step:729/1480 train_time:110045ms step_avg:153.05ms step:730/1480 train_time:110208ms step_avg:153.07ms step:731/1480 train_time:110368ms step_avg:153.08ms step:732/1480 train_time:110527ms step_avg:153.08ms step:733/1480 train_time:110688ms step_avg:153.10ms step:734/1480 train_time:110850ms step_avg:153.11ms step:735/1480 train_time:111010ms step_avg:153.12ms step:736/1480 train_time:111172ms step_avg:153.13ms step:737/1480 train_time:111334ms step_avg:153.14ms step:738/1480 train_time:111497ms step_avg:153.15ms step:739/1480 train_time:111659ms step_avg:153.17ms step:740/1480 train_time:111825ms step_avg:153.18ms step:741/1480 train_time:111987ms step_avg:153.20ms step:742/1480 train_time:112149ms step_avg:153.21ms step:743/1480 train_time:112310ms step_avg:153.22ms step:744/1480 train_time:112474ms step_avg:153.23ms step:745/1480 train_time:112639ms step_avg:153.25ms step:746/1480 train_time:112800ms step_avg:153.26ms step:747/1480 train_time:112961ms step_avg:153.27ms step:748/1480 train_time:113125ms step_avg:153.29ms step:749/1480 train_time:113290ms step_avg:153.30ms step:750/1480 train_time:113449ms step_avg:153.31ms step:750/1480 val_loss:3.5470 train_time:113513ms step_avg:153.40ms step:751/1480 train_time:113613ms step_avg:153.32ms step:752/1480 train_time:113774ms step_avg:153.33ms step:753/1480 train_time:113936ms step_avg:153.35ms step:754/1480 train_time:114097ms step_avg:153.36ms step:755/1480 train_time:114258ms step_avg:153.37ms step:756/1480 train_time:114421ms step_avg:153.38ms step:757/1480 train_time:114584ms step_avg:153.39ms step:758/1480 train_time:114745ms step_avg:153.40ms step:759/1480 train_time:114907ms step_avg:153.41ms step:760/1480 train_time:115069ms step_avg:153.43ms step:761/1480 train_time:115231ms step_avg:153.44ms step:762/1480 train_time:115391ms step_avg:153.44ms step:763/1480 
train_time:115551ms step_avg:153.45ms step:764/1480 train_time:115712ms step_avg:153.46ms step:765/1480 train_time:115873ms step_avg:153.47ms step:766/1480 train_time:116036ms step_avg:153.49ms step:767/1480 train_time:116200ms step_avg:153.50ms step:768/1480 train_time:116362ms step_avg:153.51ms step:769/1480 train_time:116525ms step_avg:153.52ms step:770/1480 train_time:116688ms step_avg:153.54ms step:771/1480 train_time:116850ms step_avg:153.55ms step:772/1480 train_time:117012ms step_avg:153.56ms step:773/1480 train_time:117173ms step_avg:153.57ms step:774/1480 train_time:117337ms step_avg:153.58ms step:775/1480 train_time:117500ms step_avg:153.59ms step:776/1480 train_time:117665ms step_avg:153.61ms step:777/1480 train_time:117830ms step_avg:153.62ms step:778/1480 train_time:117993ms step_avg:153.64ms step:779/1480 train_time:118155ms step_avg:153.65ms step:780/1480 train_time:118320ms step_avg:153.66ms step:781/1480 train_time:118484ms step_avg:153.68ms step:782/1480 train_time:118648ms step_avg:153.69ms step:783/1480 train_time:118810ms step_avg:153.70ms step:784/1480 train_time:118974ms step_avg:153.71ms step:785/1480 train_time:119137ms step_avg:153.72ms step:786/1480 train_time:119303ms step_avg:153.74ms step:787/1480 train_time:119465ms step_avg:153.75ms step:788/1480 train_time:119629ms step_avg:153.77ms step:789/1480 train_time:119791ms step_avg:153.78ms step:790/1480 train_time:119957ms step_avg:153.79ms step:791/1480 train_time:120123ms step_avg:153.81ms step:792/1480 train_time:120287ms step_avg:153.82ms step:793/1480 train_time:120448ms step_avg:153.83ms step:794/1480 train_time:120613ms step_avg:153.84ms step:795/1480 train_time:120778ms step_avg:153.86ms step:796/1480 train_time:120945ms step_avg:153.87ms step:797/1480 train_time:121109ms step_avg:153.89ms step:798/1480 train_time:121272ms step_avg:153.90ms step:799/1480 train_time:121439ms step_avg:153.92ms step:800/1480 train_time:121603ms step_avg:153.93ms step:801/1480 train_time:121768ms 
step_avg:153.94ms step:802/1480 train_time:121935ms step_avg:153.96ms step:803/1480 train_time:122097ms step_avg:153.97ms step:804/1480 train_time:122260ms step_avg:153.98ms step:805/1480 train_time:122426ms step_avg:153.99ms step:806/1480 train_time:122586ms step_avg:154.00ms step:807/1480 train_time:122749ms step_avg:154.01ms step:808/1480 train_time:122914ms step_avg:154.03ms step:809/1480 train_time:123077ms step_avg:154.04ms step:810/1480 train_time:123240ms step_avg:154.05ms step:811/1480 train_time:123404ms step_avg:154.06ms step:812/1480 train_time:123569ms step_avg:154.08ms step:813/1480 train_time:123729ms step_avg:154.08ms step:814/1480 train_time:123893ms step_avg:154.10ms step:815/1480 train_time:124056ms step_avg:154.11ms step:816/1480 train_time:124223ms step_avg:154.12ms step:817/1480 train_time:124386ms step_avg:154.13ms step:818/1480 train_time:124547ms step_avg:154.14ms step:819/1480 train_time:124711ms step_avg:154.15ms step:820/1480 train_time:124876ms step_avg:154.17ms step:821/1480 train_time:125038ms step_avg:154.18ms step:822/1480 train_time:125202ms step_avg:154.19ms step:823/1480 train_time:125365ms step_avg:154.20ms step:824/1480 train_time:125525ms step_avg:154.21ms step:825/1480 train_time:125690ms step_avg:154.22ms step:826/1480 train_time:125856ms step_avg:154.24ms step:827/1480 train_time:126022ms step_avg:154.25ms step:828/1480 train_time:126185ms step_avg:154.26ms step:829/1480 train_time:126348ms step_avg:154.27ms step:830/1480 train_time:126512ms step_avg:154.28ms step:831/1480 train_time:126677ms step_avg:154.30ms step:832/1480 train_time:126842ms step_avg:154.31ms step:833/1480 train_time:127007ms step_avg:154.32ms step:834/1480 train_time:127171ms step_avg:154.33ms step:835/1480 train_time:127336ms step_avg:154.35ms step:836/1480 train_time:127501ms step_avg:154.36ms step:837/1480 train_time:127664ms step_avg:154.37ms step:838/1480 train_time:127827ms step_avg:154.38ms step:839/1480 train_time:127990ms step_avg:154.39ms 
step:840/1480 train_time:128151ms step_avg:154.40ms step:841/1480 train_time:128312ms step_avg:154.41ms step:842/1480 train_time:128476ms step_avg:154.42ms step:843/1480 train_time:128640ms step_avg:154.43ms step:844/1480 train_time:128803ms step_avg:154.44ms step:845/1480 train_time:128967ms step_avg:154.45ms step:846/1480 train_time:129132ms step_avg:154.46ms step:847/1480 train_time:129297ms step_avg:154.48ms step:848/1480 train_time:129459ms step_avg:154.49ms step:849/1480 train_time:129622ms step_avg:154.50ms step:850/1480 train_time:129785ms step_avg:154.51ms step:851/1480 train_time:129950ms step_avg:154.52ms step:852/1480 train_time:130112ms step_avg:154.53ms step:853/1480 train_time:130273ms step_avg:154.54ms step:854/1480 train_time:130438ms step_avg:154.55ms step:855/1480 train_time:130602ms step_avg:154.56ms step:856/1480 train_time:130764ms step_avg:154.57ms step:857/1480 train_time:130931ms step_avg:154.58ms step:858/1480 train_time:131096ms step_avg:154.59ms step:859/1480 train_time:131261ms step_avg:154.61ms step:860/1480 train_time:131423ms step_avg:154.62ms step:861/1480 train_time:131591ms step_avg:154.63ms step:862/1480 train_time:131760ms step_avg:154.65ms step:863/1480 train_time:131928ms step_avg:154.66ms step:864/1480 train_time:132090ms step_avg:154.67ms step:865/1480 train_time:132250ms step_avg:154.68ms step:866/1480 train_time:132418ms step_avg:154.69ms step:867/1480 train_time:132582ms step_avg:154.70ms step:868/1480 train_time:132744ms step_avg:154.71ms step:869/1480 train_time:132906ms step_avg:154.72ms step:870/1480 train_time:133070ms step_avg:154.73ms step:871/1480 train_time:133233ms step_avg:154.74ms step:872/1480 train_time:133398ms step_avg:154.75ms step:873/1480 train_time:133561ms step_avg:154.76ms step:874/1480 train_time:133727ms step_avg:154.78ms step:875/1480 train_time:133890ms step_avg:154.79ms step:875/1480 val_loss:3.5027 train_time:133954ms step_avg:154.86ms step:876/1480 train_time:134054ms step_avg:154.80ms 
step:877/1480 train_time:134218ms step_avg:154.81ms step:878/1480 train_time:134380ms step_avg:154.82ms step:879/1480 train_time:134545ms step_avg:154.83ms step:880/1480 train_time:134709ms step_avg:154.84ms step:881/1480 train_time:134871ms step_avg:154.85ms step:882/1480 train_time:135036ms step_avg:154.86ms step:883/1480 train_time:135201ms step_avg:154.87ms step:884/1480 train_time:135368ms step_avg:154.88ms step:885/1480 train_time:135533ms step_avg:154.90ms step:886/1480 train_time:135699ms step_avg:154.91ms step:887/1480 train_time:135868ms step_avg:154.92ms step:888/1480 train_time:136041ms step_avg:154.94ms step:889/1480 train_time:136209ms step_avg:154.96ms step:890/1480 train_time:136372ms step_avg:154.97ms step:891/1480 train_time:136537ms step_avg:154.98ms step:892/1480 train_time:136702ms step_avg:154.99ms step:893/1480 train_time:136867ms step_avg:155.00ms step:894/1480 train_time:137032ms step_avg:155.01ms step:895/1480 train_time:137198ms step_avg:155.03ms step:896/1480 train_time:137364ms step_avg:155.04ms step:897/1480 train_time:137530ms step_avg:155.05ms step:898/1480 train_time:137697ms step_avg:155.06ms step:899/1480 train_time:137860ms step_avg:155.07ms step:900/1480 train_time:138021ms step_avg:155.08ms step:901/1480 train_time:138188ms step_avg:155.09ms step:902/1480 train_time:138352ms step_avg:155.10ms step:903/1480 train_time:138523ms step_avg:155.12ms step:904/1480 train_time:138691ms step_avg:155.14ms step:905/1480 train_time:138853ms step_avg:155.14ms step:906/1480 train_time:139018ms step_avg:155.15ms step:907/1480 train_time:139187ms step_avg:155.17ms step:908/1480 train_time:139351ms step_avg:155.18ms step:909/1480 train_time:139515ms step_avg:155.19ms step:910/1480 train_time:139687ms step_avg:155.21ms step:911/1480 train_time:139852ms step_avg:155.22ms step:912/1480 train_time:140018ms step_avg:155.23ms step:913/1480 train_time:140187ms step_avg:155.25ms step:914/1480 train_time:140354ms step_avg:155.26ms step:915/1480 
train_time:140524ms step_avg:155.28ms step:916/1480 train_time:140689ms step_avg:155.29ms step:917/1480 train_time:140851ms step_avg:155.29ms step:918/1480 train_time:141019ms step_avg:155.31ms step:919/1480 train_time:141190ms step_avg:155.32ms step:920/1480 train_time:141355ms step_avg:155.34ms step:921/1480 train_time:141519ms step_avg:155.34ms step:922/1480 train_time:141689ms step_avg:155.36ms step:923/1480 train_time:141852ms step_avg:155.37ms step:924/1480 train_time:142017ms step_avg:155.38ms step:925/1480 train_time:142184ms step_avg:155.39ms step:926/1480 train_time:142348ms step_avg:155.40ms step:927/1480 train_time:142511ms step_avg:155.41ms step:928/1480 train_time:142677ms step_avg:155.42ms step:929/1480 train_time:142841ms step_avg:155.43ms step:930/1480 train_time:143008ms step_avg:155.44ms step:931/1480 train_time:143171ms step_avg:155.45ms step:932/1480 train_time:143339ms step_avg:155.46ms step:933/1480 train_time:143507ms step_avg:155.48ms step:934/1480 train_time:143673ms step_avg:155.49ms step:935/1480 train_time:143843ms step_avg:155.51ms step:936/1480 train_time:144012ms step_avg:155.52ms step:937/1480 train_time:144182ms step_avg:155.54ms step:938/1480 train_time:144345ms step_avg:155.54ms step:939/1480 train_time:144515ms step_avg:155.56ms step:940/1480 train_time:144682ms step_avg:155.57ms step:941/1480 train_time:144848ms step_avg:155.58ms step:942/1480 train_time:145013ms step_avg:155.59ms step:943/1480 train_time:145185ms step_avg:155.61ms step:944/1480 train_time:145356ms step_avg:155.63ms step:945/1480 train_time:145520ms step_avg:155.64ms step:946/1480 train_time:145692ms step_avg:155.65ms step:947/1480 train_time:145859ms step_avg:155.67ms step:948/1480 train_time:146025ms step_avg:155.68ms step:949/1480 train_time:146192ms step_avg:155.69ms step:950/1480 train_time:146356ms step_avg:155.70ms step:951/1480 train_time:146525ms step_avg:155.71ms step:952/1480 train_time:146691ms step_avg:155.72ms step:953/1480 train_time:146860ms 
step_avg:155.74ms step:954/1480 train_time:147028ms step_avg:155.75ms step:955/1480 train_time:147192ms step_avg:155.76ms step:956/1480 train_time:147358ms step_avg:155.77ms step:957/1480 train_time:147525ms step_avg:155.78ms step:958/1480 train_time:147694ms step_avg:155.80ms step:959/1480 train_time:147858ms step_avg:155.80ms step:960/1480 train_time:148025ms step_avg:155.82ms step:961/1480 train_time:148191ms step_avg:155.83ms step:962/1480 train_time:148356ms step_avg:155.84ms step:963/1480 train_time:148523ms step_avg:155.85ms step:964/1480 train_time:148692ms step_avg:155.86ms step:965/1480 train_time:148856ms step_avg:155.87ms step:966/1480 train_time:149020ms step_avg:155.88ms step:967/1480 train_time:149183ms step_avg:155.89ms step:968/1480 train_time:149349ms step_avg:155.90ms step:969/1480 train_time:149516ms step_avg:155.91ms step:970/1480 train_time:149678ms step_avg:155.91ms step:971/1480 train_time:149844ms step_avg:155.92ms step:972/1480 train_time:150009ms step_avg:155.93ms step:973/1480 train_time:150174ms step_avg:155.94ms step:974/1480 train_time:150341ms step_avg:155.96ms step:975/1480 train_time:150507ms step_avg:155.97ms step:976/1480 train_time:150673ms step_avg:155.98ms step:977/1480 train_time:150836ms step_avg:155.98ms step:978/1480 train_time:151001ms step_avg:155.99ms step:979/1480 train_time:151168ms step_avg:156.00ms step:980/1480 train_time:151334ms step_avg:156.01ms step:981/1480 train_time:151502ms step_avg:156.03ms step:982/1480 train_time:151667ms step_avg:156.04ms step:983/1480 train_time:151832ms step_avg:156.05ms step:984/1480 train_time:151998ms step_avg:156.06ms step:985/1480 train_time:152167ms step_avg:156.07ms step:986/1480 train_time:152333ms step_avg:156.08ms step:987/1480 train_time:152495ms step_avg:156.09ms step:988/1480 train_time:152662ms step_avg:156.10ms step:989/1480 train_time:152828ms step_avg:156.11ms step:990/1480 train_time:152997ms step_avg:156.12ms step:991/1480 train_time:153165ms step_avg:156.13ms 
step:992/1480 train_time:153338ms step_avg:156.15ms step:993/1480 train_time:153515ms step_avg:156.17ms step:994/1480 train_time:153679ms step_avg:156.18ms step:995/1480 train_time:153843ms step_avg:156.19ms step:996/1480 train_time:154007ms step_avg:156.19ms step:997/1480 train_time:154171ms step_avg:156.20ms step:998/1480 train_time:154334ms step_avg:156.21ms step:999/1480 train_time:154500ms step_avg:156.22ms step:1000/1480 train_time:154670ms step_avg:156.23ms step:1000/1480 val_loss:3.4375 train_time:154737ms step_avg:156.30ms step:1001/1480 train_time:154840ms step_avg:156.25ms step:1002/1480 train_time:155006ms step_avg:156.26ms step:1003/1480 train_time:155178ms step_avg:156.27ms step:1004/1480 train_time:155347ms step_avg:156.28ms step:1005/1480 train_time:155516ms step_avg:156.30ms step:1006/1480 train_time:155683ms step_avg:156.31ms step:1007/1480 train_time:155848ms step_avg:156.32ms step:1008/1480 train_time:156015ms step_avg:156.33ms step:1009/1480 train_time:156189ms step_avg:156.35ms step:1010/1480 train_time:156356ms step_avg:156.36ms step:1011/1480 train_time:156522ms step_avg:156.37ms step:1012/1480 train_time:156687ms step_avg:156.37ms step:1013/1480 train_time:156858ms step_avg:156.39ms step:1014/1480 train_time:157024ms step_avg:156.40ms step:1015/1480 train_time:157195ms step_avg:156.41ms step:1016/1480 train_time:157364ms step_avg:156.43ms step:1017/1480 train_time:157535ms step_avg:156.44ms step:1018/1480 train_time:157703ms step_avg:156.45ms step:1019/1480 train_time:157871ms step_avg:156.46ms step:1020/1480 train_time:158041ms step_avg:156.48ms step:1021/1480 train_time:158205ms step_avg:156.48ms step:1022/1480 train_time:158372ms step_avg:156.49ms step:1023/1480 train_time:158539ms step_avg:156.50ms step:1024/1480 train_time:158705ms step_avg:156.51ms step:1025/1480 train_time:158876ms step_avg:156.53ms step:1026/1480 train_time:159043ms step_avg:156.54ms step:1027/1480 train_time:159208ms step_avg:156.55ms step:1028/1480 
train_time:159383ms step_avg:156.56ms step:1029/1480 train_time:159558ms step_avg:156.58ms step:1030/1480 train_time:159725ms step_avg:156.59ms step:1031/1480 train_time:159890ms step_avg:156.60ms step:1032/1480 train_time:160062ms step_avg:156.62ms step:1033/1480 train_time:160228ms step_avg:156.63ms step:1034/1480 train_time:160396ms step_avg:156.64ms step:1035/1480 train_time:160564ms step_avg:156.65ms step:1036/1480 train_time:160729ms step_avg:156.66ms step:1037/1480 train_time:160897ms step_avg:156.67ms step:1038/1480 train_time:161066ms step_avg:156.68ms step:1039/1480 train_time:161237ms step_avg:156.69ms step:1040/1480 train_time:161404ms step_avg:156.70ms step:1041/1480 train_time:161570ms step_avg:156.71ms step:1042/1480 train_time:161734ms step_avg:156.72ms step:1043/1480 train_time:161900ms step_avg:156.73ms step:1044/1480 train_time:162064ms step_avg:156.74ms step:1045/1480 train_time:162234ms step_avg:156.75ms step:1046/1480 train_time:162403ms step_avg:156.76ms step:1047/1480 train_time:162568ms step_avg:156.77ms step:1048/1480 train_time:162732ms step_avg:156.77ms step:1049/1480 train_time:162897ms step_avg:156.78ms step:1050/1480 train_time:163066ms step_avg:156.79ms step:1051/1480 train_time:163234ms step_avg:156.81ms step:1052/1480 train_time:163403ms step_avg:156.82ms step:1053/1480 train_time:163569ms step_avg:156.83ms step:1054/1480 train_time:163737ms step_avg:156.84ms step:1055/1480 train_time:163904ms step_avg:156.85ms step:1056/1480 train_time:164068ms step_avg:156.85ms step:1057/1480 train_time:164235ms step_avg:156.86ms step:1058/1480 train_time:164405ms step_avg:156.88ms step:1059/1480 train_time:164578ms step_avg:156.89ms step:1060/1480 train_time:164746ms step_avg:156.90ms step:1061/1480 train_time:164909ms step_avg:156.91ms step:1062/1480 train_time:165076ms step_avg:156.92ms step:1063/1480 train_time:165242ms step_avg:156.93ms step:1064/1480 train_time:165405ms step_avg:156.93ms step:1065/1480 train_time:165573ms step_avg:156.94ms 
step:1066/1480 train_time:165741ms step_avg:156.95ms step:1067/1480 train_time:165910ms step_avg:156.96ms step:1068/1480 train_time:166076ms step_avg:156.97ms step:1069/1480 train_time:166247ms step_avg:156.98ms step:1070/1480 train_time:166412ms step_avg:156.99ms step:1071/1480 train_time:166585ms step_avg:157.01ms step:1072/1480 train_time:166750ms step_avg:157.02ms step:1073/1480 train_time:166914ms step_avg:157.02ms step:1074/1480 train_time:167083ms step_avg:157.03ms step:1075/1480 train_time:167256ms step_avg:157.05ms step:1076/1480 train_time:167424ms step_avg:157.06ms step:1077/1480 train_time:167590ms step_avg:157.07ms step:1078/1480 train_time:167765ms step_avg:157.08ms step:1079/1480 train_time:167939ms step_avg:157.10ms step:1080/1480 train_time:168110ms step_avg:157.11ms step:1081/1480 train_time:168277ms step_avg:157.12ms step:1082/1480 train_time:168444ms step_avg:157.13ms step:1083/1480 train_time:168612ms step_avg:157.14ms step:1084/1480 train_time:168780ms step_avg:157.15ms step:1085/1480 train_time:168948ms step_avg:157.16ms step:1086/1480 train_time:169116ms step_avg:157.17ms step:1087/1480 train_time:169283ms step_avg:157.18ms step:1088/1480 train_time:169453ms step_avg:157.19ms step:1089/1480 train_time:169627ms step_avg:157.21ms step:1090/1480 train_time:169800ms step_avg:157.22ms step:1091/1480 train_time:169967ms step_avg:157.23ms step:1092/1480 train_time:170136ms step_avg:157.24ms step:1093/1480 train_time:170304ms step_avg:157.25ms step:1094/1480 train_time:170469ms step_avg:157.26ms step:1095/1480 train_time:170634ms step_avg:157.27ms step:1096/1480 train_time:170804ms step_avg:157.28ms step:1097/1480 train_time:170972ms step_avg:157.29ms step:1098/1480 train_time:171143ms step_avg:157.30ms step:1099/1480 train_time:171314ms step_avg:157.31ms step:1100/1480 train_time:171486ms step_avg:157.33ms step:1101/1480 train_time:171658ms step_avg:157.34ms step:1102/1480 train_time:171830ms step_avg:157.35ms step:1103/1480 train_time:172008ms 
step_avg:157.37ms step:1104/1480 train_time:172177ms step_avg:157.38ms step:1105/1480 train_time:172346ms step_avg:157.39ms step:1106/1480 train_time:172513ms step_avg:157.40ms step:1107/1480 train_time:172682ms step_avg:157.41ms step:1108/1480 train_time:172847ms step_avg:157.42ms step:1109/1480 train_time:173014ms step_avg:157.43ms step:1110/1480 train_time:173180ms step_avg:157.44ms step:1111/1480 train_time:173347ms step_avg:157.44ms step:1112/1480 train_time:173517ms step_avg:157.46ms step:1113/1480 train_time:173696ms step_avg:157.48ms step:1114/1480 train_time:173868ms step_avg:157.49ms step:1115/1480 train_time:174041ms step_avg:157.50ms step:1116/1480 train_time:174208ms step_avg:157.51ms step:1117/1480 train_time:174383ms step_avg:157.53ms step:1118/1480 train_time:174557ms step_avg:157.54ms step:1119/1480 train_time:174724ms step_avg:157.55ms step:1120/1480 train_time:174891ms step_avg:157.56ms step:1121/1480 train_time:175063ms step_avg:157.57ms step:1122/1480 train_time:175230ms step_avg:157.58ms step:1123/1480 train_time:175397ms step_avg:157.59ms step:1124/1480 train_time:175566ms step_avg:157.60ms step:1125/1480 train_time:175732ms step_avg:157.61ms step:1125/1480 val_loss:3.3835 train_time:175800ms step_avg:157.67ms step:1126/1480 train_time:175903ms step_avg:157.62ms step:1127/1480 train_time:176073ms step_avg:157.63ms step:1128/1480 train_time:176245ms step_avg:157.64ms step:1129/1480 train_time:176418ms step_avg:157.66ms step:1130/1480 train_time:176587ms step_avg:157.67ms step:1131/1480 train_time:176763ms step_avg:157.68ms step:1132/1480 train_time:176929ms step_avg:157.69ms step:1133/1480 train_time:177100ms step_avg:157.70ms step:1134/1480 train_time:177271ms step_avg:157.71ms step:1135/1480 train_time:177440ms step_avg:157.72ms step:1136/1480 train_time:177610ms step_avg:157.74ms step:1137/1480 train_time:177780ms step_avg:157.75ms step:1138/1480 train_time:177951ms step_avg:157.76ms step:1139/1480 train_time:178119ms step_avg:157.77ms 
step:1140/1480 train_time:178287ms step_avg:157.78ms step:1141/1480 train_time:178459ms step_avg:157.79ms step:1142/1480 train_time:178629ms step_avg:157.80ms step:1143/1480 train_time:178799ms step_avg:157.81ms step:1144/1480 train_time:178969ms step_avg:157.82ms step:1145/1480 train_time:179135ms step_avg:157.83ms step:1146/1480 train_time:179306ms step_avg:157.84ms step:1147/1480 train_time:179473ms step_avg:157.85ms step:1148/1480 train_time:179642ms step_avg:157.86ms step:1149/1480 train_time:179812ms step_avg:157.87ms step:1150/1480 train_time:179980ms step_avg:157.88ms step:1151/1480 train_time:180152ms step_avg:157.89ms step:1152/1480 train_time:180324ms step_avg:157.90ms step:1153/1480 train_time:180498ms step_avg:157.92ms step:1154/1480 train_time:180665ms step_avg:157.92ms step:1155/1480 train_time:180837ms step_avg:157.94ms step:1156/1480 train_time:181016ms step_avg:157.95ms step:1157/1480 train_time:181185ms step_avg:157.96ms step:1158/1480 train_time:181353ms step_avg:157.97ms step:1159/1480 train_time:181521ms step_avg:157.98ms step:1160/1480 train_time:181687ms step_avg:157.99ms step:1161/1480 train_time:181856ms step_avg:158.00ms step:1162/1480 train_time:182027ms step_avg:158.01ms step:1163/1480 train_time:182195ms step_avg:158.02ms step:1164/1480 train_time:182364ms step_avg:158.03ms step:1165/1480 train_time:182529ms step_avg:158.03ms step:1166/1480 train_time:182699ms step_avg:158.04ms step:1167/1480 train_time:182869ms step_avg:158.05ms step:1168/1480 train_time:183039ms step_avg:158.07ms step:1169/1480 train_time:183209ms step_avg:158.07ms step:1170/1480 train_time:183378ms step_avg:158.08ms step:1171/1480 train_time:183545ms step_avg:158.09ms step:1172/1480 train_time:183710ms step_avg:158.10ms step:1173/1480 train_time:183882ms step_avg:158.11ms step:1174/1480 train_time:184064ms step_avg:158.13ms step:1175/1480 train_time:184235ms step_avg:158.14ms step:1176/1480 train_time:184408ms step_avg:158.15ms step:1177/1480 train_time:184583ms 
step_avg:158.17ms step:1178/1480 train_time:184751ms step_avg:158.18ms step:1179/1480 train_time:184915ms step_avg:158.18ms step:1180/1480 train_time:185097ms step_avg:158.20ms step:1181/1480 train_time:185266ms step_avg:158.21ms step:1182/1480 train_time:185434ms step_avg:158.22ms step:1183/1480 train_time:185603ms step_avg:158.23ms step:1184/1480 train_time:185770ms step_avg:158.24ms step:1185/1480 train_time:185943ms step_avg:158.25ms step:1186/1480 train_time:186115ms step_avg:158.26ms step:1187/1480 train_time:186296ms step_avg:158.28ms step:1188/1480 train_time:186463ms step_avg:158.29ms step:1189/1480 train_time:186633ms step_avg:158.30ms step:1190/1480 train_time:186801ms step_avg:158.31ms step:1191/1480 train_time:186972ms step_avg:158.32ms step:1192/1480 train_time:187138ms step_avg:158.32ms step:1193/1480 train_time:187307ms step_avg:158.33ms step:1194/1480 train_time:187474ms step_avg:158.34ms step:1195/1480 train_time:187649ms step_avg:158.35ms step:1196/1480 train_time:187830ms step_avg:158.37ms step:1197/1480 train_time:188002ms step_avg:158.38ms step:1198/1480 train_time:188187ms step_avg:158.41ms step:1199/1480 train_time:188356ms step_avg:158.42ms step:1200/1480 train_time:188526ms step_avg:158.43ms step:1201/1480 train_time:188693ms step_avg:158.43ms step:1202/1480 train_time:188875ms step_avg:158.45ms step:1203/1480 train_time:189050ms step_avg:158.47ms step:1204/1480 train_time:189225ms step_avg:158.48ms step:1205/1480 train_time:189393ms step_avg:158.49ms step:1206/1480 train_time:189561ms step_avg:158.50ms step:1207/1480 train_time:189732ms step_avg:158.51ms step:1208/1480 train_time:189899ms step_avg:158.51ms step:1209/1480 train_time:190072ms step_avg:158.53ms step:1210/1480 train_time:190248ms step_avg:158.54ms step:1211/1480 train_time:190421ms step_avg:158.55ms step:1212/1480 train_time:190591ms step_avg:158.56ms step:1213/1480 train_time:190764ms step_avg:158.57ms step:1214/1480 train_time:190940ms step_avg:158.59ms step:1215/1480 
train_time:191112ms step_avg:158.60ms step:1216/1480 train_time:191282ms step_avg:158.61ms step:1217/1480 train_time:191457ms step_avg:158.62ms step:1218/1480 train_time:191627ms step_avg:158.63ms step:1219/1480 train_time:191807ms step_avg:158.65ms step:1220/1480 train_time:191975ms step_avg:158.66ms step:1221/1480 train_time:192146ms step_avg:158.67ms step:1222/1480 train_time:192311ms step_avg:158.67ms step:1223/1480 train_time:192481ms step_avg:158.68ms step:1224/1480 train_time:192657ms step_avg:158.70ms step:1225/1480 train_time:192828ms step_avg:158.71ms step:1226/1480 train_time:193003ms step_avg:158.72ms step:1227/1480 train_time:193177ms step_avg:158.73ms step:1228/1480 train_time:193346ms step_avg:158.74ms step:1229/1480 train_time:193518ms step_avg:158.75ms step:1230/1480 train_time:193696ms step_avg:158.77ms step:1231/1480 train_time:193871ms step_avg:158.78ms step:1232/1480 train_time:194048ms step_avg:158.80ms step:1233/1480 train_time:194216ms step_avg:158.80ms step:1234/1480 train_time:194386ms step_avg:158.81ms step:1235/1480 train_time:194560ms step_avg:158.82ms step:1236/1480 train_time:194730ms step_avg:158.83ms step:1237/1480 train_time:194899ms step_avg:158.84ms step:1238/1480 train_time:195084ms step_avg:158.86ms step:1239/1480 train_time:195255ms step_avg:158.87ms step:1240/1480 train_time:195426ms step_avg:158.88ms step:1241/1480 train_time:195599ms step_avg:158.89ms step:1242/1480 train_time:195768ms step_avg:158.90ms step:1243/1480 train_time:195941ms step_avg:158.91ms step:1244/1480 train_time:196108ms step_avg:158.92ms step:1245/1480 train_time:196278ms step_avg:158.93ms step:1246/1480 train_time:196448ms step_avg:158.94ms step:1247/1480 train_time:196617ms step_avg:158.95ms step:1248/1480 train_time:196786ms step_avg:158.95ms step:1249/1480 train_time:196954ms step_avg:158.96ms step:1250/1480 train_time:197124ms step_avg:158.97ms step:1250/1480 val_loss:3.3338 train_time:197196ms step_avg:159.03ms step:1251/1480 train_time:197302ms 
step_avg:158.99ms step:1252/1480 train_time:197471ms step_avg:158.99ms step:1253/1480 train_time:197639ms step_avg:159.00ms step:1254/1480 train_time:197811ms step_avg:159.01ms step:1255/1480 train_time:197996ms step_avg:159.03ms step:1256/1480 train_time:198171ms step_avg:159.05ms step:1257/1480 train_time:198341ms step_avg:159.05ms step:1258/1480 train_time:198515ms step_avg:159.07ms step:1259/1480 train_time:198687ms step_avg:159.08ms step:1260/1480 train_time:198854ms step_avg:159.08ms step:1261/1480 train_time:199025ms step_avg:159.09ms step:1262/1480 train_time:199201ms step_avg:159.11ms step:1263/1480 train_time:199375ms step_avg:159.12ms step:1264/1480 train_time:199540ms step_avg:159.12ms step:1265/1480 train_time:199708ms step_avg:159.13ms step:1266/1480 train_time:199879ms step_avg:159.14ms step:1267/1480 train_time:200050ms step_avg:159.15ms step:1268/1480 train_time:200221ms step_avg:159.16ms step:1269/1480 train_time:200396ms step_avg:159.17ms step:1270/1480 train_time:200566ms step_avg:159.18ms step:1271/1480 train_time:200737ms step_avg:159.19ms step:1272/1480 train_time:200903ms step_avg:159.19ms step:1273/1480 train_time:201074ms step_avg:159.20ms step:1274/1480 train_time:201247ms step_avg:159.21ms step:1275/1480 train_time:201413ms step_avg:159.22ms step:1276/1480 train_time:201580ms step_avg:159.23ms step:1277/1480 train_time:201752ms step_avg:159.24ms step:1278/1480 train_time:201920ms step_avg:159.24ms step:1279/1480 train_time:202093ms step_avg:159.25ms step:1280/1480 train_time:202273ms step_avg:159.27ms step:1281/1480 train_time:202441ms step_avg:159.28ms step:1282/1480 train_time:202607ms step_avg:159.28ms step:1283/1480 train_time:202778ms step_avg:159.29ms step:1284/1480 train_time:202948ms step_avg:159.30ms step:1285/1480 train_time:203116ms step_avg:159.31ms step:1286/1480 train_time:203286ms step_avg:159.32ms step:1287/1480 train_time:203459ms step_avg:159.33ms step:1288/1480 train_time:203633ms step_avg:159.34ms step:1289/1480 
train_time:203817ms step_avg:159.36ms step:1290/1480 train_time:203997ms step_avg:159.37ms step:1291/1480 train_time:204171ms step_avg:159.38ms step:1292/1480 train_time:204343ms step_avg:159.39ms step:1293/1480 train_time:204518ms step_avg:159.41ms step:1294/1480 train_time:204689ms step_avg:159.41ms step:1295/1480 train_time:204859ms step_avg:159.42ms step:1296/1480 train_time:205033ms step_avg:159.44ms step:1297/1480 train_time:205204ms step_avg:159.44ms step:1298/1480 train_time:205374ms step_avg:159.45ms step:1299/1480 train_time:205545ms step_avg:159.46ms step:1300/1480 train_time:205715ms step_avg:159.47ms step:1301/1480 train_time:205884ms step_avg:159.48ms step:1302/1480 train_time:206058ms step_avg:159.49ms step:1303/1480 train_time:206235ms step_avg:159.50ms step:1304/1480 train_time:206408ms step_avg:159.51ms step:1305/1480 train_time:206576ms step_avg:159.52ms step:1306/1480 train_time:206750ms step_avg:159.53ms step:1307/1480 train_time:206918ms step_avg:159.54ms step:1308/1480 train_time:207087ms step_avg:159.54ms step:1309/1480 train_time:207261ms step_avg:159.55ms step:1310/1480 train_time:207431ms step_avg:159.56ms step:1311/1480 train_time:207599ms step_avg:159.57ms step:1312/1480 train_time:207774ms step_avg:159.58ms step:1313/1480 train_time:207942ms step_avg:159.59ms step:1314/1480 train_time:208116ms step_avg:159.60ms step:1315/1480 train_time:208286ms step_avg:159.61ms step:1316/1480 train_time:208454ms step_avg:159.61ms step:1317/1480 train_time:208625ms step_avg:159.62ms step:1318/1480 train_time:208804ms step_avg:159.64ms step:1319/1480 train_time:208980ms step_avg:159.65ms step:1320/1480 train_time:209157ms step_avg:159.66ms step:1321/1480 train_time:209328ms step_avg:159.67ms step:1322/1480 train_time:209510ms step_avg:159.69ms step:1323/1480 train_time:209681ms step_avg:159.70ms step:1324/1480 train_time:209857ms step_avg:159.71ms step:1325/1480 train_time:210039ms step_avg:159.73ms step:1326/1480 train_time:210214ms step_avg:159.74ms 
step:1327/1480 train_time:210385ms step_avg:159.75ms step:1328/1480 train_time:210557ms step_avg:159.75ms step:1329/1480 train_time:210753ms step_avg:159.78ms step:1330/1480 train_time:210934ms step_avg:159.80ms step:1331/1480 train_time:211105ms step_avg:159.81ms step:1332/1480 train_time:211280ms step_avg:159.82ms step:1333/1480 train_time:211457ms step_avg:159.83ms step:1334/1480 train_time:211630ms step_avg:159.84ms step:1335/1480 train_time:211798ms step_avg:159.85ms step:1336/1480 train_time:211984ms step_avg:159.87ms step:1337/1480 train_time:212159ms step_avg:159.88ms step:1338/1480 train_time:212333ms step_avg:159.89ms step:1339/1480 train_time:212505ms step_avg:159.90ms step:1340/1480 train_time:212678ms step_avg:159.91ms step:1341/1480 train_time:212848ms step_avg:159.92ms step:1342/1480 train_time:213020ms step_avg:159.92ms step:1343/1480 train_time:213192ms step_avg:159.93ms step:1344/1480 train_time:213363ms step_avg:159.94ms step:1345/1480 train_time:213543ms step_avg:159.96ms step:1346/1480 train_time:213713ms step_avg:159.96ms step:1347/1480 train_time:213883ms step_avg:159.97ms step:1348/1480 train_time:214053ms step_avg:159.98ms step:1349/1480 train_time:214222ms step_avg:159.99ms step:1350/1480 train_time:214397ms step_avg:160.00ms step:1351/1480 train_time:214569ms step_avg:160.01ms step:1352/1480 train_time:214739ms step_avg:160.01ms step:1353/1480 train_time:214916ms step_avg:160.03ms step:1354/1480 train_time:215087ms step_avg:160.04ms step:1355/1480 train_time:215257ms step_avg:160.04ms step:1356/1480 train_time:215431ms step_avg:160.05ms step:1357/1480 train_time:215602ms step_avg:160.06ms step:1358/1480 train_time:215776ms step_avg:160.07ms step:1359/1480 train_time:215948ms step_avg:160.08ms step:1360/1480 train_time:216121ms step_avg:160.09ms step:1361/1480 train_time:216299ms step_avg:160.10ms step:1362/1480 train_time:216475ms step_avg:160.11ms step:1363/1480 train_time:216655ms step_avg:160.13ms step:1364/1480 train_time:216823ms 
step_avg:160.14ms step:1365/1480 train_time:216990ms step_avg:160.14ms step:1366/1480 train_time:217162ms step_avg:160.15ms step:1367/1480 train_time:217334ms step_avg:160.16ms step:1368/1480 train_time:217508ms step_avg:160.17ms step:1369/1480 train_time:217690ms step_avg:160.18ms step:1370/1480 train_time:217870ms step_avg:160.20ms step:1371/1480 train_time:218040ms step_avg:160.21ms step:1372/1480 train_time:218217ms step_avg:160.22ms step:1373/1480 train_time:218386ms step_avg:160.22ms step:1374/1480 train_time:218563ms step_avg:160.24ms step:1375/1480 train_time:218734ms step_avg:160.24ms step:1375/1480 val_loss:3.2950 train_time:218801ms step_avg:160.29ms step:1376/1480 train_time:218908ms step_avg:160.25ms step:1377/1480 train_time:219081ms step_avg:160.26ms step:1378/1480 train_time:219250ms step_avg:160.27ms step:1379/1480 train_time:219424ms step_avg:160.28ms step:1380/1480 train_time:219597ms step_avg:160.29ms step:1381/1480 train_time:219777ms step_avg:160.30ms step:1382/1480 train_time:219948ms step_avg:160.31ms step:1383/1480 train_time:220122ms step_avg:160.32ms step:1384/1480 train_time:220300ms step_avg:160.33ms step:1385/1480 train_time:220466ms step_avg:160.34ms step:1386/1480 train_time:220636ms step_avg:160.35ms step:1387/1480 train_time:220807ms step_avg:160.35ms step:1388/1480 train_time:220977ms step_avg:160.36ms step:1389/1480 train_time:221151ms step_avg:160.37ms step:1390/1480 train_time:221318ms step_avg:160.38ms step:1391/1480 train_time:221489ms step_avg:160.38ms step:1392/1480 train_time:221663ms step_avg:160.39ms step:1393/1480 train_time:221833ms step_avg:160.40ms step:1394/1480 train_time:222003ms step_avg:160.41ms step:1395/1480 train_time:222173ms step_avg:160.41ms step:1396/1480 train_time:222343ms step_avg:160.42ms step:1397/1480 train_time:222511ms step_avg:160.43ms step:1398/1480 train_time:222678ms step_avg:160.43ms step:1399/1480 train_time:222846ms step_avg:160.44ms step:1400/1480 train_time:223022ms step_avg:160.45ms 
step:1401/1480 train_time:223188ms step_avg:160.45ms step:1402/1480 train_time:223359ms step_avg:160.46ms step:1403/1480 train_time:223535ms step_avg:160.47ms step:1404/1480 train_time:223707ms step_avg:160.48ms step:1405/1480 train_time:223883ms step_avg:160.49ms step:1406/1480 train_time:224058ms step_avg:160.50ms step:1407/1480 train_time:224227ms step_avg:160.51ms step:1408/1480 train_time:224395ms step_avg:160.51ms step:1409/1480 train_time:224580ms step_avg:160.53ms step:1410/1480 train_time:224749ms step_avg:160.53ms step:1411/1480 train_time:224917ms step_avg:160.54ms step:1412/1480 train_time:225088ms step_avg:160.55ms step:1413/1480 train_time:225258ms step_avg:160.55ms step:1414/1480 train_time:225429ms step_avg:160.56ms step:1415/1480 train_time:225603ms step_avg:160.57ms step:1416/1480 train_time:225792ms step_avg:160.59ms step:1417/1480 train_time:225966ms step_avg:160.60ms step:1418/1480 train_time:226138ms step_avg:160.61ms step:1419/1480 train_time:226313ms step_avg:160.62ms step:1420/1480 train_time:226488ms step_avg:160.63ms step:1421/1480 train_time:226660ms step_avg:160.64ms step:1422/1480 train_time:226832ms step_avg:160.65ms step:1423/1480 train_time:227000ms step_avg:160.65ms step:1424/1480 train_time:227177ms step_avg:160.66ms step:1425/1480 train_time:227358ms step_avg:160.68ms step:1426/1480 train_time:227529ms step_avg:160.68ms step:1427/1480 train_time:227706ms step_avg:160.70ms step:1428/1480 train_time:227877ms step_avg:160.70ms step:1429/1480 train_time:228045ms step_avg:160.71ms step:1430/1480 train_time:228218ms step_avg:160.72ms step:1431/1480 train_time:228393ms step_avg:160.73ms step:1432/1480 train_time:228570ms step_avg:160.74ms step:1433/1480 train_time:228749ms step_avg:160.75ms step:1434/1480 train_time:228930ms step_avg:160.77ms step:1435/1480 train_time:229105ms step_avg:160.78ms step:1436/1480 train_time:229278ms step_avg:160.78ms step:1437/1480 train_time:229449ms step_avg:160.79ms step:1438/1480 train_time:229617ms 
step_avg:160.80ms step:1439/1480 train_time:229791ms step_avg:160.81ms step:1440/1480 train_time:229961ms step_avg:160.81ms step:1441/1480 train_time:230131ms step_avg:160.82ms step:1442/1480 train_time:230309ms step_avg:160.83ms step:1443/1480 train_time:230497ms step_avg:160.85ms step:1444/1480 train_time:230669ms step_avg:160.86ms step:1445/1480 train_time:230841ms step_avg:160.86ms step:1446/1480 train_time:231017ms step_avg:160.88ms step:1447/1480 train_time:231195ms step_avg:160.89ms step:1448/1480 train_time:231368ms step_avg:160.90ms step:1449/1480 train_time:231541ms step_avg:160.90ms step:1450/1480 train_time:231712ms step_avg:160.91ms step:1451/1480 train_time:231885ms step_avg:160.92ms step:1452/1480 train_time:232058ms step_avg:160.93ms step:1453/1480 train_time:232227ms step_avg:160.93ms step:1454/1480 train_time:232400ms step_avg:160.94ms step:1455/1480 train_time:232579ms step_avg:160.95ms step:1456/1480 train_time:232752ms step_avg:160.96ms step:1457/1480 train_time:232923ms step_avg:160.97ms step:1458/1480 train_time:233094ms step_avg:160.98ms step:1459/1480 train_time:233271ms step_avg:160.99ms step:1460/1480 train_time:233444ms step_avg:161.00ms step:1461/1480 train_time:233619ms step_avg:161.01ms step:1462/1480 train_time:233791ms step_avg:161.01ms step:1463/1480 train_time:233969ms step_avg:161.02ms step:1464/1480 train_time:234143ms step_avg:161.03ms step:1465/1480 train_time:234315ms step_avg:161.04ms step:1466/1480 train_time:234486ms step_avg:161.05ms step:1467/1480 train_time:234660ms step_avg:161.06ms step:1468/1480 train_time:234830ms step_avg:161.06ms step:1469/1480 train_time:235004ms step_avg:161.07ms step:1470/1480 train_time:235185ms step_avg:161.09ms step:1471/1480 train_time:235372ms step_avg:161.10ms step:1472/1480 train_time:235552ms step_avg:161.12ms step:1473/1480 train_time:235723ms step_avg:161.12ms step:1474/1480 train_time:235901ms step_avg:161.13ms step:1475/1480 train_time:236081ms step_avg:161.15ms step:1476/1480 
train_time:236252ms step_avg:161.15ms step:1477/1480 train_time:236434ms step_avg:161.17ms step:1478/1480 train_time:236618ms step_avg:161.18ms step:1479/1480 train_time:236791ms step_avg:161.19ms step:1480/1480 train_time:236963ms step_avg:161.20ms step:1480/1480 val_loss:3.2759 train_time:237034ms step_avg:161.25ms