import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G exactly through its SVD, returning U @ V.T.

    `steps` is ignored; it is accepted only so that this function can be called
    the same way as the iterative backend (see `zeropower_backends`).
    """
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor (asserted).
        steps: Number of Newton-Schulz iterations to run.
        eps: Added to the norm before dividing, to avoid division by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # iterate on the wide orientation, so that A = X @ X.T is the smaller of the two Gram matrices
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose so that the result has the shape of G again
    if G.size(0) > G.size(1):
        X = X.T
    return X

# name -> orthogonalization function; selected through the `backend` optimizer argument
zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Distributed operation (as implemented below):
    - The world size and this process's rank are read from the WORLD_SIZE and RANK environment
    variables at construction time.
    - Parameters are grouped by element count; `step` asserts that the number of parameters in
    every group is a multiple of the world size.

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        self.num_process = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ["RANK"])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        params: "list[torch.Tensor]" = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct element count, so that every parameter of a group fits
        # the group's fixed-size flat gather buffers
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                "params": [p for p in params if p.numel() == size],
                # one flat bfloat16 CUDA buffer per process; used as the all_gather output in step()
                "update_buffer": [
                    torch.empty(size, device="cuda", dtype=torch.bfloat16)
                    for _ in range(self.num_process)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter of every group.

        Within a group the parameters are processed in chunks of `num_process`. For each chunk,
        this rank computes the orthogonalized momentum update of the single parameter
        `params[base_i + rank]` and contributes it to an asynchronous all_gather. The gathered
        updates of the previous chunk are applied to the parameters right before the next gather
        is launched, because the gather buffers are reused from chunk to chunk.
        """
        for group in self.param_groups:
            lr: float = group["lr"]
            momentum: float = group["momentum"]
            nesterov: bool = group["nesterov"]
            zeropower_backend = zeropower_backends[group["backend"]]
            backend_steps: int = group["backend_steps"]
            update_buffers: "list[torch.Tensor]" = group["update_buffer"]
            # generate weight updates in distributed fashion
            params: "list[torch.Tensor]" = group["params"]
            assert len(params) % self.num_process == 0
            # state of the in-flight gather: its work handle and the parameters it belongs to
            handle = None
            params_world = None
            def update_prev():
                # apply the updates of the previously launched gather, if there is one
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # step size is scaled by sqrt(rows / cols) for matrices with more rows than columns
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.num_process]:
                # only the gradient of this rank's parameter of the chunk is read here
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if "momentum_buffer" not in state:
                    state["momentum_buffer"] = torch.zeros_like(g)
                buf: torch.Tensor = state["momentum_buffer"]
                # buf <- momentum * buf + (1 - momentum) * g
                buf.lerp_(g, 1 - momentum)
                # nesterov: g <- (1 - momentum) * g + momentum * buf, written in place into p.grad
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_backend(g, steps=backend_steps).flatten()
                # the buffers are about to be overwritten: consume the previous gather first
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.num_process]
            # apply the last chunk of the group
            update_prev()
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the dtype of the input on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied over the last dimension of a 4D tensor.

    The cos/sin tables are cached and recomputed only when the sequence length
    (x.shape[1]) changes.
    NOTE(review): the cache is keyed on the sequence length alone, not on the device.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of channels: base ** (-2i / dim)
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # add singleton axes 0 and 2 so that the tables broadcast against x
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        # the two halves of the last dimension are rotated against each other
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention through FlexAttention, with QK-norm, rotary embeddings and a
    learned mix of the layer's own values with an externally supplied value embedding `vi`.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        # project and split into heads: (B, T, n_head, head_dim)
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head axis moved in front of the sequence axis
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation in between."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: a learned mix of the running stream `x` with the initial embedding
    `x0`, followed by pre-norm attention and pre-norm MLP, each added residually.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialized to (1, 0): the block starts out using x alone
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) with tanh in GPT.forward

class GPT(nn.Module):
    """
    GPT model with a U-net layout: the first n_layer // 2 blocks ("encoder") store their outputs,
    which the remaining blocks ("decoder") add back in reverse order with learned weights.
    `forward` takes a single flat token sequence and returns the cross-entropy loss.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            idx: 1D tensor of input token ids (asserted); it is reshaped into blocks of 128
                below, so its length has to be a multiple of 128.
            target: token ids the logits are scored against.
            sliding_window: tensor holding the attention window size, in tokens.

        Returns the mean cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token 50256 -> a per-token document id
        # (presumably 50256 is the GPT-2 end-of-text token -- TODO confirm)
        docs = (idx == 50256).cumsum(0)
        # document id of the first / last token of every BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the rule above: a (query block, kv block) pair is kept when
            # it is causal, the document-id ranges of the two blocks overlap, and the blocks are
            # less than ceil(sliding_window / BLOCK_SIZE) blocks apart.
            # NOTE(review): the device is hard-coded to "cuda" rather than taken from idx.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort of a 0/1 mask: indices of the kept kv blocks come first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton axes in front of the query-block axis
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one n_embd-wide value embedding per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        # NOTE(review): for odd n_layer the decoder has one layer more than the encoder, so the
        # last pop() below would run on an empty list; only even n_layer works as written.
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it declares."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read the `ntok` uint16 tokens that follow the header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header: 256 int32 values
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (input, target) token sequences of length T from a sorted list of shard files.

    Every process reads its own T+1 token slice: positions start at process_rank * T and
    advance by T * num_processes per batch. Shards are cycled through in order, wrapping
    around after the last one.
    """
    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        # (the pattern is resolved relative to the current working directory)
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one batch across all processes, plus the shifted target
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # -1 so that advance() lands on shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) as CUDA tensors of T tokens each; targets are inputs shifted by one."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
# (RANK, LOCAL_RANK and WORLD_SIZE are all read from the environment below)
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # The per-run directory is not needed while checkpoint saving is disabled, but the
    # parent directory must exist, otherwise opening the log file below fails with
    # FileNotFoundError on a fresh checkout.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """
    Log a line from the master process only; a no-op on every other rank.

    The line is appended to the run's log file, and also printed to stdout
    unless `logonly` is set.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not implemented by the training loop: exactly one sequence per process
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches each following one after its backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 08:33:13 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 125W / 700W | 47MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 123W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 97W / 700W | 27MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 74W / 700W | 24MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 111W / 700W | 35MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 115W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI 
PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23386ms step_avg:nanms step:2/1480 train_time:23472ms step_avg:nanms step:3/1480 train_time:23611ms step_avg:nanms step:4/1480 train_time:23752ms step_avg:nanms step:5/1480 train_time:23893ms step_avg:nanms step:6/1480 train_time:24034ms step_avg:nanms step:7/1480 train_time:24174ms step_avg:nanms step:8/1480 train_time:24317ms step_avg:nanms step:9/1480 train_time:24462ms step_avg:nanms step:10/1480 train_time:24609ms step_avg:nanms step:11/1480 train_time:140ms step_avg:nanms step:12/1480 train_time:281ms step_avg:nanms step:13/1480 train_time:423ms step_avg:140.92ms step:14/1480 train_time:564ms step_avg:141.08ms step:15/1480 train_time:707ms step_avg:141.30ms step:16/1480 train_time:849ms step_avg:141.54ms step:17/1480 train_time:993ms step_avg:141.86ms step:18/1480 train_time:1136ms step_avg:142.06ms step:19/1480 train_time:1280ms step_avg:142.26ms step:20/1480 train_time:1422ms step_avg:142.21ms step:21/1480 train_time:1563ms step_avg:142.08ms step:22/1480 train_time:1704ms step_avg:142.03ms step:23/1480 train_time:1847ms step_avg:142.08ms step:24/1480 train_time:1989ms step_avg:142.08ms step:25/1480 train_time:2132ms step_avg:142.13ms step:26/1480 train_time:2278ms step_avg:142.35ms step:27/1480 train_time:2421ms step_avg:142.41ms step:28/1480 train_time:2562ms step_avg:142.32ms step:29/1480 train_time:2704ms 
step_avg:142.31ms step:30/1480 train_time:2846ms step_avg:142.32ms step:31/1480 train_time:2987ms step_avg:142.26ms step:32/1480 train_time:3131ms step_avg:142.30ms step:33/1480 train_time:3277ms step_avg:142.50ms step:34/1480 train_time:3421ms step_avg:142.54ms step:35/1480 train_time:3562ms step_avg:142.48ms step:36/1480 train_time:3705ms step_avg:142.49ms step:37/1480 train_time:3847ms step_avg:142.47ms step:38/1480 train_time:3988ms step_avg:142.43ms step:39/1480 train_time:4131ms step_avg:142.44ms step:40/1480 train_time:4275ms step_avg:142.51ms step:41/1480 train_time:4420ms step_avg:142.58ms step:42/1480 train_time:4563ms step_avg:142.59ms step:43/1480 train_time:4706ms step_avg:142.59ms step:44/1480 train_time:4848ms step_avg:142.59ms step:45/1480 train_time:4990ms step_avg:142.57ms step:46/1480 train_time:5131ms step_avg:142.52ms step:47/1480 train_time:5275ms step_avg:142.56ms step:48/1480 train_time:5420ms step_avg:142.64ms step:49/1480 train_time:5563ms step_avg:142.64ms step:50/1480 train_time:5706ms step_avg:142.65ms step:51/1480 train_time:5848ms step_avg:142.64ms step:52/1480 train_time:5992ms step_avg:142.68ms step:53/1480 train_time:6135ms step_avg:142.66ms step:54/1480 train_time:6278ms step_avg:142.68ms step:55/1480 train_time:6421ms step_avg:142.70ms step:56/1480 train_time:6564ms step_avg:142.69ms step:57/1480 train_time:6707ms step_avg:142.71ms step:58/1480 train_time:6851ms step_avg:142.74ms step:59/1480 train_time:6996ms step_avg:142.77ms step:60/1480 train_time:7138ms step_avg:142.76ms step:61/1480 train_time:7283ms step_avg:142.80ms step:62/1480 train_time:7426ms step_avg:142.81ms step:63/1480 train_time:7569ms step_avg:142.81ms step:64/1480 train_time:7713ms step_avg:142.83ms step:65/1480 train_time:7855ms step_avg:142.82ms step:66/1480 train_time:8000ms step_avg:142.86ms step:67/1480 train_time:8144ms step_avg:142.87ms step:68/1480 train_time:8285ms step_avg:142.85ms step:69/1480 train_time:8426ms step_avg:142.81ms step:70/1480 
train_time:8567ms step_avg:142.78ms step:71/1480 train_time:8709ms step_avg:142.78ms step:72/1480 train_time:8853ms step_avg:142.79ms step:73/1480 train_time:8997ms step_avg:142.82ms step:74/1480 train_time:9141ms step_avg:142.83ms step:75/1480 train_time:9284ms step_avg:142.83ms step:76/1480 train_time:9426ms step_avg:142.82ms step:77/1480 train_time:9568ms step_avg:142.80ms step:78/1480 train_time:9710ms step_avg:142.80ms step:79/1480 train_time:9853ms step_avg:142.80ms step:80/1480 train_time:9997ms step_avg:142.81ms step:81/1480 train_time:10139ms step_avg:142.81ms step:82/1480 train_time:10283ms step_avg:142.81ms step:83/1480 train_time:10425ms step_avg:142.80ms step:84/1480 train_time:10564ms step_avg:142.76ms step:85/1480 train_time:10708ms step_avg:142.77ms step:86/1480 train_time:10851ms step_avg:142.78ms step:87/1480 train_time:10993ms step_avg:142.77ms step:88/1480 train_time:11136ms step_avg:142.76ms step:89/1480 train_time:11279ms step_avg:142.77ms step:90/1480 train_time:11421ms step_avg:142.76ms step:91/1480 train_time:11562ms step_avg:142.74ms step:92/1480 train_time:11706ms step_avg:142.75ms step:93/1480 train_time:11849ms step_avg:142.76ms step:94/1480 train_time:11993ms step_avg:142.77ms step:95/1480 train_time:12134ms step_avg:142.75ms step:96/1480 train_time:12279ms step_avg:142.78ms step:97/1480 train_time:12422ms step_avg:142.78ms step:98/1480 train_time:12562ms step_avg:142.75ms step:99/1480 train_time:12704ms step_avg:142.74ms step:100/1480 train_time:12847ms step_avg:142.74ms step:101/1480 train_time:12988ms step_avg:142.72ms step:102/1480 train_time:13129ms step_avg:142.70ms step:103/1480 train_time:13273ms step_avg:142.72ms step:104/1480 train_time:13415ms step_avg:142.72ms step:105/1480 train_time:13558ms step_avg:142.72ms step:106/1480 train_time:13700ms step_avg:142.71ms step:107/1480 train_time:13841ms step_avg:142.69ms step:108/1480 train_time:13983ms step_avg:142.68ms step:109/1480 train_time:14125ms step_avg:142.68ms step:110/1480 
train_time:14268ms step_avg:142.68ms step:111/1480 train_time:14413ms step_avg:142.70ms step:112/1480 train_time:14561ms step_avg:142.75ms step:113/1480 train_time:14707ms step_avg:142.79ms step:114/1480 train_time:14853ms step_avg:142.82ms step:115/1480 train_time:15000ms step_avg:142.86ms step:116/1480 train_time:15146ms step_avg:142.89ms step:117/1480 train_time:15292ms step_avg:142.91ms step:118/1480 train_time:15440ms step_avg:142.96ms step:119/1480 train_time:15586ms step_avg:142.99ms step:120/1480 train_time:15731ms step_avg:143.01ms step:121/1480 train_time:15879ms step_avg:143.05ms step:122/1480 train_time:16026ms step_avg:143.09ms step:123/1480 train_time:16172ms step_avg:143.11ms step:124/1480 train_time:16319ms step_avg:143.15ms step:125/1480 train_time:16465ms step_avg:143.17ms step:125/1480 val_loss:4.4204 train_time:16523ms step_avg:143.68ms step:126/1480 train_time:16617ms step_avg:143.25ms step:127/1480 train_time:16764ms step_avg:143.28ms step:128/1480 train_time:16911ms step_avg:143.31ms step:129/1480 train_time:17056ms step_avg:143.33ms step:130/1480 train_time:17201ms step_avg:143.34ms step:131/1480 train_time:17348ms step_avg:143.37ms step:132/1480 train_time:17494ms step_avg:143.39ms step:133/1480 train_time:17641ms step_avg:143.43ms step:134/1480 train_time:17789ms step_avg:143.46ms step:135/1480 train_time:17937ms step_avg:143.49ms step:136/1480 train_time:18082ms step_avg:143.51ms step:137/1480 train_time:18228ms step_avg:143.53ms step:138/1480 train_time:18374ms step_avg:143.55ms step:139/1480 train_time:18519ms step_avg:143.56ms step:140/1480 train_time:18666ms step_avg:143.58ms step:141/1480 train_time:18814ms step_avg:143.62ms step:142/1480 train_time:18960ms step_avg:143.63ms step:143/1480 train_time:19108ms step_avg:143.67ms step:144/1480 train_time:19255ms step_avg:143.70ms step:145/1480 train_time:19401ms step_avg:143.71ms step:146/1480 train_time:19548ms step_avg:143.73ms step:147/1480 train_time:19695ms step_avg:143.76ms 
step:148/1480 train_time:19840ms step_avg:143.77ms step:149/1480 train_time:19987ms step_avg:143.79ms step:150/1480 train_time:20137ms step_avg:143.83ms step:151/1480 train_time:20280ms step_avg:143.83ms step:152/1480 train_time:20427ms step_avg:143.85ms step:153/1480 train_time:20574ms step_avg:143.88ms step:154/1480 train_time:20720ms step_avg:143.89ms step:155/1480 train_time:20867ms step_avg:143.91ms step:156/1480 train_time:21015ms step_avg:143.94ms step:157/1480 train_time:21160ms step_avg:143.95ms step:158/1480 train_time:21309ms step_avg:143.98ms step:159/1480 train_time:21456ms step_avg:144.00ms step:160/1480 train_time:21601ms step_avg:144.01ms step:161/1480 train_time:21748ms step_avg:144.03ms step:162/1480 train_time:21894ms step_avg:144.04ms step:163/1480 train_time:22041ms step_avg:144.06ms step:164/1480 train_time:22187ms step_avg:144.07ms step:165/1480 train_time:22335ms step_avg:144.10ms step:166/1480 train_time:22481ms step_avg:144.11ms step:167/1480 train_time:22629ms step_avg:144.13ms step:168/1480 train_time:22775ms step_avg:144.15ms step:169/1480 train_time:22920ms step_avg:144.15ms step:170/1480 train_time:23067ms step_avg:144.17ms step:171/1480 train_time:23215ms step_avg:144.19ms step:172/1480 train_time:23360ms step_avg:144.20ms step:173/1480 train_time:23508ms step_avg:144.22ms step:174/1480 train_time:23656ms step_avg:144.24ms step:175/1480 train_time:23802ms step_avg:144.25ms step:176/1480 train_time:23949ms step_avg:144.27ms step:177/1480 train_time:24095ms step_avg:144.28ms step:178/1480 train_time:24241ms step_avg:144.29ms step:179/1480 train_time:24388ms step_avg:144.31ms step:180/1480 train_time:24536ms step_avg:144.33ms step:181/1480 train_time:24681ms step_avg:144.34ms step:182/1480 train_time:24830ms step_avg:144.36ms step:183/1480 train_time:24977ms step_avg:144.37ms step:184/1480 train_time:25124ms step_avg:144.39ms step:185/1480 train_time:25271ms step_avg:144.41ms step:186/1480 train_time:25417ms step_avg:144.41ms 
step:187/1480 train_time:25562ms step_avg:144.42ms step:188/1480 train_time:25710ms step_avg:144.44ms step:189/1480 train_time:25857ms step_avg:144.45ms step:190/1480 train_time:26002ms step_avg:144.46ms step:191/1480 train_time:26151ms step_avg:144.48ms step:192/1480 train_time:26297ms step_avg:144.49ms step:193/1480 train_time:26445ms step_avg:144.51ms step:194/1480 train_time:26592ms step_avg:144.52ms step:195/1480 train_time:26739ms step_avg:144.53ms step:196/1480 train_time:26887ms step_avg:144.55ms step:197/1480 train_time:27034ms step_avg:144.57ms step:198/1480 train_time:27179ms step_avg:144.57ms step:199/1480 train_time:27328ms step_avg:144.59ms step:200/1480 train_time:27475ms step_avg:144.61ms step:201/1480 train_time:27621ms step_avg:144.61ms step:202/1480 train_time:27769ms step_avg:144.63ms step:203/1480 train_time:27916ms step_avg:144.64ms step:204/1480 train_time:28063ms step_avg:144.65ms step:205/1480 train_time:28211ms step_avg:144.67ms step:206/1480 train_time:28357ms step_avg:144.68ms step:207/1480 train_time:28503ms step_avg:144.69ms step:208/1480 train_time:28651ms step_avg:144.70ms step:209/1480 train_time:28796ms step_avg:144.70ms step:210/1480 train_time:28943ms step_avg:144.71ms step:211/1480 train_time:29090ms step_avg:144.73ms step:212/1480 train_time:29237ms step_avg:144.74ms step:213/1480 train_time:29382ms step_avg:144.74ms step:214/1480 train_time:29531ms step_avg:144.76ms step:215/1480 train_time:29677ms step_avg:144.76ms step:216/1480 train_time:29822ms step_avg:144.77ms step:217/1480 train_time:29968ms step_avg:144.77ms step:218/1480 train_time:30115ms step_avg:144.79ms step:219/1480 train_time:30261ms step_avg:144.79ms step:220/1480 train_time:30408ms step_avg:144.80ms step:221/1480 train_time:30558ms step_avg:144.82ms step:222/1480 train_time:30709ms step_avg:144.86ms step:223/1480 train_time:30860ms step_avg:144.88ms step:224/1480 train_time:31011ms step_avg:144.91ms step:225/1480 train_time:31160ms step_avg:144.93ms 
step:226/1480 train_time:31311ms step_avg:144.96ms step:227/1480 train_time:31460ms step_avg:144.98ms step:228/1480 train_time:31611ms step_avg:145.00ms step:229/1480 train_time:31760ms step_avg:145.03ms step:230/1480 train_time:31912ms step_avg:145.05ms step:231/1480 train_time:32062ms step_avg:145.08ms step:232/1480 train_time:32212ms step_avg:145.10ms step:233/1480 train_time:32361ms step_avg:145.12ms step:234/1480 train_time:32512ms step_avg:145.14ms step:235/1480 train_time:32663ms step_avg:145.17ms step:236/1480 train_time:32814ms step_avg:145.19ms step:237/1480 train_time:32963ms step_avg:145.21ms step:238/1480 train_time:33113ms step_avg:145.23ms step:239/1480 train_time:33262ms step_avg:145.25ms step:240/1480 train_time:33413ms step_avg:145.27ms step:241/1480 train_time:33562ms step_avg:145.29ms step:242/1480 train_time:33715ms step_avg:145.32ms step:243/1480 train_time:33865ms step_avg:145.34ms step:244/1480 train_time:34016ms step_avg:145.37ms step:245/1480 train_time:34165ms step_avg:145.39ms step:246/1480 train_time:34316ms step_avg:145.41ms step:247/1480 train_time:34466ms step_avg:145.43ms step:248/1480 train_time:34616ms step_avg:145.45ms step:249/1480 train_time:34765ms step_avg:145.46ms step:250/1480 train_time:34917ms step_avg:145.49ms step:250/1480 val_loss:4.0000 train_time:34975ms step_avg:145.73ms step:251/1480 train_time:35071ms step_avg:145.52ms step:252/1480 train_time:35223ms step_avg:145.55ms step:253/1480 train_time:35373ms step_avg:145.57ms step:254/1480 train_time:35522ms step_avg:145.58ms step:255/1480 train_time:35671ms step_avg:145.60ms step:256/1480 train_time:35820ms step_avg:145.61ms step:257/1480 train_time:35971ms step_avg:145.63ms step:258/1480 train_time:36124ms step_avg:145.66ms step:259/1480 train_time:36275ms step_avg:145.68ms step:260/1480 train_time:36426ms step_avg:145.70ms step:261/1480 train_time:36575ms step_avg:145.72ms step:262/1480 train_time:36726ms step_avg:145.74ms step:263/1480 train_time:36875ms 
step_avg:145.75ms step:264/1480 train_time:37028ms step_avg:145.78ms step:265/1480 train_time:37178ms step_avg:145.80ms step:266/1480 train_time:37330ms step_avg:145.82ms step:267/1480 train_time:37479ms step_avg:145.83ms step:268/1480 train_time:37630ms step_avg:145.85ms step:269/1480 train_time:37778ms step_avg:145.86ms step:270/1480 train_time:37928ms step_avg:145.88ms step:271/1480 train_time:38077ms step_avg:145.89ms step:272/1480 train_time:38229ms step_avg:145.91ms step:273/1480 train_time:38378ms step_avg:145.92ms step:274/1480 train_time:38529ms step_avg:145.94ms step:275/1480 train_time:38679ms step_avg:145.96ms step:276/1480 train_time:38829ms step_avg:145.97ms step:277/1480 train_time:38978ms step_avg:145.99ms step:278/1480 train_time:39129ms step_avg:146.01ms step:279/1480 train_time:39278ms step_avg:146.01ms step:280/1480 train_time:39429ms step_avg:146.03ms step:281/1480 train_time:39578ms step_avg:146.04ms step:282/1480 train_time:39730ms step_avg:146.07ms step:283/1480 train_time:39880ms step_avg:146.08ms step:284/1480 train_time:40031ms step_avg:146.10ms step:285/1480 train_time:40180ms step_avg:146.11ms step:286/1480 train_time:40331ms step_avg:146.13ms step:287/1480 train_time:40481ms step_avg:146.14ms step:288/1480 train_time:40631ms step_avg:146.15ms step:289/1480 train_time:40780ms step_avg:146.17ms step:290/1480 train_time:40932ms step_avg:146.18ms step:291/1480 train_time:41083ms step_avg:146.20ms step:292/1480 train_time:41233ms step_avg:146.22ms step:293/1480 train_time:41384ms step_avg:146.23ms step:294/1480 train_time:41534ms step_avg:146.25ms step:295/1480 train_time:41685ms step_avg:146.26ms step:296/1480 train_time:41835ms step_avg:146.28ms step:297/1480 train_time:41987ms step_avg:146.29ms step:298/1480 train_time:42136ms step_avg:146.31ms step:299/1480 train_time:42286ms step_avg:146.32ms step:300/1480 train_time:42437ms step_avg:146.33ms step:301/1480 train_time:42587ms step_avg:146.35ms step:302/1480 train_time:42738ms 
step_avg:146.36ms step:303/1480 train_time:42888ms step_avg:146.38ms step:304/1480 train_time:43038ms step_avg:146.39ms step:305/1480 train_time:43189ms step_avg:146.40ms step:306/1480 train_time:43338ms step_avg:146.41ms step:307/1480 train_time:43489ms step_avg:146.43ms step:308/1480 train_time:43640ms step_avg:146.44ms step:309/1480 train_time:43790ms step_avg:146.46ms step:310/1480 train_time:43941ms step_avg:146.47ms step:311/1480 train_time:44092ms step_avg:146.48ms step:312/1480 train_time:44242ms step_avg:146.50ms step:313/1480 train_time:44392ms step_avg:146.51ms step:314/1480 train_time:44541ms step_avg:146.52ms step:315/1480 train_time:44691ms step_avg:146.53ms step:316/1480 train_time:44842ms step_avg:146.54ms step:317/1480 train_time:44993ms step_avg:146.56ms step:318/1480 train_time:45144ms step_avg:146.57ms step:319/1480 train_time:45294ms step_avg:146.58ms step:320/1480 train_time:45445ms step_avg:146.60ms step:321/1480 train_time:45594ms step_avg:146.60ms step:322/1480 train_time:45746ms step_avg:146.62ms step:323/1480 train_time:45895ms step_avg:146.63ms step:324/1480 train_time:46047ms step_avg:146.65ms step:325/1480 train_time:46197ms step_avg:146.66ms step:326/1480 train_time:46347ms step_avg:146.67ms step:327/1480 train_time:46496ms step_avg:146.68ms step:328/1480 train_time:46647ms step_avg:146.69ms step:329/1480 train_time:46797ms step_avg:146.70ms step:330/1480 train_time:46950ms step_avg:146.72ms step:331/1480 train_time:47104ms step_avg:146.74ms step:332/1480 train_time:47258ms step_avg:146.76ms step:333/1480 train_time:47411ms step_avg:146.78ms step:334/1480 train_time:47563ms step_avg:146.80ms step:335/1480 train_time:47717ms step_avg:146.82ms step:336/1480 train_time:47871ms step_avg:146.84ms step:337/1480 train_time:48026ms step_avg:146.87ms step:338/1480 train_time:48180ms step_avg:146.89ms step:339/1480 train_time:48334ms step_avg:146.91ms step:340/1480 train_time:48488ms step_avg:146.93ms step:341/1480 train_time:48643ms 
step_avg:146.96ms step:342/1480 train_time:48795ms step_avg:146.97ms step:343/1480 train_time:48950ms step_avg:147.00ms step:344/1480 train_time:49104ms step_avg:147.02ms step:345/1480 train_time:49259ms step_avg:147.04ms step:346/1480 train_time:49413ms step_avg:147.06ms step:347/1480 train_time:49567ms step_avg:147.08ms step:348/1480 train_time:49720ms step_avg:147.10ms step:349/1480 train_time:49874ms step_avg:147.12ms step:350/1480 train_time:50028ms step_avg:147.14ms step:351/1480 train_time:50183ms step_avg:147.16ms step:352/1480 train_time:50336ms step_avg:147.18ms step:353/1480 train_time:50491ms step_avg:147.20ms step:354/1480 train_time:50644ms step_avg:147.22ms step:355/1480 train_time:50797ms step_avg:147.24ms step:356/1480 train_time:50951ms step_avg:147.26ms step:357/1480 train_time:51105ms step_avg:147.28ms step:358/1480 train_time:51258ms step_avg:147.29ms step:359/1480 train_time:51413ms step_avg:147.31ms step:360/1480 train_time:51569ms step_avg:147.34ms step:361/1480 train_time:51725ms step_avg:147.36ms step:362/1480 train_time:51879ms step_avg:147.38ms step:363/1480 train_time:52032ms step_avg:147.40ms step:364/1480 train_time:52186ms step_avg:147.42ms step:365/1480 train_time:52339ms step_avg:147.43ms step:366/1480 train_time:52492ms step_avg:147.45ms step:367/1480 train_time:52645ms step_avg:147.46ms step:368/1480 train_time:52797ms step_avg:147.48ms step:369/1480 train_time:52951ms step_avg:147.50ms step:370/1480 train_time:53105ms step_avg:147.51ms step:371/1480 train_time:53259ms step_avg:147.53ms step:372/1480 train_time:53413ms step_avg:147.55ms step:373/1480 train_time:53567ms step_avg:147.57ms step:374/1480 train_time:53720ms step_avg:147.58ms step:375/1480 train_time:53874ms step_avg:147.60ms step:375/1480 val_loss:3.8130 train_time:53933ms step_avg:147.76ms step:376/1480 train_time:54030ms step_avg:147.62ms step:377/1480 train_time:54184ms step_avg:147.64ms step:378/1480 train_time:54338ms step_avg:147.66ms step:379/1480 
train_time:54491ms step_avg:147.67ms step:380/1480 train_time:54644ms step_avg:147.69ms step:381/1480 train_time:54796ms step_avg:147.70ms step:382/1480 train_time:54950ms step_avg:147.72ms step:383/1480 train_time:55106ms step_avg:147.74ms step:384/1480 train_time:55261ms step_avg:147.76ms step:385/1480 train_time:55414ms step_avg:147.77ms step:386/1480 train_time:55567ms step_avg:147.79ms step:387/1480 train_time:55722ms step_avg:147.80ms step:388/1480 train_time:55875ms step_avg:147.82ms step:389/1480 train_time:56030ms step_avg:147.84ms step:390/1480 train_time:56184ms step_avg:147.85ms step:391/1480 train_time:56338ms step_avg:147.87ms step:392/1480 train_time:56490ms step_avg:147.88ms step:393/1480 train_time:56644ms step_avg:147.90ms step:394/1480 train_time:56797ms step_avg:147.91ms step:395/1480 train_time:56951ms step_avg:147.92ms step:396/1480 train_time:57105ms step_avg:147.94ms step:397/1480 train_time:57261ms step_avg:147.96ms step:398/1480 train_time:57414ms step_avg:147.98ms step:399/1480 train_time:57569ms step_avg:147.99ms step:400/1480 train_time:57724ms step_avg:148.01ms step:401/1480 train_time:57878ms step_avg:148.03ms step:402/1480 train_time:58032ms step_avg:148.04ms step:403/1480 train_time:58186ms step_avg:148.06ms step:404/1480 train_time:58340ms step_avg:148.07ms step:405/1480 train_time:58495ms step_avg:148.09ms step:406/1480 train_time:58650ms step_avg:148.11ms step:407/1480 train_time:58804ms step_avg:148.12ms step:408/1480 train_time:58957ms step_avg:148.13ms step:409/1480 train_time:59110ms step_avg:148.15ms step:410/1480 train_time:59265ms step_avg:148.16ms step:411/1480 train_time:59419ms step_avg:148.18ms step:412/1480 train_time:59572ms step_avg:148.19ms step:413/1480 train_time:59726ms step_avg:148.20ms step:414/1480 train_time:59880ms step_avg:148.22ms step:415/1480 train_time:60034ms step_avg:148.23ms step:416/1480 train_time:60187ms step_avg:148.24ms step:417/1480 train_time:60341ms step_avg:148.26ms step:418/1480 
train_time:60496ms step_avg:148.27ms step:419/1480 train_time:60649ms step_avg:148.29ms step:420/1480 train_time:60803ms step_avg:148.30ms step:421/1480 train_time:60957ms step_avg:148.31ms step:422/1480 train_time:61110ms step_avg:148.32ms step:423/1480 train_time:61264ms step_avg:148.34ms step:424/1480 train_time:61419ms step_avg:148.35ms step:425/1480 train_time:61572ms step_avg:148.37ms step:426/1480 train_time:61726ms step_avg:148.38ms step:427/1480 train_time:61879ms step_avg:148.39ms step:428/1480 train_time:62033ms step_avg:148.40ms step:429/1480 train_time:62187ms step_avg:148.42ms step:430/1480 train_time:62340ms step_avg:148.43ms step:431/1480 train_time:62494ms step_avg:148.44ms step:432/1480 train_time:62648ms step_avg:148.45ms step:433/1480 train_time:62802ms step_avg:148.47ms step:434/1480 train_time:62958ms step_avg:148.49ms step:435/1480 train_time:63111ms step_avg:148.50ms step:436/1480 train_time:63264ms step_avg:148.51ms step:437/1480 train_time:63417ms step_avg:148.52ms step:438/1480 train_time:63570ms step_avg:148.53ms step:439/1480 train_time:63725ms step_avg:148.54ms step:440/1480 train_time:63880ms step_avg:148.56ms step:441/1480 train_time:64037ms step_avg:148.58ms step:442/1480 train_time:64194ms step_avg:148.60ms step:443/1480 train_time:64349ms step_avg:148.61ms step:444/1480 train_time:64505ms step_avg:148.63ms step:445/1480 train_time:64662ms step_avg:148.65ms step:446/1480 train_time:64818ms step_avg:148.66ms step:447/1480 train_time:64973ms step_avg:148.68ms step:448/1480 train_time:65130ms step_avg:148.70ms step:449/1480 train_time:65287ms step_avg:148.72ms step:450/1480 train_time:65445ms step_avg:148.74ms step:451/1480 train_time:65603ms step_avg:148.76ms step:452/1480 train_time:65760ms step_avg:148.78ms step:453/1480 train_time:65918ms step_avg:148.80ms step:454/1480 train_time:66073ms step_avg:148.81ms step:455/1480 train_time:66230ms step_avg:148.83ms step:456/1480 train_time:66387ms step_avg:148.85ms step:457/1480 
train_time:66543ms step_avg:148.87ms step:458/1480 train_time:66700ms step_avg:148.88ms step:459/1480 train_time:66858ms step_avg:148.91ms step:460/1480 train_time:67015ms step_avg:148.92ms step:461/1480 train_time:67172ms step_avg:148.94ms step:462/1480 train_time:67329ms step_avg:148.96ms step:463/1480 train_time:67487ms step_avg:148.98ms step:464/1480 train_time:67644ms step_avg:149.00ms step:465/1480 train_time:67802ms step_avg:149.02ms step:466/1480 train_time:67961ms step_avg:149.04ms step:467/1480 train_time:68119ms step_avg:149.06ms step:468/1480 train_time:68275ms step_avg:149.07ms step:469/1480 train_time:68431ms step_avg:149.09ms step:470/1480 train_time:68587ms step_avg:149.10ms step:471/1480 train_time:68743ms step_avg:149.12ms step:472/1480 train_time:68900ms step_avg:149.14ms step:473/1480 train_time:69057ms step_avg:149.15ms step:474/1480 train_time:69213ms step_avg:149.17ms step:475/1480 train_time:69368ms step_avg:149.18ms step:476/1480 train_time:69525ms step_avg:149.20ms step:477/1480 train_time:69682ms step_avg:149.21ms step:478/1480 train_time:69838ms step_avg:149.23ms step:479/1480 train_time:69994ms step_avg:149.24ms step:480/1480 train_time:70151ms step_avg:149.26ms step:481/1480 train_time:70307ms step_avg:149.27ms step:482/1480 train_time:70465ms step_avg:149.29ms step:483/1480 train_time:70623ms step_avg:149.31ms step:484/1480 train_time:70781ms step_avg:149.33ms step:485/1480 train_time:70939ms step_avg:149.34ms step:486/1480 train_time:71096ms step_avg:149.36ms step:487/1480 train_time:71251ms step_avg:149.37ms step:488/1480 train_time:71407ms step_avg:149.39ms step:489/1480 train_time:71564ms step_avg:149.40ms step:490/1480 train_time:71720ms step_avg:149.42ms step:491/1480 train_time:71878ms step_avg:149.43ms step:492/1480 train_time:72036ms step_avg:149.45ms step:493/1480 train_time:72192ms step_avg:149.47ms step:494/1480 train_time:72349ms step_avg:149.48ms step:495/1480 train_time:72505ms step_avg:149.50ms step:496/1480 
train_time:72663ms step_avg:149.51ms step:497/1480 train_time:72820ms step_avg:149.53ms step:498/1480 train_time:72978ms step_avg:149.55ms step:499/1480 train_time:73136ms step_avg:149.56ms step:500/1480 train_time:73293ms step_avg:149.58ms step:500/1480 val_loss:3.6939 train_time:73355ms step_avg:149.70ms step:501/1480 train_time:73453ms step_avg:149.60ms step:502/1480 train_time:73610ms step_avg:149.61ms step:503/1480 train_time:73765ms step_avg:149.62ms step:504/1480 train_time:73920ms step_avg:149.64ms step:505/1480 train_time:74075ms step_avg:149.65ms step:506/1480 train_time:74232ms step_avg:149.66ms step:507/1480 train_time:74388ms step_avg:149.67ms step:508/1480 train_time:74548ms step_avg:149.69ms step:509/1480 train_time:74704ms step_avg:149.71ms step:510/1480 train_time:74861ms step_avg:149.72ms step:511/1480 train_time:75018ms step_avg:149.74ms step:512/1480 train_time:75174ms step_avg:149.75ms step:513/1480 train_time:75330ms step_avg:149.76ms step:514/1480 train_time:75487ms step_avg:149.78ms step:515/1480 train_time:75646ms step_avg:149.79ms step:516/1480 train_time:75805ms step_avg:149.81ms step:517/1480 train_time:75964ms step_avg:149.83ms step:518/1480 train_time:76122ms step_avg:149.85ms step:519/1480 train_time:76280ms step_avg:149.86ms step:520/1480 train_time:76437ms step_avg:149.88ms step:521/1480 train_time:76593ms step_avg:149.89ms step:522/1480 train_time:76749ms step_avg:149.90ms step:523/1480 train_time:76906ms step_avg:149.91ms step:524/1480 train_time:77062ms step_avg:149.93ms step:525/1480 train_time:77220ms step_avg:149.94ms step:526/1480 train_time:77377ms step_avg:149.96ms step:527/1480 train_time:77534ms step_avg:149.97ms step:528/1480 train_time:77690ms step_avg:149.98ms step:529/1480 train_time:77849ms step_avg:150.00ms step:530/1480 train_time:78005ms step_avg:150.01ms step:531/1480 train_time:78163ms step_avg:150.02ms step:532/1480 train_time:78320ms step_avg:150.04ms step:533/1480 train_time:78476ms step_avg:150.05ms 
step:534/1480 train_time:78632ms step_avg:150.06ms step:535/1480 train_time:78788ms step_avg:150.07ms step:536/1480 train_time:78947ms step_avg:150.09ms step:537/1480 train_time:79104ms step_avg:150.10ms step:538/1480 train_time:79263ms step_avg:150.12ms step:539/1480 train_time:79421ms step_avg:150.13ms step:540/1480 train_time:79578ms step_avg:150.15ms step:541/1480 train_time:79734ms step_avg:150.16ms step:542/1480 train_time:79890ms step_avg:150.17ms step:543/1480 train_time:80048ms step_avg:150.18ms step:544/1480 train_time:80205ms step_avg:150.20ms step:545/1480 train_time:80361ms step_avg:150.21ms step:546/1480 train_time:80519ms step_avg:150.22ms step:547/1480 train_time:80674ms step_avg:150.23ms step:548/1480 train_time:80831ms step_avg:150.24ms step:549/1480 train_time:80986ms step_avg:150.25ms step:550/1480 train_time:81145ms step_avg:150.27ms step:551/1480 train_time:81305ms step_avg:150.29ms step:552/1480 train_time:81465ms step_avg:150.30ms step:553/1480 train_time:81625ms step_avg:150.32ms step:554/1480 train_time:81786ms step_avg:150.34ms step:555/1480 train_time:81948ms step_avg:150.36ms step:556/1480 train_time:82107ms step_avg:150.38ms step:557/1480 train_time:82267ms step_avg:150.40ms step:558/1480 train_time:82427ms step_avg:150.41ms step:559/1480 train_time:82586ms step_avg:150.43ms step:560/1480 train_time:82745ms step_avg:150.45ms step:561/1480 train_time:82905ms step_avg:150.46ms step:562/1480 train_time:83065ms step_avg:150.48ms step:563/1480 train_time:83225ms step_avg:150.50ms step:564/1480 train_time:83385ms step_avg:150.51ms step:565/1480 train_time:83545ms step_avg:150.53ms step:566/1480 train_time:83706ms step_avg:150.55ms step:567/1480 train_time:83866ms step_avg:150.57ms step:568/1480 train_time:84024ms step_avg:150.58ms step:569/1480 train_time:84183ms step_avg:150.60ms step:570/1480 train_time:84343ms step_avg:150.61ms step:571/1480 train_time:84503ms step_avg:150.63ms step:572/1480 train_time:84663ms step_avg:150.65ms 
step:573/1480 train_time:84824ms step_avg:150.66ms step:574/1480 train_time:84985ms step_avg:150.68ms step:575/1480 train_time:85147ms step_avg:150.70ms step:576/1480 train_time:85307ms step_avg:150.72ms step:577/1480 train_time:85466ms step_avg:150.73ms step:578/1480 train_time:85625ms step_avg:150.75ms step:579/1480 train_time:85785ms step_avg:150.77ms step:580/1480 train_time:85946ms step_avg:150.78ms step:581/1480 train_time:86106ms step_avg:150.80ms step:582/1480 train_time:86267ms step_avg:150.82ms step:583/1480 train_time:86426ms step_avg:150.83ms step:584/1480 train_time:86585ms step_avg:150.85ms step:585/1480 train_time:86745ms step_avg:150.86ms step:586/1480 train_time:86905ms step_avg:150.88ms step:587/1480 train_time:87065ms step_avg:150.89ms step:588/1480 train_time:87223ms step_avg:150.90ms step:589/1480 train_time:87383ms step_avg:150.92ms step:590/1480 train_time:87543ms step_avg:150.94ms step:591/1480 train_time:87703ms step_avg:150.95ms step:592/1480 train_time:87865ms step_avg:150.97ms step:593/1480 train_time:88026ms step_avg:150.99ms step:594/1480 train_time:88187ms step_avg:151.00ms step:595/1480 train_time:88349ms step_avg:151.02ms step:596/1480 train_time:88510ms step_avg:151.04ms step:597/1480 train_time:88668ms step_avg:151.05ms step:598/1480 train_time:88826ms step_avg:151.07ms step:599/1480 train_time:88985ms step_avg:151.08ms step:600/1480 train_time:89145ms step_avg:151.09ms step:601/1480 train_time:89305ms step_avg:151.11ms step:602/1480 train_time:89466ms step_avg:151.12ms step:603/1480 train_time:89626ms step_avg:151.14ms step:604/1480 train_time:89786ms step_avg:151.15ms step:605/1480 train_time:89946ms step_avg:151.17ms step:606/1480 train_time:90108ms step_avg:151.19ms step:607/1480 train_time:90269ms step_avg:151.20ms step:608/1480 train_time:90428ms step_avg:151.22ms step:609/1480 train_time:90586ms step_avg:151.23ms step:610/1480 train_time:90745ms step_avg:151.24ms step:611/1480 train_time:90905ms step_avg:151.26ms 
step:612/1480 train_time:91067ms step_avg:151.27ms step:613/1480 train_time:91227ms step_avg:151.29ms step:614/1480 train_time:91387ms step_avg:151.30ms step:615/1480 train_time:91546ms step_avg:151.32ms step:616/1480 train_time:91704ms step_avg:151.33ms step:617/1480 train_time:91864ms step_avg:151.34ms step:618/1480 train_time:92023ms step_avg:151.35ms step:619/1480 train_time:92182ms step_avg:151.37ms step:620/1480 train_time:92345ms step_avg:151.39ms step:621/1480 train_time:92506ms step_avg:151.40ms step:622/1480 train_time:92665ms step_avg:151.41ms step:623/1480 train_time:92827ms step_avg:151.43ms step:624/1480 train_time:92986ms step_avg:151.44ms step:625/1480 train_time:93146ms step_avg:151.46ms step:625/1480 val_loss:3.6112 train_time:93208ms step_avg:151.56ms step:626/1480 train_time:93307ms step_avg:151.47ms step:627/1480 train_time:93466ms step_avg:151.48ms step:628/1480 train_time:93623ms step_avg:151.49ms step:629/1480 train_time:93781ms step_avg:151.50ms step:630/1480 train_time:93938ms step_avg:151.51ms step:631/1480 train_time:94095ms step_avg:151.52ms step:632/1480 train_time:94254ms step_avg:151.53ms step:633/1480 train_time:94415ms step_avg:151.55ms step:634/1480 train_time:94575ms step_avg:151.56ms step:635/1480 train_time:94734ms step_avg:151.58ms step:636/1480 train_time:94893ms step_avg:151.59ms step:637/1480 train_time:95053ms step_avg:151.60ms step:638/1480 train_time:95213ms step_avg:151.61ms step:639/1480 train_time:95373ms step_avg:151.63ms step:640/1480 train_time:95532ms step_avg:151.64ms step:641/1480 train_time:95692ms step_avg:151.65ms step:642/1480 train_time:95852ms step_avg:151.66ms step:643/1480 train_time:96013ms step_avg:151.68ms step:644/1480 train_time:96172ms step_avg:151.69ms step:645/1480 train_time:96331ms step_avg:151.70ms step:646/1480 train_time:96491ms step_avg:151.72ms step:647/1480 train_time:96651ms step_avg:151.73ms step:648/1480 train_time:96814ms step_avg:151.75ms step:649/1480 train_time:96974ms 
step_avg:151.76ms step:650/1480 train_time:97133ms step_avg:151.77ms step:651/1480 train_time:97293ms step_avg:151.78ms step:652/1480 train_time:97454ms step_avg:151.80ms step:653/1480 train_time:97614ms step_avg:151.81ms step:654/1480 train_time:97774ms step_avg:151.82ms step:655/1480 train_time:97934ms step_avg:151.84ms step:656/1480 train_time:98094ms step_avg:151.85ms step:657/1480 train_time:98253ms step_avg:151.86ms step:658/1480 train_time:98413ms step_avg:151.87ms step:659/1480 train_time:98575ms step_avg:151.89ms step:660/1480 train_time:98737ms step_avg:151.90ms step:661/1480 train_time:98900ms step_avg:151.92ms step:662/1480 train_time:99059ms step_avg:151.93ms step:663/1480 train_time:99217ms step_avg:151.94ms step:664/1480 train_time:99380ms step_avg:151.96ms step:665/1480 train_time:99541ms step_avg:151.97ms step:666/1480 train_time:99701ms step_avg:151.98ms step:667/1480 train_time:99861ms step_avg:151.99ms step:668/1480 train_time:100021ms step_avg:152.01ms step:669/1480 train_time:100183ms step_avg:152.02ms step:670/1480 train_time:100342ms step_avg:152.03ms step:671/1480 train_time:100503ms step_avg:152.05ms step:672/1480 train_time:100664ms step_avg:152.06ms step:673/1480 train_time:100827ms step_avg:152.08ms step:674/1480 train_time:100991ms step_avg:152.09ms step:675/1480 train_time:101154ms step_avg:152.11ms step:676/1480 train_time:101318ms step_avg:152.13ms step:677/1480 train_time:101478ms step_avg:152.14ms step:678/1480 train_time:101638ms step_avg:152.15ms step:679/1480 train_time:101799ms step_avg:152.17ms step:680/1480 train_time:101959ms step_avg:152.18ms step:681/1480 train_time:102118ms step_avg:152.19ms step:682/1480 train_time:102280ms step_avg:152.20ms step:683/1480 train_time:102441ms step_avg:152.22ms step:684/1480 train_time:102601ms step_avg:152.23ms step:685/1480 train_time:102763ms step_avg:152.24ms step:686/1480 train_time:102924ms step_avg:152.25ms step:687/1480 train_time:103085ms step_avg:152.27ms step:688/1480 
train_time:103248ms step_avg:152.28ms step:689/1480 train_time:103413ms step_avg:152.30ms step:690/1480 train_time:103576ms step_avg:152.32ms step:691/1480 train_time:103738ms step_avg:152.33ms step:692/1480 train_time:103898ms step_avg:152.34ms step:693/1480 train_time:104059ms step_avg:152.36ms step:694/1480 train_time:104220ms step_avg:152.37ms step:695/1480 train_time:104381ms step_avg:152.38ms step:696/1480 train_time:104541ms step_avg:152.39ms step:697/1480 train_time:104702ms step_avg:152.40ms step:698/1480 train_time:104863ms step_avg:152.42ms step:699/1480 train_time:105026ms step_avg:152.43ms step:700/1480 train_time:105190ms step_avg:152.45ms step:701/1480 train_time:105351ms step_avg:152.46ms step:702/1480 train_time:105513ms step_avg:152.47ms step:703/1480 train_time:105674ms step_avg:152.49ms step:704/1480 train_time:105835ms step_avg:152.50ms step:705/1480 train_time:105997ms step_avg:152.51ms step:706/1480 train_time:106162ms step_avg:152.53ms step:707/1480 train_time:106323ms step_avg:152.54ms step:708/1480 train_time:106484ms step_avg:152.56ms step:709/1480 train_time:106645ms step_avg:152.57ms step:710/1480 train_time:106807ms step_avg:152.58ms step:711/1480 train_time:106971ms step_avg:152.60ms step:712/1480 train_time:107138ms step_avg:152.62ms step:713/1480 train_time:107300ms step_avg:152.63ms step:714/1480 train_time:107461ms step_avg:152.64ms step:715/1480 train_time:107620ms step_avg:152.65ms step:716/1480 train_time:107779ms step_avg:152.66ms step:717/1480 train_time:107942ms step_avg:152.68ms step:718/1480 train_time:108102ms step_avg:152.69ms step:719/1480 train_time:108261ms step_avg:152.70ms step:720/1480 train_time:108425ms step_avg:152.71ms step:721/1480 train_time:108585ms step_avg:152.72ms step:722/1480 train_time:108749ms step_avg:152.74ms step:723/1480 train_time:108910ms step_avg:152.75ms step:724/1480 train_time:109073ms step_avg:152.76ms step:725/1480 train_time:109237ms step_avg:152.78ms step:726/1480 train_time:109400ms 
step_avg:152.79ms step:727/1480 train_time:109564ms step_avg:152.81ms step:728/1480 train_time:109724ms step_avg:152.82ms step:729/1480 train_time:109885ms step_avg:152.83ms step:730/1480 train_time:110047ms step_avg:152.84ms step:731/1480 train_time:110209ms step_avg:152.86ms step:732/1480 train_time:110371ms step_avg:152.87ms step:733/1480 train_time:110533ms step_avg:152.88ms step:734/1480 train_time:110696ms step_avg:152.89ms step:735/1480 train_time:110857ms step_avg:152.91ms step:736/1480 train_time:111019ms step_avg:152.92ms step:737/1480 train_time:111181ms step_avg:152.93ms step:738/1480 train_time:111342ms step_avg:152.94ms step:739/1480 train_time:111503ms step_avg:152.95ms step:740/1480 train_time:111667ms step_avg:152.97ms step:741/1480 train_time:111830ms step_avg:152.98ms step:742/1480 train_time:111993ms step_avg:153.00ms step:743/1480 train_time:112155ms step_avg:153.01ms step:744/1480 train_time:112319ms step_avg:153.02ms step:745/1480 train_time:112482ms step_avg:153.04ms step:746/1480 train_time:112641ms step_avg:153.05ms step:747/1480 train_time:112801ms step_avg:153.05ms step:748/1480 train_time:112967ms step_avg:153.07ms step:749/1480 train_time:113132ms step_avg:153.09ms step:750/1480 train_time:113292ms step_avg:153.10ms step:750/1480 val_loss:3.5591 train_time:113356ms step_avg:153.18ms step:751/1480 train_time:113456ms step_avg:153.11ms step:752/1480 train_time:113617ms step_avg:153.12ms step:753/1480 train_time:113778ms step_avg:153.13ms step:754/1480 train_time:113938ms step_avg:153.14ms step:755/1480 train_time:114097ms step_avg:153.15ms step:756/1480 train_time:114258ms step_avg:153.16ms step:757/1480 train_time:114422ms step_avg:153.17ms step:758/1480 train_time:114583ms step_avg:153.19ms step:759/1480 train_time:114745ms step_avg:153.20ms step:760/1480 train_time:114906ms step_avg:153.21ms step:761/1480 train_time:115071ms step_avg:153.22ms step:762/1480 train_time:115233ms step_avg:153.23ms step:763/1480 train_time:115395ms 
step_avg:153.25ms step:764/1480 train_time:115557ms step_avg:153.26ms step:765/1480 train_time:115718ms step_avg:153.27ms step:766/1480 train_time:115880ms step_avg:153.28ms step:767/1480 train_time:116042ms step_avg:153.29ms step:768/1480 train_time:116203ms step_avg:153.30ms step:769/1480 train_time:116368ms step_avg:153.32ms step:770/1480 train_time:116531ms step_avg:153.33ms step:771/1480 train_time:116694ms step_avg:153.34ms step:772/1480 train_time:116856ms step_avg:153.35ms step:773/1480 train_time:117019ms step_avg:153.37ms step:774/1480 train_time:117180ms step_avg:153.38ms step:775/1480 train_time:117342ms step_avg:153.39ms step:776/1480 train_time:117506ms step_avg:153.40ms step:777/1480 train_time:117674ms step_avg:153.42ms step:778/1480 train_time:117837ms step_avg:153.43ms step:779/1480 train_time:117999ms step_avg:153.44ms step:780/1480 train_time:118161ms step_avg:153.46ms step:781/1480 train_time:118324ms step_avg:153.47ms step:782/1480 train_time:118489ms step_avg:153.48ms step:783/1480 train_time:118651ms step_avg:153.49ms step:784/1480 train_time:118813ms step_avg:153.51ms step:785/1480 train_time:118975ms step_avg:153.52ms step:786/1480 train_time:119140ms step_avg:153.53ms step:787/1480 train_time:119303ms step_avg:153.54ms step:788/1480 train_time:119467ms step_avg:153.56ms step:789/1480 train_time:119630ms step_avg:153.57ms step:790/1480 train_time:119796ms step_avg:153.58ms step:791/1480 train_time:119961ms step_avg:153.60ms step:792/1480 train_time:120126ms step_avg:153.61ms step:793/1480 train_time:120288ms step_avg:153.63ms step:794/1480 train_time:120453ms step_avg:153.64ms step:795/1480 train_time:120617ms step_avg:153.65ms step:796/1480 train_time:120781ms step_avg:153.67ms step:797/1480 train_time:120947ms step_avg:153.68ms step:798/1480 train_time:121111ms step_avg:153.69ms step:799/1480 train_time:121276ms step_avg:153.71ms step:800/1480 train_time:121438ms step_avg:153.72ms step:801/1480 train_time:121600ms step_avg:153.73ms 
step:802/1480 train_time:121770ms step_avg:153.75ms step:803/1480 train_time:121934ms step_avg:153.76ms step:804/1480 train_time:122097ms step_avg:153.77ms step:805/1480 train_time:122262ms step_avg:153.79ms step:806/1480 train_time:122424ms step_avg:153.80ms step:807/1480 train_time:122585ms step_avg:153.81ms step:808/1480 train_time:122751ms step_avg:153.82ms step:809/1480 train_time:122913ms step_avg:153.83ms step:810/1480 train_time:123076ms step_avg:153.84ms step:811/1480 train_time:123238ms step_avg:153.86ms step:812/1480 train_time:123399ms step_avg:153.86ms step:813/1480 train_time:123559ms step_avg:153.87ms step:814/1480 train_time:123721ms step_avg:153.88ms step:815/1480 train_time:123884ms step_avg:153.89ms step:816/1480 train_time:124049ms step_avg:153.91ms step:817/1480 train_time:124213ms step_avg:153.92ms step:818/1480 train_time:124374ms step_avg:153.93ms step:819/1480 train_time:124538ms step_avg:153.94ms step:820/1480 train_time:124702ms step_avg:153.95ms step:821/1480 train_time:124864ms step_avg:153.96ms step:822/1480 train_time:125028ms step_avg:153.98ms step:823/1480 train_time:125192ms step_avg:153.99ms step:824/1480 train_time:125354ms step_avg:154.00ms step:825/1480 train_time:125517ms step_avg:154.01ms step:826/1480 train_time:125682ms step_avg:154.02ms step:827/1480 train_time:125846ms step_avg:154.03ms step:828/1480 train_time:126010ms step_avg:154.05ms step:829/1480 train_time:126175ms step_avg:154.06ms step:830/1480 train_time:126339ms step_avg:154.07ms step:831/1480 train_time:126502ms step_avg:154.08ms step:832/1480 train_time:126666ms step_avg:154.09ms step:833/1480 train_time:126831ms step_avg:154.11ms step:834/1480 train_time:126994ms step_avg:154.12ms step:835/1480 train_time:127158ms step_avg:154.13ms step:836/1480 train_time:127325ms step_avg:154.15ms step:837/1480 train_time:127487ms step_avg:154.16ms step:838/1480 train_time:127653ms step_avg:154.17ms step:839/1480 train_time:127814ms step_avg:154.18ms step:840/1480 
train_time:127976ms step_avg:154.19ms step:841/1480 train_time:128138ms step_avg:154.20ms step:842/1480 train_time:128300ms step_avg:154.21ms step:843/1480 train_time:128462ms step_avg:154.22ms step:844/1480 train_time:128624ms step_avg:154.22ms step:845/1480 train_time:128789ms step_avg:154.24ms step:846/1480 train_time:128953ms step_avg:154.25ms step:847/1480 train_time:129116ms step_avg:154.26ms step:848/1480 train_time:129278ms step_avg:154.27ms step:849/1480 train_time:129441ms step_avg:154.28ms step:850/1480 train_time:129604ms step_avg:154.29ms step:851/1480 train_time:129771ms step_avg:154.31ms step:852/1480 train_time:129934ms step_avg:154.32ms step:853/1480 train_time:130095ms step_avg:154.32ms step:854/1480 train_time:130259ms step_avg:154.34ms step:855/1480 train_time:130421ms step_avg:154.34ms step:856/1480 train_time:130584ms step_avg:154.35ms step:857/1480 train_time:130751ms step_avg:154.37ms step:858/1480 train_time:130915ms step_avg:154.38ms step:859/1480 train_time:131078ms step_avg:154.39ms step:860/1480 train_time:131238ms step_avg:154.40ms step:861/1480 train_time:131404ms step_avg:154.41ms step:862/1480 train_time:131574ms step_avg:154.43ms step:863/1480 train_time:131741ms step_avg:154.44ms step:864/1480 train_time:131905ms step_avg:154.46ms step:865/1480 train_time:132067ms step_avg:154.46ms step:866/1480 train_time:132235ms step_avg:154.48ms step:867/1480 train_time:132397ms step_avg:154.49ms step:868/1480 train_time:132557ms step_avg:154.50ms step:869/1480 train_time:132718ms step_avg:154.50ms step:870/1480 train_time:132883ms step_avg:154.52ms step:871/1480 train_time:133047ms step_avg:154.53ms step:872/1480 train_time:133212ms step_avg:154.54ms step:873/1480 train_time:133376ms step_avg:154.55ms step:874/1480 train_time:133541ms step_avg:154.56ms step:875/1480 train_time:133706ms step_avg:154.57ms step:875/1480 val_loss:3.5091 train_time:133772ms step_avg:154.65ms step:876/1480 train_time:133874ms step_avg:154.59ms step:877/1480 
train_time:134040ms step_avg:154.60ms step:878/1480 train_time:134203ms step_avg:154.61ms step:879/1480 train_time:134366ms step_avg:154.62ms step:880/1480 train_time:134528ms step_avg:154.63ms step:881/1480 train_time:134692ms step_avg:154.64ms step:882/1480 train_time:134858ms step_avg:154.65ms step:883/1480 train_time:135025ms step_avg:154.67ms step:884/1480 train_time:135191ms step_avg:154.68ms step:885/1480 train_time:135357ms step_avg:154.69ms step:886/1480 train_time:135523ms step_avg:154.71ms step:887/1480 train_time:135691ms step_avg:154.72ms step:888/1480 train_time:135864ms step_avg:154.74ms step:889/1480 train_time:136032ms step_avg:154.76ms step:890/1480 train_time:136196ms step_avg:154.77ms step:891/1480 train_time:136362ms step_avg:154.78ms step:892/1480 train_time:136527ms step_avg:154.79ms step:893/1480 train_time:136689ms step_avg:154.80ms step:894/1480 train_time:136857ms step_avg:154.82ms step:895/1480 train_time:137023ms step_avg:154.83ms step:896/1480 train_time:137186ms step_avg:154.84ms step:897/1480 train_time:137353ms step_avg:154.85ms step:898/1480 train_time:137521ms step_avg:154.87ms step:899/1480 train_time:137684ms step_avg:154.88ms step:900/1480 train_time:137846ms step_avg:154.88ms step:901/1480 train_time:138011ms step_avg:154.89ms step:902/1480 train_time:138175ms step_avg:154.91ms step:903/1480 train_time:138348ms step_avg:154.92ms step:904/1480 train_time:138516ms step_avg:154.94ms step:905/1480 train_time:138679ms step_avg:154.95ms step:906/1480 train_time:138845ms step_avg:154.96ms step:907/1480 train_time:139014ms step_avg:154.98ms step:908/1480 train_time:139177ms step_avg:154.99ms step:909/1480 train_time:139342ms step_avg:155.00ms step:910/1480 train_time:139513ms step_avg:155.01ms step:911/1480 train_time:139679ms step_avg:155.03ms step:912/1480 train_time:139845ms step_avg:155.04ms step:913/1480 train_time:140012ms step_avg:155.05ms step:914/1480 train_time:140180ms step_avg:155.07ms step:915/1480 train_time:140349ms 
step_avg:155.08ms step:916/1480 train_time:140514ms step_avg:155.09ms step:917/1480 train_time:140678ms step_avg:155.10ms step:918/1480 train_time:140846ms step_avg:155.12ms step:919/1480 train_time:141017ms step_avg:155.13ms step:920/1480 train_time:141182ms step_avg:155.15ms step:921/1480 train_time:141349ms step_avg:155.16ms step:922/1480 train_time:141518ms step_avg:155.17ms step:923/1480 train_time:141680ms step_avg:155.18ms step:924/1480 train_time:141844ms step_avg:155.19ms step:925/1480 train_time:142010ms step_avg:155.20ms step:926/1480 train_time:142173ms step_avg:155.21ms step:927/1480 train_time:142338ms step_avg:155.22ms step:928/1480 train_time:142503ms step_avg:155.23ms step:929/1480 train_time:142667ms step_avg:155.24ms step:930/1480 train_time:142834ms step_avg:155.25ms step:931/1480 train_time:142998ms step_avg:155.26ms step:932/1480 train_time:143163ms step_avg:155.27ms step:933/1480 train_time:143329ms step_avg:155.29ms step:934/1480 train_time:143498ms step_avg:155.30ms step:935/1480 train_time:143668ms step_avg:155.32ms step:936/1480 train_time:143837ms step_avg:155.33ms step:937/1480 train_time:144007ms step_avg:155.35ms step:938/1480 train_time:144168ms step_avg:155.35ms step:939/1480 train_time:144338ms step_avg:155.37ms step:940/1480 train_time:144504ms step_avg:155.38ms step:941/1480 train_time:144668ms step_avg:155.39ms step:942/1480 train_time:144834ms step_avg:155.40ms step:943/1480 train_time:145003ms step_avg:155.42ms step:944/1480 train_time:145177ms step_avg:155.44ms step:945/1480 train_time:145340ms step_avg:155.44ms step:946/1480 train_time:145509ms step_avg:155.46ms step:947/1480 train_time:145677ms step_avg:155.47ms step:948/1480 train_time:145842ms step_avg:155.48ms step:949/1480 train_time:146008ms step_avg:155.49ms step:950/1480 train_time:146172ms step_avg:155.50ms step:951/1480 train_time:146342ms step_avg:155.52ms step:952/1480 train_time:146506ms step_avg:155.53ms step:953/1480 train_time:146673ms step_avg:155.54ms 
step:954/1480 train_time:146841ms step_avg:155.55ms step:955/1480 train_time:147005ms step_avg:155.56ms step:956/1480 train_time:147169ms step_avg:155.57ms step:957/1480 train_time:147337ms step_avg:155.58ms step:958/1480 train_time:147507ms step_avg:155.60ms step:959/1480 train_time:147671ms step_avg:155.61ms step:960/1480 train_time:147838ms step_avg:155.62ms step:961/1480 train_time:148004ms step_avg:155.63ms step:962/1480 train_time:148168ms step_avg:155.64ms step:963/1480 train_time:148334ms step_avg:155.65ms step:964/1480 train_time:148502ms step_avg:155.66ms step:965/1480 train_time:148665ms step_avg:155.67ms step:966/1480 train_time:148829ms step_avg:155.68ms step:967/1480 train_time:148993ms step_avg:155.69ms step:968/1480 train_time:149159ms step_avg:155.70ms step:969/1480 train_time:149325ms step_avg:155.71ms step:970/1480 train_time:149488ms step_avg:155.72ms step:971/1480 train_time:149653ms step_avg:155.73ms step:972/1480 train_time:149819ms step_avg:155.74ms step:973/1480 train_time:149983ms step_avg:155.75ms step:974/1480 train_time:150151ms step_avg:155.76ms step:975/1480 train_time:150317ms step_avg:155.77ms step:976/1480 train_time:150482ms step_avg:155.78ms step:977/1480 train_time:150646ms step_avg:155.79ms step:978/1480 train_time:150810ms step_avg:155.80ms step:979/1480 train_time:150978ms step_avg:155.81ms step:980/1480 train_time:151144ms step_avg:155.82ms step:981/1480 train_time:151312ms step_avg:155.83ms step:982/1480 train_time:151477ms step_avg:155.84ms step:983/1480 train_time:151643ms step_avg:155.85ms step:984/1480 train_time:151807ms step_avg:155.86ms step:985/1480 train_time:151974ms step_avg:155.87ms step:986/1480 train_time:152140ms step_avg:155.88ms step:987/1480 train_time:152303ms step_avg:155.89ms step:988/1480 train_time:152469ms step_avg:155.90ms step:989/1480 train_time:152635ms step_avg:155.91ms step:990/1480 train_time:152805ms step_avg:155.92ms step:991/1480 train_time:152972ms step_avg:155.93ms step:992/1480 
train_time:153147ms step_avg:155.95ms step:993/1480 train_time:153325ms step_avg:155.98ms step:994/1480 train_time:153489ms step_avg:155.98ms step:995/1480 train_time:153653ms step_avg:155.99ms step:996/1480 train_time:153818ms step_avg:156.00ms step:997/1480 train_time:153982ms step_avg:156.01ms step:998/1480 train_time:154145ms step_avg:156.02ms step:999/1480 train_time:154312ms step_avg:156.03ms step:1000/1480 train_time:154481ms step_avg:156.04ms step:1000/1480 val_loss:3.4460 train_time:154550ms step_avg:156.11ms step:1001/1480 train_time:154653ms step_avg:156.06ms step:1002/1480 train_time:154818ms step_avg:156.07ms step:1003/1480 train_time:154992ms step_avg:156.08ms step:1004/1480 train_time:155159ms step_avg:156.10ms step:1005/1480 train_time:155327ms step_avg:156.11ms step:1006/1480 train_time:155494ms step_avg:156.12ms step:1007/1480 train_time:155660ms step_avg:156.13ms step:1008/1480 train_time:155827ms step_avg:156.14ms step:1009/1480 train_time:155999ms step_avg:156.16ms step:1010/1480 train_time:156164ms step_avg:156.16ms step:1011/1480 train_time:156331ms step_avg:156.17ms step:1012/1480 train_time:156495ms step_avg:156.18ms step:1013/1480 train_time:156666ms step_avg:156.20ms step:1014/1480 train_time:156834ms step_avg:156.21ms step:1015/1480 train_time:157003ms step_avg:156.22ms step:1016/1480 train_time:157172ms step_avg:156.24ms step:1017/1480 train_time:157343ms step_avg:156.25ms step:1018/1480 train_time:157511ms step_avg:156.26ms step:1019/1480 train_time:157678ms step_avg:156.27ms step:1020/1480 train_time:157848ms step_avg:156.29ms step:1021/1480 train_time:158014ms step_avg:156.29ms step:1022/1480 train_time:158180ms step_avg:156.30ms step:1023/1480 train_time:158348ms step_avg:156.32ms step:1024/1480 train_time:158515ms step_avg:156.33ms step:1025/1480 train_time:158685ms step_avg:156.34ms step:1026/1480 train_time:158851ms step_avg:156.35ms step:1027/1480 train_time:159017ms step_avg:156.36ms step:1028/1480 train_time:159191ms 
step_avg:156.38ms step:1029/1480 train_time:159366ms step_avg:156.39ms step:1030/1480 train_time:159534ms step_avg:156.41ms step:1031/1480 train_time:159697ms step_avg:156.41ms step:1032/1480 train_time:159870ms step_avg:156.43ms step:1033/1480 train_time:160037ms step_avg:156.44ms step:1034/1480 train_time:160203ms step_avg:156.45ms step:1035/1480 train_time:160372ms step_avg:156.46ms step:1036/1480 train_time:160538ms step_avg:156.47ms step:1037/1480 train_time:160706ms step_avg:156.48ms step:1038/1480 train_time:160874ms step_avg:156.49ms step:1039/1480 train_time:161044ms step_avg:156.51ms step:1040/1480 train_time:161211ms step_avg:156.52ms step:1041/1480 train_time:161377ms step_avg:156.52ms step:1042/1480 train_time:161540ms step_avg:156.53ms step:1043/1480 train_time:161706ms step_avg:156.54ms step:1044/1480 train_time:161872ms step_avg:156.55ms step:1045/1480 train_time:162040ms step_avg:156.56ms step:1046/1480 train_time:162208ms step_avg:156.57ms step:1047/1480 train_time:162375ms step_avg:156.58ms step:1048/1480 train_time:162542ms step_avg:156.59ms step:1049/1480 train_time:162708ms step_avg:156.60ms step:1050/1480 train_time:162877ms step_avg:156.61ms step:1051/1480 train_time:163047ms step_avg:156.63ms step:1052/1480 train_time:163215ms step_avg:156.64ms step:1053/1480 train_time:163380ms step_avg:156.64ms step:1054/1480 train_time:163550ms step_avg:156.66ms step:1055/1480 train_time:163716ms step_avg:156.67ms step:1056/1480 train_time:163880ms step_avg:156.67ms step:1057/1480 train_time:164048ms step_avg:156.68ms step:1058/1480 train_time:164216ms step_avg:156.70ms step:1059/1480 train_time:164388ms step_avg:156.71ms step:1060/1480 train_time:164556ms step_avg:156.72ms step:1061/1480 train_time:164720ms step_avg:156.73ms step:1062/1480 train_time:164887ms step_avg:156.74ms step:1063/1480 train_time:165054ms step_avg:156.75ms step:1064/1480 train_time:165218ms step_avg:156.75ms step:1065/1480 train_time:165384ms step_avg:156.76ms step:1066/1480 
train_time:165554ms step_avg:156.77ms step:1067/1480 train_time:165722ms step_avg:156.78ms step:1068/1480 train_time:165887ms step_avg:156.79ms step:1069/1480 train_time:166057ms step_avg:156.81ms step:1070/1480 train_time:166222ms step_avg:156.81ms step:1071/1480 train_time:166396ms step_avg:156.83ms step:1072/1480 train_time:166561ms step_avg:156.84ms step:1073/1480 train_time:166725ms step_avg:156.84ms step:1074/1480 train_time:166893ms step_avg:156.85ms step:1075/1480 train_time:167060ms step_avg:156.86ms step:1076/1480 train_time:167231ms step_avg:156.88ms step:1077/1480 train_time:167397ms step_avg:156.89ms step:1078/1480 train_time:167572ms step_avg:156.90ms step:1079/1480 train_time:167744ms step_avg:156.92ms step:1080/1480 train_time:167915ms step_avg:156.93ms step:1081/1480 train_time:168080ms step_avg:156.94ms step:1082/1480 train_time:168248ms step_avg:156.95ms step:1083/1480 train_time:168415ms step_avg:156.96ms step:1084/1480 train_time:168580ms step_avg:156.96ms step:1085/1480 train_time:168748ms step_avg:156.97ms step:1086/1480 train_time:168917ms step_avg:156.99ms step:1087/1480 train_time:169083ms step_avg:156.99ms step:1088/1480 train_time:169253ms step_avg:157.01ms step:1089/1480 train_time:169424ms step_avg:157.02ms step:1090/1480 train_time:169595ms step_avg:157.03ms step:1091/1480 train_time:169761ms step_avg:157.04ms step:1092/1480 train_time:169930ms step_avg:157.05ms step:1093/1480 train_time:170096ms step_avg:157.06ms step:1094/1480 train_time:170262ms step_avg:157.07ms step:1095/1480 train_time:170427ms step_avg:157.08ms step:1096/1480 train_time:170596ms step_avg:157.09ms step:1097/1480 train_time:170763ms step_avg:157.10ms step:1098/1480 train_time:170936ms step_avg:157.11ms step:1099/1480 train_time:171107ms step_avg:157.12ms step:1100/1480 train_time:171277ms step_avg:157.14ms step:1101/1480 train_time:171448ms step_avg:157.15ms step:1102/1480 train_time:171618ms step_avg:157.16ms step:1103/1480 train_time:171793ms step_avg:157.18ms 
step:1104/1480 train_time:171960ms step_avg:157.18ms step:1105/1480 train_time:172132ms step_avg:157.20ms step:1106/1480 train_time:172301ms step_avg:157.21ms step:1107/1480 train_time:172470ms step_avg:157.22ms step:1108/1480 train_time:172635ms step_avg:157.23ms step:1109/1480 train_time:172801ms step_avg:157.23ms step:1110/1480 train_time:172967ms step_avg:157.24ms step:1111/1480 train_time:173134ms step_avg:157.25ms step:1112/1480 train_time:173302ms step_avg:157.26ms step:1113/1480 train_time:173481ms step_avg:157.28ms step:1114/1480 train_time:173655ms step_avg:157.30ms step:1115/1480 train_time:173828ms step_avg:157.31ms step:1116/1480 train_time:173995ms step_avg:157.32ms step:1117/1480 train_time:174168ms step_avg:157.33ms step:1118/1480 train_time:174344ms step_avg:157.35ms step:1119/1480 train_time:174511ms step_avg:157.36ms step:1120/1480 train_time:174678ms step_avg:157.37ms step:1121/1480 train_time:174848ms step_avg:157.38ms step:1122/1480 train_time:175015ms step_avg:157.39ms step:1123/1480 train_time:175181ms step_avg:157.40ms step:1124/1480 train_time:175351ms step_avg:157.41ms step:1125/1480 train_time:175518ms step_avg:157.42ms step:1125/1480 val_loss:3.3914 train_time:175586ms step_avg:157.48ms step:1126/1480 train_time:175688ms step_avg:157.43ms step:1127/1480 train_time:175859ms step_avg:157.44ms step:1128/1480 train_time:176029ms step_avg:157.45ms step:1129/1480 train_time:176202ms step_avg:157.46ms step:1130/1480 train_time:176372ms step_avg:157.47ms step:1131/1480 train_time:176550ms step_avg:157.49ms step:1132/1480 train_time:176716ms step_avg:157.50ms step:1133/1480 train_time:176888ms step_avg:157.51ms step:1134/1480 train_time:177058ms step_avg:157.52ms step:1135/1480 train_time:177225ms step_avg:157.53ms step:1136/1480 train_time:177397ms step_avg:157.55ms step:1137/1480 train_time:177565ms step_avg:157.56ms step:1138/1480 train_time:177738ms step_avg:157.57ms step:1139/1480 train_time:177905ms step_avg:157.58ms step:1140/1480 
train_time:178075ms step_avg:157.59ms step:1141/1480 train_time:178247ms step_avg:157.60ms step:1142/1480 train_time:178414ms step_avg:157.61ms step:1143/1480 train_time:178582ms step_avg:157.62ms step:1144/1480 train_time:178751ms step_avg:157.63ms step:1145/1480 train_time:178917ms step_avg:157.64ms step:1146/1480 train_time:179088ms step_avg:157.65ms step:1147/1480 train_time:179259ms step_avg:157.66ms step:1148/1480 train_time:179427ms step_avg:157.67ms step:1149/1480 train_time:179599ms step_avg:157.68ms step:1150/1480 train_time:179767ms step_avg:157.69ms step:1151/1480 train_time:179940ms step_avg:157.70ms step:1152/1480 train_time:180110ms step_avg:157.71ms step:1153/1480 train_time:180284ms step_avg:157.73ms step:1154/1480 train_time:180451ms step_avg:157.74ms step:1155/1480 train_time:180623ms step_avg:157.75ms step:1156/1480 train_time:180802ms step_avg:157.77ms step:1157/1480 train_time:180971ms step_avg:157.78ms step:1158/1480 train_time:181138ms step_avg:157.79ms step:1159/1480 train_time:181305ms step_avg:157.79ms step:1160/1480 train_time:181471ms step_avg:157.80ms step:1161/1480 train_time:181641ms step_avg:157.81ms step:1162/1480 train_time:181811ms step_avg:157.82ms step:1163/1480 train_time:181982ms step_avg:157.83ms step:1164/1480 train_time:182150ms step_avg:157.84ms step:1165/1480 train_time:182316ms step_avg:157.85ms step:1166/1480 train_time:182484ms step_avg:157.86ms step:1167/1480 train_time:182654ms step_avg:157.87ms step:1168/1480 train_time:182821ms step_avg:157.88ms step:1169/1480 train_time:182990ms step_avg:157.89ms step:1170/1480 train_time:183161ms step_avg:157.90ms step:1171/1480 train_time:183329ms step_avg:157.91ms step:1172/1480 train_time:183496ms step_avg:157.91ms step:1173/1480 train_time:183667ms step_avg:157.93ms step:1174/1480 train_time:183851ms step_avg:157.95ms step:1175/1480 train_time:184022ms step_avg:157.96ms step:1176/1480 train_time:184194ms step_avg:157.97ms step:1177/1480 train_time:184371ms step_avg:157.99ms 
step:1178/1480 train_time:184539ms step_avg:158.00ms step:1179/1480 train_time:184706ms step_avg:158.00ms step:1180/1480 train_time:184886ms step_avg:158.02ms step:1181/1480 train_time:185055ms step_avg:158.03ms step:1182/1480 train_time:185222ms step_avg:158.04ms step:1183/1480 train_time:185393ms step_avg:158.05ms step:1184/1480 train_time:185561ms step_avg:158.06ms step:1185/1480 train_time:185736ms step_avg:158.07ms step:1186/1480 train_time:185906ms step_avg:158.08ms step:1187/1480 train_time:186089ms step_avg:158.10ms step:1188/1480 train_time:186257ms step_avg:158.11ms step:1189/1480 train_time:186430ms step_avg:158.13ms step:1190/1480 train_time:186597ms step_avg:158.13ms step:1191/1480 train_time:186768ms step_avg:158.14ms step:1192/1480 train_time:186935ms step_avg:158.15ms step:1193/1480 train_time:187100ms step_avg:158.16ms step:1194/1480 train_time:187269ms step_avg:158.17ms step:1195/1480 train_time:187443ms step_avg:158.18ms step:1196/1480 train_time:187625ms step_avg:158.20ms step:1197/1480 train_time:187797ms step_avg:158.21ms step:1198/1480 train_time:187980ms step_avg:158.23ms step:1199/1480 train_time:188149ms step_avg:158.24ms step:1200/1480 train_time:188317ms step_avg:158.25ms step:1201/1480 train_time:188484ms step_avg:158.26ms step:1202/1480 train_time:188666ms step_avg:158.28ms step:1203/1480 train_time:188842ms step_avg:158.29ms step:1204/1480 train_time:189016ms step_avg:158.30ms step:1205/1480 train_time:189184ms step_avg:158.31ms step:1206/1480 train_time:189354ms step_avg:158.32ms step:1207/1480 train_time:189523ms step_avg:158.33ms step:1208/1480 train_time:189691ms step_avg:158.34ms step:1209/1480 train_time:189867ms step_avg:158.35ms step:1210/1480 train_time:190041ms step_avg:158.37ms step:1211/1480 train_time:190215ms step_avg:158.38ms step:1212/1480 train_time:190386ms step_avg:158.39ms step:1213/1480 train_time:190561ms step_avg:158.40ms step:1214/1480 train_time:190740ms step_avg:158.42ms step:1215/1480 train_time:190911ms 
step_avg:158.43ms step:1216/1480 train_time:191082ms step_avg:158.44ms step:1217/1480 train_time:191256ms step_avg:158.46ms step:1218/1480 train_time:191424ms step_avg:158.46ms step:1219/1480 train_time:191603ms step_avg:158.48ms step:1220/1480 train_time:191773ms step_avg:158.49ms step:1221/1480 train_time:191942ms step_avg:158.50ms step:1222/1480 train_time:192109ms step_avg:158.51ms step:1223/1480 train_time:192280ms step_avg:158.52ms step:1224/1480 train_time:192457ms step_avg:158.53ms step:1225/1480 train_time:192629ms step_avg:158.54ms step:1226/1480 train_time:192802ms step_avg:158.55ms step:1227/1480 train_time:192977ms step_avg:158.57ms step:1228/1480 train_time:193144ms step_avg:158.58ms step:1229/1480 train_time:193318ms step_avg:158.59ms step:1230/1480 train_time:193498ms step_avg:158.60ms step:1231/1480 train_time:193674ms step_avg:158.62ms step:1232/1480 train_time:193848ms step_avg:158.63ms step:1233/1480 train_time:194018ms step_avg:158.64ms step:1234/1480 train_time:194187ms step_avg:158.65ms step:1235/1480 train_time:194363ms step_avg:158.66ms step:1236/1480 train_time:194531ms step_avg:158.67ms step:1237/1480 train_time:194702ms step_avg:158.68ms step:1238/1480 train_time:194886ms step_avg:158.70ms step:1239/1480 train_time:195057ms step_avg:158.71ms step:1240/1480 train_time:195228ms step_avg:158.72ms step:1241/1480 train_time:195400ms step_avg:158.73ms step:1242/1480 train_time:195569ms step_avg:158.74ms step:1243/1480 train_time:195742ms step_avg:158.75ms step:1244/1480 train_time:195908ms step_avg:158.76ms step:1245/1480 train_time:196079ms step_avg:158.77ms step:1246/1480 train_time:196247ms step_avg:158.78ms step:1247/1480 train_time:196417ms step_avg:158.79ms step:1248/1480 train_time:196585ms step_avg:158.79ms step:1249/1480 train_time:196753ms step_avg:158.80ms step:1250/1480 train_time:196922ms step_avg:158.81ms step:1250/1480 val_loss:3.3405 train_time:196994ms step_avg:158.87ms step:1251/1480 train_time:197103ms step_avg:158.83ms 
step:1252/1480 train_time:197273ms step_avg:158.83ms step:1253/1480 train_time:197440ms step_avg:158.84ms step:1254/1480 train_time:197612ms step_avg:158.85ms step:1255/1480 train_time:197799ms step_avg:158.87ms step:1256/1480 train_time:197972ms step_avg:158.89ms step:1257/1480 train_time:198143ms step_avg:158.90ms step:1258/1480 train_time:198317ms step_avg:158.91ms step:1259/1480 train_time:198488ms step_avg:158.92ms step:1260/1480 train_time:198657ms step_avg:158.93ms step:1261/1480 train_time:198829ms step_avg:158.94ms step:1262/1480 train_time:199003ms step_avg:158.95ms step:1263/1480 train_time:199177ms step_avg:158.96ms step:1264/1480 train_time:199344ms step_avg:158.97ms step:1265/1480 train_time:199512ms step_avg:158.97ms step:1266/1480 train_time:199684ms step_avg:158.98ms step:1267/1480 train_time:199855ms step_avg:158.99ms step:1268/1480 train_time:200026ms step_avg:159.00ms step:1269/1480 train_time:200202ms step_avg:159.02ms step:1270/1480 train_time:200371ms step_avg:159.02ms step:1271/1480 train_time:200540ms step_avg:159.03ms step:1272/1480 train_time:200705ms step_avg:159.04ms step:1273/1480 train_time:200877ms step_avg:159.05ms step:1274/1480 train_time:201048ms step_avg:159.06ms step:1275/1480 train_time:201217ms step_avg:159.06ms step:1276/1480 train_time:201382ms step_avg:159.07ms step:1277/1480 train_time:201554ms step_avg:159.08ms step:1278/1480 train_time:201721ms step_avg:159.09ms step:1279/1480 train_time:201892ms step_avg:159.10ms step:1280/1480 train_time:202073ms step_avg:159.11ms step:1281/1480 train_time:202240ms step_avg:159.12ms step:1282/1480 train_time:202407ms step_avg:159.12ms step:1283/1480 train_time:202578ms step_avg:159.13ms step:1284/1480 train_time:202747ms step_avg:159.14ms step:1285/1480 train_time:202915ms step_avg:159.15ms step:1286/1480 train_time:203085ms step_avg:159.16ms step:1287/1480 train_time:203256ms step_avg:159.17ms step:1288/1480 train_time:203426ms step_avg:159.18ms step:1289/1480 train_time:203610ms 
step_avg:159.19ms step:1290/1480 train_time:203791ms step_avg:159.21ms step:1291/1480 train_time:203964ms step_avg:159.22ms step:1292/1480 train_time:204139ms step_avg:159.23ms step:1293/1480 train_time:204314ms step_avg:159.25ms step:1294/1480 train_time:204484ms step_avg:159.26ms step:1295/1480 train_time:204655ms step_avg:159.26ms step:1296/1480 train_time:204831ms step_avg:159.28ms step:1297/1480 train_time:205001ms step_avg:159.29ms step:1298/1480 train_time:205171ms step_avg:159.29ms step:1299/1480 train_time:205340ms step_avg:159.30ms step:1300/1480 train_time:205507ms step_avg:159.31ms step:1301/1480 train_time:205677ms step_avg:159.32ms step:1302/1480 train_time:205851ms step_avg:159.33ms step:1303/1480 train_time:206028ms step_avg:159.34ms step:1304/1480 train_time:206201ms step_avg:159.35ms step:1305/1480 train_time:206369ms step_avg:159.36ms step:1306/1480 train_time:206543ms step_avg:159.37ms step:1307/1480 train_time:206712ms step_avg:159.38ms step:1308/1480 train_time:206880ms step_avg:159.38ms step:1309/1480 train_time:207052ms step_avg:159.39ms step:1310/1480 train_time:207220ms step_avg:159.40ms step:1311/1480 train_time:207388ms step_avg:159.41ms step:1312/1480 train_time:207561ms step_avg:159.42ms step:1313/1480 train_time:207730ms step_avg:159.42ms step:1314/1480 train_time:207902ms step_avg:159.43ms step:1315/1480 train_time:208073ms step_avg:159.44ms step:1316/1480 train_time:208240ms step_avg:159.45ms step:1317/1480 train_time:208412ms step_avg:159.46ms step:1318/1480 train_time:208592ms step_avg:159.47ms step:1319/1480 train_time:208768ms step_avg:159.49ms step:1320/1480 train_time:208945ms step_avg:159.50ms step:1321/1480 train_time:209118ms step_avg:159.51ms step:1322/1480 train_time:209298ms step_avg:159.53ms step:1323/1480 train_time:209469ms step_avg:159.53ms step:1324/1480 train_time:209644ms step_avg:159.55ms step:1325/1480 train_time:209825ms step_avg:159.56ms step:1326/1480 train_time:210001ms step_avg:159.58ms step:1327/1480 
train_time:210170ms step_avg:159.58ms step:1328/1480 train_time:210341ms step_avg:159.59ms step:1329/1480 train_time:210538ms step_avg:159.62ms step:1330/1480 train_time:210716ms step_avg:159.63ms step:1331/1480 train_time:210886ms step_avg:159.64ms step:1332/1480 train_time:211060ms step_avg:159.65ms step:1333/1480 train_time:211235ms step_avg:159.66ms step:1334/1480 train_time:211406ms step_avg:159.67ms step:1335/1480 train_time:211576ms step_avg:159.68ms step:1336/1480 train_time:211760ms step_avg:159.70ms step:1337/1480 train_time:211936ms step_avg:159.71ms step:1338/1480 train_time:212107ms step_avg:159.72ms step:1339/1480 train_time:212282ms step_avg:159.73ms step:1340/1480 train_time:212455ms step_avg:159.74ms step:1341/1480 train_time:212623ms step_avg:159.75ms step:1342/1480 train_time:212799ms step_avg:159.76ms step:1343/1480 train_time:212969ms step_avg:159.77ms step:1344/1480 train_time:213141ms step_avg:159.78ms step:1345/1480 train_time:213320ms step_avg:159.79ms step:1346/1480 train_time:213489ms step_avg:159.80ms step:1347/1480 train_time:213659ms step_avg:159.80ms step:1348/1480 train_time:213829ms step_avg:159.81ms step:1349/1480 train_time:213999ms step_avg:159.82ms step:1350/1480 train_time:214173ms step_avg:159.83ms step:1351/1480 train_time:214345ms step_avg:159.84ms step:1352/1480 train_time:214515ms step_avg:159.85ms step:1353/1480 train_time:214692ms step_avg:159.86ms step:1354/1480 train_time:214864ms step_avg:159.87ms step:1355/1480 train_time:215033ms step_avg:159.88ms step:1356/1480 train_time:215205ms step_avg:159.89ms step:1357/1480 train_time:215380ms step_avg:159.90ms step:1358/1480 train_time:215553ms step_avg:159.91ms step:1359/1480 train_time:215725ms step_avg:159.91ms step:1360/1480 train_time:215901ms step_avg:159.93ms step:1361/1480 train_time:216078ms step_avg:159.94ms step:1362/1480 train_time:216255ms step_avg:159.95ms step:1363/1480 train_time:216437ms step_avg:159.97ms step:1364/1480 train_time:216605ms step_avg:159.97ms 
step:1365/1480 train_time:216773ms step_avg:159.98ms step:1366/1480 train_time:216945ms step_avg:159.99ms step:1367/1480 train_time:217117ms step_avg:160.00ms step:1368/1480 train_time:217290ms step_avg:160.01ms step:1369/1480 train_time:217470ms step_avg:160.02ms step:1370/1480 train_time:217646ms step_avg:160.03ms step:1371/1480 train_time:217820ms step_avg:160.04ms step:1372/1480 train_time:217998ms step_avg:160.06ms step:1373/1480 train_time:218167ms step_avg:160.06ms step:1374/1480 train_time:218341ms step_avg:160.07ms step:1375/1480 train_time:218511ms step_avg:160.08ms step:1375/1480 val_loss:3.3013 train_time:218578ms step_avg:160.13ms step:1376/1480 train_time:218685ms step_avg:160.09ms step:1377/1480 train_time:218856ms step_avg:160.10ms step:1378/1480 train_time:219025ms step_avg:160.11ms step:1379/1480 train_time:219199ms step_avg:160.12ms step:1380/1480 train_time:219374ms step_avg:160.13ms step:1381/1480 train_time:219554ms step_avg:160.14ms step:1382/1480 train_time:219725ms step_avg:160.15ms step:1383/1480 train_time:219898ms step_avg:160.16ms step:1384/1480 train_time:220076ms step_avg:160.17ms step:1385/1480 train_time:220242ms step_avg:160.18ms step:1386/1480 train_time:220414ms step_avg:160.18ms step:1387/1480 train_time:220587ms step_avg:160.19ms step:1388/1480 train_time:220756ms step_avg:160.20ms step:1389/1480 train_time:220931ms step_avg:160.21ms step:1390/1480 train_time:221099ms step_avg:160.22ms step:1391/1480 train_time:221270ms step_avg:160.22ms step:1392/1480 train_time:221442ms step_avg:160.23ms step:1393/1480 train_time:221614ms step_avg:160.24ms step:1394/1480 train_time:221783ms step_avg:160.25ms step:1395/1480 train_time:221951ms step_avg:160.25ms step:1396/1480 train_time:222120ms step_avg:160.26ms step:1397/1480 train_time:222289ms step_avg:160.27ms step:1398/1480 train_time:222455ms step_avg:160.27ms step:1399/1480 train_time:222623ms step_avg:160.28ms step:1400/1480 train_time:222800ms step_avg:160.29ms step:1401/1480 
train_time:222966ms step_avg:160.29ms step:1402/1480 train_time:223136ms step_avg:160.30ms step:1403/1480 train_time:223315ms step_avg:160.31ms step:1404/1480 train_time:223486ms step_avg:160.32ms step:1405/1480 train_time:223659ms step_avg:160.33ms step:1406/1480 train_time:223832ms step_avg:160.34ms step:1407/1480 train_time:224000ms step_avg:160.34ms step:1408/1480 train_time:224169ms step_avg:160.35ms step:1409/1480 train_time:224353ms step_avg:160.37ms step:1410/1480 train_time:224522ms step_avg:160.37ms step:1411/1480 train_time:224691ms step_avg:160.38ms step:1412/1480 train_time:224861ms step_avg:160.39ms step:1413/1480 train_time:225031ms step_avg:160.39ms step:1414/1480 train_time:225203ms step_avg:160.40ms step:1415/1480 train_time:225377ms step_avg:160.41ms step:1416/1480 train_time:225564ms step_avg:160.43ms step:1417/1480 train_time:225737ms step_avg:160.44ms step:1418/1480 train_time:225910ms step_avg:160.45ms step:1419/1480 train_time:226083ms step_avg:160.46ms step:1420/1480 train_time:226259ms step_avg:160.47ms step:1421/1480 train_time:226434ms step_avg:160.48ms step:1422/1480 train_time:226606ms step_avg:160.49ms step:1423/1480 train_time:226776ms step_avg:160.49ms step:1424/1480 train_time:226952ms step_avg:160.50ms step:1425/1480 train_time:227132ms step_avg:160.52ms step:1426/1480 train_time:227305ms step_avg:160.53ms step:1427/1480 train_time:227480ms step_avg:160.54ms step:1428/1480 train_time:227651ms step_avg:160.54ms step:1429/1480 train_time:227818ms step_avg:160.55ms step:1430/1480 train_time:227992ms step_avg:160.56ms step:1431/1480 train_time:228168ms step_avg:160.57ms step:1432/1480 train_time:228343ms step_avg:160.58ms step:1433/1480 train_time:228523ms step_avg:160.59ms step:1434/1480 train_time:228702ms step_avg:160.61ms step:1435/1480 train_time:228877ms step_avg:160.62ms step:1436/1480 train_time:229051ms step_avg:160.62ms step:1437/1480 train_time:229220ms step_avg:160.63ms step:1438/1480 train_time:229391ms step_avg:160.64ms 
step:1439/1480 train_time:229565ms step_avg:160.65ms step:1440/1480 train_time:229736ms step_avg:160.65ms step:1441/1480 train_time:229908ms step_avg:160.66ms step:1442/1480 train_time:230084ms step_avg:160.67ms step:1443/1480 train_time:230276ms step_avg:160.70ms step:1444/1480 train_time:230447ms step_avg:160.70ms step:1445/1480 train_time:230619ms step_avg:160.71ms step:1446/1480 train_time:230794ms step_avg:160.72ms step:1447/1480 train_time:230973ms step_avg:160.73ms step:1448/1480 train_time:231145ms step_avg:160.74ms step:1449/1480 train_time:231319ms step_avg:160.75ms step:1450/1480 train_time:231491ms step_avg:160.76ms step:1451/1480 train_time:231661ms step_avg:160.76ms step:1452/1480 train_time:231835ms step_avg:160.77ms step:1453/1480 train_time:232004ms step_avg:160.78ms step:1454/1480 train_time:232177ms step_avg:160.79ms step:1455/1480 train_time:232355ms step_avg:160.80ms step:1456/1480 train_time:232529ms step_avg:160.81ms step:1457/1480 train_time:232700ms step_avg:160.82ms step:1458/1480 train_time:232873ms step_avg:160.82ms step:1459/1480 train_time:233050ms step_avg:160.84ms step:1460/1480 train_time:233221ms step_avg:160.84ms step:1461/1480 train_time:233396ms step_avg:160.85ms step:1462/1480 train_time:233566ms step_avg:160.86ms step:1463/1480 train_time:233743ms step_avg:160.87ms step:1464/1480 train_time:233918ms step_avg:160.88ms step:1465/1480 train_time:234091ms step_avg:160.89ms step:1466/1480 train_time:234261ms step_avg:160.89ms step:1467/1480 train_time:234436ms step_avg:160.90ms step:1468/1480 train_time:234608ms step_avg:160.91ms step:1469/1480 train_time:234781ms step_avg:160.92ms step:1470/1480 train_time:234960ms step_avg:160.93ms step:1471/1480 train_time:235147ms step_avg:160.95ms step:1472/1480 train_time:235325ms step_avg:160.96ms step:1473/1480 train_time:235496ms step_avg:160.97ms step:1474/1480 train_time:235675ms step_avg:160.98ms step:1475/1480 train_time:235854ms step_avg:160.99ms step:1476/1480 train_time:236026ms 
step_avg:161.00ms step:1477/1480 train_time:236210ms step_avg:161.02ms step:1478/1480 train_time:236392ms step_avg:161.03ms step:1479/1480 train_time:236566ms step_avg:161.04ms step:1480/1480 train_time:236738ms step_avg:161.05ms step:1480/1480 val_loss:3.2820 train_time:236809ms step_avg:161.09ms