import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    # RMS norm over the last dimension; no weight is passed, so there is no learnable scale
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding applied over dim 3 of a 4D input, with cos/sin tables cached."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): dim 1 is the sequence, dim 3 is rotated
        seq_len = x.shape[1]
        # NOTE(review): the cache is keyed on seq_len only; a change of x.device alone
        # would not rebuild the tables.
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK norm, rotary embeddings, a value residual and FlexAttention."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        # x: (B, T, dim) activations; vi: token value embedding, reshaped below to match v
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # learned mix of this layer's values with the token value embedding
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim), hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of x with x0, then attention and MLP residual branches."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to [1, 0], i.e. the block starts out using x only
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        # x0 is the normalized token embedding that GPT.forward passes to every block
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) via tanh in GPT.forward

class GPT(nn.Module):
    """
    GPT-style language model with a U-net layout: the first n_layer // 2 blocks act as an
    encoder, the remaining blocks as a decoder that adds weighted encoder outputs back in,
    most recent first. forward() returns the cross-entropy loss directly.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        # idx: 1-D tensor of token ids (a batch dimension of 1 is added below)
        # target: token ids the loss is computed against
        # sliding_window: attention window in tokens; the training script passes a 0-d int32 tensor
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running count of tokens equal to 50256, used as a per-token document id.
        # NOTE(review): 50256 is presumably the end-of-text token id - confirm against the tokenizer.
        docs = (idx == 50256).cumsum(0)
        # document id of the first / last token of each 128-token block
        # (the reshape requires len(idx) to be a multiple of BLOCK_SIZE)
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the rule above: a (query block, kv block) pair is kept
            # when it is causal, the blocks' document-id ranges overlap, and the block
            # distance is below the window size rounded up to whole blocks.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the indices of kept blocks first, in ascending order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one n_embd-wide value embedding per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    # reads ntok uint16 tokens that follow the 256 * 4 byte header into a pinned CPU tensor
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (input, target) token windows of length T from a sorted list of .bin shards.
    Within a shard, process r starts at offset r * T and every call moves forward by
    T * num_processes, so the processes read interleaved stripes.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full step for all processes (+1 for the shifted target)
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # restart from the first shard
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # wraps around to the first shard after the last one
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        # returns (x, y) on "cuda": x is int32 inputs, y is int64 targets shifted by one token
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        # NOTE(review): current_position includes this rank's offset (process_rank * T), so
        # different ranks can cross this threshold on different calls and then sit on
        # different shards for a step - confirm that this is acceptable.
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    # The per-run directory above is not created, so make sure the parent `logs/`
    # directory exists; otherwise open() below fails with FileNotFoundError on a fresh checkout.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """Append `s` to the log file on the master process; also print it unless `logonly` is set.

    Does nothing on non-master processes.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop runs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches the next one after each backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# the model is cast to bfloat16 as a whole, then the CastedLinear modules are put back in
# float32; CastedLinear casts its weight to the activation dtype at forward time
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# embeddings, lm_head and the <2-D parameters use Adam; the 2-D block matrices use Muon
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    # Returns the multiplier applied to each optimizer's base lr at iteration `it`.
    # NOTE(review): the cooldown branch divides by args.cooldown_iters; a value of 0 would
    # raise ZeroDivisionError once `it` reaches args.num_iterations.
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size lives in a CUDA tensor that is updated in place and passed to the model
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64

# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # (grows linearly from 64 to 1792 tokens over the run, rounded down to a multiple of 64)
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average across processes, then across validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE(review): the torch.save call below is commented out, so `log` is built but never written.
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # (linear from 0.85 to 0.95 over the first 300 steps)
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
====================================================================================================
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
nvidia-smi:
Sun Dec 8 12:56:40 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 130W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 40C P0 119W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 97W / 700W | 37MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 120W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23696ms step_avg:nanms step:2/1480 train_time:23814ms step_avg:nanms step:3/1480 train_time:23953ms step_avg:nanms step:4/1480 train_time:24095ms step_avg:nanms step:5/1480 train_time:24235ms step_avg:nanms step:6/1480 train_time:24379ms step_avg:nanms step:7/1480 train_time:24517ms step_avg:nanms step:8/1480 train_time:24660ms step_avg:nanms step:9/1480 train_time:24804ms step_avg:nanms step:10/1480 train_time:24950ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:285ms step_avg:nanms step:13/1480 train_time:428ms step_avg:142.72ms step:14/1480 train_time:569ms step_avg:142.22ms step:15/1480 train_time:710ms step_avg:142.06ms step:16/1480 train_time:853ms step_avg:142.23ms step:17/1480 train_time:997ms step_avg:142.36ms step:18/1480 train_time:1140ms step_avg:142.55ms step:19/1480 train_time:1283ms step_avg:142.58ms step:20/1480 train_time:1426ms step_avg:142.62ms step:21/1480 train_time:1567ms step_avg:142.45ms step:22/1480 train_time:1710ms step_avg:142.47ms step:23/1480 train_time:1852ms step_avg:142.42ms step:24/1480 train_time:1996ms step_avg:142.55ms step:25/1480 train_time:2139ms step_avg:142.60ms step:26/1480 train_time:2282ms step_avg:142.61ms step:27/1480 train_time:2425ms step_avg:142.63ms step:28/1480 train_time:2566ms step_avg:142.54ms step:29/1480 
train_time:2709ms step_avg:142.57ms step:30/1480 train_time:2850ms step_avg:142.51ms step:31/1480 train_time:2992ms step_avg:142.50ms step:32/1480 train_time:3137ms step_avg:142.59ms step:33/1480 train_time:3281ms step_avg:142.66ms step:34/1480 train_time:3425ms step_avg:142.71ms step:35/1480 train_time:3567ms step_avg:142.68ms step:36/1480 train_time:3709ms step_avg:142.64ms step:37/1480 train_time:3850ms step_avg:142.60ms step:38/1480 train_time:3991ms step_avg:142.55ms step:39/1480 train_time:4135ms step_avg:142.57ms step:40/1480 train_time:4278ms step_avg:142.59ms step:41/1480 train_time:4422ms step_avg:142.66ms step:42/1480 train_time:4566ms step_avg:142.69ms step:43/1480 train_time:4709ms step_avg:142.70ms step:44/1480 train_time:4851ms step_avg:142.67ms step:45/1480 train_time:4994ms step_avg:142.68ms step:46/1480 train_time:5137ms step_avg:142.69ms step:47/1480 train_time:5282ms step_avg:142.75ms step:48/1480 train_time:5425ms step_avg:142.78ms step:49/1480 train_time:5567ms step_avg:142.75ms step:50/1480 train_time:5710ms step_avg:142.76ms step:51/1480 train_time:5851ms step_avg:142.71ms step:52/1480 train_time:5993ms step_avg:142.70ms step:53/1480 train_time:6137ms step_avg:142.71ms step:54/1480 train_time:6280ms step_avg:142.73ms step:55/1480 train_time:6425ms step_avg:142.77ms step:56/1480 train_time:6566ms step_avg:142.74ms step:57/1480 train_time:6709ms step_avg:142.74ms step:58/1480 train_time:6850ms step_avg:142.71ms step:59/1480 train_time:6991ms step_avg:142.67ms step:60/1480 train_time:7134ms step_avg:142.67ms step:61/1480 train_time:7276ms step_avg:142.67ms step:62/1480 train_time:7420ms step_avg:142.70ms step:63/1480 train_time:7563ms step_avg:142.70ms step:64/1480 train_time:7708ms step_avg:142.74ms step:65/1480 train_time:7849ms step_avg:142.72ms step:66/1480 train_time:7991ms step_avg:142.70ms step:67/1480 train_time:8132ms step_avg:142.67ms step:68/1480 train_time:8274ms step_avg:142.66ms step:69/1480 train_time:8419ms step_avg:142.69ms 
step:70/1480 train_time:8563ms step_avg:142.72ms step:71/1480 train_time:8707ms step_avg:142.73ms step:72/1480 train_time:8850ms step_avg:142.74ms step:73/1480 train_time:8992ms step_avg:142.73ms step:74/1480 train_time:9133ms step_avg:142.70ms step:75/1480 train_time:9275ms step_avg:142.69ms step:76/1480 train_time:9418ms step_avg:142.70ms step:77/1480 train_time:9562ms step_avg:142.71ms step:78/1480 train_time:9705ms step_avg:142.72ms step:79/1480 train_time:9847ms step_avg:142.71ms step:80/1480 train_time:9988ms step_avg:142.69ms step:81/1480 train_time:10130ms step_avg:142.67ms step:82/1480 train_time:10271ms step_avg:142.65ms step:83/1480 train_time:10414ms step_avg:142.66ms step:84/1480 train_time:10557ms step_avg:142.66ms step:85/1480 train_time:10701ms step_avg:142.68ms step:86/1480 train_time:10843ms step_avg:142.67ms step:87/1480 train_time:10985ms step_avg:142.66ms step:88/1480 train_time:11126ms step_avg:142.65ms step:89/1480 train_time:11268ms step_avg:142.63ms step:90/1480 train_time:11410ms step_avg:142.63ms step:91/1480 train_time:11555ms step_avg:142.65ms step:92/1480 train_time:11702ms step_avg:142.70ms step:93/1480 train_time:11846ms step_avg:142.72ms step:94/1480 train_time:11988ms step_avg:142.72ms step:95/1480 train_time:12129ms step_avg:142.70ms step:96/1480 train_time:12270ms step_avg:142.68ms step:97/1480 train_time:12412ms step_avg:142.67ms step:98/1480 train_time:12556ms step_avg:142.68ms step:99/1480 train_time:12700ms step_avg:142.70ms step:100/1480 train_time:12844ms step_avg:142.72ms step:101/1480 train_time:12987ms step_avg:142.71ms step:102/1480 train_time:13128ms step_avg:142.69ms step:103/1480 train_time:13269ms step_avg:142.67ms step:104/1480 train_time:13411ms step_avg:142.67ms step:105/1480 train_time:13552ms step_avg:142.65ms step:106/1480 train_time:13694ms step_avg:142.65ms step:107/1480 train_time:13839ms step_avg:142.67ms step:108/1480 train_time:13984ms step_avg:142.69ms step:109/1480 train_time:14126ms step_avg:142.69ms 
step:110/1480 train_time:14268ms step_avg:142.68ms step:111/1480 train_time:14412ms step_avg:142.69ms step:112/1480 train_time:14558ms step_avg:142.73ms step:113/1480 train_time:14705ms step_avg:142.77ms step:114/1480 train_time:14852ms step_avg:142.81ms step:115/1480 train_time:15000ms step_avg:142.86ms step:116/1480 train_time:15147ms step_avg:142.90ms step:117/1480 train_time:15293ms step_avg:142.93ms step:118/1480 train_time:15439ms step_avg:142.95ms step:119/1480 train_time:15585ms step_avg:142.98ms step:120/1480 train_time:15731ms step_avg:143.01ms step:121/1480 train_time:15879ms step_avg:143.05ms step:122/1480 train_time:16027ms step_avg:143.10ms step:123/1480 train_time:16173ms step_avg:143.12ms step:124/1480 train_time:16322ms step_avg:143.18ms step:125/1480 train_time:16469ms step_avg:143.21ms step:125/1480 val_loss:4.4216 train_time:16525ms step_avg:143.70ms step:126/1480 train_time:16619ms step_avg:143.27ms step:127/1480 train_time:16768ms step_avg:143.32ms step:128/1480 train_time:16916ms step_avg:143.36ms step:129/1480 train_time:17062ms step_avg:143.38ms step:130/1480 train_time:17209ms step_avg:143.41ms step:131/1480 train_time:17355ms step_avg:143.43ms step:132/1480 train_time:17500ms step_avg:143.44ms step:133/1480 train_time:17650ms step_avg:143.50ms step:134/1480 train_time:17799ms step_avg:143.54ms step:135/1480 train_time:17947ms step_avg:143.58ms step:136/1480 train_time:18094ms step_avg:143.61ms step:137/1480 train_time:18241ms step_avg:143.63ms step:138/1480 train_time:18388ms step_avg:143.65ms step:139/1480 train_time:18534ms step_avg:143.68ms step:140/1480 train_time:18680ms step_avg:143.69ms step:141/1480 train_time:18829ms step_avg:143.73ms step:142/1480 train_time:18976ms step_avg:143.76ms step:143/1480 train_time:19121ms step_avg:143.77ms step:144/1480 train_time:19268ms step_avg:143.79ms step:145/1480 train_time:19416ms step_avg:143.82ms step:146/1480 train_time:19561ms step_avg:143.83ms step:147/1480 train_time:19708ms 
step_avg:143.86ms step:148/1480 train_time:19855ms step_avg:143.88ms step:149/1480 train_time:20000ms step_avg:143.88ms step:150/1480 train_time:20147ms step_avg:143.91ms step:151/1480 train_time:20294ms step_avg:143.93ms step:152/1480 train_time:20440ms step_avg:143.94ms step:153/1480 train_time:20586ms step_avg:143.96ms step:154/1480 train_time:20734ms step_avg:143.98ms step:155/1480 train_time:20880ms step_avg:144.00ms step:156/1480 train_time:21028ms step_avg:144.03ms step:157/1480 train_time:21174ms step_avg:144.04ms step:158/1480 train_time:21321ms step_avg:144.06ms step:159/1480 train_time:21467ms step_avg:144.07ms step:160/1480 train_time:21614ms step_avg:144.09ms step:161/1480 train_time:21760ms step_avg:144.11ms step:162/1480 train_time:21908ms step_avg:144.13ms step:163/1480 train_time:22056ms step_avg:144.15ms step:164/1480 train_time:22201ms step_avg:144.16ms step:165/1480 train_time:22349ms step_avg:144.19ms step:166/1480 train_time:22495ms step_avg:144.20ms step:167/1480 train_time:22642ms step_avg:144.22ms step:168/1480 train_time:22789ms step_avg:144.24ms step:169/1480 train_time:22936ms step_avg:144.25ms step:170/1480 train_time:23083ms step_avg:144.27ms step:171/1480 train_time:23230ms step_avg:144.29ms step:172/1480 train_time:23377ms step_avg:144.30ms step:173/1480 train_time:23523ms step_avg:144.31ms step:174/1480 train_time:23669ms step_avg:144.32ms step:175/1480 train_time:23817ms step_avg:144.35ms step:176/1480 train_time:23963ms step_avg:144.35ms step:177/1480 train_time:24110ms step_avg:144.37ms step:178/1480 train_time:24259ms step_avg:144.40ms step:179/1480 train_time:24403ms step_avg:144.40ms step:180/1480 train_time:24551ms step_avg:144.42ms step:181/1480 train_time:24698ms step_avg:144.43ms step:182/1480 train_time:24845ms step_avg:144.45ms step:183/1480 train_time:24991ms step_avg:144.46ms step:184/1480 train_time:25138ms step_avg:144.47ms step:185/1480 train_time:25285ms step_avg:144.49ms step:186/1480 train_time:25433ms 
step_avg:144.50ms step:187/1480 train_time:25579ms step_avg:144.51ms step:188/1480 train_time:25725ms step_avg:144.52ms step:189/1480 train_time:25872ms step_avg:144.54ms step:190/1480 train_time:26020ms step_avg:144.55ms step:191/1480 train_time:26166ms step_avg:144.56ms step:192/1480 train_time:26313ms step_avg:144.58ms step:193/1480 train_time:26460ms step_avg:144.59ms step:194/1480 train_time:26605ms step_avg:144.59ms step:195/1480 train_time:26752ms step_avg:144.61ms step:196/1480 train_time:26898ms step_avg:144.61ms step:197/1480 train_time:27045ms step_avg:144.63ms step:198/1480 train_time:27192ms step_avg:144.64ms step:199/1480 train_time:27338ms step_avg:144.65ms step:200/1480 train_time:27486ms step_avg:144.66ms step:201/1480 train_time:27633ms step_avg:144.68ms step:202/1480 train_time:27779ms step_avg:144.68ms step:203/1480 train_time:27926ms step_avg:144.69ms step:204/1480 train_time:28073ms step_avg:144.71ms step:205/1480 train_time:28219ms step_avg:144.71ms step:206/1480 train_time:28366ms step_avg:144.73ms step:207/1480 train_time:28514ms step_avg:144.74ms step:208/1480 train_time:28660ms step_avg:144.75ms step:209/1480 train_time:28808ms step_avg:144.76ms step:210/1480 train_time:28954ms step_avg:144.77ms step:211/1480 train_time:29101ms step_avg:144.78ms step:212/1480 train_time:29249ms step_avg:144.80ms step:213/1480 train_time:29396ms step_avg:144.81ms step:214/1480 train_time:29543ms step_avg:144.82ms step:215/1480 train_time:29691ms step_avg:144.83ms step:216/1480 train_time:29839ms step_avg:144.85ms step:217/1480 train_time:29983ms step_avg:144.85ms step:218/1480 train_time:30130ms step_avg:144.86ms step:219/1480 train_time:30277ms step_avg:144.87ms step:220/1480 train_time:30424ms step_avg:144.88ms step:221/1480 train_time:30572ms step_avg:144.89ms step:222/1480 train_time:30723ms step_avg:144.92ms step:223/1480 train_time:30873ms step_avg:144.95ms step:224/1480 train_time:31024ms step_avg:144.97ms step:225/1480 train_time:31174ms 
step_avg:144.99ms step:226/1480 train_time:31324ms step_avg:145.02ms step:227/1480 train_time:31474ms step_avg:145.04ms step:228/1480 train_time:31624ms step_avg:145.07ms step:229/1480 train_time:31776ms step_avg:145.10ms step:230/1480 train_time:31927ms step_avg:145.12ms step:231/1480 train_time:32077ms step_avg:145.14ms step:232/1480 train_time:32227ms step_avg:145.17ms step:233/1480 train_time:32377ms step_avg:145.19ms step:234/1480 train_time:32527ms step_avg:145.21ms step:235/1480 train_time:32677ms step_avg:145.23ms step:236/1480 train_time:32828ms step_avg:145.26ms step:237/1480 train_time:32978ms step_avg:145.28ms step:238/1480 train_time:33129ms step_avg:145.30ms step:239/1480 train_time:33279ms step_avg:145.32ms step:240/1480 train_time:33430ms step_avg:145.35ms step:241/1480 train_time:33579ms step_avg:145.37ms step:242/1480 train_time:33730ms step_avg:145.39ms step:243/1480 train_time:33880ms step_avg:145.41ms step:244/1480 train_time:34032ms step_avg:145.44ms step:245/1480 train_time:34181ms step_avg:145.45ms step:246/1480 train_time:34333ms step_avg:145.48ms step:247/1480 train_time:34483ms step_avg:145.50ms step:248/1480 train_time:34633ms step_avg:145.52ms step:249/1480 train_time:34783ms step_avg:145.53ms step:250/1480 train_time:34934ms step_avg:145.56ms step:250/1480 val_loss:3.9893 train_time:34992ms step_avg:145.80ms step:251/1480 train_time:35092ms step_avg:145.61ms step:252/1480 train_time:35244ms step_avg:145.64ms step:253/1480 train_time:35394ms step_avg:145.65ms step:254/1480 train_time:35544ms step_avg:145.67ms step:255/1480 train_time:35694ms step_avg:145.69ms step:256/1480 train_time:35843ms step_avg:145.70ms step:257/1480 train_time:35994ms step_avg:145.72ms step:258/1480 train_time:36148ms step_avg:145.76ms step:259/1480 train_time:36299ms step_avg:145.78ms step:260/1480 train_time:36449ms step_avg:145.80ms step:261/1480 train_time:36598ms step_avg:145.81ms step:262/1480 train_time:36749ms step_avg:145.83ms step:263/1480 
train_time:36898ms step_avg:145.84ms step:264/1480 train_time:37050ms step_avg:145.86ms step:265/1480 train_time:37200ms step_avg:145.88ms step:266/1480 train_time:37352ms step_avg:145.91ms step:267/1480 train_time:37502ms step_avg:145.92ms step:268/1480 train_time:37652ms step_avg:145.94ms step:269/1480 train_time:37801ms step_avg:145.95ms step:270/1480 train_time:37952ms step_avg:145.97ms step:271/1480 train_time:38101ms step_avg:145.98ms step:272/1480 train_time:38252ms step_avg:146.00ms step:273/1480 train_time:38402ms step_avg:146.01ms step:274/1480 train_time:38552ms step_avg:146.03ms step:275/1480 train_time:38701ms step_avg:146.04ms step:276/1480 train_time:38852ms step_avg:146.06ms step:277/1480 train_time:39002ms step_avg:146.07ms step:278/1480 train_time:39152ms step_avg:146.09ms step:279/1480 train_time:39303ms step_avg:146.11ms step:280/1480 train_time:39454ms step_avg:146.12ms step:281/1480 train_time:39606ms step_avg:146.15ms step:282/1480 train_time:39756ms step_avg:146.16ms step:283/1480 train_time:39906ms step_avg:146.18ms step:284/1480 train_time:40057ms step_avg:146.19ms step:285/1480 train_time:40208ms step_avg:146.21ms step:286/1480 train_time:40358ms step_avg:146.22ms step:287/1480 train_time:40508ms step_avg:146.24ms step:288/1480 train_time:40659ms step_avg:146.25ms step:289/1480 train_time:40810ms step_avg:146.27ms step:290/1480 train_time:40961ms step_avg:146.29ms step:291/1480 train_time:41111ms step_avg:146.30ms step:292/1480 train_time:41262ms step_avg:146.32ms step:293/1480 train_time:41413ms step_avg:146.33ms step:294/1480 train_time:41563ms step_avg:146.35ms step:295/1480 train_time:41714ms step_avg:146.36ms step:296/1480 train_time:41865ms step_avg:146.38ms step:297/1480 train_time:42015ms step_avg:146.39ms step:298/1480 train_time:42165ms step_avg:146.41ms step:299/1480 train_time:42316ms step_avg:146.42ms step:300/1480 train_time:42466ms step_avg:146.44ms step:301/1480 train_time:42617ms step_avg:146.45ms step:302/1480 
train_time:42769ms step_avg:146.47ms step:303/1480 train_time:42918ms step_avg:146.48ms step:304/1480 train_time:43070ms step_avg:146.50ms step:305/1480 train_time:43219ms step_avg:146.50ms step:306/1480 train_time:43371ms step_avg:146.52ms step:307/1480 train_time:43521ms step_avg:146.54ms step:308/1480 train_time:43672ms step_avg:146.55ms step:309/1480 train_time:43822ms step_avg:146.56ms step:310/1480 train_time:43977ms step_avg:146.59ms step:311/1480 train_time:44124ms step_avg:146.59ms step:312/1480 train_time:44275ms step_avg:146.61ms step:313/1480 train_time:44426ms step_avg:146.62ms step:314/1480 train_time:44579ms step_avg:146.64ms step:315/1480 train_time:44726ms step_avg:146.64ms step:316/1480 train_time:44877ms step_avg:146.66ms step:317/1480 train_time:45029ms step_avg:146.67ms step:318/1480 train_time:45178ms step_avg:146.68ms step:319/1480 train_time:45329ms step_avg:146.70ms step:320/1480 train_time:45479ms step_avg:146.71ms step:321/1480 train_time:45630ms step_avg:146.72ms step:322/1480 train_time:45780ms step_avg:146.73ms step:323/1480 train_time:45931ms step_avg:146.74ms step:324/1480 train_time:46082ms step_avg:146.76ms step:325/1480 train_time:46232ms step_avg:146.77ms step:326/1480 train_time:46381ms step_avg:146.78ms step:327/1480 train_time:46532ms step_avg:146.79ms step:328/1480 train_time:46683ms step_avg:146.80ms step:329/1480 train_time:46834ms step_avg:146.81ms step:330/1480 train_time:46986ms step_avg:146.83ms step:331/1480 train_time:47141ms step_avg:146.86ms step:332/1480 train_time:47295ms step_avg:146.88ms step:333/1480 train_time:47450ms step_avg:146.90ms step:334/1480 train_time:47603ms step_avg:146.92ms step:335/1480 train_time:47757ms step_avg:146.94ms step:336/1480 train_time:47910ms step_avg:146.96ms step:337/1480 train_time:48064ms step_avg:146.98ms step:338/1480 train_time:48217ms step_avg:147.00ms step:339/1480 train_time:48371ms step_avg:147.02ms step:340/1480 train_time:48526ms step_avg:147.05ms step:341/1480 
train_time:48680ms step_avg:147.07ms step:342/1480 train_time:48834ms step_avg:147.09ms step:343/1480 train_time:48990ms step_avg:147.12ms step:344/1480 train_time:49144ms step_avg:147.14ms step:345/1480 train_time:49298ms step_avg:147.16ms step:346/1480 train_time:49453ms step_avg:147.18ms step:347/1480 train_time:49607ms step_avg:147.20ms step:348/1480 train_time:49761ms step_avg:147.22ms step:349/1480 train_time:49915ms step_avg:147.24ms step:350/1480 train_time:50070ms step_avg:147.26ms step:351/1480 train_time:50223ms step_avg:147.28ms step:352/1480 train_time:50376ms step_avg:147.30ms step:353/1480 train_time:50529ms step_avg:147.32ms step:354/1480 train_time:50682ms step_avg:147.33ms step:355/1480 train_time:50837ms step_avg:147.35ms step:356/1480 train_time:50992ms step_avg:147.37ms step:357/1480 train_time:51147ms step_avg:147.40ms step:358/1480 train_time:51301ms step_avg:147.42ms step:359/1480 train_time:51455ms step_avg:147.44ms step:360/1480 train_time:51610ms step_avg:147.46ms step:361/1480 train_time:51766ms step_avg:147.48ms step:362/1480 train_time:51919ms step_avg:147.50ms step:363/1480 train_time:52073ms step_avg:147.51ms step:364/1480 train_time:52228ms step_avg:147.54ms step:365/1480 train_time:52382ms step_avg:147.55ms step:366/1480 train_time:52534ms step_avg:147.57ms step:367/1480 train_time:52688ms step_avg:147.58ms step:368/1480 train_time:52841ms step_avg:147.60ms step:369/1480 train_time:52995ms step_avg:147.62ms step:370/1480 train_time:53148ms step_avg:147.63ms step:371/1480 train_time:53301ms step_avg:147.65ms step:372/1480 train_time:53456ms step_avg:147.67ms step:373/1480 train_time:53610ms step_avg:147.69ms step:374/1480 train_time:53764ms step_avg:147.70ms step:375/1480 train_time:53918ms step_avg:147.72ms step:375/1480 val_loss:3.8090 train_time:53978ms step_avg:147.89ms step:376/1480 train_time:54075ms step_avg:147.74ms step:377/1480 train_time:54230ms step_avg:147.77ms step:378/1480 train_time:54383ms step_avg:147.78ms 
step:379/1480 train_time:54535ms step_avg:147.79ms step:380/1480 train_time:54687ms step_avg:147.80ms step:381/1480 train_time:54839ms step_avg:147.81ms step:382/1480 train_time:54992ms step_avg:147.83ms step:383/1480 train_time:55150ms step_avg:147.85ms step:384/1480 train_time:55306ms step_avg:147.88ms step:385/1480 train_time:55461ms step_avg:147.90ms step:386/1480 train_time:55613ms step_avg:147.91ms step:387/1480 train_time:55767ms step_avg:147.92ms step:388/1480 train_time:55921ms step_avg:147.94ms step:389/1480 train_time:56074ms step_avg:147.95ms step:390/1480 train_time:56230ms step_avg:147.97ms step:391/1480 train_time:56384ms step_avg:147.99ms step:392/1480 train_time:56537ms step_avg:148.00ms step:393/1480 train_time:56691ms step_avg:148.02ms step:394/1480 train_time:56846ms step_avg:148.04ms step:395/1480 train_time:56998ms step_avg:148.05ms step:396/1480 train_time:57151ms step_avg:148.06ms step:397/1480 train_time:57304ms step_avg:148.07ms step:398/1480 train_time:57458ms step_avg:148.09ms step:399/1480 train_time:57611ms step_avg:148.10ms step:400/1480 train_time:57767ms step_avg:148.12ms step:401/1480 train_time:57921ms step_avg:148.14ms step:402/1480 train_time:58074ms step_avg:148.15ms step:403/1480 train_time:58230ms step_avg:148.17ms step:404/1480 train_time:58385ms step_avg:148.18ms step:405/1480 train_time:58539ms step_avg:148.20ms step:406/1480 train_time:58692ms step_avg:148.21ms step:407/1480 train_time:58846ms step_avg:148.23ms step:408/1480 train_time:58998ms step_avg:148.24ms step:409/1480 train_time:59152ms step_avg:148.25ms step:410/1480 train_time:59305ms step_avg:148.26ms step:411/1480 train_time:59458ms step_avg:148.27ms step:412/1480 train_time:59611ms step_avg:148.29ms step:413/1480 train_time:59767ms step_avg:148.30ms step:414/1480 train_time:59920ms step_avg:148.32ms step:415/1480 train_time:60073ms step_avg:148.33ms step:416/1480 train_time:60227ms step_avg:148.34ms step:417/1480 train_time:60380ms step_avg:148.35ms 
step:418/1480 train_time:60534ms step_avg:148.37ms step:419/1480 train_time:60686ms step_avg:148.38ms step:420/1480 train_time:60841ms step_avg:148.39ms step:421/1480 train_time:60995ms step_avg:148.41ms step:422/1480 train_time:61148ms step_avg:148.42ms step:423/1480 train_time:61302ms step_avg:148.43ms step:424/1480 train_time:61456ms step_avg:148.44ms step:425/1480 train_time:61611ms step_avg:148.46ms step:426/1480 train_time:61765ms step_avg:148.47ms step:427/1480 train_time:61920ms step_avg:148.49ms step:428/1480 train_time:62073ms step_avg:148.50ms step:429/1480 train_time:62227ms step_avg:148.51ms step:430/1480 train_time:62380ms step_avg:148.52ms step:431/1480 train_time:62533ms step_avg:148.53ms step:432/1480 train_time:62686ms step_avg:148.55ms step:433/1480 train_time:62840ms step_avg:148.56ms step:434/1480 train_time:62994ms step_avg:148.57ms step:435/1480 train_time:63148ms step_avg:148.58ms step:436/1480 train_time:63302ms step_avg:148.60ms step:437/1480 train_time:63456ms step_avg:148.61ms step:438/1480 train_time:63608ms step_avg:148.62ms step:439/1480 train_time:63763ms step_avg:148.63ms step:440/1480 train_time:63917ms step_avg:148.64ms step:441/1480 train_time:64075ms step_avg:148.67ms step:442/1480 train_time:64231ms step_avg:148.68ms step:443/1480 train_time:64388ms step_avg:148.70ms step:444/1480 train_time:64547ms step_avg:148.73ms step:445/1480 train_time:64704ms step_avg:148.74ms step:446/1480 train_time:64859ms step_avg:148.76ms step:447/1480 train_time:65015ms step_avg:148.77ms step:448/1480 train_time:65171ms step_avg:148.79ms step:449/1480 train_time:65330ms step_avg:148.81ms step:450/1480 train_time:65487ms step_avg:148.83ms step:451/1480 train_time:65648ms step_avg:148.86ms step:452/1480 train_time:65804ms step_avg:148.88ms step:453/1480 train_time:65960ms step_avg:148.89ms step:454/1480 train_time:66115ms step_avg:148.91ms step:455/1480 train_time:66271ms step_avg:148.92ms step:456/1480 train_time:66429ms step_avg:148.94ms 
step:457/1480 train_time:66587ms step_avg:148.96ms step:458/1480 train_time:66744ms step_avg:148.98ms step:459/1480 train_time:66903ms step_avg:149.00ms step:460/1480 train_time:67060ms step_avg:149.02ms step:461/1480 train_time:67218ms step_avg:149.04ms step:462/1480 train_time:67374ms step_avg:149.06ms step:463/1480 train_time:67532ms step_avg:149.08ms step:464/1480 train_time:67689ms step_avg:149.09ms step:465/1480 train_time:67847ms step_avg:149.11ms step:466/1480 train_time:68003ms step_avg:149.13ms step:467/1480 train_time:68161ms step_avg:149.15ms step:468/1480 train_time:68316ms step_avg:149.16ms step:469/1480 train_time:68473ms step_avg:149.18ms step:470/1480 train_time:68630ms step_avg:149.20ms step:471/1480 train_time:68786ms step_avg:149.21ms step:472/1480 train_time:68943ms step_avg:149.23ms step:473/1480 train_time:69099ms step_avg:149.24ms step:474/1480 train_time:69255ms step_avg:149.26ms step:475/1480 train_time:69411ms step_avg:149.27ms step:476/1480 train_time:69569ms step_avg:149.29ms step:477/1480 train_time:69727ms step_avg:149.31ms step:478/1480 train_time:69884ms step_avg:149.33ms step:479/1480 train_time:70041ms step_avg:149.34ms step:480/1480 train_time:70198ms step_avg:149.36ms step:481/1480 train_time:70354ms step_avg:149.37ms step:482/1480 train_time:70509ms step_avg:149.38ms step:483/1480 train_time:70667ms step_avg:149.40ms step:484/1480 train_time:70824ms step_avg:149.42ms step:485/1480 train_time:70980ms step_avg:149.43ms step:486/1480 train_time:71137ms step_avg:149.45ms step:487/1480 train_time:71293ms step_avg:149.46ms step:488/1480 train_time:71451ms step_avg:149.48ms step:489/1480 train_time:71607ms step_avg:149.49ms step:490/1480 train_time:71765ms step_avg:149.51ms step:491/1480 train_time:71921ms step_avg:149.52ms step:492/1480 train_time:72077ms step_avg:149.54ms step:493/1480 train_time:72234ms step_avg:149.55ms step:494/1480 train_time:72391ms step_avg:149.57ms step:495/1480 train_time:72549ms step_avg:149.58ms 
step:496/1480 train_time:72707ms step_avg:149.60ms step:497/1480 train_time:72864ms step_avg:149.62ms step:498/1480 train_time:73022ms step_avg:149.64ms step:499/1480 train_time:73180ms step_avg:149.65ms step:500/1480 train_time:73337ms step_avg:149.67ms step:500/1480 val_loss:3.6887 train_time:73398ms step_avg:149.79ms step:501/1480 train_time:73496ms step_avg:149.69ms step:502/1480 train_time:73654ms step_avg:149.70ms step:503/1480 train_time:73811ms step_avg:149.72ms step:504/1480 train_time:73967ms step_avg:149.73ms step:505/1480 train_time:74122ms step_avg:149.74ms step:506/1480 train_time:74279ms step_avg:149.76ms step:507/1480 train_time:74435ms step_avg:149.77ms step:508/1480 train_time:74593ms step_avg:149.78ms step:509/1480 train_time:74751ms step_avg:149.80ms step:510/1480 train_time:74908ms step_avg:149.82ms step:511/1480 train_time:75064ms step_avg:149.83ms step:512/1480 train_time:75223ms step_avg:149.85ms step:513/1480 train_time:75378ms step_avg:149.86ms step:514/1480 train_time:75535ms step_avg:149.87ms step:515/1480 train_time:75692ms step_avg:149.88ms step:516/1480 train_time:75852ms step_avg:149.91ms step:517/1480 train_time:76010ms step_avg:149.92ms step:518/1480 train_time:76167ms step_avg:149.93ms step:519/1480 train_time:76324ms step_avg:149.95ms step:520/1480 train_time:76480ms step_avg:149.96ms step:521/1480 train_time:76635ms step_avg:149.97ms step:522/1480 train_time:76792ms step_avg:149.99ms step:523/1480 train_time:76952ms step_avg:150.00ms step:524/1480 train_time:77110ms step_avg:150.02ms step:525/1480 train_time:77267ms step_avg:150.03ms step:526/1480 train_time:77427ms step_avg:150.05ms step:527/1480 train_time:77582ms step_avg:150.06ms step:528/1480 train_time:77737ms step_avg:150.07ms step:529/1480 train_time:77893ms step_avg:150.08ms step:530/1480 train_time:78052ms step_avg:150.10ms step:531/1480 train_time:78208ms step_avg:150.11ms step:532/1480 train_time:78364ms step_avg:150.12ms step:533/1480 train_time:78522ms 
step_avg:150.14ms step:534/1480 train_time:78677ms step_avg:150.15ms step:535/1480 train_time:78833ms step_avg:150.16ms step:536/1480 train_time:78991ms step_avg:150.17ms step:537/1480 train_time:79149ms step_avg:150.19ms step:538/1480 train_time:79307ms step_avg:150.20ms step:539/1480 train_time:79464ms step_avg:150.22ms step:540/1480 train_time:79620ms step_avg:150.23ms step:541/1480 train_time:79776ms step_avg:150.24ms step:542/1480 train_time:79932ms step_avg:150.25ms step:543/1480 train_time:80089ms step_avg:150.26ms step:544/1480 train_time:80246ms step_avg:150.27ms step:545/1480 train_time:80403ms step_avg:150.29ms step:546/1480 train_time:80559ms step_avg:150.30ms step:547/1480 train_time:80716ms step_avg:150.31ms step:548/1480 train_time:80874ms step_avg:150.32ms step:549/1480 train_time:81031ms step_avg:150.34ms step:550/1480 train_time:81187ms step_avg:150.35ms step:551/1480 train_time:81345ms step_avg:150.36ms step:552/1480 train_time:81503ms step_avg:150.37ms step:553/1480 train_time:81661ms step_avg:150.39ms step:554/1480 train_time:81819ms step_avg:150.40ms step:555/1480 train_time:81979ms step_avg:150.42ms step:556/1480 train_time:82136ms step_avg:150.43ms step:557/1480 train_time:82296ms step_avg:150.45ms step:558/1480 train_time:82456ms step_avg:150.47ms step:559/1480 train_time:82615ms step_avg:150.48ms step:560/1480 train_time:82775ms step_avg:150.50ms step:561/1480 train_time:82933ms step_avg:150.51ms step:562/1480 train_time:83093ms step_avg:150.53ms step:563/1480 train_time:83253ms step_avg:150.55ms step:564/1480 train_time:83414ms step_avg:150.57ms step:565/1480 train_time:83573ms step_avg:150.58ms step:566/1480 train_time:83734ms step_avg:150.60ms step:567/1480 train_time:83893ms step_avg:150.62ms step:568/1480 train_time:84054ms step_avg:150.63ms step:569/1480 train_time:84214ms step_avg:150.65ms step:570/1480 train_time:84373ms step_avg:150.67ms step:571/1480 train_time:84534ms step_avg:150.68ms step:572/1480 train_time:84693ms 
step_avg:150.70ms step:573/1480 train_time:84855ms step_avg:150.72ms step:574/1480 train_time:85016ms step_avg:150.74ms step:575/1480 train_time:85176ms step_avg:150.75ms step:576/1480 train_time:85336ms step_avg:150.77ms step:577/1480 train_time:85496ms step_avg:150.79ms step:578/1480 train_time:85655ms step_avg:150.80ms step:579/1480 train_time:85814ms step_avg:150.82ms step:580/1480 train_time:85973ms step_avg:150.83ms step:581/1480 train_time:86133ms step_avg:150.85ms step:582/1480 train_time:86293ms step_avg:150.86ms step:583/1480 train_time:86454ms step_avg:150.88ms step:584/1480 train_time:86614ms step_avg:150.90ms step:585/1480 train_time:86774ms step_avg:150.91ms step:586/1480 train_time:86934ms step_avg:150.93ms step:587/1480 train_time:87094ms step_avg:150.94ms step:588/1480 train_time:87253ms step_avg:150.96ms step:589/1480 train_time:87414ms step_avg:150.97ms step:590/1480 train_time:87575ms step_avg:150.99ms step:591/1480 train_time:87733ms step_avg:151.00ms step:592/1480 train_time:87893ms step_avg:151.02ms step:593/1480 train_time:88055ms step_avg:151.04ms step:594/1480 train_time:88216ms step_avg:151.05ms step:595/1480 train_time:88376ms step_avg:151.07ms step:596/1480 train_time:88537ms step_avg:151.09ms step:597/1480 train_time:88695ms step_avg:151.10ms step:598/1480 train_time:88854ms step_avg:151.11ms step:599/1480 train_time:89013ms step_avg:151.13ms step:600/1480 train_time:89174ms step_avg:151.14ms step:601/1480 train_time:89333ms step_avg:151.16ms step:602/1480 train_time:89494ms step_avg:151.17ms step:603/1480 train_time:89655ms step_avg:151.19ms step:604/1480 train_time:89814ms step_avg:151.20ms step:605/1480 train_time:89973ms step_avg:151.22ms step:606/1480 train_time:90135ms step_avg:151.23ms step:607/1480 train_time:90296ms step_avg:151.25ms step:608/1480 train_time:90455ms step_avg:151.26ms step:609/1480 train_time:90615ms step_avg:151.28ms step:610/1480 train_time:90773ms step_avg:151.29ms step:611/1480 train_time:90935ms 
step_avg:151.31ms step:612/1480 train_time:91095ms step_avg:151.32ms step:613/1480 train_time:91256ms step_avg:151.34ms step:614/1480 train_time:91416ms step_avg:151.35ms step:615/1480 train_time:91575ms step_avg:151.36ms step:616/1480 train_time:91734ms step_avg:151.38ms step:617/1480 train_time:91893ms step_avg:151.39ms step:618/1480 train_time:92053ms step_avg:151.40ms step:619/1480 train_time:92213ms step_avg:151.42ms step:620/1480 train_time:92372ms step_avg:151.43ms step:621/1480 train_time:92533ms step_avg:151.45ms step:622/1480 train_time:92693ms step_avg:151.46ms step:623/1480 train_time:92855ms step_avg:151.48ms step:624/1480 train_time:93014ms step_avg:151.49ms step:625/1480 train_time:93174ms step_avg:151.50ms step:625/1480 val_loss:3.6096 train_time:93239ms step_avg:151.61ms step:626/1480 train_time:93338ms step_avg:151.52ms step:627/1480 train_time:93501ms step_avg:151.54ms step:628/1480 train_time:93659ms step_avg:151.55ms step:629/1480 train_time:93818ms step_avg:151.56ms step:630/1480 train_time:93977ms step_avg:151.58ms step:631/1480 train_time:94136ms step_avg:151.59ms step:632/1480 train_time:94296ms step_avg:151.60ms step:633/1480 train_time:94456ms step_avg:151.62ms step:634/1480 train_time:94619ms step_avg:151.63ms step:635/1480 train_time:94779ms step_avg:151.65ms step:636/1480 train_time:94939ms step_avg:151.66ms step:637/1480 train_time:95099ms step_avg:151.67ms step:638/1480 train_time:95258ms step_avg:151.69ms step:639/1480 train_time:95420ms step_avg:151.70ms step:640/1480 train_time:95580ms step_avg:151.71ms step:641/1480 train_time:95740ms step_avg:151.73ms step:642/1480 train_time:95900ms step_avg:151.74ms step:643/1480 train_time:96059ms step_avg:151.75ms step:644/1480 train_time:96217ms step_avg:151.76ms step:645/1480 train_time:96377ms step_avg:151.77ms step:646/1480 train_time:96536ms step_avg:151.79ms step:647/1480 train_time:96695ms step_avg:151.80ms step:648/1480 train_time:96856ms step_avg:151.81ms step:649/1480 
train_time:97017ms step_avg:151.83ms step:650/1480 train_time:97176ms step_avg:151.84ms step:651/1480 train_time:97336ms step_avg:151.85ms step:652/1480 train_time:97496ms step_avg:151.86ms step:653/1480 train_time:97654ms step_avg:151.87ms step:654/1480 train_time:97814ms step_avg:151.88ms step:655/1480 train_time:97974ms step_avg:151.90ms step:656/1480 train_time:98134ms step_avg:151.91ms step:657/1480 train_time:98293ms step_avg:151.92ms step:658/1480 train_time:98453ms step_avg:151.93ms step:659/1480 train_time:98617ms step_avg:151.95ms step:660/1480 train_time:98779ms step_avg:151.97ms step:661/1480 train_time:98941ms step_avg:151.98ms step:662/1480 train_time:99101ms step_avg:152.00ms step:663/1480 train_time:99261ms step_avg:152.01ms step:664/1480 train_time:99423ms step_avg:152.02ms step:665/1480 train_time:99584ms step_avg:152.04ms step:666/1480 train_time:99745ms step_avg:152.05ms step:667/1480 train_time:99907ms step_avg:152.07ms step:668/1480 train_time:100069ms step_avg:152.08ms step:669/1480 train_time:100230ms step_avg:152.09ms step:670/1480 train_time:100389ms step_avg:152.10ms step:671/1480 train_time:100548ms step_avg:152.12ms step:672/1480 train_time:100709ms step_avg:152.13ms step:673/1480 train_time:100871ms step_avg:152.14ms step:674/1480 train_time:101032ms step_avg:152.16ms step:675/1480 train_time:101195ms step_avg:152.17ms step:676/1480 train_time:101359ms step_avg:152.19ms step:677/1480 train_time:101521ms step_avg:152.21ms step:678/1480 train_time:101682ms step_avg:152.22ms step:679/1480 train_time:101843ms step_avg:152.23ms step:680/1480 train_time:102006ms step_avg:152.25ms step:681/1480 train_time:102165ms step_avg:152.26ms step:682/1480 train_time:102326ms step_avg:152.27ms step:683/1480 train_time:102489ms step_avg:152.29ms step:684/1480 train_time:102650ms step_avg:152.30ms step:685/1480 train_time:102813ms step_avg:152.32ms step:686/1480 train_time:102974ms step_avg:152.33ms step:687/1480 train_time:103134ms step_avg:152.34ms 
step:688/1480 train_time:103299ms step_avg:152.36ms step:689/1480 train_time:103463ms step_avg:152.38ms step:690/1480 train_time:103625ms step_avg:152.39ms step:691/1480 train_time:103787ms step_avg:152.40ms step:692/1480 train_time:103947ms step_avg:152.41ms step:693/1480 train_time:104107ms step_avg:152.43ms step:694/1480 train_time:104268ms step_avg:152.44ms step:695/1480 train_time:104427ms step_avg:152.45ms step:696/1480 train_time:104587ms step_avg:152.46ms step:697/1480 train_time:104749ms step_avg:152.47ms step:698/1480 train_time:104910ms step_avg:152.49ms step:699/1480 train_time:105073ms step_avg:152.50ms step:700/1480 train_time:105234ms step_avg:152.51ms step:701/1480 train_time:105396ms step_avg:152.53ms step:702/1480 train_time:105557ms step_avg:152.54ms step:703/1480 train_time:105718ms step_avg:152.55ms step:704/1480 train_time:105880ms step_avg:152.56ms step:705/1480 train_time:106043ms step_avg:152.58ms step:706/1480 train_time:106206ms step_avg:152.60ms step:707/1480 train_time:106366ms step_avg:152.61ms step:708/1480 train_time:106528ms step_avg:152.62ms step:709/1480 train_time:106691ms step_avg:152.63ms step:710/1480 train_time:106851ms step_avg:152.64ms step:711/1480 train_time:107016ms step_avg:152.66ms step:712/1480 train_time:107182ms step_avg:152.68ms step:713/1480 train_time:107346ms step_avg:152.70ms step:714/1480 train_time:107507ms step_avg:152.71ms step:715/1480 train_time:107666ms step_avg:152.72ms step:716/1480 train_time:107825ms step_avg:152.73ms step:717/1480 train_time:107987ms step_avg:152.74ms step:718/1480 train_time:108146ms step_avg:152.75ms step:719/1480 train_time:108305ms step_avg:152.76ms step:720/1480 train_time:108467ms step_avg:152.77ms step:721/1480 train_time:108627ms step_avg:152.78ms step:722/1480 train_time:108788ms step_avg:152.79ms step:723/1480 train_time:108947ms step_avg:152.80ms step:724/1480 train_time:109110ms step_avg:152.82ms step:725/1480 train_time:109274ms step_avg:152.83ms step:726/1480 
train_time:109437ms step_avg:152.84ms step:727/1480 train_time:109600ms step_avg:152.86ms step:728/1480 train_time:109761ms step_avg:152.87ms step:729/1480 train_time:109923ms step_avg:152.88ms step:730/1480 train_time:110087ms step_avg:152.90ms step:731/1480 train_time:110247ms step_avg:152.91ms step:732/1480 train_time:110406ms step_avg:152.92ms step:733/1480 train_time:110567ms step_avg:152.93ms step:734/1480 train_time:110727ms step_avg:152.94ms step:735/1480 train_time:110887ms step_avg:152.95ms step:736/1480 train_time:111048ms step_avg:152.96ms step:737/1480 train_time:111212ms step_avg:152.97ms step:738/1480 train_time:111373ms step_avg:152.99ms step:739/1480 train_time:111534ms step_avg:153.00ms step:740/1480 train_time:111700ms step_avg:153.01ms step:741/1480 train_time:111863ms step_avg:153.03ms step:742/1480 train_time:112025ms step_avg:153.04ms step:743/1480 train_time:112186ms step_avg:153.05ms step:744/1480 train_time:112348ms step_avg:153.06ms step:745/1480 train_time:112511ms step_avg:153.08ms step:746/1480 train_time:112670ms step_avg:153.08ms step:747/1480 train_time:112832ms step_avg:153.10ms step:748/1480 train_time:112998ms step_avg:153.11ms step:749/1480 train_time:113162ms step_avg:153.13ms step:750/1480 train_time:113322ms step_avg:153.14ms step:750/1480 val_loss:3.5540 train_time:113386ms step_avg:153.22ms step:751/1480 train_time:113486ms step_avg:153.15ms step:752/1480 train_time:113647ms step_avg:153.16ms step:753/1480 train_time:113807ms step_avg:153.17ms step:754/1480 train_time:113968ms step_avg:153.18ms step:755/1480 train_time:114129ms step_avg:153.19ms step:756/1480 train_time:114288ms step_avg:153.20ms step:757/1480 train_time:114454ms step_avg:153.22ms step:758/1480 train_time:114615ms step_avg:153.23ms step:759/1480 train_time:114777ms step_avg:153.24ms step:760/1480 train_time:114941ms step_avg:153.25ms step:761/1480 train_time:115103ms step_avg:153.27ms step:762/1480 train_time:115264ms step_avg:153.28ms step:763/1480 
train_time:115425ms step_avg:153.29ms step:764/1480 train_time:115586ms step_avg:153.30ms step:765/1480 train_time:115747ms step_avg:153.31ms step:766/1480 train_time:115909ms step_avg:153.32ms step:767/1480 train_time:116072ms step_avg:153.33ms step:768/1480 train_time:116235ms step_avg:153.34ms step:769/1480 train_time:116399ms step_avg:153.36ms step:770/1480 train_time:116563ms step_avg:153.37ms step:771/1480 train_time:116724ms step_avg:153.38ms step:772/1480 train_time:116886ms step_avg:153.39ms step:773/1480 train_time:117048ms step_avg:153.40ms step:774/1480 train_time:117209ms step_avg:153.41ms step:775/1480 train_time:117372ms step_avg:153.43ms step:776/1480 train_time:117538ms step_avg:153.44ms step:777/1480 train_time:117705ms step_avg:153.46ms step:778/1480 train_time:117867ms step_avg:153.47ms step:779/1480 train_time:118030ms step_avg:153.49ms step:780/1480 train_time:118193ms step_avg:153.50ms step:781/1480 train_time:118357ms step_avg:153.51ms step:782/1480 train_time:118521ms step_avg:153.52ms step:783/1480 train_time:118683ms step_avg:153.54ms step:784/1480 train_time:118847ms step_avg:153.55ms step:785/1480 train_time:119008ms step_avg:153.56ms step:786/1480 train_time:119174ms step_avg:153.57ms step:787/1480 train_time:119339ms step_avg:153.59ms step:788/1480 train_time:119503ms step_avg:153.60ms step:789/1480 train_time:119664ms step_avg:153.61ms step:790/1480 train_time:119828ms step_avg:153.63ms step:791/1480 train_time:119996ms step_avg:153.64ms step:792/1480 train_time:120161ms step_avg:153.66ms step:793/1480 train_time:120323ms step_avg:153.67ms step:794/1480 train_time:120486ms step_avg:153.68ms step:795/1480 train_time:120653ms step_avg:153.70ms step:796/1480 train_time:120820ms step_avg:153.72ms step:797/1480 train_time:120984ms step_avg:153.73ms step:798/1480 train_time:121147ms step_avg:153.74ms step:799/1480 train_time:121313ms step_avg:153.76ms step:800/1480 train_time:121477ms step_avg:153.77ms step:801/1480 train_time:121641ms 
step_avg:153.78ms step:802/1480 train_time:121807ms step_avg:153.80ms step:803/1480 train_time:121969ms step_avg:153.81ms step:804/1480 train_time:122129ms step_avg:153.81ms step:805/1480 train_time:122294ms step_avg:153.83ms step:806/1480 train_time:122458ms step_avg:153.84ms step:807/1480 train_time:122620ms step_avg:153.85ms step:808/1480 train_time:122783ms step_avg:153.86ms step:809/1480 train_time:122945ms step_avg:153.87ms step:810/1480 train_time:123106ms step_avg:153.88ms step:811/1480 train_time:123268ms step_avg:153.89ms step:812/1480 train_time:123430ms step_avg:153.90ms step:813/1480 train_time:123591ms step_avg:153.91ms step:814/1480 train_time:123757ms step_avg:153.93ms step:815/1480 train_time:123919ms step_avg:153.94ms step:816/1480 train_time:124083ms step_avg:153.95ms step:817/1480 train_time:124245ms step_avg:153.96ms step:818/1480 train_time:124405ms step_avg:153.97ms step:819/1480 train_time:124569ms step_avg:153.98ms step:820/1480 train_time:124734ms step_avg:153.99ms step:821/1480 train_time:124895ms step_avg:154.00ms step:822/1480 train_time:125060ms step_avg:154.01ms step:823/1480 train_time:125222ms step_avg:154.02ms step:824/1480 train_time:125384ms step_avg:154.03ms step:825/1480 train_time:125549ms step_avg:154.05ms step:826/1480 train_time:125716ms step_avg:154.06ms step:827/1480 train_time:125881ms step_avg:154.08ms step:828/1480 train_time:126044ms step_avg:154.09ms step:829/1480 train_time:126207ms step_avg:154.10ms step:830/1480 train_time:126370ms step_avg:154.11ms step:831/1480 train_time:126535ms step_avg:154.12ms step:832/1480 train_time:126699ms step_avg:154.14ms step:833/1480 train_time:126865ms step_avg:154.15ms step:834/1480 train_time:127028ms step_avg:154.16ms step:835/1480 train_time:127191ms step_avg:154.17ms step:836/1480 train_time:127358ms step_avg:154.19ms step:837/1480 train_time:127520ms step_avg:154.20ms step:838/1480 train_time:127684ms step_avg:154.21ms step:839/1480 train_time:127847ms step_avg:154.22ms 
step:840/1480 train_time:128007ms step_avg:154.23ms step:841/1480 train_time:128167ms step_avg:154.23ms step:842/1480 train_time:128331ms step_avg:154.24ms step:843/1480 train_time:128492ms step_avg:154.25ms step:844/1480 train_time:128656ms step_avg:154.26ms step:845/1480 train_time:128820ms step_avg:154.28ms step:846/1480 train_time:128984ms step_avg:154.29ms step:847/1480 train_time:129146ms step_avg:154.30ms step:848/1480 train_time:129308ms step_avg:154.31ms step:849/1480 train_time:129470ms step_avg:154.31ms step:850/1480 train_time:129634ms step_avg:154.33ms step:851/1480 train_time:129799ms step_avg:154.34ms step:852/1480 train_time:129962ms step_avg:154.35ms step:853/1480 train_time:130124ms step_avg:154.36ms step:854/1480 train_time:130287ms step_avg:154.37ms step:855/1480 train_time:130451ms step_avg:154.38ms step:856/1480 train_time:130615ms step_avg:154.39ms step:857/1480 train_time:130780ms step_avg:154.40ms step:858/1480 train_time:130946ms step_avg:154.42ms step:859/1480 train_time:131109ms step_avg:154.43ms step:860/1480 train_time:131270ms step_avg:154.44ms step:861/1480 train_time:131435ms step_avg:154.45ms step:862/1480 train_time:131603ms step_avg:154.46ms step:863/1480 train_time:131771ms step_avg:154.48ms step:864/1480 train_time:131936ms step_avg:154.49ms step:865/1480 train_time:132099ms step_avg:154.50ms step:866/1480 train_time:132267ms step_avg:154.52ms step:867/1480 train_time:132429ms step_avg:154.53ms step:868/1480 train_time:132589ms step_avg:154.53ms step:869/1480 train_time:132752ms step_avg:154.54ms step:870/1480 train_time:132918ms step_avg:154.56ms step:871/1480 train_time:133081ms step_avg:154.57ms step:872/1480 train_time:133245ms step_avg:154.58ms step:873/1480 train_time:133407ms step_avg:154.58ms step:874/1480 train_time:133573ms step_avg:154.60ms step:875/1480 train_time:133739ms step_avg:154.61ms step:875/1480 val_loss:3.5080 train_time:133804ms step_avg:154.69ms step:876/1480 train_time:133906ms step_avg:154.63ms 
step:877/1480 train_time:134070ms step_avg:154.64ms step:878/1480 train_time:134232ms step_avg:154.65ms step:879/1480 train_time:134396ms step_avg:154.66ms step:880/1480 train_time:134558ms step_avg:154.66ms step:881/1480 train_time:134720ms step_avg:154.67ms step:882/1480 train_time:134887ms step_avg:154.69ms step:883/1480 train_time:135053ms step_avg:154.70ms step:884/1480 train_time:135218ms step_avg:154.71ms step:885/1480 train_time:135383ms step_avg:154.72ms step:886/1480 train_time:135550ms step_avg:154.74ms step:887/1480 train_time:135717ms step_avg:154.75ms step:888/1480 train_time:135891ms step_avg:154.77ms step:889/1480 train_time:136059ms step_avg:154.79ms step:890/1480 train_time:136223ms step_avg:154.80ms step:891/1480 train_time:136389ms step_avg:154.81ms step:892/1480 train_time:136554ms step_avg:154.82ms step:893/1480 train_time:136716ms step_avg:154.83ms step:894/1480 train_time:136883ms step_avg:154.85ms step:895/1480 train_time:137049ms step_avg:154.86ms step:896/1480 train_time:137214ms step_avg:154.87ms step:897/1480 train_time:137382ms step_avg:154.88ms step:898/1480 train_time:137550ms step_avg:154.90ms step:899/1480 train_time:137713ms step_avg:154.91ms step:900/1480 train_time:137876ms step_avg:154.92ms step:901/1480 train_time:138039ms step_avg:154.93ms step:902/1480 train_time:138202ms step_avg:154.94ms step:903/1480 train_time:138372ms step_avg:154.95ms step:904/1480 train_time:138537ms step_avg:154.96ms step:905/1480 train_time:138699ms step_avg:154.97ms step:906/1480 train_time:138866ms step_avg:154.98ms step:907/1480 train_time:139033ms step_avg:155.00ms step:908/1480 train_time:139195ms step_avg:155.01ms step:909/1480 train_time:139359ms step_avg:155.02ms step:910/1480 train_time:139529ms step_avg:155.03ms step:911/1480 train_time:139694ms step_avg:155.04ms step:912/1480 train_time:139862ms step_avg:155.06ms step:913/1480 train_time:140030ms step_avg:155.07ms step:914/1480 train_time:140196ms step_avg:155.08ms step:915/1480 
train_time:140367ms step_avg:155.10ms step:916/1480 train_time:140531ms step_avg:155.11ms step:917/1480 train_time:140695ms step_avg:155.12ms step:918/1480 train_time:140863ms step_avg:155.14ms step:919/1480 train_time:141033ms step_avg:155.15ms step:920/1480 train_time:141197ms step_avg:155.16ms step:921/1480 train_time:141362ms step_avg:155.17ms step:922/1480 train_time:141530ms step_avg:155.19ms step:923/1480 train_time:141694ms step_avg:155.20ms step:924/1480 train_time:141858ms step_avg:155.21ms step:925/1480 train_time:142024ms step_avg:155.22ms step:926/1480 train_time:142188ms step_avg:155.23ms step:927/1480 train_time:142351ms step_avg:155.24ms step:928/1480 train_time:142516ms step_avg:155.25ms step:929/1480 train_time:142683ms step_avg:155.26ms step:930/1480 train_time:142850ms step_avg:155.27ms step:931/1480 train_time:143013ms step_avg:155.28ms step:932/1480 train_time:143180ms step_avg:155.29ms step:933/1480 train_time:143347ms step_avg:155.31ms step:934/1480 train_time:143513ms step_avg:155.32ms step:935/1480 train_time:143682ms step_avg:155.33ms step:936/1480 train_time:143851ms step_avg:155.35ms step:937/1480 train_time:144021ms step_avg:155.36ms step:938/1480 train_time:144184ms step_avg:155.37ms step:939/1480 train_time:144353ms step_avg:155.38ms step:940/1480 train_time:144519ms step_avg:155.40ms step:941/1480 train_time:144682ms step_avg:155.41ms step:942/1480 train_time:144847ms step_avg:155.42ms step:943/1480 train_time:145016ms step_avg:155.43ms step:944/1480 train_time:145189ms step_avg:155.45ms step:945/1480 train_time:145353ms step_avg:155.46ms step:946/1480 train_time:145523ms step_avg:155.47ms step:947/1480 train_time:145691ms step_avg:155.49ms step:948/1480 train_time:145855ms step_avg:155.50ms step:949/1480 train_time:146021ms step_avg:155.51ms step:950/1480 train_time:146185ms step_avg:155.52ms step:951/1480 train_time:146352ms step_avg:155.53ms step:952/1480 train_time:146517ms step_avg:155.54ms step:953/1480 train_time:146685ms 
step_avg:155.55ms step:954/1480 train_time:146852ms step_avg:155.56ms step:955/1480 train_time:147015ms step_avg:155.57ms step:956/1480 train_time:147181ms step_avg:155.58ms step:957/1480 train_time:147349ms step_avg:155.60ms step:958/1480 train_time:147518ms step_avg:155.61ms step:959/1480 train_time:147684ms step_avg:155.62ms step:960/1480 train_time:147853ms step_avg:155.63ms step:961/1480 train_time:148017ms step_avg:155.64ms step:962/1480 train_time:148181ms step_avg:155.65ms step:963/1480 train_time:148347ms step_avg:155.66ms step:964/1480 train_time:148515ms step_avg:155.68ms step:965/1480 train_time:148677ms step_avg:155.68ms step:966/1480 train_time:148842ms step_avg:155.69ms step:967/1480 train_time:149006ms step_avg:155.70ms step:968/1480 train_time:149171ms step_avg:155.71ms step:969/1480 train_time:149338ms step_avg:155.72ms step:970/1480 train_time:149500ms step_avg:155.73ms step:971/1480 train_time:149666ms step_avg:155.74ms step:972/1480 train_time:149831ms step_avg:155.75ms step:973/1480 train_time:149995ms step_avg:155.76ms step:974/1480 train_time:150162ms step_avg:155.77ms step:975/1480 train_time:150328ms step_avg:155.78ms step:976/1480 train_time:150493ms step_avg:155.79ms step:977/1480 train_time:150657ms step_avg:155.80ms step:978/1480 train_time:150823ms step_avg:155.81ms step:979/1480 train_time:150990ms step_avg:155.82ms step:980/1480 train_time:151155ms step_avg:155.83ms step:981/1480 train_time:151324ms step_avg:155.84ms step:982/1480 train_time:151488ms step_avg:155.85ms step:983/1480 train_time:151652ms step_avg:155.86ms step:984/1480 train_time:151815ms step_avg:155.87ms step:985/1480 train_time:151984ms step_avg:155.88ms step:986/1480 train_time:152150ms step_avg:155.89ms step:987/1480 train_time:152313ms step_avg:155.90ms step:988/1480 train_time:152479ms step_avg:155.91ms step:989/1480 train_time:152646ms step_avg:155.92ms step:990/1480 train_time:152815ms step_avg:155.93ms step:991/1480 train_time:152983ms step_avg:155.95ms 
step:992/1480 train_time:153157ms step_avg:155.96ms step:993/1480 train_time:153332ms step_avg:155.98ms step:994/1480 train_time:153497ms step_avg:155.99ms step:995/1480 train_time:153660ms step_avg:156.00ms step:996/1480 train_time:153824ms step_avg:156.01ms step:997/1480 train_time:153989ms step_avg:156.02ms step:998/1480 train_time:154152ms step_avg:156.02ms step:999/1480 train_time:154317ms step_avg:156.03ms step:1000/1480 train_time:154487ms step_avg:156.05ms step:1000/1480 val_loss:3.4435 train_time:154554ms step_avg:156.12ms step:1001/1480 train_time:154660ms step_avg:156.07ms step:1002/1480 train_time:154825ms step_avg:156.07ms step:1003/1480 train_time:154997ms step_avg:156.09ms step:1004/1480 train_time:155166ms step_avg:156.10ms step:1005/1480 train_time:155333ms step_avg:156.11ms step:1006/1480 train_time:155500ms step_avg:156.12ms step:1007/1480 train_time:155665ms step_avg:156.13ms step:1008/1480 train_time:155832ms step_avg:156.14ms step:1009/1480 train_time:156005ms step_avg:156.16ms step:1010/1480 train_time:156169ms step_avg:156.17ms step:1011/1480 train_time:156336ms step_avg:156.18ms step:1012/1480 train_time:156503ms step_avg:156.19ms step:1013/1480 train_time:156673ms step_avg:156.20ms step:1014/1480 train_time:156840ms step_avg:156.22ms step:1015/1480 train_time:157011ms step_avg:156.23ms step:1016/1480 train_time:157179ms step_avg:156.24ms step:1017/1480 train_time:157350ms step_avg:156.26ms step:1018/1480 train_time:157519ms step_avg:156.27ms step:1019/1480 train_time:157686ms step_avg:156.28ms step:1020/1480 train_time:157856ms step_avg:156.29ms step:1021/1480 train_time:158022ms step_avg:156.30ms step:1022/1480 train_time:158189ms step_avg:156.31ms step:1023/1480 train_time:158356ms step_avg:156.32ms step:1024/1480 train_time:158524ms step_avg:156.34ms step:1025/1480 train_time:158695ms step_avg:156.35ms step:1026/1480 train_time:158860ms step_avg:156.36ms step:1027/1480 train_time:159026ms step_avg:156.37ms step:1028/1480 
train_time:159199ms step_avg:156.38ms step:1029/1480 train_time:159373ms step_avg:156.40ms step:1030/1480 train_time:159541ms step_avg:156.41ms step:1031/1480 train_time:159705ms step_avg:156.42ms step:1032/1480 train_time:159878ms step_avg:156.44ms step:1033/1480 train_time:160045ms step_avg:156.45ms step:1034/1480 train_time:160214ms step_avg:156.46ms step:1035/1480 train_time:160382ms step_avg:156.47ms step:1036/1480 train_time:160547ms step_avg:156.48ms step:1037/1480 train_time:160715ms step_avg:156.49ms step:1038/1480 train_time:160884ms step_avg:156.50ms step:1039/1480 train_time:161055ms step_avg:156.52ms step:1040/1480 train_time:161222ms step_avg:156.53ms step:1041/1480 train_time:161388ms step_avg:156.54ms step:1042/1480 train_time:161550ms step_avg:156.54ms step:1043/1480 train_time:161715ms step_avg:156.55ms step:1044/1480 train_time:161880ms step_avg:156.56ms step:1045/1480 train_time:162049ms step_avg:156.57ms step:1046/1480 train_time:162217ms step_avg:156.58ms step:1047/1480 train_time:162385ms step_avg:156.59ms step:1048/1480 train_time:162550ms step_avg:156.60ms step:1049/1480 train_time:162717ms step_avg:156.61ms step:1050/1480 train_time:162886ms step_avg:156.62ms step:1051/1480 train_time:163055ms step_avg:156.63ms step:1052/1480 train_time:163225ms step_avg:156.65ms step:1053/1480 train_time:163391ms step_avg:156.66ms step:1054/1480 train_time:163560ms step_avg:156.67ms step:1055/1480 train_time:163726ms step_avg:156.68ms step:1056/1480 train_time:163890ms step_avg:156.68ms step:1057/1480 train_time:164058ms step_avg:156.69ms step:1058/1480 train_time:164227ms step_avg:156.70ms step:1059/1480 train_time:164400ms step_avg:156.72ms step:1060/1480 train_time:164568ms step_avg:156.73ms step:1061/1480 train_time:164731ms step_avg:156.74ms step:1062/1480 train_time:164897ms step_avg:156.75ms step:1063/1480 train_time:165062ms step_avg:156.75ms step:1064/1480 train_time:165225ms step_avg:156.76ms step:1065/1480 train_time:165391ms step_avg:156.77ms 
step:1066/1480 train_time:165560ms step_avg:156.78ms step:1067/1480 train_time:165729ms step_avg:156.79ms step:1068/1480 train_time:165896ms step_avg:156.80ms step:1069/1480 train_time:166066ms step_avg:156.81ms step:1070/1480 train_time:166231ms step_avg:156.82ms step:1071/1480 train_time:166404ms step_avg:156.84ms step:1072/1480 train_time:166569ms step_avg:156.84ms step:1073/1480 train_time:166733ms step_avg:156.85ms step:1074/1480 train_time:166899ms step_avg:156.86ms step:1075/1480 train_time:167072ms step_avg:156.88ms step:1076/1480 train_time:167240ms step_avg:156.89ms step:1077/1480 train_time:167406ms step_avg:156.89ms step:1078/1480 train_time:167579ms step_avg:156.91ms step:1079/1480 train_time:167750ms step_avg:156.92ms step:1080/1480 train_time:167921ms step_avg:156.94ms step:1081/1480 train_time:168089ms step_avg:156.95ms step:1082/1480 train_time:168256ms step_avg:156.96ms step:1083/1480 train_time:168422ms step_avg:156.96ms step:1084/1480 train_time:168588ms step_avg:156.97ms step:1085/1480 train_time:168757ms step_avg:156.98ms step:1086/1480 train_time:168924ms step_avg:156.99ms step:1087/1480 train_time:169090ms step_avg:157.00ms step:1088/1480 train_time:169260ms step_avg:157.01ms step:1089/1480 train_time:169431ms step_avg:157.03ms step:1090/1480 train_time:169603ms step_avg:157.04ms step:1091/1480 train_time:169771ms step_avg:157.05ms step:1092/1480 train_time:169939ms step_avg:157.06ms step:1093/1480 train_time:170107ms step_avg:157.07ms step:1094/1480 train_time:170272ms step_avg:157.08ms step:1095/1480 train_time:170438ms step_avg:157.09ms step:1096/1480 train_time:170605ms step_avg:157.10ms step:1097/1480 train_time:170774ms step_avg:157.11ms step:1098/1480 train_time:170945ms step_avg:157.12ms step:1099/1480 train_time:171116ms step_avg:157.13ms step:1100/1480 train_time:171286ms step_avg:157.14ms step:1101/1480 train_time:171458ms step_avg:157.16ms step:1102/1480 train_time:171630ms step_avg:157.17ms step:1103/1480 train_time:171806ms 
step_avg:157.19ms step:1104/1480 train_time:171974ms step_avg:157.20ms step:1105/1480 train_time:172144ms step_avg:157.21ms step:1106/1480 train_time:172312ms step_avg:157.22ms step:1107/1480 train_time:172481ms step_avg:157.23ms step:1108/1480 train_time:172646ms step_avg:157.24ms step:1109/1480 train_time:172812ms step_avg:157.24ms step:1110/1480 train_time:172979ms step_avg:157.25ms step:1111/1480 train_time:173146ms step_avg:157.26ms step:1112/1480 train_time:173315ms step_avg:157.27ms step:1113/1480 train_time:173497ms step_avg:157.30ms step:1114/1480 train_time:173670ms step_avg:157.31ms step:1115/1480 train_time:173844ms step_avg:157.32ms step:1116/1480 train_time:174010ms step_avg:157.33ms step:1117/1480 train_time:174183ms step_avg:157.35ms step:1118/1480 train_time:174358ms step_avg:157.36ms step:1119/1480 train_time:174524ms step_avg:157.37ms step:1120/1480 train_time:174691ms step_avg:157.38ms step:1121/1480 train_time:174861ms step_avg:157.39ms step:1122/1480 train_time:175027ms step_avg:157.40ms step:1123/1480 train_time:175194ms step_avg:157.41ms step:1124/1480 train_time:175363ms step_avg:157.42ms step:1125/1480 train_time:175530ms step_avg:157.43ms step:1125/1480 val_loss:3.3876 train_time:175598ms step_avg:157.49ms step:1126/1480 train_time:175700ms step_avg:157.44ms step:1127/1480 train_time:175871ms step_avg:157.45ms step:1128/1480 train_time:176042ms step_avg:157.46ms step:1129/1480 train_time:176216ms step_avg:157.48ms step:1130/1480 train_time:176386ms step_avg:157.49ms step:1131/1480 train_time:176564ms step_avg:157.51ms step:1132/1480 train_time:176729ms step_avg:157.51ms step:1133/1480 train_time:176900ms step_avg:157.52ms step:1134/1480 train_time:177070ms step_avg:157.54ms step:1135/1480 train_time:177238ms step_avg:157.55ms step:1136/1480 train_time:177408ms step_avg:157.56ms step:1137/1480 train_time:177577ms step_avg:157.57ms step:1138/1480 train_time:177749ms step_avg:157.58ms step:1139/1480 train_time:177916ms step_avg:157.59ms 
step:1140/1480 train_time:178083ms step_avg:157.60ms step:1141/1480 train_time:178254ms step_avg:157.61ms step:1142/1480 train_time:178421ms step_avg:157.62ms step:1143/1480 train_time:178591ms step_avg:157.63ms step:1144/1480 train_time:178759ms step_avg:157.64ms step:1145/1480 train_time:178925ms step_avg:157.64ms step:1146/1480 train_time:179095ms step_avg:157.65ms step:1147/1480 train_time:179266ms step_avg:157.67ms step:1148/1480 train_time:179433ms step_avg:157.67ms step:1149/1480 train_time:179604ms step_avg:157.69ms step:1150/1480 train_time:179774ms step_avg:157.70ms step:1151/1480 train_time:179948ms step_avg:157.71ms step:1152/1480 train_time:180118ms step_avg:157.72ms step:1153/1480 train_time:180291ms step_avg:157.74ms step:1154/1480 train_time:180458ms step_avg:157.74ms step:1155/1480 train_time:180631ms step_avg:157.76ms step:1156/1480 train_time:180811ms step_avg:157.78ms step:1157/1480 train_time:180981ms step_avg:157.79ms step:1158/1480 train_time:181148ms step_avg:157.79ms step:1159/1480 train_time:181314ms step_avg:157.80ms step:1160/1480 train_time:181481ms step_avg:157.81ms step:1161/1480 train_time:181650ms step_avg:157.82ms step:1162/1480 train_time:181819ms step_avg:157.83ms step:1163/1480 train_time:181990ms step_avg:157.84ms step:1164/1480 train_time:182158ms step_avg:157.85ms step:1165/1480 train_time:182323ms step_avg:157.86ms step:1166/1480 train_time:182492ms step_avg:157.86ms step:1167/1480 train_time:182660ms step_avg:157.87ms step:1168/1480 train_time:182829ms step_avg:157.88ms step:1169/1480 train_time:182998ms step_avg:157.89ms step:1170/1480 train_time:183167ms step_avg:157.90ms step:1171/1480 train_time:183334ms step_avg:157.91ms step:1172/1480 train_time:183502ms step_avg:157.92ms step:1173/1480 train_time:183674ms step_avg:157.93ms step:1174/1480 train_time:183855ms step_avg:157.95ms step:1175/1480 train_time:184027ms step_avg:157.96ms step:1176/1480 train_time:184198ms step_avg:157.97ms step:1177/1480 train_time:184374ms 
step_avg:157.99ms step:1178/1480 train_time:184543ms step_avg:158.00ms step:1179/1480 train_time:184709ms step_avg:158.01ms step:1180/1480 train_time:184890ms step_avg:158.03ms step:1181/1480 train_time:185060ms step_avg:158.04ms step:1182/1480 train_time:185228ms step_avg:158.04ms step:1183/1480 train_time:185399ms step_avg:158.06ms step:1184/1480 train_time:185568ms step_avg:158.06ms step:1185/1480 train_time:185740ms step_avg:158.08ms step:1186/1480 train_time:185912ms step_avg:158.09ms step:1187/1480 train_time:186094ms step_avg:158.11ms step:1188/1480 train_time:186261ms step_avg:158.12ms step:1189/1480 train_time:186431ms step_avg:158.13ms step:1190/1480 train_time:186598ms step_avg:158.13ms step:1191/1480 train_time:186770ms step_avg:158.15ms step:1192/1480 train_time:186936ms step_avg:158.15ms step:1193/1480 train_time:187102ms step_avg:158.16ms step:1194/1480 train_time:187272ms step_avg:158.17ms step:1195/1480 train_time:187445ms step_avg:158.18ms step:1196/1480 train_time:187628ms step_avg:158.20ms step:1197/1480 train_time:187798ms step_avg:158.21ms step:1198/1480 train_time:187981ms step_avg:158.23ms step:1199/1480 train_time:188151ms step_avg:158.24ms step:1200/1480 train_time:188322ms step_avg:158.25ms step:1201/1480 train_time:188491ms step_avg:158.26ms step:1202/1480 train_time:188674ms step_avg:158.28ms step:1203/1480 train_time:188850ms step_avg:158.30ms step:1204/1480 train_time:189025ms step_avg:158.31ms step:1205/1480 train_time:189193ms step_avg:158.32ms step:1206/1480 train_time:189361ms step_avg:158.33ms step:1207/1480 train_time:189531ms step_avg:158.34ms step:1208/1480 train_time:189698ms step_avg:158.35ms step:1209/1480 train_time:189872ms step_avg:158.36ms step:1210/1480 train_time:190048ms step_avg:158.37ms step:1211/1480 train_time:190223ms step_avg:158.39ms step:1212/1480 train_time:190394ms step_avg:158.40ms step:1213/1480 train_time:190568ms step_avg:158.41ms step:1214/1480 train_time:190746ms step_avg:158.43ms step:1215/1480 
train_time:190917ms step_avg:158.44ms step:1216/1480 train_time:191087ms step_avg:158.45ms step:1217/1480 train_time:191261ms step_avg:158.46ms step:1218/1480 train_time:191430ms step_avg:158.47ms step:1219/1480 train_time:191609ms step_avg:158.49ms step:1220/1480 train_time:191779ms step_avg:158.49ms step:1221/1480 train_time:191948ms step_avg:158.50ms step:1222/1480 train_time:192113ms step_avg:158.51ms step:1223/1480 train_time:192284ms step_avg:158.52ms step:1224/1480 train_time:192463ms step_avg:158.54ms step:1225/1480 train_time:192634ms step_avg:158.55ms step:1226/1480 train_time:192808ms step_avg:158.56ms step:1227/1480 train_time:192980ms step_avg:158.57ms step:1228/1480 train_time:193150ms step_avg:158.58ms step:1229/1480 train_time:193322ms step_avg:158.59ms step:1230/1480 train_time:193500ms step_avg:158.61ms step:1231/1480 train_time:193676ms step_avg:158.62ms step:1232/1480 train_time:193851ms step_avg:158.63ms step:1233/1480 train_time:194021ms step_avg:158.64ms step:1234/1480 train_time:194191ms step_avg:158.65ms step:1235/1480 train_time:194367ms step_avg:158.67ms step:1236/1480 train_time:194535ms step_avg:158.67ms step:1237/1480 train_time:194705ms step_avg:158.68ms step:1238/1480 train_time:194892ms step_avg:158.71ms step:1239/1480 train_time:195063ms step_avg:158.72ms step:1240/1480 train_time:195233ms step_avg:158.73ms step:1241/1480 train_time:195407ms step_avg:158.74ms step:1242/1480 train_time:195576ms step_avg:158.75ms step:1243/1480 train_time:195750ms step_avg:158.76ms step:1244/1480 train_time:195916ms step_avg:158.76ms step:1245/1480 train_time:196085ms step_avg:158.77ms step:1246/1480 train_time:196254ms step_avg:158.78ms step:1247/1480 train_time:196423ms step_avg:158.79ms step:1248/1480 train_time:196592ms step_avg:158.80ms step:1249/1480 train_time:196761ms step_avg:158.81ms step:1250/1480 train_time:196930ms step_avg:158.81ms step:1250/1480 val_loss:3.3378 train_time:197002ms step_avg:158.87ms step:1251/1480 train_time:197113ms 
step_avg:158.83ms step:1252/1480 train_time:197282ms step_avg:158.84ms step:1253/1480 train_time:197449ms step_avg:158.85ms step:1254/1480 train_time:197621ms step_avg:158.86ms step:1255/1480 train_time:197810ms step_avg:158.88ms step:1256/1480 train_time:197984ms step_avg:158.90ms step:1257/1480 train_time:198153ms step_avg:158.90ms step:1258/1480 train_time:198330ms step_avg:158.92ms step:1259/1480 train_time:198502ms step_avg:158.93ms step:1260/1480 train_time:198669ms step_avg:158.94ms step:1261/1480 train_time:198841ms step_avg:158.95ms step:1262/1480 train_time:199017ms step_avg:158.96ms step:1263/1480 train_time:199192ms step_avg:158.97ms step:1264/1480 train_time:199357ms step_avg:158.98ms step:1265/1480 train_time:199523ms step_avg:158.98ms step:1266/1480 train_time:199695ms step_avg:158.99ms step:1267/1480 train_time:199866ms step_avg:159.00ms step:1268/1480 train_time:200036ms step_avg:159.01ms step:1269/1480 train_time:200213ms step_avg:159.03ms step:1270/1480 train_time:200382ms step_avg:159.03ms step:1271/1480 train_time:200552ms step_avg:159.04ms step:1272/1480 train_time:200717ms step_avg:159.05ms step:1273/1480 train_time:200889ms step_avg:159.06ms step:1274/1480 train_time:201062ms step_avg:159.07ms step:1275/1480 train_time:201230ms step_avg:159.08ms step:1276/1480 train_time:201395ms step_avg:159.08ms step:1277/1480 train_time:201569ms step_avg:159.09ms step:1278/1480 train_time:201737ms step_avg:159.10ms step:1279/1480 train_time:201909ms step_avg:159.11ms step:1280/1480 train_time:202087ms step_avg:159.12ms step:1281/1480 train_time:202255ms step_avg:159.13ms step:1282/1480 train_time:202422ms step_avg:159.14ms step:1283/1480 train_time:202594ms step_avg:159.15ms step:1284/1480 train_time:202764ms step_avg:159.16ms step:1285/1480 train_time:202933ms step_avg:159.16ms step:1286/1480 train_time:203104ms step_avg:159.17ms step:1287/1480 train_time:203274ms step_avg:159.18ms step:1288/1480 train_time:203447ms step_avg:159.19ms step:1289/1480 
train_time:203632ms step_avg:159.21ms step:1290/1480 train_time:203814ms step_avg:159.23ms step:1291/1480 train_time:203987ms step_avg:159.24ms step:1292/1480 train_time:204160ms step_avg:159.25ms step:1293/1480 train_time:204334ms step_avg:159.26ms step:1294/1480 train_time:204504ms step_avg:159.27ms step:1295/1480 train_time:204674ms step_avg:159.28ms step:1296/1480 train_time:204848ms step_avg:159.29ms step:1297/1480 train_time:205018ms step_avg:159.30ms step:1298/1480 train_time:205191ms step_avg:159.31ms step:1299/1480 train_time:205362ms step_avg:159.32ms step:1300/1480 train_time:205530ms step_avg:159.33ms step:1301/1480 train_time:205699ms step_avg:159.33ms step:1302/1480 train_time:205874ms step_avg:159.35ms step:1303/1480 train_time:206051ms step_avg:159.36ms step:1304/1480 train_time:206223ms step_avg:159.37ms step:1305/1480 train_time:206391ms step_avg:159.38ms step:1306/1480 train_time:206567ms step_avg:159.39ms step:1307/1480 train_time:206735ms step_avg:159.39ms step:1308/1480 train_time:206904ms step_avg:159.40ms step:1309/1480 train_time:207076ms step_avg:159.41ms step:1310/1480 train_time:207245ms step_avg:159.42ms step:1311/1480 train_time:207415ms step_avg:159.43ms step:1312/1480 train_time:207588ms step_avg:159.44ms step:1313/1480 train_time:207757ms step_avg:159.44ms step:1314/1480 train_time:207930ms step_avg:159.46ms step:1315/1480 train_time:208100ms step_avg:159.46ms step:1316/1480 train_time:208268ms step_avg:159.47ms step:1317/1480 train_time:208440ms step_avg:159.48ms step:1318/1480 train_time:208621ms step_avg:159.50ms step:1319/1480 train_time:208796ms step_avg:159.51ms step:1320/1480 train_time:208973ms step_avg:159.52ms step:1321/1480 train_time:209146ms step_avg:159.53ms step:1322/1480 train_time:209327ms step_avg:159.55ms step:1323/1480 train_time:209498ms step_avg:159.56ms step:1324/1480 train_time:209672ms step_avg:159.57ms step:1325/1480 train_time:209854ms step_avg:159.58ms step:1326/1480 train_time:210030ms step_avg:159.60ms 
step:1327/1480 train_time:210201ms step_avg:159.61ms step:1328/1480 train_time:210372ms step_avg:159.61ms step:1329/1480 train_time:210568ms step_avg:159.64ms step:1330/1480 train_time:210747ms step_avg:159.66ms step:1331/1480 train_time:210917ms step_avg:159.66ms step:1332/1480 train_time:211093ms step_avg:159.68ms step:1333/1480 train_time:211269ms step_avg:159.69ms step:1334/1480 train_time:211439ms step_avg:159.70ms step:1335/1480 train_time:211608ms step_avg:159.70ms step:1336/1480 train_time:211791ms step_avg:159.72ms step:1337/1480 train_time:211968ms step_avg:159.73ms step:1338/1480 train_time:212138ms step_avg:159.74ms step:1339/1480 train_time:212313ms step_avg:159.75ms step:1340/1480 train_time:212486ms step_avg:159.76ms step:1341/1480 train_time:212653ms step_avg:159.77ms step:1342/1480 train_time:212826ms step_avg:159.78ms step:1343/1480 train_time:212996ms step_avg:159.79ms step:1344/1480 train_time:213169ms step_avg:159.80ms step:1345/1480 train_time:213347ms step_avg:159.81ms step:1346/1480 train_time:213515ms step_avg:159.82ms step:1347/1480 train_time:213685ms step_avg:159.82ms step:1348/1480 train_time:213854ms step_avg:159.83ms step:1349/1480 train_time:214024ms step_avg:159.84ms step:1350/1480 train_time:214197ms step_avg:159.85ms step:1351/1480 train_time:214369ms step_avg:159.86ms step:1352/1480 train_time:214540ms step_avg:159.87ms step:1353/1480 train_time:214716ms step_avg:159.88ms step:1354/1480 train_time:214889ms step_avg:159.89ms step:1355/1480 train_time:215057ms step_avg:159.89ms step:1356/1480 train_time:215229ms step_avg:159.90ms step:1357/1480 train_time:215403ms step_avg:159.91ms step:1358/1480 train_time:215575ms step_avg:159.92ms step:1359/1480 train_time:215748ms step_avg:159.93ms step:1360/1480 train_time:215920ms step_avg:159.94ms step:1361/1480 train_time:216098ms step_avg:159.95ms step:1362/1480 train_time:216273ms step_avg:159.96ms step:1363/1480 train_time:216451ms step_avg:159.98ms step:1364/1480 train_time:216619ms 
step_avg:159.98ms step:1365/1480 train_time:216786ms step_avg:159.99ms step:1366/1480 train_time:216957ms step_avg:160.00ms step:1367/1480 train_time:217129ms step_avg:160.01ms step:1368/1480 train_time:217304ms step_avg:160.02ms step:1369/1480 train_time:217485ms step_avg:160.03ms step:1370/1480 train_time:217662ms step_avg:160.05ms step:1371/1480 train_time:217833ms step_avg:160.05ms step:1372/1480 train_time:218011ms step_avg:160.07ms step:1373/1480 train_time:218181ms step_avg:160.07ms step:1374/1480 train_time:218355ms step_avg:160.08ms step:1375/1480 train_time:218528ms step_avg:160.09ms step:1375/1480 val_loss:3.2995 train_time:218595ms step_avg:160.14ms step:1376/1480 train_time:218703ms step_avg:160.10ms step:1377/1480 train_time:218875ms step_avg:160.11ms step:1378/1480 train_time:219043ms step_avg:160.12ms step:1379/1480 train_time:219219ms step_avg:160.13ms step:1380/1480 train_time:219392ms step_avg:160.14ms step:1381/1480 train_time:219572ms step_avg:160.15ms step:1382/1480 train_time:219743ms step_avg:160.16ms step:1383/1480 train_time:219915ms step_avg:160.17ms step:1384/1480 train_time:220092ms step_avg:160.18ms step:1385/1480 train_time:220259ms step_avg:160.19ms step:1386/1480 train_time:220429ms step_avg:160.20ms step:1387/1480 train_time:220601ms step_avg:160.20ms step:1388/1480 train_time:220769ms step_avg:160.21ms step:1389/1480 train_time:220942ms step_avg:160.22ms step:1390/1480 train_time:221110ms step_avg:160.22ms step:1391/1480 train_time:221281ms step_avg:160.23ms step:1392/1480 train_time:221452ms step_avg:160.24ms step:1393/1480 train_time:221624ms step_avg:160.25ms step:1394/1480 train_time:221795ms step_avg:160.26ms step:1395/1480 train_time:221964ms step_avg:160.26ms step:1396/1480 train_time:222133ms step_avg:160.27ms step:1397/1480 train_time:222301ms step_avg:160.27ms step:1398/1480 train_time:222467ms step_avg:160.28ms step:1399/1480 train_time:222636ms step_avg:160.29ms step:1400/1480 train_time:222815ms step_avg:160.30ms 
step:1401/1480 train_time:222982ms step_avg:160.30ms step:1402/1480 train_time:223152ms step_avg:160.31ms step:1403/1480 train_time:223328ms step_avg:160.32ms step:1404/1480 train_time:223499ms step_avg:160.33ms step:1405/1480 train_time:223672ms step_avg:160.34ms step:1406/1480 train_time:223846ms step_avg:160.35ms step:1407/1480 train_time:224016ms step_avg:160.35ms step:1408/1480 train_time:224185ms step_avg:160.36ms step:1409/1480 train_time:224366ms step_avg:160.38ms step:1410/1480 train_time:224537ms step_avg:160.38ms step:1411/1480 train_time:224706ms step_avg:160.39ms step:1412/1480 train_time:224876ms step_avg:160.40ms step:1413/1480 train_time:225045ms step_avg:160.40ms step:1414/1480 train_time:225217ms step_avg:160.41ms step:1415/1480 train_time:225390ms step_avg:160.42ms step:1416/1480 train_time:225578ms step_avg:160.44ms step:1417/1480 train_time:225753ms step_avg:160.45ms step:1418/1480 train_time:225924ms step_avg:160.46ms step:1419/1480 train_time:226098ms step_avg:160.47ms step:1420/1480 train_time:226272ms step_avg:160.48ms step:1421/1480 train_time:226446ms step_avg:160.49ms step:1422/1480 train_time:226619ms step_avg:160.50ms step:1423/1480 train_time:226788ms step_avg:160.50ms step:1424/1480 train_time:226964ms step_avg:160.51ms step:1425/1480 train_time:227144ms step_avg:160.53ms step:1426/1480 train_time:227317ms step_avg:160.53ms step:1427/1480 train_time:227491ms step_avg:160.54ms step:1428/1480 train_time:227662ms step_avg:160.55ms step:1429/1480 train_time:227829ms step_avg:160.56ms step:1430/1480 train_time:228005ms step_avg:160.57ms step:1431/1480 train_time:228181ms step_avg:160.58ms step:1432/1480 train_time:228360ms step_avg:160.59ms step:1433/1480 train_time:228538ms step_avg:160.60ms step:1434/1480 train_time:228719ms step_avg:160.62ms step:1435/1480 train_time:228894ms step_avg:160.63ms step:1436/1480 train_time:229066ms step_avg:160.64ms step:1437/1480 train_time:229237ms step_avg:160.64ms step:1438/1480 train_time:229405ms 
step_avg:160.65ms step:1439/1480 train_time:229580ms step_avg:160.66ms step:1440/1480 train_time:229747ms step_avg:160.66ms step:1441/1480 train_time:229919ms step_avg:160.67ms step:1442/1480 train_time:230096ms step_avg:160.68ms step:1443/1480 train_time:230285ms step_avg:160.70ms step:1444/1480 train_time:230456ms step_avg:160.71ms step:1445/1480 train_time:230628ms step_avg:160.72ms step:1446/1480 train_time:230803ms step_avg:160.73ms step:1447/1480 train_time:230981ms step_avg:160.74ms step:1448/1480 train_time:231151ms step_avg:160.74ms step:1449/1480 train_time:231324ms step_avg:160.75ms step:1450/1480 train_time:231498ms step_avg:160.76ms step:1451/1480 train_time:231669ms step_avg:160.77ms step:1452/1480 train_time:231843ms step_avg:160.78ms step:1453/1480 train_time:232013ms step_avg:160.79ms step:1454/1480 train_time:232186ms step_avg:160.79ms step:1455/1480 train_time:232364ms step_avg:160.81ms step:1456/1480 train_time:232538ms step_avg:160.81ms step:1457/1480 train_time:232709ms step_avg:160.82ms step:1458/1480 train_time:232880ms step_avg:160.83ms step:1459/1480 train_time:233056ms step_avg:160.84ms step:1460/1480 train_time:233228ms step_avg:160.85ms step:1461/1480 train_time:233404ms step_avg:160.86ms step:1462/1480 train_time:233576ms step_avg:160.87ms step:1463/1480 train_time:233753ms step_avg:160.88ms step:1464/1480 train_time:233928ms step_avg:160.89ms step:1465/1480 train_time:234102ms step_avg:160.89ms step:1466/1480 train_time:234273ms step_avg:160.90ms step:1467/1480 train_time:234447ms step_avg:160.91ms step:1468/1480 train_time:234618ms step_avg:160.92ms step:1469/1480 train_time:234790ms step_avg:160.93ms step:1470/1480 train_time:234971ms step_avg:160.94ms step:1471/1480 train_time:235159ms step_avg:160.96ms step:1472/1480 train_time:235339ms step_avg:160.97ms step:1473/1480 train_time:235510ms step_avg:160.98ms step:1474/1480 train_time:235687ms step_avg:160.99ms step:1475/1480 train_time:235867ms step_avg:161.00ms step:1476/1480 
train_time:236040ms step_avg:161.01ms step:1477/1480 train_time:236223ms step_avg:161.02ms step:1478/1480 train_time:236405ms step_avg:161.04ms step:1479/1480 train_time:236581ms step_avg:161.05ms step:1480/1480 train_time:236755ms step_avg:161.06ms step:1480/1480 val_loss:3.2808 train_time:236826ms step_avg:161.11ms