import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; the cos/sin tables are cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): seq is dim 1, the rotation is over dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        # mix the layer's own values with the token value embedding vi
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of x with the input embedding x0, then attention and MLP."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """GPT-2 style model with U-net skip connections and token value embeddings."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Return the cross-entropy loss of predicting `target` from the 1-D token sequence `idx`.

        Attention is causal, restricted to the same document (a new document starts at every
        token equal to 50256), and restricted to `sliding_window` tokens. len(idx) must be a
        multiple of BLOCK_SIZE because the document ids are reshaped into blocks.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running document id per token, plus the first/last id inside each block
        docs = (idx == 50256).cumsum(0)
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level mask: keep a (q block, kv block) pair if it is causal, the document
            # ranges of the two blocks overlap, and it lies within the window rounded up to blocks
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the indices of the kept kv blocks first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read the `ntok` uint16 tokens that follow the header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the 256 x int32 header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over token shards; each call to next_batch() yields T inputs and T targets.

    Within a global batch of T * num_processes tokens, rank r reads the slice starting at
    offset r * T. All ranks move to the next shard on the same call.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Rewind to the start of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) for this rank as CUDA tensors and move to the next batch."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        # Test for exhaustion on the rank-independent offset (i.e. rank 0's position). Testing
        # the rank-shifted position would let higher ranks switch shards one call before the
        # lower ranks, leaving the ranks on different shards.
        shard_offset = self.current_position - self.process_rank * self.T
        if shard_offset + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id # per-run directory, only needed if checkpoint saving is enabled
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # the log file lives directly under logs/, so that directory has to exist before open()
    os.makedirs('logs', exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, also print it."""
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 10:42:19 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 121W / 700W | 119MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 115W / 700W | 119MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 73W / 700W | 19MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 47C P0 128W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23514ms step_avg:nanms step:2/1480 train_time:23601ms step_avg:nanms step:3/1480 train_time:23740ms step_avg:nanms step:4/1480 train_time:23882ms step_avg:nanms step:5/1480 train_time:24023ms step_avg:nanms step:6/1480 train_time:24165ms step_avg:nanms step:7/1480 train_time:24307ms step_avg:nanms step:8/1480 train_time:24449ms step_avg:nanms step:9/1480 train_time:24593ms step_avg:nanms step:10/1480 train_time:24737ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:426ms step_avg:142.13ms step:14/1480 train_time:569ms step_avg:142.16ms step:15/1480 train_time:711ms step_avg:142.11ms step:16/1480 train_time:854ms step_avg:142.40ms step:17/1480 train_time:996ms step_avg:142.30ms step:18/1480 train_time:1137ms step_avg:142.19ms step:19/1480 train_time:1279ms step_avg:142.10ms step:20/1480 train_time:1423ms step_avg:142.29ms step:21/1480 train_time:1567ms step_avg:142.41ms step:22/1480 train_time:1710ms step_avg:142.49ms step:23/1480 train_time:1853ms step_avg:142.54ms step:24/1480 train_time:1995ms step_avg:142.52ms step:25/1480 train_time:2137ms step_avg:142.47ms step:26/1480 train_time:2278ms step_avg:142.38ms step:27/1480 train_time:2423ms step_avg:142.52ms step:28/1480 train_time:2566ms step_avg:142.57ms step:29/1480 
train_time:2708ms step_avg:142.52ms step:30/1480 train_time:2851ms step_avg:142.56ms step:31/1480 train_time:2993ms step_avg:142.53ms step:32/1480 train_time:3136ms step_avg:142.52ms step:33/1480 train_time:3277ms step_avg:142.48ms step:34/1480 train_time:3420ms step_avg:142.48ms step:35/1480 train_time:3565ms step_avg:142.59ms step:36/1480 train_time:3708ms step_avg:142.63ms step:37/1480 train_time:3851ms step_avg:142.64ms step:38/1480 train_time:3993ms step_avg:142.60ms step:39/1480 train_time:4135ms step_avg:142.60ms step:40/1480 train_time:4277ms step_avg:142.57ms step:41/1480 train_time:4420ms step_avg:142.57ms step:42/1480 train_time:4563ms step_avg:142.58ms step:43/1480 train_time:4705ms step_avg:142.59ms step:44/1480 train_time:4849ms step_avg:142.62ms step:45/1480 train_time:4991ms step_avg:142.61ms step:46/1480 train_time:5133ms step_avg:142.58ms step:47/1480 train_time:5275ms step_avg:142.57ms step:48/1480 train_time:5417ms step_avg:142.54ms step:49/1480 train_time:5561ms step_avg:142.59ms step:50/1480 train_time:5705ms step_avg:142.63ms step:51/1480 train_time:5849ms step_avg:142.65ms step:52/1480 train_time:5990ms step_avg:142.62ms step:53/1480 train_time:6132ms step_avg:142.61ms step:54/1480 train_time:6273ms step_avg:142.58ms step:55/1480 train_time:6414ms step_avg:142.54ms step:56/1480 train_time:6556ms step_avg:142.52ms step:57/1480 train_time:6698ms step_avg:142.52ms step:58/1480 train_time:6844ms step_avg:142.58ms step:59/1480 train_time:6988ms step_avg:142.61ms step:60/1480 train_time:7131ms step_avg:142.61ms step:61/1480 train_time:7272ms step_avg:142.60ms step:62/1480 train_time:7413ms step_avg:142.57ms step:63/1480 train_time:7555ms step_avg:142.56ms step:64/1480 train_time:7697ms step_avg:142.54ms step:65/1480 train_time:7840ms step_avg:142.55ms step:66/1480 train_time:7983ms step_avg:142.56ms step:67/1480 train_time:8127ms step_avg:142.59ms step:68/1480 train_time:8271ms step_avg:142.61ms step:69/1480 train_time:8413ms step_avg:142.59ms 
step:70/1480 train_time:8556ms step_avg:142.59ms step:71/1480 train_time:8698ms step_avg:142.58ms step:72/1480 train_time:8840ms step_avg:142.59ms step:73/1480 train_time:8984ms step_avg:142.60ms step:74/1480 train_time:9127ms step_avg:142.60ms step:75/1480 train_time:9270ms step_avg:142.61ms step:76/1480 train_time:9412ms step_avg:142.60ms step:77/1480 train_time:9554ms step_avg:142.60ms step:78/1480 train_time:9696ms step_avg:142.58ms step:79/1480 train_time:9837ms step_avg:142.57ms step:80/1480 train_time:9978ms step_avg:142.55ms step:81/1480 train_time:10122ms step_avg:142.57ms step:82/1480 train_time:10266ms step_avg:142.58ms step:83/1480 train_time:10408ms step_avg:142.57ms step:84/1480 train_time:10551ms step_avg:142.59ms step:85/1480 train_time:10693ms step_avg:142.58ms step:86/1480 train_time:10835ms step_avg:142.56ms step:87/1480 train_time:10976ms step_avg:142.54ms step:88/1480 train_time:11119ms step_avg:142.55ms step:89/1480 train_time:11262ms step_avg:142.56ms step:90/1480 train_time:11406ms step_avg:142.58ms step:91/1480 train_time:11550ms step_avg:142.59ms step:92/1480 train_time:11691ms step_avg:142.58ms step:93/1480 train_time:11833ms step_avg:142.57ms step:94/1480 train_time:11975ms step_avg:142.55ms step:95/1480 train_time:12116ms step_avg:142.54ms step:96/1480 train_time:12258ms step_avg:142.54ms step:97/1480 train_time:12400ms step_avg:142.53ms step:98/1480 train_time:12544ms step_avg:142.55ms step:99/1480 train_time:12686ms step_avg:142.54ms step:100/1480 train_time:12830ms step_avg:142.56ms step:101/1480 train_time:12972ms step_avg:142.55ms step:102/1480 train_time:13113ms step_avg:142.54ms step:103/1480 train_time:13255ms step_avg:142.53ms step:104/1480 train_time:13397ms step_avg:142.53ms step:105/1480 train_time:13542ms step_avg:142.55ms step:106/1480 train_time:13685ms step_avg:142.56ms step:107/1480 train_time:13829ms step_avg:142.56ms step:108/1480 train_time:13970ms step_avg:142.55ms step:109/1480 train_time:14111ms step_avg:142.54ms 
step:110/1480 train_time:14253ms step_avg:142.53ms step:111/1480 train_time:14397ms step_avg:142.54ms step:112/1480 train_time:14546ms step_avg:142.61ms step:113/1480 train_time:14694ms step_avg:142.66ms step:114/1480 train_time:14840ms step_avg:142.69ms step:115/1480 train_time:14987ms step_avg:142.74ms step:116/1480 train_time:15134ms step_avg:142.77ms step:117/1480 train_time:15280ms step_avg:142.80ms step:118/1480 train_time:15427ms step_avg:142.84ms step:119/1480 train_time:15574ms step_avg:142.88ms step:120/1480 train_time:15722ms step_avg:142.93ms step:121/1480 train_time:15870ms step_avg:142.97ms step:122/1480 train_time:16015ms step_avg:142.99ms step:123/1480 train_time:16163ms step_avg:143.03ms step:124/1480 train_time:16310ms step_avg:143.07ms step:125/1480 train_time:16456ms step_avg:143.10ms step:125/1480 val_loss:4.4120 train_time:16514ms step_avg:143.60ms step:126/1480 train_time:16608ms step_avg:143.17ms step:127/1480 train_time:16758ms step_avg:143.23ms step:128/1480 train_time:16903ms step_avg:143.25ms step:129/1480 train_time:17049ms step_avg:143.27ms step:130/1480 train_time:17195ms step_avg:143.30ms step:131/1480 train_time:17340ms step_avg:143.31ms step:132/1480 train_time:17486ms step_avg:143.33ms step:133/1480 train_time:17635ms step_avg:143.37ms step:134/1480 train_time:17783ms step_avg:143.42ms step:135/1480 train_time:17932ms step_avg:143.46ms step:136/1480 train_time:18079ms step_avg:143.48ms step:137/1480 train_time:18224ms step_avg:143.49ms step:138/1480 train_time:18370ms step_avg:143.51ms step:139/1480 train_time:18516ms step_avg:143.54ms step:140/1480 train_time:18663ms step_avg:143.56ms step:141/1480 train_time:18811ms step_avg:143.59ms step:142/1480 train_time:18959ms step_avg:143.63ms step:143/1480 train_time:19104ms step_avg:143.64ms step:144/1480 train_time:19252ms step_avg:143.67ms step:145/1480 train_time:19399ms step_avg:143.70ms step:146/1480 train_time:19545ms step_avg:143.71ms step:147/1480 train_time:19692ms 
step_avg:143.74ms step:148/1480 train_time:19839ms step_avg:143.76ms step:149/1480 train_time:19985ms step_avg:143.78ms step:150/1480 train_time:20132ms step_avg:143.80ms step:151/1480 train_time:20279ms step_avg:143.83ms step:152/1480 train_time:20426ms step_avg:143.84ms step:153/1480 train_time:20574ms step_avg:143.87ms step:154/1480 train_time:20720ms step_avg:143.89ms step:155/1480 train_time:20867ms step_avg:143.91ms step:156/1480 train_time:21014ms step_avg:143.93ms step:157/1480 train_time:21161ms step_avg:143.95ms step:158/1480 train_time:21307ms step_avg:143.97ms step:159/1480 train_time:21456ms step_avg:144.00ms step:160/1480 train_time:21602ms step_avg:144.01ms step:161/1480 train_time:21749ms step_avg:144.03ms step:162/1480 train_time:21896ms step_avg:144.05ms step:163/1480 train_time:22042ms step_avg:144.07ms step:164/1480 train_time:22189ms step_avg:144.08ms step:165/1480 train_time:22336ms step_avg:144.10ms step:166/1480 train_time:22482ms step_avg:144.12ms step:167/1480 train_time:22630ms step_avg:144.14ms step:168/1480 train_time:22778ms step_avg:144.16ms step:169/1480 train_time:22923ms step_avg:144.17ms step:170/1480 train_time:23070ms step_avg:144.19ms step:171/1480 train_time:23216ms step_avg:144.20ms step:172/1480 train_time:23363ms step_avg:144.21ms step:173/1480 train_time:23508ms step_avg:144.22ms step:174/1480 train_time:23656ms step_avg:144.24ms step:175/1480 train_time:23803ms step_avg:144.26ms step:176/1480 train_time:23951ms step_avg:144.28ms step:177/1480 train_time:24098ms step_avg:144.30ms step:178/1480 train_time:24244ms step_avg:144.31ms step:179/1480 train_time:24391ms step_avg:144.33ms step:180/1480 train_time:24538ms step_avg:144.34ms step:181/1480 train_time:24684ms step_avg:144.35ms step:182/1480 train_time:24833ms step_avg:144.38ms step:183/1480 train_time:24980ms step_avg:144.39ms step:184/1480 train_time:25125ms step_avg:144.40ms step:185/1480 train_time:25272ms step_avg:144.41ms step:186/1480 train_time:25419ms 
step_avg:144.43ms step:187/1480 train_time:25565ms step_avg:144.44ms step:188/1480 train_time:25711ms step_avg:144.45ms step:189/1480 train_time:25859ms step_avg:144.46ms step:190/1480 train_time:26006ms step_avg:144.48ms step:191/1480 train_time:26153ms step_avg:144.49ms step:192/1480 train_time:26299ms step_avg:144.50ms step:193/1480 train_time:26444ms step_avg:144.50ms step:194/1480 train_time:26591ms step_avg:144.52ms step:195/1480 train_time:26738ms step_avg:144.53ms step:196/1480 train_time:26885ms step_avg:144.54ms step:197/1480 train_time:27032ms step_avg:144.56ms step:198/1480 train_time:27179ms step_avg:144.57ms step:199/1480 train_time:27325ms step_avg:144.58ms step:200/1480 train_time:27473ms step_avg:144.60ms step:201/1480 train_time:27620ms step_avg:144.61ms step:202/1480 train_time:27766ms step_avg:144.61ms step:203/1480 train_time:27914ms step_avg:144.63ms step:204/1480 train_time:28062ms step_avg:144.65ms step:205/1480 train_time:28207ms step_avg:144.65ms step:206/1480 train_time:28355ms step_avg:144.67ms step:207/1480 train_time:28502ms step_avg:144.68ms step:208/1480 train_time:28648ms step_avg:144.69ms step:209/1480 train_time:28795ms step_avg:144.70ms step:210/1480 train_time:28941ms step_avg:144.71ms step:211/1480 train_time:29089ms step_avg:144.72ms step:212/1480 train_time:29236ms step_avg:144.74ms step:213/1480 train_time:29382ms step_avg:144.74ms step:214/1480 train_time:29528ms step_avg:144.74ms step:215/1480 train_time:29674ms step_avg:144.75ms step:216/1480 train_time:29821ms step_avg:144.76ms step:217/1480 train_time:29967ms step_avg:144.77ms step:218/1480 train_time:30115ms step_avg:144.78ms step:219/1480 train_time:30262ms step_avg:144.80ms step:220/1480 train_time:30410ms step_avg:144.81ms step:221/1480 train_time:30559ms step_avg:144.83ms step:222/1480 train_time:30708ms step_avg:144.85ms step:223/1480 train_time:30859ms step_avg:144.88ms step:224/1480 train_time:31008ms step_avg:144.90ms step:225/1480 train_time:31159ms 
step_avg:144.93ms step:226/1480 train_time:31308ms step_avg:144.95ms step:227/1480 train_time:31460ms step_avg:144.98ms step:228/1480 train_time:31610ms step_avg:145.00ms step:229/1480 train_time:31761ms step_avg:145.03ms step:230/1480 train_time:31911ms step_avg:145.05ms step:231/1480 train_time:32061ms step_avg:145.07ms step:232/1480 train_time:32211ms step_avg:145.10ms step:233/1480 train_time:32362ms step_avg:145.12ms step:234/1480 train_time:32511ms step_avg:145.14ms step:235/1480 train_time:32663ms step_avg:145.17ms step:236/1480 train_time:32813ms step_avg:145.19ms step:237/1480 train_time:32963ms step_avg:145.21ms step:238/1480 train_time:33113ms step_avg:145.23ms step:239/1480 train_time:33263ms step_avg:145.25ms step:240/1480 train_time:33413ms step_avg:145.27ms step:241/1480 train_time:33562ms step_avg:145.29ms step:242/1480 train_time:33712ms step_avg:145.31ms step:243/1480 train_time:33862ms step_avg:145.33ms step:244/1480 train_time:34012ms step_avg:145.35ms step:245/1480 train_time:34163ms step_avg:145.37ms step:246/1480 train_time:34314ms step_avg:145.40ms step:247/1480 train_time:34464ms step_avg:145.42ms step:248/1480 train_time:34613ms step_avg:145.43ms step:249/1480 train_time:34764ms step_avg:145.45ms step:250/1480 train_time:34915ms step_avg:145.48ms step:250/1480 val_loss:3.9963 train_time:34974ms step_avg:145.72ms step:251/1480 train_time:35070ms step_avg:145.52ms step:252/1480 train_time:35221ms step_avg:145.54ms step:253/1480 train_time:35372ms step_avg:145.56ms step:254/1480 train_time:35522ms step_avg:145.58ms step:255/1480 train_time:35672ms step_avg:145.60ms step:256/1480 train_time:35821ms step_avg:145.61ms step:257/1480 train_time:35970ms step_avg:145.63ms step:258/1480 train_time:36122ms step_avg:145.65ms step:259/1480 train_time:36274ms step_avg:145.68ms step:260/1480 train_time:36424ms step_avg:145.70ms step:261/1480 train_time:36575ms step_avg:145.72ms step:262/1480 train_time:36724ms step_avg:145.73ms step:263/1480 
train_time:36875ms step_avg:145.75ms step:264/1480 train_time:37025ms step_avg:145.77ms step:265/1480 train_time:37177ms step_avg:145.79ms step:266/1480 train_time:37328ms step_avg:145.81ms step:267/1480 train_time:37479ms step_avg:145.83ms step:268/1480 train_time:37629ms step_avg:145.85ms step:269/1480 train_time:37780ms step_avg:145.87ms step:270/1480 train_time:37929ms step_avg:145.88ms step:271/1480 train_time:38079ms step_avg:145.90ms step:272/1480 train_time:38229ms step_avg:145.91ms step:273/1480 train_time:38382ms step_avg:145.94ms step:274/1480 train_time:38532ms step_avg:145.95ms step:275/1480 train_time:38683ms step_avg:145.97ms step:276/1480 train_time:38835ms step_avg:145.99ms step:277/1480 train_time:38984ms step_avg:146.01ms step:278/1480 train_time:39135ms step_avg:146.02ms step:279/1480 train_time:39284ms step_avg:146.04ms step:280/1480 train_time:39434ms step_avg:146.05ms step:281/1480 train_time:39584ms step_avg:146.07ms step:282/1480 train_time:39735ms step_avg:146.09ms step:283/1480 train_time:39886ms step_avg:146.10ms step:284/1480 train_time:40037ms step_avg:146.12ms step:285/1480 train_time:40188ms step_avg:146.14ms step:286/1480 train_time:40340ms step_avg:146.16ms step:287/1480 train_time:40490ms step_avg:146.17ms step:288/1480 train_time:40640ms step_avg:146.19ms step:289/1480 train_time:40790ms step_avg:146.20ms step:290/1480 train_time:40941ms step_avg:146.22ms step:291/1480 train_time:41091ms step_avg:146.23ms step:292/1480 train_time:41240ms step_avg:146.24ms step:293/1480 train_time:41391ms step_avg:146.26ms step:294/1480 train_time:41541ms step_avg:146.27ms step:295/1480 train_time:41691ms step_avg:146.28ms step:296/1480 train_time:41842ms step_avg:146.30ms step:297/1480 train_time:41994ms step_avg:146.32ms step:298/1480 train_time:42145ms step_avg:146.34ms step:299/1480 train_time:42296ms step_avg:146.35ms step:300/1480 train_time:42447ms step_avg:146.37ms step:301/1480 train_time:42597ms step_avg:146.38ms step:302/1480 
train_time:42746ms step_avg:146.39ms step:303/1480 train_time:42895ms step_avg:146.40ms step:304/1480 train_time:43045ms step_avg:146.41ms step:305/1480 train_time:43196ms step_avg:146.43ms step:306/1480 train_time:43346ms step_avg:146.44ms step:307/1480 train_time:43496ms step_avg:146.45ms step:308/1480 train_time:43648ms step_avg:146.47ms step:309/1480 train_time:43800ms step_avg:146.49ms step:310/1480 train_time:43948ms step_avg:146.49ms step:311/1480 train_time:44099ms step_avg:146.51ms step:312/1480 train_time:44249ms step_avg:146.52ms step:313/1480 train_time:44401ms step_avg:146.54ms step:314/1480 train_time:44551ms step_avg:146.55ms step:315/1480 train_time:44702ms step_avg:146.56ms step:316/1480 train_time:44853ms step_avg:146.58ms step:317/1480 train_time:45003ms step_avg:146.59ms step:318/1480 train_time:45154ms step_avg:146.60ms step:319/1480 train_time:45306ms step_avg:146.62ms step:320/1480 train_time:45457ms step_avg:146.63ms step:321/1480 train_time:45607ms step_avg:146.65ms step:322/1480 train_time:45758ms step_avg:146.66ms step:323/1480 train_time:45908ms step_avg:146.67ms step:324/1480 train_time:46059ms step_avg:146.68ms step:325/1480 train_time:46209ms step_avg:146.70ms step:326/1480 train_time:46359ms step_avg:146.71ms step:327/1480 train_time:46510ms step_avg:146.72ms step:328/1480 train_time:46660ms step_avg:146.73ms step:329/1480 train_time:46812ms step_avg:146.75ms step:330/1480 train_time:46964ms step_avg:146.76ms step:331/1480 train_time:47118ms step_avg:146.79ms step:332/1480 train_time:47274ms step_avg:146.81ms step:333/1480 train_time:47426ms step_avg:146.83ms step:334/1480 train_time:47580ms step_avg:146.85ms step:335/1480 train_time:47735ms step_avg:146.88ms step:336/1480 train_time:47890ms step_avg:146.90ms step:337/1480 train_time:48046ms step_avg:146.93ms step:338/1480 train_time:48199ms step_avg:146.95ms step:339/1480 train_time:48353ms step_avg:146.97ms step:340/1480 train_time:48507ms step_avg:146.99ms step:341/1480 
train_time:48660ms step_avg:147.01ms step:342/1480 train_time:48814ms step_avg:147.03ms step:343/1480 train_time:48968ms step_avg:147.05ms step:344/1480 train_time:49122ms step_avg:147.07ms step:345/1480 train_time:49277ms step_avg:147.09ms step:346/1480 train_time:49431ms step_avg:147.12ms step:347/1480 train_time:49585ms step_avg:147.14ms step:348/1480 train_time:49739ms step_avg:147.16ms step:349/1480 train_time:49892ms step_avg:147.17ms step:350/1480 train_time:50047ms step_avg:147.20ms step:351/1480 train_time:50202ms step_avg:147.22ms step:352/1480 train_time:50355ms step_avg:147.24ms step:353/1480 train_time:50510ms step_avg:147.26ms step:354/1480 train_time:50663ms step_avg:147.28ms step:355/1480 train_time:50819ms step_avg:147.30ms step:356/1480 train_time:50975ms step_avg:147.33ms step:357/1480 train_time:51129ms step_avg:147.35ms step:358/1480 train_time:51283ms step_avg:147.37ms step:359/1480 train_time:51438ms step_avg:147.39ms step:360/1480 train_time:51593ms step_avg:147.41ms step:361/1480 train_time:51748ms step_avg:147.43ms step:362/1480 train_time:51902ms step_avg:147.45ms step:363/1480 train_time:52055ms step_avg:147.46ms step:364/1480 train_time:52209ms step_avg:147.48ms step:365/1480 train_time:52362ms step_avg:147.50ms step:366/1480 train_time:52516ms step_avg:147.52ms step:367/1480 train_time:52670ms step_avg:147.54ms step:368/1480 train_time:52823ms step_avg:147.55ms step:369/1480 train_time:52978ms step_avg:147.57ms step:370/1480 train_time:53132ms step_avg:147.59ms step:371/1480 train_time:53286ms step_avg:147.61ms step:372/1480 train_time:53440ms step_avg:147.62ms step:373/1480 train_time:53594ms step_avg:147.64ms step:374/1480 train_time:53748ms step_avg:147.66ms step:375/1480 train_time:53900ms step_avg:147.67ms step:375/1480 val_loss:3.8039 train_time:53960ms step_avg:147.84ms step:376/1480 train_time:54056ms step_avg:147.70ms step:377/1480 train_time:54212ms step_avg:147.72ms step:378/1480 train_time:54366ms step_avg:147.73ms 
step:379/1480 train_time:54518ms step_avg:147.75ms step:380/1480 train_time:54671ms step_avg:147.76ms step:381/1480 train_time:54823ms step_avg:147.77ms step:382/1480 train_time:54976ms step_avg:147.78ms step:383/1480 train_time:55131ms step_avg:147.80ms step:384/1480 train_time:55285ms step_avg:147.82ms step:385/1480 train_time:55437ms step_avg:147.83ms step:386/1480 train_time:55591ms step_avg:147.85ms step:387/1480 train_time:55745ms step_avg:147.86ms step:388/1480 train_time:55897ms step_avg:147.88ms step:389/1480 train_time:56050ms step_avg:147.89ms step:390/1480 train_time:56206ms step_avg:147.91ms step:391/1480 train_time:56361ms step_avg:147.93ms step:392/1480 train_time:56513ms step_avg:147.94ms step:393/1480 train_time:56667ms step_avg:147.96ms step:394/1480 train_time:56822ms step_avg:147.97ms step:395/1480 train_time:56975ms step_avg:147.99ms step:396/1480 train_time:57128ms step_avg:148.00ms step:397/1480 train_time:57282ms step_avg:148.02ms step:398/1480 train_time:57436ms step_avg:148.03ms step:399/1480 train_time:57590ms step_avg:148.05ms step:400/1480 train_time:57745ms step_avg:148.06ms step:401/1480 train_time:57897ms step_avg:148.08ms step:402/1480 train_time:58050ms step_avg:148.09ms step:403/1480 train_time:58205ms step_avg:148.10ms step:404/1480 train_time:58357ms step_avg:148.12ms step:405/1480 train_time:58512ms step_avg:148.13ms step:406/1480 train_time:58665ms step_avg:148.14ms step:407/1480 train_time:58819ms step_avg:148.16ms step:408/1480 train_time:58973ms step_avg:148.17ms step:409/1480 train_time:59127ms step_avg:148.19ms step:410/1480 train_time:59279ms step_avg:148.20ms step:411/1480 train_time:59432ms step_avg:148.21ms step:412/1480 train_time:59585ms step_avg:148.22ms step:413/1480 train_time:59737ms step_avg:148.23ms step:414/1480 train_time:59893ms step_avg:148.25ms step:415/1480 train_time:60048ms step_avg:148.27ms step:416/1480 train_time:60203ms step_avg:148.28ms step:417/1480 train_time:60357ms step_avg:148.30ms 
step:418/1480 train_time:60510ms step_avg:148.31ms step:419/1480 train_time:60664ms step_avg:148.32ms step:420/1480 train_time:60816ms step_avg:148.33ms step:421/1480 train_time:60971ms step_avg:148.35ms step:422/1480 train_time:61124ms step_avg:148.36ms step:423/1480 train_time:61278ms step_avg:148.37ms step:424/1480 train_time:61433ms step_avg:148.39ms step:425/1480 train_time:61587ms step_avg:148.40ms step:426/1480 train_time:61742ms step_avg:148.42ms step:427/1480 train_time:61895ms step_avg:148.43ms step:428/1480 train_time:62049ms step_avg:148.44ms step:429/1480 train_time:62204ms step_avg:148.46ms step:430/1480 train_time:62356ms step_avg:148.47ms step:431/1480 train_time:62510ms step_avg:148.48ms step:432/1480 train_time:62665ms step_avg:148.50ms step:433/1480 train_time:62819ms step_avg:148.51ms step:434/1480 train_time:62973ms step_avg:148.52ms step:435/1480 train_time:63127ms step_avg:148.53ms step:436/1480 train_time:63282ms step_avg:148.55ms step:437/1480 train_time:63436ms step_avg:148.56ms step:438/1480 train_time:63591ms step_avg:148.58ms step:439/1480 train_time:63745ms step_avg:148.59ms step:440/1480 train_time:63900ms step_avg:148.61ms step:441/1480 train_time:64056ms step_avg:148.62ms step:442/1480 train_time:64212ms step_avg:148.64ms step:443/1480 train_time:64369ms step_avg:148.66ms step:444/1480 train_time:64525ms step_avg:148.68ms step:445/1480 train_time:64682ms step_avg:148.69ms step:446/1480 train_time:64837ms step_avg:148.71ms step:447/1480 train_time:64994ms step_avg:148.73ms step:448/1480 train_time:65150ms step_avg:148.74ms step:449/1480 train_time:65310ms step_avg:148.77ms step:450/1480 train_time:65468ms step_avg:148.79ms step:451/1480 train_time:65628ms step_avg:148.82ms step:452/1480 train_time:65785ms step_avg:148.83ms step:453/1480 train_time:65942ms step_avg:148.85ms step:454/1480 train_time:66099ms step_avg:148.87ms step:455/1480 train_time:66255ms step_avg:148.89ms step:456/1480 train_time:66411ms step_avg:148.90ms 
step:457/1480 train_time:66569ms step_avg:148.92ms step:458/1480 train_time:66725ms step_avg:148.94ms step:459/1480 train_time:66883ms step_avg:148.96ms step:460/1480 train_time:67039ms step_avg:148.98ms step:461/1480 train_time:67199ms step_avg:149.00ms step:462/1480 train_time:67355ms step_avg:149.02ms step:463/1480 train_time:67513ms step_avg:149.04ms step:464/1480 train_time:67670ms step_avg:149.05ms step:465/1480 train_time:67827ms step_avg:149.07ms step:466/1480 train_time:67985ms step_avg:149.09ms step:467/1480 train_time:68141ms step_avg:149.10ms step:468/1480 train_time:68298ms step_avg:149.12ms step:469/1480 train_time:68454ms step_avg:149.14ms step:470/1480 train_time:68611ms step_avg:149.15ms step:471/1480 train_time:68768ms step_avg:149.17ms step:472/1480 train_time:68927ms step_avg:149.19ms step:473/1480 train_time:69084ms step_avg:149.21ms step:474/1480 train_time:69243ms step_avg:149.23ms step:475/1480 train_time:69398ms step_avg:149.24ms step:476/1480 train_time:69555ms step_avg:149.26ms step:477/1480 train_time:69712ms step_avg:149.28ms step:478/1480 train_time:69869ms step_avg:149.29ms step:479/1480 train_time:70027ms step_avg:149.31ms step:480/1480 train_time:70185ms step_avg:149.33ms step:481/1480 train_time:70343ms step_avg:149.35ms step:482/1480 train_time:70500ms step_avg:149.36ms step:483/1480 train_time:70655ms step_avg:149.38ms step:484/1480 train_time:70813ms step_avg:149.39ms step:485/1480 train_time:70970ms step_avg:149.41ms step:486/1480 train_time:71128ms step_avg:149.43ms step:487/1480 train_time:71286ms step_avg:149.45ms step:488/1480 train_time:71445ms step_avg:149.47ms step:489/1480 train_time:71601ms step_avg:149.48ms step:490/1480 train_time:71757ms step_avg:149.49ms step:491/1480 train_time:71913ms step_avg:149.51ms step:492/1480 train_time:72069ms step_avg:149.52ms step:493/1480 train_time:72227ms step_avg:149.54ms step:494/1480 train_time:72385ms step_avg:149.56ms step:495/1480 train_time:72543ms step_avg:149.57ms 
step:496/1480 train_time:72701ms step_avg:149.59ms step:497/1480 train_time:72856ms step_avg:149.60ms step:498/1480 train_time:73013ms step_avg:149.62ms step:499/1480 train_time:73169ms step_avg:149.63ms step:500/1480 train_time:73327ms step_avg:149.65ms step:500/1480 val_loss:3.6860 train_time:73389ms step_avg:149.77ms step:501/1480 train_time:73487ms step_avg:149.67ms step:502/1480 train_time:73646ms step_avg:149.69ms step:503/1480 train_time:73801ms step_avg:149.70ms step:504/1480 train_time:73956ms step_avg:149.71ms step:505/1480 train_time:74112ms step_avg:149.72ms step:506/1480 train_time:74269ms step_avg:149.74ms step:507/1480 train_time:74424ms step_avg:149.75ms step:508/1480 train_time:74583ms step_avg:149.76ms step:509/1480 train_time:74739ms step_avg:149.78ms step:510/1480 train_time:74895ms step_avg:149.79ms step:511/1480 train_time:75053ms step_avg:149.81ms step:512/1480 train_time:75209ms step_avg:149.82ms step:513/1480 train_time:75366ms step_avg:149.83ms step:514/1480 train_time:75522ms step_avg:149.85ms step:515/1480 train_time:75680ms step_avg:149.86ms step:516/1480 train_time:75838ms step_avg:149.88ms step:517/1480 train_time:75996ms step_avg:149.89ms step:518/1480 train_time:76154ms step_avg:149.91ms step:519/1480 train_time:76311ms step_avg:149.92ms step:520/1480 train_time:76470ms step_avg:149.94ms step:521/1480 train_time:76627ms step_avg:149.96ms step:522/1480 train_time:76784ms step_avg:149.97ms step:523/1480 train_time:76939ms step_avg:149.98ms step:524/1480 train_time:77096ms step_avg:149.99ms step:525/1480 train_time:77254ms step_avg:150.01ms step:526/1480 train_time:77413ms step_avg:150.02ms step:527/1480 train_time:77569ms step_avg:150.04ms step:528/1480 train_time:77726ms step_avg:150.05ms step:529/1480 train_time:77882ms step_avg:150.06ms step:530/1480 train_time:78039ms step_avg:150.07ms step:531/1480 train_time:78196ms step_avg:150.09ms step:532/1480 train_time:78353ms step_avg:150.10ms step:533/1480 train_time:78510ms 
step_avg:150.11ms step:534/1480 train_time:78666ms step_avg:150.13ms step:535/1480 train_time:78823ms step_avg:150.14ms step:536/1480 train_time:78981ms step_avg:150.15ms step:537/1480 train_time:79137ms step_avg:150.16ms step:538/1480 train_time:79294ms step_avg:150.18ms step:539/1480 train_time:79452ms step_avg:150.19ms step:540/1480 train_time:79609ms step_avg:150.20ms step:541/1480 train_time:79764ms step_avg:150.22ms step:542/1480 train_time:79920ms step_avg:150.23ms step:543/1480 train_time:80077ms step_avg:150.24ms step:544/1480 train_time:80234ms step_avg:150.25ms step:545/1480 train_time:80393ms step_avg:150.27ms step:546/1480 train_time:80549ms step_avg:150.28ms step:547/1480 train_time:80703ms step_avg:150.29ms step:548/1480 train_time:80861ms step_avg:150.30ms step:549/1480 train_time:81018ms step_avg:150.31ms step:550/1480 train_time:81176ms step_avg:150.33ms step:551/1480 train_time:81336ms step_avg:150.34ms step:552/1480 train_time:81497ms step_avg:150.36ms step:553/1480 train_time:81658ms step_avg:150.38ms step:554/1480 train_time:81818ms step_avg:150.40ms step:555/1480 train_time:81978ms step_avg:150.42ms step:556/1480 train_time:82135ms step_avg:150.43ms step:557/1480 train_time:82297ms step_avg:150.45ms step:558/1480 train_time:82456ms step_avg:150.47ms step:559/1480 train_time:82616ms step_avg:150.49ms step:560/1480 train_time:82777ms step_avg:150.50ms step:561/1480 train_time:82936ms step_avg:150.52ms step:562/1480 train_time:83097ms step_avg:150.54ms step:563/1480 train_time:83257ms step_avg:150.55ms step:564/1480 train_time:83417ms step_avg:150.57ms step:565/1480 train_time:83575ms step_avg:150.59ms step:566/1480 train_time:83735ms step_avg:150.60ms step:567/1480 train_time:83895ms step_avg:150.62ms step:568/1480 train_time:84054ms step_avg:150.63ms step:569/1480 train_time:84212ms step_avg:150.65ms step:570/1480 train_time:84372ms step_avg:150.66ms step:571/1480 train_time:84532ms step_avg:150.68ms step:572/1480 train_time:84693ms 
step_avg:150.70ms step:573/1480 train_time:84853ms step_avg:150.72ms step:574/1480 train_time:85014ms step_avg:150.73ms step:575/1480 train_time:85176ms step_avg:150.75ms step:576/1480 train_time:85335ms step_avg:150.77ms step:577/1480 train_time:85497ms step_avg:150.79ms step:578/1480 train_time:85657ms step_avg:150.80ms step:579/1480 train_time:85816ms step_avg:150.82ms step:580/1480 train_time:85977ms step_avg:150.84ms step:581/1480 train_time:86137ms step_avg:150.85ms step:582/1480 train_time:86298ms step_avg:150.87ms step:583/1480 train_time:86457ms step_avg:150.88ms step:584/1480 train_time:86617ms step_avg:150.90ms step:585/1480 train_time:86776ms step_avg:150.91ms step:586/1480 train_time:86935ms step_avg:150.93ms step:587/1480 train_time:87097ms step_avg:150.95ms step:588/1480 train_time:87256ms step_avg:150.96ms step:589/1480 train_time:87417ms step_avg:150.98ms step:590/1480 train_time:87578ms step_avg:151.00ms step:591/1480 train_time:87735ms step_avg:151.01ms step:592/1480 train_time:87896ms step_avg:151.02ms step:593/1480 train_time:88057ms step_avg:151.04ms step:594/1480 train_time:88217ms step_avg:151.06ms step:595/1480 train_time:88379ms step_avg:151.08ms step:596/1480 train_time:88541ms step_avg:151.09ms step:597/1480 train_time:88700ms step_avg:151.11ms step:598/1480 train_time:88857ms step_avg:151.12ms step:599/1480 train_time:89016ms step_avg:151.13ms step:600/1480 train_time:89177ms step_avg:151.15ms step:601/1480 train_time:89337ms step_avg:151.16ms step:602/1480 train_time:89498ms step_avg:151.18ms step:603/1480 train_time:89657ms step_avg:151.19ms step:604/1480 train_time:89818ms step_avg:151.21ms step:605/1480 train_time:89977ms step_avg:151.22ms step:606/1480 train_time:90138ms step_avg:151.24ms step:607/1480 train_time:90298ms step_avg:151.25ms step:608/1480 train_time:90458ms step_avg:151.27ms step:609/1480 train_time:90617ms step_avg:151.28ms step:610/1480 train_time:90777ms step_avg:151.29ms step:611/1480 train_time:90936ms 
step_avg:151.31ms step:612/1480 train_time:91097ms step_avg:151.32ms step:613/1480 train_time:91258ms step_avg:151.34ms step:614/1480 train_time:91417ms step_avg:151.35ms step:615/1480 train_time:91577ms step_avg:151.37ms step:616/1480 train_time:91736ms step_avg:151.38ms step:617/1480 train_time:91896ms step_avg:151.39ms step:618/1480 train_time:92056ms step_avg:151.41ms step:619/1480 train_time:92216ms step_avg:151.42ms step:620/1480 train_time:92376ms step_avg:151.44ms step:621/1480 train_time:92535ms step_avg:151.45ms step:622/1480 train_time:92698ms step_avg:151.47ms step:623/1480 train_time:92859ms step_avg:151.48ms step:624/1480 train_time:93018ms step_avg:151.49ms step:625/1480 train_time:93177ms step_avg:151.51ms step:625/1480 val_loss:3.6049 train_time:93239ms step_avg:151.61ms step:626/1480 train_time:93340ms step_avg:151.53ms step:627/1480 train_time:93501ms step_avg:151.54ms step:628/1480 train_time:93659ms step_avg:151.55ms step:629/1480 train_time:93816ms step_avg:151.56ms step:630/1480 train_time:93974ms step_avg:151.57ms step:631/1480 train_time:94132ms step_avg:151.58ms step:632/1480 train_time:94291ms step_avg:151.59ms step:633/1480 train_time:94452ms step_avg:151.61ms step:634/1480 train_time:94611ms step_avg:151.62ms step:635/1480 train_time:94771ms step_avg:151.63ms step:636/1480 train_time:94931ms step_avg:151.65ms step:637/1480 train_time:95090ms step_avg:151.66ms step:638/1480 train_time:95249ms step_avg:151.67ms step:639/1480 train_time:95407ms step_avg:151.68ms step:640/1480 train_time:95567ms step_avg:151.69ms step:641/1480 train_time:95729ms step_avg:151.71ms step:642/1480 train_time:95889ms step_avg:151.72ms step:643/1480 train_time:96049ms step_avg:151.74ms step:644/1480 train_time:96208ms step_avg:151.75ms step:645/1480 train_time:96367ms step_avg:151.76ms step:646/1480 train_time:96527ms step_avg:151.77ms step:647/1480 train_time:96687ms step_avg:151.79ms step:648/1480 train_time:96848ms step_avg:151.80ms step:649/1480 
train_time:97009ms step_avg:151.81ms step:650/1480 train_time:97170ms step_avg:151.83ms step:651/1480 train_time:97329ms step_avg:151.84ms step:652/1480 train_time:97488ms step_avg:151.85ms step:653/1480 train_time:97648ms step_avg:151.86ms step:654/1480 train_time:97809ms step_avg:151.88ms step:655/1480 train_time:97970ms step_avg:151.89ms step:656/1480 train_time:98130ms step_avg:151.90ms step:657/1480 train_time:98289ms step_avg:151.92ms step:658/1480 train_time:98449ms step_avg:151.93ms step:659/1480 train_time:98609ms step_avg:151.94ms step:660/1480 train_time:98772ms step_avg:151.96ms step:661/1480 train_time:98935ms step_avg:151.97ms step:662/1480 train_time:99094ms step_avg:151.98ms step:663/1480 train_time:99254ms step_avg:152.00ms step:664/1480 train_time:99415ms step_avg:152.01ms step:665/1480 train_time:99576ms step_avg:152.02ms step:666/1480 train_time:99736ms step_avg:152.04ms step:667/1480 train_time:99898ms step_avg:152.05ms step:668/1480 train_time:100062ms step_avg:152.07ms step:669/1480 train_time:100223ms step_avg:152.08ms step:670/1480 train_time:100384ms step_avg:152.10ms step:671/1480 train_time:100547ms step_avg:152.11ms step:672/1480 train_time:100710ms step_avg:152.13ms step:673/1480 train_time:100872ms step_avg:152.14ms step:674/1480 train_time:101033ms step_avg:152.16ms step:675/1480 train_time:101195ms step_avg:152.17ms step:676/1480 train_time:101357ms step_avg:152.19ms step:677/1480 train_time:101517ms step_avg:152.20ms step:678/1480 train_time:101677ms step_avg:152.21ms step:679/1480 train_time:101839ms step_avg:152.23ms step:680/1480 train_time:102002ms step_avg:152.24ms step:681/1480 train_time:102164ms step_avg:152.26ms step:682/1480 train_time:102326ms step_avg:152.27ms step:683/1480 train_time:102488ms step_avg:152.29ms step:684/1480 train_time:102650ms step_avg:152.30ms step:685/1480 train_time:102812ms step_avg:152.31ms step:686/1480 train_time:102973ms step_avg:152.33ms step:687/1480 train_time:103134ms step_avg:152.34ms 
step:688/1480 train_time:103297ms step_avg:152.35ms step:689/1480 train_time:103459ms step_avg:152.37ms step:690/1480 train_time:103623ms step_avg:152.39ms step:691/1480 train_time:103785ms step_avg:152.40ms step:692/1480 train_time:103947ms step_avg:152.42ms step:693/1480 train_time:104110ms step_avg:152.43ms step:694/1480 train_time:104272ms step_avg:152.44ms step:695/1480 train_time:104432ms step_avg:152.46ms step:696/1480 train_time:104592ms step_avg:152.47ms step:697/1480 train_time:104756ms step_avg:152.48ms step:698/1480 train_time:104915ms step_avg:152.49ms step:699/1480 train_time:105078ms step_avg:152.51ms step:700/1480 train_time:105239ms step_avg:152.52ms step:701/1480 train_time:105400ms step_avg:152.53ms step:702/1480 train_time:105559ms step_avg:152.54ms step:703/1480 train_time:105720ms step_avg:152.55ms step:704/1480 train_time:105881ms step_avg:152.57ms step:705/1480 train_time:106045ms step_avg:152.58ms step:706/1480 train_time:106209ms step_avg:152.60ms step:707/1480 train_time:106370ms step_avg:152.61ms step:708/1480 train_time:106530ms step_avg:152.62ms step:709/1480 train_time:106691ms step_avg:152.63ms step:710/1480 train_time:106852ms step_avg:152.65ms step:711/1480 train_time:107014ms step_avg:152.66ms step:712/1480 train_time:107177ms step_avg:152.67ms step:713/1480 train_time:107342ms step_avg:152.69ms step:714/1480 train_time:107503ms step_avg:152.70ms step:715/1480 train_time:107665ms step_avg:152.72ms step:716/1480 train_time:107826ms step_avg:152.73ms step:717/1480 train_time:107990ms step_avg:152.74ms step:718/1480 train_time:108149ms step_avg:152.75ms step:719/1480 train_time:108310ms step_avg:152.76ms step:720/1480 train_time:108472ms step_avg:152.78ms step:721/1480 train_time:108633ms step_avg:152.79ms step:722/1480 train_time:108794ms step_avg:152.80ms step:723/1480 train_time:108954ms step_avg:152.81ms step:724/1480 train_time:109116ms step_avg:152.82ms step:725/1480 train_time:109280ms step_avg:152.84ms step:726/1480 
train_time:109445ms step_avg:152.86ms step:727/1480 train_time:109608ms step_avg:152.87ms step:728/1480 train_time:109770ms step_avg:152.88ms step:729/1480 train_time:109931ms step_avg:152.89ms step:730/1480 train_time:110093ms step_avg:152.91ms step:731/1480 train_time:110253ms step_avg:152.92ms step:732/1480 train_time:110411ms step_avg:152.92ms step:733/1480 train_time:110573ms step_avg:152.94ms step:734/1480 train_time:110735ms step_avg:152.95ms step:735/1480 train_time:110895ms step_avg:152.96ms step:736/1480 train_time:111056ms step_avg:152.97ms step:737/1480 train_time:111219ms step_avg:152.98ms step:738/1480 train_time:111379ms step_avg:152.99ms step:739/1480 train_time:111538ms step_avg:153.00ms step:740/1480 train_time:111703ms step_avg:153.02ms step:741/1480 train_time:111868ms step_avg:153.03ms step:742/1480 train_time:112032ms step_avg:153.05ms step:743/1480 train_time:112192ms step_avg:153.06ms step:744/1480 train_time:112355ms step_avg:153.07ms step:745/1480 train_time:112519ms step_avg:153.09ms step:746/1480 train_time:112678ms step_avg:153.09ms step:747/1480 train_time:112839ms step_avg:153.11ms step:748/1480 train_time:113005ms step_avg:153.12ms step:749/1480 train_time:113171ms step_avg:153.14ms step:750/1480 train_time:113330ms step_avg:153.15ms step:750/1480 val_loss:3.5491 train_time:113395ms step_avg:153.24ms step:751/1480 train_time:113497ms step_avg:153.17ms step:752/1480 train_time:113658ms step_avg:153.18ms step:753/1480 train_time:113819ms step_avg:153.19ms step:754/1480 train_time:113979ms step_avg:153.20ms step:755/1480 train_time:114139ms step_avg:153.21ms step:756/1480 train_time:114300ms step_avg:153.22ms step:757/1480 train_time:114464ms step_avg:153.23ms step:758/1480 train_time:114625ms step_avg:153.24ms step:759/1480 train_time:114787ms step_avg:153.25ms step:760/1480 train_time:114949ms step_avg:153.27ms step:761/1480 train_time:115114ms step_avg:153.28ms step:762/1480 train_time:115274ms step_avg:153.29ms step:763/1480 
train_time:115436ms step_avg:153.30ms step:764/1480 train_time:115598ms step_avg:153.31ms step:765/1480 train_time:115759ms step_avg:153.32ms step:766/1480 train_time:115921ms step_avg:153.33ms step:767/1480 train_time:116083ms step_avg:153.35ms step:768/1480 train_time:116245ms step_avg:153.36ms step:769/1480 train_time:116410ms step_avg:153.37ms step:770/1480 train_time:116572ms step_avg:153.38ms step:771/1480 train_time:116736ms step_avg:153.40ms step:772/1480 train_time:116897ms step_avg:153.41ms step:773/1480 train_time:117060ms step_avg:153.42ms step:774/1480 train_time:117223ms step_avg:153.43ms step:775/1480 train_time:117386ms step_avg:153.45ms step:776/1480 train_time:117549ms step_avg:153.46ms step:777/1480 train_time:117715ms step_avg:153.47ms step:778/1480 train_time:117877ms step_avg:153.49ms step:779/1480 train_time:118040ms step_avg:153.50ms step:780/1480 train_time:118204ms step_avg:153.51ms step:781/1480 train_time:118368ms step_avg:153.53ms step:782/1480 train_time:118533ms step_avg:153.54ms step:783/1480 train_time:118695ms step_avg:153.55ms step:784/1480 train_time:118857ms step_avg:153.56ms step:785/1480 train_time:119019ms step_avg:153.57ms step:786/1480 train_time:119184ms step_avg:153.59ms step:787/1480 train_time:119347ms step_avg:153.60ms step:788/1480 train_time:119513ms step_avg:153.62ms step:789/1480 train_time:119674ms step_avg:153.63ms step:790/1480 train_time:119839ms step_avg:153.64ms step:791/1480 train_time:120004ms step_avg:153.65ms step:792/1480 train_time:120171ms step_avg:153.67ms step:793/1480 train_time:120334ms step_avg:153.68ms step:794/1480 train_time:120497ms step_avg:153.69ms step:795/1480 train_time:120662ms step_avg:153.71ms step:796/1480 train_time:120829ms step_avg:153.73ms step:797/1480 train_time:120994ms step_avg:153.74ms step:798/1480 train_time:121157ms step_avg:153.75ms step:799/1480 train_time:121325ms step_avg:153.77ms step:800/1480 train_time:121489ms step_avg:153.78ms step:801/1480 train_time:121652ms 
step_avg:153.80ms step:802/1480 train_time:121819ms step_avg:153.81ms step:803/1480 train_time:121981ms step_avg:153.82ms step:804/1480 train_time:122142ms step_avg:153.83ms step:805/1480 train_time:122307ms step_avg:153.85ms step:806/1480 train_time:122470ms step_avg:153.86ms step:807/1480 train_time:122632ms step_avg:153.87ms step:808/1480 train_time:122796ms step_avg:153.88ms step:809/1480 train_time:122957ms step_avg:153.89ms step:810/1480 train_time:123120ms step_avg:153.90ms step:811/1480 train_time:123284ms step_avg:153.91ms step:812/1480 train_time:123448ms step_avg:153.93ms step:813/1480 train_time:123610ms step_avg:153.93ms step:814/1480 train_time:123772ms step_avg:153.95ms step:815/1480 train_time:123936ms step_avg:153.96ms step:816/1480 train_time:124099ms step_avg:153.97ms step:817/1480 train_time:124260ms step_avg:153.98ms step:818/1480 train_time:124421ms step_avg:153.99ms step:819/1480 train_time:124585ms step_avg:154.00ms step:820/1480 train_time:124748ms step_avg:154.01ms step:821/1480 train_time:124912ms step_avg:154.02ms step:822/1480 train_time:125075ms step_avg:154.03ms step:823/1480 train_time:125237ms step_avg:154.04ms step:824/1480 train_time:125397ms step_avg:154.05ms step:825/1480 train_time:125562ms step_avg:154.06ms step:826/1480 train_time:125729ms step_avg:154.08ms step:827/1480 train_time:125894ms step_avg:154.09ms step:828/1480 train_time:126057ms step_avg:154.10ms step:829/1480 train_time:126220ms step_avg:154.11ms step:830/1480 train_time:126384ms step_avg:154.13ms step:831/1480 train_time:126550ms step_avg:154.14ms step:832/1480 train_time:126714ms step_avg:154.15ms step:833/1480 train_time:126877ms step_avg:154.16ms step:834/1480 train_time:127041ms step_avg:154.18ms step:835/1480 train_time:127205ms step_avg:154.19ms step:836/1480 train_time:127368ms step_avg:154.20ms step:837/1480 train_time:127532ms step_avg:154.21ms step:838/1480 train_time:127696ms step_avg:154.22ms step:839/1480 train_time:127858ms step_avg:154.23ms 
step:840/1480 train_time:128018ms step_avg:154.24ms step:841/1480 train_time:128178ms step_avg:154.25ms step:842/1480 train_time:128340ms step_avg:154.25ms step:843/1480 train_time:128502ms step_avg:154.26ms step:844/1480 train_time:128664ms step_avg:154.27ms step:845/1480 train_time:128829ms step_avg:154.29ms step:846/1480 train_time:128994ms step_avg:154.30ms step:847/1480 train_time:129158ms step_avg:154.31ms step:848/1480 train_time:129319ms step_avg:154.32ms step:849/1480 train_time:129482ms step_avg:154.33ms step:850/1480 train_time:129645ms step_avg:154.34ms step:851/1480 train_time:129809ms step_avg:154.35ms step:852/1480 train_time:129972ms step_avg:154.36ms step:853/1480 train_time:130135ms step_avg:154.37ms step:854/1480 train_time:130298ms step_avg:154.38ms step:855/1480 train_time:130461ms step_avg:154.39ms step:856/1480 train_time:130623ms step_avg:154.40ms step:857/1480 train_time:130789ms step_avg:154.41ms step:858/1480 train_time:130954ms step_avg:154.43ms step:859/1480 train_time:131118ms step_avg:154.44ms step:860/1480 train_time:131279ms step_avg:154.45ms step:861/1480 train_time:131445ms step_avg:154.46ms step:862/1480 train_time:131614ms step_avg:154.48ms step:863/1480 train_time:131781ms step_avg:154.49ms step:864/1480 train_time:131945ms step_avg:154.50ms step:865/1480 train_time:132108ms step_avg:154.51ms step:866/1480 train_time:132275ms step_avg:154.53ms step:867/1480 train_time:132438ms step_avg:154.54ms step:868/1480 train_time:132600ms step_avg:154.55ms step:869/1480 train_time:132762ms step_avg:154.55ms step:870/1480 train_time:132927ms step_avg:154.57ms step:871/1480 train_time:133090ms step_avg:154.58ms step:872/1480 train_time:133255ms step_avg:154.59ms step:873/1480 train_time:133418ms step_avg:154.60ms step:874/1480 train_time:133583ms step_avg:154.61ms step:875/1480 train_time:133747ms step_avg:154.62ms step:875/1480 val_loss:3.5063 train_time:133813ms step_avg:154.70ms step:876/1480 train_time:133915ms step_avg:154.64ms 
step:877/1480 train_time:134082ms step_avg:154.65ms step:878/1480 train_time:134244ms step_avg:154.66ms step:879/1480 train_time:134407ms step_avg:154.67ms step:880/1480 train_time:134569ms step_avg:154.68ms step:881/1480 train_time:134731ms step_avg:154.69ms step:882/1480 train_time:134896ms step_avg:154.70ms step:883/1480 train_time:135064ms step_avg:154.71ms step:884/1480 train_time:135230ms step_avg:154.73ms step:885/1480 train_time:135395ms step_avg:154.74ms step:886/1480 train_time:135562ms step_avg:154.75ms step:887/1480 train_time:135729ms step_avg:154.77ms step:888/1480 train_time:135902ms step_avg:154.79ms step:889/1480 train_time:136069ms step_avg:154.80ms step:890/1480 train_time:136231ms step_avg:154.81ms step:891/1480 train_time:136399ms step_avg:154.82ms step:892/1480 train_time:136565ms step_avg:154.84ms step:893/1480 train_time:136726ms step_avg:154.84ms step:894/1480 train_time:136893ms step_avg:154.86ms step:895/1480 train_time:137060ms step_avg:154.87ms step:896/1480 train_time:137223ms step_avg:154.88ms step:897/1480 train_time:137391ms step_avg:154.89ms step:898/1480 train_time:137560ms step_avg:154.91ms step:899/1480 train_time:137723ms step_avg:154.92ms step:900/1480 train_time:137887ms step_avg:154.93ms step:901/1480 train_time:138052ms step_avg:154.94ms step:902/1480 train_time:138216ms step_avg:154.95ms step:903/1480 train_time:138390ms step_avg:154.97ms step:904/1480 train_time:138554ms step_avg:154.98ms step:905/1480 train_time:138717ms step_avg:154.99ms step:906/1480 train_time:138884ms step_avg:155.00ms step:907/1480 train_time:139052ms step_avg:155.02ms step:908/1480 train_time:139215ms step_avg:155.03ms step:909/1480 train_time:139381ms step_avg:155.04ms step:910/1480 train_time:139551ms step_avg:155.06ms step:911/1480 train_time:139715ms step_avg:155.07ms step:912/1480 train_time:139882ms step_avg:155.08ms step:913/1480 train_time:140048ms step_avg:155.09ms step:914/1480 train_time:140215ms step_avg:155.10ms step:915/1480 
train_time:140385ms step_avg:155.12ms step:916/1480 train_time:140548ms step_avg:155.13ms step:917/1480 train_time:140712ms step_avg:155.14ms step:918/1480 train_time:140880ms step_avg:155.15ms step:919/1480 train_time:141049ms step_avg:155.17ms step:920/1480 train_time:141214ms step_avg:155.18ms step:921/1480 train_time:141381ms step_avg:155.19ms step:922/1480 train_time:141548ms step_avg:155.21ms step:923/1480 train_time:141710ms step_avg:155.21ms step:924/1480 train_time:141875ms step_avg:155.22ms step:925/1480 train_time:142041ms step_avg:155.24ms step:926/1480 train_time:142203ms step_avg:155.24ms step:927/1480 train_time:142369ms step_avg:155.26ms step:928/1480 train_time:142535ms step_avg:155.27ms step:929/1480 train_time:142701ms step_avg:155.28ms step:930/1480 train_time:142866ms step_avg:155.29ms step:931/1480 train_time:143030ms step_avg:155.30ms step:932/1480 train_time:143195ms step_avg:155.31ms step:933/1480 train_time:143363ms step_avg:155.32ms step:934/1480 train_time:143529ms step_avg:155.33ms step:935/1480 train_time:143702ms step_avg:155.35ms step:936/1480 train_time:143869ms step_avg:155.37ms step:937/1480 train_time:144040ms step_avg:155.38ms step:938/1480 train_time:144202ms step_avg:155.39ms step:939/1480 train_time:144370ms step_avg:155.40ms step:940/1480 train_time:144537ms step_avg:155.42ms step:941/1480 train_time:144701ms step_avg:155.43ms step:942/1480 train_time:144866ms step_avg:155.44ms step:943/1480 train_time:145038ms step_avg:155.45ms step:944/1480 train_time:145208ms step_avg:155.47ms step:945/1480 train_time:145374ms step_avg:155.48ms step:946/1480 train_time:145544ms step_avg:155.50ms step:947/1480 train_time:145711ms step_avg:155.51ms step:948/1480 train_time:145877ms step_avg:155.52ms step:949/1480 train_time:146043ms step_avg:155.53ms step:950/1480 train_time:146207ms step_avg:155.54ms step:951/1480 train_time:146375ms step_avg:155.55ms step:952/1480 train_time:146541ms step_avg:155.56ms step:953/1480 train_time:146709ms 
step_avg:155.58ms step:954/1480 train_time:146878ms step_avg:155.59ms step:955/1480 train_time:147041ms step_avg:155.60ms step:956/1480 train_time:147204ms step_avg:155.61ms step:957/1480 train_time:147372ms step_avg:155.62ms step:958/1480 train_time:147542ms step_avg:155.63ms step:959/1480 train_time:147708ms step_avg:155.65ms step:960/1480 train_time:147877ms step_avg:155.66ms step:961/1480 train_time:148042ms step_avg:155.67ms step:962/1480 train_time:148205ms step_avg:155.68ms step:963/1480 train_time:148370ms step_avg:155.69ms step:964/1480 train_time:148541ms step_avg:155.70ms step:965/1480 train_time:148706ms step_avg:155.71ms step:966/1480 train_time:148872ms step_avg:155.72ms step:967/1480 train_time:149037ms step_avg:155.73ms step:968/1480 train_time:149203ms step_avg:155.74ms step:969/1480 train_time:149368ms step_avg:155.75ms step:970/1480 train_time:149530ms step_avg:155.76ms step:971/1480 train_time:149697ms step_avg:155.77ms step:972/1480 train_time:149862ms step_avg:155.78ms step:973/1480 train_time:150025ms step_avg:155.79ms step:974/1480 train_time:150195ms step_avg:155.80ms step:975/1480 train_time:150361ms step_avg:155.81ms step:976/1480 train_time:150525ms step_avg:155.82ms step:977/1480 train_time:150689ms step_avg:155.83ms step:978/1480 train_time:150857ms step_avg:155.84ms step:979/1480 train_time:151023ms step_avg:155.85ms step:980/1480 train_time:151188ms step_avg:155.86ms step:981/1480 train_time:151357ms step_avg:155.88ms step:982/1480 train_time:151519ms step_avg:155.88ms step:983/1480 train_time:151685ms step_avg:155.89ms step:984/1480 train_time:151848ms step_avg:155.90ms step:985/1480 train_time:152015ms step_avg:155.91ms step:986/1480 train_time:152183ms step_avg:155.92ms step:987/1480 train_time:152346ms step_avg:155.93ms step:988/1480 train_time:152514ms step_avg:155.94ms step:989/1480 train_time:152681ms step_avg:155.96ms step:990/1480 train_time:152849ms step_avg:155.97ms step:991/1480 train_time:153016ms step_avg:155.98ms 
step:992/1480 train_time:153191ms step_avg:156.00ms step:993/1480 train_time:153367ms step_avg:156.02ms step:994/1480 train_time:153533ms step_avg:156.03ms step:995/1480 train_time:153697ms step_avg:156.04ms step:996/1480 train_time:153861ms step_avg:156.05ms step:997/1480 train_time:154025ms step_avg:156.05ms step:998/1480 train_time:154188ms step_avg:156.06ms step:999/1480 train_time:154355ms step_avg:156.07ms step:1000/1480 train_time:154524ms step_avg:156.08ms step:1000/1480 val_loss:3.4412 train_time:154592ms step_avg:156.15ms step:1001/1480 train_time:154694ms step_avg:156.10ms step:1002/1480 train_time:154861ms step_avg:156.11ms step:1003/1480 train_time:155033ms step_avg:156.13ms step:1004/1480 train_time:155202ms step_avg:156.14ms step:1005/1480 train_time:155371ms step_avg:156.15ms step:1006/1480 train_time:155539ms step_avg:156.16ms step:1007/1480 train_time:155705ms step_avg:156.17ms step:1008/1480 train_time:155874ms step_avg:156.19ms step:1009/1480 train_time:156048ms step_avg:156.20ms step:1010/1480 train_time:156214ms step_avg:156.21ms step:1011/1480 train_time:156379ms step_avg:156.22ms step:1012/1480 train_time:156543ms step_avg:156.23ms step:1013/1480 train_time:156713ms step_avg:156.24ms step:1014/1480 train_time:156879ms step_avg:156.25ms step:1015/1480 train_time:157050ms step_avg:156.27ms step:1016/1480 train_time:157218ms step_avg:156.28ms step:1017/1480 train_time:157389ms step_avg:156.29ms step:1018/1480 train_time:157557ms step_avg:156.31ms step:1019/1480 train_time:157726ms step_avg:156.32ms step:1020/1480 train_time:157897ms step_avg:156.33ms step:1021/1480 train_time:158063ms step_avg:156.34ms step:1022/1480 train_time:158232ms step_avg:156.36ms step:1023/1480 train_time:158397ms step_avg:156.36ms step:1024/1480 train_time:158562ms step_avg:156.37ms step:1025/1480 train_time:158734ms step_avg:156.39ms step:1026/1480 train_time:158900ms step_avg:156.40ms step:1027/1480 train_time:159068ms step_avg:156.41ms step:1028/1480 
train_time:159241ms step_avg:156.43ms step:1029/1480 train_time:159417ms step_avg:156.44ms step:1030/1480 train_time:159585ms step_avg:156.46ms step:1031/1480 train_time:159750ms step_avg:156.46ms step:1032/1480 train_time:159921ms step_avg:156.48ms step:1033/1480 train_time:160087ms step_avg:156.49ms step:1034/1480 train_time:160254ms step_avg:156.50ms step:1035/1480 train_time:160421ms step_avg:156.51ms step:1036/1480 train_time:160586ms step_avg:156.52ms step:1037/1480 train_time:160753ms step_avg:156.53ms step:1038/1480 train_time:160921ms step_avg:156.54ms step:1039/1480 train_time:161092ms step_avg:156.55ms step:1040/1480 train_time:161258ms step_avg:156.56ms step:1041/1480 train_time:161425ms step_avg:156.57ms step:1042/1480 train_time:161590ms step_avg:156.58ms step:1043/1480 train_time:161754ms step_avg:156.59ms step:1044/1480 train_time:161919ms step_avg:156.59ms step:1045/1480 train_time:162091ms step_avg:156.61ms step:1046/1480 train_time:162258ms step_avg:156.62ms step:1047/1480 train_time:162425ms step_avg:156.63ms step:1048/1480 train_time:162592ms step_avg:156.64ms step:1049/1480 train_time:162758ms step_avg:156.65ms step:1050/1480 train_time:162927ms step_avg:156.66ms step:1051/1480 train_time:163097ms step_avg:156.67ms step:1052/1480 train_time:163265ms step_avg:156.68ms step:1053/1480 train_time:163432ms step_avg:156.69ms step:1054/1480 train_time:163600ms step_avg:156.70ms step:1055/1480 train_time:163766ms step_avg:156.71ms step:1056/1480 train_time:163931ms step_avg:156.72ms step:1057/1480 train_time:164097ms step_avg:156.73ms step:1058/1480 train_time:164268ms step_avg:156.74ms step:1059/1480 train_time:164441ms step_avg:156.76ms step:1060/1480 train_time:164611ms step_avg:156.77ms step:1061/1480 train_time:164774ms step_avg:156.78ms step:1062/1480 train_time:164940ms step_avg:156.79ms step:1063/1480 train_time:165106ms step_avg:156.80ms step:1064/1480 train_time:165270ms step_avg:156.80ms step:1065/1480 train_time:165437ms step_avg:156.81ms 
step:1066/1480 train_time:165607ms step_avg:156.83ms step:1067/1480 train_time:165776ms step_avg:156.84ms step:1068/1480 train_time:165941ms step_avg:156.84ms step:1069/1480 train_time:166113ms step_avg:156.86ms step:1070/1480 train_time:166279ms step_avg:156.87ms step:1071/1480 train_time:166452ms step_avg:156.88ms step:1072/1480 train_time:166619ms step_avg:156.89ms step:1073/1480 train_time:166782ms step_avg:156.90ms step:1074/1480 train_time:166950ms step_avg:156.91ms step:1075/1480 train_time:167121ms step_avg:156.92ms step:1076/1480 train_time:167289ms step_avg:156.93ms step:1077/1480 train_time:167454ms step_avg:156.94ms step:1078/1480 train_time:167628ms step_avg:156.95ms step:1079/1480 train_time:167799ms step_avg:156.97ms step:1080/1480 train_time:167969ms step_avg:156.98ms step:1081/1480 train_time:168136ms step_avg:156.99ms step:1082/1480 train_time:168303ms step_avg:157.00ms step:1083/1480 train_time:168469ms step_avg:157.01ms step:1084/1480 train_time:168636ms step_avg:157.02ms step:1085/1480 train_time:168804ms step_avg:157.03ms step:1086/1480 train_time:168972ms step_avg:157.04ms step:1087/1480 train_time:169137ms step_avg:157.04ms step:1088/1480 train_time:169307ms step_avg:157.06ms step:1089/1480 train_time:169480ms step_avg:157.07ms step:1090/1480 train_time:169652ms step_avg:157.08ms step:1091/1480 train_time:169819ms step_avg:157.09ms step:1092/1480 train_time:169988ms step_avg:157.11ms step:1093/1480 train_time:170155ms step_avg:157.11ms step:1094/1480 train_time:170320ms step_avg:157.12ms step:1095/1480 train_time:170485ms step_avg:157.13ms step:1096/1480 train_time:170654ms step_avg:157.14ms step:1097/1480 train_time:170822ms step_avg:157.15ms step:1098/1480 train_time:170993ms step_avg:157.16ms step:1099/1480 train_time:171163ms step_avg:157.17ms step:1100/1480 train_time:171333ms step_avg:157.19ms step:1101/1480 train_time:171503ms step_avg:157.20ms step:1102/1480 train_time:171675ms step_avg:157.21ms step:1103/1480 train_time:171851ms 
step_avg:157.23ms step:1104/1480 train_time:172019ms step_avg:157.24ms step:1105/1480 train_time:172189ms step_avg:157.25ms step:1106/1480 train_time:172357ms step_avg:157.26ms step:1107/1480 train_time:172525ms step_avg:157.27ms step:1108/1480 train_time:172691ms step_avg:157.28ms step:1109/1480 train_time:172857ms step_avg:157.29ms step:1110/1480 train_time:173022ms step_avg:157.29ms step:1111/1480 train_time:173191ms step_avg:157.30ms step:1112/1480 train_time:173362ms step_avg:157.32ms step:1113/1480 train_time:173540ms step_avg:157.33ms step:1114/1480 train_time:173715ms step_avg:157.35ms step:1115/1480 train_time:173887ms step_avg:157.36ms step:1116/1480 train_time:174055ms step_avg:157.37ms step:1117/1480 train_time:174227ms step_avg:157.39ms step:1118/1480 train_time:174404ms step_avg:157.40ms step:1119/1480 train_time:174570ms step_avg:157.41ms step:1120/1480 train_time:174739ms step_avg:157.42ms step:1121/1480 train_time:174909ms step_avg:157.43ms step:1122/1480 train_time:175075ms step_avg:157.44ms step:1123/1480 train_time:175240ms step_avg:157.45ms step:1124/1480 train_time:175409ms step_avg:157.46ms step:1125/1480 train_time:175577ms step_avg:157.47ms step:1125/1480 val_loss:3.3860 train_time:175644ms step_avg:157.53ms step:1126/1480 train_time:175745ms step_avg:157.48ms step:1127/1480 train_time:175916ms step_avg:157.49ms step:1128/1480 train_time:176086ms step_avg:157.50ms step:1129/1480 train_time:176260ms step_avg:157.52ms step:1130/1480 train_time:176429ms step_avg:157.53ms step:1131/1480 train_time:176606ms step_avg:157.54ms step:1132/1480 train_time:176771ms step_avg:157.55ms step:1133/1480 train_time:176944ms step_avg:157.56ms step:1134/1480 train_time:177115ms step_avg:157.58ms step:1135/1480 train_time:177282ms step_avg:157.58ms step:1136/1480 train_time:177450ms step_avg:157.59ms step:1137/1480 train_time:177621ms step_avg:157.61ms step:1138/1480 train_time:177789ms step_avg:157.61ms step:1139/1480 train_time:177958ms step_avg:157.62ms 
step:1140/1480 train_time:178127ms step_avg:157.63ms step:1141/1480 train_time:178299ms step_avg:157.65ms step:1142/1480 train_time:178466ms step_avg:157.66ms step:1143/1480 train_time:178638ms step_avg:157.67ms step:1144/1480 train_time:178806ms step_avg:157.68ms step:1145/1480 train_time:178970ms step_avg:157.68ms step:1146/1480 train_time:179142ms step_avg:157.69ms step:1147/1480 train_time:179309ms step_avg:157.70ms step:1148/1480 train_time:179477ms step_avg:157.71ms step:1149/1480 train_time:179648ms step_avg:157.72ms step:1150/1480 train_time:179815ms step_avg:157.73ms step:1151/1480 train_time:179986ms step_avg:157.74ms step:1152/1480 train_time:180158ms step_avg:157.76ms step:1153/1480 train_time:180331ms step_avg:157.77ms step:1154/1480 train_time:180498ms step_avg:157.78ms step:1155/1480 train_time:180669ms step_avg:157.79ms step:1156/1480 train_time:180848ms step_avg:157.81ms step:1157/1480 train_time:181018ms step_avg:157.82ms step:1158/1480 train_time:181185ms step_avg:157.83ms step:1159/1480 train_time:181352ms step_avg:157.83ms step:1160/1480 train_time:181519ms step_avg:157.84ms step:1161/1480 train_time:181690ms step_avg:157.85ms step:1162/1480 train_time:181861ms step_avg:157.87ms step:1163/1480 train_time:182030ms step_avg:157.88ms step:1164/1480 train_time:182201ms step_avg:157.89ms step:1165/1480 train_time:182367ms step_avg:157.89ms step:1166/1480 train_time:182536ms step_avg:157.90ms step:1167/1480 train_time:182705ms step_avg:157.91ms step:1168/1480 train_time:182872ms step_avg:157.92ms step:1169/1480 train_time:183043ms step_avg:157.93ms step:1170/1480 train_time:183210ms step_avg:157.94ms step:1171/1480 train_time:183377ms step_avg:157.95ms step:1172/1480 train_time:183545ms step_avg:157.96ms step:1173/1480 train_time:183715ms step_avg:157.97ms step:1174/1480 train_time:183895ms step_avg:157.99ms step:1175/1480 train_time:184067ms step_avg:158.00ms step:1176/1480 train_time:184240ms step_avg:158.01ms step:1177/1480 train_time:184416ms 
step_avg:158.03ms step:1178/1480 train_time:184583ms step_avg:158.03ms step:1179/1480 train_time:184748ms step_avg:158.04ms step:1180/1480 train_time:184928ms step_avg:158.06ms step:1181/1480 train_time:185099ms step_avg:158.07ms step:1182/1480 train_time:185266ms step_avg:158.08ms step:1183/1480 train_time:185437ms step_avg:158.09ms step:1184/1480 train_time:185604ms step_avg:158.10ms step:1185/1480 train_time:185777ms step_avg:158.11ms step:1186/1480 train_time:185948ms step_avg:158.12ms step:1187/1480 train_time:186131ms step_avg:158.14ms step:1188/1480 train_time:186297ms step_avg:158.15ms step:1189/1480 train_time:186467ms step_avg:158.16ms step:1190/1480 train_time:186635ms step_avg:158.16ms step:1191/1480 train_time:186805ms step_avg:158.18ms step:1192/1480 train_time:186971ms step_avg:158.18ms step:1193/1480 train_time:187138ms step_avg:158.19ms step:1194/1480 train_time:187306ms step_avg:158.20ms step:1195/1480 train_time:187481ms step_avg:158.21ms step:1196/1480 train_time:187665ms step_avg:158.23ms step:1197/1480 train_time:187838ms step_avg:158.25ms step:1198/1480 train_time:188021ms step_avg:158.27ms step:1199/1480 train_time:188191ms step_avg:158.28ms step:1200/1480 train_time:188360ms step_avg:158.29ms step:1201/1480 train_time:188527ms step_avg:158.29ms step:1202/1480 train_time:188709ms step_avg:158.31ms step:1203/1480 train_time:188885ms step_avg:158.33ms step:1204/1480 train_time:189060ms step_avg:158.34ms step:1205/1480 train_time:189229ms step_avg:158.35ms step:1206/1480 train_time:189395ms step_avg:158.36ms step:1207/1480 train_time:189564ms step_avg:158.37ms step:1208/1480 train_time:189731ms step_avg:158.37ms step:1209/1480 train_time:189905ms step_avg:158.39ms step:1210/1480 train_time:190081ms step_avg:158.40ms step:1211/1480 train_time:190255ms step_avg:158.41ms step:1212/1480 train_time:190427ms step_avg:158.43ms step:1213/1480 train_time:190600ms step_avg:158.44ms step:1214/1480 train_time:190778ms step_avg:158.45ms step:1215/1480 
train_time:190951ms step_avg:158.47ms step:1216/1480 train_time:191119ms step_avg:158.47ms step:1217/1480 train_time:191293ms step_avg:158.49ms step:1218/1480 train_time:191464ms step_avg:158.50ms step:1219/1480 train_time:191644ms step_avg:158.51ms step:1220/1480 train_time:191813ms step_avg:158.52ms step:1221/1480 train_time:191981ms step_avg:158.53ms step:1222/1480 train_time:192148ms step_avg:158.54ms step:1223/1480 train_time:192320ms step_avg:158.55ms step:1224/1480 train_time:192499ms step_avg:158.57ms step:1225/1480 train_time:192670ms step_avg:158.58ms step:1226/1480 train_time:192844ms step_avg:158.59ms step:1227/1480 train_time:193017ms step_avg:158.60ms step:1228/1480 train_time:193186ms step_avg:158.61ms step:1229/1480 train_time:193359ms step_avg:158.62ms step:1230/1480 train_time:193538ms step_avg:158.64ms step:1231/1480 train_time:193714ms step_avg:158.65ms step:1232/1480 train_time:193888ms step_avg:158.66ms step:1233/1480 train_time:194059ms step_avg:158.67ms step:1234/1480 train_time:194228ms step_avg:158.68ms step:1235/1480 train_time:194402ms step_avg:158.70ms step:1236/1480 train_time:194570ms step_avg:158.70ms step:1237/1480 train_time:194742ms step_avg:158.71ms step:1238/1480 train_time:194926ms step_avg:158.73ms step:1239/1480 train_time:195098ms step_avg:158.75ms step:1240/1480 train_time:195268ms step_avg:158.75ms step:1241/1480 train_time:195443ms step_avg:158.77ms step:1242/1480 train_time:195612ms step_avg:158.78ms step:1243/1480 train_time:195786ms step_avg:158.79ms step:1244/1480 train_time:195952ms step_avg:158.79ms step:1245/1480 train_time:196122ms step_avg:158.80ms step:1246/1480 train_time:196292ms step_avg:158.81ms step:1247/1480 train_time:196462ms step_avg:158.82ms step:1248/1480 train_time:196632ms step_avg:158.83ms step:1249/1480 train_time:196800ms step_avg:158.84ms step:1250/1480 train_time:196970ms step_avg:158.85ms step:1250/1480 val_loss:3.3365 train_time:197042ms step_avg:158.91ms step:1251/1480 train_time:197154ms 
step_avg:158.87ms step:1252/1480 train_time:197325ms step_avg:158.88ms step:1253/1480 train_time:197493ms step_avg:158.88ms step:1254/1480 train_time:197664ms step_avg:158.89ms step:1255/1480 train_time:197850ms step_avg:158.92ms step:1256/1480 train_time:198024ms step_avg:158.93ms step:1257/1480 train_time:198195ms step_avg:158.94ms step:1258/1480 train_time:198372ms step_avg:158.95ms step:1259/1480 train_time:198544ms step_avg:158.96ms step:1260/1480 train_time:198711ms step_avg:158.97ms step:1261/1480 train_time:198882ms step_avg:158.98ms step:1262/1480 train_time:199057ms step_avg:158.99ms step:1263/1480 train_time:199231ms step_avg:159.00ms step:1264/1480 train_time:199398ms step_avg:159.01ms step:1265/1480 train_time:199565ms step_avg:159.02ms step:1266/1480 train_time:199736ms step_avg:159.03ms step:1267/1480 train_time:199907ms step_avg:159.03ms step:1268/1480 train_time:200076ms step_avg:159.04ms step:1269/1480 train_time:200252ms step_avg:159.06ms step:1270/1480 train_time:200422ms step_avg:159.07ms step:1271/1480 train_time:200592ms step_avg:159.07ms step:1272/1480 train_time:200758ms step_avg:159.08ms step:1273/1480 train_time:200930ms step_avg:159.09ms step:1274/1480 train_time:201102ms step_avg:159.10ms step:1275/1480 train_time:201271ms step_avg:159.11ms step:1276/1480 train_time:201436ms step_avg:159.11ms step:1277/1480 train_time:201606ms step_avg:159.12ms step:1278/1480 train_time:201773ms step_avg:159.13ms step:1279/1480 train_time:201943ms step_avg:159.14ms step:1280/1480 train_time:202121ms step_avg:159.15ms step:1281/1480 train_time:202290ms step_avg:159.16ms step:1282/1480 train_time:202457ms step_avg:159.16ms step:1283/1480 train_time:202629ms step_avg:159.17ms step:1284/1480 train_time:202798ms step_avg:159.18ms step:1285/1480 train_time:202970ms step_avg:159.19ms step:1286/1480 train_time:203138ms step_avg:159.20ms step:1287/1480 train_time:203309ms step_avg:159.21ms step:1288/1480 train_time:203482ms step_avg:159.22ms step:1289/1480 
train_time:203667ms step_avg:159.24ms step:1290/1480 train_time:203846ms step_avg:159.25ms step:1291/1480 train_time:204018ms step_avg:159.26ms step:1292/1480 train_time:204193ms step_avg:159.28ms step:1293/1480 train_time:204369ms step_avg:159.29ms step:1294/1480 train_time:204539ms step_avg:159.30ms step:1295/1480 train_time:204711ms step_avg:159.31ms step:1296/1480 train_time:204884ms step_avg:159.32ms step:1297/1480 train_time:205055ms step_avg:159.33ms step:1298/1480 train_time:205226ms step_avg:159.34ms step:1299/1480 train_time:205397ms step_avg:159.35ms step:1300/1480 train_time:205564ms step_avg:159.35ms step:1301/1480 train_time:205733ms step_avg:159.36ms step:1302/1480 train_time:205907ms step_avg:159.37ms step:1303/1480 train_time:206081ms step_avg:159.38ms step:1304/1480 train_time:206255ms step_avg:159.39ms step:1305/1480 train_time:206423ms step_avg:159.40ms step:1306/1480 train_time:206597ms step_avg:159.41ms step:1307/1480 train_time:206766ms step_avg:159.42ms step:1308/1480 train_time:206935ms step_avg:159.43ms step:1309/1480 train_time:207107ms step_avg:159.44ms step:1310/1480 train_time:207276ms step_avg:159.44ms step:1311/1480 train_time:207444ms step_avg:159.45ms step:1312/1480 train_time:207616ms step_avg:159.46ms step:1313/1480 train_time:207787ms step_avg:159.47ms step:1314/1480 train_time:207960ms step_avg:159.48ms step:1315/1480 train_time:208131ms step_avg:159.49ms step:1316/1480 train_time:208297ms step_avg:159.49ms step:1317/1480 train_time:208469ms step_avg:159.50ms step:1318/1480 train_time:208648ms step_avg:159.52ms step:1319/1480 train_time:208824ms step_avg:159.53ms step:1320/1480 train_time:209001ms step_avg:159.54ms step:1321/1480 train_time:209174ms step_avg:159.55ms step:1322/1480 train_time:209352ms step_avg:159.57ms step:1323/1480 train_time:209526ms step_avg:159.58ms step:1324/1480 train_time:209700ms step_avg:159.59ms step:1325/1480 train_time:209881ms step_avg:159.61ms step:1326/1480 train_time:210057ms step_avg:159.62ms 
step:1327/1480 train_time:210228ms step_avg:159.63ms step:1328/1480 train_time:210397ms step_avg:159.63ms step:1329/1480 train_time:210592ms step_avg:159.66ms step:1330/1480 train_time:210773ms step_avg:159.68ms step:1331/1480 train_time:210944ms step_avg:159.68ms step:1332/1480 train_time:211117ms step_avg:159.70ms step:1333/1480 train_time:211292ms step_avg:159.71ms step:1334/1480 train_time:211464ms step_avg:159.72ms step:1335/1480 train_time:211633ms step_avg:159.72ms step:1336/1480 train_time:211817ms step_avg:159.74ms step:1337/1480 train_time:211994ms step_avg:159.75ms step:1338/1480 train_time:212165ms step_avg:159.76ms step:1339/1480 train_time:212338ms step_avg:159.77ms step:1340/1480 train_time:212510ms step_avg:159.78ms step:1341/1480 train_time:212679ms step_avg:159.79ms step:1342/1480 train_time:212852ms step_avg:159.80ms step:1343/1480 train_time:213023ms step_avg:159.81ms step:1344/1480 train_time:213196ms step_avg:159.82ms step:1345/1480 train_time:213374ms step_avg:159.83ms step:1346/1480 train_time:213542ms step_avg:159.84ms step:1347/1480 train_time:213711ms step_avg:159.84ms step:1348/1480 train_time:213879ms step_avg:159.85ms step:1349/1480 train_time:214050ms step_avg:159.86ms step:1350/1480 train_time:214225ms step_avg:159.87ms step:1351/1480 train_time:214397ms step_avg:159.88ms step:1352/1480 train_time:214567ms step_avg:159.89ms step:1353/1480 train_time:214743ms step_avg:159.90ms step:1354/1480 train_time:214913ms step_avg:159.91ms step:1355/1480 train_time:215081ms step_avg:159.91ms step:1356/1480 train_time:215253ms step_avg:159.92ms step:1357/1480 train_time:215427ms step_avg:159.93ms step:1358/1480 train_time:215598ms step_avg:159.94ms step:1359/1480 train_time:215771ms step_avg:159.95ms step:1360/1480 train_time:215947ms step_avg:159.96ms step:1361/1480 train_time:216123ms step_avg:159.97ms step:1362/1480 train_time:216298ms step_avg:159.98ms step:1363/1480 train_time:216478ms step_avg:160.00ms step:1364/1480 train_time:216649ms 
step_avg:160.01ms step:1365/1480 train_time:216816ms step_avg:160.01ms step:1366/1480 train_time:216989ms step_avg:160.02ms step:1367/1480 train_time:217161ms step_avg:160.03ms step:1368/1480 train_time:217334ms step_avg:160.04ms step:1369/1480 train_time:217516ms step_avg:160.06ms step:1370/1480 train_time:217693ms step_avg:160.07ms step:1371/1480 train_time:217865ms step_avg:160.08ms step:1372/1480 train_time:218040ms step_avg:160.09ms step:1373/1480 train_time:218210ms step_avg:160.10ms step:1374/1480 train_time:218384ms step_avg:160.11ms step:1375/1480 train_time:218555ms step_avg:160.11ms step:1375/1480 val_loss:3.2974 train_time:218622ms step_avg:160.16ms step:1376/1480 train_time:218728ms step_avg:160.12ms step:1377/1480 train_time:218898ms step_avg:160.13ms step:1378/1480 train_time:219067ms step_avg:160.14ms step:1379/1480 train_time:219243ms step_avg:160.15ms step:1380/1480 train_time:219416ms step_avg:160.16ms step:1381/1480 train_time:219595ms step_avg:160.17ms step:1382/1480 train_time:219766ms step_avg:160.18ms step:1383/1480 train_time:219939ms step_avg:160.19ms step:1384/1480 train_time:220115ms step_avg:160.20ms step:1385/1480 train_time:220281ms step_avg:160.20ms step:1386/1480 train_time:220451ms step_avg:160.21ms step:1387/1480 train_time:220622ms step_avg:160.22ms step:1388/1480 train_time:220790ms step_avg:160.22ms step:1389/1480 train_time:220962ms step_avg:160.23ms step:1390/1480 train_time:221130ms step_avg:160.24ms step:1391/1480 train_time:221304ms step_avg:160.25ms step:1392/1480 train_time:221477ms step_avg:160.26ms step:1393/1480 train_time:221648ms step_avg:160.27ms step:1394/1480 train_time:221819ms step_avg:160.27ms step:1395/1480 train_time:221988ms step_avg:160.28ms step:1396/1480 train_time:222156ms step_avg:160.29ms step:1397/1480 train_time:222324ms step_avg:160.29ms step:1398/1480 train_time:222491ms step_avg:160.30ms step:1399/1480 train_time:222660ms step_avg:160.30ms step:1400/1480 train_time:222837ms step_avg:160.31ms 
step:1401/1480 train_time:223003ms step_avg:160.32ms step:1402/1480 train_time:223175ms step_avg:160.33ms step:1403/1480 train_time:223351ms step_avg:160.34ms step:1404/1480 train_time:223522ms step_avg:160.35ms step:1405/1480 train_time:223695ms step_avg:160.36ms step:1406/1480 train_time:223871ms step_avg:160.37ms step:1407/1480 train_time:224039ms step_avg:160.37ms step:1408/1480 train_time:224209ms step_avg:160.38ms step:1409/1480 train_time:224390ms step_avg:160.39ms step:1410/1480 train_time:224560ms step_avg:160.40ms step:1411/1480 train_time:224728ms step_avg:160.41ms step:1412/1480 train_time:224897ms step_avg:160.41ms step:1413/1480 train_time:225067ms step_avg:160.42ms step:1414/1480 train_time:225239ms step_avg:160.43ms step:1415/1480 train_time:225413ms step_avg:160.44ms step:1416/1480 train_time:225599ms step_avg:160.45ms step:1417/1480 train_time:225773ms step_avg:160.46ms step:1418/1480 train_time:225944ms step_avg:160.47ms step:1419/1480 train_time:226118ms step_avg:160.48ms step:1420/1480 train_time:226293ms step_avg:160.49ms step:1421/1480 train_time:226466ms step_avg:160.50ms step:1422/1480 train_time:226638ms step_avg:160.51ms step:1423/1480 train_time:226808ms step_avg:160.52ms step:1424/1480 train_time:226986ms step_avg:160.53ms step:1425/1480 train_time:227168ms step_avg:160.54ms step:1426/1480 train_time:227341ms step_avg:160.55ms step:1427/1480 train_time:227516ms step_avg:160.56ms step:1428/1480 train_time:227688ms step_avg:160.57ms step:1429/1480 train_time:227856ms step_avg:160.57ms step:1430/1480 train_time:228029ms step_avg:160.58ms step:1431/1480 train_time:228202ms step_avg:160.59ms step:1432/1480 train_time:228379ms step_avg:160.60ms step:1433/1480 train_time:228557ms step_avg:160.62ms step:1434/1480 train_time:228737ms step_avg:160.63ms step:1435/1480 train_time:228912ms step_avg:160.64ms step:1436/1480 train_time:229086ms step_avg:160.65ms step:1437/1480 train_time:229257ms step_avg:160.66ms step:1438/1480 train_time:229426ms 
step_avg:160.66ms step:1439/1480 train_time:229601ms step_avg:160.67ms step:1440/1480 train_time:229770ms step_avg:160.68ms step:1441/1480 train_time:229941ms step_avg:160.69ms step:1442/1480 train_time:230118ms step_avg:160.70ms step:1443/1480 train_time:230307ms step_avg:160.72ms step:1444/1480 train_time:230479ms step_avg:160.72ms step:1445/1480 train_time:230649ms step_avg:160.73ms step:1446/1480 train_time:230825ms step_avg:160.74ms step:1447/1480 train_time:231003ms step_avg:160.75ms step:1448/1480 train_time:231174ms step_avg:160.76ms step:1449/1480 train_time:231348ms step_avg:160.77ms step:1450/1480 train_time:231522ms step_avg:160.78ms step:1451/1480 train_time:231691ms step_avg:160.78ms step:1452/1480 train_time:231865ms step_avg:160.79ms step:1453/1480 train_time:232034ms step_avg:160.80ms step:1454/1480 train_time:232206ms step_avg:160.81ms step:1455/1480 train_time:232386ms step_avg:160.82ms step:1456/1480 train_time:232559ms step_avg:160.83ms step:1457/1480 train_time:232729ms step_avg:160.84ms step:1458/1480 train_time:232900ms step_avg:160.84ms step:1459/1480 train_time:233077ms step_avg:160.85ms step:1460/1480 train_time:233248ms step_avg:160.86ms step:1461/1480 train_time:233423ms step_avg:160.87ms step:1462/1480 train_time:233593ms step_avg:160.88ms step:1463/1480 train_time:233770ms step_avg:160.89ms step:1464/1480 train_time:233945ms step_avg:160.90ms step:1465/1480 train_time:234117ms step_avg:160.91ms step:1466/1480 train_time:234287ms step_avg:160.91ms step:1467/1480 train_time:234461ms step_avg:160.92ms step:1468/1480 train_time:234629ms step_avg:160.93ms step:1469/1480 train_time:234803ms step_avg:160.93ms step:1470/1480 train_time:234986ms step_avg:160.95ms step:1471/1480 train_time:235170ms step_avg:160.97ms step:1472/1480 train_time:235351ms step_avg:160.98ms step:1473/1480 train_time:235522ms step_avg:160.99ms step:1474/1480 train_time:235699ms step_avg:161.00ms step:1475/1480 train_time:235879ms step_avg:161.01ms step:1476/1480 
train_time:236051ms step_avg:161.02ms step:1477/1480 train_time:236234ms step_avg:161.03ms step:1478/1480 train_time:236418ms step_avg:161.05ms step:1479/1480 train_time:236590ms step_avg:161.06ms step:1480/1480 train_time:236763ms step_avg:161.06ms step:1480/1480 val_loss:3.2787 train_time:236834ms step_avg:161.11ms