import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no weight is passed to F.rms_norm)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free nn.Linear whose weight is cast to the dtype of the input on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; caches the cos/sin tables of the last sequence length seen."""

    def __init__(self, dim, base=10000):
        super().__init__()
        # one inverse frequency per pair of channels
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as 4D: the sequence length is read from dim 1, channels are split on dim 3
        seq_len = x.shape[1]
        # the tables are rebuilt only when the sequence length changes
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # insert singleton axes at dims 0 and 2 so the tables broadcast against x
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention using FlexAttention, with QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """Attend over x; vi is a value embedding that is reshaped like v and mixed into it."""
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given the head axis before the sequence axis, hence the transposes
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: a learned mix of x and x0, then pre-norm attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to [1, 0], i.e. the layer input is passed through unchanged at the start
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) with tanh in GPT.forward

class GPT(nn.Module):
    """GPT-style language model with U-net skip connections; forward() returns the training loss."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # NOTE(review): with an odd n_layer the decoder has one more layer than the encoder, and
        # forward() would pop from an empty skip_connections list - confirm n_layer is always even.
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            # (one n_embd-wide slice per encoder layer; split with chunk() in forward)
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss of predicting target from idx.

        idx is a 1D tensor of token ids (asserted) whose length must be divisible by BLOCK_SIZE
        for the reshapes below to succeed; target is flattened before the loss is taken;
        sliding_window is the attention window size in tokens. Attention is causal, restricted
        to the same document and to the sliding window.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # running count of token 50256 (presumably the GPT-2 end-of-text token - TODO confirm),
        # used as a document id for every position
        docs = (idx == 50256).cumsum(0)
        # document id of the first / last token of every BLOCK_SIZE block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask handed to BlockMask as mask_mod
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the mask above: decides which kv blocks each q block visits
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can share a document only if their document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # the window is converted from tokens to blocks, rounding up
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort of the 0/1 mask: the selected kv blocks come first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            # add two leading singleton axes to both tensors
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            # pop() pairs the first decoder layer with the output of the last encoder layer
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read ntok uint16 tokens that follow the header of a shard into a pinned host tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header of 256 int32
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Walks the shards matching a glob pattern; each rank reads its own T-token slice of every batch."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full batch plus one extra token for the shifted targets
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # -1 so that advance() lands on shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # wraps around to the first shard after the last one
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) for this rank as CUDA tensors of T tokens each."""
        batch_size = self.T * self.num_processes
        # T + 1 tokens, because the targets are the inputs shifted by one position
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        # NOTE(review): current_position includes the per-rank offset process_rank * T, so this
        # check can trip on different calls for different ranks - confirm that ranks switching
        # shards one batch apart is acceptable.
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # the per-run directory is only needed for checkpoints, which are currently not written
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # make sure the directory holding the log file exists; open() below would otherwise
    # raise FileNotFoundError on a fresh checkout
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append the string s to the log file and, unless logonly is set, also print it.

    Only the master process logs; on every other rank this is a no-op.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not supported: exactly one sequence per device per step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 09:48:28 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 45C P0 121W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 45C P0 123W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 121W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23541ms step_avg:nanms step:2/1480 train_time:23631ms step_avg:nanms step:3/1480 train_time:23770ms step_avg:nanms step:4/1480 train_time:23911ms step_avg:nanms step:5/1480 train_time:24052ms step_avg:nanms step:6/1480 train_time:24193ms step_avg:nanms step:7/1480 train_time:24334ms step_avg:nanms step:8/1480 train_time:24476ms step_avg:nanms step:9/1480 train_time:24623ms step_avg:nanms step:10/1480 train_time:24767ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.62ms step:14/1480 train_time:566ms step_avg:141.45ms step:15/1480 train_time:707ms step_avg:141.33ms step:16/1480 train_time:850ms step_avg:141.73ms step:17/1480 train_time:994ms step_avg:141.97ms step:18/1480 train_time:1138ms step_avg:142.31ms step:19/1480 train_time:1282ms step_avg:142.40ms step:20/1480 train_time:1423ms step_avg:142.34ms step:21/1480 train_time:1565ms step_avg:142.27ms step:22/1480 train_time:1706ms step_avg:142.19ms step:23/1480 train_time:1849ms step_avg:142.21ms step:24/1480 train_time:1993ms step_avg:142.35ms step:25/1480 train_time:2138ms step_avg:142.50ms step:26/1480 train_time:2281ms step_avg:142.56ms step:27/1480 train_time:2424ms step_avg:142.57ms step:28/1480 train_time:2565ms step_avg:142.52ms step:29/1480 
train_time:2706ms step_avg:142.44ms step:30/1480 train_time:2850ms step_avg:142.52ms step:31/1480 train_time:2995ms step_avg:142.62ms step:32/1480 train_time:3140ms step_avg:142.71ms step:33/1480 train_time:3283ms step_avg:142.73ms step:34/1480 train_time:3424ms step_avg:142.68ms step:35/1480 train_time:3566ms step_avg:142.64ms step:36/1480 train_time:3707ms step_avg:142.58ms step:37/1480 train_time:3849ms step_avg:142.57ms step:38/1480 train_time:3992ms step_avg:142.56ms step:39/1480 train_time:4135ms step_avg:142.59ms step:40/1480 train_time:4278ms step_avg:142.61ms step:41/1480 train_time:4422ms step_avg:142.65ms step:42/1480 train_time:4564ms step_avg:142.63ms step:43/1480 train_time:4705ms step_avg:142.59ms step:44/1480 train_time:4847ms step_avg:142.55ms step:45/1480 train_time:4990ms step_avg:142.56ms step:46/1480 train_time:5134ms step_avg:142.62ms step:47/1480 train_time:5278ms step_avg:142.65ms step:48/1480 train_time:5421ms step_avg:142.65ms step:49/1480 train_time:5563ms step_avg:142.64ms step:50/1480 train_time:5704ms step_avg:142.59ms step:51/1480 train_time:5845ms step_avg:142.57ms step:52/1480 train_time:5986ms step_avg:142.51ms step:53/1480 train_time:6127ms step_avg:142.49ms step:54/1480 train_time:6269ms step_avg:142.47ms step:55/1480 train_time:6412ms step_avg:142.49ms step:56/1480 train_time:6554ms step_avg:142.48ms step:57/1480 train_time:6697ms step_avg:142.48ms step:58/1480 train_time:6839ms step_avg:142.48ms step:59/1480 train_time:6982ms step_avg:142.49ms step:60/1480 train_time:7124ms step_avg:142.48ms step:61/1480 train_time:7267ms step_avg:142.48ms step:62/1480 train_time:7410ms step_avg:142.50ms step:63/1480 train_time:7554ms step_avg:142.52ms step:64/1480 train_time:7697ms step_avg:142.53ms step:65/1480 train_time:7840ms step_avg:142.54ms step:66/1480 train_time:7982ms step_avg:142.53ms step:67/1480 train_time:8123ms step_avg:142.51ms step:68/1480 train_time:8265ms step_avg:142.50ms step:69/1480 train_time:8408ms step_avg:142.50ms 
step:70/1480 train_time:8550ms step_avg:142.49ms step:71/1480 train_time:8692ms step_avg:142.49ms step:72/1480 train_time:8834ms step_avg:142.49ms step:73/1480 train_time:8976ms step_avg:142.47ms step:74/1480 train_time:9119ms step_avg:142.48ms step:75/1480 train_time:9261ms step_avg:142.48ms step:76/1480 train_time:9403ms step_avg:142.47ms step:77/1480 train_time:9545ms step_avg:142.46ms step:78/1480 train_time:9687ms step_avg:142.45ms step:79/1480 train_time:9829ms step_avg:142.44ms step:80/1480 train_time:9969ms step_avg:142.42ms step:81/1480 train_time:10112ms step_avg:142.43ms step:82/1480 train_time:10255ms step_avg:142.43ms step:83/1480 train_time:10398ms step_avg:142.43ms step:84/1480 train_time:10541ms step_avg:142.45ms step:85/1480 train_time:10684ms step_avg:142.45ms step:86/1480 train_time:10826ms step_avg:142.45ms step:87/1480 train_time:10967ms step_avg:142.43ms step:88/1480 train_time:11109ms step_avg:142.42ms step:89/1480 train_time:11251ms step_avg:142.41ms step:90/1480 train_time:11394ms step_avg:142.43ms step:91/1480 train_time:11539ms step_avg:142.45ms step:92/1480 train_time:11682ms step_avg:142.46ms step:93/1480 train_time:11825ms step_avg:142.47ms step:94/1480 train_time:11966ms step_avg:142.45ms step:95/1480 train_time:12107ms step_avg:142.44ms step:96/1480 train_time:12250ms step_avg:142.45ms step:97/1480 train_time:12392ms step_avg:142.43ms step:98/1480 train_time:12534ms step_avg:142.43ms step:99/1480 train_time:12677ms step_avg:142.44ms step:100/1480 train_time:12821ms step_avg:142.46ms step:101/1480 train_time:12962ms step_avg:142.44ms step:102/1480 train_time:13103ms step_avg:142.43ms step:103/1480 train_time:13245ms step_avg:142.42ms step:104/1480 train_time:13387ms step_avg:142.42ms step:105/1480 train_time:13529ms step_avg:142.41ms step:106/1480 train_time:13672ms step_avg:142.42ms step:107/1480 train_time:13816ms step_avg:142.44ms step:108/1480 train_time:13960ms step_avg:142.45ms step:109/1480 train_time:14103ms step_avg:142.45ms 
step:110/1480 train_time:14245ms step_avg:142.45ms step:111/1480 train_time:14388ms step_avg:142.46ms step:112/1480 train_time:14535ms step_avg:142.50ms step:113/1480 train_time:14682ms step_avg:142.54ms step:114/1480 train_time:14828ms step_avg:142.58ms step:115/1480 train_time:14975ms step_avg:142.62ms step:116/1480 train_time:15123ms step_avg:142.67ms step:117/1480 train_time:15269ms step_avg:142.70ms step:118/1480 train_time:15417ms step_avg:142.75ms step:119/1480 train_time:15564ms step_avg:142.79ms step:120/1480 train_time:15709ms step_avg:142.81ms step:121/1480 train_time:15856ms step_avg:142.85ms step:122/1480 train_time:16004ms step_avg:142.89ms step:123/1480 train_time:16150ms step_avg:142.92ms step:124/1480 train_time:16296ms step_avg:142.95ms step:125/1480 train_time:16443ms step_avg:142.98ms step:125/1480 val_loss:4.4161 train_time:16500ms step_avg:143.47ms step:126/1480 train_time:16598ms step_avg:143.09ms step:127/1480 train_time:16747ms step_avg:143.14ms step:128/1480 train_time:16895ms step_avg:143.18ms step:129/1480 train_time:17040ms step_avg:143.20ms step:130/1480 train_time:17186ms step_avg:143.22ms step:131/1480 train_time:17333ms step_avg:143.25ms step:132/1480 train_time:17478ms step_avg:143.27ms step:133/1480 train_time:17626ms step_avg:143.30ms step:134/1480 train_time:17775ms step_avg:143.35ms step:135/1480 train_time:17921ms step_avg:143.37ms step:136/1480 train_time:18069ms step_avg:143.41ms step:137/1480 train_time:18215ms step_avg:143.43ms step:138/1480 train_time:18361ms step_avg:143.45ms step:139/1480 train_time:18507ms step_avg:143.47ms step:140/1480 train_time:18655ms step_avg:143.50ms step:141/1480 train_time:18801ms step_avg:143.52ms step:142/1480 train_time:18949ms step_avg:143.55ms step:143/1480 train_time:19096ms step_avg:143.58ms step:144/1480 train_time:19243ms step_avg:143.60ms step:145/1480 train_time:19390ms step_avg:143.63ms step:146/1480 train_time:19537ms step_avg:143.65ms step:147/1480 train_time:19682ms 
step_avg:143.67ms step:148/1480 train_time:19829ms step_avg:143.69ms step:149/1480 train_time:19977ms step_avg:143.72ms step:150/1480 train_time:20123ms step_avg:143.74ms step:151/1480 train_time:20271ms step_avg:143.77ms step:152/1480 train_time:20419ms step_avg:143.80ms step:153/1480 train_time:20564ms step_avg:143.81ms step:154/1480 train_time:20712ms step_avg:143.83ms step:155/1480 train_time:20859ms step_avg:143.85ms step:156/1480 train_time:21004ms step_avg:143.86ms step:157/1480 train_time:21152ms step_avg:143.89ms step:158/1480 train_time:21299ms step_avg:143.91ms step:159/1480 train_time:21445ms step_avg:143.93ms step:160/1480 train_time:21593ms step_avg:143.95ms step:161/1480 train_time:21740ms step_avg:143.97ms step:162/1480 train_time:21886ms step_avg:143.99ms step:163/1480 train_time:22034ms step_avg:144.01ms step:164/1480 train_time:22180ms step_avg:144.03ms step:165/1480 train_time:22327ms step_avg:144.04ms step:166/1480 train_time:22474ms step_avg:144.07ms step:167/1480 train_time:22619ms step_avg:144.07ms step:168/1480 train_time:22766ms step_avg:144.09ms step:169/1480 train_time:22913ms step_avg:144.11ms step:170/1480 train_time:23060ms step_avg:144.13ms step:171/1480 train_time:23205ms step_avg:144.13ms step:172/1480 train_time:23352ms step_avg:144.15ms step:173/1480 train_time:23499ms step_avg:144.17ms step:174/1480 train_time:23644ms step_avg:144.17ms step:175/1480 train_time:23792ms step_avg:144.19ms step:176/1480 train_time:23939ms step_avg:144.21ms step:177/1480 train_time:24086ms step_avg:144.23ms step:178/1480 train_time:24232ms step_avg:144.24ms step:179/1480 train_time:24379ms step_avg:144.26ms step:180/1480 train_time:24525ms step_avg:144.26ms step:181/1480 train_time:24673ms step_avg:144.29ms step:182/1480 train_time:24819ms step_avg:144.30ms step:183/1480 train_time:24967ms step_avg:144.32ms step:184/1480 train_time:25114ms step_avg:144.33ms step:185/1480 train_time:25260ms step_avg:144.34ms step:186/1480 train_time:25406ms 
step_avg:144.35ms step:187/1480 train_time:25555ms step_avg:144.38ms step:188/1480 train_time:25701ms step_avg:144.39ms step:189/1480 train_time:25850ms step_avg:144.41ms step:190/1480 train_time:25997ms step_avg:144.43ms step:191/1480 train_time:26144ms step_avg:144.44ms step:192/1480 train_time:26292ms step_avg:144.46ms step:193/1480 train_time:26438ms step_avg:144.47ms step:194/1480 train_time:26584ms step_avg:144.48ms step:195/1480 train_time:26732ms step_avg:144.50ms step:196/1480 train_time:26878ms step_avg:144.51ms step:197/1480 train_time:27024ms step_avg:144.51ms step:198/1480 train_time:27172ms step_avg:144.53ms step:199/1480 train_time:27319ms step_avg:144.54ms step:200/1480 train_time:27465ms step_avg:144.55ms step:201/1480 train_time:27611ms step_avg:144.56ms step:202/1480 train_time:27758ms step_avg:144.57ms step:203/1480 train_time:27904ms step_avg:144.58ms step:204/1480 train_time:28051ms step_avg:144.59ms step:205/1480 train_time:28198ms step_avg:144.61ms step:206/1480 train_time:28344ms step_avg:144.61ms step:207/1480 train_time:28492ms step_avg:144.63ms step:208/1480 train_time:28638ms step_avg:144.64ms step:209/1480 train_time:28784ms step_avg:144.64ms step:210/1480 train_time:28930ms step_avg:144.65ms step:211/1480 train_time:29078ms step_avg:144.67ms step:212/1480 train_time:29224ms step_avg:144.67ms step:213/1480 train_time:29371ms step_avg:144.69ms step:214/1480 train_time:29519ms step_avg:144.70ms step:215/1480 train_time:29666ms step_avg:144.71ms step:216/1480 train_time:29813ms step_avg:144.72ms step:217/1480 train_time:29959ms step_avg:144.73ms step:218/1480 train_time:30105ms step_avg:144.73ms step:219/1480 train_time:30252ms step_avg:144.75ms step:220/1480 train_time:30399ms step_avg:144.76ms step:221/1480 train_time:30547ms step_avg:144.77ms step:222/1480 train_time:30698ms step_avg:144.80ms step:223/1480 train_time:30849ms step_avg:144.83ms step:224/1480 train_time:31000ms step_avg:144.86ms step:225/1480 train_time:31150ms 
step_avg:144.88ms step:226/1480 train_time:31301ms step_avg:144.91ms step:227/1480 train_time:31451ms step_avg:144.94ms step:228/1480 train_time:31601ms step_avg:144.96ms step:229/1480 train_time:31753ms step_avg:144.99ms step:230/1480 train_time:31901ms step_avg:145.01ms step:231/1480 train_time:32051ms step_avg:145.03ms step:232/1480 train_time:32201ms step_avg:145.05ms step:233/1480 train_time:32352ms step_avg:145.08ms step:234/1480 train_time:32503ms step_avg:145.10ms step:235/1480 train_time:32655ms step_avg:145.13ms step:236/1480 train_time:32804ms step_avg:145.15ms step:237/1480 train_time:32955ms step_avg:145.18ms step:238/1480 train_time:33106ms step_avg:145.20ms step:239/1480 train_time:33257ms step_avg:145.23ms step:240/1480 train_time:33406ms step_avg:145.24ms step:241/1480 train_time:33558ms step_avg:145.27ms step:242/1480 train_time:33708ms step_avg:145.29ms step:243/1480 train_time:33858ms step_avg:145.31ms step:244/1480 train_time:34008ms step_avg:145.33ms step:245/1480 train_time:34159ms step_avg:145.36ms step:246/1480 train_time:34308ms step_avg:145.37ms step:247/1480 train_time:34459ms step_avg:145.40ms step:248/1480 train_time:34610ms step_avg:145.42ms step:249/1480 train_time:34760ms step_avg:145.44ms step:250/1480 train_time:34909ms step_avg:145.46ms step:250/1480 val_loss:3.9930 train_time:34968ms step_avg:145.70ms step:251/1480 train_time:35066ms step_avg:145.50ms step:252/1480 train_time:35216ms step_avg:145.52ms step:253/1480 train_time:35367ms step_avg:145.54ms step:254/1480 train_time:35515ms step_avg:145.56ms step:255/1480 train_time:35665ms step_avg:145.57ms step:256/1480 train_time:35814ms step_avg:145.58ms step:257/1480 train_time:35964ms step_avg:145.60ms step:258/1480 train_time:36116ms step_avg:145.63ms step:259/1480 train_time:36267ms step_avg:145.65ms step:260/1480 train_time:36418ms step_avg:145.67ms step:261/1480 train_time:36569ms step_avg:145.69ms step:262/1480 train_time:36717ms step_avg:145.70ms step:263/1480 
train_time:36868ms step_avg:145.72ms step:264/1480 train_time:37017ms step_avg:145.74ms step:265/1480 train_time:37169ms step_avg:145.76ms step:266/1480 train_time:37318ms step_avg:145.77ms step:267/1480 train_time:37469ms step_avg:145.79ms step:268/1480 train_time:37618ms step_avg:145.81ms step:269/1480 train_time:37769ms step_avg:145.83ms step:270/1480 train_time:37918ms step_avg:145.84ms step:271/1480 train_time:38070ms step_avg:145.86ms step:272/1480 train_time:38220ms step_avg:145.88ms step:273/1480 train_time:38370ms step_avg:145.89ms step:274/1480 train_time:38520ms step_avg:145.91ms step:275/1480 train_time:38672ms step_avg:145.93ms step:276/1480 train_time:38823ms step_avg:145.95ms step:277/1480 train_time:38973ms step_avg:145.97ms step:278/1480 train_time:39124ms step_avg:145.98ms step:279/1480 train_time:39275ms step_avg:146.00ms step:280/1480 train_time:39426ms step_avg:146.02ms step:281/1480 train_time:39577ms step_avg:146.04ms step:282/1480 train_time:39728ms step_avg:146.06ms step:283/1480 train_time:39878ms step_avg:146.07ms step:284/1480 train_time:40029ms step_avg:146.09ms step:285/1480 train_time:40179ms step_avg:146.10ms step:286/1480 train_time:40331ms step_avg:146.13ms step:287/1480 train_time:40481ms step_avg:146.14ms step:288/1480 train_time:40631ms step_avg:146.15ms step:289/1480 train_time:40780ms step_avg:146.17ms step:290/1480 train_time:40931ms step_avg:146.18ms step:291/1480 train_time:41081ms step_avg:146.20ms step:292/1480 train_time:41231ms step_avg:146.21ms step:293/1480 train_time:41381ms step_avg:146.22ms step:294/1480 train_time:41532ms step_avg:146.24ms step:295/1480 train_time:41681ms step_avg:146.25ms step:296/1480 train_time:41832ms step_avg:146.27ms step:297/1480 train_time:41982ms step_avg:146.28ms step:298/1480 train_time:42132ms step_avg:146.29ms step:299/1480 train_time:42281ms step_avg:146.30ms step:300/1480 train_time:42432ms step_avg:146.32ms step:301/1480 train_time:42583ms step_avg:146.33ms step:302/1480 
train_time:42732ms step_avg:146.34ms step:303/1480 train_time:42883ms step_avg:146.36ms step:304/1480 train_time:43034ms step_avg:146.37ms step:305/1480 train_time:43185ms step_avg:146.39ms step:306/1480 train_time:43334ms step_avg:146.40ms step:307/1480 train_time:43485ms step_avg:146.42ms step:308/1480 train_time:43635ms step_avg:146.43ms step:309/1480 train_time:43785ms step_avg:146.44ms step:310/1480 train_time:43934ms step_avg:146.45ms step:311/1480 train_time:44086ms step_avg:146.46ms step:312/1480 train_time:44236ms step_avg:146.48ms step:313/1480 train_time:44387ms step_avg:146.49ms step:314/1480 train_time:44536ms step_avg:146.50ms step:315/1480 train_time:44687ms step_avg:146.52ms step:316/1480 train_time:44837ms step_avg:146.53ms step:317/1480 train_time:44987ms step_avg:146.54ms step:318/1480 train_time:45137ms step_avg:146.55ms step:319/1480 train_time:45288ms step_avg:146.56ms step:320/1480 train_time:45438ms step_avg:146.57ms step:321/1480 train_time:45588ms step_avg:146.59ms step:322/1480 train_time:45738ms step_avg:146.60ms step:323/1480 train_time:45888ms step_avg:146.61ms step:324/1480 train_time:46038ms step_avg:146.62ms step:325/1480 train_time:46189ms step_avg:146.63ms step:326/1480 train_time:46340ms step_avg:146.64ms step:327/1480 train_time:46491ms step_avg:146.66ms step:328/1480 train_time:46640ms step_avg:146.67ms step:329/1480 train_time:46792ms step_avg:146.68ms step:330/1480 train_time:46944ms step_avg:146.70ms step:331/1480 train_time:47098ms step_avg:146.72ms step:332/1480 train_time:47250ms step_avg:146.74ms step:333/1480 train_time:47405ms step_avg:146.76ms step:334/1480 train_time:47559ms step_avg:146.79ms step:335/1480 train_time:47712ms step_avg:146.81ms step:336/1480 train_time:47866ms step_avg:146.83ms step:337/1480 train_time:48019ms step_avg:146.85ms step:338/1480 train_time:48174ms step_avg:146.87ms step:339/1480 train_time:48327ms step_avg:146.89ms step:340/1480 train_time:48482ms step_avg:146.92ms step:341/1480 
train_time:48634ms step_avg:146.93ms step:342/1480 train_time:48788ms step_avg:146.95ms step:343/1480 train_time:48942ms step_avg:146.97ms step:344/1480 train_time:49095ms step_avg:146.99ms step:345/1480 train_time:49248ms step_avg:147.01ms step:346/1480 train_time:49404ms step_avg:147.03ms step:347/1480 train_time:49558ms step_avg:147.06ms step:348/1480 train_time:49711ms step_avg:147.07ms step:349/1480 train_time:49865ms step_avg:147.09ms step:350/1480 train_time:50018ms step_avg:147.11ms step:351/1480 train_time:50172ms step_avg:147.13ms step:352/1480 train_time:50326ms step_avg:147.15ms step:353/1480 train_time:50482ms step_avg:147.18ms step:354/1480 train_time:50635ms step_avg:147.19ms step:355/1480 train_time:50788ms step_avg:147.21ms step:356/1480 train_time:50942ms step_avg:147.23ms step:357/1480 train_time:51097ms step_avg:147.25ms step:358/1480 train_time:51250ms step_avg:147.27ms step:359/1480 train_time:51406ms step_avg:147.29ms step:360/1480 train_time:51560ms step_avg:147.31ms step:361/1480 train_time:51713ms step_avg:147.33ms step:362/1480 train_time:51866ms step_avg:147.35ms step:363/1480 train_time:52019ms step_avg:147.36ms step:364/1480 train_time:52172ms step_avg:147.38ms step:365/1480 train_time:52326ms step_avg:147.40ms step:366/1480 train_time:52479ms step_avg:147.41ms step:367/1480 train_time:52633ms step_avg:147.43ms step:368/1480 train_time:52787ms step_avg:147.45ms step:369/1480 train_time:52940ms step_avg:147.47ms step:370/1480 train_time:53094ms step_avg:147.48ms step:371/1480 train_time:53247ms step_avg:147.50ms step:372/1480 train_time:53400ms step_avg:147.51ms step:373/1480 train_time:53553ms step_avg:147.53ms step:374/1480 train_time:53708ms step_avg:147.55ms step:375/1480 train_time:53860ms step_avg:147.56ms step:375/1480 val_loss:3.8071 train_time:53922ms step_avg:147.73ms step:376/1480 train_time:54023ms step_avg:147.60ms step:377/1480 train_time:54179ms step_avg:147.63ms step:378/1480 train_time:54331ms step_avg:147.64ms 
step:379/1480 train_time:54484ms step_avg:147.65ms step:380/1480 train_time:54636ms step_avg:147.66ms step:381/1480 train_time:54788ms step_avg:147.68ms step:382/1480 train_time:54941ms step_avg:147.69ms step:383/1480 train_time:55097ms step_avg:147.71ms step:384/1480 train_time:55252ms step_avg:147.73ms step:385/1480 train_time:55405ms step_avg:147.75ms step:386/1480 train_time:55559ms step_avg:147.76ms step:387/1480 train_time:55713ms step_avg:147.78ms step:388/1480 train_time:55866ms step_avg:147.79ms step:389/1480 train_time:56018ms step_avg:147.81ms step:390/1480 train_time:56171ms step_avg:147.82ms step:391/1480 train_time:56326ms step_avg:147.84ms step:392/1480 train_time:56479ms step_avg:147.85ms step:393/1480 train_time:56634ms step_avg:147.87ms step:394/1480 train_time:56789ms step_avg:147.89ms step:395/1480 train_time:56941ms step_avg:147.90ms step:396/1480 train_time:57096ms step_avg:147.92ms step:397/1480 train_time:57248ms step_avg:147.93ms step:398/1480 train_time:57404ms step_avg:147.95ms step:399/1480 train_time:57558ms step_avg:147.96ms step:400/1480 train_time:57711ms step_avg:147.98ms step:401/1480 train_time:57864ms step_avg:147.99ms step:402/1480 train_time:58016ms step_avg:148.00ms step:403/1480 train_time:58171ms step_avg:148.02ms step:404/1480 train_time:58325ms step_avg:148.03ms step:405/1480 train_time:58479ms step_avg:148.05ms step:406/1480 train_time:58633ms step_avg:148.06ms step:407/1480 train_time:58787ms step_avg:148.08ms step:408/1480 train_time:58941ms step_avg:148.09ms step:409/1480 train_time:59095ms step_avg:148.11ms step:410/1480 train_time:59247ms step_avg:148.12ms step:411/1480 train_time:59402ms step_avg:148.13ms step:412/1480 train_time:59554ms step_avg:148.14ms step:413/1480 train_time:59708ms step_avg:148.16ms step:414/1480 train_time:59862ms step_avg:148.17ms step:415/1480 train_time:60019ms step_avg:148.19ms step:416/1480 train_time:60171ms step_avg:148.20ms step:417/1480 train_time:60324ms step_avg:148.22ms 
step:418/1480 train_time:60479ms step_avg:148.23ms step:419/1480 train_time:60631ms step_avg:148.24ms step:420/1480 train_time:60784ms step_avg:148.25ms step:421/1480 train_time:60939ms step_avg:148.27ms step:422/1480 train_time:61091ms step_avg:148.28ms step:423/1480 train_time:61246ms step_avg:148.30ms step:424/1480 train_time:61400ms step_avg:148.31ms step:425/1480 train_time:61555ms step_avg:148.32ms step:426/1480 train_time:61709ms step_avg:148.34ms step:427/1480 train_time:61864ms step_avg:148.36ms step:428/1480 train_time:62019ms step_avg:148.37ms step:429/1480 train_time:62172ms step_avg:148.38ms step:430/1480 train_time:62325ms step_avg:148.39ms step:431/1480 train_time:62478ms step_avg:148.40ms step:432/1480 train_time:62630ms step_avg:148.41ms step:433/1480 train_time:62785ms step_avg:148.43ms step:434/1480 train_time:62939ms step_avg:148.44ms step:435/1480 train_time:63092ms step_avg:148.45ms step:436/1480 train_time:63246ms step_avg:148.46ms step:437/1480 train_time:63400ms step_avg:148.48ms step:438/1480 train_time:63552ms step_avg:148.49ms step:439/1480 train_time:63706ms step_avg:148.50ms step:440/1480 train_time:63861ms step_avg:148.51ms step:441/1480 train_time:64018ms step_avg:148.53ms step:442/1480 train_time:64176ms step_avg:148.56ms step:443/1480 train_time:64332ms step_avg:148.57ms step:444/1480 train_time:64488ms step_avg:148.59ms step:445/1480 train_time:64644ms step_avg:148.61ms step:446/1480 train_time:64801ms step_avg:148.63ms step:447/1480 train_time:64958ms step_avg:148.65ms step:448/1480 train_time:65114ms step_avg:148.66ms step:449/1480 train_time:65271ms step_avg:148.68ms step:450/1480 train_time:65428ms step_avg:148.70ms step:451/1480 train_time:65586ms step_avg:148.72ms step:452/1480 train_time:65741ms step_avg:148.74ms step:453/1480 train_time:65898ms step_avg:148.75ms step:454/1480 train_time:66053ms step_avg:148.77ms step:455/1480 train_time:66209ms step_avg:148.78ms step:456/1480 train_time:66367ms step_avg:148.80ms 
step:457/1480 train_time:66522ms step_avg:148.82ms step:458/1480 train_time:66678ms step_avg:148.84ms step:459/1480 train_time:66836ms step_avg:148.85ms step:460/1480 train_time:66991ms step_avg:148.87ms step:461/1480 train_time:67147ms step_avg:148.89ms step:462/1480 train_time:67304ms step_avg:148.90ms step:463/1480 train_time:67463ms step_avg:148.92ms step:464/1480 train_time:67620ms step_avg:148.94ms step:465/1480 train_time:67778ms step_avg:148.96ms step:466/1480 train_time:67935ms step_avg:148.98ms step:467/1480 train_time:68092ms step_avg:149.00ms step:468/1480 train_time:68247ms step_avg:149.01ms step:469/1480 train_time:68404ms step_avg:149.03ms step:470/1480 train_time:68562ms step_avg:149.05ms step:471/1480 train_time:68718ms step_avg:149.06ms step:472/1480 train_time:68875ms step_avg:149.08ms step:473/1480 train_time:69031ms step_avg:149.09ms step:474/1480 train_time:69186ms step_avg:149.11ms step:475/1480 train_time:69343ms step_avg:149.12ms step:476/1480 train_time:69501ms step_avg:149.14ms step:477/1480 train_time:69658ms step_avg:149.16ms step:478/1480 train_time:69815ms step_avg:149.18ms step:479/1480 train_time:69971ms step_avg:149.19ms step:480/1480 train_time:70129ms step_avg:149.21ms step:481/1480 train_time:70286ms step_avg:149.23ms step:482/1480 train_time:70443ms step_avg:149.24ms step:483/1480 train_time:70601ms step_avg:149.26ms step:484/1480 train_time:70759ms step_avg:149.28ms step:485/1480 train_time:70916ms step_avg:149.30ms step:486/1480 train_time:71073ms step_avg:149.31ms step:487/1480 train_time:71229ms step_avg:149.33ms step:488/1480 train_time:71387ms step_avg:149.35ms step:489/1480 train_time:71544ms step_avg:149.36ms step:490/1480 train_time:71702ms step_avg:149.38ms step:491/1480 train_time:71859ms step_avg:149.39ms step:492/1480 train_time:72015ms step_avg:149.41ms step:493/1480 train_time:72171ms step_avg:149.42ms step:494/1480 train_time:72329ms step_avg:149.44ms step:495/1480 train_time:72487ms step_avg:149.46ms 
step:496/1480 train_time:72643ms step_avg:149.47ms step:497/1480 train_time:72800ms step_avg:149.49ms step:498/1480 train_time:72958ms step_avg:149.51ms step:499/1480 train_time:73116ms step_avg:149.52ms step:500/1480 train_time:73272ms step_avg:149.53ms step:500/1480 val_loss:3.6853 train_time:73333ms step_avg:149.66ms step:501/1480 train_time:73431ms step_avg:149.55ms step:502/1480 train_time:73589ms step_avg:149.57ms step:503/1480 train_time:73746ms step_avg:149.59ms step:504/1480 train_time:73902ms step_avg:149.60ms step:505/1480 train_time:74057ms step_avg:149.61ms step:506/1480 train_time:74213ms step_avg:149.62ms step:507/1480 train_time:74371ms step_avg:149.64ms step:508/1480 train_time:74529ms step_avg:149.66ms step:509/1480 train_time:74686ms step_avg:149.67ms step:510/1480 train_time:74843ms step_avg:149.69ms step:511/1480 train_time:75000ms step_avg:149.70ms step:512/1480 train_time:75159ms step_avg:149.72ms step:513/1480 train_time:75315ms step_avg:149.73ms step:514/1480 train_time:75473ms step_avg:149.75ms step:515/1480 train_time:75630ms step_avg:149.76ms step:516/1480 train_time:75789ms step_avg:149.78ms step:517/1480 train_time:75947ms step_avg:149.80ms step:518/1480 train_time:76105ms step_avg:149.81ms step:519/1480 train_time:76263ms step_avg:149.83ms step:520/1480 train_time:76421ms step_avg:149.84ms step:521/1480 train_time:76576ms step_avg:149.86ms step:522/1480 train_time:76732ms step_avg:149.87ms step:523/1480 train_time:76889ms step_avg:149.88ms step:524/1480 train_time:77047ms step_avg:149.90ms step:525/1480 train_time:77204ms step_avg:149.91ms step:526/1480 train_time:77363ms step_avg:149.93ms step:527/1480 train_time:77521ms step_avg:149.94ms step:528/1480 train_time:77678ms step_avg:149.96ms step:529/1480 train_time:77834ms step_avg:149.97ms step:530/1480 train_time:77990ms step_avg:149.98ms step:531/1480 train_time:78148ms step_avg:150.00ms step:532/1480 train_time:78308ms step_avg:150.01ms step:533/1480 train_time:78464ms 
step_avg:150.03ms step:534/1480 train_time:78621ms step_avg:150.04ms step:535/1480 train_time:78778ms step_avg:150.05ms step:536/1480 train_time:78935ms step_avg:150.07ms step:537/1480 train_time:79092ms step_avg:150.08ms step:538/1480 train_time:79249ms step_avg:150.09ms step:539/1480 train_time:79407ms step_avg:150.11ms step:540/1480 train_time:79564ms step_avg:150.12ms step:541/1480 train_time:79720ms step_avg:150.13ms step:542/1480 train_time:79875ms step_avg:150.14ms step:543/1480 train_time:80030ms step_avg:150.15ms step:544/1480 train_time:80189ms step_avg:150.17ms step:545/1480 train_time:80346ms step_avg:150.18ms step:546/1480 train_time:80502ms step_avg:150.19ms step:547/1480 train_time:80659ms step_avg:150.20ms step:548/1480 train_time:80816ms step_avg:150.22ms step:549/1480 train_time:80972ms step_avg:150.23ms step:550/1480 train_time:81130ms step_avg:150.24ms step:551/1480 train_time:81289ms step_avg:150.26ms step:552/1480 train_time:81449ms step_avg:150.28ms step:553/1480 train_time:81610ms step_avg:150.29ms step:554/1480 train_time:81769ms step_avg:150.31ms step:555/1480 train_time:81929ms step_avg:150.33ms step:556/1480 train_time:82089ms step_avg:150.35ms step:557/1480 train_time:82249ms step_avg:150.36ms step:558/1480 train_time:82409ms step_avg:150.38ms step:559/1480 train_time:82569ms step_avg:150.40ms step:560/1480 train_time:82730ms step_avg:150.42ms step:561/1480 train_time:82889ms step_avg:150.43ms step:562/1480 train_time:83050ms step_avg:150.45ms step:563/1480 train_time:83209ms step_avg:150.47ms step:564/1480 train_time:83369ms step_avg:150.49ms step:565/1480 train_time:83528ms step_avg:150.50ms step:566/1480 train_time:83689ms step_avg:150.52ms step:567/1480 train_time:83849ms step_avg:150.54ms step:568/1480 train_time:84008ms step_avg:150.55ms step:569/1480 train_time:84167ms step_avg:150.57ms step:570/1480 train_time:84327ms step_avg:150.58ms step:571/1480 train_time:84486ms step_avg:150.60ms step:572/1480 train_time:84645ms 
step_avg:150.61ms step:573/1480 train_time:84804ms step_avg:150.63ms step:574/1480 train_time:84965ms step_avg:150.65ms step:575/1480 train_time:85125ms step_avg:150.66ms step:576/1480 train_time:85286ms step_avg:150.68ms step:577/1480 train_time:85445ms step_avg:150.70ms step:578/1480 train_time:85604ms step_avg:150.71ms step:579/1480 train_time:85764ms step_avg:150.73ms step:580/1480 train_time:85923ms step_avg:150.74ms step:581/1480 train_time:86084ms step_avg:150.76ms step:582/1480 train_time:86244ms step_avg:150.78ms step:583/1480 train_time:86402ms step_avg:150.79ms step:584/1480 train_time:86560ms step_avg:150.80ms step:585/1480 train_time:86719ms step_avg:150.82ms step:586/1480 train_time:86877ms step_avg:150.83ms step:587/1480 train_time:87034ms step_avg:150.84ms step:588/1480 train_time:87193ms step_avg:150.85ms step:589/1480 train_time:87352ms step_avg:150.87ms step:590/1480 train_time:87511ms step_avg:150.88ms step:591/1480 train_time:87669ms step_avg:150.89ms step:592/1480 train_time:87829ms step_avg:150.91ms step:593/1480 train_time:87991ms step_avg:150.93ms step:594/1480 train_time:88152ms step_avg:150.94ms step:595/1480 train_time:88313ms step_avg:150.96ms step:596/1480 train_time:88473ms step_avg:150.98ms step:597/1480 train_time:88631ms step_avg:150.99ms step:598/1480 train_time:88790ms step_avg:151.00ms step:599/1480 train_time:88949ms step_avg:151.02ms step:600/1480 train_time:89109ms step_avg:151.03ms step:601/1480 train_time:89270ms step_avg:151.05ms step:602/1480 train_time:89430ms step_avg:151.06ms step:603/1480 train_time:89591ms step_avg:151.08ms step:604/1480 train_time:89750ms step_avg:151.09ms step:605/1480 train_time:89908ms step_avg:151.11ms step:606/1480 train_time:90070ms step_avg:151.12ms step:607/1480 train_time:90231ms step_avg:151.14ms step:608/1480 train_time:90391ms step_avg:151.16ms step:609/1480 train_time:90551ms step_avg:151.17ms step:610/1480 train_time:90710ms step_avg:151.18ms step:611/1480 train_time:90870ms 
step_avg:151.20ms step:612/1480 train_time:91030ms step_avg:151.21ms step:613/1480 train_time:91190ms step_avg:151.23ms step:614/1480 train_time:91350ms step_avg:151.24ms step:615/1480 train_time:91509ms step_avg:151.25ms step:616/1480 train_time:91669ms step_avg:151.27ms step:617/1480 train_time:91829ms step_avg:151.28ms step:618/1480 train_time:91989ms step_avg:151.30ms step:619/1480 train_time:92149ms step_avg:151.31ms step:620/1480 train_time:92309ms step_avg:151.33ms step:621/1480 train_time:92469ms step_avg:151.34ms step:622/1480 train_time:92629ms step_avg:151.35ms step:623/1480 train_time:92792ms step_avg:151.37ms step:624/1480 train_time:92951ms step_avg:151.39ms step:625/1480 train_time:93110ms step_avg:151.40ms step:625/1480 val_loss:3.6069 train_time:93174ms step_avg:151.50ms step:626/1480 train_time:93273ms step_avg:151.42ms step:627/1480 train_time:93434ms step_avg:151.43ms step:628/1480 train_time:93594ms step_avg:151.45ms step:629/1480 train_time:93751ms step_avg:151.46ms step:630/1480 train_time:93909ms step_avg:151.47ms step:631/1480 train_time:94067ms step_avg:151.48ms step:632/1480 train_time:94226ms step_avg:151.49ms step:633/1480 train_time:94385ms step_avg:151.50ms step:634/1480 train_time:94543ms step_avg:151.51ms step:635/1480 train_time:94701ms step_avg:151.52ms step:636/1480 train_time:94861ms step_avg:151.53ms step:637/1480 train_time:95021ms step_avg:151.55ms step:638/1480 train_time:95180ms step_avg:151.56ms step:639/1480 train_time:95340ms step_avg:151.57ms step:640/1480 train_time:95500ms step_avg:151.59ms step:641/1480 train_time:95660ms step_avg:151.60ms step:642/1480 train_time:95819ms step_avg:151.61ms step:643/1480 train_time:95979ms step_avg:151.63ms step:644/1480 train_time:96138ms step_avg:151.64ms step:645/1480 train_time:96297ms step_avg:151.65ms step:646/1480 train_time:96458ms step_avg:151.66ms step:647/1480 train_time:96618ms step_avg:151.68ms step:648/1480 train_time:96779ms step_avg:151.69ms step:649/1480 
train_time:96939ms step_avg:151.70ms step:650/1480 train_time:97099ms step_avg:151.72ms step:651/1480 train_time:97260ms step_avg:151.73ms step:652/1480 train_time:97420ms step_avg:151.74ms step:653/1480 train_time:97578ms step_avg:151.75ms step:654/1480 train_time:97739ms step_avg:151.77ms step:655/1480 train_time:97899ms step_avg:151.78ms step:656/1480 train_time:98059ms step_avg:151.79ms step:657/1480 train_time:98220ms step_avg:151.81ms step:658/1480 train_time:98379ms step_avg:151.82ms step:659/1480 train_time:98541ms step_avg:151.84ms step:660/1480 train_time:98701ms step_avg:151.85ms step:661/1480 train_time:98864ms step_avg:151.87ms step:662/1480 train_time:99024ms step_avg:151.88ms step:663/1480 train_time:99183ms step_avg:151.89ms step:664/1480 train_time:99346ms step_avg:151.91ms step:665/1480 train_time:99507ms step_avg:151.92ms step:666/1480 train_time:99667ms step_avg:151.93ms step:667/1480 train_time:99828ms step_avg:151.95ms step:668/1480 train_time:99990ms step_avg:151.96ms step:669/1480 train_time:100155ms step_avg:151.98ms step:670/1480 train_time:100315ms step_avg:151.99ms step:671/1480 train_time:100476ms step_avg:152.01ms step:672/1480 train_time:100640ms step_avg:152.02ms step:673/1480 train_time:100801ms step_avg:152.04ms step:674/1480 train_time:100963ms step_avg:152.05ms step:675/1480 train_time:101125ms step_avg:152.07ms step:676/1480 train_time:101286ms step_avg:152.08ms step:677/1480 train_time:101447ms step_avg:152.09ms step:678/1480 train_time:101608ms step_avg:152.11ms step:679/1480 train_time:101768ms step_avg:152.12ms step:680/1480 train_time:101932ms step_avg:152.14ms step:681/1480 train_time:102092ms step_avg:152.15ms step:682/1480 train_time:102255ms step_avg:152.17ms step:683/1480 train_time:102418ms step_avg:152.18ms step:684/1480 train_time:102579ms step_avg:152.19ms step:685/1480 train_time:102742ms step_avg:152.21ms step:686/1480 train_time:102902ms step_avg:152.22ms step:687/1480 train_time:103062ms step_avg:152.23ms 
step:688/1480 train_time:103224ms step_avg:152.25ms step:689/1480 train_time:103387ms step_avg:152.26ms step:690/1480 train_time:103550ms step_avg:152.28ms step:691/1480 train_time:103710ms step_avg:152.29ms step:692/1480 train_time:103871ms step_avg:152.30ms step:693/1480 train_time:104033ms step_avg:152.32ms step:694/1480 train_time:104195ms step_avg:152.33ms step:695/1480 train_time:104357ms step_avg:152.35ms step:696/1480 train_time:104519ms step_avg:152.36ms step:697/1480 train_time:104683ms step_avg:152.38ms step:698/1480 train_time:104844ms step_avg:152.39ms step:699/1480 train_time:105005ms step_avg:152.40ms step:700/1480 train_time:105166ms step_avg:152.41ms step:701/1480 train_time:105327ms step_avg:152.43ms step:702/1480 train_time:105488ms step_avg:152.44ms step:703/1480 train_time:105648ms step_avg:152.45ms step:704/1480 train_time:105807ms step_avg:152.46ms step:705/1480 train_time:105971ms step_avg:152.48ms step:706/1480 train_time:106137ms step_avg:152.50ms step:707/1480 train_time:106297ms step_avg:152.51ms step:708/1480 train_time:106458ms step_avg:152.52ms step:709/1480 train_time:106620ms step_avg:152.53ms step:710/1480 train_time:106781ms step_avg:152.54ms step:711/1480 train_time:106944ms step_avg:152.56ms step:712/1480 train_time:107108ms step_avg:152.58ms step:713/1480 train_time:107272ms step_avg:152.59ms step:714/1480 train_time:107433ms step_avg:152.60ms step:715/1480 train_time:107594ms step_avg:152.62ms step:716/1480 train_time:107754ms step_avg:152.63ms step:717/1480 train_time:107916ms step_avg:152.64ms step:718/1480 train_time:108077ms step_avg:152.65ms step:719/1480 train_time:108238ms step_avg:152.66ms step:720/1480 train_time:108401ms step_avg:152.68ms step:721/1480 train_time:108563ms step_avg:152.69ms step:722/1480 train_time:108724ms step_avg:152.70ms step:723/1480 train_time:108885ms step_avg:152.71ms step:724/1480 train_time:109046ms step_avg:152.73ms step:725/1480 train_time:109209ms step_avg:152.74ms step:726/1480 
train_time:109374ms step_avg:152.76ms step:727/1480 train_time:109538ms step_avg:152.77ms step:728/1480 train_time:109700ms step_avg:152.79ms step:729/1480 train_time:109862ms step_avg:152.80ms step:730/1480 train_time:110024ms step_avg:152.81ms step:731/1480 train_time:110185ms step_avg:152.82ms step:732/1480 train_time:110345ms step_avg:152.83ms step:733/1480 train_time:110506ms step_avg:152.84ms step:734/1480 train_time:110668ms step_avg:152.86ms step:735/1480 train_time:110828ms step_avg:152.87ms step:736/1480 train_time:110989ms step_avg:152.88ms step:737/1480 train_time:111148ms step_avg:152.89ms step:738/1480 train_time:111307ms step_avg:152.89ms step:739/1480 train_time:111467ms step_avg:152.90ms step:740/1480 train_time:111631ms step_avg:152.92ms step:741/1480 train_time:111794ms step_avg:152.93ms step:742/1480 train_time:111957ms step_avg:152.95ms step:743/1480 train_time:112120ms step_avg:152.96ms step:744/1480 train_time:112284ms step_avg:152.98ms step:745/1480 train_time:112448ms step_avg:152.99ms step:746/1480 train_time:112607ms step_avg:153.00ms step:747/1480 train_time:112770ms step_avg:153.01ms step:748/1480 train_time:112936ms step_avg:153.03ms step:749/1480 train_time:113101ms step_avg:153.05ms step:750/1480 train_time:113262ms step_avg:153.06ms step:750/1480 val_loss:3.5511 train_time:113326ms step_avg:153.14ms step:751/1480 train_time:113426ms step_avg:153.07ms step:752/1480 train_time:113588ms step_avg:153.08ms step:753/1480 train_time:113748ms step_avg:153.09ms step:754/1480 train_time:113908ms step_avg:153.10ms step:755/1480 train_time:114067ms step_avg:153.11ms step:756/1480 train_time:114229ms step_avg:153.12ms step:757/1480 train_time:114395ms step_avg:153.14ms step:758/1480 train_time:114557ms step_avg:153.15ms step:759/1480 train_time:114721ms step_avg:153.17ms step:760/1480 train_time:114883ms step_avg:153.18ms step:761/1480 train_time:115045ms step_avg:153.19ms step:762/1480 train_time:115206ms step_avg:153.20ms step:763/1480 
train_time:115367ms step_avg:153.21ms step:764/1480 train_time:115528ms step_avg:153.22ms step:765/1480 train_time:115688ms step_avg:153.23ms step:766/1480 train_time:115851ms step_avg:153.24ms step:767/1480 train_time:116012ms step_avg:153.25ms step:768/1480 train_time:116174ms step_avg:153.26ms step:769/1480 train_time:116339ms step_avg:153.28ms step:770/1480 train_time:116503ms step_avg:153.29ms step:771/1480 train_time:116666ms step_avg:153.31ms step:772/1480 train_time:116828ms step_avg:153.32ms step:773/1480 train_time:116990ms step_avg:153.33ms step:774/1480 train_time:117152ms step_avg:153.34ms step:775/1480 train_time:117313ms step_avg:153.35ms step:776/1480 train_time:117479ms step_avg:153.37ms step:777/1480 train_time:117645ms step_avg:153.38ms step:778/1480 train_time:117808ms step_avg:153.40ms step:779/1480 train_time:117970ms step_avg:153.41ms step:780/1480 train_time:118133ms step_avg:153.42ms step:781/1480 train_time:118297ms step_avg:153.43ms step:782/1480 train_time:118462ms step_avg:153.45ms step:783/1480 train_time:118624ms step_avg:153.46ms step:784/1480 train_time:118787ms step_avg:153.47ms step:785/1480 train_time:118948ms step_avg:153.48ms step:786/1480 train_time:119111ms step_avg:153.49ms step:787/1480 train_time:119274ms step_avg:153.51ms step:788/1480 train_time:119440ms step_avg:153.52ms step:789/1480 train_time:119603ms step_avg:153.53ms step:790/1480 train_time:119766ms step_avg:153.55ms step:791/1480 train_time:119932ms step_avg:153.56ms step:792/1480 train_time:120098ms step_avg:153.58ms step:793/1480 train_time:120262ms step_avg:153.59ms step:794/1480 train_time:120427ms step_avg:153.61ms step:795/1480 train_time:120591ms step_avg:153.62ms step:796/1480 train_time:120758ms step_avg:153.64ms step:797/1480 train_time:120924ms step_avg:153.65ms step:798/1480 train_time:121087ms step_avg:153.66ms step:799/1480 train_time:121254ms step_avg:153.68ms step:800/1480 train_time:121417ms step_avg:153.69ms step:801/1480 train_time:121580ms 
step_avg:153.70ms step:802/1480 train_time:121748ms step_avg:153.72ms step:803/1480 train_time:121910ms step_avg:153.73ms step:804/1480 train_time:122072ms step_avg:153.74ms step:805/1480 train_time:122237ms step_avg:153.76ms step:806/1480 train_time:122401ms step_avg:153.77ms step:807/1480 train_time:122562ms step_avg:153.78ms step:808/1480 train_time:122725ms step_avg:153.79ms step:809/1480 train_time:122887ms step_avg:153.80ms step:810/1480 train_time:123048ms step_avg:153.81ms step:811/1480 train_time:123210ms step_avg:153.82ms step:812/1480 train_time:123372ms step_avg:153.83ms step:813/1480 train_time:123532ms step_avg:153.84ms step:814/1480 train_time:123696ms step_avg:153.85ms step:815/1480 train_time:123858ms step_avg:153.86ms step:816/1480 train_time:124024ms step_avg:153.88ms step:817/1480 train_time:124186ms step_avg:153.89ms step:818/1480 train_time:124347ms step_avg:153.89ms step:819/1480 train_time:124511ms step_avg:153.91ms step:820/1480 train_time:124675ms step_avg:153.92ms step:821/1480 train_time:124837ms step_avg:153.93ms step:822/1480 train_time:125001ms step_avg:153.94ms step:823/1480 train_time:125164ms step_avg:153.95ms step:824/1480 train_time:125327ms step_avg:153.96ms step:825/1480 train_time:125492ms step_avg:153.98ms step:826/1480 train_time:125659ms step_avg:153.99ms step:827/1480 train_time:125824ms step_avg:154.01ms step:828/1480 train_time:125987ms step_avg:154.02ms step:829/1480 train_time:126151ms step_avg:154.03ms step:830/1480 train_time:126314ms step_avg:154.04ms step:831/1480 train_time:126479ms step_avg:154.06ms step:832/1480 train_time:126643ms step_avg:154.07ms step:833/1480 train_time:126808ms step_avg:154.08ms step:834/1480 train_time:126972ms step_avg:154.09ms step:835/1480 train_time:127134ms step_avg:154.10ms step:836/1480 train_time:127299ms step_avg:154.12ms step:837/1480 train_time:127463ms step_avg:154.13ms step:838/1480 train_time:127628ms step_avg:154.14ms step:839/1480 train_time:127789ms step_avg:154.15ms 
step:840/1480 train_time:127950ms step_avg:154.16ms step:841/1480 train_time:128111ms step_avg:154.16ms step:842/1480 train_time:128275ms step_avg:154.18ms step:843/1480 train_time:128437ms step_avg:154.19ms step:844/1480 train_time:128600ms step_avg:154.20ms step:845/1480 train_time:128763ms step_avg:154.21ms step:846/1480 train_time:128927ms step_avg:154.22ms step:847/1480 train_time:129092ms step_avg:154.23ms step:848/1480 train_time:129254ms step_avg:154.24ms step:849/1480 train_time:129417ms step_avg:154.25ms step:850/1480 train_time:129581ms step_avg:154.26ms step:851/1480 train_time:129745ms step_avg:154.27ms step:852/1480 train_time:129906ms step_avg:154.28ms step:853/1480 train_time:130067ms step_avg:154.29ms step:854/1480 train_time:130232ms step_avg:154.30ms step:855/1480 train_time:130397ms step_avg:154.32ms step:856/1480 train_time:130560ms step_avg:154.33ms step:857/1480 train_time:130727ms step_avg:154.34ms step:858/1480 train_time:130893ms step_avg:154.35ms step:859/1480 train_time:131058ms step_avg:154.37ms step:860/1480 train_time:131220ms step_avg:154.38ms step:861/1480 train_time:131386ms step_avg:154.39ms step:862/1480 train_time:131554ms step_avg:154.41ms step:863/1480 train_time:131724ms step_avg:154.42ms step:864/1480 train_time:131888ms step_avg:154.44ms step:865/1480 train_time:132048ms step_avg:154.44ms step:866/1480 train_time:132215ms step_avg:154.46ms step:867/1480 train_time:132379ms step_avg:154.47ms step:868/1480 train_time:132542ms step_avg:154.48ms step:869/1480 train_time:132705ms step_avg:154.49ms step:870/1480 train_time:132870ms step_avg:154.50ms step:871/1480 train_time:133032ms step_avg:154.51ms step:872/1480 train_time:133197ms step_avg:154.52ms step:873/1480 train_time:133359ms step_avg:154.53ms step:874/1480 train_time:133525ms step_avg:154.54ms step:875/1480 train_time:133690ms step_avg:154.55ms step:875/1480 val_loss:3.5057 train_time:133754ms step_avg:154.63ms step:876/1480 train_time:133855ms step_avg:154.57ms 
step:877/1480 train_time:134017ms step_avg:154.58ms step:878/1480 train_time:134180ms step_avg:154.59ms step:879/1480 train_time:134344ms step_avg:154.60ms step:880/1480 train_time:134507ms step_avg:154.61ms step:881/1480 train_time:134670ms step_avg:154.61ms step:882/1480 train_time:134835ms step_avg:154.63ms step:883/1480 train_time:135001ms step_avg:154.64ms step:884/1480 train_time:135169ms step_avg:154.66ms step:885/1480 train_time:135335ms step_avg:154.67ms step:886/1480 train_time:135501ms step_avg:154.68ms step:887/1480 train_time:135669ms step_avg:154.70ms step:888/1480 train_time:135841ms step_avg:154.72ms step:889/1480 train_time:136009ms step_avg:154.73ms step:890/1480 train_time:136172ms step_avg:154.74ms step:891/1480 train_time:136338ms step_avg:154.75ms step:892/1480 train_time:136503ms step_avg:154.77ms step:893/1480 train_time:136666ms step_avg:154.77ms step:894/1480 train_time:136833ms step_avg:154.79ms step:895/1480 train_time:136999ms step_avg:154.80ms step:896/1480 train_time:137163ms step_avg:154.81ms step:897/1480 train_time:137331ms step_avg:154.83ms step:898/1480 train_time:137497ms step_avg:154.84ms step:899/1480 train_time:137659ms step_avg:154.85ms step:900/1480 train_time:137822ms step_avg:154.86ms step:901/1480 train_time:137985ms step_avg:154.87ms step:902/1480 train_time:138151ms step_avg:154.88ms step:903/1480 train_time:138321ms step_avg:154.89ms step:904/1480 train_time:138486ms step_avg:154.91ms step:905/1480 train_time:138650ms step_avg:154.92ms step:906/1480 train_time:138815ms step_avg:154.93ms step:907/1480 train_time:138982ms step_avg:154.94ms step:908/1480 train_time:139146ms step_avg:154.95ms step:909/1480 train_time:139310ms step_avg:154.96ms step:910/1480 train_time:139479ms step_avg:154.98ms step:911/1480 train_time:139644ms step_avg:154.99ms step:912/1480 train_time:139812ms step_avg:155.00ms step:913/1480 train_time:139980ms step_avg:155.02ms step:914/1480 train_time:140148ms step_avg:155.03ms step:915/1480 
train_time:140316ms step_avg:155.05ms step:916/1480 train_time:140480ms step_avg:155.06ms step:917/1480 train_time:140644ms step_avg:155.07ms step:918/1480 train_time:140814ms step_avg:155.08ms step:919/1480 train_time:140982ms step_avg:155.10ms step:920/1480 train_time:141150ms step_avg:155.11ms step:921/1480 train_time:141316ms step_avg:155.12ms step:922/1480 train_time:141484ms step_avg:155.14ms step:923/1480 train_time:141648ms step_avg:155.15ms step:924/1480 train_time:141812ms step_avg:155.16ms step:925/1480 train_time:141977ms step_avg:155.17ms step:926/1480 train_time:142140ms step_avg:155.17ms step:927/1480 train_time:142304ms step_avg:155.18ms step:928/1480 train_time:142470ms step_avg:155.20ms step:929/1480 train_time:142635ms step_avg:155.21ms step:930/1480 train_time:142799ms step_avg:155.22ms step:931/1480 train_time:142962ms step_avg:155.22ms step:932/1480 train_time:143129ms step_avg:155.24ms step:933/1480 train_time:143297ms step_avg:155.25ms step:934/1480 train_time:143465ms step_avg:155.26ms step:935/1480 train_time:143636ms step_avg:155.28ms step:936/1480 train_time:143802ms step_avg:155.29ms step:937/1480 train_time:143971ms step_avg:155.31ms step:938/1480 train_time:144134ms step_avg:155.32ms step:939/1480 train_time:144302ms step_avg:155.33ms step:940/1480 train_time:144470ms step_avg:155.34ms step:941/1480 train_time:144634ms step_avg:155.35ms step:942/1480 train_time:144798ms step_avg:155.36ms step:943/1480 train_time:144969ms step_avg:155.38ms step:944/1480 train_time:145142ms step_avg:155.40ms step:945/1480 train_time:145307ms step_avg:155.41ms step:946/1480 train_time:145476ms step_avg:155.42ms step:947/1480 train_time:145643ms step_avg:155.44ms step:948/1480 train_time:145809ms step_avg:155.45ms step:949/1480 train_time:145975ms step_avg:155.46ms step:950/1480 train_time:146139ms step_avg:155.47ms step:951/1480 train_time:146309ms step_avg:155.48ms step:952/1480 train_time:146475ms step_avg:155.49ms step:953/1480 train_time:146643ms 
step_avg:155.51ms step:954/1480 train_time:146813ms step_avg:155.52ms step:955/1480 train_time:146976ms step_avg:155.53ms step:956/1480 train_time:147140ms step_avg:155.54ms step:957/1480 train_time:147310ms step_avg:155.55ms step:958/1480 train_time:147478ms step_avg:155.57ms step:959/1480 train_time:147642ms step_avg:155.58ms step:960/1480 train_time:147809ms step_avg:155.59ms step:961/1480 train_time:147976ms step_avg:155.60ms step:962/1480 train_time:148139ms step_avg:155.61ms step:963/1480 train_time:148304ms step_avg:155.62ms step:964/1480 train_time:148474ms step_avg:155.63ms step:965/1480 train_time:148639ms step_avg:155.64ms step:966/1480 train_time:148802ms step_avg:155.65ms step:967/1480 train_time:148967ms step_avg:155.66ms step:968/1480 train_time:149132ms step_avg:155.67ms step:969/1480 train_time:149297ms step_avg:155.68ms step:970/1480 train_time:149460ms step_avg:155.69ms step:971/1480 train_time:149624ms step_avg:155.70ms step:972/1480 train_time:149790ms step_avg:155.71ms step:973/1480 train_time:149955ms step_avg:155.72ms step:974/1480 train_time:150123ms step_avg:155.73ms step:975/1480 train_time:150288ms step_avg:155.74ms step:976/1480 train_time:150454ms step_avg:155.75ms step:977/1480 train_time:150617ms step_avg:155.76ms step:978/1480 train_time:150783ms step_avg:155.77ms step:979/1480 train_time:150950ms step_avg:155.78ms step:980/1480 train_time:151116ms step_avg:155.79ms step:981/1480 train_time:151285ms step_avg:155.80ms step:982/1480 train_time:151449ms step_avg:155.81ms step:983/1480 train_time:151615ms step_avg:155.82ms step:984/1480 train_time:151779ms step_avg:155.83ms step:985/1480 train_time:151948ms step_avg:155.84ms step:986/1480 train_time:152113ms step_avg:155.85ms step:987/1480 train_time:152277ms step_avg:155.86ms step:988/1480 train_time:152443ms step_avg:155.87ms step:989/1480 train_time:152608ms step_avg:155.88ms step:990/1480 train_time:152778ms step_avg:155.90ms step:991/1480 train_time:152946ms step_avg:155.91ms 
step:992/1480 train_time:153119ms step_avg:155.93ms step:993/1480 train_time:153296ms step_avg:155.95ms step:994/1480 train_time:153461ms step_avg:155.96ms step:995/1480 train_time:153624ms step_avg:155.96ms step:996/1480 train_time:153787ms step_avg:155.97ms step:997/1480 train_time:153953ms step_avg:155.98ms step:998/1480 train_time:154116ms step_avg:155.99ms step:999/1480 train_time:154283ms step_avg:156.00ms step:1000/1480 train_time:154452ms step_avg:156.01ms step:1000/1480 val_loss:3.4425 train_time:154521ms step_avg:156.08ms step:1001/1480 train_time:154623ms step_avg:156.03ms step:1002/1480 train_time:154790ms step_avg:156.04ms step:1003/1480 train_time:154960ms step_avg:156.05ms step:1004/1480 train_time:155128ms step_avg:156.06ms step:1005/1480 train_time:155296ms step_avg:156.08ms step:1006/1480 train_time:155464ms step_avg:156.09ms step:1007/1480 train_time:155630ms step_avg:156.10ms step:1008/1480 train_time:155798ms step_avg:156.11ms step:1009/1480 train_time:155972ms step_avg:156.13ms step:1010/1480 train_time:156137ms step_avg:156.14ms step:1011/1480 train_time:156304ms step_avg:156.15ms step:1012/1480 train_time:156469ms step_avg:156.16ms step:1013/1480 train_time:156639ms step_avg:156.17ms step:1014/1480 train_time:156807ms step_avg:156.18ms step:1015/1480 train_time:156975ms step_avg:156.19ms step:1016/1480 train_time:157144ms step_avg:156.21ms step:1017/1480 train_time:157316ms step_avg:156.22ms step:1018/1480 train_time:157482ms step_avg:156.23ms step:1019/1480 train_time:157650ms step_avg:156.24ms step:1020/1480 train_time:157818ms step_avg:156.26ms step:1021/1480 train_time:157982ms step_avg:156.26ms step:1022/1480 train_time:158150ms step_avg:156.28ms step:1023/1480 train_time:158316ms step_avg:156.28ms step:1024/1480 train_time:158482ms step_avg:156.29ms step:1025/1480 train_time:158652ms step_avg:156.31ms step:1026/1480 train_time:158817ms step_avg:156.32ms step:1027/1480 train_time:158984ms step_avg:156.33ms step:1028/1480 
train_time:159156ms step_avg:156.34ms step:1029/1480 train_time:159330ms step_avg:156.36ms step:1030/1480 train_time:159497ms step_avg:156.37ms step:1031/1480 train_time:159660ms step_avg:156.38ms step:1032/1480 train_time:159833ms step_avg:156.39ms step:1033/1480 train_time:159999ms step_avg:156.40ms step:1034/1480 train_time:160167ms step_avg:156.41ms step:1035/1480 train_time:160335ms step_avg:156.42ms step:1036/1480 train_time:160502ms step_avg:156.43ms step:1037/1480 train_time:160669ms step_avg:156.45ms step:1038/1480 train_time:160837ms step_avg:156.46ms step:1039/1480 train_time:161010ms step_avg:156.47ms step:1040/1480 train_time:161175ms step_avg:156.48ms step:1041/1480 train_time:161343ms step_avg:156.49ms step:1042/1480 train_time:161508ms step_avg:156.50ms step:1043/1480 train_time:161673ms step_avg:156.51ms step:1044/1480 train_time:161838ms step_avg:156.52ms step:1045/1480 train_time:162008ms step_avg:156.53ms step:1046/1480 train_time:162175ms step_avg:156.54ms step:1047/1480 train_time:162341ms step_avg:156.55ms step:1048/1480 train_time:162509ms step_avg:156.56ms step:1049/1480 train_time:162675ms step_avg:156.57ms step:1050/1480 train_time:162844ms step_avg:156.58ms step:1051/1480 train_time:163014ms step_avg:156.59ms step:1052/1480 train_time:163183ms step_avg:156.61ms step:1053/1480 train_time:163350ms step_avg:156.62ms step:1054/1480 train_time:163516ms step_avg:156.62ms step:1055/1480 train_time:163681ms step_avg:156.63ms step:1056/1480 train_time:163846ms step_avg:156.64ms step:1057/1480 train_time:164012ms step_avg:156.65ms step:1058/1480 train_time:164181ms step_avg:156.66ms step:1059/1480 train_time:164353ms step_avg:156.68ms step:1060/1480 train_time:164521ms step_avg:156.69ms step:1061/1480 train_time:164684ms step_avg:156.69ms step:1062/1480 train_time:164851ms step_avg:156.70ms step:1063/1480 train_time:165015ms step_avg:156.71ms step:1064/1480 train_time:165178ms step_avg:156.72ms step:1065/1480 train_time:165347ms step_avg:156.73ms 
step:1066/1480 train_time:165514ms step_avg:156.74ms step:1067/1480 train_time:165684ms step_avg:156.75ms step:1068/1480 train_time:165850ms step_avg:156.76ms step:1069/1480 train_time:166022ms step_avg:156.77ms step:1070/1480 train_time:166189ms step_avg:156.78ms step:1071/1480 train_time:166359ms step_avg:156.79ms step:1072/1480 train_time:166525ms step_avg:156.80ms step:1073/1480 train_time:166688ms step_avg:156.81ms step:1074/1480 train_time:166854ms step_avg:156.82ms step:1075/1480 train_time:167024ms step_avg:156.83ms step:1076/1480 train_time:167192ms step_avg:156.84ms step:1077/1480 train_time:167356ms step_avg:156.85ms step:1078/1480 train_time:167531ms step_avg:156.86ms step:1079/1480 train_time:167704ms step_avg:156.88ms step:1080/1480 train_time:167875ms step_avg:156.89ms step:1081/1480 train_time:168041ms step_avg:156.90ms step:1082/1480 train_time:168210ms step_avg:156.91ms step:1083/1480 train_time:168376ms step_avg:156.92ms step:1084/1480 train_time:168542ms step_avg:156.93ms step:1085/1480 train_time:168710ms step_avg:156.94ms step:1086/1480 train_time:168877ms step_avg:156.95ms step:1087/1480 train_time:169043ms step_avg:156.96ms step:1088/1480 train_time:169215ms step_avg:156.97ms step:1089/1480 train_time:169387ms step_avg:156.99ms step:1090/1480 train_time:169558ms step_avg:157.00ms step:1091/1480 train_time:169726ms step_avg:157.01ms step:1092/1480 train_time:169894ms step_avg:157.02ms step:1093/1480 train_time:170061ms step_avg:157.03ms step:1094/1480 train_time:170228ms step_avg:157.04ms step:1095/1480 train_time:170392ms step_avg:157.04ms step:1096/1480 train_time:170561ms step_avg:157.05ms step:1097/1480 train_time:170730ms step_avg:157.07ms step:1098/1480 train_time:170900ms step_avg:157.08ms step:1099/1480 train_time:171071ms step_avg:157.09ms step:1100/1480 train_time:171244ms step_avg:157.10ms step:1101/1480 train_time:171414ms step_avg:157.12ms step:1102/1480 train_time:171587ms step_avg:157.13ms step:1103/1480 train_time:171761ms 
step_avg:157.15ms step:1104/1480 train_time:171930ms step_avg:157.16ms step:1105/1480 train_time:172099ms step_avg:157.17ms step:1106/1480 train_time:172268ms step_avg:157.18ms step:1107/1480 train_time:172437ms step_avg:157.19ms step:1108/1480 train_time:172602ms step_avg:157.20ms step:1109/1480 train_time:172770ms step_avg:157.21ms step:1110/1480 train_time:172935ms step_avg:157.21ms step:1111/1480 train_time:173103ms step_avg:157.22ms step:1112/1480 train_time:173273ms step_avg:157.24ms step:1113/1480 train_time:173451ms step_avg:157.25ms step:1114/1480 train_time:173623ms step_avg:157.27ms step:1115/1480 train_time:173796ms step_avg:157.28ms step:1116/1480 train_time:173962ms step_avg:157.29ms step:1117/1480 train_time:174135ms step_avg:157.30ms step:1118/1480 train_time:174310ms step_avg:157.32ms step:1119/1480 train_time:174475ms step_avg:157.33ms step:1120/1480 train_time:174643ms step_avg:157.34ms step:1121/1480 train_time:174814ms step_avg:157.35ms step:1122/1480 train_time:174981ms step_avg:157.36ms step:1123/1480 train_time:175148ms step_avg:157.37ms step:1124/1480 train_time:175315ms step_avg:157.37ms step:1125/1480 train_time:175483ms step_avg:157.38ms step:1125/1480 val_loss:3.3865 train_time:175552ms step_avg:157.45ms step:1126/1480 train_time:175655ms step_avg:157.40ms step:1127/1480 train_time:175824ms step_avg:157.41ms step:1128/1480 train_time:175996ms step_avg:157.42ms step:1129/1480 train_time:176169ms step_avg:157.43ms step:1130/1480 train_time:176338ms step_avg:157.44ms step:1131/1480 train_time:176515ms step_avg:157.46ms step:1132/1480 train_time:176681ms step_avg:157.47ms step:1133/1480 train_time:176854ms step_avg:157.48ms step:1134/1480 train_time:177024ms step_avg:157.50ms step:1135/1480 train_time:177193ms step_avg:157.51ms step:1136/1480 train_time:177362ms step_avg:157.51ms step:1137/1480 train_time:177531ms step_avg:157.53ms step:1138/1480 train_time:177702ms step_avg:157.54ms step:1139/1480 train_time:177872ms step_avg:157.55ms 
step:1140/1480 train_time:178039ms step_avg:157.56ms step:1141/1480 train_time:178212ms step_avg:157.57ms step:1142/1480 train_time:178379ms step_avg:157.58ms step:1143/1480 train_time:178549ms step_avg:157.59ms step:1144/1480 train_time:178719ms step_avg:157.60ms step:1145/1480 train_time:178883ms step_avg:157.61ms step:1146/1480 train_time:179053ms step_avg:157.62ms step:1147/1480 train_time:179221ms step_avg:157.63ms step:1148/1480 train_time:179390ms step_avg:157.64ms step:1149/1480 train_time:179561ms step_avg:157.65ms step:1150/1480 train_time:179729ms step_avg:157.66ms step:1151/1480 train_time:179902ms step_avg:157.67ms step:1152/1480 train_time:180074ms step_avg:157.68ms step:1153/1480 train_time:180246ms step_avg:157.70ms step:1154/1480 train_time:180414ms step_avg:157.70ms step:1155/1480 train_time:180585ms step_avg:157.72ms step:1156/1480 train_time:180764ms step_avg:157.73ms step:1157/1480 train_time:180935ms step_avg:157.75ms step:1158/1480 train_time:181102ms step_avg:157.75ms step:1159/1480 train_time:181269ms step_avg:157.76ms step:1160/1480 train_time:181437ms step_avg:157.77ms step:1161/1480 train_time:181606ms step_avg:157.78ms step:1162/1480 train_time:181776ms step_avg:157.79ms step:1163/1480 train_time:181944ms step_avg:157.80ms step:1164/1480 train_time:182115ms step_avg:157.81ms step:1165/1480 train_time:182280ms step_avg:157.82ms step:1166/1480 train_time:182450ms step_avg:157.83ms step:1167/1480 train_time:182620ms step_avg:157.84ms step:1168/1480 train_time:182787ms step_avg:157.85ms step:1169/1480 train_time:182958ms step_avg:157.86ms step:1170/1480 train_time:183125ms step_avg:157.87ms step:1171/1480 train_time:183293ms step_avg:157.87ms step:1172/1480 train_time:183460ms step_avg:157.88ms step:1173/1480 train_time:183632ms step_avg:157.90ms step:1174/1480 train_time:183814ms step_avg:157.92ms step:1175/1480 train_time:183984ms step_avg:157.93ms step:1176/1480 train_time:184157ms step_avg:157.94ms step:1177/1480 train_time:184334ms 
step_avg:157.96ms step:1178/1480 train_time:184501ms step_avg:157.96ms step:1179/1480 train_time:184666ms step_avg:157.97ms step:1180/1480 train_time:184845ms step_avg:157.99ms step:1181/1480 train_time:185016ms step_avg:158.00ms step:1182/1480 train_time:185182ms step_avg:158.01ms step:1183/1480 train_time:185353ms step_avg:158.02ms step:1184/1480 train_time:185521ms step_avg:158.02ms step:1185/1480 train_time:185693ms step_avg:158.04ms step:1186/1480 train_time:185864ms step_avg:158.05ms step:1187/1480 train_time:186046ms step_avg:158.07ms step:1188/1480 train_time:186213ms step_avg:158.08ms step:1189/1480 train_time:186385ms step_avg:158.09ms step:1190/1480 train_time:186552ms step_avg:158.10ms step:1191/1480 train_time:186723ms step_avg:158.11ms step:1192/1480 train_time:186890ms step_avg:158.11ms step:1193/1480 train_time:187055ms step_avg:158.12ms step:1194/1480 train_time:187224ms step_avg:158.13ms step:1195/1480 train_time:187398ms step_avg:158.14ms step:1196/1480 train_time:187580ms step_avg:158.16ms step:1197/1480 train_time:187752ms step_avg:158.17ms step:1198/1480 train_time:187937ms step_avg:158.20ms step:1199/1480 train_time:188108ms step_avg:158.21ms step:1200/1480 train_time:188278ms step_avg:158.22ms step:1201/1480 train_time:188445ms step_avg:158.22ms step:1202/1480 train_time:188626ms step_avg:158.24ms step:1203/1480 train_time:188801ms step_avg:158.26ms step:1204/1480 train_time:188975ms step_avg:158.27ms step:1205/1480 train_time:189143ms step_avg:158.28ms step:1206/1480 train_time:189311ms step_avg:158.29ms step:1207/1480 train_time:189481ms step_avg:158.30ms step:1208/1480 train_time:189649ms step_avg:158.30ms step:1209/1480 train_time:189822ms step_avg:158.32ms step:1210/1480 train_time:189999ms step_avg:158.33ms step:1211/1480 train_time:190172ms step_avg:158.34ms step:1212/1480 train_time:190343ms step_avg:158.36ms step:1213/1480 train_time:190517ms step_avg:158.37ms step:1214/1480 train_time:190695ms step_avg:158.38ms step:1215/1480 
train_time:190871ms step_avg:158.40ms step:1216/1480 train_time:191041ms step_avg:158.41ms step:1217/1480 train_time:191215ms step_avg:158.42ms step:1218/1480 train_time:191384ms step_avg:158.43ms step:1219/1480 train_time:191564ms step_avg:158.45ms step:1220/1480 train_time:191737ms step_avg:158.46ms step:1221/1480 train_time:191904ms step_avg:158.47ms step:1222/1480 train_time:192071ms step_avg:158.47ms step:1223/1480 train_time:192241ms step_avg:158.48ms step:1224/1480 train_time:192419ms step_avg:158.50ms step:1225/1480 train_time:192592ms step_avg:158.51ms step:1226/1480 train_time:192764ms step_avg:158.52ms step:1227/1480 train_time:192938ms step_avg:158.54ms step:1228/1480 train_time:193107ms step_avg:158.54ms step:1229/1480 train_time:193279ms step_avg:158.56ms step:1230/1480 train_time:193459ms step_avg:158.57ms step:1231/1480 train_time:193635ms step_avg:158.59ms step:1232/1480 train_time:193809ms step_avg:158.60ms step:1233/1480 train_time:193979ms step_avg:158.61ms step:1234/1480 train_time:194148ms step_avg:158.62ms step:1235/1480 train_time:194322ms step_avg:158.63ms step:1236/1480 train_time:194491ms step_avg:158.64ms step:1237/1480 train_time:194662ms step_avg:158.65ms step:1238/1480 train_time:194849ms step_avg:158.67ms step:1239/1480 train_time:195021ms step_avg:158.68ms step:1240/1480 train_time:195191ms step_avg:158.69ms step:1241/1480 train_time:195363ms step_avg:158.70ms step:1242/1480 train_time:195533ms step_avg:158.71ms step:1243/1480 train_time:195707ms step_avg:158.72ms step:1244/1480 train_time:195873ms step_avg:158.73ms step:1245/1480 train_time:196042ms step_avg:158.74ms step:1246/1480 train_time:196213ms step_avg:158.75ms step:1247/1480 train_time:196381ms step_avg:158.76ms step:1248/1480 train_time:196549ms step_avg:158.76ms step:1249/1480 train_time:196718ms step_avg:158.77ms step:1250/1480 train_time:196887ms step_avg:158.78ms step:1250/1480 val_loss:3.3376 train_time:196959ms step_avg:158.84ms step:1251/1480 train_time:197068ms 
step_avg:158.80ms step:1252/1480 train_time:197237ms step_avg:158.81ms step:1253/1480 train_time:197404ms step_avg:158.81ms step:1254/1480 train_time:197575ms step_avg:158.82ms step:1255/1480 train_time:197764ms step_avg:158.85ms step:1256/1480 train_time:197937ms step_avg:158.86ms step:1257/1480 train_time:198106ms step_avg:158.87ms step:1258/1480 train_time:198280ms step_avg:158.88ms step:1259/1480 train_time:198452ms step_avg:158.89ms step:1260/1480 train_time:198620ms step_avg:158.90ms step:1261/1480 train_time:198791ms step_avg:158.91ms step:1262/1480 train_time:198966ms step_avg:158.92ms step:1263/1480 train_time:199140ms step_avg:158.93ms step:1264/1480 train_time:199307ms step_avg:158.94ms step:1265/1480 train_time:199474ms step_avg:158.94ms step:1266/1480 train_time:199646ms step_avg:158.95ms step:1267/1480 train_time:199817ms step_avg:158.96ms step:1268/1480 train_time:199988ms step_avg:158.97ms step:1269/1480 train_time:200165ms step_avg:158.99ms step:1270/1480 train_time:200334ms step_avg:159.00ms step:1271/1480 train_time:200503ms step_avg:159.00ms step:1272/1480 train_time:200669ms step_avg:159.01ms step:1273/1480 train_time:200841ms step_avg:159.02ms step:1274/1480 train_time:201012ms step_avg:159.03ms step:1275/1480 train_time:201180ms step_avg:159.04ms step:1276/1480 train_time:201346ms step_avg:159.04ms step:1277/1480 train_time:201518ms step_avg:159.05ms step:1278/1480 train_time:201686ms step_avg:159.06ms step:1279/1480 train_time:201857ms step_avg:159.07ms step:1280/1480 train_time:202037ms step_avg:159.08ms step:1281/1480 train_time:202204ms step_avg:159.09ms step:1282/1480 train_time:202369ms step_avg:159.10ms step:1283/1480 train_time:202542ms step_avg:159.11ms step:1284/1480 train_time:202711ms step_avg:159.11ms step:1285/1480 train_time:202880ms step_avg:159.12ms step:1286/1480 train_time:203049ms step_avg:159.13ms step:1287/1480 train_time:203220ms step_avg:159.14ms step:1288/1480 train_time:203391ms step_avg:159.15ms step:1289/1480 
train_time:203575ms step_avg:159.17ms step:1290/1480 train_time:203755ms step_avg:159.18ms step:1291/1480 train_time:203927ms step_avg:159.19ms step:1292/1480 train_time:204101ms step_avg:159.20ms step:1293/1480 train_time:204274ms step_avg:159.22ms step:1294/1480 train_time:204446ms step_avg:159.23ms step:1295/1480 train_time:204618ms step_avg:159.24ms step:1296/1480 train_time:204791ms step_avg:159.25ms step:1297/1480 train_time:204962ms step_avg:159.26ms step:1298/1480 train_time:205132ms step_avg:159.26ms step:1299/1480 train_time:205302ms step_avg:159.27ms step:1300/1480 train_time:205469ms step_avg:159.28ms step:1301/1480 train_time:205638ms step_avg:159.29ms step:1302/1480 train_time:205811ms step_avg:159.30ms step:1303/1480 train_time:205989ms step_avg:159.31ms step:1304/1480 train_time:206163ms step_avg:159.32ms step:1305/1480 train_time:206331ms step_avg:159.33ms step:1306/1480 train_time:206505ms step_avg:159.34ms step:1307/1480 train_time:206672ms step_avg:159.35ms step:1308/1480 train_time:206841ms step_avg:159.35ms step:1309/1480 train_time:207012ms step_avg:159.36ms step:1310/1480 train_time:207182ms step_avg:159.37ms step:1311/1480 train_time:207349ms step_avg:159.38ms step:1312/1480 train_time:207524ms step_avg:159.39ms step:1313/1480 train_time:207691ms step_avg:159.39ms step:1314/1480 train_time:207868ms step_avg:159.41ms step:1315/1480 train_time:208038ms step_avg:159.42ms step:1316/1480 train_time:208206ms step_avg:159.42ms step:1317/1480 train_time:208377ms step_avg:159.43ms step:1318/1480 train_time:208559ms step_avg:159.45ms step:1319/1480 train_time:208735ms step_avg:159.46ms step:1320/1480 train_time:208912ms step_avg:159.48ms step:1321/1480 train_time:209085ms step_avg:159.49ms step:1322/1480 train_time:209265ms step_avg:159.50ms step:1323/1480 train_time:209438ms step_avg:159.51ms step:1324/1480 train_time:209612ms step_avg:159.52ms step:1325/1480 train_time:209794ms step_avg:159.54ms step:1326/1480 train_time:209969ms step_avg:159.55ms 
step:1327/1480 train_time:210140ms step_avg:159.56ms step:1328/1480 train_time:210310ms step_avg:159.57ms step:1329/1480 train_time:210507ms step_avg:159.60ms step:1330/1480 train_time:210686ms step_avg:159.61ms step:1331/1480 train_time:210855ms step_avg:159.62ms step:1332/1480 train_time:211029ms step_avg:159.63ms step:1333/1480 train_time:211204ms step_avg:159.64ms step:1334/1480 train_time:211374ms step_avg:159.65ms step:1335/1480 train_time:211544ms step_avg:159.66ms step:1336/1480 train_time:211726ms step_avg:159.67ms step:1337/1480 train_time:211902ms step_avg:159.68ms step:1338/1480 train_time:212072ms step_avg:159.69ms step:1339/1480 train_time:212247ms step_avg:159.70ms step:1340/1480 train_time:212419ms step_avg:159.71ms step:1341/1480 train_time:212587ms step_avg:159.72ms step:1342/1480 train_time:212763ms step_avg:159.73ms step:1343/1480 train_time:212933ms step_avg:159.74ms step:1344/1480 train_time:213105ms step_avg:159.75ms step:1345/1480 train_time:213283ms step_avg:159.76ms step:1346/1480 train_time:213452ms step_avg:159.77ms step:1347/1480 train_time:213623ms step_avg:159.78ms step:1348/1480 train_time:213792ms step_avg:159.78ms step:1349/1480 train_time:213962ms step_avg:159.79ms step:1350/1480 train_time:214135ms step_avg:159.80ms step:1351/1480 train_time:214305ms step_avg:159.81ms step:1352/1480 train_time:214475ms step_avg:159.82ms step:1353/1480 train_time:214651ms step_avg:159.83ms step:1354/1480 train_time:214822ms step_avg:159.84ms step:1355/1480 train_time:214989ms step_avg:159.84ms step:1356/1480 train_time:215163ms step_avg:159.85ms step:1357/1480 train_time:215335ms step_avg:159.86ms step:1358/1480 train_time:215506ms step_avg:159.87ms step:1359/1480 train_time:215678ms step_avg:159.88ms step:1360/1480 train_time:215851ms step_avg:159.89ms step:1361/1480 train_time:216030ms step_avg:159.90ms step:1362/1480 train_time:216205ms step_avg:159.91ms step:1363/1480 train_time:216385ms step_avg:159.93ms step:1364/1480 train_time:216554ms 
step_avg:159.94ms step:1365/1480 train_time:216721ms step_avg:159.94ms step:1366/1480 train_time:216892ms step_avg:159.95ms step:1367/1480 train_time:217064ms step_avg:159.96ms step:1368/1480 train_time:217237ms step_avg:159.97ms step:1369/1480 train_time:217416ms step_avg:159.98ms step:1370/1480 train_time:217595ms step_avg:160.00ms step:1371/1480 train_time:217768ms step_avg:160.01ms step:1372/1480 train_time:217945ms step_avg:160.02ms step:1373/1480 train_time:218113ms step_avg:160.02ms step:1374/1480 train_time:218290ms step_avg:160.04ms step:1375/1480 train_time:218463ms step_avg:160.05ms step:1375/1480 val_loss:3.2986 train_time:218530ms step_avg:160.10ms step:1376/1480 train_time:218637ms step_avg:160.06ms step:1377/1480 train_time:218810ms step_avg:160.07ms step:1378/1480 train_time:218979ms step_avg:160.07ms step:1379/1480 train_time:219154ms step_avg:160.08ms step:1380/1480 train_time:219328ms step_avg:160.09ms step:1381/1480 train_time:219508ms step_avg:160.11ms step:1382/1480 train_time:219680ms step_avg:160.12ms step:1383/1480 train_time:219853ms step_avg:160.13ms step:1384/1480 train_time:220032ms step_avg:160.14ms step:1385/1480 train_time:220198ms step_avg:160.14ms step:1386/1480 train_time:220368ms step_avg:160.15ms step:1387/1480 train_time:220539ms step_avg:160.16ms step:1388/1480 train_time:220707ms step_avg:160.16ms step:1389/1480 train_time:220881ms step_avg:160.17ms step:1390/1480 train_time:221048ms step_avg:160.18ms step:1391/1480 train_time:221218ms step_avg:160.19ms step:1392/1480 train_time:221392ms step_avg:160.20ms step:1393/1480 train_time:221564ms step_avg:160.21ms step:1394/1480 train_time:221735ms step_avg:160.21ms step:1395/1480 train_time:221904ms step_avg:160.22ms step:1396/1480 train_time:222074ms step_avg:160.23ms step:1397/1480 train_time:222241ms step_avg:160.23ms step:1398/1480 train_time:222409ms step_avg:160.24ms step:1399/1480 train_time:222579ms step_avg:160.24ms step:1400/1480 train_time:222757ms step_avg:160.26ms 
step:1401/1480 train_time:222922ms step_avg:160.26ms step:1402/1480 train_time:223093ms step_avg:160.27ms step:1403/1480 train_time:223271ms step_avg:160.28ms step:1404/1480 train_time:223442ms step_avg:160.29ms step:1405/1480 train_time:223617ms step_avg:160.30ms step:1406/1480 train_time:223792ms step_avg:160.31ms step:1407/1480 train_time:223959ms step_avg:160.31ms step:1408/1480 train_time:224127ms step_avg:160.32ms step:1409/1480 train_time:224311ms step_avg:160.34ms step:1410/1480 train_time:224480ms step_avg:160.34ms step:1411/1480 train_time:224647ms step_avg:160.35ms step:1412/1480 train_time:224816ms step_avg:160.35ms step:1413/1480 train_time:224985ms step_avg:160.36ms step:1414/1480 train_time:225158ms step_avg:160.37ms step:1415/1480 train_time:225333ms step_avg:160.38ms step:1416/1480 train_time:225519ms step_avg:160.40ms step:1417/1480 train_time:225691ms step_avg:160.41ms step:1418/1480 train_time:225861ms step_avg:160.41ms step:1419/1480 train_time:226035ms step_avg:160.42ms step:1420/1480 train_time:226209ms step_avg:160.43ms step:1421/1480 train_time:226382ms step_avg:160.44ms step:1422/1480 train_time:226556ms step_avg:160.45ms step:1423/1480 train_time:226726ms step_avg:160.46ms step:1424/1480 train_time:226904ms step_avg:160.47ms step:1425/1480 train_time:227084ms step_avg:160.48ms step:1426/1480 train_time:227256ms step_avg:160.49ms step:1427/1480 train_time:227430ms step_avg:160.50ms step:1428/1480 train_time:227602ms step_avg:160.51ms step:1429/1480 train_time:227771ms step_avg:160.52ms step:1430/1480 train_time:227945ms step_avg:160.52ms step:1431/1480 train_time:228120ms step_avg:160.53ms step:1432/1480 train_time:228296ms step_avg:160.55ms step:1433/1480 train_time:228476ms step_avg:160.56ms step:1434/1480 train_time:228656ms step_avg:160.57ms step:1435/1480 train_time:228832ms step_avg:160.58ms step:1436/1480 train_time:229004ms step_avg:160.59ms step:1437/1480 train_time:229176ms step_avg:160.60ms step:1438/1480 train_time:229344ms 
step_avg:160.60ms step:1439/1480 train_time:229520ms step_avg:160.62ms step:1440/1480 train_time:229690ms step_avg:160.62ms step:1441/1480 train_time:229860ms step_avg:160.63ms step:1442/1480 train_time:230038ms step_avg:160.64ms step:1443/1480 train_time:230226ms step_avg:160.66ms step:1444/1480 train_time:230398ms step_avg:160.67ms step:1445/1480 train_time:230568ms step_avg:160.67ms step:1446/1480 train_time:230744ms step_avg:160.68ms step:1447/1480 train_time:230920ms step_avg:160.70ms step:1448/1480 train_time:231092ms step_avg:160.70ms step:1449/1480 train_time:231266ms step_avg:160.71ms step:1450/1480 train_time:231439ms step_avg:160.72ms step:1451/1480 train_time:231610ms step_avg:160.73ms step:1452/1480 train_time:231782ms step_avg:160.74ms step:1453/1480 train_time:231953ms step_avg:160.74ms step:1454/1480 train_time:232124ms step_avg:160.75ms step:1455/1480 train_time:232304ms step_avg:160.76ms step:1456/1480 train_time:232476ms step_avg:160.77ms step:1457/1480 train_time:232646ms step_avg:160.78ms step:1458/1480 train_time:232816ms step_avg:160.78ms step:1459/1480 train_time:232992ms step_avg:160.80ms step:1460/1480 train_time:233164ms step_avg:160.80ms step:1461/1480 train_time:233338ms step_avg:160.81ms step:1462/1480 train_time:233510ms step_avg:160.82ms step:1463/1480 train_time:233685ms step_avg:160.83ms step:1464/1480 train_time:233859ms step_avg:160.84ms step:1465/1480 train_time:234031ms step_avg:160.85ms step:1466/1480 train_time:234202ms step_avg:160.85ms step:1467/1480 train_time:234377ms step_avg:160.86ms step:1468/1480 train_time:234546ms step_avg:160.87ms step:1469/1480 train_time:234718ms step_avg:160.88ms step:1470/1480 train_time:234899ms step_avg:160.89ms step:1471/1480 train_time:235085ms step_avg:160.91ms step:1472/1480 train_time:235264ms step_avg:160.92ms step:1473/1480 train_time:235436ms step_avg:160.93ms step:1474/1480 train_time:235613ms step_avg:160.94ms step:1475/1480 train_time:235792ms step_avg:160.95ms step:1476/1480 
train_time:235964ms step_avg:160.96ms step:1477/1480 train_time:236146ms step_avg:160.97ms step:1478/1480 train_time:236330ms step_avg:160.99ms step:1479/1480 train_time:236504ms step_avg:161.00ms step:1480/1480 train_time:236676ms step_avg:161.00ms step:1480/1480 val_loss:3.2797 train_time:236747ms step_avg:161.05ms