import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension; no weight is passed to `F.rms_norm`."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """A bias-free `nn.Linear` whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding.

    Stores `inv_freq = (1 / base) ** (arange(0, dim, 2) / dim)` as a buffer and caches the
    cos/sin tables for the most recently seen sequence length.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        """
        Rotate `x`, taking dimension 1 as the sequence axis and splitting dimension 3 in half.

        The cos/sin cache is keyed on the sequence length only.
        """
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # Insert singleton axes at dimensions 0 and 2 so the tables broadcast against x.
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention using FlexAttention, with QK normalization, rotary embeddings,
    and a learned mix of the layer's own values with an externally supplied value tensor `vi`.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Attend over `x` under `block_mask` and return a tensor shaped like `x`.

        `vi` is viewed as the value tensor's shape and mixed into it using `self.lambdas`.
        The batch dimension of `x` must be 1 (asserted).
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is given (B, n_head, T, head_dim), hence the transposes.
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: expand to 4*dim, apply squared ReLU, project back to dim."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: a learned mix of the running activations `x` with `x0`,
    followed by pre-normalized attention and MLP residual branches.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # Initialized to [1, 0], so the block starts out passing x through unmixed.
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Model size settings consumed by `GPT` and `Block`."""
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """
    GPT model with U-net style skip connections between the first and second half of the
    layers, per-layer token value embeddings, and tanh soft-capping of the output logits.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss of predicting `target` from the token ids `idx`.

        `idx` must be 1-D (asserted), and its length must be a multiple of BLOCK_SIZE because
        it is reshaped to (-1, BLOCK_SIZE). `sliding_window` bounds how far back, in tokens,
        a query may attend. Returns the loss from `F.cross_entropy` with its default reduction.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running count of token id 50256, used as a per-token document id.
        # NOTE(review): 50256 is presumably the GPT-2 end-of-text token — confirm against the tokenizer.
        docs = (idx == 50256).cumsum(0)
        # Document id of the first and of the last token in each block.
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # Token-level rule: attend only backwards, within the same document,
            # and strictly less than `sliding_window` tokens back.
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level counterpart of the rule above, built by hand and passed to
            # BlockMask.from_kv_blocks together with the token-level mask_mod.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # Two blocks may interact only if their document-id ranges overlap.
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # The window is converted from tokens to blocks, rounding up.
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # A stable descending sort of the 0/1 mask lists the allowed kv blocks first, in index order.
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # One value-embedding slice per encoder layer; the decoder layers reuse them in reverse order below.
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count stored in it."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256*4-byte header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (input, target) token windows of length T from a sorted list of shard files.

    Each process reads at offset `process_rank * T` within a stride of `T * num_processes`
    tokens, so the processes consume disjoint windows of the same shard.
    """
    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # Every shard must hold at least one full batch across all processes, plus one target token.
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        # Wraps around to the first shard after the last one.
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on "cuda": T int32 inputs and the same window shifted by one as int64 targets."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    # NOTE(review): weight_decay is not read anywhere in the visible part of this script.
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    # logdir is assigned but the directory is not created here (the makedirs call is commented out).
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    # On the master process only: append s to the log file, and also print it unless logonly is set.
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# The training loop below performs a single forward/backward per step, so accumulation must be 1.
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
model = model.cuda().bfloat16()
# The CastedLinear modules are converted back to float32 after the model-wide bfloat16 cast.
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
# 2-D parameters of the transformer blocks go to Muon; lower-dimensional ones and skip_weights go to Adam.
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    # Returns the multiplier that LambdaLR applies to each optimizer's base learning rate.
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# The window size lives in a CUDA tensor that is updated in place in the loop below.
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # Average the summed loss across ranks, then over the validation steps.
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # The checkpoint dict is built, but the torch.save call below is commented out, so nothing is written.
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # The momentum moves linearly from 0.85 to 0.95 over the first 300 steps.
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 11:25:19 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 93W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 123W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22817ms step_avg:nanms step:2/1480 train_time:22909ms step_avg:nanms step:3/1480 train_time:23048ms step_avg:nanms step:4/1480 train_time:23188ms step_avg:nanms step:5/1480 train_time:23329ms step_avg:nanms step:6/1480 train_time:23469ms step_avg:nanms step:7/1480 train_time:23610ms step_avg:nanms step:8/1480 train_time:23753ms step_avg:nanms step:9/1480 train_time:23901ms step_avg:nanms step:10/1480 train_time:24045ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:282ms step_avg:nanms step:13/1480 train_time:422ms step_avg:140.69ms step:14/1480 train_time:563ms step_avg:140.65ms step:15/1480 train_time:705ms step_avg:141.03ms step:16/1480 train_time:852ms step_avg:142.04ms step:17/1480 train_time:997ms step_avg:142.42ms step:18/1480 train_time:1140ms step_avg:142.53ms step:19/1480 train_time:1281ms step_avg:142.30ms step:20/1480 train_time:1421ms step_avg:142.11ms step:21/1480 train_time:1561ms step_avg:141.93ms step:22/1480 train_time:1703ms step_avg:141.89ms step:23/1480 train_time:1846ms step_avg:142.04ms step:24/1480 train_time:1993ms step_avg:142.32ms step:25/1480 train_time:2136ms step_avg:142.39ms step:26/1480 train_time:2279ms step_avg:142.42ms step:27/1480 train_time:2420ms step_avg:142.36ms step:28/1480 train_time:2560ms step_avg:142.25ms step:29/1480 
train_time:2701ms step_avg:142.18ms step:30/1480 train_time:2843ms step_avg:142.16ms step:31/1480 train_time:2988ms step_avg:142.30ms step:32/1480 train_time:3131ms step_avg:142.34ms step:33/1480 train_time:3274ms step_avg:142.34ms step:34/1480 train_time:3417ms step_avg:142.39ms step:35/1480 train_time:3558ms step_avg:142.33ms step:36/1480 train_time:3701ms step_avg:142.33ms step:37/1480 train_time:3843ms step_avg:142.33ms step:38/1480 train_time:3985ms step_avg:142.32ms step:39/1480 train_time:4129ms step_avg:142.39ms step:40/1480 train_time:4274ms step_avg:142.45ms step:41/1480 train_time:4416ms step_avg:142.44ms step:42/1480 train_time:4557ms step_avg:142.40ms step:43/1480 train_time:4699ms step_avg:142.38ms step:44/1480 train_time:4841ms step_avg:142.37ms step:45/1480 train_time:4981ms step_avg:142.32ms step:46/1480 train_time:5123ms step_avg:142.31ms step:47/1480 train_time:5265ms step_avg:142.30ms step:48/1480 train_time:5408ms step_avg:142.32ms step:49/1480 train_time:5552ms step_avg:142.36ms step:50/1480 train_time:5695ms step_avg:142.37ms step:51/1480 train_time:5837ms step_avg:142.38ms step:52/1480 train_time:5980ms step_avg:142.37ms step:53/1480 train_time:6121ms step_avg:142.34ms step:54/1480 train_time:6262ms step_avg:142.32ms step:55/1480 train_time:6404ms step_avg:142.32ms step:56/1480 train_time:6547ms step_avg:142.32ms step:57/1480 train_time:6688ms step_avg:142.30ms step:58/1480 train_time:6830ms step_avg:142.29ms step:59/1480 train_time:6973ms step_avg:142.30ms step:60/1480 train_time:7116ms step_avg:142.32ms step:61/1480 train_time:7258ms step_avg:142.31ms step:62/1480 train_time:7401ms step_avg:142.33ms step:63/1480 train_time:7543ms step_avg:142.32ms step:64/1480 train_time:7685ms step_avg:142.31ms step:65/1480 train_time:7826ms step_avg:142.29ms step:66/1480 train_time:7967ms step_avg:142.27ms step:67/1480 train_time:8111ms step_avg:142.30ms step:68/1480 train_time:8254ms step_avg:142.32ms step:69/1480 train_time:8398ms step_avg:142.33ms 
step:70/1480 train_time:8540ms step_avg:142.33ms step:71/1480 train_time:8683ms step_avg:142.34ms step:72/1480 train_time:8824ms step_avg:142.32ms step:73/1480 train_time:8967ms step_avg:142.33ms step:74/1480 train_time:9110ms step_avg:142.35ms step:75/1480 train_time:9253ms step_avg:142.36ms step:76/1480 train_time:9396ms step_avg:142.37ms step:77/1480 train_time:9538ms step_avg:142.36ms step:78/1480 train_time:9680ms step_avg:142.35ms step:79/1480 train_time:9821ms step_avg:142.33ms step:80/1480 train_time:9962ms step_avg:142.31ms step:81/1480 train_time:10104ms step_avg:142.31ms step:82/1480 train_time:10248ms step_avg:142.34ms step:83/1480 train_time:10393ms step_avg:142.38ms step:84/1480 train_time:10537ms step_avg:142.39ms step:85/1480 train_time:10679ms step_avg:142.39ms step:86/1480 train_time:10820ms step_avg:142.37ms step:87/1480 train_time:10961ms step_avg:142.35ms step:88/1480 train_time:11103ms step_avg:142.34ms step:89/1480 train_time:11246ms step_avg:142.36ms step:90/1480 train_time:11392ms step_avg:142.40ms step:91/1480 train_time:11533ms step_avg:142.38ms step:92/1480 train_time:11676ms step_avg:142.39ms step:93/1480 train_time:11818ms step_avg:142.39ms step:94/1480 train_time:11960ms step_avg:142.38ms step:95/1480 train_time:12101ms step_avg:142.37ms step:96/1480 train_time:12244ms step_avg:142.37ms step:97/1480 train_time:12387ms step_avg:142.38ms step:98/1480 train_time:12530ms step_avg:142.38ms step:99/1480 train_time:12673ms step_avg:142.39ms step:100/1480 train_time:12816ms step_avg:142.40ms step:101/1480 train_time:12957ms step_avg:142.38ms step:102/1480 train_time:13099ms step_avg:142.38ms step:103/1480 train_time:13242ms step_avg:142.38ms step:104/1480 train_time:13384ms step_avg:142.38ms step:105/1480 train_time:13526ms step_avg:142.38ms step:106/1480 train_time:13668ms step_avg:142.38ms step:107/1480 train_time:13812ms step_avg:142.39ms step:108/1480 train_time:13955ms step_avg:142.40ms step:109/1480 train_time:14097ms step_avg:142.39ms 
step:110/1480 train_time:14239ms step_avg:142.39ms step:111/1480 train_time:14384ms step_avg:142.41ms step:112/1480 train_time:14532ms step_avg:142.47ms step:113/1480 train_time:14678ms step_avg:142.51ms step:114/1480 train_time:14824ms step_avg:142.53ms step:115/1480 train_time:14970ms step_avg:142.57ms step:116/1480 train_time:15116ms step_avg:142.61ms step:117/1480 train_time:15262ms step_avg:142.64ms step:118/1480 train_time:15410ms step_avg:142.68ms step:119/1480 train_time:15558ms step_avg:142.73ms step:120/1480 train_time:15703ms step_avg:142.75ms step:121/1480 train_time:15849ms step_avg:142.78ms step:122/1480 train_time:15996ms step_avg:142.82ms step:123/1480 train_time:16142ms step_avg:142.85ms step:124/1480 train_time:16288ms step_avg:142.88ms step:125/1480 train_time:16435ms step_avg:142.91ms step:125/1480 val_loss:4.4264 train_time:16492ms step_avg:143.41ms step:126/1480 train_time:16589ms step_avg:143.01ms step:127/1480 train_time:16742ms step_avg:143.10ms step:128/1480 train_time:16889ms step_avg:143.13ms step:129/1480 train_time:17036ms step_avg:143.16ms step:130/1480 train_time:17181ms step_avg:143.18ms step:131/1480 train_time:17327ms step_avg:143.19ms step:132/1480 train_time:17473ms step_avg:143.22ms step:133/1480 train_time:17622ms step_avg:143.27ms step:134/1480 train_time:17768ms step_avg:143.29ms step:135/1480 train_time:17918ms step_avg:143.34ms step:136/1480 train_time:18063ms step_avg:143.36ms step:137/1480 train_time:18209ms step_avg:143.38ms step:138/1480 train_time:18356ms step_avg:143.41ms step:139/1480 train_time:18502ms step_avg:143.43ms step:140/1480 train_time:18649ms step_avg:143.45ms step:141/1480 train_time:18796ms step_avg:143.48ms step:142/1480 train_time:18942ms step_avg:143.50ms step:143/1480 train_time:19088ms step_avg:143.52ms step:144/1480 train_time:19235ms step_avg:143.55ms step:145/1480 train_time:19381ms step_avg:143.56ms step:146/1480 train_time:19528ms step_avg:143.59ms step:147/1480 train_time:19674ms 
step_avg:143.61ms step:148/1480 train_time:19822ms step_avg:143.63ms step:149/1480 train_time:19968ms step_avg:143.65ms step:150/1480 train_time:20116ms step_avg:143.69ms step:151/1480 train_time:20262ms step_avg:143.70ms step:152/1480 train_time:20409ms step_avg:143.73ms step:153/1480 train_time:20556ms step_avg:143.75ms step:154/1480 train_time:20702ms step_avg:143.76ms step:155/1480 train_time:20849ms step_avg:143.79ms step:156/1480 train_time:20996ms step_avg:143.81ms step:157/1480 train_time:21143ms step_avg:143.83ms step:158/1480 train_time:21291ms step_avg:143.86ms step:159/1480 train_time:21438ms step_avg:143.88ms step:160/1480 train_time:21582ms step_avg:143.88ms step:161/1480 train_time:21729ms step_avg:143.90ms step:162/1480 train_time:21876ms step_avg:143.92ms step:163/1480 train_time:22022ms step_avg:143.93ms step:164/1480 train_time:22169ms step_avg:143.95ms step:165/1480 train_time:22317ms step_avg:143.98ms step:166/1480 train_time:22463ms step_avg:143.99ms step:167/1480 train_time:22610ms step_avg:144.01ms step:168/1480 train_time:22757ms step_avg:144.03ms step:169/1480 train_time:22903ms step_avg:144.05ms step:170/1480 train_time:23051ms step_avg:144.07ms step:171/1480 train_time:23197ms step_avg:144.08ms step:172/1480 train_time:23344ms step_avg:144.10ms step:173/1480 train_time:23491ms step_avg:144.12ms step:174/1480 train_time:23638ms step_avg:144.13ms step:175/1480 train_time:23785ms step_avg:144.15ms step:176/1480 train_time:23931ms step_avg:144.16ms step:177/1480 train_time:24077ms step_avg:144.17ms step:178/1480 train_time:24223ms step_avg:144.18ms step:179/1480 train_time:24369ms step_avg:144.20ms step:180/1480 train_time:24517ms step_avg:144.22ms step:181/1480 train_time:24663ms step_avg:144.23ms step:182/1480 train_time:24812ms step_avg:144.25ms step:183/1480 train_time:24959ms step_avg:144.27ms step:184/1480 train_time:25103ms step_avg:144.27ms step:185/1480 train_time:25250ms step_avg:144.28ms step:186/1480 train_time:25397ms 
step_avg:144.30ms step:187/1480 train_time:25543ms step_avg:144.31ms step:188/1480 train_time:25692ms step_avg:144.34ms step:189/1480 train_time:25839ms step_avg:144.35ms step:190/1480 train_time:25985ms step_avg:144.36ms step:191/1480 train_time:26131ms step_avg:144.37ms step:192/1480 train_time:26278ms step_avg:144.38ms step:193/1480 train_time:26424ms step_avg:144.39ms step:194/1480 train_time:26571ms step_avg:144.41ms step:195/1480 train_time:26718ms step_avg:144.42ms step:196/1480 train_time:26864ms step_avg:144.43ms step:197/1480 train_time:27012ms step_avg:144.45ms step:198/1480 train_time:27160ms step_avg:144.47ms step:199/1480 train_time:27305ms step_avg:144.47ms step:200/1480 train_time:27452ms step_avg:144.48ms step:201/1480 train_time:27599ms step_avg:144.49ms step:202/1480 train_time:27746ms step_avg:144.51ms step:203/1480 train_time:27893ms step_avg:144.52ms step:204/1480 train_time:28040ms step_avg:144.54ms step:205/1480 train_time:28187ms step_avg:144.55ms step:206/1480 train_time:28334ms step_avg:144.56ms step:207/1480 train_time:28481ms step_avg:144.57ms step:208/1480 train_time:28627ms step_avg:144.58ms step:209/1480 train_time:28773ms step_avg:144.59ms step:210/1480 train_time:28921ms step_avg:144.61ms step:211/1480 train_time:29066ms step_avg:144.61ms step:212/1480 train_time:29215ms step_avg:144.63ms step:213/1480 train_time:29361ms step_avg:144.64ms step:214/1480 train_time:29508ms step_avg:144.65ms step:215/1480 train_time:29657ms step_avg:144.67ms step:216/1480 train_time:29803ms step_avg:144.67ms step:217/1480 train_time:29951ms step_avg:144.69ms step:218/1480 train_time:30097ms step_avg:144.70ms step:219/1480 train_time:30243ms step_avg:144.70ms step:220/1480 train_time:30391ms step_avg:144.72ms step:221/1480 train_time:30540ms step_avg:144.74ms step:222/1480 train_time:30690ms step_avg:144.76ms step:223/1480 train_time:30841ms step_avg:144.79ms step:224/1480 train_time:30990ms step_avg:144.82ms step:225/1480 train_time:31142ms 
step_avg:144.85ms step:226/1480 train_time:31292ms step_avg:144.87ms step:227/1480 train_time:31442ms step_avg:144.89ms step:228/1480 train_time:31592ms step_avg:144.92ms step:229/1480 train_time:31742ms step_avg:144.94ms step:230/1480 train_time:31893ms step_avg:144.97ms step:231/1480 train_time:32044ms step_avg:144.99ms step:232/1480 train_time:32195ms step_avg:145.02ms step:233/1480 train_time:32346ms step_avg:145.05ms step:234/1480 train_time:32497ms step_avg:145.08ms step:235/1480 train_time:32648ms step_avg:145.10ms step:236/1480 train_time:32799ms step_avg:145.13ms step:237/1480 train_time:32947ms step_avg:145.14ms step:238/1480 train_time:33097ms step_avg:145.16ms step:239/1480 train_time:33248ms step_avg:145.19ms step:240/1480 train_time:33399ms step_avg:145.21ms step:241/1480 train_time:33549ms step_avg:145.24ms step:242/1480 train_time:33700ms step_avg:145.26ms step:243/1480 train_time:33850ms step_avg:145.28ms step:244/1480 train_time:33999ms step_avg:145.30ms step:245/1480 train_time:34151ms step_avg:145.32ms step:246/1480 train_time:34301ms step_avg:145.34ms step:247/1480 train_time:34452ms step_avg:145.37ms step:248/1480 train_time:34601ms step_avg:145.38ms step:249/1480 train_time:34751ms step_avg:145.40ms step:250/1480 train_time:34902ms step_avg:145.42ms step:250/1480 val_loss:3.9940 train_time:34960ms step_avg:145.67ms step:251/1480 train_time:35056ms step_avg:145.46ms step:252/1480 train_time:35206ms step_avg:145.48ms step:253/1480 train_time:35356ms step_avg:145.50ms step:254/1480 train_time:35505ms step_avg:145.51ms step:255/1480 train_time:35653ms step_avg:145.52ms step:256/1480 train_time:35802ms step_avg:145.54ms step:257/1480 train_time:35954ms step_avg:145.56ms step:258/1480 train_time:36106ms step_avg:145.59ms step:259/1480 train_time:36257ms step_avg:145.61ms step:260/1480 train_time:36409ms step_avg:145.64ms step:261/1480 train_time:36559ms step_avg:145.65ms step:262/1480 train_time:36709ms step_avg:145.67ms step:263/1480 
train_time:36857ms step_avg:145.68ms step:264/1480 train_time:37009ms step_avg:145.71ms step:265/1480 train_time:37160ms step_avg:145.73ms step:266/1480 train_time:37312ms step_avg:145.75ms step:267/1480 train_time:37462ms step_avg:145.77ms step:268/1480 train_time:37613ms step_avg:145.79ms step:269/1480 train_time:37762ms step_avg:145.80ms step:270/1480 train_time:37913ms step_avg:145.82ms step:271/1480 train_time:38064ms step_avg:145.84ms step:272/1480 train_time:38214ms step_avg:145.86ms step:273/1480 train_time:38366ms step_avg:145.88ms step:274/1480 train_time:38517ms step_avg:145.90ms step:275/1480 train_time:38669ms step_avg:145.92ms step:276/1480 train_time:38819ms step_avg:145.93ms step:277/1480 train_time:38970ms step_avg:145.95ms step:278/1480 train_time:39120ms step_avg:145.97ms step:279/1480 train_time:39271ms step_avg:145.99ms step:280/1480 train_time:39421ms step_avg:146.00ms step:281/1480 train_time:39572ms step_avg:146.02ms step:282/1480 train_time:39722ms step_avg:146.04ms step:283/1480 train_time:39873ms step_avg:146.06ms step:284/1480 train_time:40023ms step_avg:146.07ms step:285/1480 train_time:40174ms step_avg:146.09ms step:286/1480 train_time:40325ms step_avg:146.11ms step:287/1480 train_time:40475ms step_avg:146.12ms step:288/1480 train_time:40625ms step_avg:146.13ms step:289/1480 train_time:40776ms step_avg:146.15ms step:290/1480 train_time:40924ms step_avg:146.16ms step:291/1480 train_time:41075ms step_avg:146.17ms step:292/1480 train_time:41225ms step_avg:146.19ms step:293/1480 train_time:41375ms step_avg:146.20ms step:294/1480 train_time:41525ms step_avg:146.22ms step:295/1480 train_time:41675ms step_avg:146.23ms step:296/1480 train_time:41826ms step_avg:146.24ms step:297/1480 train_time:41976ms step_avg:146.26ms step:298/1480 train_time:42128ms step_avg:146.28ms step:299/1480 train_time:42278ms step_avg:146.29ms step:300/1480 train_time:42429ms step_avg:146.31ms step:301/1480 train_time:42578ms step_avg:146.32ms step:302/1480 
train_time:42729ms step_avg:146.33ms step:303/1480 train_time:42878ms step_avg:146.34ms step:304/1480 train_time:43029ms step_avg:146.36ms step:305/1480 train_time:43180ms step_avg:146.37ms step:306/1480 train_time:43330ms step_avg:146.39ms step:307/1480 train_time:43481ms step_avg:146.40ms step:308/1480 train_time:43632ms step_avg:146.42ms step:309/1480 train_time:43783ms step_avg:146.43ms step:310/1480 train_time:43934ms step_avg:146.45ms step:311/1480 train_time:44082ms step_avg:146.45ms step:312/1480 train_time:44234ms step_avg:146.47ms step:313/1480 train_time:44383ms step_avg:146.48ms step:314/1480 train_time:44535ms step_avg:146.50ms step:315/1480 train_time:44685ms step_avg:146.51ms step:316/1480 train_time:44836ms step_avg:146.52ms step:317/1480 train_time:44986ms step_avg:146.53ms step:318/1480 train_time:45136ms step_avg:146.54ms step:319/1480 train_time:45286ms step_avg:146.56ms step:320/1480 train_time:45437ms step_avg:146.57ms step:321/1480 train_time:45587ms step_avg:146.58ms step:322/1480 train_time:45737ms step_avg:146.59ms step:323/1480 train_time:45887ms step_avg:146.60ms step:324/1480 train_time:46037ms step_avg:146.61ms step:325/1480 train_time:46187ms step_avg:146.63ms step:326/1480 train_time:46338ms step_avg:146.64ms step:327/1480 train_time:46490ms step_avg:146.65ms step:328/1480 train_time:46639ms step_avg:146.66ms step:329/1480 train_time:46790ms step_avg:146.68ms step:330/1480 train_time:46942ms step_avg:146.70ms step:331/1480 train_time:47096ms step_avg:146.72ms step:332/1480 train_time:47250ms step_avg:146.74ms step:333/1480 train_time:47405ms step_avg:146.76ms step:334/1480 train_time:47558ms step_avg:146.78ms step:335/1480 train_time:47713ms step_avg:146.81ms step:336/1480 train_time:47869ms step_avg:146.84ms step:337/1480 train_time:48024ms step_avg:146.86ms step:338/1480 train_time:48176ms step_avg:146.88ms step:339/1480 train_time:48329ms step_avg:146.90ms step:340/1480 train_time:48483ms step_avg:146.92ms step:341/1480 
train_time:48636ms step_avg:146.94ms step:342/1480 train_time:48789ms step_avg:146.95ms step:343/1480 train_time:48945ms step_avg:146.98ms step:344/1480 train_time:49099ms step_avg:147.00ms step:345/1480 train_time:49253ms step_avg:147.02ms step:346/1480 train_time:49406ms step_avg:147.04ms step:347/1480 train_time:49560ms step_avg:147.06ms step:348/1480 train_time:49714ms step_avg:147.08ms step:349/1480 train_time:49868ms step_avg:147.10ms step:350/1480 train_time:50022ms step_avg:147.12ms step:351/1480 train_time:50176ms step_avg:147.14ms step:352/1480 train_time:50330ms step_avg:147.16ms step:353/1480 train_time:50485ms step_avg:147.19ms step:354/1480 train_time:50639ms step_avg:147.21ms step:355/1480 train_time:50793ms step_avg:147.23ms step:356/1480 train_time:50948ms step_avg:147.25ms step:357/1480 train_time:51101ms step_avg:147.27ms step:358/1480 train_time:51255ms step_avg:147.28ms step:359/1480 train_time:51409ms step_avg:147.30ms step:360/1480 train_time:51565ms step_avg:147.33ms step:361/1480 train_time:51718ms step_avg:147.35ms step:362/1480 train_time:51874ms step_avg:147.37ms step:363/1480 train_time:52028ms step_avg:147.39ms step:364/1480 train_time:52182ms step_avg:147.41ms step:365/1480 train_time:52336ms step_avg:147.43ms step:366/1480 train_time:52488ms step_avg:147.44ms step:367/1480 train_time:52642ms step_avg:147.46ms step:368/1480 train_time:52795ms step_avg:147.47ms step:369/1480 train_time:52949ms step_avg:147.49ms step:370/1480 train_time:53101ms step_avg:147.50ms step:371/1480 train_time:53256ms step_avg:147.52ms step:372/1480 train_time:53410ms step_avg:147.54ms step:373/1480 train_time:53563ms step_avg:147.56ms step:374/1480 train_time:53716ms step_avg:147.57ms step:375/1480 train_time:53869ms step_avg:147.59ms step:375/1480 val_loss:3.8076 train_time:53930ms step_avg:147.75ms step:376/1480 train_time:54029ms step_avg:147.62ms step:377/1480 train_time:54183ms step_avg:147.64ms step:378/1480 train_time:54336ms step_avg:147.65ms 
step:379/1480 train_time:54488ms step_avg:147.66ms step:380/1480 train_time:54640ms step_avg:147.68ms step:381/1480 train_time:54792ms step_avg:147.69ms step:382/1480 train_time:54945ms step_avg:147.70ms step:383/1480 train_time:55100ms step_avg:147.72ms step:384/1480 train_time:55254ms step_avg:147.74ms step:385/1480 train_time:55408ms step_avg:147.76ms step:386/1480 train_time:55562ms step_avg:147.77ms step:387/1480 train_time:55715ms step_avg:147.79ms step:388/1480 train_time:55868ms step_avg:147.80ms step:389/1480 train_time:56021ms step_avg:147.81ms step:390/1480 train_time:56176ms step_avg:147.83ms step:391/1480 train_time:56331ms step_avg:147.85ms step:392/1480 train_time:56483ms step_avg:147.86ms step:393/1480 train_time:56637ms step_avg:147.88ms step:394/1480 train_time:56790ms step_avg:147.89ms step:395/1480 train_time:56943ms step_avg:147.90ms step:396/1480 train_time:57096ms step_avg:147.92ms step:397/1480 train_time:57250ms step_avg:147.93ms step:398/1480 train_time:57403ms step_avg:147.95ms step:399/1480 train_time:57558ms step_avg:147.96ms step:400/1480 train_time:57712ms step_avg:147.98ms step:401/1480 train_time:57866ms step_avg:147.99ms step:402/1480 train_time:58018ms step_avg:148.01ms step:403/1480 train_time:58173ms step_avg:148.02ms step:404/1480 train_time:58328ms step_avg:148.04ms step:405/1480 train_time:58482ms step_avg:148.06ms step:406/1480 train_time:58637ms step_avg:148.07ms step:407/1480 train_time:58791ms step_avg:148.09ms step:408/1480 train_time:58945ms step_avg:148.10ms step:409/1480 train_time:59099ms step_avg:148.12ms step:410/1480 train_time:59253ms step_avg:148.13ms step:411/1480 train_time:59407ms step_avg:148.15ms step:412/1480 train_time:59561ms step_avg:148.16ms step:413/1480 train_time:59715ms step_avg:148.18ms step:414/1480 train_time:59870ms step_avg:148.19ms step:415/1480 train_time:60023ms step_avg:148.20ms step:416/1480 train_time:60176ms step_avg:148.22ms step:417/1480 train_time:60331ms step_avg:148.23ms 
step:418/1480 train_time:60484ms step_avg:148.25ms step:419/1480 train_time:60638ms step_avg:148.26ms step:420/1480 train_time:60792ms step_avg:148.27ms step:421/1480 train_time:60946ms step_avg:148.29ms step:422/1480 train_time:61099ms step_avg:148.30ms step:423/1480 train_time:61253ms step_avg:148.31ms step:424/1480 train_time:61408ms step_avg:148.33ms step:425/1480 train_time:61564ms step_avg:148.35ms step:426/1480 train_time:61717ms step_avg:148.36ms step:427/1480 train_time:61871ms step_avg:148.37ms step:428/1480 train_time:62025ms step_avg:148.39ms step:429/1480 train_time:62178ms step_avg:148.40ms step:430/1480 train_time:62331ms step_avg:148.41ms step:431/1480 train_time:62485ms step_avg:148.42ms step:432/1480 train_time:62639ms step_avg:148.43ms step:433/1480 train_time:62792ms step_avg:148.44ms step:434/1480 train_time:62947ms step_avg:148.46ms step:435/1480 train_time:63099ms step_avg:148.47ms step:436/1480 train_time:63254ms step_avg:148.48ms step:437/1480 train_time:63410ms step_avg:148.50ms step:438/1480 train_time:63563ms step_avg:148.51ms step:439/1480 train_time:63717ms step_avg:148.52ms step:440/1480 train_time:63873ms step_avg:148.54ms step:441/1480 train_time:64031ms step_avg:148.56ms step:442/1480 train_time:64188ms step_avg:148.58ms step:443/1480 train_time:64344ms step_avg:148.60ms step:444/1480 train_time:64500ms step_avg:148.62ms step:445/1480 train_time:64657ms step_avg:148.64ms step:446/1480 train_time:64813ms step_avg:148.65ms step:447/1480 train_time:64969ms step_avg:148.67ms step:448/1480 train_time:65124ms step_avg:148.69ms step:449/1480 train_time:65281ms step_avg:148.70ms step:450/1480 train_time:65439ms step_avg:148.72ms step:451/1480 train_time:65598ms step_avg:148.75ms step:452/1480 train_time:65754ms step_avg:148.76ms step:453/1480 train_time:65910ms step_avg:148.78ms step:454/1480 train_time:66065ms step_avg:148.80ms step:455/1480 train_time:66220ms step_avg:148.81ms step:456/1480 train_time:66377ms step_avg:148.83ms 
step:457/1480 train_time:66533ms step_avg:148.84ms step:458/1480 train_time:66689ms step_avg:148.86ms step:459/1480 train_time:66847ms step_avg:148.88ms step:460/1480 train_time:67004ms step_avg:148.90ms step:461/1480 train_time:67162ms step_avg:148.92ms step:462/1480 train_time:67317ms step_avg:148.93ms step:463/1480 train_time:67475ms step_avg:148.95ms step:464/1480 train_time:67634ms step_avg:148.97ms step:465/1480 train_time:67791ms step_avg:148.99ms step:466/1480 train_time:67948ms step_avg:149.01ms step:467/1480 train_time:68105ms step_avg:149.03ms step:468/1480 train_time:68260ms step_avg:149.04ms step:469/1480 train_time:68415ms step_avg:149.05ms step:470/1480 train_time:68573ms step_avg:149.07ms step:471/1480 train_time:68730ms step_avg:149.09ms step:472/1480 train_time:68888ms step_avg:149.11ms step:473/1480 train_time:69044ms step_avg:149.12ms step:474/1480 train_time:69200ms step_avg:149.14ms step:475/1480 train_time:69358ms step_avg:149.16ms step:476/1480 train_time:69515ms step_avg:149.17ms step:477/1480 train_time:69675ms step_avg:149.20ms step:478/1480 train_time:69832ms step_avg:149.21ms step:479/1480 train_time:69990ms step_avg:149.23ms step:480/1480 train_time:70148ms step_avg:149.25ms step:481/1480 train_time:70304ms step_avg:149.27ms step:482/1480 train_time:70461ms step_avg:149.28ms step:483/1480 train_time:70617ms step_avg:149.30ms step:484/1480 train_time:70775ms step_avg:149.31ms step:485/1480 train_time:70934ms step_avg:149.34ms step:486/1480 train_time:71093ms step_avg:149.35ms step:487/1480 train_time:71250ms step_avg:149.37ms step:488/1480 train_time:71407ms step_avg:149.39ms step:489/1480 train_time:71563ms step_avg:149.40ms step:490/1480 train_time:71719ms step_avg:149.41ms step:491/1480 train_time:71875ms step_avg:149.43ms step:492/1480 train_time:72033ms step_avg:149.45ms step:493/1480 train_time:72192ms step_avg:149.47ms step:494/1480 train_time:72350ms step_avg:149.48ms step:495/1480 train_time:72509ms step_avg:149.50ms 
step:496/1480 train_time:72666ms step_avg:149.52ms step:497/1480 train_time:72822ms step_avg:149.53ms step:498/1480 train_time:72979ms step_avg:149.55ms step:499/1480 train_time:73135ms step_avg:149.56ms step:500/1480 train_time:73293ms step_avg:149.58ms step:500/1480 val_loss:3.6895 train_time:73356ms step_avg:149.71ms step:501/1480 train_time:73455ms step_avg:149.60ms step:502/1480 train_time:73614ms step_avg:149.62ms step:503/1480 train_time:73769ms step_avg:149.63ms step:504/1480 train_time:73925ms step_avg:149.65ms step:505/1480 train_time:74080ms step_avg:149.66ms step:506/1480 train_time:74238ms step_avg:149.67ms step:507/1480 train_time:74394ms step_avg:149.69ms step:508/1480 train_time:74552ms step_avg:149.70ms step:509/1480 train_time:74709ms step_avg:149.72ms step:510/1480 train_time:74865ms step_avg:149.73ms step:511/1480 train_time:75021ms step_avg:149.74ms step:512/1480 train_time:75178ms step_avg:149.76ms step:513/1480 train_time:75336ms step_avg:149.77ms step:514/1480 train_time:75493ms step_avg:149.79ms step:515/1480 train_time:75652ms step_avg:149.81ms step:516/1480 train_time:75810ms step_avg:149.82ms step:517/1480 train_time:75966ms step_avg:149.83ms step:518/1480 train_time:76123ms step_avg:149.85ms step:519/1480 train_time:76279ms step_avg:149.86ms step:520/1480 train_time:76437ms step_avg:149.88ms step:521/1480 train_time:76594ms step_avg:149.89ms step:522/1480 train_time:76753ms step_avg:149.91ms step:523/1480 train_time:76909ms step_avg:149.92ms step:524/1480 train_time:77065ms step_avg:149.93ms step:525/1480 train_time:77222ms step_avg:149.95ms step:526/1480 train_time:77379ms step_avg:149.96ms step:527/1480 train_time:77535ms step_avg:149.97ms step:528/1480 train_time:77692ms step_avg:149.98ms step:529/1480 train_time:77849ms step_avg:150.00ms step:530/1480 train_time:78006ms step_avg:150.01ms step:531/1480 train_time:78163ms step_avg:150.03ms step:532/1480 train_time:78321ms step_avg:150.04ms step:533/1480 train_time:78478ms 
step_avg:150.05ms step:534/1480 train_time:78633ms step_avg:150.06ms step:535/1480 train_time:78788ms step_avg:150.07ms step:536/1480 train_time:78946ms step_avg:150.09ms step:537/1480 train_time:79102ms step_avg:150.10ms step:538/1480 train_time:79261ms step_avg:150.12ms step:539/1480 train_time:79419ms step_avg:150.13ms step:540/1480 train_time:79576ms step_avg:150.14ms step:541/1480 train_time:79733ms step_avg:150.16ms step:542/1480 train_time:79889ms step_avg:150.17ms step:543/1480 train_time:80045ms step_avg:150.18ms step:544/1480 train_time:80201ms step_avg:150.19ms step:545/1480 train_time:80359ms step_avg:150.20ms step:546/1480 train_time:80516ms step_avg:150.22ms step:547/1480 train_time:80673ms step_avg:150.23ms step:548/1480 train_time:80831ms step_avg:150.24ms step:549/1480 train_time:80987ms step_avg:150.25ms step:550/1480 train_time:81146ms step_avg:150.27ms step:551/1480 train_time:81303ms step_avg:150.28ms step:552/1480 train_time:81462ms step_avg:150.30ms step:553/1480 train_time:81621ms step_avg:150.32ms step:554/1480 train_time:81781ms step_avg:150.33ms step:555/1480 train_time:81941ms step_avg:150.35ms step:556/1480 train_time:82099ms step_avg:150.37ms step:557/1480 train_time:82262ms step_avg:150.39ms step:558/1480 train_time:82422ms step_avg:150.40ms step:559/1480 train_time:82582ms step_avg:150.42ms step:560/1480 train_time:82741ms step_avg:150.44ms step:561/1480 train_time:82900ms step_avg:150.45ms step:562/1480 train_time:83061ms step_avg:150.47ms step:563/1480 train_time:83220ms step_avg:150.49ms step:564/1480 train_time:83379ms step_avg:150.50ms step:565/1480 train_time:83539ms step_avg:150.52ms step:566/1480 train_time:83700ms step_avg:150.54ms step:567/1480 train_time:83860ms step_avg:150.56ms step:568/1480 train_time:84019ms step_avg:150.57ms step:569/1480 train_time:84177ms step_avg:150.59ms step:570/1480 train_time:84336ms step_avg:150.60ms step:571/1480 train_time:84496ms step_avg:150.62ms step:572/1480 train_time:84657ms 
step_avg:150.64ms step:573/1480 train_time:84817ms step_avg:150.65ms step:574/1480 train_time:84979ms step_avg:150.67ms step:575/1480 train_time:85140ms step_avg:150.69ms step:576/1480 train_time:85299ms step_avg:150.71ms step:577/1480 train_time:85460ms step_avg:150.72ms step:578/1480 train_time:85619ms step_avg:150.74ms step:579/1480 train_time:85779ms step_avg:150.75ms step:580/1480 train_time:85939ms step_avg:150.77ms step:581/1480 train_time:86100ms step_avg:150.79ms step:582/1480 train_time:86261ms step_avg:150.81ms step:583/1480 train_time:86421ms step_avg:150.82ms step:584/1480 train_time:86580ms step_avg:150.84ms step:585/1480 train_time:86739ms step_avg:150.85ms step:586/1480 train_time:86899ms step_avg:150.87ms step:587/1480 train_time:87059ms step_avg:150.88ms step:588/1480 train_time:87220ms step_avg:150.90ms step:589/1480 train_time:87380ms step_avg:150.92ms step:590/1480 train_time:87542ms step_avg:150.93ms step:591/1480 train_time:87699ms step_avg:150.94ms step:592/1480 train_time:87861ms step_avg:150.96ms step:593/1480 train_time:88021ms step_avg:150.98ms step:594/1480 train_time:88182ms step_avg:151.00ms step:595/1480 train_time:88343ms step_avg:151.01ms step:596/1480 train_time:88503ms step_avg:151.03ms step:597/1480 train_time:88662ms step_avg:151.04ms step:598/1480 train_time:88820ms step_avg:151.05ms step:599/1480 train_time:88979ms step_avg:151.07ms step:600/1480 train_time:89139ms step_avg:151.08ms step:601/1480 train_time:89299ms step_avg:151.10ms step:602/1480 train_time:89459ms step_avg:151.11ms step:603/1480 train_time:89621ms step_avg:151.13ms step:604/1480 train_time:89781ms step_avg:151.15ms step:605/1480 train_time:89941ms step_avg:151.16ms step:606/1480 train_time:90102ms step_avg:151.18ms step:607/1480 train_time:90264ms step_avg:151.20ms step:608/1480 train_time:90422ms step_avg:151.21ms step:609/1480 train_time:90582ms step_avg:151.22ms step:610/1480 train_time:90741ms step_avg:151.23ms step:611/1480 train_time:90901ms 
step_avg:151.25ms step:612/1480 train_time:91061ms step_avg:151.26ms step:613/1480 train_time:91222ms step_avg:151.28ms step:614/1480 train_time:91380ms step_avg:151.29ms step:615/1480 train_time:91539ms step_avg:151.30ms step:616/1480 train_time:91697ms step_avg:151.32ms step:617/1480 train_time:91856ms step_avg:151.33ms step:618/1480 train_time:92015ms step_avg:151.34ms step:619/1480 train_time:92175ms step_avg:151.35ms step:620/1480 train_time:92335ms step_avg:151.37ms step:621/1480 train_time:92494ms step_avg:151.38ms step:622/1480 train_time:92654ms step_avg:151.39ms step:623/1480 train_time:92812ms step_avg:151.41ms step:624/1480 train_time:92971ms step_avg:151.42ms step:625/1480 train_time:93129ms step_avg:151.43ms step:625/1480 val_loss:3.6086 train_time:93191ms step_avg:151.53ms step:626/1480 train_time:93291ms step_avg:151.45ms step:627/1480 train_time:93451ms step_avg:151.46ms step:628/1480 train_time:93610ms step_avg:151.47ms step:629/1480 train_time:93767ms step_avg:151.48ms step:630/1480 train_time:93925ms step_avg:151.49ms step:631/1480 train_time:94083ms step_avg:151.50ms step:632/1480 train_time:94242ms step_avg:151.51ms step:633/1480 train_time:94402ms step_avg:151.53ms step:634/1480 train_time:94561ms step_avg:151.54ms step:635/1480 train_time:94720ms step_avg:151.55ms step:636/1480 train_time:94878ms step_avg:151.56ms step:637/1480 train_time:95038ms step_avg:151.58ms step:638/1480 train_time:95197ms step_avg:151.59ms step:639/1480 train_time:95356ms step_avg:151.60ms step:640/1480 train_time:95515ms step_avg:151.61ms step:641/1480 train_time:95675ms step_avg:151.62ms step:642/1480 train_time:95836ms step_avg:151.64ms step:643/1480 train_time:95996ms step_avg:151.65ms step:644/1480 train_time:96156ms step_avg:151.67ms step:645/1480 train_time:96317ms step_avg:151.68ms step:646/1480 train_time:96476ms step_avg:151.69ms step:647/1480 train_time:96635ms step_avg:151.70ms step:648/1480 train_time:96798ms step_avg:151.72ms step:649/1480 
train_time:96957ms step_avg:151.73ms step:650/1480 train_time:97117ms step_avg:151.74ms step:651/1480 train_time:97277ms step_avg:151.76ms step:652/1480 train_time:97437ms step_avg:151.77ms step:653/1480 train_time:97595ms step_avg:151.78ms step:654/1480 train_time:97755ms step_avg:151.79ms step:655/1480 train_time:97916ms step_avg:151.81ms step:656/1480 train_time:98076ms step_avg:151.82ms step:657/1480 train_time:98237ms step_avg:151.83ms step:658/1480 train_time:98398ms step_avg:151.85ms step:659/1480 train_time:98558ms step_avg:151.86ms step:660/1480 train_time:98720ms step_avg:151.88ms step:661/1480 train_time:98880ms step_avg:151.89ms step:662/1480 train_time:99039ms step_avg:151.90ms step:663/1480 train_time:99198ms step_avg:151.91ms step:664/1480 train_time:99360ms step_avg:151.93ms step:665/1480 train_time:99523ms step_avg:151.94ms step:666/1480 train_time:99683ms step_avg:151.96ms step:667/1480 train_time:99844ms step_avg:151.97ms step:668/1480 train_time:100006ms step_avg:151.98ms step:669/1480 train_time:100169ms step_avg:152.00ms step:670/1480 train_time:100329ms step_avg:152.01ms step:671/1480 train_time:100490ms step_avg:152.03ms step:672/1480 train_time:100652ms step_avg:152.04ms step:673/1480 train_time:100817ms step_avg:152.06ms step:674/1480 train_time:100978ms step_avg:152.08ms step:675/1480 train_time:101140ms step_avg:152.09ms step:676/1480 train_time:101302ms step_avg:152.10ms step:677/1480 train_time:101464ms step_avg:152.12ms step:678/1480 train_time:101625ms step_avg:152.13ms step:679/1480 train_time:101785ms step_avg:152.15ms step:680/1480 train_time:101947ms step_avg:152.16ms step:681/1480 train_time:102108ms step_avg:152.17ms step:682/1480 train_time:102271ms step_avg:152.19ms step:683/1480 train_time:102435ms step_avg:152.21ms step:684/1480 train_time:102596ms step_avg:152.22ms step:685/1480 train_time:102760ms step_avg:152.24ms step:686/1480 train_time:102921ms step_avg:152.25ms step:687/1480 train_time:103081ms step_avg:152.26ms 
step:688/1480 train_time:103243ms step_avg:152.28ms step:689/1480 train_time:103404ms step_avg:152.29ms step:690/1480 train_time:103567ms step_avg:152.30ms step:691/1480 train_time:103726ms step_avg:152.31ms step:692/1480 train_time:103887ms step_avg:152.33ms step:693/1480 train_time:104049ms step_avg:152.34ms step:694/1480 train_time:104211ms step_avg:152.36ms step:695/1480 train_time:104372ms step_avg:152.37ms step:696/1480 train_time:104536ms step_avg:152.38ms step:697/1480 train_time:104699ms step_avg:152.40ms step:698/1480 train_time:104859ms step_avg:152.41ms step:699/1480 train_time:105021ms step_avg:152.43ms step:700/1480 train_time:105182ms step_avg:152.44ms step:701/1480 train_time:105342ms step_avg:152.45ms step:702/1480 train_time:105504ms step_avg:152.46ms step:703/1480 train_time:105664ms step_avg:152.47ms step:704/1480 train_time:105825ms step_avg:152.49ms step:705/1480 train_time:105988ms step_avg:152.50ms step:706/1480 train_time:106152ms step_avg:152.52ms step:707/1480 train_time:106313ms step_avg:152.53ms step:708/1480 train_time:106474ms step_avg:152.54ms step:709/1480 train_time:106637ms step_avg:152.56ms step:710/1480 train_time:106798ms step_avg:152.57ms step:711/1480 train_time:106960ms step_avg:152.58ms step:712/1480 train_time:107126ms step_avg:152.60ms step:713/1480 train_time:107290ms step_avg:152.62ms step:714/1480 train_time:107452ms step_avg:152.63ms step:715/1480 train_time:107613ms step_avg:152.64ms step:716/1480 train_time:107774ms step_avg:152.65ms step:717/1480 train_time:107939ms step_avg:152.67ms step:718/1480 train_time:108098ms step_avg:152.68ms step:719/1480 train_time:108258ms step_avg:152.69ms step:720/1480 train_time:108422ms step_avg:152.71ms step:721/1480 train_time:108583ms step_avg:152.72ms step:722/1480 train_time:108744ms step_avg:152.73ms step:723/1480 train_time:108904ms step_avg:152.74ms step:724/1480 train_time:109065ms step_avg:152.75ms step:725/1480 train_time:109230ms step_avg:152.77ms step:726/1480 
train_time:109395ms step_avg:152.79ms step:727/1480 train_time:109558ms step_avg:152.80ms step:728/1480 train_time:109718ms step_avg:152.81ms step:729/1480 train_time:109878ms step_avg:152.82ms step:730/1480 train_time:110041ms step_avg:152.84ms step:731/1480 train_time:110202ms step_avg:152.85ms step:732/1480 train_time:110362ms step_avg:152.86ms step:733/1480 train_time:110524ms step_avg:152.87ms step:734/1480 train_time:110685ms step_avg:152.88ms step:735/1480 train_time:110845ms step_avg:152.89ms step:736/1480 train_time:111007ms step_avg:152.90ms step:737/1480 train_time:111166ms step_avg:152.91ms step:738/1480 train_time:111327ms step_avg:152.92ms step:739/1480 train_time:111486ms step_avg:152.93ms step:740/1480 train_time:111652ms step_avg:152.95ms step:741/1480 train_time:111815ms step_avg:152.96ms step:742/1480 train_time:111977ms step_avg:152.97ms step:743/1480 train_time:112139ms step_avg:152.99ms step:744/1480 train_time:112301ms step_avg:153.00ms step:745/1480 train_time:112464ms step_avg:153.01ms step:746/1480 train_time:112624ms step_avg:153.02ms step:747/1480 train_time:112788ms step_avg:153.04ms step:748/1480 train_time:112953ms step_avg:153.05ms step:749/1480 train_time:113118ms step_avg:153.07ms step:750/1480 train_time:113278ms step_avg:153.08ms step:750/1480 val_loss:3.5496 train_time:113342ms step_avg:153.17ms step:751/1480 train_time:113443ms step_avg:153.10ms step:752/1480 train_time:113605ms step_avg:153.11ms step:753/1480 train_time:113765ms step_avg:153.12ms step:754/1480 train_time:113927ms step_avg:153.13ms step:755/1480 train_time:114088ms step_avg:153.14ms step:756/1480 train_time:114250ms step_avg:153.15ms step:757/1480 train_time:114414ms step_avg:153.17ms step:758/1480 train_time:114576ms step_avg:153.18ms step:759/1480 train_time:114739ms step_avg:153.19ms step:760/1480 train_time:114902ms step_avg:153.20ms step:761/1480 train_time:115064ms step_avg:153.21ms step:762/1480 train_time:115225ms step_avg:153.22ms step:763/1480 
train_time:115386ms step_avg:153.23ms step:764/1480 train_time:115547ms step_avg:153.24ms step:765/1480 train_time:115707ms step_avg:153.25ms step:766/1480 train_time:115870ms step_avg:153.27ms step:767/1480 train_time:116032ms step_avg:153.28ms step:768/1480 train_time:116194ms step_avg:153.29ms step:769/1480 train_time:116358ms step_avg:153.30ms step:770/1480 train_time:116522ms step_avg:153.32ms step:771/1480 train_time:116686ms step_avg:153.33ms step:772/1480 train_time:116846ms step_avg:153.34ms step:773/1480 train_time:117008ms step_avg:153.35ms step:774/1480 train_time:117170ms step_avg:153.36ms step:775/1480 train_time:117333ms step_avg:153.38ms step:776/1480 train_time:117499ms step_avg:153.39ms step:777/1480 train_time:117665ms step_avg:153.41ms step:778/1480 train_time:117827ms step_avg:153.42ms step:779/1480 train_time:117990ms step_avg:153.43ms step:780/1480 train_time:118155ms step_avg:153.45ms step:781/1480 train_time:118318ms step_avg:153.46ms step:782/1480 train_time:118483ms step_avg:153.48ms step:783/1480 train_time:118644ms step_avg:153.48ms step:784/1480 train_time:118808ms step_avg:153.50ms step:785/1480 train_time:118969ms step_avg:153.51ms step:786/1480 train_time:119133ms step_avg:153.52ms step:787/1480 train_time:119296ms step_avg:153.53ms step:788/1480 train_time:119460ms step_avg:153.55ms step:789/1480 train_time:119623ms step_avg:153.56ms step:790/1480 train_time:119789ms step_avg:153.58ms step:791/1480 train_time:119956ms step_avg:153.59ms step:792/1480 train_time:120122ms step_avg:153.61ms step:793/1480 train_time:120283ms step_avg:153.62ms step:794/1480 train_time:120446ms step_avg:153.63ms step:795/1480 train_time:120612ms step_avg:153.65ms step:796/1480 train_time:120778ms step_avg:153.66ms step:797/1480 train_time:120942ms step_avg:153.68ms step:798/1480 train_time:121106ms step_avg:153.69ms step:799/1480 train_time:121274ms step_avg:153.71ms step:800/1480 train_time:121439ms step_avg:153.72ms step:801/1480 train_time:121602ms 
step_avg:153.73ms step:802/1480 train_time:121769ms step_avg:153.75ms step:803/1480 train_time:121931ms step_avg:153.76ms step:804/1480 train_time:122093ms step_avg:153.77ms step:805/1480 train_time:122259ms step_avg:153.78ms step:806/1480 train_time:122421ms step_avg:153.79ms step:807/1480 train_time:122582ms step_avg:153.80ms step:808/1480 train_time:122745ms step_avg:153.82ms step:809/1480 train_time:122907ms step_avg:153.83ms step:810/1480 train_time:123069ms step_avg:153.84ms step:811/1480 train_time:123231ms step_avg:153.85ms step:812/1480 train_time:123397ms step_avg:153.86ms step:813/1480 train_time:123560ms step_avg:153.87ms step:814/1480 train_time:123723ms step_avg:153.88ms step:815/1480 train_time:123885ms step_avg:153.89ms step:816/1480 train_time:124049ms step_avg:153.91ms step:817/1480 train_time:124210ms step_avg:153.92ms step:818/1480 train_time:124371ms step_avg:153.92ms step:819/1480 train_time:124536ms step_avg:153.94ms step:820/1480 train_time:124701ms step_avg:153.95ms step:821/1480 train_time:124863ms step_avg:153.96ms step:822/1480 train_time:125027ms step_avg:153.97ms step:823/1480 train_time:125189ms step_avg:153.98ms step:824/1480 train_time:125351ms step_avg:153.99ms step:825/1480 train_time:125515ms step_avg:154.01ms step:826/1480 train_time:125682ms step_avg:154.02ms step:827/1480 train_time:125846ms step_avg:154.03ms step:828/1480 train_time:126008ms step_avg:154.04ms step:829/1480 train_time:126171ms step_avg:154.06ms step:830/1480 train_time:126337ms step_avg:154.07ms step:831/1480 train_time:126502ms step_avg:154.08ms step:832/1480 train_time:126665ms step_avg:154.09ms step:833/1480 train_time:126829ms step_avg:154.11ms step:834/1480 train_time:126995ms step_avg:154.12ms step:835/1480 train_time:127159ms step_avg:154.13ms step:836/1480 train_time:127323ms step_avg:154.14ms step:837/1480 train_time:127485ms step_avg:154.15ms step:838/1480 train_time:127648ms step_avg:154.16ms step:839/1480 train_time:127811ms step_avg:154.17ms 
step:840/1480 train_time:127972ms step_avg:154.18ms step:841/1480 train_time:128133ms step_avg:154.19ms step:842/1480 train_time:128297ms step_avg:154.20ms step:843/1480 train_time:128460ms step_avg:154.21ms step:844/1480 train_time:128622ms step_avg:154.22ms step:845/1480 train_time:128787ms step_avg:154.24ms step:846/1480 train_time:128952ms step_avg:154.25ms step:847/1480 train_time:129116ms step_avg:154.26ms step:848/1480 train_time:129279ms step_avg:154.27ms step:849/1480 train_time:129441ms step_avg:154.28ms step:850/1480 train_time:129605ms step_avg:154.29ms step:851/1480 train_time:129769ms step_avg:154.30ms step:852/1480 train_time:129930ms step_avg:154.31ms step:853/1480 train_time:130092ms step_avg:154.32ms step:854/1480 train_time:130258ms step_avg:154.33ms step:855/1480 train_time:130422ms step_avg:154.35ms step:856/1480 train_time:130583ms step_avg:154.35ms step:857/1480 train_time:130747ms step_avg:154.36ms step:858/1480 train_time:130911ms step_avg:154.38ms step:859/1480 train_time:131077ms step_avg:154.39ms step:860/1480 train_time:131240ms step_avg:154.40ms step:861/1480 train_time:131406ms step_avg:154.41ms step:862/1480 train_time:131573ms step_avg:154.43ms step:863/1480 train_time:131742ms step_avg:154.45ms step:864/1480 train_time:131906ms step_avg:154.46ms step:865/1480 train_time:132067ms step_avg:154.46ms step:866/1480 train_time:132234ms step_avg:154.48ms step:867/1480 train_time:132398ms step_avg:154.49ms step:868/1480 train_time:132562ms step_avg:154.50ms step:869/1480 train_time:132724ms step_avg:154.51ms step:870/1480 train_time:132889ms step_avg:154.52ms step:871/1480 train_time:133052ms step_avg:154.53ms step:872/1480 train_time:133215ms step_avg:154.54ms step:873/1480 train_time:133380ms step_avg:154.55ms step:874/1480 train_time:133545ms step_avg:154.57ms step:875/1480 train_time:133709ms step_avg:154.58ms step:875/1480 val_loss:3.5072 train_time:133774ms step_avg:154.65ms step:876/1480 train_time:133874ms step_avg:154.59ms 
step:877/1480 train_time:134039ms step_avg:154.60ms step:878/1480 train_time:134202ms step_avg:154.61ms step:879/1480 train_time:134365ms step_avg:154.62ms step:880/1480 train_time:134530ms step_avg:154.63ms step:881/1480 train_time:134691ms step_avg:154.64ms step:882/1480 train_time:134857ms step_avg:154.65ms step:883/1480 train_time:135024ms step_avg:154.67ms step:884/1480 train_time:135191ms step_avg:154.68ms step:885/1480 train_time:135357ms step_avg:154.69ms step:886/1480 train_time:135523ms step_avg:154.71ms step:887/1480 train_time:135691ms step_avg:154.72ms step:888/1480 train_time:135863ms step_avg:154.74ms step:889/1480 train_time:136031ms step_avg:154.76ms step:890/1480 train_time:136192ms step_avg:154.76ms step:891/1480 train_time:136357ms step_avg:154.78ms step:892/1480 train_time:136522ms step_avg:154.79ms step:893/1480 train_time:136685ms step_avg:154.80ms step:894/1480 train_time:136853ms step_avg:154.81ms step:895/1480 train_time:137018ms step_avg:154.82ms step:896/1480 train_time:137183ms step_avg:154.83ms step:897/1480 train_time:137351ms step_avg:154.85ms step:898/1480 train_time:137517ms step_avg:154.86ms step:899/1480 train_time:137681ms step_avg:154.87ms step:900/1480 train_time:137845ms step_avg:154.88ms step:901/1480 train_time:138010ms step_avg:154.89ms step:902/1480 train_time:138174ms step_avg:154.90ms step:903/1480 train_time:138345ms step_avg:154.92ms step:904/1480 train_time:138511ms step_avg:154.93ms step:905/1480 train_time:138673ms step_avg:154.94ms step:906/1480 train_time:138840ms step_avg:154.95ms step:907/1480 train_time:139008ms step_avg:154.97ms step:908/1480 train_time:139171ms step_avg:154.98ms step:909/1480 train_time:139337ms step_avg:154.99ms step:910/1480 train_time:139508ms step_avg:155.01ms step:911/1480 train_time:139673ms step_avg:155.02ms step:912/1480 train_time:139838ms step_avg:155.03ms step:913/1480 train_time:140008ms step_avg:155.05ms step:914/1480 train_time:140177ms step_avg:155.06ms step:915/1480 
train_time:140348ms step_avg:155.08ms step:916/1480 train_time:140511ms step_avg:155.09ms step:917/1480 train_time:140674ms step_avg:155.10ms step:918/1480 train_time:140842ms step_avg:155.11ms step:919/1480 train_time:141012ms step_avg:155.13ms step:920/1480 train_time:141178ms step_avg:155.14ms step:921/1480 train_time:141343ms step_avg:155.15ms step:922/1480 train_time:141510ms step_avg:155.16ms step:923/1480 train_time:141672ms step_avg:155.17ms step:924/1480 train_time:141837ms step_avg:155.18ms step:925/1480 train_time:142004ms step_avg:155.20ms step:926/1480 train_time:142168ms step_avg:155.20ms step:927/1480 train_time:142332ms step_avg:155.21ms step:928/1480 train_time:142496ms step_avg:155.22ms step:929/1480 train_time:142660ms step_avg:155.23ms step:930/1480 train_time:142827ms step_avg:155.25ms step:931/1480 train_time:142991ms step_avg:155.26ms step:932/1480 train_time:143156ms step_avg:155.27ms step:933/1480 train_time:143324ms step_avg:155.28ms step:934/1480 train_time:143490ms step_avg:155.29ms step:935/1480 train_time:143659ms step_avg:155.31ms step:936/1480 train_time:143828ms step_avg:155.32ms step:937/1480 train_time:143996ms step_avg:155.34ms step:938/1480 train_time:144158ms step_avg:155.34ms step:939/1480 train_time:144327ms step_avg:155.36ms step:940/1480 train_time:144493ms step_avg:155.37ms step:941/1480 train_time:144657ms step_avg:155.38ms step:942/1480 train_time:144822ms step_avg:155.39ms step:943/1480 train_time:144993ms step_avg:155.40ms step:944/1480 train_time:145165ms step_avg:155.42ms step:945/1480 train_time:145329ms step_avg:155.43ms step:946/1480 train_time:145497ms step_avg:155.45ms step:947/1480 train_time:145666ms step_avg:155.46ms step:948/1480 train_time:145832ms step_avg:155.47ms step:949/1480 train_time:145998ms step_avg:155.48ms step:950/1480 train_time:146162ms step_avg:155.49ms step:951/1480 train_time:146330ms step_avg:155.51ms step:952/1480 train_time:146495ms step_avg:155.51ms step:953/1480 train_time:146663ms 
step_avg:155.53ms step:954/1480 train_time:146831ms step_avg:155.54ms step:955/1480 train_time:146993ms step_avg:155.55ms step:956/1480 train_time:147158ms step_avg:155.56ms step:957/1480 train_time:147326ms step_avg:155.57ms step:958/1480 train_time:147494ms step_avg:155.58ms step:959/1480 train_time:147658ms step_avg:155.59ms step:960/1480 train_time:147827ms step_avg:155.61ms step:961/1480 train_time:147992ms step_avg:155.62ms step:962/1480 train_time:148155ms step_avg:155.63ms step:963/1480 train_time:148322ms step_avg:155.64ms step:964/1480 train_time:148491ms step_avg:155.65ms step:965/1480 train_time:148655ms step_avg:155.66ms step:966/1480 train_time:148821ms step_avg:155.67ms step:967/1480 train_time:148984ms step_avg:155.68ms step:968/1480 train_time:149149ms step_avg:155.69ms step:969/1480 train_time:149315ms step_avg:155.70ms step:970/1480 train_time:149479ms step_avg:155.71ms step:971/1480 train_time:149644ms step_avg:155.72ms step:972/1480 train_time:149809ms step_avg:155.73ms step:973/1480 train_time:149974ms step_avg:155.74ms step:974/1480 train_time:150142ms step_avg:155.75ms step:975/1480 train_time:150309ms step_avg:155.76ms step:976/1480 train_time:150474ms step_avg:155.77ms step:977/1480 train_time:150638ms step_avg:155.78ms step:978/1480 train_time:150803ms step_avg:155.79ms step:979/1480 train_time:150969ms step_avg:155.80ms step:980/1480 train_time:151134ms step_avg:155.81ms step:981/1480 train_time:151305ms step_avg:155.82ms step:982/1480 train_time:151469ms step_avg:155.83ms step:983/1480 train_time:151633ms step_avg:155.84ms step:984/1480 train_time:151798ms step_avg:155.85ms step:985/1480 train_time:151966ms step_avg:155.86ms step:986/1480 train_time:152134ms step_avg:155.88ms step:987/1480 train_time:152299ms step_avg:155.88ms step:988/1480 train_time:152467ms step_avg:155.90ms step:989/1480 train_time:152633ms step_avg:155.91ms step:990/1480 train_time:152803ms step_avg:155.92ms step:991/1480 train_time:152971ms step_avg:155.93ms 
step:992/1480 train_time:153146ms step_avg:155.95ms step:993/1480 train_time:153325ms step_avg:155.98ms step:994/1480 train_time:153492ms step_avg:155.99ms step:995/1480 train_time:153655ms step_avg:156.00ms step:996/1480 train_time:153818ms step_avg:156.00ms step:997/1480 train_time:153984ms step_avg:156.01ms step:998/1480 train_time:154149ms step_avg:156.02ms step:999/1480 train_time:154315ms step_avg:156.03ms step:1000/1480 train_time:154487ms step_avg:156.05ms step:1000/1480 val_loss:3.4406 train_time:154556ms step_avg:156.12ms step:1001/1480 train_time:154656ms step_avg:156.06ms step:1002/1480 train_time:154822ms step_avg:156.07ms step:1003/1480 train_time:154992ms step_avg:156.08ms step:1004/1480 train_time:155162ms step_avg:156.10ms step:1005/1480 train_time:155329ms step_avg:156.11ms step:1006/1480 train_time:155498ms step_avg:156.12ms step:1007/1480 train_time:155665ms step_avg:156.13ms step:1008/1480 train_time:155831ms step_avg:156.14ms step:1009/1480 train_time:156006ms step_avg:156.16ms step:1010/1480 train_time:156171ms step_avg:156.17ms step:1011/1480 train_time:156336ms step_avg:156.18ms step:1012/1480 train_time:156500ms step_avg:156.19ms step:1013/1480 train_time:156671ms step_avg:156.20ms step:1014/1480 train_time:156840ms step_avg:156.21ms step:1015/1480 train_time:157010ms step_avg:156.23ms step:1016/1480 train_time:157176ms step_avg:156.24ms step:1017/1480 train_time:157347ms step_avg:156.25ms step:1018/1480 train_time:157515ms step_avg:156.26ms step:1019/1480 train_time:157685ms step_avg:156.28ms step:1020/1480 train_time:157854ms step_avg:156.29ms step:1021/1480 train_time:158021ms step_avg:156.30ms step:1022/1480 train_time:158189ms step_avg:156.31ms step:1023/1480 train_time:158356ms step_avg:156.32ms step:1024/1480 train_time:158524ms step_avg:156.34ms step:1025/1480 train_time:158694ms step_avg:156.35ms step:1026/1480 train_time:158859ms step_avg:156.36ms step:1027/1480 train_time:159026ms step_avg:156.37ms step:1028/1480 
train_time:159197ms step_avg:156.38ms step:1029/1480 train_time:159371ms step_avg:156.40ms step:1030/1480 train_time:159538ms step_avg:156.41ms step:1031/1480 train_time:159701ms step_avg:156.42ms step:1032/1480 train_time:159873ms step_avg:156.43ms step:1033/1480 train_time:160039ms step_avg:156.44ms step:1034/1480 train_time:160208ms step_avg:156.45ms step:1035/1480 train_time:160375ms step_avg:156.46ms step:1036/1480 train_time:160541ms step_avg:156.47ms step:1037/1480 train_time:160709ms step_avg:156.48ms step:1038/1480 train_time:160878ms step_avg:156.50ms step:1039/1480 train_time:161049ms step_avg:156.51ms step:1040/1480 train_time:161214ms step_avg:156.52ms step:1041/1480 train_time:161382ms step_avg:156.53ms step:1042/1480 train_time:161547ms step_avg:156.54ms step:1043/1480 train_time:161713ms step_avg:156.55ms step:1044/1480 train_time:161878ms step_avg:156.55ms step:1045/1480 train_time:162048ms step_avg:156.57ms step:1046/1480 train_time:162216ms step_avg:156.58ms step:1047/1480 train_time:162382ms step_avg:156.59ms step:1048/1480 train_time:162549ms step_avg:156.60ms step:1049/1480 train_time:162714ms step_avg:156.61ms step:1050/1480 train_time:162884ms step_avg:156.62ms step:1051/1480 train_time:163052ms step_avg:156.63ms step:1052/1480 train_time:163221ms step_avg:156.64ms step:1053/1480 train_time:163387ms step_avg:156.65ms step:1054/1480 train_time:163555ms step_avg:156.66ms step:1055/1480 train_time:163720ms step_avg:156.67ms step:1056/1480 train_time:163886ms step_avg:156.68ms step:1057/1480 train_time:164051ms step_avg:156.69ms step:1058/1480 train_time:164219ms step_avg:156.70ms step:1059/1480 train_time:164392ms step_avg:156.71ms step:1060/1480 train_time:164560ms step_avg:156.72ms step:1061/1480 train_time:164724ms step_avg:156.73ms step:1062/1480 train_time:164891ms step_avg:156.74ms step:1063/1480 train_time:165055ms step_avg:156.75ms step:1064/1480 train_time:165218ms step_avg:156.75ms step:1065/1480 train_time:165387ms step_avg:156.76ms 
step:1066/1480 train_time:165554ms step_avg:156.77ms step:1067/1480 train_time:165724ms step_avg:156.79ms step:1068/1480 train_time:165890ms step_avg:156.80ms step:1069/1480 train_time:166062ms step_avg:156.81ms step:1070/1480 train_time:166229ms step_avg:156.82ms step:1071/1480 train_time:166403ms step_avg:156.84ms step:1072/1480 train_time:166570ms step_avg:156.85ms step:1073/1480 train_time:166732ms step_avg:156.85ms step:1074/1480 train_time:166898ms step_avg:156.86ms step:1075/1480 train_time:167069ms step_avg:156.87ms step:1076/1480 train_time:167235ms step_avg:156.88ms step:1077/1480 train_time:167401ms step_avg:156.89ms step:1078/1480 train_time:167576ms step_avg:156.91ms step:1079/1480 train_time:167748ms step_avg:156.92ms step:1080/1480 train_time:167917ms step_avg:156.93ms step:1081/1480 train_time:168086ms step_avg:156.94ms step:1082/1480 train_time:168251ms step_avg:156.95ms step:1083/1480 train_time:168418ms step_avg:156.96ms step:1084/1480 train_time:168586ms step_avg:156.97ms step:1085/1480 train_time:168753ms step_avg:156.98ms step:1086/1480 train_time:168921ms step_avg:156.99ms step:1087/1480 train_time:169088ms step_avg:157.00ms step:1088/1480 train_time:169257ms step_avg:157.01ms step:1089/1480 train_time:169429ms step_avg:157.02ms step:1090/1480 train_time:169600ms step_avg:157.04ms step:1091/1480 train_time:169770ms step_avg:157.05ms step:1092/1480 train_time:169939ms step_avg:157.06ms step:1093/1480 train_time:170107ms step_avg:157.07ms step:1094/1480 train_time:170273ms step_avg:157.08ms step:1095/1480 train_time:170439ms step_avg:157.09ms step:1096/1480 train_time:170607ms step_avg:157.10ms step:1097/1480 train_time:170775ms step_avg:157.11ms step:1098/1480 train_time:170947ms step_avg:157.12ms step:1099/1480 train_time:171118ms step_avg:157.13ms step:1100/1480 train_time:171289ms step_avg:157.15ms step:1101/1480 train_time:171461ms step_avg:157.16ms step:1102/1480 train_time:171631ms step_avg:157.17ms step:1103/1480 train_time:171809ms 
step_avg:157.19ms step:1104/1480 train_time:171976ms step_avg:157.20ms step:1105/1480 train_time:172148ms step_avg:157.21ms step:1106/1480 train_time:172316ms step_avg:157.22ms step:1107/1480 train_time:172485ms step_avg:157.23ms step:1108/1480 train_time:172650ms step_avg:157.24ms step:1109/1480 train_time:172815ms step_avg:157.25ms step:1110/1480 train_time:172982ms step_avg:157.26ms step:1111/1480 train_time:173148ms step_avg:157.26ms step:1112/1480 train_time:173317ms step_avg:157.28ms step:1113/1480 train_time:173496ms step_avg:157.30ms step:1114/1480 train_time:173670ms step_avg:157.31ms step:1115/1480 train_time:173841ms step_avg:157.32ms step:1116/1480 train_time:174008ms step_avg:157.33ms step:1117/1480 train_time:174182ms step_avg:157.35ms step:1118/1480 train_time:174356ms step_avg:157.36ms step:1119/1480 train_time:174522ms step_avg:157.37ms step:1120/1480 train_time:174692ms step_avg:157.38ms step:1121/1480 train_time:174861ms step_avg:157.39ms step:1122/1480 train_time:175027ms step_avg:157.40ms step:1123/1480 train_time:175193ms step_avg:157.41ms step:1124/1480 train_time:175362ms step_avg:157.42ms step:1125/1480 train_time:175530ms step_avg:157.43ms step:1125/1480 val_loss:3.3860 train_time:175598ms step_avg:157.49ms step:1126/1480 train_time:175699ms step_avg:157.44ms step:1127/1480 train_time:175869ms step_avg:157.45ms step:1128/1480 train_time:176041ms step_avg:157.46ms step:1129/1480 train_time:176216ms step_avg:157.48ms step:1130/1480 train_time:176384ms step_avg:157.49ms step:1131/1480 train_time:176561ms step_avg:157.50ms step:1132/1480 train_time:176726ms step_avg:157.51ms step:1133/1480 train_time:176899ms step_avg:157.52ms step:1134/1480 train_time:177071ms step_avg:157.54ms step:1135/1480 train_time:177237ms step_avg:157.54ms step:1136/1480 train_time:177409ms step_avg:157.56ms step:1137/1480 train_time:177576ms step_avg:157.57ms step:1138/1480 train_time:177748ms step_avg:157.58ms step:1139/1480 train_time:177917ms step_avg:157.59ms 
step:1140/1480 train_time:178085ms step_avg:157.60ms step:1141/1480 train_time:178257ms step_avg:157.61ms step:1142/1480 train_time:178424ms step_avg:157.62ms step:1143/1480 train_time:178596ms step_avg:157.63ms step:1144/1480 train_time:178763ms step_avg:157.64ms step:1145/1480 train_time:178929ms step_avg:157.65ms step:1146/1480 train_time:179100ms step_avg:157.66ms step:1147/1480 train_time:179271ms step_avg:157.67ms step:1148/1480 train_time:179438ms step_avg:157.68ms step:1149/1480 train_time:179610ms step_avg:157.69ms step:1150/1480 train_time:179777ms step_avg:157.70ms step:1151/1480 train_time:179950ms step_avg:157.71ms step:1152/1480 train_time:180121ms step_avg:157.72ms step:1153/1480 train_time:180295ms step_avg:157.74ms step:1154/1480 train_time:180462ms step_avg:157.75ms step:1155/1480 train_time:180635ms step_avg:157.76ms step:1156/1480 train_time:180816ms step_avg:157.78ms step:1157/1480 train_time:180985ms step_avg:157.79ms step:1158/1480 train_time:181152ms step_avg:157.80ms step:1159/1480 train_time:181320ms step_avg:157.81ms step:1160/1480 train_time:181487ms step_avg:157.81ms step:1161/1480 train_time:181656ms step_avg:157.82ms step:1162/1480 train_time:181825ms step_avg:157.83ms step:1163/1480 train_time:181996ms step_avg:157.85ms step:1164/1480 train_time:182164ms step_avg:157.85ms step:1165/1480 train_time:182330ms step_avg:157.86ms step:1166/1480 train_time:182499ms step_avg:157.87ms step:1167/1480 train_time:182667ms step_avg:157.88ms step:1168/1480 train_time:182834ms step_avg:157.89ms step:1169/1480 train_time:183002ms step_avg:157.90ms step:1170/1480 train_time:183170ms step_avg:157.90ms step:1171/1480 train_time:183336ms step_avg:157.91ms step:1172/1480 train_time:183502ms step_avg:157.92ms step:1173/1480 train_time:183673ms step_avg:157.93ms step:1174/1480 train_time:183854ms step_avg:157.95ms step:1175/1480 train_time:184024ms step_avg:157.96ms step:1176/1480 train_time:184197ms step_avg:157.97ms step:1177/1480 train_time:184373ms 
step_avg:157.99ms step:1178/1480 train_time:184541ms step_avg:158.00ms step:1179/1480 train_time:184707ms step_avg:158.00ms step:1180/1480 train_time:184887ms step_avg:158.02ms step:1181/1480 train_time:185057ms step_avg:158.03ms step:1182/1480 train_time:185225ms step_avg:158.04ms step:1183/1480 train_time:185397ms step_avg:158.05ms step:1184/1480 train_time:185565ms step_avg:158.06ms step:1185/1480 train_time:185738ms step_avg:158.07ms step:1186/1480 train_time:185907ms step_avg:158.08ms step:1187/1480 train_time:186090ms step_avg:158.11ms step:1188/1480 train_time:186256ms step_avg:158.11ms step:1189/1480 train_time:186427ms step_avg:158.12ms step:1190/1480 train_time:186596ms step_avg:158.13ms step:1191/1480 train_time:186770ms step_avg:158.15ms step:1192/1480 train_time:186937ms step_avg:158.15ms step:1193/1480 train_time:187102ms step_avg:158.16ms step:1194/1480 train_time:187270ms step_avg:158.17ms step:1195/1480 train_time:187443ms step_avg:158.18ms step:1196/1480 train_time:187626ms step_avg:158.20ms step:1197/1480 train_time:187798ms step_avg:158.21ms step:1198/1480 train_time:187978ms step_avg:158.23ms step:1199/1480 train_time:188149ms step_avg:158.24ms step:1200/1480 train_time:188317ms step_avg:158.25ms step:1201/1480 train_time:188484ms step_avg:158.26ms step:1202/1480 train_time:188665ms step_avg:158.28ms step:1203/1480 train_time:188840ms step_avg:158.29ms step:1204/1480 train_time:189015ms step_avg:158.30ms step:1205/1480 train_time:189183ms step_avg:158.31ms step:1206/1480 train_time:189351ms step_avg:158.32ms step:1207/1480 train_time:189519ms step_avg:158.33ms step:1208/1480 train_time:189686ms step_avg:158.34ms step:1209/1480 train_time:189858ms step_avg:158.35ms step:1210/1480 train_time:190034ms step_avg:158.36ms step:1211/1480 train_time:190208ms step_avg:158.37ms step:1212/1480 train_time:190380ms step_avg:158.39ms step:1213/1480 train_time:190553ms step_avg:158.40ms step:1214/1480 train_time:190730ms step_avg:158.41ms step:1215/1480 
train_time:190905ms step_avg:158.43ms step:1216/1480 train_time:191074ms step_avg:158.44ms step:1217/1480 train_time:191246ms step_avg:158.45ms step:1218/1480 train_time:191417ms step_avg:158.46ms step:1219/1480 train_time:191596ms step_avg:158.47ms step:1220/1480 train_time:191765ms step_avg:158.48ms step:1221/1480 train_time:191935ms step_avg:158.49ms step:1222/1480 train_time:192101ms step_avg:158.50ms step:1223/1480 train_time:192272ms step_avg:158.51ms step:1224/1480 train_time:192452ms step_avg:158.53ms step:1225/1480 train_time:192623ms step_avg:158.54ms step:1226/1480 train_time:192796ms step_avg:158.55ms step:1227/1480 train_time:192969ms step_avg:158.56ms step:1228/1480 train_time:193139ms step_avg:158.57ms step:1229/1480 train_time:193312ms step_avg:158.58ms step:1230/1480 train_time:193490ms step_avg:158.60ms step:1231/1480 train_time:193666ms step_avg:158.61ms step:1232/1480 train_time:193840ms step_avg:158.63ms step:1233/1480 train_time:194011ms step_avg:158.64ms step:1234/1480 train_time:194180ms step_avg:158.64ms step:1235/1480 train_time:194355ms step_avg:158.66ms step:1236/1480 train_time:194523ms step_avg:158.66ms step:1237/1480 train_time:194694ms step_avg:158.68ms step:1238/1480 train_time:194878ms step_avg:158.70ms step:1239/1480 train_time:195048ms step_avg:158.70ms step:1240/1480 train_time:195219ms step_avg:158.71ms step:1241/1480 train_time:195392ms step_avg:158.73ms step:1242/1480 train_time:195561ms step_avg:158.73ms step:1243/1480 train_time:195734ms step_avg:158.75ms step:1244/1480 train_time:195900ms step_avg:158.75ms step:1245/1480 train_time:196070ms step_avg:158.76ms step:1246/1480 train_time:196239ms step_avg:158.77ms step:1247/1480 train_time:196410ms step_avg:158.78ms step:1248/1480 train_time:196579ms step_avg:158.79ms step:1249/1480 train_time:196748ms step_avg:158.80ms step:1250/1480 train_time:196918ms step_avg:158.80ms step:1250/1480 val_loss:3.3362 train_time:196989ms step_avg:158.86ms step:1251/1480 train_time:197101ms 
step_avg:158.82ms step:1252/1480 train_time:197270ms step_avg:158.83ms step:1253/1480 train_time:197439ms step_avg:158.84ms step:1254/1480 train_time:197609ms step_avg:158.85ms step:1255/1480 train_time:197795ms step_avg:158.87ms step:1256/1480 train_time:197966ms step_avg:158.88ms step:1257/1480 train_time:198138ms step_avg:158.89ms step:1258/1480 train_time:198311ms step_avg:158.90ms step:1259/1480 train_time:198482ms step_avg:158.91ms step:1260/1480 train_time:198648ms step_avg:158.92ms step:1261/1480 train_time:198821ms step_avg:158.93ms step:1262/1480 train_time:198995ms step_avg:158.94ms step:1263/1480 train_time:199169ms step_avg:158.95ms step:1264/1480 train_time:199337ms step_avg:158.96ms step:1265/1480 train_time:199503ms step_avg:158.97ms step:1266/1480 train_time:199676ms step_avg:158.98ms step:1267/1480 train_time:199845ms step_avg:158.99ms step:1268/1480 train_time:200017ms step_avg:159.00ms step:1269/1480 train_time:200191ms step_avg:159.01ms step:1270/1480 train_time:200361ms step_avg:159.02ms step:1271/1480 train_time:200533ms step_avg:159.03ms step:1272/1480 train_time:200700ms step_avg:159.03ms step:1273/1480 train_time:200871ms step_avg:159.04ms step:1274/1480 train_time:201043ms step_avg:159.05ms step:1275/1480 train_time:201209ms step_avg:159.06ms step:1276/1480 train_time:201376ms step_avg:159.06ms step:1277/1480 train_time:201548ms step_avg:159.07ms step:1278/1480 train_time:201717ms step_avg:159.08ms step:1279/1480 train_time:201887ms step_avg:159.09ms step:1280/1480 train_time:202065ms step_avg:159.11ms step:1281/1480 train_time:202234ms step_avg:159.11ms step:1282/1480 train_time:202400ms step_avg:159.12ms step:1283/1480 train_time:202569ms step_avg:159.13ms step:1284/1480 train_time:202740ms step_avg:159.14ms step:1285/1480 train_time:202907ms step_avg:159.14ms step:1286/1480 train_time:203078ms step_avg:159.15ms step:1287/1480 train_time:203251ms step_avg:159.16ms step:1288/1480 train_time:203423ms step_avg:159.17ms step:1289/1480 
train_time:203603ms step_avg:159.19ms step:1290/1480 train_time:203782ms step_avg:159.20ms step:1291/1480 train_time:203956ms step_avg:159.22ms step:1292/1480 train_time:204130ms step_avg:159.23ms step:1293/1480 train_time:204305ms step_avg:159.24ms step:1294/1480 train_time:204475ms step_avg:159.25ms step:1295/1480 train_time:204646ms step_avg:159.26ms step:1296/1480 train_time:204819ms step_avg:159.27ms step:1297/1480 train_time:204990ms step_avg:159.28ms step:1298/1480 train_time:205162ms step_avg:159.29ms step:1299/1480 train_time:205334ms step_avg:159.30ms step:1300/1480 train_time:205503ms step_avg:159.30ms step:1301/1480 train_time:205670ms step_avg:159.31ms step:1302/1480 train_time:205844ms step_avg:159.32ms step:1303/1480 train_time:206022ms step_avg:159.34ms step:1304/1480 train_time:206195ms step_avg:159.35ms step:1305/1480 train_time:206363ms step_avg:159.35ms step:1306/1480 train_time:206540ms step_avg:159.37ms step:1307/1480 train_time:206708ms step_avg:159.37ms step:1308/1480 train_time:206877ms step_avg:159.38ms step:1309/1480 train_time:207048ms step_avg:159.39ms step:1310/1480 train_time:207218ms step_avg:159.40ms step:1311/1480 train_time:207386ms step_avg:159.41ms step:1312/1480 train_time:207560ms step_avg:159.42ms step:1313/1480 train_time:207728ms step_avg:159.42ms step:1314/1480 train_time:207902ms step_avg:159.43ms step:1315/1480 train_time:208073ms step_avg:159.44ms step:1316/1480 train_time:208241ms step_avg:159.45ms step:1317/1480 train_time:208413ms step_avg:159.46ms step:1318/1480 train_time:208593ms step_avg:159.48ms step:1319/1480 train_time:208769ms step_avg:159.49ms step:1320/1480 train_time:208946ms step_avg:159.50ms step:1321/1480 train_time:209119ms step_avg:159.51ms step:1322/1480 train_time:209299ms step_avg:159.53ms step:1323/1480 train_time:209471ms step_avg:159.54ms step:1324/1480 train_time:209647ms step_avg:159.55ms step:1325/1480 train_time:209828ms step_avg:159.57ms step:1326/1480 train_time:210004ms step_avg:159.58ms 
step:1327/1480 train_time:210174ms step_avg:159.59ms step:1328/1480 train_time:210345ms step_avg:159.59ms step:1329/1480 train_time:210543ms step_avg:159.62ms step:1330/1480 train_time:210724ms step_avg:159.64ms step:1331/1480 train_time:210894ms step_avg:159.65ms step:1332/1480 train_time:211068ms step_avg:159.66ms step:1333/1480 train_time:211244ms step_avg:159.67ms step:1334/1480 train_time:211414ms step_avg:159.68ms step:1335/1480 train_time:211583ms step_avg:159.69ms step:1336/1480 train_time:211766ms step_avg:159.70ms step:1337/1480 train_time:211941ms step_avg:159.71ms step:1338/1480 train_time:212113ms step_avg:159.72ms step:1339/1480 train_time:212287ms step_avg:159.73ms step:1340/1480 train_time:212459ms step_avg:159.74ms step:1341/1480 train_time:212628ms step_avg:159.75ms step:1342/1480 train_time:212802ms step_avg:159.76ms step:1343/1480 train_time:212970ms step_avg:159.77ms step:1344/1480 train_time:213143ms step_avg:159.78ms step:1345/1480 train_time:213321ms step_avg:159.79ms step:1346/1480 train_time:213488ms step_avg:159.80ms step:1347/1480 train_time:213660ms step_avg:159.81ms step:1348/1480 train_time:213828ms step_avg:159.81ms step:1349/1480 train_time:213998ms step_avg:159.82ms step:1350/1480 train_time:214172ms step_avg:159.83ms step:1351/1480 train_time:214342ms step_avg:159.84ms step:1352/1480 train_time:214512ms step_avg:159.84ms step:1353/1480 train_time:214687ms step_avg:159.86ms step:1354/1480 train_time:214859ms step_avg:159.87ms step:1355/1480 train_time:215029ms step_avg:159.87ms step:1356/1480 train_time:215202ms step_avg:159.88ms step:1357/1480 train_time:215375ms step_avg:159.89ms step:1358/1480 train_time:215548ms step_avg:159.90ms step:1359/1480 train_time:215720ms step_avg:159.91ms step:1360/1480 train_time:215895ms step_avg:159.92ms step:1361/1480 train_time:216071ms step_avg:159.93ms step:1362/1480 train_time:216246ms step_avg:159.94ms step:1363/1480 train_time:216428ms step_avg:159.96ms step:1364/1480 train_time:216599ms 
step_avg:159.97ms step:1365/1480 train_time:216764ms step_avg:159.97ms step:1366/1480 train_time:216938ms step_avg:159.98ms step:1367/1480 train_time:217108ms step_avg:159.99ms step:1368/1480 train_time:217282ms step_avg:160.00ms step:1369/1480 train_time:217464ms step_avg:160.02ms step:1370/1480 train_time:217640ms step_avg:160.03ms step:1371/1480 train_time:217811ms step_avg:160.04ms step:1372/1480 train_time:217990ms step_avg:160.05ms step:1373/1480 train_time:218159ms step_avg:160.06ms step:1374/1480 train_time:218336ms step_avg:160.07ms step:1375/1480 train_time:218506ms step_avg:160.08ms step:1375/1480 val_loss:3.2969 train_time:218574ms step_avg:160.13ms step:1376/1480 train_time:218679ms step_avg:160.09ms step:1377/1480 train_time:218852ms step_avg:160.10ms step:1378/1480 train_time:219021ms step_avg:160.10ms step:1379/1480 train_time:219195ms step_avg:160.11ms step:1380/1480 train_time:219368ms step_avg:160.12ms step:1381/1480 train_time:219548ms step_avg:160.14ms step:1382/1480 train_time:219720ms step_avg:160.15ms step:1383/1480 train_time:219893ms step_avg:160.16ms step:1384/1480 train_time:220071ms step_avg:160.17ms step:1385/1480 train_time:220236ms step_avg:160.17ms step:1386/1480 train_time:220406ms step_avg:160.18ms step:1387/1480 train_time:220577ms step_avg:160.19ms step:1388/1480 train_time:220746ms step_avg:160.19ms step:1389/1480 train_time:220920ms step_avg:160.20ms step:1390/1480 train_time:221088ms step_avg:160.21ms step:1391/1480 train_time:221258ms step_avg:160.22ms step:1392/1480 train_time:221431ms step_avg:160.22ms step:1393/1480 train_time:221601ms step_avg:160.23ms step:1394/1480 train_time:221773ms step_avg:160.24ms step:1395/1480 train_time:221943ms step_avg:160.25ms step:1396/1480 train_time:222113ms step_avg:160.25ms step:1397/1480 train_time:222280ms step_avg:160.26ms step:1398/1480 train_time:222446ms step_avg:160.26ms step:1399/1480 train_time:222616ms step_avg:160.27ms step:1400/1480 train_time:222792ms step_avg:160.28ms 
step:1401/1480 train_time:222958ms step_avg:160.29ms step:1402/1480 train_time:223129ms step_avg:160.29ms step:1403/1480 train_time:223305ms step_avg:160.30ms step:1404/1480 train_time:223475ms step_avg:160.31ms step:1405/1480 train_time:223651ms step_avg:160.32ms step:1406/1480 train_time:223823ms step_avg:160.33ms step:1407/1480 train_time:223991ms step_avg:160.34ms step:1408/1480 train_time:224161ms step_avg:160.34ms step:1409/1480 train_time:224343ms step_avg:160.36ms step:1410/1480 train_time:224513ms step_avg:160.37ms step:1411/1480 train_time:224683ms step_avg:160.37ms step:1412/1480 train_time:224854ms step_avg:160.38ms step:1413/1480 train_time:225024ms step_avg:160.39ms step:1414/1480 train_time:225196ms step_avg:160.40ms step:1415/1480 train_time:225371ms step_avg:160.41ms step:1416/1480 train_time:225556ms step_avg:160.42ms step:1417/1480 train_time:225730ms step_avg:160.43ms step:1418/1480 train_time:225901ms step_avg:160.44ms step:1419/1480 train_time:226076ms step_avg:160.45ms step:1420/1480 train_time:226251ms step_avg:160.46ms step:1421/1480 train_time:226424ms step_avg:160.47ms step:1422/1480 train_time:226597ms step_avg:160.48ms step:1423/1480 train_time:226765ms step_avg:160.48ms step:1424/1480 train_time:226943ms step_avg:160.50ms step:1425/1480 train_time:227123ms step_avg:160.51ms step:1426/1480 train_time:227295ms step_avg:160.52ms step:1427/1480 train_time:227469ms step_avg:160.53ms step:1428/1480 train_time:227641ms step_avg:160.54ms step:1429/1480 train_time:227809ms step_avg:160.54ms step:1430/1480 train_time:227983ms step_avg:160.55ms step:1431/1480 train_time:228161ms step_avg:160.56ms step:1432/1480 train_time:228337ms step_avg:160.57ms step:1433/1480 train_time:228517ms step_avg:160.59ms step:1434/1480 train_time:228697ms step_avg:160.60ms step:1435/1480 train_time:228872ms step_avg:160.61ms step:1436/1480 train_time:229045ms step_avg:160.62ms step:1437/1480 train_time:229217ms step_avg:160.63ms step:1438/1480 train_time:229385ms 
step_avg:160.63ms step:1439/1480 train_time:229559ms step_avg:160.64ms step:1440/1480 train_time:229728ms step_avg:160.65ms step:1441/1480 train_time:229898ms step_avg:160.66ms step:1442/1480 train_time:230078ms step_avg:160.67ms step:1443/1480 train_time:230266ms step_avg:160.69ms step:1444/1480 train_time:230437ms step_avg:160.70ms step:1445/1480 train_time:230606ms step_avg:160.70ms step:1446/1480 train_time:230782ms step_avg:160.71ms step:1447/1480 train_time:230960ms step_avg:160.72ms step:1448/1480 train_time:231131ms step_avg:160.73ms step:1449/1480 train_time:231304ms step_avg:160.74ms step:1450/1480 train_time:231478ms step_avg:160.75ms step:1451/1480 train_time:231650ms step_avg:160.76ms step:1452/1480 train_time:231823ms step_avg:160.76ms step:1453/1480 train_time:231993ms step_avg:160.77ms step:1454/1480 train_time:232166ms step_avg:160.78ms step:1455/1480 train_time:232346ms step_avg:160.79ms step:1456/1480 train_time:232517ms step_avg:160.80ms step:1457/1480 train_time:232688ms step_avg:160.81ms step:1458/1480 train_time:232859ms step_avg:160.81ms step:1459/1480 train_time:233036ms step_avg:160.83ms step:1460/1480 train_time:233207ms step_avg:160.83ms step:1461/1480 train_time:233381ms step_avg:160.84ms step:1462/1480 train_time:233554ms step_avg:160.85ms step:1463/1480 train_time:233728ms step_avg:160.86ms step:1464/1480 train_time:233902ms step_avg:160.87ms step:1465/1480 train_time:234076ms step_avg:160.88ms step:1466/1480 train_time:234247ms step_avg:160.88ms step:1467/1480 train_time:234421ms step_avg:160.89ms step:1468/1480 train_time:234590ms step_avg:160.90ms step:1469/1480 train_time:234763ms step_avg:160.91ms step:1470/1480 train_time:234942ms step_avg:160.92ms step:1471/1480 train_time:235128ms step_avg:160.94ms step:1472/1480 train_time:235307ms step_avg:160.95ms step:1473/1480 train_time:235478ms step_avg:160.96ms step:1474/1480 train_time:235656ms step_avg:160.97ms step:1475/1480 train_time:235836ms step_avg:160.98ms step:1476/1480 
train_time:236009ms step_avg:160.99ms step:1477/1480 train_time:236189ms step_avg:161.00ms step:1478/1480 train_time:236377ms step_avg:161.02ms step:1479/1480 train_time:236551ms step_avg:161.03ms step:1480/1480 train_time:236722ms step_avg:161.04ms step:1480/1480 val_loss:3.2776 train_time:236794ms step_avg:161.08ms