import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; the cos/sin tables are cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): dim 1 is the sequence
        # length and dim 3 is split into the two rotated halves.
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # rebuild the tables only when the sequence length changes
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Args:
            x: input activations; dims 0 and 1 are batch and sequence length.
            vi: token value embedding, viewed to the shape of `v` and mixed into it.
            block_mask: FlexAttention block mask restricting which keys each query sees.
        Returns:
            Tensor with the same shape as `x`.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head and sequence dims swapped
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc   = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: a learned mix of `x` and `x0`, then attention and MLP, each on a normed residual."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # mixing weights for (x, x0); initialised so the block starts from x alone
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30 # logits are squashed to (-softcap, softcap) with tanh

class GPT(nn.Module):
    """GPT model with U-net style skip connections and per-layer token value embeddings."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss for one flat token sequence.

        Args:
            idx: 1-D tensor of input token ids; its length must be a multiple of 128
                (it is reshaped into blocks of BLOCK_SIZE).
            target: target token ids, flattened before the loss.
            sliding_window: attention window size in tokens.
        Returns:
            The cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running count of token id 50256, used as a per-position document id:
        # positions that share a count are treated as the same document.
        docs = (idx == 50256).cumsum(0)
        # smallest / largest document id inside each block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document, and within the window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level over-approximation of the token-level rule above: a kv
            # block is kept when it could contain at least one visible token.
            # The index tensor is created on idx's device (the original pinned it
            # to "cuda") because it indexes tensors derived from idx.
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device=idx.device)
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # the document-id ranges of the two blocks overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window size rounded up to whole blocks
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort moves the kept block indices to the front of each row
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding chunk per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a data shard and return its claimed token count."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256-int32 header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Iterates over token shards; each process reads its own T-token slice of every global batch."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        # every shard must hold at least one full global batch plus one target token
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return `(inputs, targets)` on the GPU: T int32 inputs and the same tokens shifted by one as int64 targets."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    # The per-run directory above is not used (checkpoint saving is disabled),
    # but the log file itself lives directly under logs/, which must exist
    # before it can be opened for writing.
    os.makedirs('logs', exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """
    Append `s` to the log file on the master process; other ranks do nothing.

    Unless `logonly` is true the message is also printed to stdout.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# gradient accumulation is not implemented: exactly one sequence per process per step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the training loop fetches the next
# one right after each backward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 07:23:15 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 32C P0 72W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 38C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 38C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 34C P0 115W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 34C P0 111W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 38C P0 116W / 700W | 119MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 39C P0 120W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 33C P0 119W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22862ms step_avg:nanms step:2/1480 train_time:22952ms step_avg:nanms step:3/1480 train_time:23092ms step_avg:nanms step:4/1480 train_time:23234ms step_avg:nanms step:5/1480 train_time:23375ms step_avg:nanms step:6/1480 train_time:23515ms step_avg:nanms step:7/1480 train_time:23657ms step_avg:nanms step:8/1480 train_time:23799ms step_avg:nanms step:9/1480 train_time:23942ms step_avg:nanms step:10/1480 train_time:24084ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:428ms step_avg:142.64ms step:14/1480 train_time:571ms step_avg:142.76ms step:15/1480 train_time:712ms step_avg:142.48ms step:16/1480 train_time:856ms step_avg:142.69ms step:17/1480 train_time:998ms step_avg:142.61ms step:18/1480 train_time:1141ms step_avg:142.59ms step:19/1480 train_time:1283ms step_avg:142.59ms step:20/1480 train_time:1427ms step_avg:142.74ms step:21/1480 train_time:1571ms step_avg:142.78ms step:22/1480 train_time:1713ms step_avg:142.74ms step:23/1480 train_time:1855ms step_avg:142.69ms step:24/1480 train_time:1997ms step_avg:142.65ms step:25/1480 train_time:2138ms step_avg:142.56ms step:26/1480 train_time:2279ms step_avg:142.44ms step:27/1480 train_time:2421ms step_avg:142.41ms step:28/1480 train_time:2563ms step_avg:142.40ms step:29/1480 
train_time:2706ms step_avg:142.42ms step:30/1480 train_time:2849ms step_avg:142.43ms step:31/1480 train_time:2991ms step_avg:142.43ms step:32/1480 train_time:3134ms step_avg:142.44ms step:33/1480 train_time:3278ms step_avg:142.52ms step:34/1480 train_time:3418ms step_avg:142.43ms step:35/1480 train_time:3560ms step_avg:142.41ms step:36/1480 train_time:3703ms step_avg:142.44ms step:37/1480 train_time:3845ms step_avg:142.41ms step:38/1480 train_time:3988ms step_avg:142.42ms step:39/1480 train_time:4131ms step_avg:142.44ms step:40/1480 train_time:4274ms step_avg:142.46ms step:41/1480 train_time:4414ms step_avg:142.40ms step:42/1480 train_time:4557ms step_avg:142.40ms step:43/1480 train_time:4698ms step_avg:142.36ms step:44/1480 train_time:4840ms step_avg:142.35ms step:45/1480 train_time:4982ms step_avg:142.34ms step:46/1480 train_time:5126ms step_avg:142.39ms step:47/1480 train_time:5270ms step_avg:142.44ms step:48/1480 train_time:5412ms step_avg:142.43ms step:49/1480 train_time:5554ms step_avg:142.40ms step:50/1480 train_time:5697ms step_avg:142.41ms step:51/1480 train_time:5839ms step_avg:142.42ms step:52/1480 train_time:5982ms step_avg:142.42ms step:53/1480 train_time:6126ms step_avg:142.47ms step:54/1480 train_time:6271ms step_avg:142.52ms step:55/1480 train_time:6413ms step_avg:142.51ms step:56/1480 train_time:6555ms step_avg:142.49ms step:57/1480 train_time:6696ms step_avg:142.46ms step:58/1480 train_time:6836ms step_avg:142.43ms step:59/1480 train_time:6977ms step_avg:142.39ms step:60/1480 train_time:7119ms step_avg:142.38ms step:61/1480 train_time:7263ms step_avg:142.41ms step:62/1480 train_time:7404ms step_avg:142.39ms step:63/1480 train_time:7548ms step_avg:142.41ms step:64/1480 train_time:7691ms step_avg:142.43ms step:65/1480 train_time:7834ms step_avg:142.43ms step:66/1480 train_time:7976ms step_avg:142.42ms step:67/1480 train_time:8117ms step_avg:142.40ms step:68/1480 train_time:8258ms step_avg:142.38ms step:69/1480 train_time:8399ms step_avg:142.36ms 
step:70/1480 train_time:8542ms step_avg:142.36ms step:71/1480 train_time:8685ms step_avg:142.38ms step:72/1480 train_time:8828ms step_avg:142.39ms step:73/1480 train_time:8971ms step_avg:142.40ms step:74/1480 train_time:9114ms step_avg:142.41ms step:75/1480 train_time:9256ms step_avg:142.41ms step:76/1480 train_time:9399ms step_avg:142.40ms step:77/1480 train_time:9540ms step_avg:142.39ms step:78/1480 train_time:9682ms step_avg:142.38ms step:79/1480 train_time:9826ms step_avg:142.40ms step:80/1480 train_time:9968ms step_avg:142.40ms step:81/1480 train_time:10111ms step_avg:142.41ms step:82/1480 train_time:10254ms step_avg:142.42ms step:83/1480 train_time:10395ms step_avg:142.40ms step:84/1480 train_time:10536ms step_avg:142.37ms step:85/1480 train_time:10677ms step_avg:142.36ms step:86/1480 train_time:10820ms step_avg:142.36ms step:87/1480 train_time:10963ms step_avg:142.38ms step:88/1480 train_time:11106ms step_avg:142.38ms step:89/1480 train_time:11250ms step_avg:142.41ms step:90/1480 train_time:11393ms step_avg:142.42ms step:91/1480 train_time:11536ms step_avg:142.42ms step:92/1480 train_time:11679ms step_avg:142.43ms step:93/1480 train_time:11821ms step_avg:142.43ms step:94/1480 train_time:11963ms step_avg:142.42ms step:95/1480 train_time:12104ms step_avg:142.40ms step:96/1480 train_time:12248ms step_avg:142.42ms step:97/1480 train_time:12392ms step_avg:142.44ms step:98/1480 train_time:12534ms step_avg:142.43ms step:99/1480 train_time:12676ms step_avg:142.42ms step:100/1480 train_time:12817ms step_avg:142.41ms step:101/1480 train_time:12958ms step_avg:142.39ms step:102/1480 train_time:13099ms step_avg:142.38ms step:103/1480 train_time:13239ms step_avg:142.36ms step:104/1480 train_time:13381ms step_avg:142.35ms step:105/1480 train_time:13523ms step_avg:142.35ms step:106/1480 train_time:13667ms step_avg:142.36ms step:107/1480 train_time:13809ms step_avg:142.36ms step:108/1480 train_time:13952ms step_avg:142.37ms step:109/1480 train_time:14094ms step_avg:142.36ms 
step:110/1480 train_time:14234ms step_avg:142.34ms step:111/1480 train_time:14378ms step_avg:142.36ms step:112/1480 train_time:14524ms step_avg:142.40ms step:113/1480 train_time:14673ms step_avg:142.46ms step:114/1480 train_time:14819ms step_avg:142.49ms step:115/1480 train_time:14965ms step_avg:142.53ms step:116/1480 train_time:15112ms step_avg:142.56ms step:117/1480 train_time:15258ms step_avg:142.59ms step:118/1480 train_time:15402ms step_avg:142.61ms step:119/1480 train_time:15549ms step_avg:142.65ms step:120/1480 train_time:15697ms step_avg:142.70ms step:121/1480 train_time:15842ms step_avg:142.72ms step:122/1480 train_time:15990ms step_avg:142.76ms step:123/1480 train_time:16136ms step_avg:142.80ms step:124/1480 train_time:16282ms step_avg:142.83ms step:125/1480 train_time:16429ms step_avg:142.86ms step:125/1480 val_loss:4.4099 train_time:16486ms step_avg:143.36ms step:126/1480 train_time:16582ms step_avg:142.94ms step:127/1480 train_time:16731ms step_avg:143.00ms step:128/1480 train_time:16877ms step_avg:143.02ms step:129/1480 train_time:17022ms step_avg:143.04ms step:130/1480 train_time:17168ms step_avg:143.07ms step:131/1480 train_time:17314ms step_avg:143.09ms step:132/1480 train_time:17459ms step_avg:143.11ms step:133/1480 train_time:17609ms step_avg:143.17ms step:134/1480 train_time:17756ms step_avg:143.19ms step:135/1480 train_time:17901ms step_avg:143.21ms step:136/1480 train_time:18047ms step_avg:143.23ms step:137/1480 train_time:18194ms step_avg:143.26ms step:138/1480 train_time:18338ms step_avg:143.26ms step:139/1480 train_time:18485ms step_avg:143.29ms step:140/1480 train_time:18633ms step_avg:143.33ms step:141/1480 train_time:18779ms step_avg:143.35ms step:142/1480 train_time:18930ms step_avg:143.41ms step:143/1480 train_time:19075ms step_avg:143.42ms step:144/1480 train_time:19221ms step_avg:143.44ms step:145/1480 train_time:19369ms step_avg:143.47ms step:146/1480 train_time:19515ms step_avg:143.50ms step:147/1480 train_time:19662ms 
step_avg:143.52ms step:148/1480 train_time:19810ms step_avg:143.55ms step:149/1480 train_time:19957ms step_avg:143.57ms step:150/1480 train_time:20103ms step_avg:143.60ms step:151/1480 train_time:20251ms step_avg:143.63ms step:152/1480 train_time:20397ms step_avg:143.64ms step:153/1480 train_time:20545ms step_avg:143.67ms step:154/1480 train_time:20694ms step_avg:143.71ms step:155/1480 train_time:20839ms step_avg:143.72ms step:156/1480 train_time:20988ms step_avg:143.75ms step:157/1480 train_time:21135ms step_avg:143.77ms step:158/1480 train_time:21279ms step_avg:143.78ms step:159/1480 train_time:21428ms step_avg:143.82ms step:160/1480 train_time:21575ms step_avg:143.84ms step:161/1480 train_time:21722ms step_avg:143.86ms step:162/1480 train_time:21870ms step_avg:143.88ms step:163/1480 train_time:22017ms step_avg:143.90ms step:164/1480 train_time:22164ms step_avg:143.92ms step:165/1480 train_time:22311ms step_avg:143.94ms step:166/1480 train_time:22456ms step_avg:143.95ms step:167/1480 train_time:22600ms step_avg:143.95ms step:168/1480 train_time:22749ms step_avg:143.98ms step:169/1480 train_time:22895ms step_avg:144.00ms step:170/1480 train_time:23043ms step_avg:144.02ms step:171/1480 train_time:23191ms step_avg:144.05ms step:172/1480 train_time:23337ms step_avg:144.06ms step:173/1480 train_time:23484ms step_avg:144.07ms step:174/1480 train_time:23631ms step_avg:144.09ms step:175/1480 train_time:23776ms step_avg:144.10ms step:176/1480 train_time:23922ms step_avg:144.11ms step:177/1480 train_time:24070ms step_avg:144.13ms step:178/1480 train_time:24218ms step_avg:144.15ms step:179/1480 train_time:24365ms step_avg:144.17ms step:180/1480 train_time:24513ms step_avg:144.19ms step:181/1480 train_time:24659ms step_avg:144.20ms step:182/1480 train_time:24806ms step_avg:144.22ms step:183/1480 train_time:24953ms step_avg:144.24ms step:184/1480 train_time:25099ms step_avg:144.25ms step:185/1480 train_time:25246ms step_avg:144.26ms step:186/1480 train_time:25394ms 
step_avg:144.29ms step:187/1480 train_time:25540ms step_avg:144.29ms step:188/1480 train_time:25687ms step_avg:144.31ms step:189/1480 train_time:25834ms step_avg:144.32ms step:190/1480 train_time:25981ms step_avg:144.34ms step:191/1480 train_time:26129ms step_avg:144.36ms step:192/1480 train_time:26275ms step_avg:144.37ms step:193/1480 train_time:26422ms step_avg:144.38ms step:194/1480 train_time:26569ms step_avg:144.39ms step:195/1480 train_time:26715ms step_avg:144.40ms step:196/1480 train_time:26859ms step_avg:144.41ms step:197/1480 train_time:27006ms step_avg:144.42ms step:198/1480 train_time:27153ms step_avg:144.43ms step:199/1480 train_time:27298ms step_avg:144.43ms step:200/1480 train_time:27445ms step_avg:144.45ms step:201/1480 train_time:27592ms step_avg:144.46ms step:202/1480 train_time:27738ms step_avg:144.47ms step:203/1480 train_time:27885ms step_avg:144.48ms step:204/1480 train_time:28033ms step_avg:144.50ms step:205/1480 train_time:28180ms step_avg:144.51ms step:206/1480 train_time:28328ms step_avg:144.53ms step:207/1480 train_time:28474ms step_avg:144.54ms step:208/1480 train_time:28620ms step_avg:144.55ms step:209/1480 train_time:28768ms step_avg:144.56ms step:210/1480 train_time:28914ms step_avg:144.57ms step:211/1480 train_time:29059ms step_avg:144.57ms step:212/1480 train_time:29206ms step_avg:144.59ms step:213/1480 train_time:29353ms step_avg:144.60ms step:214/1480 train_time:29498ms step_avg:144.60ms step:215/1480 train_time:29644ms step_avg:144.61ms step:216/1480 train_time:29793ms step_avg:144.63ms step:217/1480 train_time:29938ms step_avg:144.63ms step:218/1480 train_time:30085ms step_avg:144.64ms step:219/1480 train_time:30233ms step_avg:144.66ms step:220/1480 train_time:30379ms step_avg:144.66ms step:221/1480 train_time:30528ms step_avg:144.68ms step:222/1480 train_time:30678ms step_avg:144.71ms step:223/1480 train_time:30830ms step_avg:144.74ms step:224/1480 train_time:30980ms step_avg:144.77ms step:225/1480 train_time:31131ms 
step_avg:144.80ms step:226/1480 train_time:31281ms step_avg:144.82ms step:227/1480 train_time:31432ms step_avg:144.85ms step:228/1480 train_time:31581ms step_avg:144.87ms step:229/1480 train_time:31733ms step_avg:144.90ms step:230/1480 train_time:31883ms step_avg:144.92ms step:231/1480 train_time:32033ms step_avg:144.95ms step:232/1480 train_time:32184ms step_avg:144.97ms step:233/1480 train_time:32334ms step_avg:145.00ms step:234/1480 train_time:32484ms step_avg:145.02ms step:235/1480 train_time:32636ms step_avg:145.05ms step:236/1480 train_time:32786ms step_avg:145.07ms step:237/1480 train_time:32938ms step_avg:145.10ms step:238/1480 train_time:33087ms step_avg:145.12ms step:239/1480 train_time:33238ms step_avg:145.15ms step:240/1480 train_time:33390ms step_avg:145.18ms step:241/1480 train_time:33540ms step_avg:145.20ms step:242/1480 train_time:33693ms step_avg:145.23ms step:243/1480 train_time:33841ms step_avg:145.24ms step:244/1480 train_time:33992ms step_avg:145.27ms step:245/1480 train_time:34142ms step_avg:145.28ms step:246/1480 train_time:34293ms step_avg:145.31ms step:247/1480 train_time:34441ms step_avg:145.32ms step:248/1480 train_time:34595ms step_avg:145.36ms step:249/1480 train_time:34745ms step_avg:145.38ms step:250/1480 train_time:34896ms step_avg:145.40ms step:250/1480 val_loss:3.9869 train_time:34954ms step_avg:145.64ms step:251/1480 train_time:35052ms step_avg:145.44ms step:252/1480 train_time:35204ms step_avg:145.47ms step:253/1480 train_time:35353ms step_avg:145.49ms step:254/1480 train_time:35503ms step_avg:145.50ms step:255/1480 train_time:35652ms step_avg:145.52ms step:256/1480 train_time:35801ms step_avg:145.53ms step:257/1480 train_time:35952ms step_avg:145.55ms step:258/1480 train_time:36104ms step_avg:145.58ms step:259/1480 train_time:36253ms step_avg:145.60ms step:260/1480 train_time:36405ms step_avg:145.62ms step:261/1480 train_time:36554ms step_avg:145.64ms step:262/1480 train_time:36705ms step_avg:145.65ms step:263/1480 
train_time:36854ms step_avg:145.67ms step:264/1480 train_time:37005ms step_avg:145.69ms step:265/1480 train_time:37155ms step_avg:145.71ms step:266/1480 train_time:37307ms step_avg:145.73ms step:267/1480 train_time:37456ms step_avg:145.74ms step:268/1480 train_time:37608ms step_avg:145.77ms step:269/1480 train_time:37758ms step_avg:145.78ms step:270/1480 train_time:37909ms step_avg:145.80ms step:271/1480 train_time:38059ms step_avg:145.82ms step:272/1480 train_time:38211ms step_avg:145.84ms step:273/1480 train_time:38362ms step_avg:145.86ms step:274/1480 train_time:38513ms step_avg:145.88ms step:275/1480 train_time:38662ms step_avg:145.90ms step:276/1480 train_time:38812ms step_avg:145.91ms step:277/1480 train_time:38961ms step_avg:145.92ms step:278/1480 train_time:39112ms step_avg:145.94ms step:279/1480 train_time:39264ms step_avg:145.96ms step:280/1480 train_time:39415ms step_avg:145.98ms step:281/1480 train_time:39566ms step_avg:146.00ms step:282/1480 train_time:39715ms step_avg:146.01ms step:283/1480 train_time:39866ms step_avg:146.03ms step:284/1480 train_time:40015ms step_avg:146.04ms step:285/1480 train_time:40166ms step_avg:146.06ms step:286/1480 train_time:40315ms step_avg:146.07ms step:287/1480 train_time:40467ms step_avg:146.09ms step:288/1480 train_time:40619ms step_avg:146.11ms step:289/1480 train_time:40771ms step_avg:146.13ms step:290/1480 train_time:40920ms step_avg:146.14ms step:291/1480 train_time:41071ms step_avg:146.16ms step:292/1480 train_time:41220ms step_avg:146.17ms step:293/1480 train_time:41370ms step_avg:146.18ms step:294/1480 train_time:41522ms step_avg:146.20ms step:295/1480 train_time:41673ms step_avg:146.22ms step:296/1480 train_time:41823ms step_avg:146.24ms step:297/1480 train_time:41974ms step_avg:146.25ms step:298/1480 train_time:42126ms step_avg:146.27ms step:299/1480 train_time:42274ms step_avg:146.28ms step:300/1480 train_time:42428ms step_avg:146.30ms step:301/1480 train_time:42577ms step_avg:146.31ms step:302/1480 
train_time:42728ms step_avg:146.33ms step:303/1480 train_time:42878ms step_avg:146.34ms step:304/1480 train_time:43029ms step_avg:146.36ms step:305/1480 train_time:43179ms step_avg:146.37ms step:306/1480 train_time:43331ms step_avg:146.39ms step:307/1480 train_time:43482ms step_avg:146.41ms step:308/1480 train_time:43634ms step_avg:146.42ms step:309/1480 train_time:43784ms step_avg:146.44ms step:310/1480 train_time:43935ms step_avg:146.45ms step:311/1480 train_time:44086ms step_avg:146.46ms step:312/1480 train_time:44236ms step_avg:146.48ms step:313/1480 train_time:44387ms step_avg:146.49ms step:314/1480 train_time:44538ms step_avg:146.51ms step:315/1480 train_time:44688ms step_avg:146.52ms step:316/1480 train_time:44838ms step_avg:146.53ms step:317/1480 train_time:44989ms step_avg:146.54ms step:318/1480 train_time:45140ms step_avg:146.56ms step:319/1480 train_time:45291ms step_avg:146.57ms step:320/1480 train_time:45443ms step_avg:146.59ms step:321/1480 train_time:45594ms step_avg:146.60ms step:322/1480 train_time:45744ms step_avg:146.62ms step:323/1480 train_time:45894ms step_avg:146.63ms step:324/1480 train_time:46046ms step_avg:146.64ms step:325/1480 train_time:46195ms step_avg:146.65ms step:326/1480 train_time:46348ms step_avg:146.67ms step:327/1480 train_time:46499ms step_avg:146.68ms step:328/1480 train_time:46650ms step_avg:146.70ms step:329/1480 train_time:46802ms step_avg:146.71ms step:330/1480 train_time:46954ms step_avg:146.73ms step:331/1480 train_time:47108ms step_avg:146.75ms step:332/1480 train_time:47262ms step_avg:146.78ms step:333/1480 train_time:47417ms step_avg:146.80ms step:334/1480 train_time:47570ms step_avg:146.82ms step:335/1480 train_time:47724ms step_avg:146.84ms step:336/1480 train_time:47877ms step_avg:146.86ms step:337/1480 train_time:48034ms step_avg:146.89ms step:338/1480 train_time:48188ms step_avg:146.92ms step:339/1480 train_time:48343ms step_avg:146.94ms step:340/1480 train_time:48496ms step_avg:146.96ms step:341/1480 
train_time:48651ms step_avg:146.98ms step:342/1480 train_time:48805ms step_avg:147.00ms step:343/1480 train_time:48959ms step_avg:147.02ms step:344/1480 train_time:49112ms step_avg:147.04ms step:345/1480 train_time:49266ms step_avg:147.06ms step:346/1480 train_time:49420ms step_avg:147.08ms step:347/1480 train_time:49574ms step_avg:147.10ms step:348/1480 train_time:49728ms step_avg:147.12ms step:349/1480 train_time:49881ms step_avg:147.14ms step:350/1480 train_time:50035ms step_avg:147.16ms step:351/1480 train_time:50190ms step_avg:147.18ms step:352/1480 train_time:50344ms step_avg:147.20ms step:353/1480 train_time:50497ms step_avg:147.22ms step:354/1480 train_time:50650ms step_avg:147.24ms step:355/1480 train_time:50805ms step_avg:147.26ms step:356/1480 train_time:50958ms step_avg:147.28ms step:357/1480 train_time:51112ms step_avg:147.30ms step:358/1480 train_time:51268ms step_avg:147.32ms step:359/1480 train_time:51425ms step_avg:147.35ms step:360/1480 train_time:51581ms step_avg:147.37ms step:361/1480 train_time:51736ms step_avg:147.39ms step:362/1480 train_time:51890ms step_avg:147.42ms step:363/1480 train_time:52044ms step_avg:147.43ms step:364/1480 train_time:52198ms step_avg:147.45ms step:365/1480 train_time:52352ms step_avg:147.47ms step:366/1480 train_time:52506ms step_avg:147.49ms step:367/1480 train_time:52659ms step_avg:147.50ms step:368/1480 train_time:52813ms step_avg:147.52ms step:369/1480 train_time:52967ms step_avg:147.54ms step:370/1480 train_time:53121ms step_avg:147.56ms step:371/1480 train_time:53275ms step_avg:147.58ms step:372/1480 train_time:53430ms step_avg:147.60ms step:373/1480 train_time:53584ms step_avg:147.61ms step:374/1480 train_time:53738ms step_avg:147.63ms step:375/1480 train_time:53891ms step_avg:147.65ms step:375/1480 val_loss:3.8038 train_time:53951ms step_avg:147.81ms step:376/1480 train_time:54049ms step_avg:147.67ms step:377/1480 train_time:54205ms step_avg:147.70ms step:378/1480 train_time:54358ms step_avg:147.71ms 
step:379/1480 train_time:54510ms step_avg:147.72ms step:380/1480 train_time:54662ms step_avg:147.74ms step:381/1480 train_time:54815ms step_avg:147.75ms step:382/1480 train_time:54968ms step_avg:147.76ms step:383/1480 train_time:55123ms step_avg:147.78ms step:384/1480 train_time:55277ms step_avg:147.80ms step:385/1480 train_time:55430ms step_avg:147.81ms step:386/1480 train_time:55584ms step_avg:147.83ms step:387/1480 train_time:55736ms step_avg:147.84ms step:388/1480 train_time:55889ms step_avg:147.85ms step:389/1480 train_time:56043ms step_avg:147.87ms step:390/1480 train_time:56197ms step_avg:147.89ms step:391/1480 train_time:56350ms step_avg:147.90ms step:392/1480 train_time:56503ms step_avg:147.91ms step:393/1480 train_time:56656ms step_avg:147.93ms step:394/1480 train_time:56810ms step_avg:147.94ms step:395/1480 train_time:56964ms step_avg:147.96ms step:396/1480 train_time:57119ms step_avg:147.98ms step:397/1480 train_time:57272ms step_avg:147.99ms step:398/1480 train_time:57426ms step_avg:148.01ms step:399/1480 train_time:57580ms step_avg:148.02ms step:400/1480 train_time:57732ms step_avg:148.03ms step:401/1480 train_time:57885ms step_avg:148.04ms step:402/1480 train_time:58040ms step_avg:148.06ms step:403/1480 train_time:58193ms step_avg:148.07ms step:404/1480 train_time:58348ms step_avg:148.09ms step:405/1480 train_time:58503ms step_avg:148.11ms step:406/1480 train_time:58655ms step_avg:148.12ms step:407/1480 train_time:58811ms step_avg:148.14ms step:408/1480 train_time:58964ms step_avg:148.15ms step:409/1480 train_time:59116ms step_avg:148.16ms step:410/1480 train_time:59270ms step_avg:148.17ms step:411/1480 train_time:59424ms step_avg:148.19ms step:412/1480 train_time:59578ms step_avg:148.20ms step:413/1480 train_time:59731ms step_avg:148.22ms step:414/1480 train_time:59885ms step_avg:148.23ms step:415/1480 train_time:60039ms step_avg:148.25ms step:416/1480 train_time:60193ms step_avg:148.26ms step:417/1480 train_time:60348ms step_avg:148.27ms 
step:418/1480 train_time:60502ms step_avg:148.29ms step:419/1480 train_time:60655ms step_avg:148.30ms step:420/1480 train_time:60810ms step_avg:148.32ms step:421/1480 train_time:60962ms step_avg:148.33ms step:422/1480 train_time:61116ms step_avg:148.34ms step:423/1480 train_time:61269ms step_avg:148.35ms step:424/1480 train_time:61423ms step_avg:148.36ms step:425/1480 train_time:61575ms step_avg:148.37ms step:426/1480 train_time:61730ms step_avg:148.39ms step:427/1480 train_time:61883ms step_avg:148.40ms step:428/1480 train_time:62038ms step_avg:148.42ms step:429/1480 train_time:62191ms step_avg:148.43ms step:430/1480 train_time:62345ms step_avg:148.44ms step:431/1480 train_time:62499ms step_avg:148.45ms step:432/1480 train_time:62652ms step_avg:148.46ms step:433/1480 train_time:62805ms step_avg:148.47ms step:434/1480 train_time:62957ms step_avg:148.48ms step:435/1480 train_time:63110ms step_avg:148.49ms step:436/1480 train_time:63263ms step_avg:148.51ms step:437/1480 train_time:63418ms step_avg:148.52ms step:438/1480 train_time:63571ms step_avg:148.53ms step:439/1480 train_time:63726ms step_avg:148.55ms step:440/1480 train_time:63882ms step_avg:148.56ms step:441/1480 train_time:64039ms step_avg:148.58ms step:442/1480 train_time:64196ms step_avg:148.60ms step:443/1480 train_time:64352ms step_avg:148.62ms step:444/1480 train_time:64508ms step_avg:148.63ms step:445/1480 train_time:64662ms step_avg:148.65ms step:446/1480 train_time:64819ms step_avg:148.67ms step:447/1480 train_time:64975ms step_avg:148.68ms step:448/1480 train_time:65132ms step_avg:148.70ms step:449/1480 train_time:65289ms step_avg:148.72ms step:450/1480 train_time:65446ms step_avg:148.74ms step:451/1480 train_time:65604ms step_avg:148.76ms step:452/1480 train_time:65761ms step_avg:148.78ms step:453/1480 train_time:65920ms step_avg:148.80ms step:454/1480 train_time:66077ms step_avg:148.82ms step:455/1480 train_time:66233ms step_avg:148.84ms step:456/1480 train_time:66388ms step_avg:148.85ms 
step:457/1480 train_time:66545ms step_avg:148.87ms step:458/1480 train_time:66702ms step_avg:148.89ms step:459/1480 train_time:66859ms step_avg:148.91ms step:460/1480 train_time:67015ms step_avg:148.92ms step:461/1480 train_time:67173ms step_avg:148.94ms step:462/1480 train_time:67329ms step_avg:148.96ms step:463/1480 train_time:67486ms step_avg:148.97ms step:464/1480 train_time:67642ms step_avg:148.99ms step:465/1480 train_time:67798ms step_avg:149.01ms step:466/1480 train_time:67956ms step_avg:149.03ms step:467/1480 train_time:68113ms step_avg:149.04ms step:468/1480 train_time:68269ms step_avg:149.06ms step:469/1480 train_time:68426ms step_avg:149.08ms step:470/1480 train_time:68582ms step_avg:149.09ms step:471/1480 train_time:68740ms step_avg:149.11ms step:472/1480 train_time:68896ms step_avg:149.13ms step:473/1480 train_time:69052ms step_avg:149.14ms step:474/1480 train_time:69208ms step_avg:149.15ms step:475/1480 train_time:69364ms step_avg:149.17ms step:476/1480 train_time:69522ms step_avg:149.19ms step:477/1480 train_time:69680ms step_avg:149.21ms step:478/1480 train_time:69835ms step_avg:149.22ms step:479/1480 train_time:69990ms step_avg:149.23ms step:480/1480 train_time:70147ms step_avg:149.25ms step:481/1480 train_time:70304ms step_avg:149.26ms step:482/1480 train_time:70462ms step_avg:149.28ms step:483/1480 train_time:70619ms step_avg:149.30ms step:484/1480 train_time:70776ms step_avg:149.32ms step:485/1480 train_time:70933ms step_avg:149.33ms step:486/1480 train_time:71088ms step_avg:149.35ms step:487/1480 train_time:71245ms step_avg:149.36ms step:488/1480 train_time:71402ms step_avg:149.38ms step:489/1480 train_time:71560ms step_avg:149.40ms step:490/1480 train_time:71717ms step_avg:149.41ms step:491/1480 train_time:71872ms step_avg:149.42ms step:492/1480 train_time:72029ms step_avg:149.44ms step:493/1480 train_time:72185ms step_avg:149.45ms step:494/1480 train_time:72342ms step_avg:149.47ms step:495/1480 train_time:72500ms step_avg:149.48ms 
step:496/1480 train_time:72658ms step_avg:149.50ms step:497/1480 train_time:72813ms step_avg:149.51ms step:498/1480 train_time:72971ms step_avg:149.53ms step:499/1480 train_time:73128ms step_avg:149.55ms step:500/1480 train_time:73285ms step_avg:149.56ms step:500/1480 val_loss:3.6815 train_time:73346ms step_avg:149.69ms step:501/1480 train_time:73443ms step_avg:149.58ms step:502/1480 train_time:73603ms step_avg:149.60ms step:503/1480 train_time:73758ms step_avg:149.61ms step:504/1480 train_time:73913ms step_avg:149.62ms step:505/1480 train_time:74068ms step_avg:149.63ms step:506/1480 train_time:74225ms step_avg:149.65ms step:507/1480 train_time:74382ms step_avg:149.66ms step:508/1480 train_time:74539ms step_avg:149.68ms step:509/1480 train_time:74696ms step_avg:149.69ms step:510/1480 train_time:74852ms step_avg:149.70ms step:511/1480 train_time:75009ms step_avg:149.72ms step:512/1480 train_time:75166ms step_avg:149.73ms step:513/1480 train_time:75322ms step_avg:149.75ms step:514/1480 train_time:75479ms step_avg:149.76ms step:515/1480 train_time:75635ms step_avg:149.77ms step:516/1480 train_time:75793ms step_avg:149.79ms step:517/1480 train_time:75951ms step_avg:149.80ms step:518/1480 train_time:76109ms step_avg:149.82ms step:519/1480 train_time:76266ms step_avg:149.83ms step:520/1480 train_time:76422ms step_avg:149.85ms step:521/1480 train_time:76578ms step_avg:149.86ms step:522/1480 train_time:76735ms step_avg:149.87ms step:523/1480 train_time:76892ms step_avg:149.89ms step:524/1480 train_time:77049ms step_avg:149.90ms step:525/1480 train_time:77207ms step_avg:149.92ms step:526/1480 train_time:77366ms step_avg:149.93ms step:527/1480 train_time:77522ms step_avg:149.95ms step:528/1480 train_time:77678ms step_avg:149.96ms step:529/1480 train_time:77834ms step_avg:149.97ms step:530/1480 train_time:77991ms step_avg:149.98ms step:531/1480 train_time:78148ms step_avg:150.00ms step:532/1480 train_time:78304ms step_avg:150.01ms step:533/1480 train_time:78460ms 
step_avg:150.02ms step:534/1480 train_time:78616ms step_avg:150.03ms step:535/1480 train_time:78772ms step_avg:150.04ms step:536/1480 train_time:78929ms step_avg:150.06ms step:537/1480 train_time:79087ms step_avg:150.07ms step:538/1480 train_time:79242ms step_avg:150.08ms step:539/1480 train_time:79399ms step_avg:150.09ms step:540/1480 train_time:79556ms step_avg:150.10ms step:541/1480 train_time:79712ms step_avg:150.12ms step:542/1480 train_time:79869ms step_avg:150.13ms step:543/1480 train_time:80026ms step_avg:150.14ms step:544/1480 train_time:80183ms step_avg:150.16ms step:545/1480 train_time:80339ms step_avg:150.17ms step:546/1480 train_time:80496ms step_avg:150.18ms step:547/1480 train_time:80653ms step_avg:150.19ms step:548/1480 train_time:80811ms step_avg:150.21ms step:549/1480 train_time:80968ms step_avg:150.22ms step:550/1480 train_time:81129ms step_avg:150.24ms step:551/1480 train_time:81288ms step_avg:150.25ms step:552/1480 train_time:81447ms step_avg:150.27ms step:553/1480 train_time:81608ms step_avg:150.29ms step:554/1480 train_time:81768ms step_avg:150.31ms step:555/1480 train_time:81930ms step_avg:150.33ms step:556/1480 train_time:82090ms step_avg:150.35ms step:557/1480 train_time:82250ms step_avg:150.37ms step:558/1480 train_time:82409ms step_avg:150.38ms step:559/1480 train_time:82568ms step_avg:150.40ms step:560/1480 train_time:82727ms step_avg:150.41ms step:561/1480 train_time:82886ms step_avg:150.43ms step:562/1480 train_time:83045ms step_avg:150.44ms step:563/1480 train_time:83203ms step_avg:150.46ms step:564/1480 train_time:83361ms step_avg:150.47ms step:565/1480 train_time:83520ms step_avg:150.49ms step:566/1480 train_time:83678ms step_avg:150.50ms step:567/1480 train_time:83835ms step_avg:150.51ms step:568/1480 train_time:83993ms step_avg:150.53ms step:569/1480 train_time:84152ms step_avg:150.54ms step:570/1480 train_time:84311ms step_avg:150.56ms step:571/1480 train_time:84471ms step_avg:150.57ms step:572/1480 train_time:84631ms 
step_avg:150.59ms step:573/1480 train_time:84791ms step_avg:150.61ms step:574/1480 train_time:84952ms step_avg:150.62ms step:575/1480 train_time:85112ms step_avg:150.64ms step:576/1480 train_time:85271ms step_avg:150.66ms step:577/1480 train_time:85430ms step_avg:150.67ms step:578/1480 train_time:85591ms step_avg:150.69ms step:579/1480 train_time:85749ms step_avg:150.70ms step:580/1480 train_time:85908ms step_avg:150.72ms step:581/1480 train_time:86070ms step_avg:150.74ms step:582/1480 train_time:86230ms step_avg:150.75ms step:583/1480 train_time:86389ms step_avg:150.77ms step:584/1480 train_time:86548ms step_avg:150.78ms step:585/1480 train_time:86708ms step_avg:150.80ms step:586/1480 train_time:86867ms step_avg:150.81ms step:587/1480 train_time:87026ms step_avg:150.83ms step:588/1480 train_time:87184ms step_avg:150.84ms step:589/1480 train_time:87344ms step_avg:150.85ms step:590/1480 train_time:87506ms step_avg:150.87ms step:591/1480 train_time:87665ms step_avg:150.89ms step:592/1480 train_time:87825ms step_avg:150.90ms step:593/1480 train_time:87985ms step_avg:150.92ms step:594/1480 train_time:88144ms step_avg:150.93ms step:595/1480 train_time:88305ms step_avg:150.95ms step:596/1480 train_time:88468ms step_avg:150.97ms step:597/1480 train_time:88629ms step_avg:150.99ms step:598/1480 train_time:88787ms step_avg:151.00ms step:599/1480 train_time:88945ms step_avg:151.01ms step:600/1480 train_time:89106ms step_avg:151.03ms step:601/1480 train_time:89264ms step_avg:151.04ms step:602/1480 train_time:89421ms step_avg:151.05ms step:603/1480 train_time:89582ms step_avg:151.07ms step:604/1480 train_time:89742ms step_avg:151.08ms step:605/1480 train_time:89902ms step_avg:151.10ms step:606/1480 train_time:90062ms step_avg:151.11ms step:607/1480 train_time:90223ms step_avg:151.13ms step:608/1480 train_time:90383ms step_avg:151.14ms step:609/1480 train_time:90541ms step_avg:151.15ms step:610/1480 train_time:90700ms step_avg:151.17ms step:611/1480 train_time:90860ms 
step_avg:151.18ms step:612/1480 train_time:91019ms step_avg:151.19ms step:613/1480 train_time:91179ms step_avg:151.21ms step:614/1480 train_time:91337ms step_avg:151.22ms step:615/1480 train_time:91495ms step_avg:151.23ms step:616/1480 train_time:91653ms step_avg:151.24ms step:617/1480 train_time:91813ms step_avg:151.26ms step:618/1480 train_time:91972ms step_avg:151.27ms step:619/1480 train_time:92131ms step_avg:151.28ms step:620/1480 train_time:92291ms step_avg:151.30ms step:621/1480 train_time:92449ms step_avg:151.31ms step:622/1480 train_time:92609ms step_avg:151.32ms step:623/1480 train_time:92770ms step_avg:151.34ms step:624/1480 train_time:92931ms step_avg:151.35ms step:625/1480 train_time:93091ms step_avg:151.37ms step:625/1480 val_loss:3.6006 train_time:93154ms step_avg:151.47ms step:626/1480 train_time:93253ms step_avg:151.38ms step:627/1480 train_time:93413ms step_avg:151.40ms step:628/1480 train_time:93571ms step_avg:151.41ms step:629/1480 train_time:93730ms step_avg:151.42ms step:630/1480 train_time:93889ms step_avg:151.43ms step:631/1480 train_time:94046ms step_avg:151.44ms step:632/1480 train_time:94206ms step_avg:151.46ms step:633/1480 train_time:94366ms step_avg:151.47ms step:634/1480 train_time:94525ms step_avg:151.48ms step:635/1480 train_time:94685ms step_avg:151.50ms step:636/1480 train_time:94844ms step_avg:151.51ms step:637/1480 train_time:95005ms step_avg:151.52ms step:638/1480 train_time:95165ms step_avg:151.54ms step:639/1480 train_time:95323ms step_avg:151.55ms step:640/1480 train_time:95483ms step_avg:151.56ms step:641/1480 train_time:95643ms step_avg:151.57ms step:642/1480 train_time:95803ms step_avg:151.59ms step:643/1480 train_time:95962ms step_avg:151.60ms step:644/1480 train_time:96122ms step_avg:151.61ms step:645/1480 train_time:96281ms step_avg:151.62ms step:646/1480 train_time:96439ms step_avg:151.63ms step:647/1480 train_time:96597ms step_avg:151.64ms step:648/1480 train_time:96759ms step_avg:151.66ms step:649/1480 
train_time:96917ms step_avg:151.67ms step:650/1480 train_time:97078ms step_avg:151.68ms step:651/1480 train_time:97238ms step_avg:151.70ms step:652/1480 train_time:97398ms step_avg:151.71ms step:653/1480 train_time:97556ms step_avg:151.72ms step:654/1480 train_time:97715ms step_avg:151.73ms step:655/1480 train_time:97874ms step_avg:151.74ms step:656/1480 train_time:98032ms step_avg:151.75ms step:657/1480 train_time:98193ms step_avg:151.77ms step:658/1480 train_time:98353ms step_avg:151.78ms step:659/1480 train_time:98514ms step_avg:151.79ms step:660/1480 train_time:98677ms step_avg:151.81ms step:661/1480 train_time:98840ms step_avg:151.83ms step:662/1480 train_time:99002ms step_avg:151.84ms step:663/1480 train_time:99163ms step_avg:151.86ms step:664/1480 train_time:99325ms step_avg:151.87ms step:665/1480 train_time:99488ms step_avg:151.89ms step:666/1480 train_time:99648ms step_avg:151.90ms step:667/1480 train_time:99809ms step_avg:151.92ms step:668/1480 train_time:99969ms step_avg:151.93ms step:669/1480 train_time:100132ms step_avg:151.95ms step:670/1480 train_time:100292ms step_avg:151.96ms step:671/1480 train_time:100453ms step_avg:151.97ms step:672/1480 train_time:100613ms step_avg:151.98ms step:673/1480 train_time:100777ms step_avg:152.00ms step:674/1480 train_time:100939ms step_avg:152.02ms step:675/1480 train_time:101103ms step_avg:152.03ms step:676/1480 train_time:101266ms step_avg:152.05ms step:677/1480 train_time:101426ms step_avg:152.06ms step:678/1480 train_time:101588ms step_avg:152.08ms step:679/1480 train_time:101748ms step_avg:152.09ms step:680/1480 train_time:101911ms step_avg:152.11ms step:681/1480 train_time:102070ms step_avg:152.12ms step:682/1480 train_time:102232ms step_avg:152.13ms step:683/1480 train_time:102393ms step_avg:152.14ms step:684/1480 train_time:102556ms step_avg:152.16ms step:685/1480 train_time:102720ms step_avg:152.18ms step:686/1480 train_time:102884ms step_avg:152.19ms step:687/1480 train_time:103044ms step_avg:152.21ms 
step:688/1480 train_time:103207ms step_avg:152.22ms step:689/1480 train_time:103370ms step_avg:152.24ms step:690/1480 train_time:103533ms step_avg:152.26ms step:691/1480 train_time:103693ms step_avg:152.27ms step:692/1480 train_time:103853ms step_avg:152.28ms step:693/1480 train_time:104014ms step_avg:152.29ms step:694/1480 train_time:104179ms step_avg:152.31ms step:695/1480 train_time:104342ms step_avg:152.32ms step:696/1480 train_time:104504ms step_avg:152.34ms step:697/1480 train_time:104666ms step_avg:152.35ms step:698/1480 train_time:104826ms step_avg:152.36ms step:699/1480 train_time:104989ms step_avg:152.38ms step:700/1480 train_time:105151ms step_avg:152.39ms step:701/1480 train_time:105310ms step_avg:152.40ms step:702/1480 train_time:105471ms step_avg:152.42ms step:703/1480 train_time:105632ms step_avg:152.43ms step:704/1480 train_time:105792ms step_avg:152.44ms step:705/1480 train_time:105955ms step_avg:152.45ms step:706/1480 train_time:106120ms step_avg:152.47ms step:707/1480 train_time:106283ms step_avg:152.49ms step:708/1480 train_time:106444ms step_avg:152.50ms step:709/1480 train_time:106605ms step_avg:152.51ms step:710/1480 train_time:106766ms step_avg:152.52ms step:711/1480 train_time:106927ms step_avg:152.54ms step:712/1480 train_time:107092ms step_avg:152.55ms step:713/1480 train_time:107254ms step_avg:152.57ms step:714/1480 train_time:107414ms step_avg:152.58ms step:715/1480 train_time:107576ms step_avg:152.59ms step:716/1480 train_time:107736ms step_avg:152.60ms step:717/1480 train_time:107900ms step_avg:152.62ms step:718/1480 train_time:108060ms step_avg:152.63ms step:719/1480 train_time:108222ms step_avg:152.64ms step:720/1480 train_time:108387ms step_avg:152.66ms step:721/1480 train_time:108548ms step_avg:152.67ms step:722/1480 train_time:108710ms step_avg:152.68ms step:723/1480 train_time:108869ms step_avg:152.69ms step:724/1480 train_time:109030ms step_avg:152.70ms step:725/1480 train_time:109192ms step_avg:152.72ms step:726/1480 
train_time:109355ms step_avg:152.73ms step:727/1480 train_time:109519ms step_avg:152.75ms step:728/1480 train_time:109682ms step_avg:152.76ms step:729/1480 train_time:109843ms step_avg:152.77ms step:730/1480 train_time:110007ms step_avg:152.79ms step:731/1480 train_time:110168ms step_avg:152.80ms step:732/1480 train_time:110328ms step_avg:152.81ms step:733/1480 train_time:110489ms step_avg:152.82ms step:734/1480 train_time:110651ms step_avg:152.83ms step:735/1480 train_time:110811ms step_avg:152.84ms step:736/1480 train_time:110973ms step_avg:152.86ms step:737/1480 train_time:111135ms step_avg:152.87ms step:738/1480 train_time:111297ms step_avg:152.88ms step:739/1480 train_time:111457ms step_avg:152.89ms step:740/1480 train_time:111622ms step_avg:152.91ms step:741/1480 train_time:111786ms step_avg:152.92ms step:742/1480 train_time:111946ms step_avg:152.93ms step:743/1480 train_time:112107ms step_avg:152.94ms step:744/1480 train_time:112270ms step_avg:152.96ms step:745/1480 train_time:112433ms step_avg:152.97ms step:746/1480 train_time:112592ms step_avg:152.98ms step:747/1480 train_time:112754ms step_avg:152.99ms step:748/1480 train_time:112919ms step_avg:153.01ms step:749/1480 train_time:113084ms step_avg:153.02ms step:750/1480 train_time:113244ms step_avg:153.03ms step:750/1480 val_loss:3.5460 train_time:113307ms step_avg:153.12ms step:751/1480 train_time:113408ms step_avg:153.05ms step:752/1480 train_time:113570ms step_avg:153.06ms step:753/1480 train_time:113731ms step_avg:153.07ms step:754/1480 train_time:113892ms step_avg:153.08ms step:755/1480 train_time:114052ms step_avg:153.09ms step:756/1480 train_time:114213ms step_avg:153.10ms step:757/1480 train_time:114377ms step_avg:153.11ms step:758/1480 train_time:114538ms step_avg:153.13ms step:759/1480 train_time:114699ms step_avg:153.14ms step:760/1480 train_time:114861ms step_avg:153.15ms step:761/1480 train_time:115024ms step_avg:153.16ms step:762/1480 train_time:115186ms step_avg:153.17ms step:763/1480 
train_time:115347ms step_avg:153.18ms step:764/1480 train_time:115508ms step_avg:153.19ms step:765/1480 train_time:115670ms step_avg:153.20ms step:766/1480 train_time:115833ms step_avg:153.22ms step:767/1480 train_time:115993ms step_avg:153.23ms step:768/1480 train_time:116157ms step_avg:153.24ms step:769/1480 train_time:116321ms step_avg:153.26ms step:770/1480 train_time:116485ms step_avg:153.27ms step:771/1480 train_time:116648ms step_avg:153.28ms step:772/1480 train_time:116810ms step_avg:153.29ms step:773/1480 train_time:116971ms step_avg:153.30ms step:774/1480 train_time:117133ms step_avg:153.32ms step:775/1480 train_time:117295ms step_avg:153.33ms step:776/1480 train_time:117462ms step_avg:153.34ms step:777/1480 train_time:117628ms step_avg:153.36ms step:778/1480 train_time:117790ms step_avg:153.37ms step:779/1480 train_time:117952ms step_avg:153.38ms step:780/1480 train_time:118115ms step_avg:153.40ms step:781/1480 train_time:118279ms step_avg:153.41ms step:782/1480 train_time:118444ms step_avg:153.43ms step:783/1480 train_time:118606ms step_avg:153.44ms step:784/1480 train_time:118770ms step_avg:153.45ms step:785/1480 train_time:118931ms step_avg:153.46ms step:786/1480 train_time:119096ms step_avg:153.47ms step:787/1480 train_time:119260ms step_avg:153.49ms step:788/1480 train_time:119425ms step_avg:153.50ms step:789/1480 train_time:119586ms step_avg:153.51ms step:790/1480 train_time:119751ms step_avg:153.53ms step:791/1480 train_time:119917ms step_avg:153.54ms step:792/1480 train_time:120082ms step_avg:153.56ms step:793/1480 train_time:120245ms step_avg:153.57ms step:794/1480 train_time:120408ms step_avg:153.58ms step:795/1480 train_time:120573ms step_avg:153.60ms step:796/1480 train_time:120739ms step_avg:153.61ms step:797/1480 train_time:120904ms step_avg:153.63ms step:798/1480 train_time:121068ms step_avg:153.64ms step:799/1480 train_time:121234ms step_avg:153.66ms step:800/1480 train_time:121397ms step_avg:153.67ms step:801/1480 train_time:121561ms 
step_avg:153.68ms step:802/1480 train_time:121729ms step_avg:153.70ms step:803/1480 train_time:121891ms step_avg:153.71ms step:804/1480 train_time:122053ms step_avg:153.72ms step:805/1480 train_time:122219ms step_avg:153.73ms step:806/1480 train_time:122381ms step_avg:153.75ms step:807/1480 train_time:122544ms step_avg:153.76ms step:808/1480 train_time:122707ms step_avg:153.77ms step:809/1480 train_time:122869ms step_avg:153.78ms step:810/1480 train_time:123030ms step_avg:153.79ms step:811/1480 train_time:123192ms step_avg:153.80ms step:812/1480 train_time:123354ms step_avg:153.81ms step:813/1480 train_time:123516ms step_avg:153.82ms step:814/1480 train_time:123680ms step_avg:153.83ms step:815/1480 train_time:123844ms step_avg:153.84ms step:816/1480 train_time:124006ms step_avg:153.85ms step:817/1480 train_time:124168ms step_avg:153.86ms step:818/1480 train_time:124329ms step_avg:153.87ms step:819/1480 train_time:124493ms step_avg:153.88ms step:820/1480 train_time:124657ms step_avg:153.90ms step:821/1480 train_time:124819ms step_avg:153.91ms step:822/1480 train_time:124981ms step_avg:153.92ms step:823/1480 train_time:125145ms step_avg:153.93ms step:824/1480 train_time:125306ms step_avg:153.94ms step:825/1480 train_time:125471ms step_avg:153.95ms step:826/1480 train_time:125636ms step_avg:153.97ms step:827/1480 train_time:125801ms step_avg:153.98ms step:828/1480 train_time:125964ms step_avg:153.99ms step:829/1480 train_time:126128ms step_avg:154.00ms step:830/1480 train_time:126293ms step_avg:154.02ms step:831/1480 train_time:126456ms step_avg:154.03ms step:832/1480 train_time:126621ms step_avg:154.04ms step:833/1480 train_time:126787ms step_avg:154.05ms step:834/1480 train_time:126950ms step_avg:154.07ms step:835/1480 train_time:127112ms step_avg:154.08ms step:836/1480 train_time:127277ms step_avg:154.09ms step:837/1480 train_time:127440ms step_avg:154.10ms step:838/1480 train_time:127604ms step_avg:154.11ms step:839/1480 train_time:127767ms step_avg:154.12ms 
step:840/1480 train_time:127929ms step_avg:154.13ms step:841/1480 train_time:128089ms step_avg:154.14ms step:842/1480 train_time:128253ms step_avg:154.15ms step:843/1480 train_time:128415ms step_avg:154.16ms step:844/1480 train_time:128579ms step_avg:154.17ms step:845/1480 train_time:128745ms step_avg:154.19ms step:846/1480 train_time:128908ms step_avg:154.20ms step:847/1480 train_time:129072ms step_avg:154.21ms step:848/1480 train_time:129235ms step_avg:154.22ms step:849/1480 train_time:129399ms step_avg:154.23ms step:850/1480 train_time:129563ms step_avg:154.24ms step:851/1480 train_time:129728ms step_avg:154.25ms step:852/1480 train_time:129889ms step_avg:154.26ms step:853/1480 train_time:130051ms step_avg:154.27ms step:854/1480 train_time:130216ms step_avg:154.28ms step:855/1480 train_time:130381ms step_avg:154.30ms step:856/1480 train_time:130545ms step_avg:154.31ms step:857/1480 train_time:130708ms step_avg:154.32ms step:858/1480 train_time:130873ms step_avg:154.33ms step:859/1480 train_time:131036ms step_avg:154.34ms step:860/1480 train_time:131198ms step_avg:154.35ms step:861/1480 train_time:131364ms step_avg:154.36ms step:862/1480 train_time:131530ms step_avg:154.38ms step:863/1480 train_time:131698ms step_avg:154.39ms step:864/1480 train_time:131863ms step_avg:154.41ms step:865/1480 train_time:132025ms step_avg:154.42ms step:866/1480 train_time:132191ms step_avg:154.43ms step:867/1480 train_time:132353ms step_avg:154.44ms step:868/1480 train_time:132513ms step_avg:154.44ms step:869/1480 train_time:132676ms step_avg:154.45ms step:870/1480 train_time:132841ms step_avg:154.47ms step:871/1480 train_time:133005ms step_avg:154.48ms step:872/1480 train_time:133168ms step_avg:154.49ms step:873/1480 train_time:133329ms step_avg:154.50ms step:874/1480 train_time:133495ms step_avg:154.51ms step:875/1480 train_time:133660ms step_avg:154.52ms step:875/1480 val_loss:3.5010 train_time:133725ms step_avg:154.60ms step:876/1480 train_time:133827ms step_avg:154.53ms 
step:877/1480 train_time:133993ms step_avg:154.55ms step:878/1480 train_time:134156ms step_avg:154.56ms step:879/1480 train_time:134321ms step_avg:154.57ms step:880/1480 train_time:134483ms step_avg:154.58ms step:881/1480 train_time:134644ms step_avg:154.59ms step:882/1480 train_time:134811ms step_avg:154.60ms step:883/1480 train_time:134979ms step_avg:154.61ms step:884/1480 train_time:135144ms step_avg:154.63ms step:885/1480 train_time:135309ms step_avg:154.64ms step:886/1480 train_time:135475ms step_avg:154.65ms step:887/1480 train_time:135644ms step_avg:154.67ms step:888/1480 train_time:135818ms step_avg:154.69ms step:889/1480 train_time:135986ms step_avg:154.71ms step:890/1480 train_time:136148ms step_avg:154.71ms step:891/1480 train_time:136315ms step_avg:154.73ms step:892/1480 train_time:136480ms step_avg:154.74ms step:893/1480 train_time:136642ms step_avg:154.75ms step:894/1480 train_time:136810ms step_avg:154.76ms step:895/1480 train_time:136979ms step_avg:154.78ms step:896/1480 train_time:137143ms step_avg:154.79ms step:897/1480 train_time:137311ms step_avg:154.80ms step:898/1480 train_time:137479ms step_avg:154.82ms step:899/1480 train_time:137643ms step_avg:154.83ms step:900/1480 train_time:137807ms step_avg:154.84ms step:901/1480 train_time:137972ms step_avg:154.85ms step:902/1480 train_time:138136ms step_avg:154.86ms step:903/1480 train_time:138309ms step_avg:154.88ms step:904/1480 train_time:138476ms step_avg:154.89ms step:905/1480 train_time:138638ms step_avg:154.90ms step:906/1480 train_time:138804ms step_avg:154.92ms step:907/1480 train_time:138971ms step_avg:154.93ms step:908/1480 train_time:139134ms step_avg:154.94ms step:909/1480 train_time:139300ms step_avg:154.95ms step:910/1480 train_time:139469ms step_avg:154.97ms step:911/1480 train_time:139633ms step_avg:154.98ms step:912/1480 train_time:139800ms step_avg:154.99ms step:913/1480 train_time:139967ms step_avg:155.00ms step:914/1480 train_time:140133ms step_avg:155.01ms step:915/1480 
train_time:140302ms step_avg:155.03ms step:916/1480 train_time:140465ms step_avg:155.04ms step:917/1480 train_time:140630ms step_avg:155.05ms step:918/1480 train_time:140799ms step_avg:155.07ms step:919/1480 train_time:140968ms step_avg:155.08ms step:920/1480 train_time:141133ms step_avg:155.09ms step:921/1480 train_time:141299ms step_avg:155.10ms step:922/1480 train_time:141464ms step_avg:155.11ms step:923/1480 train_time:141627ms step_avg:155.12ms step:924/1480 train_time:141792ms step_avg:155.13ms step:925/1480 train_time:141957ms step_avg:155.14ms step:926/1480 train_time:142121ms step_avg:155.15ms step:927/1480 train_time:142284ms step_avg:155.16ms step:928/1480 train_time:142450ms step_avg:155.17ms step:929/1480 train_time:142617ms step_avg:155.19ms step:930/1480 train_time:142781ms step_avg:155.20ms step:931/1480 train_time:142944ms step_avg:155.20ms step:932/1480 train_time:143109ms step_avg:155.22ms step:933/1480 train_time:143278ms step_avg:155.23ms step:934/1480 train_time:143443ms step_avg:155.24ms step:935/1480 train_time:143614ms step_avg:155.26ms step:936/1480 train_time:143781ms step_avg:155.27ms step:937/1480 train_time:143952ms step_avg:155.29ms step:938/1480 train_time:144114ms step_avg:155.30ms step:939/1480 train_time:144284ms step_avg:155.31ms step:940/1480 train_time:144451ms step_avg:155.32ms step:941/1480 train_time:144616ms step_avg:155.33ms step:942/1480 train_time:144782ms step_avg:155.35ms step:943/1480 train_time:144950ms step_avg:155.36ms step:944/1480 train_time:145122ms step_avg:155.38ms step:945/1480 train_time:145286ms step_avg:155.39ms step:946/1480 train_time:145457ms step_avg:155.40ms step:947/1480 train_time:145624ms step_avg:155.42ms step:948/1480 train_time:145789ms step_avg:155.42ms step:949/1480 train_time:145954ms step_avg:155.44ms step:950/1480 train_time:146119ms step_avg:155.45ms step:951/1480 train_time:146287ms step_avg:155.46ms step:952/1480 train_time:146453ms step_avg:155.47ms step:953/1480 train_time:146622ms 
step_avg:155.49ms step:954/1480 train_time:146791ms step_avg:155.50ms step:955/1480 train_time:146955ms step_avg:155.51ms step:956/1480 train_time:147121ms step_avg:155.52ms step:957/1480 train_time:147289ms step_avg:155.53ms step:958/1480 train_time:147459ms step_avg:155.55ms step:959/1480 train_time:147624ms step_avg:155.56ms step:960/1480 train_time:147789ms step_avg:155.57ms step:961/1480 train_time:147954ms step_avg:155.58ms step:962/1480 train_time:148120ms step_avg:155.59ms step:963/1480 train_time:148285ms step_avg:155.60ms step:964/1480 train_time:148452ms step_avg:155.61ms step:965/1480 train_time:148617ms step_avg:155.62ms step:966/1480 train_time:148781ms step_avg:155.63ms step:967/1480 train_time:148945ms step_avg:155.64ms step:968/1480 train_time:149112ms step_avg:155.65ms step:969/1480 train_time:149280ms step_avg:155.66ms step:970/1480 train_time:149442ms step_avg:155.67ms step:971/1480 train_time:149607ms step_avg:155.68ms step:972/1480 train_time:149770ms step_avg:155.69ms step:973/1480 train_time:149935ms step_avg:155.70ms step:974/1480 train_time:150104ms step_avg:155.71ms step:975/1480 train_time:150268ms step_avg:155.72ms step:976/1480 train_time:150432ms step_avg:155.73ms step:977/1480 train_time:150598ms step_avg:155.74ms step:978/1480 train_time:150763ms step_avg:155.75ms step:979/1480 train_time:150928ms step_avg:155.76ms step:980/1480 train_time:151095ms step_avg:155.77ms step:981/1480 train_time:151261ms step_avg:155.78ms step:982/1480 train_time:151425ms step_avg:155.79ms step:983/1480 train_time:151591ms step_avg:155.80ms step:984/1480 train_time:151755ms step_avg:155.81ms step:985/1480 train_time:151923ms step_avg:155.82ms step:986/1480 train_time:152088ms step_avg:155.83ms step:987/1480 train_time:152253ms step_avg:155.84ms step:988/1480 train_time:152421ms step_avg:155.85ms step:989/1480 train_time:152585ms step_avg:155.86ms step:990/1480 train_time:152755ms step_avg:155.87ms step:991/1480 train_time:152922ms step_avg:155.88ms 
step:992/1480 train_time:153097ms step_avg:155.90ms step:993/1480 train_time:153273ms step_avg:155.92ms step:994/1480 train_time:153439ms step_avg:155.93ms step:995/1480 train_time:153603ms step_avg:155.94ms step:996/1480 train_time:153765ms step_avg:155.95ms step:997/1480 train_time:153929ms step_avg:155.96ms step:998/1480 train_time:154094ms step_avg:155.97ms step:999/1480 train_time:154259ms step_avg:155.97ms step:1000/1480 train_time:154428ms step_avg:155.99ms step:1000/1480 val_loss:3.4376 train_time:154497ms step_avg:156.06ms step:1001/1480 train_time:154599ms step_avg:156.00ms step:1002/1480 train_time:154764ms step_avg:156.01ms step:1003/1480 train_time:154936ms step_avg:156.03ms step:1004/1480 train_time:155103ms step_avg:156.04ms step:1005/1480 train_time:155272ms step_avg:156.05ms step:1006/1480 train_time:155439ms step_avg:156.06ms step:1007/1480 train_time:155603ms step_avg:156.07ms step:1008/1480 train_time:155770ms step_avg:156.08ms step:1009/1480 train_time:155944ms step_avg:156.10ms step:1010/1480 train_time:156109ms step_avg:156.11ms step:1011/1480 train_time:156276ms step_avg:156.12ms step:1012/1480 train_time:156442ms step_avg:156.13ms step:1013/1480 train_time:156612ms step_avg:156.14ms step:1014/1480 train_time:156779ms step_avg:156.15ms step:1015/1480 train_time:156950ms step_avg:156.17ms step:1016/1480 train_time:157121ms step_avg:156.18ms step:1017/1480 train_time:157290ms step_avg:156.20ms step:1018/1480 train_time:157459ms step_avg:156.21ms step:1019/1480 train_time:157627ms step_avg:156.22ms step:1020/1480 train_time:157796ms step_avg:156.23ms step:1021/1480 train_time:157962ms step_avg:156.24ms step:1022/1480 train_time:158128ms step_avg:156.25ms step:1023/1480 train_time:158295ms step_avg:156.26ms step:1024/1480 train_time:158462ms step_avg:156.27ms step:1025/1480 train_time:158631ms step_avg:156.29ms step:1026/1480 train_time:158797ms step_avg:156.30ms step:1027/1480 train_time:158963ms step_avg:156.31ms step:1028/1480 
train_time:159135ms step_avg:156.32ms step:1029/1480 train_time:159309ms step_avg:156.34ms step:1030/1480 train_time:159478ms step_avg:156.35ms step:1031/1480 train_time:159643ms step_avg:156.36ms step:1032/1480 train_time:159816ms step_avg:156.38ms step:1033/1480 train_time:159982ms step_avg:156.39ms step:1034/1480 train_time:160152ms step_avg:156.40ms step:1035/1480 train_time:160320ms step_avg:156.41ms step:1036/1480 train_time:160484ms step_avg:156.42ms step:1037/1480 train_time:160653ms step_avg:156.43ms step:1038/1480 train_time:160821ms step_avg:156.44ms step:1039/1480 train_time:160992ms step_avg:156.45ms step:1040/1480 train_time:161159ms step_avg:156.47ms step:1041/1480 train_time:161326ms step_avg:156.47ms step:1042/1480 train_time:161489ms step_avg:156.48ms step:1043/1480 train_time:161657ms step_avg:156.49ms step:1044/1480 train_time:161822ms step_avg:156.50ms step:1045/1480 train_time:161991ms step_avg:156.51ms step:1046/1480 train_time:162161ms step_avg:156.53ms step:1047/1480 train_time:162326ms step_avg:156.53ms step:1048/1480 train_time:162491ms step_avg:156.54ms step:1049/1480 train_time:162658ms step_avg:156.55ms step:1050/1480 train_time:162825ms step_avg:156.56ms step:1051/1480 train_time:162994ms step_avg:156.57ms step:1052/1480 train_time:163164ms step_avg:156.59ms step:1053/1480 train_time:163331ms step_avg:156.60ms step:1054/1480 train_time:163499ms step_avg:156.61ms step:1055/1480 train_time:163664ms step_avg:156.62ms step:1056/1480 train_time:163828ms step_avg:156.62ms step:1057/1480 train_time:163995ms step_avg:156.63ms step:1058/1480 train_time:164165ms step_avg:156.65ms step:1059/1480 train_time:164338ms step_avg:156.66ms step:1060/1480 train_time:164507ms step_avg:156.67ms step:1061/1480 train_time:164672ms step_avg:156.68ms step:1062/1480 train_time:164839ms step_avg:156.69ms step:1063/1480 train_time:165002ms step_avg:156.70ms step:1064/1480 train_time:165166ms step_avg:156.70ms step:1065/1480 train_time:165334ms step_avg:156.71ms 
step:1066/1480 train_time:165501ms step_avg:156.72ms step:1067/1480 train_time:165669ms step_avg:156.73ms step:1068/1480 train_time:165835ms step_avg:156.74ms step:1069/1480 train_time:166004ms step_avg:156.76ms step:1070/1480 train_time:166169ms step_avg:156.76ms step:1071/1480 train_time:166342ms step_avg:156.78ms step:1072/1480 train_time:166507ms step_avg:156.79ms step:1073/1480 train_time:166670ms step_avg:156.79ms step:1074/1480 train_time:166838ms step_avg:156.80ms step:1075/1480 train_time:167007ms step_avg:156.81ms step:1076/1480 train_time:167174ms step_avg:156.82ms step:1077/1480 train_time:167341ms step_avg:156.83ms step:1078/1480 train_time:167515ms step_avg:156.85ms step:1079/1480 train_time:167687ms step_avg:156.86ms step:1080/1480 train_time:167859ms step_avg:156.88ms step:1081/1480 train_time:168025ms step_avg:156.89ms step:1082/1480 train_time:168190ms step_avg:156.89ms step:1083/1480 train_time:168358ms step_avg:156.90ms step:1084/1480 train_time:168524ms step_avg:156.91ms step:1085/1480 train_time:168693ms step_avg:156.92ms step:1086/1480 train_time:168862ms step_avg:156.94ms step:1087/1480 train_time:169028ms step_avg:156.94ms step:1088/1480 train_time:169198ms step_avg:156.96ms step:1089/1480 train_time:169369ms step_avg:156.97ms step:1090/1480 train_time:169543ms step_avg:156.98ms step:1091/1480 train_time:169709ms step_avg:156.99ms step:1092/1480 train_time:169877ms step_avg:157.00ms step:1093/1480 train_time:170045ms step_avg:157.01ms step:1094/1480 train_time:170211ms step_avg:157.02ms step:1095/1480 train_time:170376ms step_avg:157.03ms step:1096/1480 train_time:170545ms step_avg:157.04ms step:1097/1480 train_time:170715ms step_avg:157.05ms step:1098/1480 train_time:170886ms step_avg:157.06ms step:1099/1480 train_time:171057ms step_avg:157.08ms step:1100/1480 train_time:171228ms step_avg:157.09ms step:1101/1480 train_time:171399ms step_avg:157.10ms step:1102/1480 train_time:171569ms step_avg:157.11ms step:1103/1480 train_time:171745ms 
step_avg:157.13ms step:1104/1480 train_time:171914ms step_avg:157.14ms step:1105/1480 train_time:172083ms step_avg:157.15ms step:1106/1480 train_time:172250ms step_avg:157.16ms step:1107/1480 train_time:172421ms step_avg:157.18ms step:1108/1480 train_time:172586ms step_avg:157.18ms step:1109/1480 train_time:172752ms step_avg:157.19ms step:1110/1480 train_time:172919ms step_avg:157.20ms step:1111/1480 train_time:173083ms step_avg:157.21ms step:1112/1480 train_time:173254ms step_avg:157.22ms step:1113/1480 train_time:173434ms step_avg:157.24ms step:1114/1480 train_time:173605ms step_avg:157.25ms step:1115/1480 train_time:173777ms step_avg:157.26ms step:1116/1480 train_time:173946ms step_avg:157.27ms step:1117/1480 train_time:174120ms step_avg:157.29ms step:1118/1480 train_time:174294ms step_avg:157.31ms step:1119/1480 train_time:174460ms step_avg:157.31ms step:1120/1480 train_time:174627ms step_avg:157.32ms step:1121/1480 train_time:174798ms step_avg:157.33ms step:1122/1480 train_time:174965ms step_avg:157.34ms step:1123/1480 train_time:175132ms step_avg:157.35ms step:1124/1480 train_time:175300ms step_avg:157.36ms step:1125/1480 train_time:175466ms step_avg:157.37ms step:1125/1480 val_loss:3.3829 train_time:175535ms step_avg:157.43ms step:1126/1480 train_time:175636ms step_avg:157.38ms step:1127/1480 train_time:175806ms step_avg:157.39ms step:1128/1480 train_time:175978ms step_avg:157.40ms step:1129/1480 train_time:176153ms step_avg:157.42ms step:1130/1480 train_time:176322ms step_avg:157.43ms step:1131/1480 train_time:176498ms step_avg:157.45ms step:1132/1480 train_time:176663ms step_avg:157.45ms step:1133/1480 train_time:176834ms step_avg:157.47ms step:1134/1480 train_time:177004ms step_avg:157.48ms step:1135/1480 train_time:177173ms step_avg:157.49ms step:1136/1480 train_time:177343ms step_avg:157.50ms step:1137/1480 train_time:177513ms step_avg:157.51ms step:1138/1480 train_time:177685ms step_avg:157.52ms step:1139/1480 train_time:177853ms step_avg:157.53ms 
step:1140/1480 train_time:178020ms step_avg:157.54ms step:1141/1480 train_time:178193ms step_avg:157.55ms step:1142/1480 train_time:178360ms step_avg:157.56ms step:1143/1480 train_time:178531ms step_avg:157.57ms step:1144/1480 train_time:178698ms step_avg:157.58ms step:1145/1480 train_time:178865ms step_avg:157.59ms step:1146/1480 train_time:179035ms step_avg:157.60ms step:1147/1480 train_time:179205ms step_avg:157.61ms step:1148/1480 train_time:179373ms step_avg:157.62ms step:1149/1480 train_time:179544ms step_avg:157.63ms step:1150/1480 train_time:179713ms step_avg:157.64ms step:1151/1480 train_time:179885ms step_avg:157.66ms step:1152/1480 train_time:180057ms step_avg:157.67ms step:1153/1480 train_time:180231ms step_avg:157.68ms step:1154/1480 train_time:180398ms step_avg:157.69ms step:1155/1480 train_time:180571ms step_avg:157.70ms step:1156/1480 train_time:180749ms step_avg:157.72ms step:1157/1480 train_time:180918ms step_avg:157.73ms step:1158/1480 train_time:181086ms step_avg:157.74ms step:1159/1480 train_time:181254ms step_avg:157.75ms step:1160/1480 train_time:181419ms step_avg:157.76ms step:1161/1480 train_time:181592ms step_avg:157.77ms step:1162/1480 train_time:181762ms step_avg:157.78ms step:1163/1480 train_time:181932ms step_avg:157.79ms step:1164/1480 train_time:182099ms step_avg:157.80ms step:1165/1480 train_time:182263ms step_avg:157.80ms step:1166/1480 train_time:182432ms step_avg:157.81ms step:1167/1480 train_time:182599ms step_avg:157.82ms step:1168/1480 train_time:182769ms step_avg:157.83ms step:1169/1480 train_time:182936ms step_avg:157.84ms step:1170/1480 train_time:183104ms step_avg:157.85ms step:1171/1480 train_time:183272ms step_avg:157.86ms step:1172/1480 train_time:183436ms step_avg:157.86ms step:1173/1480 train_time:183609ms step_avg:157.88ms step:1174/1480 train_time:183792ms step_avg:157.90ms step:1175/1480 train_time:183962ms step_avg:157.91ms step:1176/1480 train_time:184133ms step_avg:157.92ms step:1177/1480 train_time:184309ms 
step_avg:157.93ms step:1178/1480 train_time:184477ms step_avg:157.94ms step:1179/1480 train_time:184642ms step_avg:157.95ms step:1180/1480 train_time:184824ms step_avg:157.97ms step:1181/1480 train_time:184993ms step_avg:157.98ms step:1182/1480 train_time:185160ms step_avg:157.99ms step:1183/1480 train_time:185330ms step_avg:158.00ms step:1184/1480 train_time:185498ms step_avg:158.01ms step:1185/1480 train_time:185673ms step_avg:158.02ms step:1186/1480 train_time:185842ms step_avg:158.03ms step:1187/1480 train_time:186026ms step_avg:158.05ms step:1188/1480 train_time:186193ms step_avg:158.06ms step:1189/1480 train_time:186362ms step_avg:158.07ms step:1190/1480 train_time:186530ms step_avg:158.08ms step:1191/1480 train_time:186700ms step_avg:158.09ms step:1192/1480 train_time:186866ms step_avg:158.09ms step:1193/1480 train_time:187032ms step_avg:158.10ms step:1194/1480 train_time:187200ms step_avg:158.11ms step:1195/1480 train_time:187374ms step_avg:158.12ms step:1196/1480 train_time:187557ms step_avg:158.14ms step:1197/1480 train_time:187729ms step_avg:158.15ms step:1198/1480 train_time:187912ms step_avg:158.17ms step:1199/1480 train_time:188082ms step_avg:158.18ms step:1200/1480 train_time:188250ms step_avg:158.19ms step:1201/1480 train_time:188417ms step_avg:158.20ms step:1202/1480 train_time:188599ms step_avg:158.22ms step:1203/1480 train_time:188774ms step_avg:158.23ms step:1204/1480 train_time:188947ms step_avg:158.25ms step:1205/1480 train_time:189115ms step_avg:158.26ms step:1206/1480 train_time:189285ms step_avg:158.26ms step:1207/1480 train_time:189454ms step_avg:158.27ms step:1208/1480 train_time:189622ms step_avg:158.28ms step:1209/1480 train_time:189796ms step_avg:158.30ms step:1210/1480 train_time:189972ms step_avg:158.31ms step:1211/1480 train_time:190144ms step_avg:158.32ms step:1212/1480 train_time:190317ms step_avg:158.33ms step:1213/1480 train_time:190490ms step_avg:158.35ms step:1214/1480 train_time:190668ms step_avg:158.36ms step:1215/1480 
train_time:190841ms step_avg:158.37ms step:1216/1480 train_time:191011ms step_avg:158.38ms step:1217/1480 train_time:191185ms step_avg:158.40ms step:1218/1480 train_time:191355ms step_avg:158.41ms step:1219/1480 train_time:191533ms step_avg:158.42ms step:1220/1480 train_time:191701ms step_avg:158.43ms step:1221/1480 train_time:191872ms step_avg:158.44ms step:1222/1480 train_time:192037ms step_avg:158.45ms step:1223/1480 train_time:192209ms step_avg:158.46ms step:1224/1480 train_time:192389ms step_avg:158.48ms step:1225/1480 train_time:192561ms step_avg:158.49ms step:1226/1480 train_time:192733ms step_avg:158.50ms step:1227/1480 train_time:192905ms step_avg:158.51ms step:1228/1480 train_time:193074ms step_avg:158.52ms step:1229/1480 train_time:193246ms step_avg:158.53ms step:1230/1480 train_time:193425ms step_avg:158.55ms step:1231/1480 train_time:193601ms step_avg:158.56ms step:1232/1480 train_time:193777ms step_avg:158.57ms step:1233/1480 train_time:193949ms step_avg:158.58ms step:1234/1480 train_time:194120ms step_avg:158.59ms step:1235/1480 train_time:194294ms step_avg:158.61ms step:1236/1480 train_time:194462ms step_avg:158.62ms step:1237/1480 train_time:194633ms step_avg:158.62ms step:1238/1480 train_time:194818ms step_avg:158.65ms step:1239/1480 train_time:194990ms step_avg:158.66ms step:1240/1480 train_time:195160ms step_avg:158.67ms step:1241/1480 train_time:195332ms step_avg:158.68ms step:1242/1480 train_time:195500ms step_avg:158.69ms step:1243/1480 train_time:195674ms step_avg:158.70ms step:1244/1480 train_time:195839ms step_avg:158.70ms step:1245/1480 train_time:196008ms step_avg:158.71ms step:1246/1480 train_time:196178ms step_avg:158.72ms step:1247/1480 train_time:196349ms step_avg:158.73ms step:1248/1480 train_time:196518ms step_avg:158.74ms step:1249/1480 train_time:196688ms step_avg:158.75ms step:1250/1480 train_time:196857ms step_avg:158.76ms step:1250/1480 val_loss:3.3332 train_time:196927ms step_avg:158.81ms step:1251/1480 train_time:197037ms 
step_avg:158.77ms step:1252/1480 train_time:197207ms step_avg:158.78ms step:1253/1480 train_time:197376ms step_avg:158.79ms step:1254/1480 train_time:197547ms step_avg:158.80ms step:1255/1480 train_time:197733ms step_avg:158.82ms step:1256/1480 train_time:197907ms step_avg:158.83ms step:1257/1480 train_time:198077ms step_avg:158.84ms step:1258/1480 train_time:198251ms step_avg:158.86ms step:1259/1480 train_time:198422ms step_avg:158.86ms step:1260/1480 train_time:198589ms step_avg:158.87ms step:1261/1480 train_time:198760ms step_avg:158.88ms step:1262/1480 train_time:198937ms step_avg:158.90ms step:1263/1480 train_time:199113ms step_avg:158.91ms step:1264/1480 train_time:199278ms step_avg:158.91ms step:1265/1480 train_time:199445ms step_avg:158.92ms step:1266/1480 train_time:199617ms step_avg:158.93ms step:1267/1480 train_time:199789ms step_avg:158.94ms step:1268/1480 train_time:199960ms step_avg:158.95ms step:1269/1480 train_time:200137ms step_avg:158.96ms step:1270/1480 train_time:200306ms step_avg:158.97ms step:1271/1480 train_time:200478ms step_avg:158.98ms step:1272/1480 train_time:200643ms step_avg:158.99ms step:1273/1480 train_time:200815ms step_avg:159.00ms step:1274/1480 train_time:200987ms step_avg:159.01ms step:1275/1480 train_time:201155ms step_avg:159.02ms step:1276/1480 train_time:201320ms step_avg:159.02ms step:1277/1480 train_time:201491ms step_avg:159.03ms step:1278/1480 train_time:201657ms step_avg:159.04ms step:1279/1480 train_time:201829ms step_avg:159.05ms step:1280/1480 train_time:202010ms step_avg:159.06ms step:1281/1480 train_time:202179ms step_avg:159.07ms step:1282/1480 train_time:202345ms step_avg:159.08ms step:1283/1480 train_time:202517ms step_avg:159.09ms step:1284/1480 train_time:202688ms step_avg:159.10ms step:1285/1480 train_time:202857ms step_avg:159.10ms step:1286/1480 train_time:203025ms step_avg:159.11ms step:1287/1480 train_time:203197ms step_avg:159.12ms step:1288/1480 train_time:203368ms step_avg:159.13ms step:1289/1480 
train_time:203554ms step_avg:159.15ms step:1290/1480 train_time:203734ms step_avg:159.17ms step:1291/1480 train_time:203908ms step_avg:159.18ms step:1292/1480 train_time:204082ms step_avg:159.19ms step:1293/1480 train_time:204256ms step_avg:159.20ms step:1294/1480 train_time:204426ms step_avg:159.21ms step:1295/1480 train_time:204598ms step_avg:159.22ms step:1296/1480 train_time:204772ms step_avg:159.23ms step:1297/1480 train_time:204942ms step_avg:159.24ms step:1298/1480 train_time:205114ms step_avg:159.25ms step:1299/1480 train_time:205284ms step_avg:159.26ms step:1300/1480 train_time:205452ms step_avg:159.27ms step:1301/1480 train_time:205622ms step_avg:159.27ms step:1302/1480 train_time:205797ms step_avg:159.29ms step:1303/1480 train_time:205973ms step_avg:159.30ms step:1304/1480 train_time:206146ms step_avg:159.31ms step:1305/1480 train_time:206315ms step_avg:159.32ms step:1306/1480 train_time:206487ms step_avg:159.33ms step:1307/1480 train_time:206656ms step_avg:159.33ms step:1308/1480 train_time:206825ms step_avg:159.34ms step:1309/1480 train_time:206997ms step_avg:159.35ms step:1310/1480 train_time:207164ms step_avg:159.36ms step:1311/1480 train_time:207332ms step_avg:159.36ms step:1312/1480 train_time:207504ms step_avg:159.37ms step:1313/1480 train_time:207674ms step_avg:159.38ms step:1314/1480 train_time:207846ms step_avg:159.39ms step:1315/1480 train_time:208017ms step_avg:159.40ms step:1316/1480 train_time:208183ms step_avg:159.40ms step:1317/1480 train_time:208354ms step_avg:159.41ms step:1318/1480 train_time:208532ms step_avg:159.43ms step:1319/1480 train_time:208708ms step_avg:159.44ms step:1320/1480 train_time:208883ms step_avg:159.45ms step:1321/1480 train_time:209056ms step_avg:159.46ms step:1322/1480 train_time:209238ms step_avg:159.48ms step:1323/1480 train_time:209411ms step_avg:159.49ms step:1324/1480 train_time:209586ms step_avg:159.50ms step:1325/1480 train_time:209767ms step_avg:159.52ms step:1326/1480 train_time:209942ms step_avg:159.53ms 
step:1327/1480 train_time:210112ms step_avg:159.54ms step:1328/1480 train_time:210281ms step_avg:159.55ms step:1329/1480 train_time:210478ms step_avg:159.57ms step:1330/1480 train_time:210658ms step_avg:159.59ms step:1331/1480 train_time:210828ms step_avg:159.60ms step:1332/1480 train_time:211003ms step_avg:159.61ms step:1333/1480 train_time:211177ms step_avg:159.62ms step:1334/1480 train_time:211348ms step_avg:159.63ms step:1335/1480 train_time:211517ms step_avg:159.64ms step:1336/1480 train_time:211699ms step_avg:159.65ms step:1337/1480 train_time:211875ms step_avg:159.66ms step:1338/1480 train_time:212046ms step_avg:159.67ms step:1339/1480 train_time:212221ms step_avg:159.68ms step:1340/1480 train_time:212393ms step_avg:159.69ms step:1341/1480 train_time:212561ms step_avg:159.70ms step:1342/1480 train_time:212735ms step_avg:159.71ms step:1343/1480 train_time:212904ms step_avg:159.72ms step:1344/1480 train_time:213077ms step_avg:159.73ms step:1345/1480 train_time:213255ms step_avg:159.74ms step:1346/1480 train_time:213422ms step_avg:159.75ms step:1347/1480 train_time:213593ms step_avg:159.76ms step:1348/1480 train_time:213762ms step_avg:159.76ms step:1349/1480 train_time:213931ms step_avg:159.77ms step:1350/1480 train_time:214106ms step_avg:159.78ms step:1351/1480 train_time:214278ms step_avg:159.79ms step:1352/1480 train_time:214448ms step_avg:159.80ms step:1353/1480 train_time:214624ms step_avg:159.81ms step:1354/1480 train_time:214795ms step_avg:159.82ms step:1355/1480 train_time:214963ms step_avg:159.82ms step:1356/1480 train_time:215137ms step_avg:159.83ms step:1357/1480 train_time:215311ms step_avg:159.84ms step:1358/1480 train_time:215481ms step_avg:159.85ms step:1359/1480 train_time:215654ms step_avg:159.86ms step:1360/1480 train_time:215828ms step_avg:159.87ms step:1361/1480 train_time:216006ms step_avg:159.89ms step:1362/1480 train_time:216182ms step_avg:159.90ms step:1363/1480 train_time:216364ms step_avg:159.91ms step:1364/1480 train_time:216533ms 
step_avg:159.92ms step:1365/1480 train_time:216699ms step_avg:159.93ms step:1366/1480 train_time:216873ms step_avg:159.94ms step:1367/1480 train_time:217042ms step_avg:159.94ms step:1368/1480 train_time:217216ms step_avg:159.95ms step:1369/1480 train_time:217397ms step_avg:159.97ms step:1370/1480 train_time:217574ms step_avg:159.98ms step:1371/1480 train_time:217745ms step_avg:159.99ms step:1372/1480 train_time:217923ms step_avg:160.00ms step:1373/1480 train_time:218094ms step_avg:160.01ms step:1374/1480 train_time:218269ms step_avg:160.02ms step:1375/1480 train_time:218440ms step_avg:160.03ms step:1375/1480 val_loss:3.2947 train_time:218507ms step_avg:160.08ms step:1376/1480 train_time:218613ms step_avg:160.04ms step:1377/1480 train_time:218784ms step_avg:160.05ms step:1378/1480 train_time:218952ms step_avg:160.05ms step:1379/1480 train_time:219126ms step_avg:160.06ms step:1380/1480 train_time:219299ms step_avg:160.07ms step:1381/1480 train_time:219478ms step_avg:160.09ms step:1382/1480 train_time:219650ms step_avg:160.09ms step:1383/1480 train_time:219823ms step_avg:160.10ms step:1384/1480 train_time:220000ms step_avg:160.12ms step:1385/1480 train_time:220167ms step_avg:160.12ms step:1386/1480 train_time:220337ms step_avg:160.13ms step:1387/1480 train_time:220510ms step_avg:160.14ms step:1388/1480 train_time:220678ms step_avg:160.14ms step:1389/1480 train_time:220852ms step_avg:160.15ms step:1390/1480 train_time:221020ms step_avg:160.16ms step:1391/1480 train_time:221190ms step_avg:160.17ms step:1392/1480 train_time:221361ms step_avg:160.17ms step:1393/1480 train_time:221532ms step_avg:160.18ms step:1394/1480 train_time:221702ms step_avg:160.19ms step:1395/1480 train_time:221871ms step_avg:160.20ms step:1396/1480 train_time:222038ms step_avg:160.20ms step:1397/1480 train_time:222207ms step_avg:160.21ms step:1398/1480 train_time:222374ms step_avg:160.21ms step:1399/1480 train_time:222544ms step_avg:160.22ms step:1400/1480 train_time:222721ms step_avg:160.23ms 
step:1401/1480 train_time:222888ms step_avg:160.24ms step:1402/1480 train_time:223057ms step_avg:160.24ms step:1403/1480 train_time:223235ms step_avg:160.26ms step:1404/1480 train_time:223407ms step_avg:160.26ms step:1405/1480 train_time:223580ms step_avg:160.27ms step:1406/1480 train_time:223756ms step_avg:160.28ms step:1407/1480 train_time:223926ms step_avg:160.29ms step:1408/1480 train_time:224095ms step_avg:160.30ms step:1409/1480 train_time:224277ms step_avg:160.31ms step:1410/1480 train_time:224447ms step_avg:160.32ms step:1411/1480 train_time:224615ms step_avg:160.32ms step:1412/1480 train_time:224785ms step_avg:160.33ms step:1413/1480 train_time:224956ms step_avg:160.34ms step:1414/1480 train_time:225129ms step_avg:160.35ms step:1415/1480 train_time:225301ms step_avg:160.36ms step:1416/1480 train_time:225489ms step_avg:160.38ms step:1417/1480 train_time:225662ms step_avg:160.38ms step:1418/1480 train_time:225833ms step_avg:160.39ms step:1419/1480 train_time:226008ms step_avg:160.40ms step:1420/1480 train_time:226183ms step_avg:160.41ms step:1421/1480 train_time:226356ms step_avg:160.42ms step:1422/1480 train_time:226529ms step_avg:160.43ms step:1423/1480 train_time:226698ms step_avg:160.44ms step:1424/1480 train_time:226874ms step_avg:160.45ms step:1425/1480 train_time:227055ms step_avg:160.46ms step:1426/1480 train_time:227227ms step_avg:160.47ms step:1427/1480 train_time:227402ms step_avg:160.48ms step:1428/1480 train_time:227574ms step_avg:160.49ms step:1429/1480 train_time:227743ms step_avg:160.50ms step:1430/1480 train_time:227916ms step_avg:160.50ms step:1431/1480 train_time:228092ms step_avg:160.52ms step:1432/1480 train_time:228270ms step_avg:160.53ms step:1433/1480 train_time:228448ms step_avg:160.54ms step:1434/1480 train_time:228631ms step_avg:160.56ms step:1435/1480 train_time:228806ms step_avg:160.57ms step:1436/1480 train_time:228979ms step_avg:160.57ms step:1437/1480 train_time:229150ms step_avg:160.58ms step:1438/1480 train_time:229319ms 
step_avg:160.59ms step:1439/1480 train_time:229493ms step_avg:160.60ms step:1440/1480 train_time:229662ms step_avg:160.60ms step:1441/1480 train_time:229833ms step_avg:160.61ms step:1442/1480 train_time:230011ms step_avg:160.62ms step:1443/1480 train_time:230198ms step_avg:160.64ms step:1444/1480 train_time:230369ms step_avg:160.65ms step:1445/1480 train_time:230538ms step_avg:160.65ms step:1446/1480 train_time:230715ms step_avg:160.66ms step:1447/1480 train_time:230892ms step_avg:160.68ms step:1448/1480 train_time:231064ms step_avg:160.68ms step:1449/1480 train_time:231237ms step_avg:160.69ms step:1450/1480 train_time:231411ms step_avg:160.70ms step:1451/1480 train_time:231583ms step_avg:160.71ms step:1452/1480 train_time:231756ms step_avg:160.72ms step:1453/1480 train_time:231926ms step_avg:160.72ms step:1454/1480 train_time:232098ms step_avg:160.73ms step:1455/1480 train_time:232275ms step_avg:160.74ms step:1456/1480 train_time:232449ms step_avg:160.75ms step:1457/1480 train_time:232620ms step_avg:160.76ms step:1458/1480 train_time:232792ms step_avg:160.77ms step:1459/1480 train_time:232968ms step_avg:160.78ms step:1460/1480 train_time:233139ms step_avg:160.79ms step:1461/1480 train_time:233313ms step_avg:160.79ms step:1462/1480 train_time:233482ms step_avg:160.80ms step:1463/1480 train_time:233659ms step_avg:160.81ms step:1464/1480 train_time:233835ms step_avg:160.82ms step:1465/1480 train_time:234008ms step_avg:160.83ms step:1466/1480 train_time:234177ms step_avg:160.84ms step:1467/1480 train_time:234352ms step_avg:160.85ms step:1468/1480 train_time:234522ms step_avg:160.85ms step:1469/1480 train_time:234695ms step_avg:160.86ms step:1470/1480 train_time:234874ms step_avg:160.87ms step:1471/1480 train_time:235059ms step_avg:160.89ms step:1472/1480 train_time:235239ms step_avg:160.90ms step:1473/1480 train_time:235410ms step_avg:160.91ms step:1474/1480 train_time:235587ms step_avg:160.92ms step:1475/1480 train_time:235767ms step_avg:160.93ms step:1476/1480 
train_time:235939ms step_avg:160.94ms step:1477/1480 train_time:236123ms step_avg:160.96ms step:1478/1480 train_time:236307ms step_avg:160.97ms step:1479/1480 train_time:236479ms step_avg:160.98ms step:1480/1480 train_time:236651ms step_avg:160.99ms step:1480/1480 val_loss:3.2759 train_time:236723ms step_avg:161.04ms