import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied to a 4D tensor whose dim 1 is the sequence position
    and whose dim 3 is split into two halves that are rotated against each other.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        # cos/sin tables are cached and only rebuilt when the sequence length changes
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """
        Arguments:
            x: Input of shape (B, T, dim); B must be 1.
            vi: Token value embedding, reshaped to match v and mixed into it.
            block_mask: FlexAttention block mask restricting which keys each query may see.

        Returns a tensor with the same shape as x.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head dimension moved in front of the sequence dimension
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: learned mix of x with x0, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialized to [1, 0], i.e. the block starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """
    GPT-2 style language model with a U-net layout: the first half of the layers act as an
    encoder whose outputs are added, with learned weights, to the inputs of the second half.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Compute the cross-entropy loss of the model on one token sequence.

        Arguments:
            idx: 1D tensor of input token ids; its length must be a multiple of 128
                because it is reshaped into blocks of BLOCK_SIZE.
            target: Target token ids, flattened before the loss is computed.
            sliding_window: Tensor holding the attention window size in tokens.

        Returns the scalar loss.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # token id 50256 is treated as the document separator: docs[i] counts the separators
        # seen up to and including position i, which serves as a per-token document id
        docs = (idx == 50256).cumsum(0)
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal, same document, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level version of the mask above: a (query block, kv block) pair is kept
            # when it could contain at least one allowed token pair
            # built on idx's device so it always matches docs_low / docs_high, which it indexes
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device=idx.device)
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # the document-id ranges of the two blocks overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window size rounded up to a whole number of blocks
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort moves the indices of kept blocks to the front, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value embedding per encoder layer; the decoder reuses them in reverse order
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the header of `file` into a pinned tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header of 256 int32 values
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over token shards matching a glob pattern (relative to the current working
    directory). Each call to next_batch yields this process's T+1 token slice, and all
    processes advance together by T * num_processes tokens.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) for this process; targets are the inputs shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # make sure the directory holding the log file exists; open(..., "w") does not create it
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append `s` to the log file, on the master process only.

    Unless `logonly` is true the string is also printed to stdout. Other processes do nothing.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop runs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 11:30:40 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 36C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 130W / 700W | 47MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 84W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 38C P0 73W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 109W / 700W | 45MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 45C P0 79W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 38C P0 92W / 700W | 26MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID 
Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:23079ms step_avg:nanms step:2/1480 train_time:23169ms step_avg:nanms step:3/1480 train_time:23307ms step_avg:nanms step:4/1480 train_time:23450ms step_avg:nanms step:5/1480 train_time:23590ms step_avg:nanms step:6/1480 train_time:23732ms step_avg:nanms step:7/1480 train_time:23874ms step_avg:nanms step:8/1480 train_time:24018ms step_avg:nanms step:9/1480 train_time:24163ms step_avg:nanms step:10/1480 train_time:24305ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:284ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.80ms step:14/1480 train_time:567ms step_avg:141.86ms step:15/1480 train_time:711ms step_avg:142.13ms step:16/1480 train_time:855ms step_avg:142.46ms step:17/1480 train_time:998ms step_avg:142.54ms step:18/1480 train_time:1141ms step_avg:142.64ms step:19/1480 train_time:1282ms step_avg:142.50ms step:20/1480 train_time:1426ms step_avg:142.60ms step:21/1480 train_time:1569ms step_avg:142.62ms step:22/1480 train_time:1713ms step_avg:142.71ms step:23/1480 train_time:1856ms step_avg:142.80ms step:24/1480 train_time:1999ms step_avg:142.76ms step:25/1480 train_time:2141ms step_avg:142.70ms step:26/1480 train_time:2284ms step_avg:142.75ms step:27/1480 train_time:2429ms step_avg:142.86ms step:28/1480 train_time:2572ms step_avg:142.89ms step:29/1480 train_time:2716ms 
step_avg:142.93ms step:30/1480 train_time:2860ms step_avg:142.99ms step:31/1480 train_time:3004ms step_avg:143.04ms step:32/1480 train_time:3147ms step_avg:143.04ms step:33/1480 train_time:3289ms step_avg:143.01ms step:34/1480 train_time:3433ms step_avg:143.03ms step:35/1480 train_time:3575ms step_avg:143.01ms step:36/1480 train_time:3718ms step_avg:142.98ms step:37/1480 train_time:3861ms step_avg:142.99ms step:38/1480 train_time:4005ms step_avg:143.03ms step:39/1480 train_time:4149ms step_avg:143.08ms step:40/1480 train_time:4293ms step_avg:143.10ms step:41/1480 train_time:4435ms step_avg:143.08ms step:42/1480 train_time:4578ms step_avg:143.05ms step:43/1480 train_time:4722ms step_avg:143.09ms step:44/1480 train_time:4866ms step_avg:143.12ms step:45/1480 train_time:5009ms step_avg:143.11ms step:46/1480 train_time:5152ms step_avg:143.11ms step:47/1480 train_time:5293ms step_avg:143.06ms step:48/1480 train_time:5436ms step_avg:143.06ms step:49/1480 train_time:5578ms step_avg:143.03ms step:50/1480 train_time:5719ms step_avg:142.98ms step:51/1480 train_time:5864ms step_avg:143.03ms step:52/1480 train_time:6008ms step_avg:143.05ms step:53/1480 train_time:6153ms step_avg:143.10ms step:54/1480 train_time:6296ms step_avg:143.09ms step:55/1480 train_time:6437ms step_avg:143.05ms step:56/1480 train_time:6578ms step_avg:143.00ms step:57/1480 train_time:6722ms step_avg:143.03ms step:58/1480 train_time:6867ms step_avg:143.06ms step:59/1480 train_time:7011ms step_avg:143.07ms step:60/1480 train_time:7155ms step_avg:143.09ms step:61/1480 train_time:7297ms step_avg:143.07ms step:62/1480 train_time:7438ms step_avg:143.04ms step:63/1480 train_time:7579ms step_avg:143.00ms step:64/1480 train_time:7720ms step_avg:142.96ms step:65/1480 train_time:7862ms step_avg:142.95ms step:66/1480 train_time:8007ms step_avg:142.97ms step:67/1480 train_time:8151ms step_avg:142.99ms step:68/1480 train_time:8293ms step_avg:142.98ms step:69/1480 train_time:8436ms step_avg:142.98ms step:70/1480 
train_time:8577ms step_avg:142.95ms step:71/1480 train_time:8717ms step_avg:142.91ms step:72/1480 train_time:8861ms step_avg:142.91ms step:73/1480 train_time:9005ms step_avg:142.94ms step:74/1480 train_time:9147ms step_avg:142.93ms step:75/1480 train_time:9291ms step_avg:142.93ms step:76/1480 train_time:9433ms step_avg:142.93ms step:77/1480 train_time:9575ms step_avg:142.91ms step:78/1480 train_time:9717ms step_avg:142.90ms step:79/1480 train_time:9859ms step_avg:142.89ms step:80/1480 train_time:10002ms step_avg:142.89ms step:81/1480 train_time:10147ms step_avg:142.92ms step:82/1480 train_time:10289ms step_avg:142.91ms step:83/1480 train_time:10433ms step_avg:142.91ms step:84/1480 train_time:10575ms step_avg:142.91ms step:85/1480 train_time:10718ms step_avg:142.90ms step:86/1480 train_time:10859ms step_avg:142.89ms step:87/1480 train_time:11001ms step_avg:142.88ms step:88/1480 train_time:11144ms step_avg:142.87ms step:89/1480 train_time:11287ms step_avg:142.87ms step:90/1480 train_time:11431ms step_avg:142.88ms step:91/1480 train_time:11573ms step_avg:142.87ms step:92/1480 train_time:11715ms step_avg:142.86ms step:93/1480 train_time:11857ms step_avg:142.85ms step:94/1480 train_time:11998ms step_avg:142.84ms step:95/1480 train_time:12140ms step_avg:142.83ms step:96/1480 train_time:12284ms step_avg:142.83ms step:97/1480 train_time:12429ms step_avg:142.86ms step:98/1480 train_time:12573ms step_avg:142.87ms step:99/1480 train_time:12716ms step_avg:142.87ms step:100/1480 train_time:12859ms step_avg:142.88ms step:101/1480 train_time:13000ms step_avg:142.85ms step:102/1480 train_time:13143ms step_avg:142.86ms step:103/1480 train_time:13286ms step_avg:142.86ms step:104/1480 train_time:13429ms step_avg:142.87ms step:105/1480 train_time:13573ms step_avg:142.88ms step:106/1480 train_time:13717ms step_avg:142.89ms step:107/1480 train_time:13859ms step_avg:142.87ms step:108/1480 train_time:14000ms step_avg:142.86ms step:109/1480 train_time:14143ms step_avg:142.85ms 
step:110/1480 train_time:14284ms step_avg:142.84ms step:111/1480 train_time:14430ms step_avg:142.88ms step:112/1480 train_time:14577ms step_avg:142.91ms step:113/1480 train_time:14724ms step_avg:142.95ms step:114/1480 train_time:14871ms step_avg:142.99ms step:115/1480 train_time:15018ms step_avg:143.02ms step:116/1480 train_time:15163ms step_avg:143.05ms step:117/1480 train_time:15311ms step_avg:143.09ms step:118/1480 train_time:15457ms step_avg:143.12ms step:119/1480 train_time:15603ms step_avg:143.15ms step:120/1480 train_time:15752ms step_avg:143.20ms step:121/1480 train_time:15899ms step_avg:143.23ms step:122/1480 train_time:16046ms step_avg:143.27ms step:123/1480 train_time:16194ms step_avg:143.31ms step:124/1480 train_time:16340ms step_avg:143.34ms step:125/1480 train_time:16487ms step_avg:143.37ms step:125/1480 val_loss:4.4018 train_time:16545ms step_avg:143.87ms step:126/1480 train_time:16643ms step_avg:143.47ms step:127/1480 train_time:16791ms step_avg:143.51ms step:128/1480 train_time:16935ms step_avg:143.52ms step:129/1480 train_time:17082ms step_avg:143.54ms step:130/1480 train_time:17228ms step_avg:143.57ms step:131/1480 train_time:17374ms step_avg:143.59ms step:132/1480 train_time:17519ms step_avg:143.60ms step:133/1480 train_time:17669ms step_avg:143.65ms step:134/1480 train_time:17817ms step_avg:143.68ms step:135/1480 train_time:17962ms step_avg:143.70ms step:136/1480 train_time:18110ms step_avg:143.73ms step:137/1480 train_time:18255ms step_avg:143.74ms step:138/1480 train_time:18402ms step_avg:143.76ms step:139/1480 train_time:18549ms step_avg:143.79ms step:140/1480 train_time:18697ms step_avg:143.82ms step:141/1480 train_time:18845ms step_avg:143.85ms step:142/1480 train_time:18991ms step_avg:143.87ms step:143/1480 train_time:19136ms step_avg:143.88ms step:144/1480 train_time:19283ms step_avg:143.90ms step:145/1480 train_time:19430ms step_avg:143.92ms step:146/1480 train_time:19576ms step_avg:143.94ms step:147/1480 train_time:19724ms 
step_avg:143.97ms step:148/1480 train_time:19872ms step_avg:144.00ms step:149/1480 train_time:20018ms step_avg:144.01ms step:150/1480 train_time:20165ms step_avg:144.04ms step:151/1480 train_time:20312ms step_avg:144.06ms step:152/1480 train_time:20457ms step_avg:144.06ms step:153/1480 train_time:20604ms step_avg:144.08ms step:154/1480 train_time:20751ms step_avg:144.11ms step:155/1480 train_time:20898ms step_avg:144.12ms step:156/1480 train_time:21045ms step_avg:144.15ms step:157/1480 train_time:21192ms step_avg:144.16ms step:158/1480 train_time:21337ms step_avg:144.17ms step:159/1480 train_time:21483ms step_avg:144.18ms step:160/1480 train_time:21631ms step_avg:144.20ms step:161/1480 train_time:21778ms step_avg:144.22ms step:162/1480 train_time:21924ms step_avg:144.24ms step:163/1480 train_time:22073ms step_avg:144.27ms step:164/1480 train_time:22219ms step_avg:144.28ms step:165/1480 train_time:22366ms step_avg:144.30ms step:166/1480 train_time:22512ms step_avg:144.31ms step:167/1480 train_time:22658ms step_avg:144.32ms step:168/1480 train_time:22806ms step_avg:144.34ms step:169/1480 train_time:22952ms step_avg:144.35ms step:170/1480 train_time:23099ms step_avg:144.37ms step:171/1480 train_time:23247ms step_avg:144.39ms step:172/1480 train_time:23394ms step_avg:144.41ms step:173/1480 train_time:23539ms step_avg:144.41ms step:174/1480 train_time:23687ms step_avg:144.44ms step:175/1480 train_time:23834ms step_avg:144.45ms step:176/1480 train_time:23980ms step_avg:144.46ms step:177/1480 train_time:24126ms step_avg:144.47ms step:178/1480 train_time:24274ms step_avg:144.49ms step:179/1480 train_time:24420ms step_avg:144.50ms step:180/1480 train_time:24568ms step_avg:144.52ms step:181/1480 train_time:24714ms step_avg:144.52ms step:182/1480 train_time:24859ms step_avg:144.53ms step:183/1480 train_time:25007ms step_avg:144.55ms step:184/1480 train_time:25153ms step_avg:144.56ms step:185/1480 train_time:25299ms step_avg:144.57ms step:186/1480 train_time:25446ms 
step_avg:144.58ms step:187/1480 train_time:25594ms step_avg:144.60ms step:188/1480 train_time:25739ms step_avg:144.60ms step:189/1480 train_time:25886ms step_avg:144.62ms step:190/1480 train_time:26033ms step_avg:144.63ms step:191/1480 train_time:26178ms step_avg:144.63ms step:192/1480 train_time:26326ms step_avg:144.65ms step:193/1480 train_time:26474ms step_avg:144.67ms step:194/1480 train_time:26621ms step_avg:144.68ms step:195/1480 train_time:26769ms step_avg:144.70ms step:196/1480 train_time:26915ms step_avg:144.70ms step:197/1480 train_time:27061ms step_avg:144.71ms step:198/1480 train_time:27208ms step_avg:144.72ms step:199/1480 train_time:27354ms step_avg:144.73ms step:200/1480 train_time:27501ms step_avg:144.74ms step:201/1480 train_time:27647ms step_avg:144.75ms step:202/1480 train_time:27795ms step_avg:144.77ms step:203/1480 train_time:27941ms step_avg:144.77ms step:204/1480 train_time:28089ms step_avg:144.79ms step:205/1480 train_time:28234ms step_avg:144.79ms step:206/1480 train_time:28381ms step_avg:144.80ms step:207/1480 train_time:28528ms step_avg:144.81ms step:208/1480 train_time:28674ms step_avg:144.82ms step:209/1480 train_time:28820ms step_avg:144.83ms step:210/1480 train_time:28969ms step_avg:144.84ms step:211/1480 train_time:29116ms step_avg:144.85ms step:212/1480 train_time:29261ms step_avg:144.86ms step:213/1480 train_time:29409ms step_avg:144.87ms step:214/1480 train_time:29554ms step_avg:144.87ms step:215/1480 train_time:29701ms step_avg:144.88ms step:216/1480 train_time:29849ms step_avg:144.90ms step:217/1480 train_time:29995ms step_avg:144.91ms step:218/1480 train_time:30143ms step_avg:144.92ms step:219/1480 train_time:30290ms step_avg:144.93ms step:220/1480 train_time:30436ms step_avg:144.93ms step:221/1480 train_time:30585ms step_avg:144.95ms step:222/1480 train_time:30735ms step_avg:144.98ms step:223/1480 train_time:30885ms step_avg:145.00ms step:224/1480 train_time:31035ms step_avg:145.02ms step:225/1480 train_time:31185ms 
step_avg:145.04ms step:226/1480 train_time:31334ms step_avg:145.07ms step:227/1480 train_time:31485ms step_avg:145.09ms step:228/1480 train_time:31635ms step_avg:145.11ms step:229/1480 train_time:31785ms step_avg:145.14ms step:230/1480 train_time:31936ms step_avg:145.17ms step:231/1480 train_time:32088ms step_avg:145.20ms step:232/1480 train_time:32238ms step_avg:145.22ms step:233/1480 train_time:32390ms step_avg:145.24ms step:234/1480 train_time:32539ms step_avg:145.26ms step:235/1480 train_time:32689ms step_avg:145.29ms step:236/1480 train_time:32839ms step_avg:145.30ms step:237/1480 train_time:32990ms step_avg:145.33ms step:238/1480 train_time:33140ms step_avg:145.35ms step:239/1480 train_time:33291ms step_avg:145.38ms step:240/1480 train_time:33443ms step_avg:145.40ms step:241/1480 train_time:33594ms step_avg:145.43ms step:242/1480 train_time:33742ms step_avg:145.44ms step:243/1480 train_time:33892ms step_avg:145.46ms step:244/1480 train_time:34042ms step_avg:145.48ms step:245/1480 train_time:34193ms step_avg:145.50ms step:246/1480 train_time:34341ms step_avg:145.51ms step:247/1480 train_time:34492ms step_avg:145.54ms step:248/1480 train_time:34643ms step_avg:145.56ms step:249/1480 train_time:34793ms step_avg:145.58ms step:250/1480 train_time:34943ms step_avg:145.60ms step:250/1480 val_loss:3.9932 train_time:35002ms step_avg:145.84ms step:251/1480 train_time:35100ms step_avg:145.64ms step:252/1480 train_time:35252ms step_avg:145.67ms step:253/1480 train_time:35403ms step_avg:145.69ms step:254/1480 train_time:35552ms step_avg:145.70ms step:255/1480 train_time:35700ms step_avg:145.71ms step:256/1480 train_time:35850ms step_avg:145.73ms step:257/1480 train_time:36000ms step_avg:145.75ms step:258/1480 train_time:36151ms step_avg:145.77ms step:259/1480 train_time:36303ms step_avg:145.80ms step:260/1480 train_time:36453ms step_avg:145.81ms step:261/1480 train_time:36605ms step_avg:145.84ms step:262/1480 train_time:36754ms step_avg:145.85ms step:263/1480 
train_time:36905ms step_avg:145.87ms step:264/1480 train_time:37055ms step_avg:145.89ms step:265/1480 train_time:37207ms step_avg:145.91ms step:266/1480 train_time:37357ms step_avg:145.93ms step:267/1480 train_time:37509ms step_avg:145.95ms step:268/1480 train_time:37658ms step_avg:145.96ms step:269/1480 train_time:37808ms step_avg:145.98ms step:270/1480 train_time:37957ms step_avg:145.99ms step:271/1480 train_time:38108ms step_avg:146.01ms step:272/1480 train_time:38257ms step_avg:146.02ms step:273/1480 train_time:38408ms step_avg:146.04ms step:274/1480 train_time:38557ms step_avg:146.05ms step:275/1480 train_time:38708ms step_avg:146.07ms step:276/1480 train_time:38858ms step_avg:146.08ms step:277/1480 train_time:39008ms step_avg:146.10ms step:278/1480 train_time:39157ms step_avg:146.11ms step:279/1480 train_time:39309ms step_avg:146.13ms step:280/1480 train_time:39459ms step_avg:146.14ms step:281/1480 train_time:39610ms step_avg:146.16ms step:282/1480 train_time:39760ms step_avg:146.18ms step:283/1480 train_time:39911ms step_avg:146.19ms step:284/1480 train_time:40060ms step_avg:146.20ms step:285/1480 train_time:40211ms step_avg:146.22ms step:286/1480 train_time:40361ms step_avg:146.23ms step:287/1480 train_time:40512ms step_avg:146.25ms step:288/1480 train_time:40664ms step_avg:146.27ms step:289/1480 train_time:40813ms step_avg:146.28ms step:290/1480 train_time:40963ms step_avg:146.30ms step:291/1480 train_time:41114ms step_avg:146.31ms step:292/1480 train_time:41265ms step_avg:146.33ms step:293/1480 train_time:41415ms step_avg:146.34ms step:294/1480 train_time:41566ms step_avg:146.36ms step:295/1480 train_time:41716ms step_avg:146.37ms step:296/1480 train_time:41867ms step_avg:146.39ms step:297/1480 train_time:42017ms step_avg:146.40ms step:298/1480 train_time:42168ms step_avg:146.42ms step:299/1480 train_time:42318ms step_avg:146.43ms step:300/1480 train_time:42470ms step_avg:146.45ms step:301/1480 train_time:42619ms step_avg:146.46ms step:302/1480 
train_time:42770ms step_avg:146.47ms step:303/1480 train_time:42921ms step_avg:146.49ms step:304/1480 train_time:43072ms step_avg:146.50ms step:305/1480 train_time:43221ms step_avg:146.51ms step:306/1480 train_time:43372ms step_avg:146.53ms step:307/1480 train_time:43522ms step_avg:146.54ms step:308/1480 train_time:43672ms step_avg:146.55ms step:309/1480 train_time:43823ms step_avg:146.57ms step:310/1480 train_time:43973ms step_avg:146.58ms step:311/1480 train_time:44124ms step_avg:146.59ms step:312/1480 train_time:44274ms step_avg:146.60ms step:313/1480 train_time:44424ms step_avg:146.61ms step:314/1480 train_time:44574ms step_avg:146.62ms step:315/1480 train_time:44725ms step_avg:146.64ms step:316/1480 train_time:44874ms step_avg:146.65ms step:317/1480 train_time:45025ms step_avg:146.66ms step:318/1480 train_time:45174ms step_avg:146.67ms step:319/1480 train_time:45326ms step_avg:146.69ms step:320/1480 train_time:45475ms step_avg:146.69ms step:321/1480 train_time:45626ms step_avg:146.71ms step:322/1480 train_time:45776ms step_avg:146.72ms step:323/1480 train_time:45927ms step_avg:146.73ms step:324/1480 train_time:46078ms step_avg:146.74ms step:325/1480 train_time:46228ms step_avg:146.76ms step:326/1480 train_time:46377ms step_avg:146.76ms step:327/1480 train_time:46529ms step_avg:146.78ms step:328/1480 train_time:46678ms step_avg:146.79ms step:329/1480 train_time:46828ms step_avg:146.80ms step:330/1480 train_time:46980ms step_avg:146.81ms step:331/1480 train_time:47134ms step_avg:146.83ms step:332/1480 train_time:47287ms step_avg:146.85ms step:333/1480 train_time:47441ms step_avg:146.88ms step:334/1480 train_time:47595ms step_avg:146.90ms step:335/1480 train_time:47748ms step_avg:146.92ms step:336/1480 train_time:47902ms step_avg:146.94ms step:337/1480 train_time:48055ms step_avg:146.96ms step:338/1480 train_time:48208ms step_avg:146.98ms step:339/1480 train_time:48362ms step_avg:147.00ms step:340/1480 train_time:48514ms step_avg:147.01ms step:341/1480 
train_time:48667ms step_avg:147.03ms step:342/1480 train_time:48822ms step_avg:147.05ms step:343/1480 train_time:48976ms step_avg:147.08ms step:344/1480 train_time:49131ms step_avg:147.10ms step:345/1480 train_time:49286ms step_avg:147.12ms step:346/1480 train_time:49440ms step_avg:147.14ms step:347/1480 train_time:49594ms step_avg:147.16ms step:348/1480 train_time:49747ms step_avg:147.18ms step:349/1480 train_time:49901ms step_avg:147.20ms step:350/1480 train_time:50055ms step_avg:147.22ms step:351/1480 train_time:50208ms step_avg:147.24ms step:352/1480 train_time:50362ms step_avg:147.26ms step:353/1480 train_time:50515ms step_avg:147.27ms step:354/1480 train_time:50668ms step_avg:147.29ms step:355/1480 train_time:50823ms step_avg:147.31ms step:356/1480 train_time:50977ms step_avg:147.33ms step:357/1480 train_time:51132ms step_avg:147.35ms step:358/1480 train_time:51285ms step_avg:147.37ms step:359/1480 train_time:51439ms step_avg:147.39ms step:360/1480 train_time:51594ms step_avg:147.41ms step:361/1480 train_time:51748ms step_avg:147.43ms step:362/1480 train_time:51902ms step_avg:147.45ms step:363/1480 train_time:52055ms step_avg:147.46ms step:364/1480 train_time:52209ms step_avg:147.48ms step:365/1480 train_time:52365ms step_avg:147.51ms step:366/1480 train_time:52519ms step_avg:147.52ms step:367/1480 train_time:52671ms step_avg:147.54ms step:368/1480 train_time:52826ms step_avg:147.56ms step:369/1480 train_time:52979ms step_avg:147.57ms step:370/1480 train_time:53132ms step_avg:147.59ms step:371/1480 train_time:53286ms step_avg:147.61ms step:372/1480 train_time:53440ms step_avg:147.63ms step:373/1480 train_time:53594ms step_avg:147.64ms step:374/1480 train_time:53747ms step_avg:147.66ms step:375/1480 train_time:53901ms step_avg:147.67ms step:375/1480 val_loss:3.8040 train_time:53962ms step_avg:147.84ms step:376/1480 train_time:54060ms step_avg:147.70ms step:377/1480 train_time:54214ms step_avg:147.72ms step:378/1480 train_time:54367ms step_avg:147.74ms 
step:379/1480 train_time:54519ms step_avg:147.75ms step:380/1480 train_time:54671ms step_avg:147.76ms step:381/1480 train_time:54824ms step_avg:147.77ms step:382/1480 train_time:54979ms step_avg:147.79ms step:383/1480 train_time:55134ms step_avg:147.81ms step:384/1480 train_time:55288ms step_avg:147.83ms step:385/1480 train_time:55442ms step_avg:147.85ms step:386/1480 train_time:55594ms step_avg:147.86ms step:387/1480 train_time:55748ms step_avg:147.87ms step:388/1480 train_time:55900ms step_avg:147.88ms step:389/1480 train_time:56053ms step_avg:147.90ms step:390/1480 train_time:56208ms step_avg:147.92ms step:391/1480 train_time:56363ms step_avg:147.93ms step:392/1480 train_time:56516ms step_avg:147.95ms step:393/1480 train_time:56669ms step_avg:147.96ms step:394/1480 train_time:56823ms step_avg:147.98ms step:395/1480 train_time:56976ms step_avg:147.99ms step:396/1480 train_time:57129ms step_avg:148.00ms step:397/1480 train_time:57283ms step_avg:148.02ms step:398/1480 train_time:57438ms step_avg:148.04ms step:399/1480 train_time:57591ms step_avg:148.05ms step:400/1480 train_time:57745ms step_avg:148.06ms step:401/1480 train_time:57899ms step_avg:148.08ms step:402/1480 train_time:58053ms step_avg:148.09ms step:403/1480 train_time:58207ms step_avg:148.11ms step:404/1480 train_time:58361ms step_avg:148.13ms step:405/1480 train_time:58515ms step_avg:148.14ms step:406/1480 train_time:58668ms step_avg:148.15ms step:407/1480 train_time:58821ms step_avg:148.16ms step:408/1480 train_time:58975ms step_avg:148.18ms step:409/1480 train_time:59129ms step_avg:148.19ms step:410/1480 train_time:59281ms step_avg:148.20ms step:411/1480 train_time:59436ms step_avg:148.22ms step:412/1480 train_time:59589ms step_avg:148.23ms step:413/1480 train_time:59743ms step_avg:148.25ms step:414/1480 train_time:59898ms step_avg:148.26ms step:415/1480 train_time:60052ms step_avg:148.28ms step:416/1480 train_time:60206ms step_avg:148.29ms step:417/1480 train_time:60360ms step_avg:148.30ms 
step:418/1480 train_time:60515ms step_avg:148.32ms step:419/1480 train_time:60667ms step_avg:148.33ms step:420/1480 train_time:60821ms step_avg:148.34ms step:421/1480 train_time:60975ms step_avg:148.36ms step:422/1480 train_time:61128ms step_avg:148.37ms step:423/1480 train_time:61280ms step_avg:148.38ms step:424/1480 train_time:61434ms step_avg:148.39ms step:425/1480 train_time:61587ms step_avg:148.40ms step:426/1480 train_time:61741ms step_avg:148.42ms step:427/1480 train_time:61895ms step_avg:148.43ms step:428/1480 train_time:62048ms step_avg:148.44ms step:429/1480 train_time:62202ms step_avg:148.45ms step:430/1480 train_time:62355ms step_avg:148.46ms step:431/1480 train_time:62508ms step_avg:148.48ms step:432/1480 train_time:62662ms step_avg:148.49ms step:433/1480 train_time:62816ms step_avg:148.50ms step:434/1480 train_time:62969ms step_avg:148.51ms step:435/1480 train_time:63123ms step_avg:148.52ms step:436/1480 train_time:63277ms step_avg:148.54ms step:437/1480 train_time:63431ms step_avg:148.55ms step:438/1480 train_time:63584ms step_avg:148.56ms step:439/1480 train_time:63739ms step_avg:148.58ms step:440/1480 train_time:63896ms step_avg:148.59ms step:441/1480 train_time:64052ms step_avg:148.61ms step:442/1480 train_time:64208ms step_avg:148.63ms step:443/1480 train_time:64364ms step_avg:148.65ms step:444/1480 train_time:64521ms step_avg:148.67ms step:445/1480 train_time:64676ms step_avg:148.68ms step:446/1480 train_time:64831ms step_avg:148.69ms step:447/1480 train_time:64986ms step_avg:148.71ms step:448/1480 train_time:65143ms step_avg:148.73ms step:449/1480 train_time:65302ms step_avg:148.75ms step:450/1480 train_time:65460ms step_avg:148.77ms step:451/1480 train_time:65617ms step_avg:148.79ms step:452/1480 train_time:65773ms step_avg:148.81ms step:453/1480 train_time:65929ms step_avg:148.82ms step:454/1480 train_time:66085ms step_avg:148.84ms step:455/1480 train_time:66243ms step_avg:148.86ms step:456/1480 train_time:66401ms step_avg:148.88ms 
step:457/1480 train_time:66555ms step_avg:148.89ms step:458/1480 train_time:66712ms step_avg:148.91ms step:459/1480 train_time:66869ms step_avg:148.93ms step:460/1480 train_time:67026ms step_avg:148.95ms step:461/1480 train_time:67184ms step_avg:148.97ms step:462/1480 train_time:67342ms step_avg:148.99ms step:463/1480 train_time:67500ms step_avg:149.01ms step:464/1480 train_time:67656ms step_avg:149.02ms step:465/1480 train_time:67812ms step_avg:149.04ms step:466/1480 train_time:67968ms step_avg:149.05ms step:467/1480 train_time:68125ms step_avg:149.07ms step:468/1480 train_time:68283ms step_avg:149.09ms step:469/1480 train_time:68439ms step_avg:149.10ms step:470/1480 train_time:68597ms step_avg:149.12ms step:471/1480 train_time:68752ms step_avg:149.14ms step:472/1480 train_time:68910ms step_avg:149.16ms step:473/1480 train_time:69066ms step_avg:149.17ms step:474/1480 train_time:69224ms step_avg:149.19ms step:475/1480 train_time:69381ms step_avg:149.21ms step:476/1480 train_time:69537ms step_avg:149.22ms step:477/1480 train_time:69693ms step_avg:149.24ms step:478/1480 train_time:69849ms step_avg:149.25ms step:479/1480 train_time:70006ms step_avg:149.27ms step:480/1480 train_time:70164ms step_avg:149.29ms step:481/1480 train_time:70322ms step_avg:149.30ms step:482/1480 train_time:70479ms step_avg:149.32ms step:483/1480 train_time:70633ms step_avg:149.33ms step:484/1480 train_time:70791ms step_avg:149.35ms step:485/1480 train_time:70948ms step_avg:149.36ms step:486/1480 train_time:71106ms step_avg:149.38ms step:487/1480 train_time:71264ms step_avg:149.40ms step:488/1480 train_time:71423ms step_avg:149.42ms step:489/1480 train_time:71580ms step_avg:149.44ms step:490/1480 train_time:71736ms step_avg:149.45ms step:491/1480 train_time:71892ms step_avg:149.46ms step:492/1480 train_time:72049ms step_avg:149.48ms step:493/1480 train_time:72205ms step_avg:149.49ms step:494/1480 train_time:72364ms step_avg:149.51ms step:495/1480 train_time:72522ms step_avg:149.53ms 
step:496/1480 train_time:72680ms step_avg:149.55ms step:497/1480 train_time:72836ms step_avg:149.56ms step:498/1480 train_time:72993ms step_avg:149.58ms step:499/1480 train_time:73150ms step_avg:149.59ms step:500/1480 train_time:73307ms step_avg:149.61ms step:500/1480 val_loss:3.6872 train_time:73369ms step_avg:149.73ms step:501/1480 train_time:73468ms step_avg:149.63ms step:502/1480 train_time:73628ms step_avg:149.65ms step:503/1480 train_time:73784ms step_avg:149.66ms step:504/1480 train_time:73939ms step_avg:149.68ms step:505/1480 train_time:74094ms step_avg:149.68ms step:506/1480 train_time:74250ms step_avg:149.70ms step:507/1480 train_time:74408ms step_avg:149.71ms step:508/1480 train_time:74568ms step_avg:149.73ms step:509/1480 train_time:74725ms step_avg:149.75ms step:510/1480 train_time:74882ms step_avg:149.76ms step:511/1480 train_time:75039ms step_avg:149.78ms step:512/1480 train_time:75196ms step_avg:149.79ms step:513/1480 train_time:75352ms step_avg:149.80ms step:514/1480 train_time:75509ms step_avg:149.82ms step:515/1480 train_time:75667ms step_avg:149.84ms step:516/1480 train_time:75826ms step_avg:149.85ms step:517/1480 train_time:75984ms step_avg:149.87ms step:518/1480 train_time:76142ms step_avg:149.89ms step:519/1480 train_time:76298ms step_avg:149.90ms step:520/1480 train_time:76454ms step_avg:149.91ms step:521/1480 train_time:76612ms step_avg:149.93ms step:522/1480 train_time:76770ms step_avg:149.94ms step:523/1480 train_time:76928ms step_avg:149.96ms step:524/1480 train_time:77085ms step_avg:149.97ms step:525/1480 train_time:77243ms step_avg:149.99ms step:526/1480 train_time:77400ms step_avg:150.00ms step:527/1480 train_time:77555ms step_avg:150.01ms step:528/1480 train_time:77712ms step_avg:150.02ms step:529/1480 train_time:77870ms step_avg:150.04ms step:530/1480 train_time:78027ms step_avg:150.05ms step:531/1480 train_time:78186ms step_avg:150.07ms step:532/1480 train_time:78343ms step_avg:150.08ms step:533/1480 train_time:78499ms 
step_avg:150.09ms step:534/1480 train_time:78655ms step_avg:150.11ms step:535/1480 train_time:78812ms step_avg:150.12ms step:536/1480 train_time:78971ms step_avg:150.14ms step:537/1480 train_time:79128ms step_avg:150.15ms step:538/1480 train_time:79286ms step_avg:150.16ms step:539/1480 train_time:79444ms step_avg:150.18ms step:540/1480 train_time:79602ms step_avg:150.19ms step:541/1480 train_time:79758ms step_avg:150.20ms step:542/1480 train_time:79915ms step_avg:150.22ms step:543/1480 train_time:80072ms step_avg:150.23ms step:544/1480 train_time:80229ms step_avg:150.24ms step:545/1480 train_time:80386ms step_avg:150.25ms step:546/1480 train_time:80542ms step_avg:150.27ms step:547/1480 train_time:80699ms step_avg:150.28ms step:548/1480 train_time:80855ms step_avg:150.29ms step:549/1480 train_time:81013ms step_avg:150.30ms step:550/1480 train_time:81171ms step_avg:150.32ms step:551/1480 train_time:81329ms step_avg:150.33ms step:552/1480 train_time:81489ms step_avg:150.35ms step:553/1480 train_time:81650ms step_avg:150.37ms step:554/1480 train_time:81811ms step_avg:150.39ms step:555/1480 train_time:81972ms step_avg:150.41ms step:556/1480 train_time:82131ms step_avg:150.42ms step:557/1480 train_time:82291ms step_avg:150.44ms step:558/1480 train_time:82450ms step_avg:150.46ms step:559/1480 train_time:82608ms step_avg:150.47ms step:560/1480 train_time:82769ms step_avg:150.49ms step:561/1480 train_time:82930ms step_avg:150.51ms step:562/1480 train_time:83091ms step_avg:150.53ms step:563/1480 train_time:83250ms step_avg:150.54ms step:564/1480 train_time:83410ms step_avg:150.56ms step:565/1480 train_time:83570ms step_avg:150.58ms step:566/1480 train_time:83730ms step_avg:150.59ms step:567/1480 train_time:83889ms step_avg:150.61ms step:568/1480 train_time:84047ms step_avg:150.62ms step:569/1480 train_time:84208ms step_avg:150.64ms step:570/1480 train_time:84367ms step_avg:150.66ms step:571/1480 train_time:84527ms step_avg:150.67ms step:572/1480 train_time:84688ms 
step_avg:150.69ms step:573/1480 train_time:84847ms step_avg:150.71ms step:574/1480 train_time:85010ms step_avg:150.73ms step:575/1480 train_time:85171ms step_avg:150.74ms step:576/1480 train_time:85330ms step_avg:150.76ms step:577/1480 train_time:85489ms step_avg:150.77ms step:578/1480 train_time:85648ms step_avg:150.79ms step:579/1480 train_time:85808ms step_avg:150.81ms step:580/1480 train_time:85968ms step_avg:150.82ms step:581/1480 train_time:86130ms step_avg:150.84ms step:582/1480 train_time:86290ms step_avg:150.86ms step:583/1480 train_time:86449ms step_avg:150.87ms step:584/1480 train_time:86610ms step_avg:150.89ms step:585/1480 train_time:86770ms step_avg:150.90ms step:586/1480 train_time:86930ms step_avg:150.92ms step:587/1480 train_time:87089ms step_avg:150.93ms step:588/1480 train_time:87248ms step_avg:150.95ms step:589/1480 train_time:87410ms step_avg:150.97ms step:590/1480 train_time:87571ms step_avg:150.99ms step:591/1480 train_time:87729ms step_avg:151.00ms step:592/1480 train_time:87890ms step_avg:151.01ms step:593/1480 train_time:88050ms step_avg:151.03ms step:594/1480 train_time:88211ms step_avg:151.05ms step:595/1480 train_time:88371ms step_avg:151.06ms step:596/1480 train_time:88532ms step_avg:151.08ms step:597/1480 train_time:88691ms step_avg:151.09ms step:598/1480 train_time:88848ms step_avg:151.10ms step:599/1480 train_time:89008ms step_avg:151.12ms step:600/1480 train_time:89168ms step_avg:151.13ms step:601/1480 train_time:89328ms step_avg:151.15ms step:602/1480 train_time:89487ms step_avg:151.16ms step:603/1480 train_time:89647ms step_avg:151.18ms step:604/1480 train_time:89808ms step_avg:151.19ms step:605/1480 train_time:89968ms step_avg:151.21ms step:606/1480 train_time:90130ms step_avg:151.23ms step:607/1480 train_time:90293ms step_avg:151.24ms step:608/1480 train_time:90451ms step_avg:151.26ms step:609/1480 train_time:90611ms step_avg:151.27ms step:610/1480 train_time:90769ms step_avg:151.28ms step:611/1480 train_time:90929ms 
step_avg:151.30ms step:612/1480 train_time:91091ms step_avg:151.31ms step:613/1480 train_time:91251ms step_avg:151.33ms step:614/1480 train_time:91411ms step_avg:151.34ms step:615/1480 train_time:91570ms step_avg:151.36ms step:616/1480 train_time:91729ms step_avg:151.37ms step:617/1480 train_time:91889ms step_avg:151.38ms step:618/1480 train_time:92048ms step_avg:151.39ms step:619/1480 train_time:92209ms step_avg:151.41ms step:620/1480 train_time:92368ms step_avg:151.42ms step:621/1480 train_time:92528ms step_avg:151.44ms step:622/1480 train_time:92689ms step_avg:151.45ms step:623/1480 train_time:92849ms step_avg:151.47ms step:624/1480 train_time:93009ms step_avg:151.48ms step:625/1480 train_time:93168ms step_avg:151.49ms step:625/1480 val_loss:3.6069 train_time:93232ms step_avg:151.60ms step:626/1480 train_time:93331ms step_avg:151.51ms step:627/1480 train_time:93492ms step_avg:151.53ms step:628/1480 train_time:93650ms step_avg:151.54ms step:629/1480 train_time:93809ms step_avg:151.55ms step:630/1480 train_time:93966ms step_avg:151.56ms step:631/1480 train_time:94123ms step_avg:151.57ms step:632/1480 train_time:94282ms step_avg:151.58ms step:633/1480 train_time:94440ms step_avg:151.59ms step:634/1480 train_time:94599ms step_avg:151.60ms step:635/1480 train_time:94758ms step_avg:151.61ms step:636/1480 train_time:94918ms step_avg:151.63ms step:637/1480 train_time:95079ms step_avg:151.64ms step:638/1480 train_time:95238ms step_avg:151.65ms step:639/1480 train_time:95398ms step_avg:151.67ms step:640/1480 train_time:95558ms step_avg:151.68ms step:641/1480 train_time:95718ms step_avg:151.69ms step:642/1480 train_time:95878ms step_avg:151.71ms step:643/1480 train_time:96039ms step_avg:151.72ms step:644/1480 train_time:96198ms step_avg:151.73ms step:645/1480 train_time:96357ms step_avg:151.74ms step:646/1480 train_time:96516ms step_avg:151.76ms step:647/1480 train_time:96676ms step_avg:151.77ms step:648/1480 train_time:96838ms step_avg:151.78ms step:649/1480 
train_time:96997ms step_avg:151.79ms step:650/1480 train_time:97158ms step_avg:151.81ms step:651/1480 train_time:97318ms step_avg:151.82ms step:652/1480 train_time:97479ms step_avg:151.84ms step:653/1480 train_time:97638ms step_avg:151.85ms step:654/1480 train_time:97797ms step_avg:151.86ms step:655/1480 train_time:97957ms step_avg:151.87ms step:656/1480 train_time:98118ms step_avg:151.88ms step:657/1480 train_time:98279ms step_avg:151.90ms step:658/1480 train_time:98439ms step_avg:151.91ms step:659/1480 train_time:98602ms step_avg:151.93ms step:660/1480 train_time:98765ms step_avg:151.95ms step:661/1480 train_time:98926ms step_avg:151.96ms step:662/1480 train_time:99085ms step_avg:151.97ms step:663/1480 train_time:99243ms step_avg:151.98ms step:664/1480 train_time:99405ms step_avg:152.00ms step:665/1480 train_time:99566ms step_avg:152.01ms step:666/1480 train_time:99727ms step_avg:152.02ms step:667/1480 train_time:99889ms step_avg:152.04ms step:668/1480 train_time:100051ms step_avg:152.05ms step:669/1480 train_time:100213ms step_avg:152.07ms step:670/1480 train_time:100375ms step_avg:152.08ms step:671/1480 train_time:100536ms step_avg:152.10ms step:672/1480 train_time:100699ms step_avg:152.11ms step:673/1480 train_time:100862ms step_avg:152.13ms step:674/1480 train_time:101023ms step_avg:152.14ms step:675/1480 train_time:101184ms step_avg:152.16ms step:676/1480 train_time:101345ms step_avg:152.17ms step:677/1480 train_time:101506ms step_avg:152.18ms step:678/1480 train_time:101667ms step_avg:152.20ms step:679/1480 train_time:101829ms step_avg:152.21ms step:680/1480 train_time:101990ms step_avg:152.22ms step:681/1480 train_time:102152ms step_avg:152.24ms step:682/1480 train_time:102315ms step_avg:152.25ms step:683/1480 train_time:102477ms step_avg:152.27ms step:684/1480 train_time:102638ms step_avg:152.28ms step:685/1480 train_time:102802ms step_avg:152.30ms step:686/1480 train_time:102963ms step_avg:152.31ms step:687/1480 train_time:103122ms step_avg:152.32ms 
step:688/1480 train_time:103284ms step_avg:152.34ms step:689/1480 train_time:103447ms step_avg:152.35ms step:690/1480 train_time:103610ms step_avg:152.37ms step:691/1480 train_time:103772ms step_avg:152.38ms step:692/1480 train_time:103934ms step_avg:152.40ms step:693/1480 train_time:104096ms step_avg:152.41ms step:694/1480 train_time:104259ms step_avg:152.42ms step:695/1480 train_time:104419ms step_avg:152.44ms step:696/1480 train_time:104581ms step_avg:152.45ms step:697/1480 train_time:104742ms step_avg:152.46ms step:698/1480 train_time:104903ms step_avg:152.47ms step:699/1480 train_time:105064ms step_avg:152.49ms step:700/1480 train_time:105226ms step_avg:152.50ms step:701/1480 train_time:105385ms step_avg:152.51ms step:702/1480 train_time:105546ms step_avg:152.52ms step:703/1480 train_time:105706ms step_avg:152.53ms step:704/1480 train_time:105867ms step_avg:152.55ms step:705/1480 train_time:106031ms step_avg:152.56ms step:706/1480 train_time:106195ms step_avg:152.58ms step:707/1480 train_time:106358ms step_avg:152.59ms step:708/1480 train_time:106518ms step_avg:152.60ms step:709/1480 train_time:106681ms step_avg:152.62ms step:710/1480 train_time:106842ms step_avg:152.63ms step:711/1480 train_time:107003ms step_avg:152.64ms step:712/1480 train_time:107167ms step_avg:152.66ms step:713/1480 train_time:107333ms step_avg:152.68ms step:714/1480 train_time:107493ms step_avg:152.69ms step:715/1480 train_time:107653ms step_avg:152.70ms step:716/1480 train_time:107814ms step_avg:152.71ms step:717/1480 train_time:107979ms step_avg:152.73ms step:718/1480 train_time:108140ms step_avg:152.74ms step:719/1480 train_time:108300ms step_avg:152.75ms step:720/1480 train_time:108463ms step_avg:152.76ms step:721/1480 train_time:108624ms step_avg:152.78ms step:722/1480 train_time:108785ms step_avg:152.79ms step:723/1480 train_time:108944ms step_avg:152.80ms step:724/1480 train_time:109106ms step_avg:152.81ms step:725/1480 train_time:109269ms step_avg:152.82ms step:726/1480 
train_time:109433ms step_avg:152.84ms step:727/1480 train_time:109596ms step_avg:152.85ms step:728/1480 train_time:109757ms step_avg:152.86ms step:729/1480 train_time:109919ms step_avg:152.88ms step:730/1480 train_time:110083ms step_avg:152.89ms step:731/1480 train_time:110244ms step_avg:152.90ms step:732/1480 train_time:110403ms step_avg:152.91ms step:733/1480 train_time:110564ms step_avg:152.92ms step:734/1480 train_time:110725ms step_avg:152.94ms step:735/1480 train_time:110886ms step_avg:152.95ms step:736/1480 train_time:111046ms step_avg:152.96ms step:737/1480 train_time:111206ms step_avg:152.97ms step:738/1480 train_time:111367ms step_avg:152.98ms step:739/1480 train_time:111527ms step_avg:152.99ms step:740/1480 train_time:111693ms step_avg:153.00ms step:741/1480 train_time:111857ms step_avg:153.02ms step:742/1480 train_time:112019ms step_avg:153.03ms step:743/1480 train_time:112181ms step_avg:153.04ms step:744/1480 train_time:112345ms step_avg:153.06ms step:745/1480 train_time:112509ms step_avg:153.07ms step:746/1480 train_time:112668ms step_avg:153.08ms step:747/1480 train_time:112831ms step_avg:153.10ms step:748/1480 train_time:112997ms step_avg:153.11ms step:749/1480 train_time:113163ms step_avg:153.13ms step:750/1480 train_time:113322ms step_avg:153.14ms step:750/1480 val_loss:3.5545 train_time:113387ms step_avg:153.23ms step:751/1480 train_time:113488ms step_avg:153.15ms step:752/1480 train_time:113650ms step_avg:153.17ms step:753/1480 train_time:113811ms step_avg:153.18ms step:754/1480 train_time:113973ms step_avg:153.19ms step:755/1480 train_time:114134ms step_avg:153.20ms step:756/1480 train_time:114295ms step_avg:153.21ms step:757/1480 train_time:114458ms step_avg:153.22ms step:758/1480 train_time:114620ms step_avg:153.23ms step:759/1480 train_time:114782ms step_avg:153.25ms step:760/1480 train_time:114944ms step_avg:153.26ms step:761/1480 train_time:115106ms step_avg:153.27ms step:762/1480 train_time:115267ms step_avg:153.28ms step:763/1480 
train_time:115429ms step_avg:153.29ms step:764/1480 train_time:115591ms step_avg:153.30ms step:765/1480 train_time:115751ms step_avg:153.31ms step:766/1480 train_time:115915ms step_avg:153.33ms step:767/1480 train_time:116078ms step_avg:153.34ms step:768/1480 train_time:116241ms step_avg:153.35ms step:769/1480 train_time:116405ms step_avg:153.37ms step:770/1480 train_time:116567ms step_avg:153.38ms step:771/1480 train_time:116731ms step_avg:153.39ms step:772/1480 train_time:116893ms step_avg:153.40ms step:773/1480 train_time:117055ms step_avg:153.41ms step:774/1480 train_time:117219ms step_avg:153.43ms step:775/1480 train_time:117382ms step_avg:153.44ms step:776/1480 train_time:117547ms step_avg:153.46ms step:777/1480 train_time:117711ms step_avg:153.47ms step:778/1480 train_time:117874ms step_avg:153.48ms step:779/1480 train_time:118038ms step_avg:153.50ms step:780/1480 train_time:118202ms step_avg:153.51ms step:781/1480 train_time:118365ms step_avg:153.52ms step:782/1480 train_time:118528ms step_avg:153.53ms step:783/1480 train_time:118689ms step_avg:153.54ms step:784/1480 train_time:118853ms step_avg:153.56ms step:785/1480 train_time:119016ms step_avg:153.57ms step:786/1480 train_time:119181ms step_avg:153.58ms step:787/1480 train_time:119344ms step_avg:153.60ms step:788/1480 train_time:119507ms step_avg:153.61ms step:789/1480 train_time:119669ms step_avg:153.62ms step:790/1480 train_time:119833ms step_avg:153.63ms step:791/1480 train_time:120000ms step_avg:153.65ms step:792/1480 train_time:120166ms step_avg:153.66ms step:793/1480 train_time:120327ms step_avg:153.67ms step:794/1480 train_time:120491ms step_avg:153.69ms step:795/1480 train_time:120657ms step_avg:153.70ms step:796/1480 train_time:120824ms step_avg:153.72ms step:797/1480 train_time:120988ms step_avg:153.73ms step:798/1480 train_time:121151ms step_avg:153.74ms step:799/1480 train_time:121320ms step_avg:153.76ms step:800/1480 train_time:121484ms step_avg:153.78ms step:801/1480 train_time:121647ms 
step_avg:153.79ms step:802/1480 train_time:121813ms step_avg:153.80ms step:803/1480 train_time:121976ms step_avg:153.82ms step:804/1480 train_time:122139ms step_avg:153.83ms step:805/1480 train_time:122304ms step_avg:153.84ms step:806/1480 train_time:122465ms step_avg:153.85ms step:807/1480 train_time:122627ms step_avg:153.86ms step:808/1480 train_time:122791ms step_avg:153.87ms step:809/1480 train_time:122953ms step_avg:153.88ms step:810/1480 train_time:123115ms step_avg:153.89ms step:811/1480 train_time:123278ms step_avg:153.90ms step:812/1480 train_time:123442ms step_avg:153.92ms step:813/1480 train_time:123603ms step_avg:153.93ms step:814/1480 train_time:123765ms step_avg:153.94ms step:815/1480 train_time:123926ms step_avg:153.95ms step:816/1480 train_time:124091ms step_avg:153.96ms step:817/1480 train_time:124253ms step_avg:153.97ms step:818/1480 train_time:124415ms step_avg:153.98ms step:819/1480 train_time:124580ms step_avg:153.99ms step:820/1480 train_time:124744ms step_avg:154.01ms step:821/1480 train_time:124905ms step_avg:154.01ms step:822/1480 train_time:125068ms step_avg:154.02ms step:823/1480 train_time:125231ms step_avg:154.04ms step:824/1480 train_time:125393ms step_avg:154.05ms step:825/1480 train_time:125559ms step_avg:154.06ms step:826/1480 train_time:125724ms step_avg:154.07ms step:827/1480 train_time:125888ms step_avg:154.09ms step:828/1480 train_time:126049ms step_avg:154.09ms step:829/1480 train_time:126213ms step_avg:154.11ms step:830/1480 train_time:126379ms step_avg:154.12ms step:831/1480 train_time:126544ms step_avg:154.13ms step:832/1480 train_time:126708ms step_avg:154.15ms step:833/1480 train_time:126873ms step_avg:154.16ms step:834/1480 train_time:127038ms step_avg:154.17ms step:835/1480 train_time:127202ms step_avg:154.18ms step:836/1480 train_time:127366ms step_avg:154.20ms step:837/1480 train_time:127528ms step_avg:154.21ms step:838/1480 train_time:127690ms step_avg:154.21ms step:839/1480 train_time:127852ms step_avg:154.22ms 
step:840/1480 train_time:128014ms step_avg:154.23ms step:841/1480 train_time:128176ms step_avg:154.24ms step:842/1480 train_time:128340ms step_avg:154.25ms step:843/1480 train_time:128502ms step_avg:154.26ms step:844/1480 train_time:128664ms step_avg:154.27ms step:845/1480 train_time:128829ms step_avg:154.29ms step:846/1480 train_time:128993ms step_avg:154.30ms step:847/1480 train_time:129157ms step_avg:154.31ms step:848/1480 train_time:129319ms step_avg:154.32ms step:849/1480 train_time:129482ms step_avg:154.33ms step:850/1480 train_time:129645ms step_avg:154.34ms step:851/1480 train_time:129810ms step_avg:154.35ms step:852/1480 train_time:129972ms step_avg:154.36ms step:853/1480 train_time:130134ms step_avg:154.37ms step:854/1480 train_time:130297ms step_avg:154.38ms step:855/1480 train_time:130462ms step_avg:154.39ms step:856/1480 train_time:130623ms step_avg:154.40ms step:857/1480 train_time:130787ms step_avg:154.41ms step:858/1480 train_time:130951ms step_avg:154.42ms step:859/1480 train_time:131116ms step_avg:154.44ms step:860/1480 train_time:131278ms step_avg:154.44ms step:861/1480 train_time:131444ms step_avg:154.46ms step:862/1480 train_time:131611ms step_avg:154.47ms step:863/1480 train_time:131780ms step_avg:154.49ms step:864/1480 train_time:131945ms step_avg:154.50ms step:865/1480 train_time:132105ms step_avg:154.51ms step:866/1480 train_time:132272ms step_avg:154.52ms step:867/1480 train_time:132435ms step_avg:154.53ms step:868/1480 train_time:132596ms step_avg:154.54ms step:869/1480 train_time:132758ms step_avg:154.55ms step:870/1480 train_time:132923ms step_avg:154.56ms step:871/1480 train_time:133085ms step_avg:154.57ms step:872/1480 train_time:133248ms step_avg:154.58ms step:873/1480 train_time:133411ms step_avg:154.59ms step:874/1480 train_time:133579ms step_avg:154.60ms step:875/1480 train_time:133743ms step_avg:154.62ms step:875/1480 val_loss:3.5056 train_time:133808ms step_avg:154.69ms step:876/1480 train_time:133909ms step_avg:154.63ms 
step:877/1480 train_time:134076ms step_avg:154.64ms step:878/1480 train_time:134237ms step_avg:154.65ms step:879/1480 train_time:134400ms step_avg:154.66ms step:880/1480 train_time:134562ms step_avg:154.67ms step:881/1480 train_time:134724ms step_avg:154.68ms step:882/1480 train_time:134889ms step_avg:154.69ms step:883/1480 train_time:135056ms step_avg:154.70ms step:884/1480 train_time:135221ms step_avg:154.72ms step:885/1480 train_time:135387ms step_avg:154.73ms step:886/1480 train_time:135554ms step_avg:154.74ms step:887/1480 train_time:135722ms step_avg:154.76ms step:888/1480 train_time:135896ms step_avg:154.78ms step:889/1480 train_time:136064ms step_avg:154.79ms step:890/1480 train_time:136225ms step_avg:154.80ms step:891/1480 train_time:136392ms step_avg:154.82ms step:892/1480 train_time:136558ms step_avg:154.83ms step:893/1480 train_time:136720ms step_avg:154.84ms step:894/1480 train_time:136888ms step_avg:154.85ms step:895/1480 train_time:137054ms step_avg:154.86ms step:896/1480 train_time:137218ms step_avg:154.87ms step:897/1480 train_time:137386ms step_avg:154.89ms step:898/1480 train_time:137553ms step_avg:154.90ms step:899/1480 train_time:137717ms step_avg:154.91ms step:900/1480 train_time:137880ms step_avg:154.92ms step:901/1480 train_time:138045ms step_avg:154.93ms step:902/1480 train_time:138209ms step_avg:154.94ms step:903/1480 train_time:138385ms step_avg:154.97ms step:904/1480 train_time:138551ms step_avg:154.98ms step:905/1480 train_time:138714ms step_avg:154.99ms step:906/1480 train_time:138881ms step_avg:155.00ms step:907/1480 train_time:139051ms step_avg:155.02ms step:908/1480 train_time:139214ms step_avg:155.03ms step:909/1480 train_time:139378ms step_avg:155.04ms step:910/1480 train_time:139549ms step_avg:155.05ms step:911/1480 train_time:139715ms step_avg:155.07ms step:912/1480 train_time:139880ms step_avg:155.08ms step:913/1480 train_time:140049ms step_avg:155.09ms step:914/1480 train_time:140217ms step_avg:155.11ms step:915/1480 
train_time:140388ms step_avg:155.12ms step:916/1480 train_time:140552ms step_avg:155.13ms step:917/1480 train_time:140715ms step_avg:155.14ms step:918/1480 train_time:140882ms step_avg:155.16ms step:919/1480 train_time:141051ms step_avg:155.17ms step:920/1480 train_time:141216ms step_avg:155.18ms step:921/1480 train_time:141383ms step_avg:155.20ms step:922/1480 train_time:141552ms step_avg:155.21ms step:923/1480 train_time:141713ms step_avg:155.22ms step:924/1480 train_time:141878ms step_avg:155.23ms step:925/1480 train_time:142045ms step_avg:155.24ms step:926/1480 train_time:142208ms step_avg:155.25ms step:927/1480 train_time:142374ms step_avg:155.26ms step:928/1480 train_time:142538ms step_avg:155.27ms step:929/1480 train_time:142702ms step_avg:155.28ms step:930/1480 train_time:142869ms step_avg:155.29ms step:931/1480 train_time:143033ms step_avg:155.30ms step:932/1480 train_time:143198ms step_avg:155.31ms step:933/1480 train_time:143365ms step_avg:155.32ms step:934/1480 train_time:143532ms step_avg:155.34ms step:935/1480 train_time:143704ms step_avg:155.36ms step:936/1480 train_time:143873ms step_avg:155.37ms step:937/1480 train_time:144042ms step_avg:155.38ms step:938/1480 train_time:144203ms step_avg:155.39ms step:939/1480 train_time:144373ms step_avg:155.41ms step:940/1480 train_time:144540ms step_avg:155.42ms step:941/1480 train_time:144702ms step_avg:155.43ms step:942/1480 train_time:144868ms step_avg:155.44ms step:943/1480 train_time:145039ms step_avg:155.45ms step:944/1480 train_time:145210ms step_avg:155.47ms step:945/1480 train_time:145374ms step_avg:155.48ms step:946/1480 train_time:145543ms step_avg:155.49ms step:947/1480 train_time:145711ms step_avg:155.51ms step:948/1480 train_time:145877ms step_avg:155.52ms step:949/1480 train_time:146041ms step_avg:155.53ms step:950/1480 train_time:146205ms step_avg:155.54ms step:951/1480 train_time:146374ms step_avg:155.55ms step:952/1480 train_time:146539ms step_avg:155.56ms step:953/1480 train_time:146707ms 
step_avg:155.57ms step:954/1480 train_time:146876ms step_avg:155.59ms step:955/1480 train_time:147039ms step_avg:155.60ms step:956/1480 train_time:147206ms step_avg:155.61ms step:957/1480 train_time:147374ms step_avg:155.62ms step:958/1480 train_time:147543ms step_avg:155.64ms step:959/1480 train_time:147708ms step_avg:155.65ms step:960/1480 train_time:147875ms step_avg:155.66ms step:961/1480 train_time:148040ms step_avg:155.67ms step:962/1480 train_time:148203ms step_avg:155.68ms step:963/1480 train_time:148370ms step_avg:155.69ms step:964/1480 train_time:148538ms step_avg:155.70ms step:965/1480 train_time:148702ms step_avg:155.71ms step:966/1480 train_time:148867ms step_avg:155.72ms step:967/1480 train_time:149032ms step_avg:155.73ms step:968/1480 train_time:149197ms step_avg:155.74ms step:969/1480 train_time:149362ms step_avg:155.75ms step:970/1480 train_time:149525ms step_avg:155.76ms step:971/1480 train_time:149691ms step_avg:155.77ms step:972/1480 train_time:149857ms step_avg:155.78ms step:973/1480 train_time:150020ms step_avg:155.78ms step:974/1480 train_time:150189ms step_avg:155.80ms step:975/1480 train_time:150356ms step_avg:155.81ms step:976/1480 train_time:150520ms step_avg:155.82ms step:977/1480 train_time:150682ms step_avg:155.82ms step:978/1480 train_time:150850ms step_avg:155.84ms step:979/1480 train_time:151016ms step_avg:155.85ms step:980/1480 train_time:151182ms step_avg:155.86ms step:981/1480 train_time:151353ms step_avg:155.87ms step:982/1480 train_time:151515ms step_avg:155.88ms step:983/1480 train_time:151680ms step_avg:155.89ms step:984/1480 train_time:151845ms step_avg:155.90ms step:985/1480 train_time:152012ms step_avg:155.91ms step:986/1480 train_time:152177ms step_avg:155.92ms step:987/1480 train_time:152339ms step_avg:155.93ms step:988/1480 train_time:152507ms step_avg:155.94ms step:989/1480 train_time:152674ms step_avg:155.95ms step:990/1480 train_time:152841ms step_avg:155.96ms step:991/1480 train_time:153007ms step_avg:155.97ms 
step:992/1480 train_time:153183ms step_avg:155.99ms step:993/1480 train_time:153360ms step_avg:156.01ms step:994/1480 train_time:153524ms step_avg:156.02ms step:995/1480 train_time:153688ms step_avg:156.03ms step:996/1480 train_time:153851ms step_avg:156.04ms step:997/1480 train_time:154015ms step_avg:156.04ms step:998/1480 train_time:154178ms step_avg:156.05ms step:999/1480 train_time:154344ms step_avg:156.06ms step:1000/1480 train_time:154514ms step_avg:156.07ms step:1000/1480 val_loss:3.4428 train_time:154582ms step_avg:156.14ms step:1001/1480 train_time:154685ms step_avg:156.09ms step:1002/1480 train_time:154851ms step_avg:156.10ms step:1003/1480 train_time:155024ms step_avg:156.12ms step:1004/1480 train_time:155193ms step_avg:156.13ms step:1005/1480 train_time:155361ms step_avg:156.14ms step:1006/1480 train_time:155528ms step_avg:156.15ms step:1007/1480 train_time:155693ms step_avg:156.16ms step:1008/1480 train_time:155860ms step_avg:156.17ms step:1009/1480 train_time:156033ms step_avg:156.19ms step:1010/1480 train_time:156198ms step_avg:156.20ms step:1011/1480 train_time:156363ms step_avg:156.21ms step:1012/1480 train_time:156529ms step_avg:156.22ms step:1013/1480 train_time:156699ms step_avg:156.23ms step:1014/1480 train_time:156866ms step_avg:156.24ms step:1015/1480 train_time:157035ms step_avg:156.25ms step:1016/1480 train_time:157204ms step_avg:156.27ms step:1017/1480 train_time:157374ms step_avg:156.28ms step:1018/1480 train_time:157543ms step_avg:156.29ms step:1019/1480 train_time:157713ms step_avg:156.31ms step:1020/1480 train_time:157881ms step_avg:156.32ms step:1021/1480 train_time:158046ms step_avg:156.33ms step:1022/1480 train_time:158213ms step_avg:156.34ms step:1023/1480 train_time:158379ms step_avg:156.35ms step:1024/1480 train_time:158547ms step_avg:156.36ms step:1025/1480 train_time:158719ms step_avg:156.37ms step:1026/1480 train_time:158886ms step_avg:156.38ms step:1027/1480 train_time:159052ms step_avg:156.39ms step:1028/1480 
train_time:159227ms step_avg:156.41ms step:1029/1480 train_time:159399ms step_avg:156.43ms step:1030/1480 train_time:159566ms step_avg:156.44ms step:1031/1480 train_time:159731ms step_avg:156.45ms step:1032/1480 train_time:159902ms step_avg:156.46ms step:1033/1480 train_time:160069ms step_avg:156.47ms step:1034/1480 train_time:160238ms step_avg:156.48ms step:1035/1480 train_time:160408ms step_avg:156.50ms step:1036/1480 train_time:160573ms step_avg:156.50ms step:1037/1480 train_time:160740ms step_avg:156.51ms step:1038/1480 train_time:160909ms step_avg:156.53ms step:1039/1480 train_time:161080ms step_avg:156.54ms step:1040/1480 train_time:161247ms step_avg:156.55ms step:1041/1480 train_time:161415ms step_avg:156.56ms step:1042/1480 train_time:161578ms step_avg:156.57ms step:1043/1480 train_time:161743ms step_avg:156.58ms step:1044/1480 train_time:161910ms step_avg:156.59ms step:1045/1480 train_time:162077ms step_avg:156.60ms step:1046/1480 train_time:162246ms step_avg:156.61ms step:1047/1480 train_time:162413ms step_avg:156.62ms step:1048/1480 train_time:162578ms step_avg:156.63ms step:1049/1480 train_time:162743ms step_avg:156.63ms step:1050/1480 train_time:162913ms step_avg:156.65ms step:1051/1480 train_time:163081ms step_avg:156.66ms step:1052/1480 train_time:163249ms step_avg:156.67ms step:1053/1480 train_time:163415ms step_avg:156.68ms step:1054/1480 train_time:163584ms step_avg:156.69ms step:1055/1480 train_time:163751ms step_avg:156.70ms step:1056/1480 train_time:163916ms step_avg:156.71ms step:1057/1480 train_time:164082ms step_avg:156.72ms step:1058/1480 train_time:164252ms step_avg:156.73ms step:1059/1480 train_time:164428ms step_avg:156.75ms step:1060/1480 train_time:164596ms step_avg:156.76ms step:1061/1480 train_time:164758ms step_avg:156.76ms step:1062/1480 train_time:164923ms step_avg:156.77ms step:1063/1480 train_time:165089ms step_avg:156.78ms step:1064/1480 train_time:165253ms step_avg:156.79ms step:1065/1480 train_time:165420ms step_avg:156.80ms 
step:1066/1480 train_time:165588ms step_avg:156.81ms step:1067/1480 train_time:165756ms step_avg:156.82ms step:1068/1480 train_time:165922ms step_avg:156.83ms step:1069/1480 train_time:166093ms step_avg:156.84ms step:1070/1480 train_time:166258ms step_avg:156.85ms step:1071/1480 train_time:166433ms step_avg:156.86ms step:1072/1480 train_time:166597ms step_avg:156.87ms step:1073/1480 train_time:166759ms step_avg:156.88ms step:1074/1480 train_time:166927ms step_avg:156.89ms step:1075/1480 train_time:167097ms step_avg:156.90ms step:1076/1480 train_time:167264ms step_avg:156.91ms step:1077/1480 train_time:167431ms step_avg:156.92ms step:1078/1480 train_time:167607ms step_avg:156.94ms step:1079/1480 train_time:167779ms step_avg:156.95ms step:1080/1480 train_time:167948ms step_avg:156.96ms step:1081/1480 train_time:168115ms step_avg:156.97ms step:1082/1480 train_time:168281ms step_avg:156.98ms step:1083/1480 train_time:168448ms step_avg:156.99ms step:1084/1480 train_time:168616ms step_avg:157.00ms step:1085/1480 train_time:168785ms step_avg:157.01ms step:1086/1480 train_time:168953ms step_avg:157.02ms step:1087/1480 train_time:169119ms step_avg:157.03ms step:1088/1480 train_time:169289ms step_avg:157.04ms step:1089/1480 train_time:169459ms step_avg:157.05ms step:1090/1480 train_time:169632ms step_avg:157.07ms step:1091/1480 train_time:169800ms step_avg:157.08ms step:1092/1480 train_time:169969ms step_avg:157.09ms step:1093/1480 train_time:170137ms step_avg:157.10ms step:1094/1480 train_time:170302ms step_avg:157.11ms step:1095/1480 train_time:170467ms step_avg:157.11ms step:1096/1480 train_time:170636ms step_avg:157.12ms step:1097/1480 train_time:170805ms step_avg:157.13ms step:1098/1480 train_time:170975ms step_avg:157.15ms step:1099/1480 train_time:171146ms step_avg:157.16ms step:1100/1480 train_time:171318ms step_avg:157.17ms step:1101/1480 train_time:171490ms step_avg:157.19ms step:1102/1480 train_time:171662ms step_avg:157.20ms step:1103/1480 train_time:171837ms 
step_avg:157.22ms step:1104/1480 train_time:172006ms step_avg:157.23ms step:1105/1480 train_time:172175ms step_avg:157.24ms step:1106/1480 train_time:172343ms step_avg:157.25ms step:1107/1480 train_time:172512ms step_avg:157.26ms step:1108/1480 train_time:172677ms step_avg:157.27ms step:1109/1480 train_time:172844ms step_avg:157.27ms step:1110/1480 train_time:173011ms step_avg:157.28ms step:1111/1480 train_time:173178ms step_avg:157.29ms step:1112/1480 train_time:173350ms step_avg:157.30ms step:1113/1480 train_time:173530ms step_avg:157.33ms step:1114/1480 train_time:173701ms step_avg:157.34ms step:1115/1480 train_time:173874ms step_avg:157.35ms step:1116/1480 train_time:174039ms step_avg:157.36ms step:1117/1480 train_time:174213ms step_avg:157.37ms step:1118/1480 train_time:174387ms step_avg:157.39ms step:1119/1480 train_time:174554ms step_avg:157.40ms step:1120/1480 train_time:174723ms step_avg:157.41ms step:1121/1480 train_time:174894ms step_avg:157.42ms step:1122/1480 train_time:175059ms step_avg:157.43ms step:1123/1480 train_time:175226ms step_avg:157.44ms step:1124/1480 train_time:175394ms step_avg:157.45ms step:1125/1480 train_time:175561ms step_avg:157.45ms step:1125/1480 val_loss:3.3878 train_time:175629ms step_avg:157.51ms step:1126/1480 train_time:175731ms step_avg:157.47ms step:1127/1480 train_time:175902ms step_avg:157.48ms step:1128/1480 train_time:176072ms step_avg:157.49ms step:1129/1480 train_time:176247ms step_avg:157.50ms step:1130/1480 train_time:176415ms step_avg:157.51ms step:1131/1480 train_time:176594ms step_avg:157.53ms step:1132/1480 train_time:176761ms step_avg:157.54ms step:1133/1480 train_time:176931ms step_avg:157.55ms step:1134/1480 train_time:177102ms step_avg:157.56ms step:1135/1480 train_time:177269ms step_avg:157.57ms step:1136/1480 train_time:177442ms step_avg:157.59ms step:1137/1480 train_time:177610ms step_avg:157.60ms step:1138/1480 train_time:177783ms step_avg:157.61ms step:1139/1480 train_time:177950ms step_avg:157.62ms 
step:1140/1480 train_time:178118ms step_avg:157.63ms step:1141/1480 train_time:178289ms step_avg:157.64ms step:1142/1480 train_time:178457ms step_avg:157.65ms step:1143/1480 train_time:178627ms step_avg:157.66ms step:1144/1480 train_time:178797ms step_avg:157.67ms step:1145/1480 train_time:178963ms step_avg:157.68ms step:1146/1480 train_time:179134ms step_avg:157.69ms step:1147/1480 train_time:179303ms step_avg:157.70ms step:1148/1480 train_time:179472ms step_avg:157.71ms step:1149/1480 train_time:179644ms step_avg:157.72ms step:1150/1480 train_time:179811ms step_avg:157.73ms step:1151/1480 train_time:179983ms step_avg:157.74ms step:1152/1480 train_time:180155ms step_avg:157.75ms step:1153/1480 train_time:180329ms step_avg:157.77ms step:1154/1480 train_time:180496ms step_avg:157.78ms step:1155/1480 train_time:180668ms step_avg:157.79ms step:1156/1480 train_time:180846ms step_avg:157.81ms step:1157/1480 train_time:181016ms step_avg:157.82ms step:1158/1480 train_time:181184ms step_avg:157.83ms step:1159/1480 train_time:181351ms step_avg:157.83ms step:1160/1480 train_time:181516ms step_avg:157.84ms step:1161/1480 train_time:181687ms step_avg:157.85ms step:1162/1480 train_time:181857ms step_avg:157.86ms step:1163/1480 train_time:182027ms step_avg:157.87ms step:1164/1480 train_time:182195ms step_avg:157.88ms step:1165/1480 train_time:182362ms step_avg:157.89ms step:1166/1480 train_time:182529ms step_avg:157.90ms step:1167/1480 train_time:182698ms step_avg:157.91ms step:1168/1480 train_time:182866ms step_avg:157.92ms step:1169/1480 train_time:183035ms step_avg:157.93ms step:1170/1480 train_time:183203ms step_avg:157.93ms step:1171/1480 train_time:183369ms step_avg:157.94ms step:1172/1480 train_time:183537ms step_avg:157.95ms step:1173/1480 train_time:183708ms step_avg:157.96ms step:1174/1480 train_time:183891ms step_avg:157.98ms step:1175/1480 train_time:184063ms step_avg:157.99ms step:1176/1480 train_time:184236ms step_avg:158.01ms step:1177/1480 train_time:184413ms 
step_avg:158.02ms step:1178/1480 train_time:184581ms step_avg:158.03ms step:1179/1480 train_time:184747ms step_avg:158.04ms step:1180/1480 train_time:184927ms step_avg:158.06ms step:1181/1480 train_time:185096ms step_avg:158.07ms step:1182/1480 train_time:185264ms step_avg:158.08ms step:1183/1480 train_time:185434ms step_avg:158.08ms step:1184/1480 train_time:185602ms step_avg:158.09ms step:1185/1480 train_time:185774ms step_avg:158.11ms step:1186/1480 train_time:185945ms step_avg:158.12ms step:1187/1480 train_time:186128ms step_avg:158.14ms step:1188/1480 train_time:186294ms step_avg:158.14ms step:1189/1480 train_time:186468ms step_avg:158.16ms step:1190/1480 train_time:186636ms step_avg:158.17ms step:1191/1480 train_time:186807ms step_avg:158.18ms step:1192/1480 train_time:186973ms step_avg:158.18ms step:1193/1480 train_time:187141ms step_avg:158.19ms step:1194/1480 train_time:187309ms step_avg:158.20ms step:1195/1480 train_time:187483ms step_avg:158.21ms step:1196/1480 train_time:187665ms step_avg:158.23ms step:1197/1480 train_time:187837ms step_avg:158.25ms step:1198/1480 train_time:188019ms step_avg:158.27ms step:1199/1480 train_time:188189ms step_avg:158.28ms step:1200/1480 train_time:188358ms step_avg:158.28ms step:1201/1480 train_time:188526ms step_avg:158.29ms step:1202/1480 train_time:188708ms step_avg:158.31ms step:1203/1480 train_time:188884ms step_avg:158.33ms step:1204/1480 train_time:189059ms step_avg:158.34ms step:1205/1480 train_time:189227ms step_avg:158.35ms step:1206/1480 train_time:189394ms step_avg:158.36ms step:1207/1480 train_time:189564ms step_avg:158.37ms step:1208/1480 train_time:189730ms step_avg:158.37ms step:1209/1480 train_time:189903ms step_avg:158.38ms step:1210/1480 train_time:190079ms step_avg:158.40ms step:1211/1480 train_time:190254ms step_avg:158.41ms step:1212/1480 train_time:190426ms step_avg:158.42ms step:1213/1480 train_time:190598ms step_avg:158.44ms step:1214/1480 train_time:190774ms step_avg:158.45ms step:1215/1480 
train_time:190947ms step_avg:158.46ms step:1216/1480 train_time:191116ms step_avg:158.47ms step:1217/1480 train_time:191290ms step_avg:158.48ms step:1218/1480 train_time:191462ms step_avg:158.49ms step:1219/1480 train_time:191639ms step_avg:158.51ms step:1220/1480 train_time:191807ms step_avg:158.52ms step:1221/1480 train_time:191976ms step_avg:158.53ms step:1222/1480 train_time:192144ms step_avg:158.53ms step:1223/1480 train_time:192314ms step_avg:158.54ms step:1224/1480 train_time:192491ms step_avg:158.56ms step:1225/1480 train_time:192662ms step_avg:158.57ms step:1226/1480 train_time:192834ms step_avg:158.58ms step:1227/1480 train_time:193007ms step_avg:158.59ms step:1228/1480 train_time:193176ms step_avg:158.60ms step:1229/1480 train_time:193350ms step_avg:158.61ms step:1230/1480 train_time:193531ms step_avg:158.63ms step:1231/1480 train_time:193706ms step_avg:158.65ms step:1232/1480 train_time:193882ms step_avg:158.66ms step:1233/1480 train_time:194052ms step_avg:158.67ms step:1234/1480 train_time:194222ms step_avg:158.68ms step:1235/1480 train_time:194397ms step_avg:158.69ms step:1236/1480 train_time:194566ms step_avg:158.70ms step:1237/1480 train_time:194734ms step_avg:158.71ms step:1238/1480 train_time:194920ms step_avg:158.73ms step:1239/1480 train_time:195091ms step_avg:158.74ms step:1240/1480 train_time:195262ms step_avg:158.75ms step:1241/1480 train_time:195437ms step_avg:158.76ms step:1242/1480 train_time:195607ms step_avg:158.77ms step:1243/1480 train_time:195781ms step_avg:158.78ms step:1244/1480 train_time:195948ms step_avg:158.79ms step:1245/1480 train_time:196118ms step_avg:158.80ms step:1246/1480 train_time:196287ms step_avg:158.81ms step:1247/1480 train_time:196458ms step_avg:158.82ms step:1248/1480 train_time:196627ms step_avg:158.83ms step:1249/1480 train_time:196797ms step_avg:158.84ms step:1250/1480 train_time:196966ms step_avg:158.84ms step:1250/1480 val_loss:3.3389 train_time:197037ms step_avg:158.90ms step:1251/1480 train_time:197148ms 
step_avg:158.86ms step:1252/1480 train_time:197317ms step_avg:158.87ms step:1253/1480 train_time:197486ms step_avg:158.88ms step:1254/1480 train_time:197657ms step_avg:158.89ms step:1255/1480 train_time:197844ms step_avg:158.91ms step:1256/1480 train_time:198018ms step_avg:158.92ms step:1257/1480 train_time:198187ms step_avg:158.93ms step:1258/1480 train_time:198362ms step_avg:158.94ms step:1259/1480 train_time:198534ms step_avg:158.95ms step:1260/1480 train_time:198701ms step_avg:158.96ms step:1261/1480 train_time:198873ms step_avg:158.97ms step:1262/1480 train_time:199047ms step_avg:158.98ms step:1263/1480 train_time:199222ms step_avg:159.00ms step:1264/1480 train_time:199387ms step_avg:159.00ms step:1265/1480 train_time:199554ms step_avg:159.01ms step:1266/1480 train_time:199725ms step_avg:159.02ms step:1267/1480 train_time:199895ms step_avg:159.03ms step:1268/1480 train_time:200067ms step_avg:159.04ms step:1269/1480 train_time:200244ms step_avg:159.05ms step:1270/1480 train_time:200413ms step_avg:159.06ms step:1271/1480 train_time:200584ms step_avg:159.07ms step:1272/1480 train_time:200751ms step_avg:159.07ms step:1273/1480 train_time:200923ms step_avg:159.08ms step:1274/1480 train_time:201093ms step_avg:159.09ms step:1275/1480 train_time:201263ms step_avg:159.10ms step:1276/1480 train_time:201427ms step_avg:159.11ms step:1277/1480 train_time:201600ms step_avg:159.12ms step:1278/1480 train_time:201769ms step_avg:159.12ms step:1279/1480 train_time:201941ms step_avg:159.13ms step:1280/1480 train_time:202122ms step_avg:159.15ms step:1281/1480 train_time:202291ms step_avg:159.16ms step:1282/1480 train_time:202458ms step_avg:159.17ms step:1283/1480 train_time:202628ms step_avg:159.17ms step:1284/1480 train_time:202800ms step_avg:159.18ms step:1285/1480 train_time:202969ms step_avg:159.19ms step:1286/1480 train_time:203140ms step_avg:159.20ms step:1287/1480 train_time:203312ms step_avg:159.21ms step:1288/1480 train_time:203485ms step_avg:159.22ms step:1289/1480 
train_time:203666ms step_avg:159.24ms step:1290/1480 train_time:203846ms step_avg:159.26ms step:1291/1480 train_time:204020ms step_avg:159.27ms step:1292/1480 train_time:204194ms step_avg:159.28ms step:1293/1480 train_time:204369ms step_avg:159.29ms step:1294/1480 train_time:204541ms step_avg:159.30ms step:1295/1480 train_time:204711ms step_avg:159.31ms step:1296/1480 train_time:204885ms step_avg:159.32ms step:1297/1480 train_time:205056ms step_avg:159.33ms step:1298/1480 train_time:205226ms step_avg:159.34ms step:1299/1480 train_time:205397ms step_avg:159.35ms step:1300/1480 train_time:205566ms step_avg:159.35ms step:1301/1480 train_time:205734ms step_avg:159.36ms step:1302/1480 train_time:205908ms step_avg:159.37ms step:1303/1480 train_time:206087ms step_avg:159.39ms step:1304/1480 train_time:206262ms step_avg:159.40ms step:1305/1480 train_time:206430ms step_avg:159.41ms step:1306/1480 train_time:206605ms step_avg:159.42ms step:1307/1480 train_time:206772ms step_avg:159.42ms step:1308/1480 train_time:206941ms step_avg:159.43ms step:1309/1480 train_time:207111ms step_avg:159.44ms step:1310/1480 train_time:207283ms step_avg:159.45ms step:1311/1480 train_time:207452ms step_avg:159.46ms step:1312/1480 train_time:207626ms step_avg:159.47ms step:1313/1480 train_time:207796ms step_avg:159.47ms step:1314/1480 train_time:207969ms step_avg:159.49ms step:1315/1480 train_time:208141ms step_avg:159.50ms step:1316/1480 train_time:208308ms step_avg:159.50ms step:1317/1480 train_time:208481ms step_avg:159.51ms step:1318/1480 train_time:208662ms step_avg:159.53ms step:1319/1480 train_time:208837ms step_avg:159.54ms step:1320/1480 train_time:209011ms step_avg:159.55ms step:1321/1480 train_time:209185ms step_avg:159.56ms step:1322/1480 train_time:209364ms step_avg:159.58ms step:1323/1480 train_time:209536ms step_avg:159.59ms step:1324/1480 train_time:209712ms step_avg:159.60ms step:1325/1480 train_time:209892ms step_avg:159.61ms step:1326/1480 train_time:210067ms step_avg:159.63ms 
step:1327/1480 train_time:210237ms step_avg:159.63ms step:1328/1480 train_time:210408ms step_avg:159.64ms step:1329/1480 train_time:210604ms step_avg:159.67ms step:1330/1480 train_time:210786ms step_avg:159.69ms step:1331/1480 train_time:210956ms step_avg:159.69ms step:1332/1480 train_time:211131ms step_avg:159.71ms step:1333/1480 train_time:211305ms step_avg:159.72ms step:1334/1480 train_time:211477ms step_avg:159.73ms step:1335/1480 train_time:211646ms step_avg:159.73ms step:1336/1480 train_time:211830ms step_avg:159.75ms step:1337/1480 train_time:212006ms step_avg:159.76ms step:1338/1480 train_time:212178ms step_avg:159.77ms step:1339/1480 train_time:212351ms step_avg:159.78ms step:1340/1480 train_time:212524ms step_avg:159.79ms step:1341/1480 train_time:212691ms step_avg:159.80ms step:1342/1480 train_time:212865ms step_avg:159.81ms step:1343/1480 train_time:213034ms step_avg:159.82ms step:1344/1480 train_time:213207ms step_avg:159.82ms step:1345/1480 train_time:213385ms step_avg:159.84ms step:1346/1480 train_time:213554ms step_avg:159.85ms step:1347/1480 train_time:213724ms step_avg:159.85ms step:1348/1480 train_time:213893ms step_avg:159.86ms step:1349/1480 train_time:214063ms step_avg:159.87ms step:1350/1480 train_time:214239ms step_avg:159.88ms step:1351/1480 train_time:214410ms step_avg:159.89ms step:1352/1480 train_time:214581ms step_avg:159.90ms step:1353/1480 train_time:214757ms step_avg:159.91ms step:1354/1480 train_time:214928ms step_avg:159.92ms step:1355/1480 train_time:215095ms step_avg:159.92ms step:1356/1480 train_time:215267ms step_avg:159.93ms step:1357/1480 train_time:215442ms step_avg:159.94ms step:1358/1480 train_time:215614ms step_avg:159.95ms step:1359/1480 train_time:215786ms step_avg:159.96ms step:1360/1480 train_time:215962ms step_avg:159.97ms step:1361/1480 train_time:216139ms step_avg:159.98ms step:1362/1480 train_time:216315ms step_avg:160.00ms step:1363/1480 train_time:216497ms step_avg:160.01ms step:1364/1480 train_time:216666ms 
step_avg:160.02ms step:1365/1480 train_time:216833ms step_avg:160.02ms step:1366/1480 train_time:217005ms step_avg:160.03ms step:1367/1480 train_time:217177ms step_avg:160.04ms step:1368/1480 train_time:217350ms step_avg:160.05ms step:1369/1480 train_time:217530ms step_avg:160.07ms step:1370/1480 train_time:217707ms step_avg:160.08ms step:1371/1480 train_time:217879ms step_avg:160.09ms step:1372/1480 train_time:218058ms step_avg:160.10ms step:1373/1480 train_time:218227ms step_avg:160.11ms step:1374/1480 train_time:218405ms step_avg:160.12ms step:1375/1480 train_time:218575ms step_avg:160.13ms step:1375/1480 val_loss:3.2997 train_time:218643ms step_avg:160.18ms step:1376/1480 train_time:218748ms step_avg:160.14ms step:1377/1480 train_time:218920ms step_avg:160.15ms step:1378/1480 train_time:219088ms step_avg:160.15ms step:1379/1480 train_time:219264ms step_avg:160.16ms step:1380/1480 train_time:219439ms step_avg:160.17ms step:1381/1480 train_time:219621ms step_avg:160.19ms step:1382/1480 train_time:219793ms step_avg:160.20ms step:1383/1480 train_time:219965ms step_avg:160.21ms step:1384/1480 train_time:220141ms step_avg:160.22ms step:1385/1480 train_time:220306ms step_avg:160.22ms step:1386/1480 train_time:220476ms step_avg:160.23ms step:1387/1480 train_time:220648ms step_avg:160.24ms step:1388/1480 train_time:220817ms step_avg:160.24ms step:1389/1480 train_time:220989ms step_avg:160.25ms step:1390/1480 train_time:221159ms step_avg:160.26ms step:1391/1480 train_time:221328ms step_avg:160.27ms step:1392/1480 train_time:221500ms step_avg:160.27ms step:1393/1480 train_time:221670ms step_avg:160.28ms step:1394/1480 train_time:221840ms step_avg:160.29ms step:1395/1480 train_time:222007ms step_avg:160.29ms step:1396/1480 train_time:222177ms step_avg:160.30ms step:1397/1480 train_time:222344ms step_avg:160.31ms step:1398/1480 train_time:222511ms step_avg:160.31ms step:1399/1480 train_time:222680ms step_avg:160.32ms step:1400/1480 train_time:222858ms step_avg:160.33ms 
step:1401/1480 train_time:223024ms step_avg:160.33ms step:1402/1480 train_time:223196ms step_avg:160.34ms step:1403/1480 train_time:223372ms step_avg:160.35ms step:1404/1480 train_time:223543ms step_avg:160.36ms step:1405/1480 train_time:223718ms step_avg:160.37ms step:1406/1480 train_time:223893ms step_avg:160.38ms step:1407/1480 train_time:224062ms step_avg:160.39ms step:1408/1480 train_time:224230ms step_avg:160.39ms step:1409/1480 train_time:224413ms step_avg:160.41ms step:1410/1480 train_time:224582ms step_avg:160.42ms step:1411/1480 train_time:224749ms step_avg:160.42ms step:1412/1480 train_time:224919ms step_avg:160.43ms step:1413/1480 train_time:225089ms step_avg:160.43ms step:1414/1480 train_time:225262ms step_avg:160.44ms step:1415/1480 train_time:225436ms step_avg:160.45ms step:1416/1480 train_time:225622ms step_avg:160.47ms step:1417/1480 train_time:225797ms step_avg:160.48ms step:1418/1480 train_time:225967ms step_avg:160.49ms step:1419/1480 train_time:226142ms step_avg:160.50ms step:1420/1480 train_time:226317ms step_avg:160.51ms step:1421/1480 train_time:226492ms step_avg:160.52ms step:1422/1480 train_time:226664ms step_avg:160.53ms step:1423/1480 train_time:226833ms step_avg:160.53ms step:1424/1480 train_time:227009ms step_avg:160.54ms step:1425/1480 train_time:227189ms step_avg:160.56ms step:1426/1480 train_time:227361ms step_avg:160.57ms step:1427/1480 train_time:227536ms step_avg:160.58ms step:1428/1480 train_time:227708ms step_avg:160.58ms step:1429/1480 train_time:227877ms step_avg:160.59ms step:1430/1480 train_time:228049ms step_avg:160.60ms step:1431/1480 train_time:228225ms step_avg:160.61ms step:1432/1480 train_time:228401ms step_avg:160.62ms step:1433/1480 train_time:228581ms step_avg:160.63ms step:1434/1480 train_time:228762ms step_avg:160.65ms step:1435/1480 train_time:228936ms step_avg:160.66ms step:1436/1480 train_time:229110ms step_avg:160.67ms step:1437/1480 train_time:229281ms step_avg:160.67ms step:1438/1480 train_time:229450ms 
step_avg:160.68ms step:1439/1480 train_time:229625ms step_avg:160.69ms step:1440/1480 train_time:229797ms step_avg:160.70ms step:1441/1480 train_time:229967ms step_avg:160.70ms step:1442/1480 train_time:230145ms step_avg:160.72ms step:1443/1480 train_time:230334ms step_avg:160.74ms step:1444/1480 train_time:230505ms step_avg:160.74ms step:1445/1480 train_time:230675ms step_avg:160.75ms step:1446/1480 train_time:230849ms step_avg:160.76ms step:1447/1480 train_time:231027ms step_avg:160.77ms step:1448/1480 train_time:231199ms step_avg:160.78ms step:1449/1480 train_time:231373ms step_avg:160.79ms step:1450/1480 train_time:231545ms step_avg:160.80ms step:1451/1480 train_time:231716ms step_avg:160.80ms step:1452/1480 train_time:231891ms step_avg:160.81ms step:1453/1480 train_time:232061ms step_avg:160.82ms step:1454/1480 train_time:232234ms step_avg:160.83ms step:1455/1480 train_time:232411ms step_avg:160.84ms step:1456/1480 train_time:232584ms step_avg:160.85ms step:1457/1480 train_time:232755ms step_avg:160.85ms step:1458/1480 train_time:232924ms step_avg:160.86ms step:1459/1480 train_time:233102ms step_avg:160.87ms step:1460/1480 train_time:233274ms step_avg:160.88ms step:1461/1480 train_time:233450ms step_avg:160.89ms step:1462/1480 train_time:233621ms step_avg:160.90ms step:1463/1480 train_time:233798ms step_avg:160.91ms step:1464/1480 train_time:233973ms step_avg:160.92ms step:1465/1480 train_time:234144ms step_avg:160.92ms step:1466/1480 train_time:234315ms step_avg:160.93ms step:1467/1480 train_time:234491ms step_avg:160.94ms step:1468/1480 train_time:234662ms step_avg:160.95ms step:1469/1480 train_time:234836ms step_avg:160.96ms step:1470/1480 train_time:235015ms step_avg:160.97ms step:1471/1480 train_time:235203ms step_avg:160.99ms step:1472/1480 train_time:235384ms step_avg:161.00ms step:1473/1480 train_time:235555ms step_avg:161.01ms step:1474/1480 train_time:235731ms step_avg:161.02ms step:1475/1480 train_time:235910ms step_avg:161.03ms step:1476/1480 
train_time:236083ms step_avg:161.04ms step:1477/1480 train_time:236265ms step_avg:161.05ms step:1478/1480 train_time:236447ms step_avg:161.07ms step:1479/1480 train_time:236622ms step_avg:161.08ms step:1480/1480 train_time:236795ms step_avg:161.08ms step:1480/1480 val_loss:3.2809 train_time:236867ms step_avg:161.13ms