import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free nn.Linear whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; cos/sin tables are cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): seq on dim 1, rotation split on dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK-norm, rotary embeddings and a learned mix of token value embeddings."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head dimension moved in front of the sequence dimension
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: learned mix of x and x0, then attention and MLP residual branches."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initial mix is (1, 0): the block starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """GPT with U-net style skip connections, token value embeddings and document-aware sliding-window attention."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """
        Return the cross-entropy loss of predicting `target` from the 1D token stream `idx`.

        `sliding_window` is a scalar tensor holding the attention window, in tokens.
        len(idx) must be a multiple of BLOCK_SIZE (128).
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # fail with a readable message instead of an opaque reshape error below
        assert len(idx) % BLOCK_SIZE == 0, "sequence length must be a multiple of BLOCK_SIZE"
        # running count of token id 50256 = document id of every position
        docs = (idx == 50256).cumsum(0)
        # first / last document id found in each BLOCK_SIZE-token block
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal, same document, and within the sliding window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # block-level (BLOCK_SIZE x BLOCK_SIZE) version of the mask above
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can share a document iff their [low, high] document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # NOTE(review): this block-level test looks stricter than the token-level window in
            # document_sliding_window_causal (e.g. for sliding_window=64 only the diagonal block
            # passes, although the first query of a block is within 64 tokens of keys in the
            # previous block) - confirm whether this truncation is intended before changing it.
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort puts the indices of the selected kv blocks first, in order
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one n_embd-wide slice of the value embedding per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the 256-int32 header into a pinned-memory tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serve (inputs, targets) windows of T tokens from sharded .bin files.

    Within a shard, process `process_rank` starts at offset process_rank * T and every
    call to next_batch() advances by T * num_processes tokens.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1, "every shard must hold at least one full batch"
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # -1 so that advance() lands on shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return (x, y) CUDA tensors of T tokens each, with y being x shifted by one token."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # the log file lives directly under logs/, which must exist before it can be opened
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, print it too."""
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop below runs exactly one forward/backward per optimizer step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 08:06:21 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 126W / 700W | 119MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 115W / 700W | 119MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 40C P0 118W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 128W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22912ms step_avg:nanms step:2/1480 train_time:22998ms step_avg:nanms step:3/1480 train_time:23136ms step_avg:nanms step:4/1480 train_time:23277ms step_avg:nanms step:5/1480 train_time:23416ms step_avg:nanms step:6/1480 train_time:23558ms step_avg:nanms step:7/1480 train_time:23698ms step_avg:nanms step:8/1480 train_time:23840ms step_avg:nanms step:9/1480 train_time:23985ms step_avg:nanms step:10/1480 train_time:24130ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:285ms step_avg:nanms step:13/1480 train_time:427ms step_avg:142.19ms step:14/1480 train_time:568ms step_avg:141.95ms step:15/1480 train_time:709ms step_avg:141.85ms step:16/1480 train_time:853ms step_avg:142.22ms step:17/1480 train_time:1000ms step_avg:142.84ms step:18/1480 train_time:1144ms step_avg:143.01ms step:19/1480 train_time:1286ms step_avg:142.93ms step:20/1480 train_time:1428ms step_avg:142.76ms step:21/1480 train_time:1569ms step_avg:142.59ms step:22/1480 train_time:1709ms step_avg:142.43ms step:23/1480 train_time:1851ms step_avg:142.37ms step:24/1480 train_time:1997ms step_avg:142.61ms step:25/1480 train_time:2142ms step_avg:142.78ms step:26/1480 train_time:2284ms step_avg:142.76ms step:27/1480 train_time:2426ms step_avg:142.71ms step:28/1480 train_time:2567ms step_avg:142.63ms step:29/1480 
train_time:2709ms step_avg:142.59ms step:30/1480 train_time:2851ms step_avg:142.55ms step:31/1480 train_time:2994ms step_avg:142.58ms step:32/1480 train_time:3139ms step_avg:142.66ms step:33/1480 train_time:3282ms step_avg:142.69ms step:34/1480 train_time:3425ms step_avg:142.72ms step:35/1480 train_time:3568ms step_avg:142.70ms step:36/1480 train_time:3710ms step_avg:142.70ms step:37/1480 train_time:3852ms step_avg:142.65ms step:38/1480 train_time:3993ms step_avg:142.62ms step:39/1480 train_time:4137ms step_avg:142.66ms step:40/1480 train_time:4282ms step_avg:142.73ms step:41/1480 train_time:4425ms step_avg:142.75ms step:42/1480 train_time:4566ms step_avg:142.70ms step:43/1480 train_time:4708ms step_avg:142.65ms step:44/1480 train_time:4849ms step_avg:142.61ms step:45/1480 train_time:4991ms step_avg:142.61ms step:46/1480 train_time:5136ms step_avg:142.67ms step:47/1480 train_time:5281ms step_avg:142.74ms step:48/1480 train_time:5425ms step_avg:142.77ms step:49/1480 train_time:5567ms step_avg:142.73ms step:50/1480 train_time:5708ms step_avg:142.69ms step:51/1480 train_time:5849ms step_avg:142.66ms step:52/1480 train_time:5990ms step_avg:142.62ms step:53/1480 train_time:6133ms step_avg:142.63ms step:54/1480 train_time:6277ms step_avg:142.65ms step:55/1480 train_time:6421ms step_avg:142.69ms step:56/1480 train_time:6564ms step_avg:142.70ms step:57/1480 train_time:6706ms step_avg:142.68ms step:58/1480 train_time:6847ms step_avg:142.64ms step:59/1480 train_time:6987ms step_avg:142.60ms step:60/1480 train_time:7131ms step_avg:142.62ms step:61/1480 train_time:7275ms step_avg:142.64ms step:62/1480 train_time:7419ms step_avg:142.68ms step:63/1480 train_time:7563ms step_avg:142.70ms step:64/1480 train_time:7706ms step_avg:142.71ms step:65/1480 train_time:7848ms step_avg:142.68ms step:66/1480 train_time:7989ms step_avg:142.65ms step:67/1480 train_time:8131ms step_avg:142.65ms step:68/1480 train_time:8275ms step_avg:142.67ms step:69/1480 train_time:8420ms step_avg:142.71ms 
step:70/1480 train_time:8564ms step_avg:142.73ms step:71/1480 train_time:8707ms step_avg:142.75ms step:72/1480 train_time:8849ms step_avg:142.72ms step:73/1480 train_time:8989ms step_avg:142.68ms step:74/1480 train_time:9130ms step_avg:142.65ms step:75/1480 train_time:9273ms step_avg:142.66ms step:76/1480 train_time:9418ms step_avg:142.69ms step:77/1480 train_time:9561ms step_avg:142.69ms step:78/1480 train_time:9704ms step_avg:142.71ms step:79/1480 train_time:9845ms step_avg:142.69ms step:80/1480 train_time:9987ms step_avg:142.67ms step:81/1480 train_time:10130ms step_avg:142.67ms step:82/1480 train_time:10272ms step_avg:142.67ms step:83/1480 train_time:10416ms step_avg:142.69ms step:84/1480 train_time:10560ms step_avg:142.70ms step:85/1480 train_time:10703ms step_avg:142.70ms step:86/1480 train_time:10846ms step_avg:142.72ms step:87/1480 train_time:10987ms step_avg:142.69ms step:88/1480 train_time:11129ms step_avg:142.68ms step:89/1480 train_time:11272ms step_avg:142.68ms step:90/1480 train_time:11414ms step_avg:142.67ms step:91/1480 train_time:11558ms step_avg:142.69ms step:92/1480 train_time:11703ms step_avg:142.72ms step:93/1480 train_time:11845ms step_avg:142.71ms step:94/1480 train_time:11985ms step_avg:142.68ms step:95/1480 train_time:12127ms step_avg:142.67ms step:96/1480 train_time:12269ms step_avg:142.66ms step:97/1480 train_time:12411ms step_avg:142.65ms step:98/1480 train_time:12553ms step_avg:142.64ms step:99/1480 train_time:12695ms step_avg:142.64ms step:100/1480 train_time:12839ms step_avg:142.66ms step:101/1480 train_time:12982ms step_avg:142.66ms step:102/1480 train_time:13124ms step_avg:142.65ms step:103/1480 train_time:13265ms step_avg:142.64ms step:104/1480 train_time:13408ms step_avg:142.63ms step:105/1480 train_time:13549ms step_avg:142.62ms step:106/1480 train_time:13692ms step_avg:142.63ms step:107/1480 train_time:13835ms step_avg:142.63ms step:108/1480 train_time:13980ms step_avg:142.65ms step:109/1480 train_time:14123ms step_avg:142.66ms 
step:110/1480 train_time:14266ms step_avg:142.66ms step:111/1480 train_time:14410ms step_avg:142.67ms step:112/1480 train_time:14554ms step_avg:142.69ms step:113/1480 train_time:14704ms step_avg:142.75ms step:114/1480 train_time:14850ms step_avg:142.79ms step:115/1480 train_time:14997ms step_avg:142.83ms step:116/1480 train_time:15146ms step_avg:142.88ms step:117/1480 train_time:15291ms step_avg:142.91ms step:118/1480 train_time:15438ms step_avg:142.95ms step:119/1480 train_time:15586ms step_avg:142.99ms step:120/1480 train_time:15732ms step_avg:143.02ms step:121/1480 train_time:15880ms step_avg:143.06ms step:122/1480 train_time:16028ms step_avg:143.11ms step:123/1480 train_time:16172ms step_avg:143.12ms step:124/1480 train_time:16319ms step_avg:143.15ms step:125/1480 train_time:16467ms step_avg:143.19ms step:125/1480 val_loss:4.4110 train_time:16524ms step_avg:143.68ms step:126/1480 train_time:16619ms step_avg:143.27ms step:127/1480 train_time:16769ms step_avg:143.33ms step:128/1480 train_time:16916ms step_avg:143.36ms step:129/1480 train_time:17062ms step_avg:143.38ms step:130/1480 train_time:17209ms step_avg:143.41ms step:131/1480 train_time:17356ms step_avg:143.43ms step:132/1480 train_time:17502ms step_avg:143.46ms step:133/1480 train_time:17653ms step_avg:143.52ms step:134/1480 train_time:17800ms step_avg:143.55ms step:135/1480 train_time:17948ms step_avg:143.58ms step:136/1480 train_time:18095ms step_avg:143.61ms step:137/1480 train_time:18240ms step_avg:143.62ms step:138/1480 train_time:18384ms step_avg:143.63ms step:139/1480 train_time:18532ms step_avg:143.66ms step:140/1480 train_time:18680ms step_avg:143.69ms step:141/1480 train_time:18827ms step_avg:143.72ms step:142/1480 train_time:18976ms step_avg:143.76ms step:143/1480 train_time:19122ms step_avg:143.77ms step:144/1480 train_time:19269ms step_avg:143.80ms step:145/1480 train_time:19416ms step_avg:143.82ms step:146/1480 train_time:19562ms step_avg:143.84ms step:147/1480 train_time:19710ms 
step_avg:143.87ms step:148/1480 train_time:19857ms step_avg:143.89ms step:149/1480 train_time:20004ms step_avg:143.91ms step:150/1480 train_time:20152ms step_avg:143.94ms step:151/1480 train_time:20299ms step_avg:143.97ms step:152/1480 train_time:20446ms step_avg:143.98ms step:153/1480 train_time:20593ms step_avg:144.01ms step:154/1480 train_time:20740ms step_avg:144.03ms step:155/1480 train_time:20885ms step_avg:144.04ms step:156/1480 train_time:21035ms step_avg:144.08ms step:157/1480 train_time:21181ms step_avg:144.09ms step:158/1480 train_time:21327ms step_avg:144.10ms step:159/1480 train_time:21475ms step_avg:144.13ms step:160/1480 train_time:21621ms step_avg:144.14ms step:161/1480 train_time:21769ms step_avg:144.16ms step:162/1480 train_time:21916ms step_avg:144.19ms step:163/1480 train_time:22063ms step_avg:144.20ms step:164/1480 train_time:22212ms step_avg:144.23ms step:165/1480 train_time:22359ms step_avg:144.25ms step:166/1480 train_time:22505ms step_avg:144.26ms step:167/1480 train_time:22653ms step_avg:144.29ms step:168/1480 train_time:22800ms step_avg:144.30ms step:169/1480 train_time:22945ms step_avg:144.31ms step:170/1480 train_time:23093ms step_avg:144.33ms step:171/1480 train_time:23240ms step_avg:144.35ms step:172/1480 train_time:23386ms step_avg:144.36ms step:173/1480 train_time:23534ms step_avg:144.38ms step:174/1480 train_time:23681ms step_avg:144.39ms step:175/1480 train_time:23828ms step_avg:144.41ms step:176/1480 train_time:23975ms step_avg:144.43ms step:177/1480 train_time:24121ms step_avg:144.44ms step:178/1480 train_time:24269ms step_avg:144.46ms step:179/1480 train_time:24417ms step_avg:144.48ms step:180/1480 train_time:24563ms step_avg:144.49ms step:181/1480 train_time:24712ms step_avg:144.51ms step:182/1480 train_time:24859ms step_avg:144.53ms step:183/1480 train_time:25006ms step_avg:144.55ms step:184/1480 train_time:25154ms step_avg:144.56ms step:185/1480 train_time:25300ms step_avg:144.57ms step:186/1480 train_time:25446ms 
step_avg:144.58ms step:187/1480 train_time:25594ms step_avg:144.60ms step:188/1480 train_time:25741ms step_avg:144.61ms step:189/1480 train_time:25886ms step_avg:144.61ms step:190/1480 train_time:26034ms step_avg:144.63ms step:191/1480 train_time:26181ms step_avg:144.64ms step:192/1480 train_time:26327ms step_avg:144.65ms step:193/1480 train_time:26475ms step_avg:144.67ms step:194/1480 train_time:26622ms step_avg:144.68ms step:195/1480 train_time:26771ms step_avg:144.71ms step:196/1480 train_time:26918ms step_avg:144.72ms step:197/1480 train_time:27064ms step_avg:144.73ms step:198/1480 train_time:27213ms step_avg:144.75ms step:199/1480 train_time:27360ms step_avg:144.76ms step:200/1480 train_time:27508ms step_avg:144.78ms step:201/1480 train_time:27656ms step_avg:144.80ms step:202/1480 train_time:27802ms step_avg:144.80ms step:203/1480 train_time:27951ms step_avg:144.82ms step:204/1480 train_time:28098ms step_avg:144.83ms step:205/1480 train_time:28244ms step_avg:144.84ms step:206/1480 train_time:28391ms step_avg:144.85ms step:207/1480 train_time:28538ms step_avg:144.86ms step:208/1480 train_time:28684ms step_avg:144.87ms step:209/1480 train_time:28833ms step_avg:144.89ms step:210/1480 train_time:28981ms step_avg:144.90ms step:211/1480 train_time:29129ms step_avg:144.92ms step:212/1480 train_time:29277ms step_avg:144.94ms step:213/1480 train_time:29424ms step_avg:144.94ms step:214/1480 train_time:29571ms step_avg:144.96ms step:215/1480 train_time:29719ms step_avg:144.97ms step:216/1480 train_time:29865ms step_avg:144.97ms step:217/1480 train_time:30012ms step_avg:144.99ms step:218/1480 train_time:30160ms step_avg:145.00ms step:219/1480 train_time:30305ms step_avg:145.00ms step:220/1480 train_time:30453ms step_avg:145.01ms step:221/1480 train_time:30600ms step_avg:145.03ms step:222/1480 train_time:30751ms step_avg:145.05ms step:223/1480 train_time:30901ms step_avg:145.08ms step:224/1480 train_time:31052ms step_avg:145.10ms step:225/1480 train_time:31203ms 
step_avg:145.13ms step:226/1480 train_time:31355ms step_avg:145.16ms step:227/1480 train_time:31508ms step_avg:145.20ms step:228/1480 train_time:31656ms step_avg:145.21ms step:229/1480 train_time:31805ms step_avg:145.23ms step:230/1480 train_time:31955ms step_avg:145.25ms step:231/1480 train_time:32104ms step_avg:145.27ms step:232/1480 train_time:32255ms step_avg:145.29ms step:233/1480 train_time:32405ms step_avg:145.31ms step:234/1480 train_time:32557ms step_avg:145.34ms step:235/1480 train_time:32708ms step_avg:145.37ms step:236/1480 train_time:32859ms step_avg:145.39ms step:237/1480 train_time:33008ms step_avg:145.41ms step:238/1480 train_time:33159ms step_avg:145.43ms step:239/1480 train_time:33308ms step_avg:145.45ms step:240/1480 train_time:33460ms step_avg:145.48ms step:241/1480 train_time:33611ms step_avg:145.50ms step:242/1480 train_time:33761ms step_avg:145.52ms step:243/1480 train_time:33912ms step_avg:145.55ms step:244/1480 train_time:34063ms step_avg:145.57ms step:245/1480 train_time:34212ms step_avg:145.58ms step:246/1480 train_time:34362ms step_avg:145.60ms step:247/1480 train_time:34512ms step_avg:145.62ms step:248/1480 train_time:34662ms step_avg:145.64ms step:249/1480 train_time:34814ms step_avg:145.66ms step:250/1480 train_time:34965ms step_avg:145.69ms step:250/1480 val_loss:3.9910 train_time:35023ms step_avg:145.93ms step:251/1480 train_time:35120ms step_avg:145.73ms step:252/1480 train_time:35275ms step_avg:145.76ms step:253/1480 train_time:35425ms step_avg:145.78ms step:254/1480 train_time:35574ms step_avg:145.79ms step:255/1480 train_time:35723ms step_avg:145.81ms step:256/1480 train_time:35873ms step_avg:145.82ms step:257/1480 train_time:36023ms step_avg:145.84ms step:258/1480 train_time:36176ms step_avg:145.87ms step:259/1480 train_time:36328ms step_avg:145.90ms step:260/1480 train_time:36478ms step_avg:145.91ms step:261/1480 train_time:36628ms step_avg:145.93ms step:262/1480 train_time:36778ms step_avg:145.94ms step:263/1480 
train_time:36928ms step_avg:145.96ms step:264/1480 train_time:37080ms step_avg:145.98ms step:265/1480 train_time:37230ms step_avg:146.00ms step:266/1480 train_time:37381ms step_avg:146.02ms step:267/1480 train_time:37533ms step_avg:146.04ms step:268/1480 train_time:37684ms step_avg:146.06ms step:269/1480 train_time:37834ms step_avg:146.08ms step:270/1480 train_time:37984ms step_avg:146.09ms step:271/1480 train_time:38133ms step_avg:146.10ms step:272/1480 train_time:38284ms step_avg:146.12ms step:273/1480 train_time:38432ms step_avg:146.13ms step:274/1480 train_time:38584ms step_avg:146.15ms step:275/1480 train_time:38733ms step_avg:146.16ms step:276/1480 train_time:38884ms step_avg:146.18ms step:277/1480 train_time:39033ms step_avg:146.19ms step:278/1480 train_time:39185ms step_avg:146.21ms step:279/1480 train_time:39335ms step_avg:146.23ms step:280/1480 train_time:39486ms step_avg:146.24ms step:281/1480 train_time:39636ms step_avg:146.26ms step:282/1480 train_time:39787ms step_avg:146.27ms step:283/1480 train_time:39936ms step_avg:146.29ms step:284/1480 train_time:40087ms step_avg:146.30ms step:285/1480 train_time:40237ms step_avg:146.32ms step:286/1480 train_time:40388ms step_avg:146.33ms step:287/1480 train_time:40539ms step_avg:146.35ms step:288/1480 train_time:40689ms step_avg:146.36ms step:289/1480 train_time:40840ms step_avg:146.38ms step:290/1480 train_time:40991ms step_avg:146.40ms step:291/1480 train_time:41143ms step_avg:146.42ms step:292/1480 train_time:41293ms step_avg:146.43ms step:293/1480 train_time:41445ms step_avg:146.45ms step:294/1480 train_time:41595ms step_avg:146.46ms step:295/1480 train_time:41746ms step_avg:146.48ms step:296/1480 train_time:41897ms step_avg:146.49ms step:297/1480 train_time:42047ms step_avg:146.51ms step:298/1480 train_time:42197ms step_avg:146.52ms step:299/1480 train_time:42348ms step_avg:146.53ms step:300/1480 train_time:42499ms step_avg:146.55ms step:301/1480 train_time:42649ms step_avg:146.56ms step:302/1480 
train_time:42800ms step_avg:146.58ms step:303/1480 train_time:42952ms step_avg:146.59ms step:304/1480 train_time:43103ms step_avg:146.61ms step:305/1480 train_time:43254ms step_avg:146.62ms step:306/1480 train_time:43405ms step_avg:146.64ms step:307/1480 train_time:43556ms step_avg:146.65ms step:308/1480 train_time:43708ms step_avg:146.67ms step:309/1480 train_time:43859ms step_avg:146.69ms step:310/1480 train_time:44009ms step_avg:146.70ms step:311/1480 train_time:44160ms step_avg:146.71ms step:312/1480 train_time:44310ms step_avg:146.72ms step:313/1480 train_time:44462ms step_avg:146.74ms step:314/1480 train_time:44613ms step_avg:146.75ms step:315/1480 train_time:44764ms step_avg:146.77ms step:316/1480 train_time:44915ms step_avg:146.78ms step:317/1480 train_time:45066ms step_avg:146.79ms step:318/1480 train_time:45216ms step_avg:146.81ms step:319/1480 train_time:45370ms step_avg:146.83ms step:320/1480 train_time:45516ms step_avg:146.83ms step:321/1480 train_time:45667ms step_avg:146.84ms step:322/1480 train_time:45817ms step_avg:146.85ms step:323/1480 train_time:45967ms step_avg:146.86ms step:324/1480 train_time:46118ms step_avg:146.87ms step:325/1480 train_time:46268ms step_avg:146.88ms step:326/1480 train_time:46418ms step_avg:146.89ms step:327/1480 train_time:46568ms step_avg:146.90ms step:328/1480 train_time:46718ms step_avg:146.91ms step:329/1480 train_time:46868ms step_avg:146.92ms step:330/1480 train_time:47022ms step_avg:146.94ms step:331/1480 train_time:47176ms step_avg:146.97ms step:332/1480 train_time:47329ms step_avg:146.98ms step:333/1480 train_time:47483ms step_avg:147.01ms step:334/1480 train_time:47638ms step_avg:147.03ms step:335/1480 train_time:47792ms step_avg:147.05ms step:336/1480 train_time:47946ms step_avg:147.07ms step:337/1480 train_time:48099ms step_avg:147.09ms step:338/1480 train_time:48255ms step_avg:147.12ms step:339/1480 train_time:48408ms step_avg:147.14ms step:340/1480 train_time:48561ms step_avg:147.16ms step:341/1480 
train_time:48714ms step_avg:147.17ms step:342/1480 train_time:48870ms step_avg:147.20ms step:343/1480 train_time:49026ms step_avg:147.23ms step:344/1480 train_time:49181ms step_avg:147.25ms step:345/1480 train_time:49336ms step_avg:147.27ms step:346/1480 train_time:49489ms step_avg:147.29ms step:347/1480 train_time:49643ms step_avg:147.31ms step:348/1480 train_time:49797ms step_avg:147.33ms step:349/1480 train_time:49950ms step_avg:147.35ms step:350/1480 train_time:50104ms step_avg:147.37ms step:351/1480 train_time:50258ms step_avg:147.38ms step:352/1480 train_time:50411ms step_avg:147.40ms step:353/1480 train_time:50568ms step_avg:147.43ms step:354/1480 train_time:50721ms step_avg:147.45ms step:355/1480 train_time:50875ms step_avg:147.47ms step:356/1480 train_time:51029ms step_avg:147.48ms step:357/1480 train_time:51184ms step_avg:147.50ms step:358/1480 train_time:51338ms step_avg:147.52ms step:359/1480 train_time:51494ms step_avg:147.55ms step:360/1480 train_time:51648ms step_avg:147.57ms step:361/1480 train_time:51802ms step_avg:147.59ms step:362/1480 train_time:51957ms step_avg:147.61ms step:363/1480 train_time:52111ms step_avg:147.62ms step:364/1480 train_time:52265ms step_avg:147.64ms step:365/1480 train_time:52419ms step_avg:147.66ms step:366/1480 train_time:52574ms step_avg:147.68ms step:367/1480 train_time:52727ms step_avg:147.70ms step:368/1480 train_time:52882ms step_avg:147.72ms step:369/1480 train_time:53035ms step_avg:147.73ms step:370/1480 train_time:53188ms step_avg:147.74ms step:371/1480 train_time:53342ms step_avg:147.76ms step:372/1480 train_time:53496ms step_avg:147.78ms step:373/1480 train_time:53650ms step_avg:147.80ms step:374/1480 train_time:53802ms step_avg:147.81ms step:375/1480 train_time:53955ms step_avg:147.82ms step:375/1480 val_loss:3.8068 train_time:54015ms step_avg:147.99ms step:376/1480 train_time:54115ms step_avg:147.85ms step:377/1480 train_time:54270ms step_avg:147.88ms step:378/1480 train_time:54423ms step_avg:147.89ms 
step:379/1480 train_time:54576ms step_avg:147.90ms step:380/1480 train_time:54728ms step_avg:147.91ms step:381/1480 train_time:54881ms step_avg:147.93ms step:382/1480 train_time:55034ms step_avg:147.94ms step:383/1480 train_time:55189ms step_avg:147.96ms step:384/1480 train_time:55343ms step_avg:147.98ms step:385/1480 train_time:55497ms step_avg:147.99ms step:386/1480 train_time:55648ms step_avg:148.00ms step:387/1480 train_time:55801ms step_avg:148.01ms step:388/1480 train_time:55954ms step_avg:148.03ms step:389/1480 train_time:56107ms step_avg:148.04ms step:390/1480 train_time:56263ms step_avg:148.06ms step:391/1480 train_time:56418ms step_avg:148.08ms step:392/1480 train_time:56572ms step_avg:148.09ms step:393/1480 train_time:56725ms step_avg:148.11ms step:394/1480 train_time:56879ms step_avg:148.12ms step:395/1480 train_time:57031ms step_avg:148.13ms step:396/1480 train_time:57186ms step_avg:148.15ms step:397/1480 train_time:57341ms step_avg:148.17ms step:398/1480 train_time:57497ms step_avg:148.19ms step:399/1480 train_time:57649ms step_avg:148.20ms step:400/1480 train_time:57803ms step_avg:148.21ms step:401/1480 train_time:57956ms step_avg:148.23ms step:402/1480 train_time:58109ms step_avg:148.24ms step:403/1480 train_time:58264ms step_avg:148.25ms step:404/1480 train_time:58418ms step_avg:148.27ms step:405/1480 train_time:58572ms step_avg:148.28ms step:406/1480 train_time:58725ms step_avg:148.30ms step:407/1480 train_time:58879ms step_avg:148.31ms step:408/1480 train_time:59033ms step_avg:148.32ms step:409/1480 train_time:59187ms step_avg:148.34ms step:410/1480 train_time:59341ms step_avg:148.35ms step:411/1480 train_time:59495ms step_avg:148.37ms step:412/1480 train_time:59649ms step_avg:148.38ms step:413/1480 train_time:59803ms step_avg:148.39ms step:414/1480 train_time:59957ms step_avg:148.41ms step:415/1480 train_time:60111ms step_avg:148.42ms step:416/1480 train_time:60264ms step_avg:148.43ms step:417/1480 train_time:60418ms step_avg:148.45ms 
step:418/1480 train_time:60573ms step_avg:148.46ms step:419/1480 train_time:60726ms step_avg:148.47ms step:420/1480 train_time:60880ms step_avg:148.49ms step:421/1480 train_time:61033ms step_avg:148.50ms step:422/1480 train_time:61187ms step_avg:148.51ms step:423/1480 train_time:61341ms step_avg:148.53ms step:424/1480 train_time:61494ms step_avg:148.54ms step:425/1480 train_time:61649ms step_avg:148.55ms step:426/1480 train_time:61804ms step_avg:148.57ms step:427/1480 train_time:61958ms step_avg:148.58ms step:428/1480 train_time:62111ms step_avg:148.59ms step:429/1480 train_time:62265ms step_avg:148.60ms step:430/1480 train_time:62418ms step_avg:148.61ms step:431/1480 train_time:62572ms step_avg:148.63ms step:432/1480 train_time:62725ms step_avg:148.64ms step:433/1480 train_time:62879ms step_avg:148.65ms step:434/1480 train_time:63032ms step_avg:148.66ms step:435/1480 train_time:63187ms step_avg:148.67ms step:436/1480 train_time:63341ms step_avg:148.69ms step:437/1480 train_time:63494ms step_avg:148.70ms step:438/1480 train_time:63648ms step_avg:148.71ms step:439/1480 train_time:63802ms step_avg:148.72ms step:440/1480 train_time:63957ms step_avg:148.74ms step:441/1480 train_time:64113ms step_avg:148.75ms step:442/1480 train_time:64270ms step_avg:148.77ms step:443/1480 train_time:64426ms step_avg:148.79ms step:444/1480 train_time:64583ms step_avg:148.81ms step:445/1480 train_time:64738ms step_avg:148.82ms step:446/1480 train_time:64894ms step_avg:148.84ms step:447/1480 train_time:65050ms step_avg:148.86ms step:448/1480 train_time:65207ms step_avg:148.87ms step:449/1480 train_time:65365ms step_avg:148.90ms step:450/1480 train_time:65524ms step_avg:148.92ms step:451/1480 train_time:65683ms step_avg:148.94ms step:452/1480 train_time:65841ms step_avg:148.96ms step:453/1480 train_time:65996ms step_avg:148.98ms step:454/1480 train_time:66151ms step_avg:148.99ms step:455/1480 train_time:66307ms step_avg:149.00ms step:456/1480 train_time:66464ms step_avg:149.02ms 
step:457/1480 train_time:66621ms step_avg:149.04ms step:458/1480 train_time:66778ms step_avg:149.06ms step:459/1480 train_time:66936ms step_avg:149.08ms step:460/1480 train_time:67092ms step_avg:149.09ms step:461/1480 train_time:67250ms step_avg:149.11ms step:462/1480 train_time:67407ms step_avg:149.13ms step:463/1480 train_time:67566ms step_avg:149.15ms step:464/1480 train_time:67722ms step_avg:149.17ms step:465/1480 train_time:67879ms step_avg:149.19ms step:466/1480 train_time:68038ms step_avg:149.21ms step:467/1480 train_time:68195ms step_avg:149.22ms step:468/1480 train_time:68350ms step_avg:149.24ms step:469/1480 train_time:68506ms step_avg:149.25ms step:470/1480 train_time:68663ms step_avg:149.27ms step:471/1480 train_time:68820ms step_avg:149.28ms step:472/1480 train_time:68977ms step_avg:149.30ms step:473/1480 train_time:69134ms step_avg:149.32ms step:474/1480 train_time:69290ms step_avg:149.33ms step:475/1480 train_time:69447ms step_avg:149.35ms step:476/1480 train_time:69603ms step_avg:149.36ms step:477/1480 train_time:69761ms step_avg:149.38ms step:478/1480 train_time:69918ms step_avg:149.40ms step:479/1480 train_time:70075ms step_avg:149.41ms step:480/1480 train_time:70232ms step_avg:149.43ms step:481/1480 train_time:70388ms step_avg:149.44ms step:482/1480 train_time:70544ms step_avg:149.46ms step:483/1480 train_time:70701ms step_avg:149.47ms step:484/1480 train_time:70859ms step_avg:149.49ms step:485/1480 train_time:71016ms step_avg:149.51ms step:486/1480 train_time:71172ms step_avg:149.52ms step:487/1480 train_time:71330ms step_avg:149.54ms step:488/1480 train_time:71488ms step_avg:149.56ms step:489/1480 train_time:71645ms step_avg:149.57ms step:490/1480 train_time:71802ms step_avg:149.59ms step:491/1480 train_time:71959ms step_avg:149.60ms step:492/1480 train_time:72115ms step_avg:149.62ms step:493/1480 train_time:72272ms step_avg:149.63ms step:494/1480 train_time:72428ms step_avg:149.64ms step:495/1480 train_time:72585ms step_avg:149.66ms 
step:496/1480 train_time:72743ms step_avg:149.68ms step:497/1480 train_time:72902ms step_avg:149.70ms step:498/1480 train_time:73061ms step_avg:149.71ms step:499/1480 train_time:73219ms step_avg:149.73ms step:500/1480 train_time:73377ms step_avg:149.75ms step:500/1480 val_loss:3.6833 train_time:73439ms step_avg:149.88ms step:501/1480 train_time:73536ms step_avg:149.77ms step:502/1480 train_time:73694ms step_avg:149.79ms step:503/1480 train_time:73851ms step_avg:149.80ms step:504/1480 train_time:74007ms step_avg:149.81ms step:505/1480 train_time:74163ms step_avg:149.82ms step:506/1480 train_time:74321ms step_avg:149.84ms step:507/1480 train_time:74478ms step_avg:149.85ms step:508/1480 train_time:74636ms step_avg:149.87ms step:509/1480 train_time:74792ms step_avg:149.88ms step:510/1480 train_time:74949ms step_avg:149.90ms step:511/1480 train_time:75106ms step_avg:149.91ms step:512/1480 train_time:75265ms step_avg:149.93ms step:513/1480 train_time:75421ms step_avg:149.94ms step:514/1480 train_time:75577ms step_avg:149.96ms step:515/1480 train_time:75734ms step_avg:149.97ms step:516/1480 train_time:75892ms step_avg:149.98ms step:517/1480 train_time:76050ms step_avg:150.00ms step:518/1480 train_time:76207ms step_avg:150.01ms step:519/1480 train_time:76365ms step_avg:150.03ms step:520/1480 train_time:76523ms step_avg:150.04ms step:521/1480 train_time:76679ms step_avg:150.06ms step:522/1480 train_time:76835ms step_avg:150.07ms step:523/1480 train_time:76991ms step_avg:150.08ms step:524/1480 train_time:77147ms step_avg:150.09ms step:525/1480 train_time:77304ms step_avg:150.11ms step:526/1480 train_time:77464ms step_avg:150.12ms step:527/1480 train_time:77622ms step_avg:150.14ms step:528/1480 train_time:77779ms step_avg:150.15ms step:529/1480 train_time:77935ms step_avg:150.16ms step:530/1480 train_time:78091ms step_avg:150.17ms step:531/1480 train_time:78247ms step_avg:150.19ms step:532/1480 train_time:78403ms step_avg:150.20ms step:533/1480 train_time:78561ms 
step_avg:150.21ms step:534/1480 train_time:78716ms step_avg:150.22ms step:535/1480 train_time:78873ms step_avg:150.23ms step:536/1480 train_time:79031ms step_avg:150.25ms step:537/1480 train_time:79187ms step_avg:150.26ms step:538/1480 train_time:79346ms step_avg:150.28ms step:539/1480 train_time:79506ms step_avg:150.29ms step:540/1480 train_time:79664ms step_avg:150.31ms step:541/1480 train_time:79820ms step_avg:150.32ms step:542/1480 train_time:79976ms step_avg:150.33ms step:543/1480 train_time:80132ms step_avg:150.34ms step:544/1480 train_time:80287ms step_avg:150.35ms step:545/1480 train_time:80445ms step_avg:150.37ms step:546/1480 train_time:80602ms step_avg:150.38ms step:547/1480 train_time:80759ms step_avg:150.39ms step:548/1480 train_time:80916ms step_avg:150.40ms step:549/1480 train_time:81073ms step_avg:150.41ms step:550/1480 train_time:81231ms step_avg:150.43ms step:551/1480 train_time:81388ms step_avg:150.44ms step:552/1480 train_time:81549ms step_avg:150.46ms step:553/1480 train_time:81708ms step_avg:150.48ms step:554/1480 train_time:81869ms step_avg:150.49ms step:555/1480 train_time:82029ms step_avg:150.51ms step:556/1480 train_time:82188ms step_avg:150.53ms step:557/1480 train_time:82349ms step_avg:150.55ms step:558/1480 train_time:82508ms step_avg:150.56ms step:559/1480 train_time:82667ms step_avg:150.58ms step:560/1480 train_time:82825ms step_avg:150.59ms step:561/1480 train_time:82984ms step_avg:150.61ms step:562/1480 train_time:83144ms step_avg:150.62ms step:563/1480 train_time:83304ms step_avg:150.64ms step:564/1480 train_time:83464ms step_avg:150.66ms step:565/1480 train_time:83623ms step_avg:150.67ms step:566/1480 train_time:83784ms step_avg:150.69ms step:567/1480 train_time:83942ms step_avg:150.70ms step:568/1480 train_time:84102ms step_avg:150.72ms step:569/1480 train_time:84261ms step_avg:150.73ms step:570/1480 train_time:84420ms step_avg:150.75ms step:571/1480 train_time:84579ms step_avg:150.77ms step:572/1480 train_time:84739ms 
step_avg:150.78ms step:573/1480 train_time:84899ms step_avg:150.80ms step:574/1480 train_time:85061ms step_avg:150.82ms step:575/1480 train_time:85220ms step_avg:150.83ms step:576/1480 train_time:85378ms step_avg:150.84ms step:577/1480 train_time:85538ms step_avg:150.86ms step:578/1480 train_time:85697ms step_avg:150.87ms step:579/1480 train_time:85856ms step_avg:150.89ms step:580/1480 train_time:86014ms step_avg:150.90ms step:581/1480 train_time:86174ms step_avg:150.92ms step:582/1480 train_time:86333ms step_avg:150.93ms step:583/1480 train_time:86491ms step_avg:150.94ms step:584/1480 train_time:86651ms step_avg:150.96ms step:585/1480 train_time:86809ms step_avg:150.97ms step:586/1480 train_time:86969ms step_avg:150.99ms step:587/1480 train_time:87128ms step_avg:151.00ms step:588/1480 train_time:87287ms step_avg:151.02ms step:589/1480 train_time:87448ms step_avg:151.03ms step:590/1480 train_time:87608ms step_avg:151.05ms step:591/1480 train_time:87769ms step_avg:151.06ms step:592/1480 train_time:87929ms step_avg:151.08ms step:593/1480 train_time:88088ms step_avg:151.09ms step:594/1480 train_time:88249ms step_avg:151.11ms step:595/1480 train_time:88410ms step_avg:151.13ms step:596/1480 train_time:88571ms step_avg:151.15ms step:597/1480 train_time:88730ms step_avg:151.16ms step:598/1480 train_time:88887ms step_avg:151.17ms step:599/1480 train_time:89046ms step_avg:151.18ms step:600/1480 train_time:89207ms step_avg:151.20ms step:601/1480 train_time:89367ms step_avg:151.21ms step:602/1480 train_time:89527ms step_avg:151.23ms step:603/1480 train_time:89686ms step_avg:151.24ms step:604/1480 train_time:89847ms step_avg:151.26ms step:605/1480 train_time:90006ms step_avg:151.27ms step:606/1480 train_time:90169ms step_avg:151.29ms step:607/1480 train_time:90331ms step_avg:151.31ms step:608/1480 train_time:90490ms step_avg:151.32ms step:609/1480 train_time:90649ms step_avg:151.33ms step:610/1480 train_time:90807ms step_avg:151.35ms step:611/1480 train_time:90969ms 
step_avg:151.36ms step:612/1480 train_time:91128ms step_avg:151.38ms step:613/1480 train_time:91289ms step_avg:151.39ms step:614/1480 train_time:91448ms step_avg:151.40ms step:615/1480 train_time:91607ms step_avg:151.42ms step:616/1480 train_time:91766ms step_avg:151.43ms step:617/1480 train_time:91926ms step_avg:151.44ms step:618/1480 train_time:92087ms step_avg:151.46ms step:619/1480 train_time:92247ms step_avg:151.47ms step:620/1480 train_time:92406ms step_avg:151.49ms step:621/1480 train_time:92567ms step_avg:151.50ms step:622/1480 train_time:92727ms step_avg:151.51ms step:623/1480 train_time:92887ms step_avg:151.53ms step:624/1480 train_time:93047ms step_avg:151.54ms step:625/1480 train_time:93207ms step_avg:151.56ms step:625/1480 val_loss:3.6037 train_time:93271ms step_avg:151.66ms step:626/1480 train_time:93370ms step_avg:151.57ms step:627/1480 train_time:93529ms step_avg:151.59ms step:628/1480 train_time:93685ms step_avg:151.59ms step:629/1480 train_time:93843ms step_avg:151.60ms step:630/1480 train_time:94001ms step_avg:151.61ms step:631/1480 train_time:94158ms step_avg:151.62ms step:632/1480 train_time:94317ms step_avg:151.64ms step:633/1480 train_time:94477ms step_avg:151.65ms step:634/1480 train_time:94638ms step_avg:151.66ms step:635/1480 train_time:94798ms step_avg:151.68ms step:636/1480 train_time:94957ms step_avg:151.69ms step:637/1480 train_time:95118ms step_avg:151.70ms step:638/1480 train_time:95277ms step_avg:151.72ms step:639/1480 train_time:95437ms step_avg:151.73ms step:640/1480 train_time:95597ms step_avg:151.74ms step:641/1480 train_time:95756ms step_avg:151.75ms step:642/1480 train_time:95916ms step_avg:151.77ms step:643/1480 train_time:96075ms step_avg:151.78ms step:644/1480 train_time:96236ms step_avg:151.79ms step:645/1480 train_time:96395ms step_avg:151.80ms step:646/1480 train_time:96555ms step_avg:151.82ms step:647/1480 train_time:96715ms step_avg:151.83ms step:648/1480 train_time:96875ms step_avg:151.84ms step:649/1480 
train_time:97036ms step_avg:151.86ms step:650/1480 train_time:97197ms step_avg:151.87ms step:651/1480 train_time:97357ms step_avg:151.88ms step:652/1480 train_time:97518ms step_avg:151.90ms step:653/1480 train_time:97676ms step_avg:151.91ms step:654/1480 train_time:97837ms step_avg:151.92ms step:655/1480 train_time:97996ms step_avg:151.93ms step:656/1480 train_time:98157ms step_avg:151.95ms step:657/1480 train_time:98318ms step_avg:151.96ms step:658/1480 train_time:98478ms step_avg:151.97ms step:659/1480 train_time:98640ms step_avg:151.99ms step:660/1480 train_time:98801ms step_avg:152.00ms step:661/1480 train_time:98963ms step_avg:152.02ms step:662/1480 train_time:99122ms step_avg:152.03ms step:663/1480 train_time:99281ms step_avg:152.04ms step:664/1480 train_time:99443ms step_avg:152.05ms step:665/1480 train_time:99605ms step_avg:152.07ms step:666/1480 train_time:99764ms step_avg:152.08ms step:667/1480 train_time:99924ms step_avg:152.09ms step:668/1480 train_time:100086ms step_avg:152.11ms step:669/1480 train_time:100249ms step_avg:152.12ms step:670/1480 train_time:100408ms step_avg:152.13ms step:671/1480 train_time:100569ms step_avg:152.15ms step:672/1480 train_time:100733ms step_avg:152.16ms step:673/1480 train_time:100897ms step_avg:152.18ms step:674/1480 train_time:101061ms step_avg:152.20ms step:675/1480 train_time:101221ms step_avg:152.21ms step:676/1480 train_time:101382ms step_avg:152.23ms step:677/1480 train_time:101543ms step_avg:152.24ms step:678/1480 train_time:101705ms step_avg:152.25ms step:679/1480 train_time:101865ms step_avg:152.27ms step:680/1480 train_time:102027ms step_avg:152.28ms step:681/1480 train_time:102187ms step_avg:152.29ms step:682/1480 train_time:102350ms step_avg:152.31ms step:683/1480 train_time:102512ms step_avg:152.32ms step:684/1480 train_time:102674ms step_avg:152.34ms step:685/1480 train_time:102839ms step_avg:152.35ms step:686/1480 train_time:103000ms step_avg:152.37ms step:687/1480 train_time:103160ms step_avg:152.38ms 
step:688/1480 train_time:103324ms step_avg:152.40ms step:689/1480 train_time:103487ms step_avg:152.41ms step:690/1480 train_time:103652ms step_avg:152.43ms step:691/1480 train_time:103814ms step_avg:152.44ms step:692/1480 train_time:103977ms step_avg:152.46ms step:693/1480 train_time:104140ms step_avg:152.47ms step:694/1480 train_time:104301ms step_avg:152.49ms step:695/1480 train_time:104461ms step_avg:152.50ms step:696/1480 train_time:104621ms step_avg:152.51ms step:697/1480 train_time:104784ms step_avg:152.52ms step:698/1480 train_time:104945ms step_avg:152.54ms step:699/1480 train_time:105106ms step_avg:152.55ms step:700/1480 train_time:105269ms step_avg:152.56ms step:701/1480 train_time:105431ms step_avg:152.58ms step:702/1480 train_time:105594ms step_avg:152.59ms step:703/1480 train_time:105754ms step_avg:152.60ms step:704/1480 train_time:105916ms step_avg:152.62ms step:705/1480 train_time:106079ms step_avg:152.63ms step:706/1480 train_time:106243ms step_avg:152.65ms step:707/1480 train_time:106404ms step_avg:152.66ms step:708/1480 train_time:106563ms step_avg:152.67ms step:709/1480 train_time:106725ms step_avg:152.68ms step:710/1480 train_time:106885ms step_avg:152.69ms step:711/1480 train_time:107049ms step_avg:152.71ms step:712/1480 train_time:107217ms step_avg:152.73ms step:713/1480 train_time:107379ms step_avg:152.74ms step:714/1480 train_time:107541ms step_avg:152.76ms step:715/1480 train_time:107701ms step_avg:152.77ms step:716/1480 train_time:107861ms step_avg:152.78ms step:717/1480 train_time:108022ms step_avg:152.79ms step:718/1480 train_time:108180ms step_avg:152.80ms step:719/1480 train_time:108340ms step_avg:152.81ms step:720/1480 train_time:108503ms step_avg:152.82ms step:721/1480 train_time:108664ms step_avg:152.83ms step:722/1480 train_time:108826ms step_avg:152.85ms step:723/1480 train_time:108987ms step_avg:152.86ms step:724/1480 train_time:109150ms step_avg:152.87ms step:725/1480 train_time:109315ms step_avg:152.89ms step:726/1480 
train_time:109478ms step_avg:152.90ms step:727/1480 train_time:109641ms step_avg:152.92ms step:728/1480 train_time:109801ms step_avg:152.93ms step:729/1480 train_time:109962ms step_avg:152.94ms step:730/1480 train_time:110125ms step_avg:152.95ms step:731/1480 train_time:110286ms step_avg:152.96ms step:732/1480 train_time:110445ms step_avg:152.97ms step:733/1480 train_time:110608ms step_avg:152.98ms step:734/1480 train_time:110770ms step_avg:153.00ms step:735/1480 train_time:110932ms step_avg:153.01ms step:736/1480 train_time:111096ms step_avg:153.02ms step:737/1480 train_time:111258ms step_avg:153.04ms step:738/1480 train_time:111418ms step_avg:153.05ms step:739/1480 train_time:111579ms step_avg:153.06ms step:740/1480 train_time:111744ms step_avg:153.07ms step:741/1480 train_time:111906ms step_avg:153.09ms step:742/1480 train_time:112066ms step_avg:153.10ms step:743/1480 train_time:112227ms step_avg:153.11ms step:744/1480 train_time:112390ms step_avg:153.12ms step:745/1480 train_time:112557ms step_avg:153.14ms step:746/1480 train_time:112719ms step_avg:153.15ms step:747/1480 train_time:112878ms step_avg:153.16ms step:748/1480 train_time:113044ms step_avg:153.18ms step:749/1480 train_time:113206ms step_avg:153.19ms step:750/1480 train_time:113365ms step_avg:153.20ms step:750/1480 val_loss:3.5503 train_time:113430ms step_avg:153.28ms step:751/1480 train_time:113531ms step_avg:153.21ms step:752/1480 train_time:113694ms step_avg:153.23ms step:753/1480 train_time:113854ms step_avg:153.24ms step:754/1480 train_time:114015ms step_avg:153.25ms step:755/1480 train_time:114177ms step_avg:153.26ms step:756/1480 train_time:114339ms step_avg:153.27ms step:757/1480 train_time:114506ms step_avg:153.29ms step:758/1480 train_time:114667ms step_avg:153.30ms step:759/1480 train_time:114827ms step_avg:153.31ms step:760/1480 train_time:114988ms step_avg:153.32ms step:761/1480 train_time:115149ms step_avg:153.33ms step:762/1480 train_time:115310ms step_avg:153.34ms step:763/1480 
train_time:115472ms step_avg:153.35ms step:764/1480 train_time:115633ms step_avg:153.36ms step:765/1480 train_time:115794ms step_avg:153.37ms step:766/1480 train_time:115957ms step_avg:153.38ms step:767/1480 train_time:116120ms step_avg:153.40ms step:768/1480 train_time:116283ms step_avg:153.41ms step:769/1480 train_time:116446ms step_avg:153.42ms step:770/1480 train_time:116610ms step_avg:153.43ms step:771/1480 train_time:116773ms step_avg:153.45ms step:772/1480 train_time:116934ms step_avg:153.46ms step:773/1480 train_time:117098ms step_avg:153.47ms step:774/1480 train_time:117262ms step_avg:153.48ms step:775/1480 train_time:117425ms step_avg:153.50ms step:776/1480 train_time:117589ms step_avg:153.51ms step:777/1480 train_time:117754ms step_avg:153.53ms step:778/1480 train_time:117917ms step_avg:153.54ms step:779/1480 train_time:118081ms step_avg:153.55ms step:780/1480 train_time:118244ms step_avg:153.56ms step:781/1480 train_time:118407ms step_avg:153.58ms step:782/1480 train_time:118570ms step_avg:153.59ms step:783/1480 train_time:118731ms step_avg:153.60ms step:784/1480 train_time:118894ms step_avg:153.61ms step:785/1480 train_time:119057ms step_avg:153.62ms step:786/1480 train_time:119221ms step_avg:153.64ms step:787/1480 train_time:119385ms step_avg:153.65ms step:788/1480 train_time:119548ms step_avg:153.66ms step:789/1480 train_time:119709ms step_avg:153.67ms step:790/1480 train_time:119873ms step_avg:153.68ms step:791/1480 train_time:120040ms step_avg:153.70ms step:792/1480 train_time:120206ms step_avg:153.72ms step:793/1480 train_time:120367ms step_avg:153.73ms step:794/1480 train_time:120531ms step_avg:153.74ms step:795/1480 train_time:120695ms step_avg:153.75ms step:796/1480 train_time:120863ms step_avg:153.77ms step:797/1480 train_time:121027ms step_avg:153.78ms step:798/1480 train_time:121192ms step_avg:153.80ms step:799/1480 train_time:121359ms step_avg:153.81ms step:800/1480 train_time:121523ms step_avg:153.83ms step:801/1480 train_time:121686ms 
step_avg:153.84ms step:802/1480 train_time:121852ms step_avg:153.85ms step:803/1480 train_time:122013ms step_avg:153.86ms step:804/1480 train_time:122176ms step_avg:153.87ms step:805/1480 train_time:122342ms step_avg:153.89ms step:806/1480 train_time:122505ms step_avg:153.90ms step:807/1480 train_time:122666ms step_avg:153.91ms step:808/1480 train_time:122829ms step_avg:153.92ms step:809/1480 train_time:122991ms step_avg:153.93ms step:810/1480 train_time:123152ms step_avg:153.94ms step:811/1480 train_time:123315ms step_avg:153.95ms step:812/1480 train_time:123481ms step_avg:153.97ms step:813/1480 train_time:123643ms step_avg:153.98ms step:814/1480 train_time:123806ms step_avg:153.99ms step:815/1480 train_time:123968ms step_avg:154.00ms step:816/1480 train_time:124133ms step_avg:154.01ms step:817/1480 train_time:124297ms step_avg:154.02ms step:818/1480 train_time:124460ms step_avg:154.03ms step:819/1480 train_time:124623ms step_avg:154.05ms step:820/1480 train_time:124787ms step_avg:154.06ms step:821/1480 train_time:124947ms step_avg:154.07ms step:822/1480 train_time:125111ms step_avg:154.08ms step:823/1480 train_time:125273ms step_avg:154.09ms step:824/1480 train_time:125434ms step_avg:154.10ms step:825/1480 train_time:125601ms step_avg:154.11ms step:826/1480 train_time:125767ms step_avg:154.13ms step:827/1480 train_time:125932ms step_avg:154.14ms step:828/1480 train_time:126097ms step_avg:154.15ms step:829/1480 train_time:126262ms step_avg:154.17ms step:830/1480 train_time:126427ms step_avg:154.18ms step:831/1480 train_time:126590ms step_avg:154.19ms step:832/1480 train_time:126752ms step_avg:154.20ms step:833/1480 train_time:126916ms step_avg:154.21ms step:834/1480 train_time:127082ms step_avg:154.23ms step:835/1480 train_time:127244ms step_avg:154.23ms step:836/1480 train_time:127409ms step_avg:154.25ms step:837/1480 train_time:127570ms step_avg:154.26ms step:838/1480 train_time:127732ms step_avg:154.27ms step:839/1480 train_time:127895ms step_avg:154.28ms 
step:840/1480 train_time:128058ms step_avg:154.29ms step:841/1480 train_time:128220ms step_avg:154.30ms step:842/1480 train_time:128385ms step_avg:154.31ms step:843/1480 train_time:128546ms step_avg:154.32ms step:844/1480 train_time:128708ms step_avg:154.33ms step:845/1480 train_time:128870ms step_avg:154.34ms step:846/1480 train_time:129034ms step_avg:154.35ms step:847/1480 train_time:129200ms step_avg:154.36ms step:848/1480 train_time:129362ms step_avg:154.37ms step:849/1480 train_time:129525ms step_avg:154.38ms step:850/1480 train_time:129687ms step_avg:154.39ms step:851/1480 train_time:129851ms step_avg:154.40ms step:852/1480 train_time:130012ms step_avg:154.41ms step:853/1480 train_time:130174ms step_avg:154.42ms step:854/1480 train_time:130339ms step_avg:154.43ms step:855/1480 train_time:130505ms step_avg:154.44ms step:856/1480 train_time:130666ms step_avg:154.45ms step:857/1480 train_time:130830ms step_avg:154.46ms step:858/1480 train_time:130998ms step_avg:154.48ms step:859/1480 train_time:131163ms step_avg:154.49ms step:860/1480 train_time:131326ms step_avg:154.50ms step:861/1480 train_time:131491ms step_avg:154.51ms step:862/1480 train_time:131659ms step_avg:154.53ms step:863/1480 train_time:131827ms step_avg:154.54ms step:864/1480 train_time:131991ms step_avg:154.56ms step:865/1480 train_time:132152ms step_avg:154.56ms step:866/1480 train_time:132320ms step_avg:154.58ms step:867/1480 train_time:132485ms step_avg:154.59ms step:868/1480 train_time:132646ms step_avg:154.60ms step:869/1480 train_time:132808ms step_avg:154.61ms step:870/1480 train_time:132972ms step_avg:154.62ms step:871/1480 train_time:133135ms step_avg:154.63ms step:872/1480 train_time:133300ms step_avg:154.64ms step:873/1480 train_time:133464ms step_avg:154.65ms step:874/1480 train_time:133630ms step_avg:154.66ms step:875/1480 train_time:133793ms step_avg:154.67ms step:875/1480 val_loss:3.5070 train_time:133857ms step_avg:154.75ms step:876/1480 train_time:133957ms step_avg:154.68ms 
step:877/1480 train_time:134121ms step_avg:154.70ms step:878/1480 train_time:134283ms step_avg:154.70ms step:879/1480 train_time:134449ms step_avg:154.72ms step:880/1480 train_time:134612ms step_avg:154.73ms step:881/1480 train_time:134774ms step_avg:154.73ms step:882/1480 train_time:134940ms step_avg:154.75ms step:883/1480 train_time:135107ms step_avg:154.76ms step:884/1480 train_time:135274ms step_avg:154.78ms step:885/1480 train_time:135438ms step_avg:154.79ms step:886/1480 train_time:135606ms step_avg:154.80ms step:887/1480 train_time:135775ms step_avg:154.82ms step:888/1480 train_time:135950ms step_avg:154.84ms step:889/1480 train_time:136119ms step_avg:154.86ms step:890/1480 train_time:136280ms step_avg:154.86ms step:891/1480 train_time:136447ms step_avg:154.88ms step:892/1480 train_time:136613ms step_avg:154.89ms step:893/1480 train_time:136774ms step_avg:154.90ms step:894/1480 train_time:136941ms step_avg:154.91ms step:895/1480 train_time:137109ms step_avg:154.92ms step:896/1480 train_time:137273ms step_avg:154.94ms step:897/1480 train_time:137439ms step_avg:154.95ms step:898/1480 train_time:137607ms step_avg:154.96ms step:899/1480 train_time:137771ms step_avg:154.97ms step:900/1480 train_time:137934ms step_avg:154.98ms step:901/1480 train_time:138099ms step_avg:154.99ms step:902/1480 train_time:138264ms step_avg:155.00ms step:903/1480 train_time:138438ms step_avg:155.03ms step:904/1480 train_time:138605ms step_avg:155.04ms step:905/1480 train_time:138767ms step_avg:155.05ms step:906/1480 train_time:138935ms step_avg:155.06ms step:907/1480 train_time:139101ms step_avg:155.07ms step:908/1480 train_time:139265ms step_avg:155.08ms step:909/1480 train_time:139431ms step_avg:155.10ms step:910/1480 train_time:139600ms step_avg:155.11ms step:911/1480 train_time:139765ms step_avg:155.12ms step:912/1480 train_time:139933ms step_avg:155.14ms step:913/1480 train_time:140099ms step_avg:155.15ms step:914/1480 train_time:140266ms step_avg:155.16ms step:915/1480 
train_time:140435ms step_avg:155.18ms step:916/1480 train_time:140600ms step_avg:155.19ms step:917/1480 train_time:140764ms step_avg:155.20ms step:918/1480 train_time:140934ms step_avg:155.21ms step:919/1480 train_time:141105ms step_avg:155.23ms step:920/1480 train_time:141270ms step_avg:155.24ms step:921/1480 train_time:141435ms step_avg:155.25ms step:922/1480 train_time:141603ms step_avg:155.27ms step:923/1480 train_time:141766ms step_avg:155.27ms step:924/1480 train_time:141932ms step_avg:155.29ms step:925/1480 train_time:142095ms step_avg:155.30ms step:926/1480 train_time:142258ms step_avg:155.30ms step:927/1480 train_time:142421ms step_avg:155.31ms step:928/1480 train_time:142589ms step_avg:155.33ms step:929/1480 train_time:142754ms step_avg:155.34ms step:930/1480 train_time:142919ms step_avg:155.35ms step:931/1480 train_time:143081ms step_avg:155.35ms step:932/1480 train_time:143248ms step_avg:155.37ms step:933/1480 train_time:143415ms step_avg:155.38ms step:934/1480 train_time:143582ms step_avg:155.39ms step:935/1480 train_time:143754ms step_avg:155.41ms step:936/1480 train_time:143920ms step_avg:155.42ms step:937/1480 train_time:144090ms step_avg:155.44ms step:938/1480 train_time:144253ms step_avg:155.44ms step:939/1480 train_time:144421ms step_avg:155.46ms step:940/1480 train_time:144589ms step_avg:155.47ms step:941/1480 train_time:144753ms step_avg:155.48ms step:942/1480 train_time:144917ms step_avg:155.49ms step:943/1480 train_time:145086ms step_avg:155.51ms step:944/1480 train_time:145258ms step_avg:155.52ms step:945/1480 train_time:145423ms step_avg:155.53ms step:946/1480 train_time:145591ms step_avg:155.55ms step:947/1480 train_time:145758ms step_avg:155.56ms step:948/1480 train_time:145925ms step_avg:155.57ms step:949/1480 train_time:146091ms step_avg:155.58ms step:950/1480 train_time:146254ms step_avg:155.59ms step:951/1480 train_time:146421ms step_avg:155.60ms step:952/1480 train_time:146587ms step_avg:155.61ms step:953/1480 train_time:146757ms 
step_avg:155.63ms step:954/1480 train_time:146927ms step_avg:155.64ms step:955/1480 train_time:147091ms step_avg:155.65ms step:956/1480 train_time:147255ms step_avg:155.66ms step:957/1480 train_time:147424ms step_avg:155.67ms step:958/1480 train_time:147593ms step_avg:155.69ms step:959/1480 train_time:147757ms step_avg:155.70ms step:960/1480 train_time:147924ms step_avg:155.71ms step:961/1480 train_time:148090ms step_avg:155.72ms step:962/1480 train_time:148255ms step_avg:155.73ms step:963/1480 train_time:148420ms step_avg:155.74ms step:964/1480 train_time:148588ms step_avg:155.75ms step:965/1480 train_time:148753ms step_avg:155.76ms step:966/1480 train_time:148916ms step_avg:155.77ms step:967/1480 train_time:149080ms step_avg:155.78ms step:968/1480 train_time:149247ms step_avg:155.79ms step:969/1480 train_time:149413ms step_avg:155.80ms step:970/1480 train_time:149576ms step_avg:155.81ms step:971/1480 train_time:149741ms step_avg:155.82ms step:972/1480 train_time:149907ms step_avg:155.83ms step:973/1480 train_time:150072ms step_avg:155.84ms step:974/1480 train_time:150239ms step_avg:155.85ms step:975/1480 train_time:150405ms step_avg:155.86ms step:976/1480 train_time:150571ms step_avg:155.87ms step:977/1480 train_time:150734ms step_avg:155.88ms step:978/1480 train_time:150898ms step_avg:155.89ms step:979/1480 train_time:151063ms step_avg:155.90ms step:980/1480 train_time:151230ms step_avg:155.91ms step:981/1480 train_time:151398ms step_avg:155.92ms step:982/1480 train_time:151561ms step_avg:155.93ms step:983/1480 train_time:151728ms step_avg:155.94ms step:984/1480 train_time:151893ms step_avg:155.95ms step:985/1480 train_time:152058ms step_avg:155.96ms step:986/1480 train_time:152223ms step_avg:155.97ms step:987/1480 train_time:152388ms step_avg:155.98ms step:988/1480 train_time:152555ms step_avg:155.99ms step:989/1480 train_time:152719ms step_avg:155.99ms step:990/1480 train_time:152888ms step_avg:156.01ms step:991/1480 train_time:153056ms step_avg:156.02ms 
step:992/1480 train_time:153232ms step_avg:156.04ms step:993/1480 train_time:153410ms step_avg:156.06ms step:994/1480 train_time:153576ms step_avg:156.07ms step:995/1480 train_time:153740ms step_avg:156.08ms step:996/1480 train_time:153904ms step_avg:156.09ms step:997/1480 train_time:154068ms step_avg:156.10ms step:998/1480 train_time:154232ms step_avg:156.11ms step:999/1480 train_time:154396ms step_avg:156.11ms step:1000/1480 train_time:154568ms step_avg:156.13ms step:1000/1480 val_loss:3.4413 train_time:154637ms step_avg:156.20ms step:1001/1480 train_time:154740ms step_avg:156.15ms step:1002/1480 train_time:154905ms step_avg:156.15ms step:1003/1480 train_time:155077ms step_avg:156.17ms step:1004/1480 train_time:155246ms step_avg:156.18ms step:1005/1480 train_time:155412ms step_avg:156.19ms step:1006/1480 train_time:155581ms step_avg:156.21ms step:1007/1480 train_time:155747ms step_avg:156.22ms step:1008/1480 train_time:155913ms step_avg:156.23ms step:1009/1480 train_time:156087ms step_avg:156.24ms step:1010/1480 train_time:156252ms step_avg:156.25ms step:1011/1480 train_time:156418ms step_avg:156.26ms step:1012/1480 train_time:156584ms step_avg:156.27ms step:1013/1480 train_time:156753ms step_avg:156.28ms step:1014/1480 train_time:156920ms step_avg:156.29ms step:1015/1480 train_time:157089ms step_avg:156.31ms step:1016/1480 train_time:157257ms step_avg:156.32ms step:1017/1480 train_time:157427ms step_avg:156.33ms step:1018/1480 train_time:157595ms step_avg:156.34ms step:1019/1480 train_time:157763ms step_avg:156.36ms step:1020/1480 train_time:157933ms step_avg:156.37ms step:1021/1480 train_time:158099ms step_avg:156.38ms step:1022/1480 train_time:158266ms step_avg:156.39ms step:1023/1480 train_time:158431ms step_avg:156.40ms step:1024/1480 train_time:158600ms step_avg:156.41ms step:1025/1480 train_time:158770ms step_avg:156.42ms step:1026/1480 train_time:158936ms step_avg:156.43ms step:1027/1480 train_time:159101ms step_avg:156.44ms step:1028/1480 
train_time:159274ms step_avg:156.46ms step:1029/1480 train_time:159449ms step_avg:156.48ms step:1030/1480 train_time:159617ms step_avg:156.49ms step:1031/1480 train_time:159782ms step_avg:156.50ms step:1032/1480 train_time:159953ms step_avg:156.51ms step:1033/1480 train_time:160121ms step_avg:156.52ms step:1034/1480 train_time:160289ms step_avg:156.53ms step:1035/1480 train_time:160456ms step_avg:156.54ms step:1036/1480 train_time:160622ms step_avg:156.55ms step:1037/1480 train_time:160788ms step_avg:156.56ms step:1038/1480 train_time:160957ms step_avg:156.57ms step:1039/1480 train_time:161126ms step_avg:156.58ms step:1040/1480 train_time:161292ms step_avg:156.59ms step:1041/1480 train_time:161461ms step_avg:156.61ms step:1042/1480 train_time:161624ms step_avg:156.61ms step:1043/1480 train_time:161789ms step_avg:156.62ms step:1044/1480 train_time:161954ms step_avg:156.63ms step:1045/1480 train_time:162125ms step_avg:156.64ms step:1046/1480 train_time:162292ms step_avg:156.65ms step:1047/1480 train_time:162460ms step_avg:156.66ms step:1048/1480 train_time:162626ms step_avg:156.67ms step:1049/1480 train_time:162791ms step_avg:156.68ms step:1050/1480 train_time:162962ms step_avg:156.69ms step:1051/1480 train_time:163129ms step_avg:156.70ms step:1052/1480 train_time:163299ms step_avg:156.72ms step:1053/1480 train_time:163465ms step_avg:156.73ms step:1054/1480 train_time:163632ms step_avg:156.74ms step:1055/1480 train_time:163798ms step_avg:156.74ms step:1056/1480 train_time:163963ms step_avg:156.75ms step:1057/1480 train_time:164127ms step_avg:156.76ms step:1058/1480 train_time:164297ms step_avg:156.77ms step:1059/1480 train_time:164471ms step_avg:156.79ms step:1060/1480 train_time:164641ms step_avg:156.80ms step:1061/1480 train_time:164804ms step_avg:156.81ms step:1062/1480 train_time:164971ms step_avg:156.82ms step:1063/1480 train_time:165137ms step_avg:156.83ms step:1064/1480 train_time:165302ms step_avg:156.83ms step:1065/1480 train_time:165469ms step_avg:156.84ms 
step:1066/1480 train_time:165638ms step_avg:156.85ms step:1067/1480 train_time:165807ms step_avg:156.87ms step:1068/1480 train_time:165973ms step_avg:156.87ms step:1069/1480 train_time:166144ms step_avg:156.89ms step:1070/1480 train_time:166309ms step_avg:156.90ms step:1071/1480 train_time:166483ms step_avg:156.91ms step:1072/1480 train_time:166649ms step_avg:156.92ms step:1073/1480 train_time:166812ms step_avg:156.93ms step:1074/1480 train_time:166980ms step_avg:156.94ms step:1075/1480 train_time:167149ms step_avg:156.95ms step:1076/1480 train_time:167317ms step_avg:156.96ms step:1077/1480 train_time:167485ms step_avg:156.97ms step:1078/1480 train_time:167660ms step_avg:156.99ms step:1079/1480 train_time:167833ms step_avg:157.00ms step:1080/1480 train_time:168003ms step_avg:157.01ms step:1081/1480 train_time:168169ms step_avg:157.02ms step:1082/1480 train_time:168336ms step_avg:157.03ms step:1083/1480 train_time:168503ms step_avg:157.04ms step:1084/1480 train_time:168668ms step_avg:157.05ms step:1085/1480 train_time:168837ms step_avg:157.06ms step:1086/1480 train_time:169005ms step_avg:157.07ms step:1087/1480 train_time:169172ms step_avg:157.08ms step:1088/1480 train_time:169342ms step_avg:157.09ms step:1089/1480 train_time:169514ms step_avg:157.10ms step:1090/1480 train_time:169686ms step_avg:157.12ms step:1091/1480 train_time:169853ms step_avg:157.13ms step:1092/1480 train_time:170021ms step_avg:157.14ms step:1093/1480 train_time:170188ms step_avg:157.15ms step:1094/1480 train_time:170353ms step_avg:157.15ms step:1095/1480 train_time:170518ms step_avg:157.16ms step:1096/1480 train_time:170686ms step_avg:157.17ms step:1097/1480 train_time:170856ms step_avg:157.18ms step:1098/1480 train_time:171026ms step_avg:157.19ms step:1099/1480 train_time:171197ms step_avg:157.21ms step:1100/1480 train_time:171370ms step_avg:157.22ms step:1101/1480 train_time:171541ms step_avg:157.23ms step:1102/1480 train_time:171712ms step_avg:157.25ms step:1103/1480 train_time:171889ms 
step_avg:157.26ms step:1104/1480 train_time:172057ms step_avg:157.27ms step:1105/1480 train_time:172226ms step_avg:157.28ms step:1106/1480 train_time:172394ms step_avg:157.29ms step:1107/1480 train_time:172564ms step_avg:157.31ms step:1108/1480 train_time:172729ms step_avg:157.31ms step:1109/1480 train_time:172896ms step_avg:157.32ms step:1110/1480 train_time:173063ms step_avg:157.33ms step:1111/1480 train_time:173229ms step_avg:157.34ms step:1112/1480 train_time:173401ms step_avg:157.35ms step:1113/1480 train_time:173582ms step_avg:157.37ms step:1114/1480 train_time:173754ms step_avg:157.39ms step:1115/1480 train_time:173927ms step_avg:157.40ms step:1116/1480 train_time:174093ms step_avg:157.41ms step:1117/1480 train_time:174266ms step_avg:157.42ms step:1118/1480 train_time:174439ms step_avg:157.44ms step:1119/1480 train_time:174603ms step_avg:157.44ms step:1120/1480 train_time:174772ms step_avg:157.45ms step:1121/1480 train_time:174943ms step_avg:157.46ms step:1122/1480 train_time:175109ms step_avg:157.47ms step:1123/1480 train_time:175276ms step_avg:157.48ms step:1124/1480 train_time:175443ms step_avg:157.49ms step:1125/1480 train_time:175609ms step_avg:157.50ms step:1125/1480 val_loss:3.3865 train_time:175677ms step_avg:157.56ms step:1126/1480 train_time:175780ms step_avg:157.51ms step:1127/1480 train_time:175949ms step_avg:157.52ms step:1128/1480 train_time:176120ms step_avg:157.53ms step:1129/1480 train_time:176294ms step_avg:157.55ms step:1130/1480 train_time:176462ms step_avg:157.56ms step:1131/1480 train_time:176639ms step_avg:157.57ms step:1132/1480 train_time:176805ms step_avg:157.58ms step:1133/1480 train_time:176975ms step_avg:157.59ms step:1134/1480 train_time:177146ms step_avg:157.60ms step:1135/1480 train_time:177314ms step_avg:157.61ms step:1136/1480 train_time:177487ms step_avg:157.63ms step:1137/1480 train_time:177655ms step_avg:157.64ms step:1138/1480 train_time:177827ms step_avg:157.65ms step:1139/1480 train_time:177995ms step_avg:157.66ms 
step:1140/1480 train_time:178163ms step_avg:157.67ms step:1141/1480 train_time:178334ms step_avg:157.68ms step:1142/1480 train_time:178504ms step_avg:157.69ms step:1143/1480 train_time:178673ms step_avg:157.70ms step:1144/1480 train_time:178843ms step_avg:157.71ms step:1145/1480 train_time:179008ms step_avg:157.72ms step:1146/1480 train_time:179180ms step_avg:157.73ms step:1147/1480 train_time:179348ms step_avg:157.74ms step:1148/1480 train_time:179516ms step_avg:157.75ms step:1149/1480 train_time:179688ms step_avg:157.76ms step:1150/1480 train_time:179856ms step_avg:157.77ms step:1151/1480 train_time:180028ms step_avg:157.78ms step:1152/1480 train_time:180199ms step_avg:157.79ms step:1153/1480 train_time:180372ms step_avg:157.81ms step:1154/1480 train_time:180540ms step_avg:157.81ms step:1155/1480 train_time:180711ms step_avg:157.83ms step:1156/1480 train_time:180890ms step_avg:157.84ms step:1157/1480 train_time:181061ms step_avg:157.86ms step:1158/1480 train_time:181228ms step_avg:157.86ms step:1159/1480 train_time:181397ms step_avg:157.87ms step:1160/1480 train_time:181563ms step_avg:157.88ms step:1161/1480 train_time:181733ms step_avg:157.89ms step:1162/1480 train_time:181904ms step_avg:157.90ms step:1163/1480 train_time:182073ms step_avg:157.91ms step:1164/1480 train_time:182243ms step_avg:157.92ms step:1165/1480 train_time:182409ms step_avg:157.93ms step:1166/1480 train_time:182580ms step_avg:157.94ms step:1167/1480 train_time:182747ms step_avg:157.95ms step:1168/1480 train_time:182914ms step_avg:157.96ms step:1169/1480 train_time:183086ms step_avg:157.97ms step:1170/1480 train_time:183253ms step_avg:157.98ms step:1171/1480 train_time:183421ms step_avg:157.99ms step:1172/1480 train_time:183588ms step_avg:157.99ms step:1173/1480 train_time:183761ms step_avg:158.01ms step:1174/1480 train_time:183944ms step_avg:158.03ms step:1175/1480 train_time:184115ms step_avg:158.04ms step:1176/1480 train_time:184287ms step_avg:158.05ms step:1177/1480 train_time:184464ms 
step_avg:158.07ms step:1178/1480 train_time:184631ms step_avg:158.07ms step:1179/1480 train_time:184799ms step_avg:158.08ms step:1180/1480 train_time:184979ms step_avg:158.10ms step:1181/1480 train_time:185149ms step_avg:158.11ms step:1182/1480 train_time:185316ms step_avg:158.12ms step:1183/1480 train_time:185488ms step_avg:158.13ms step:1184/1480 train_time:185656ms step_avg:158.14ms step:1185/1480 train_time:185830ms step_avg:158.15ms step:1186/1480 train_time:186000ms step_avg:158.16ms step:1187/1480 train_time:186184ms step_avg:158.19ms step:1188/1480 train_time:186350ms step_avg:158.19ms step:1189/1480 train_time:186522ms step_avg:158.20ms step:1190/1480 train_time:186690ms step_avg:158.21ms step:1191/1480 train_time:186862ms step_avg:158.22ms step:1192/1480 train_time:187028ms step_avg:158.23ms step:1193/1480 train_time:187196ms step_avg:158.24ms step:1194/1480 train_time:187365ms step_avg:158.25ms step:1195/1480 train_time:187539ms step_avg:158.26ms step:1196/1480 train_time:187722ms step_avg:158.28ms step:1197/1480 train_time:187892ms step_avg:158.29ms step:1198/1480 train_time:188074ms step_avg:158.31ms step:1199/1480 train_time:188244ms step_avg:158.32ms step:1200/1480 train_time:188412ms step_avg:158.33ms step:1201/1480 train_time:188581ms step_avg:158.34ms step:1202/1480 train_time:188763ms step_avg:158.36ms step:1203/1480 train_time:188939ms step_avg:158.37ms step:1204/1480 train_time:189114ms step_avg:158.39ms step:1205/1480 train_time:189283ms step_avg:158.40ms step:1206/1480 train_time:189448ms step_avg:158.40ms step:1207/1480 train_time:189618ms step_avg:158.41ms step:1208/1480 train_time:189786ms step_avg:158.42ms step:1209/1480 train_time:189959ms step_avg:158.43ms step:1210/1480 train_time:190132ms step_avg:158.44ms step:1211/1480 train_time:190306ms step_avg:158.46ms step:1212/1480 train_time:190476ms step_avg:158.47ms step:1213/1480 train_time:190648ms step_avg:158.48ms step:1214/1480 train_time:190825ms step_avg:158.49ms step:1215/1480 
train_time:191000ms step_avg:158.51ms step:1216/1480 train_time:191169ms step_avg:158.51ms step:1217/1480 train_time:191342ms step_avg:158.53ms step:1218/1480 train_time:191511ms step_avg:158.54ms step:1219/1480 train_time:191690ms step_avg:158.55ms step:1220/1480 train_time:191860ms step_avg:158.56ms step:1221/1480 train_time:192029ms step_avg:158.57ms step:1222/1480 train_time:192198ms step_avg:158.58ms step:1223/1480 train_time:192367ms step_avg:158.59ms step:1224/1480 train_time:192545ms step_avg:158.60ms step:1225/1480 train_time:192716ms step_avg:158.61ms step:1226/1480 train_time:192891ms step_avg:158.63ms step:1227/1480 train_time:193065ms step_avg:158.64ms step:1228/1480 train_time:193235ms step_avg:158.65ms step:1229/1480 train_time:193408ms step_avg:158.66ms step:1230/1480 train_time:193589ms step_avg:158.68ms step:1231/1480 train_time:193765ms step_avg:158.69ms step:1232/1480 train_time:193938ms step_avg:158.71ms step:1233/1480 train_time:194109ms step_avg:158.72ms step:1234/1480 train_time:194278ms step_avg:158.72ms step:1235/1480 train_time:194452ms step_avg:158.74ms step:1236/1480 train_time:194620ms step_avg:158.74ms step:1237/1480 train_time:194791ms step_avg:158.75ms step:1238/1480 train_time:194975ms step_avg:158.77ms step:1239/1480 train_time:195145ms step_avg:158.78ms step:1240/1480 train_time:195315ms step_avg:158.79ms step:1241/1480 train_time:195488ms step_avg:158.80ms step:1242/1480 train_time:195657ms step_avg:158.81ms step:1243/1480 train_time:195831ms step_avg:158.82ms step:1244/1480 train_time:195998ms step_avg:158.83ms step:1245/1480 train_time:196167ms step_avg:158.84ms step:1246/1480 train_time:196335ms step_avg:158.85ms step:1247/1480 train_time:196505ms step_avg:158.86ms step:1248/1480 train_time:196672ms step_avg:158.86ms step:1249/1480 train_time:196841ms step_avg:158.87ms step:1250/1480 train_time:197011ms step_avg:158.88ms step:1250/1480 val_loss:3.3373 train_time:197081ms step_avg:158.94ms step:1251/1480 train_time:197193ms 
step_avg:158.90ms step:1252/1480 train_time:197364ms step_avg:158.91ms step:1253/1480 train_time:197533ms step_avg:158.92ms step:1254/1480 train_time:197703ms step_avg:158.93ms step:1255/1480 train_time:197890ms step_avg:158.95ms step:1256/1480 train_time:198062ms step_avg:158.96ms step:1257/1480 train_time:198233ms step_avg:158.97ms step:1258/1480 train_time:198408ms step_avg:158.98ms step:1259/1480 train_time:198580ms step_avg:158.99ms step:1260/1480 train_time:198748ms step_avg:159.00ms step:1261/1480 train_time:198920ms step_avg:159.01ms step:1262/1480 train_time:199095ms step_avg:159.02ms step:1263/1480 train_time:199268ms step_avg:159.03ms step:1264/1480 train_time:199436ms step_avg:159.04ms step:1265/1480 train_time:199604ms step_avg:159.05ms step:1266/1480 train_time:199774ms step_avg:159.06ms step:1267/1480 train_time:199944ms step_avg:159.06ms step:1268/1480 train_time:200115ms step_avg:159.07ms step:1269/1480 train_time:200291ms step_avg:159.09ms step:1270/1480 train_time:200461ms step_avg:159.10ms step:1271/1480 train_time:200632ms step_avg:159.11ms step:1272/1480 train_time:200797ms step_avg:159.11ms step:1273/1480 train_time:200969ms step_avg:159.12ms step:1274/1480 train_time:201139ms step_avg:159.13ms step:1275/1480 train_time:201309ms step_avg:159.14ms step:1276/1480 train_time:201474ms step_avg:159.14ms step:1277/1480 train_time:201648ms step_avg:159.15ms step:1278/1480 train_time:201817ms step_avg:159.16ms step:1279/1480 train_time:201989ms step_avg:159.17ms step:1280/1480 train_time:202169ms step_avg:159.19ms step:1281/1480 train_time:202336ms step_avg:159.19ms step:1282/1480 train_time:202502ms step_avg:159.20ms step:1283/1480 train_time:202671ms step_avg:159.21ms step:1284/1480 train_time:202839ms step_avg:159.21ms step:1285/1480 train_time:203009ms step_avg:159.22ms step:1286/1480 train_time:203177ms step_avg:159.23ms step:1287/1480 train_time:203349ms step_avg:159.24ms step:1288/1480 train_time:203521ms step_avg:159.25ms step:1289/1480 
train_time:203706ms step_avg:159.27ms step:1290/1480 train_time:203885ms step_avg:159.29ms step:1291/1480 train_time:204058ms step_avg:159.30ms step:1292/1480 train_time:204233ms step_avg:159.31ms step:1293/1480 train_time:204409ms step_avg:159.32ms step:1294/1480 train_time:204578ms step_avg:159.33ms step:1295/1480 train_time:204749ms step_avg:159.34ms step:1296/1480 train_time:204924ms step_avg:159.35ms step:1297/1480 train_time:205095ms step_avg:159.36ms step:1298/1480 train_time:205266ms step_avg:159.37ms step:1299/1480 train_time:205438ms step_avg:159.38ms step:1300/1480 train_time:205606ms step_avg:159.38ms step:1301/1480 train_time:205774ms step_avg:159.39ms step:1302/1480 train_time:205949ms step_avg:159.40ms step:1303/1480 train_time:206126ms step_avg:159.42ms step:1304/1480 train_time:206300ms step_avg:159.43ms step:1305/1480 train_time:206469ms step_avg:159.44ms step:1306/1480 train_time:206643ms step_avg:159.45ms step:1307/1480 train_time:206812ms step_avg:159.45ms step:1308/1480 train_time:206979ms step_avg:159.46ms step:1309/1480 train_time:207153ms step_avg:159.47ms step:1310/1480 train_time:207322ms step_avg:159.48ms step:1311/1480 train_time:207491ms step_avg:159.49ms step:1312/1480 train_time:207664ms step_avg:159.50ms step:1313/1480 train_time:207833ms step_avg:159.50ms step:1314/1480 train_time:208006ms step_avg:159.51ms step:1315/1480 train_time:208177ms step_avg:159.52ms step:1316/1480 train_time:208344ms step_avg:159.53ms step:1317/1480 train_time:208515ms step_avg:159.54ms step:1318/1480 train_time:208696ms step_avg:159.55ms step:1319/1480 train_time:208872ms step_avg:159.57ms step:1320/1480 train_time:209048ms step_avg:159.58ms step:1321/1480 train_time:209220ms step_avg:159.59ms step:1322/1480 train_time:209402ms step_avg:159.60ms step:1323/1480 train_time:209573ms step_avg:159.61ms step:1324/1480 train_time:209749ms step_avg:159.63ms step:1325/1480 train_time:209930ms step_avg:159.64ms step:1326/1480 train_time:210104ms step_avg:159.65ms 
step:1327/1480 train_time:210273ms step_avg:159.66ms step:1328/1480 train_time:210445ms step_avg:159.67ms step:1329/1480 train_time:210640ms step_avg:159.70ms step:1330/1480 train_time:210819ms step_avg:159.71ms step:1331/1480 train_time:210989ms step_avg:159.72ms step:1332/1480 train_time:211164ms step_avg:159.73ms step:1333/1480 train_time:211339ms step_avg:159.74ms step:1334/1480 train_time:211510ms step_avg:159.75ms step:1335/1480 train_time:211678ms step_avg:159.76ms step:1336/1480 train_time:211863ms step_avg:159.78ms step:1337/1480 train_time:212038ms step_avg:159.79ms step:1338/1480 train_time:212211ms step_avg:159.80ms step:1339/1480 train_time:212383ms step_avg:159.81ms step:1340/1480 train_time:212556ms step_avg:159.82ms step:1341/1480 train_time:212725ms step_avg:159.82ms step:1342/1480 train_time:212898ms step_avg:159.83ms step:1343/1480 train_time:213068ms step_avg:159.84ms step:1344/1480 train_time:213240ms step_avg:159.85ms step:1345/1480 train_time:213421ms step_avg:159.87ms step:1346/1480 train_time:213589ms step_avg:159.87ms step:1347/1480 train_time:213757ms step_avg:159.88ms step:1348/1480 train_time:213928ms step_avg:159.89ms step:1349/1480 train_time:214097ms step_avg:159.89ms step:1350/1480 train_time:214272ms step_avg:159.90ms step:1351/1480 train_time:214442ms step_avg:159.91ms step:1352/1480 train_time:214614ms step_avg:159.92ms step:1353/1480 train_time:214791ms step_avg:159.93ms step:1354/1480 train_time:214962ms step_avg:159.94ms step:1355/1480 train_time:215130ms step_avg:159.95ms step:1356/1480 train_time:215303ms step_avg:159.96ms step:1357/1480 train_time:215477ms step_avg:159.97ms step:1358/1480 train_time:215651ms step_avg:159.98ms step:1359/1480 train_time:215822ms step_avg:159.99ms step:1360/1480 train_time:215996ms step_avg:160.00ms step:1361/1480 train_time:216174ms step_avg:160.01ms step:1362/1480 train_time:216349ms step_avg:160.02ms step:1363/1480 train_time:216529ms step_avg:160.04ms step:1364/1480 train_time:216698ms 
step_avg:160.04ms step:1365/1480 train_time:216866ms step_avg:160.05ms step:1366/1480 train_time:217038ms step_avg:160.06ms step:1367/1480 train_time:217210ms step_avg:160.07ms step:1368/1480 train_time:217384ms step_avg:160.08ms step:1369/1480 train_time:217565ms step_avg:160.09ms step:1370/1480 train_time:217744ms step_avg:160.11ms step:1371/1480 train_time:217915ms step_avg:160.11ms step:1372/1480 train_time:218094ms step_avg:160.13ms step:1373/1480 train_time:218263ms step_avg:160.13ms step:1374/1480 train_time:218440ms step_avg:160.15ms step:1375/1480 train_time:218612ms step_avg:160.16ms step:1375/1480 val_loss:3.2985 train_time:218679ms step_avg:160.20ms step:1376/1480 train_time:218786ms step_avg:160.17ms step:1377/1480 train_time:218957ms step_avg:160.17ms step:1378/1480 train_time:219126ms step_avg:160.18ms step:1379/1480 train_time:219302ms step_avg:160.19ms step:1380/1480 train_time:219476ms step_avg:160.20ms step:1381/1480 train_time:219658ms step_avg:160.22ms step:1382/1480 train_time:219829ms step_avg:160.23ms step:1383/1480 train_time:220002ms step_avg:160.23ms step:1384/1480 train_time:220181ms step_avg:160.25ms step:1385/1480 train_time:220347ms step_avg:160.25ms step:1386/1480 train_time:220518ms step_avg:160.26ms step:1387/1480 train_time:220688ms step_avg:160.27ms step:1388/1480 train_time:220858ms step_avg:160.27ms step:1389/1480 train_time:221031ms step_avg:160.28ms step:1390/1480 train_time:221199ms step_avg:160.29ms step:1391/1480 train_time:221370ms step_avg:160.30ms step:1392/1480 train_time:221544ms step_avg:160.31ms step:1393/1480 train_time:221714ms step_avg:160.31ms step:1394/1480 train_time:221884ms step_avg:160.32ms step:1395/1480 train_time:222052ms step_avg:160.33ms step:1396/1480 train_time:222223ms step_avg:160.33ms step:1397/1480 train_time:222389ms step_avg:160.34ms step:1398/1480 train_time:222558ms step_avg:160.34ms step:1399/1480 train_time:222727ms step_avg:160.35ms step:1400/1480 train_time:222906ms step_avg:160.36ms 
step:1401/1480 train_time:223071ms step_avg:160.37ms step:1402/1480 train_time:223243ms step_avg:160.38ms step:1403/1480 train_time:223420ms step_avg:160.39ms step:1404/1480 train_time:223591ms step_avg:160.40ms step:1405/1480 train_time:223767ms step_avg:160.41ms step:1406/1480 train_time:223942ms step_avg:160.42ms step:1407/1480 train_time:224109ms step_avg:160.42ms step:1408/1480 train_time:224278ms step_avg:160.43ms step:1409/1480 train_time:224461ms step_avg:160.44ms step:1410/1480 train_time:224629ms step_avg:160.45ms step:1411/1480 train_time:224798ms step_avg:160.46ms step:1412/1480 train_time:224968ms step_avg:160.46ms step:1413/1480 train_time:225139ms step_avg:160.47ms step:1414/1480 train_time:225309ms step_avg:160.48ms step:1415/1480 train_time:225485ms step_avg:160.49ms step:1416/1480 train_time:225669ms step_avg:160.50ms step:1417/1480 train_time:225844ms step_avg:160.51ms step:1418/1480 train_time:226014ms step_avg:160.52ms step:1419/1480 train_time:226187ms step_avg:160.53ms step:1420/1480 train_time:226363ms step_avg:160.54ms step:1421/1480 train_time:226535ms step_avg:160.55ms step:1422/1480 train_time:226708ms step_avg:160.56ms step:1423/1480 train_time:226877ms step_avg:160.56ms step:1424/1480 train_time:227053ms step_avg:160.58ms step:1425/1480 train_time:227234ms step_avg:160.59ms step:1426/1480 train_time:227405ms step_avg:160.60ms step:1427/1480 train_time:227581ms step_avg:160.61ms step:1428/1480 train_time:227751ms step_avg:160.61ms step:1429/1480 train_time:227921ms step_avg:160.62ms step:1430/1480 train_time:228095ms step_avg:160.63ms step:1431/1480 train_time:228271ms step_avg:160.64ms step:1432/1480 train_time:228448ms step_avg:160.65ms step:1433/1480 train_time:228626ms step_avg:160.67ms step:1434/1480 train_time:228807ms step_avg:160.68ms step:1435/1480 train_time:228981ms step_avg:160.69ms step:1436/1480 train_time:229155ms step_avg:160.70ms step:1437/1480 train_time:229326ms step_avg:160.71ms step:1438/1480 train_time:229495ms 
step_avg:160.71ms step:1439/1480 train_time:229670ms step_avg:160.72ms step:1440/1480 train_time:229839ms step_avg:160.73ms step:1441/1480 train_time:230009ms step_avg:160.73ms step:1442/1480 train_time:230186ms step_avg:160.74ms step:1443/1480 train_time:230375ms step_avg:160.76ms step:1444/1480 train_time:230547ms step_avg:160.77ms step:1445/1480 train_time:230716ms step_avg:160.78ms step:1446/1480 train_time:230891ms step_avg:160.79ms step:1447/1480 train_time:231070ms step_avg:160.80ms step:1448/1480 train_time:231241ms step_avg:160.81ms step:1449/1480 train_time:231415ms step_avg:160.82ms step:1450/1480 train_time:231589ms step_avg:160.83ms step:1451/1480 train_time:231760ms step_avg:160.83ms step:1452/1480 train_time:231932ms step_avg:160.84ms step:1453/1480 train_time:232103ms step_avg:160.85ms step:1454/1480 train_time:232275ms step_avg:160.85ms step:1455/1480 train_time:232453ms step_avg:160.87ms step:1456/1480 train_time:232625ms step_avg:160.88ms step:1457/1480 train_time:232796ms step_avg:160.88ms step:1458/1480 train_time:232967ms step_avg:160.89ms step:1459/1480 train_time:233143ms step_avg:160.90ms step:1460/1480 train_time:233314ms step_avg:160.91ms step:1461/1480 train_time:233487ms step_avg:160.91ms step:1462/1480 train_time:233658ms step_avg:160.92ms step:1463/1480 train_time:233834ms step_avg:160.93ms step:1464/1480 train_time:234010ms step_avg:160.94ms step:1465/1480 train_time:234182ms step_avg:160.95ms step:1466/1480 train_time:234352ms step_avg:160.96ms step:1467/1480 train_time:234527ms step_avg:160.97ms step:1468/1480 train_time:234698ms step_avg:160.97ms step:1469/1480 train_time:234873ms step_avg:160.98ms step:1470/1480 train_time:235053ms step_avg:161.00ms step:1471/1480 train_time:235241ms step_avg:161.01ms step:1472/1480 train_time:235422ms step_avg:161.03ms step:1473/1480 train_time:235594ms step_avg:161.03ms step:1474/1480 train_time:235772ms step_avg:161.05ms step:1475/1480 train_time:235951ms step_avg:161.06ms step:1476/1480 
train_time:236123ms step_avg:161.07ms step:1477/1480 train_time:236306ms step_avg:161.08ms step:1478/1480 train_time:236487ms step_avg:161.09ms step:1479/1480 train_time:236662ms step_avg:161.10ms step:1480/1480 train_time:236835ms step_avg:161.11ms step:1480/1480 val_loss:3.2792 train_time:236908ms step_avg:161.16ms