import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import BlockMask, flex_attention # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. 
""" def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): self.num_process = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ["RANK"]) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) params: "list[torch.Tensor]" = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { "params": [p for p in params if p.numel() == size], "update_buffer": [ torch.empty(size, device="cuda", dtype=torch.bfloat16) for _ in range(self.num_process) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr: float = group["lr"] momentum: float = group["momentum"] nesterov: bool = group["nesterov"] zeropower_backend = zeropower_backends[group["backend"]] backend_steps: int = group["backend_steps"] update_buffers: "list[torch.Tensor]" = group["update_buffer"] # generate weight updates in distributed fashion params: "list[torch.Tensor]" = group["params"] assert len(params) % self.num_process == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.num_process]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros_like(g) buf: torch.Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_backend(g, steps=backend_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.num_process] update_prev() # 
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learned scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): dim 1 is the sequence
        # length and dim 3 is split into the two rotated halves below.
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # rebuild the tables only when the sequence length changes
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        """Attend over x; `vi` is mixed into the values, `block_mask` restricts attention."""
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q: torch.Tensor = self.c_q(x).view(B, T, self.n_head, -1)
        k: torch.Tensor = self.c_k(x).view(B, T, self.n_head, -1)
        v: torch.Tensor = self.c_v(x).view(B, T, self.n_head, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        # flex_attention is called with the head dimension ahead of the sequence dimension
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: 4x expansion, squared ReLU, zero-initialized projection back."""

    def __init__(self, dim: int):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block that first re-mixes its input with x0 using learned weights."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to [1, 0]: the block starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768
    lm_head_softcap : int = 30

class GPT(nn.Module):
    """GPT-2 style model with U-net skip connections and token value embeddings."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # U-net structure on token value embeddings by @leloykun
            vte = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        """Return the cross-entropy loss of predicting `target` from `idx`.

        Args:
            idx: 1D tensor of token ids; its length must be a multiple of 128
                because it is reshaped into blocks of BLOCK_SIZE below.
            target: Token ids the logits are scored against.
            sliding_window: Scalar tensor; a query may only attend to keys
                fewer than this many positions behind it.
        """
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running count of token id 50256 gives each position a document id.
        # NOTE(review): 50256 is presumably the GPT-2 end-of-text token - confirm.
        docs = (idx == 50256).cumsum(0)
        # first / last document id found in each block of BLOCK_SIZE tokens
        docs_low = docs.reshape(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.reshape(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document, and inside the window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < sliding_window
            return causal_mask & document_mask & window_mask

        S = len(idx)
        def create_sliding_window_causal_mask(S: int, sliding_window: torch.Tensor):
            # Block-level version of the rule above: keep a (query block, key block)
            # pair whenever it could contain at least one allowed token pair.
            # Built on idx's own device rather than a hard-coded "cuda".
            kv_idx = block_idx = torch.arange(S // BLOCK_SIZE, dtype=torch.int32, device=idx.device)
            q_idx = block_idx[:, None]
            causal_mask = q_idx >= kv_idx
            # two blocks can share a document only if their document-id ranges overlap
            document_mask = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            # window size rounded up to whole blocks
            window_mask = q_idx - kv_idx < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
            dense_mask = causal_mask & document_mask & window_mask
            dense_mask = dense_mask.to(torch.int32)
            num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
            # stable descending sort moves the kept key-block indices to the front of each row
            indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
            num_blocks = num_blocks[None, None, :].contiguous()
            indices = indices[None, None, :].contiguous()
            return BlockMask.from_kv_blocks(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_sliding_window_causal)
        block_mask = create_sliding_window_causal_mask(S, sliding_window)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding slice per encoder layer
        vi = self.transformer.vte(idx[None]).chunk(self.num_encoder_layers, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers-1-i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        # tanh soft-cap bounds the logits to (-lm_head_softcap, lm_head_softcap)
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate a shard's header and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """Read `ntok` uint16 tokens that follow the header into a pinned CPU tensor."""
    with file.open("rb") as f:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the 256 x int32 header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Serves each process its own T-token slice of every global batch, shard after shard."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.ntoks = [_peek_data_shard(file) for file in self.files]
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        """Rewind to the start of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        # each process starts at its own T-sized offset inside the global batch
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard], self.ntoks[self.current_shard])

    def next_batch(self):
        """Return this process's next (inputs, targets) pair as tensors on "cuda"."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        # Decide on the rank-independent start of the global batch. Testing the
        # rank-offset position would make higher ranks switch shards one step
        # before rank 0, leaving the processes reading from different shards.
        global_position = self.current_position - self.process_rank * self.T
        if global_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    # os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # The per-run directory above stays disabled, but the parent directory of
    # the log file has to exist or the open() below raises FileNotFoundError.
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')

def print0(s, logonly=False):
    """Append `s` to the log file on the master process; other ranks do nothing.

    Args:
        s: Text to log; a newline is appended when writing to the file.
        logonly: When True the text is written to the file only, not to stdout.
    """
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# the training loop below runs exactly one micro-batch per process per step
assert train_accumulation_steps == 1

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the loop fetches each following one
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda") sw_size_prev = 64 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, 
which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social sw_size = 64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64) if sw_size != sw_size_prev: sliding_window_size.copy_(sw_size, non_blocking=True) sw_size_prev = sw_size # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, sliding_window=sliding_window_size) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very 
last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() loss = model(x, y, sliding_window=sliding_window_size) loss.backward() del loss # advance the dataset for the next batch x, y = train_loader.next_batch() # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Sun Dec 8 10:58:26 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.6 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:65:02.0 Off | 0 | | N/A 37C P0 74W / 700W | 7MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:67:02.0 Off | 0 | | N/A 46C P0 130W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:69:02.0 Off | 0 | | N/A 46C P0 124W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:6B:02.0 Off | 0 | | N/A 39C P0 119W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6F:02.0 Off | 0 | | N/A 39C P0 117W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:71:02.0 Off | 0 | | N/A 46C P0 122W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:73:02.0 Off | 0 | | N/A 46C P0 127W / 700W | 533MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:75:02.0 Off | 0 | | N/A 39C P0 124W / 700W | 627MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU 
GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 3200000000 across 32 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:22899ms step_avg:nanms step:2/1480 train_time:22990ms step_avg:nanms step:3/1480 train_time:23129ms step_avg:nanms step:4/1480 train_time:23269ms step_avg:nanms step:5/1480 train_time:23411ms step_avg:nanms step:6/1480 train_time:23554ms step_avg:nanms step:7/1480 train_time:23696ms step_avg:nanms step:8/1480 train_time:23840ms step_avg:nanms step:9/1480 train_time:23983ms step_avg:nanms step:10/1480 train_time:24127ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:286ms step_avg:nanms step:13/1480 train_time:430ms step_avg:143.24ms step:14/1480 train_time:572ms step_avg:142.95ms step:15/1480 train_time:715ms step_avg:142.94ms step:16/1480 train_time:858ms step_avg:142.96ms step:17/1480 train_time:1001ms step_avg:143.04ms step:18/1480 train_time:1145ms step_avg:143.14ms step:19/1480 train_time:1289ms step_avg:143.20ms step:20/1480 train_time:1431ms step_avg:143.11ms step:21/1480 train_time:1574ms step_avg:143.08ms step:22/1480 train_time:1717ms step_avg:143.09ms step:23/1480 train_time:1858ms step_avg:142.96ms step:24/1480 train_time:2001ms step_avg:142.95ms step:25/1480 train_time:2145ms step_avg:143.00ms step:26/1480 train_time:2289ms step_avg:143.03ms step:27/1480 train_time:2432ms step_avg:143.04ms step:28/1480 train_time:2574ms step_avg:143.03ms step:29/1480 
train_time:2716ms step_avg:142.95ms step:30/1480 train_time:2857ms step_avg:142.84ms step:31/1480 train_time:3001ms step_avg:142.89ms step:32/1480 train_time:3145ms step_avg:142.97ms step:33/1480 train_time:3289ms step_avg:143.02ms step:34/1480 train_time:3432ms step_avg:143.00ms step:35/1480 train_time:3574ms step_avg:142.98ms step:36/1480 train_time:3716ms step_avg:142.92ms step:37/1480 train_time:3857ms step_avg:142.83ms step:38/1480 train_time:3999ms step_avg:142.81ms step:39/1480 train_time:4144ms step_avg:142.89ms step:40/1480 train_time:4289ms step_avg:142.98ms step:41/1480 train_time:4433ms step_avg:143.00ms step:42/1480 train_time:4574ms step_avg:142.95ms step:43/1480 train_time:4715ms step_avg:142.89ms step:44/1480 train_time:4856ms step_avg:142.83ms step:45/1480 train_time:5001ms step_avg:142.87ms step:46/1480 train_time:5145ms step_avg:142.92ms step:47/1480 train_time:5289ms step_avg:142.94ms step:48/1480 train_time:5432ms step_avg:142.94ms step:49/1480 train_time:5574ms step_avg:142.92ms step:50/1480 train_time:5715ms step_avg:142.87ms step:51/1480 train_time:5856ms step_avg:142.83ms step:52/1480 train_time:5997ms step_avg:142.79ms step:53/1480 train_time:6141ms step_avg:142.82ms step:54/1480 train_time:6285ms step_avg:142.84ms step:55/1480 train_time:6430ms step_avg:142.89ms step:56/1480 train_time:6572ms step_avg:142.88ms step:57/1480 train_time:6714ms step_avg:142.84ms step:58/1480 train_time:6854ms step_avg:142.79ms step:59/1480 train_time:6995ms step_avg:142.75ms step:60/1480 train_time:7139ms step_avg:142.78ms step:61/1480 train_time:7282ms step_avg:142.79ms step:62/1480 train_time:7428ms step_avg:142.84ms step:63/1480 train_time:7570ms step_avg:142.84ms step:64/1480 train_time:7712ms step_avg:142.82ms step:65/1480 train_time:7853ms step_avg:142.78ms step:66/1480 train_time:7994ms step_avg:142.75ms step:67/1480 train_time:8135ms step_avg:142.73ms step:68/1480 train_time:8278ms step_avg:142.72ms step:69/1480 train_time:8420ms step_avg:142.72ms 
step:70/1480 train_time:8562ms step_avg:142.71ms step:71/1480 train_time:8708ms step_avg:142.75ms step:72/1480 train_time:8851ms step_avg:142.76ms step:73/1480 train_time:8992ms step_avg:142.74ms step:74/1480 train_time:9135ms step_avg:142.73ms step:75/1480 train_time:9278ms step_avg:142.74ms step:76/1480 train_time:9422ms step_avg:142.76ms step:77/1480 train_time:9566ms step_avg:142.78ms step:78/1480 train_time:9711ms step_avg:142.80ms step:79/1480 train_time:9852ms step_avg:142.78ms step:80/1480 train_time:9993ms step_avg:142.76ms step:81/1480 train_time:10136ms step_avg:142.76ms step:82/1480 train_time:10279ms step_avg:142.76ms step:83/1480 train_time:10422ms step_avg:142.77ms step:84/1480 train_time:10565ms step_avg:142.77ms step:85/1480 train_time:10710ms step_avg:142.80ms step:86/1480 train_time:10852ms step_avg:142.78ms step:87/1480 train_time:10993ms step_avg:142.76ms step:88/1480 train_time:11135ms step_avg:142.75ms step:89/1480 train_time:11276ms step_avg:142.74ms step:90/1480 train_time:11418ms step_avg:142.72ms step:91/1480 train_time:11560ms step_avg:142.72ms step:92/1480 train_time:11704ms step_avg:142.73ms step:93/1480 train_time:11847ms step_avg:142.73ms step:94/1480 train_time:11989ms step_avg:142.72ms step:95/1480 train_time:12131ms step_avg:142.72ms step:96/1480 train_time:12274ms step_avg:142.72ms step:97/1480 train_time:12416ms step_avg:142.71ms step:98/1480 train_time:12557ms step_avg:142.69ms step:99/1480 train_time:12699ms step_avg:142.69ms step:100/1480 train_time:12843ms step_avg:142.70ms step:101/1480 train_time:12986ms step_avg:142.70ms step:102/1480 train_time:13128ms step_avg:142.69ms step:103/1480 train_time:13270ms step_avg:142.68ms step:104/1480 train_time:13412ms step_avg:142.68ms step:105/1480 train_time:13555ms step_avg:142.68ms step:106/1480 train_time:13695ms step_avg:142.66ms step:107/1480 train_time:13838ms step_avg:142.66ms step:108/1480 train_time:13980ms step_avg:142.66ms step:109/1480 train_time:14123ms step_avg:142.65ms 
step:110/1480 train_time:14266ms step_avg:142.66ms step:111/1480 train_time:14411ms step_avg:142.68ms step:112/1480 train_time:14557ms step_avg:142.71ms step:113/1480 train_time:14704ms step_avg:142.75ms step:114/1480 train_time:14852ms step_avg:142.80ms step:115/1480 train_time:14998ms step_avg:142.84ms step:116/1480 train_time:15146ms step_avg:142.89ms step:117/1480 train_time:15292ms step_avg:142.92ms step:118/1480 train_time:15439ms step_avg:142.95ms step:119/1480 train_time:15587ms step_avg:143.00ms step:120/1480 train_time:15733ms step_avg:143.03ms step:121/1480 train_time:15879ms step_avg:143.05ms step:122/1480 train_time:16027ms step_avg:143.10ms step:123/1480 train_time:16174ms step_avg:143.13ms step:124/1480 train_time:16320ms step_avg:143.16ms step:125/1480 train_time:16468ms step_avg:143.20ms step:125/1480 val_loss:4.4082 train_time:16526ms step_avg:143.70ms step:126/1480 train_time:16628ms step_avg:143.35ms step:127/1480 train_time:16777ms step_avg:143.40ms step:128/1480 train_time:16923ms step_avg:143.41ms step:129/1480 train_time:17070ms step_avg:143.44ms step:130/1480 train_time:17215ms step_avg:143.46ms step:131/1480 train_time:17360ms step_avg:143.47ms step:132/1480 train_time:17506ms step_avg:143.50ms step:133/1480 train_time:17656ms step_avg:143.54ms step:134/1480 train_time:17803ms step_avg:143.58ms step:135/1480 train_time:17951ms step_avg:143.61ms step:136/1480 train_time:18097ms step_avg:143.62ms step:137/1480 train_time:18243ms step_avg:143.65ms step:138/1480 train_time:18390ms step_avg:143.67ms step:139/1480 train_time:18536ms step_avg:143.69ms step:140/1480 train_time:18685ms step_avg:143.73ms step:141/1480 train_time:18832ms step_avg:143.76ms step:142/1480 train_time:18978ms step_avg:143.77ms step:143/1480 train_time:19125ms step_avg:143.80ms step:144/1480 train_time:19272ms step_avg:143.82ms step:145/1480 train_time:19418ms step_avg:143.83ms step:146/1480 train_time:19564ms step_avg:143.86ms step:147/1480 train_time:19711ms 
step_avg:143.88ms step:148/1480 train_time:19857ms step_avg:143.89ms step:149/1480 train_time:20004ms step_avg:143.92ms step:150/1480 train_time:20152ms step_avg:143.94ms step:151/1480 train_time:20297ms step_avg:143.95ms step:152/1480 train_time:20444ms step_avg:143.97ms step:153/1480 train_time:20591ms step_avg:143.99ms step:154/1480 train_time:20737ms step_avg:144.01ms step:155/1480 train_time:20885ms step_avg:144.03ms step:156/1480 train_time:21032ms step_avg:144.05ms step:157/1480 train_time:21179ms step_avg:144.08ms step:158/1480 train_time:21327ms step_avg:144.10ms step:159/1480 train_time:21474ms step_avg:144.12ms step:160/1480 train_time:21621ms step_avg:144.14ms step:161/1480 train_time:21768ms step_avg:144.16ms step:162/1480 train_time:21915ms step_avg:144.18ms step:163/1480 train_time:22061ms step_avg:144.19ms step:164/1480 train_time:22208ms step_avg:144.21ms step:165/1480 train_time:22355ms step_avg:144.22ms step:166/1480 train_time:22500ms step_avg:144.23ms step:167/1480 train_time:22649ms step_avg:144.26ms step:168/1480 train_time:22796ms step_avg:144.28ms step:169/1480 train_time:22942ms step_avg:144.29ms step:170/1480 train_time:23090ms step_avg:144.31ms step:171/1480 train_time:23235ms step_avg:144.32ms step:172/1480 train_time:23381ms step_avg:144.33ms step:173/1480 train_time:23528ms step_avg:144.35ms step:174/1480 train_time:23675ms step_avg:144.36ms step:175/1480 train_time:23821ms step_avg:144.37ms step:176/1480 train_time:23969ms step_avg:144.39ms step:177/1480 train_time:24114ms step_avg:144.40ms step:178/1480 train_time:24260ms step_avg:144.40ms step:179/1480 train_time:24406ms step_avg:144.42ms step:180/1480 train_time:24554ms step_avg:144.43ms step:181/1480 train_time:24699ms step_avg:144.44ms step:182/1480 train_time:24847ms step_avg:144.46ms step:183/1480 train_time:24994ms step_avg:144.48ms step:184/1480 train_time:25140ms step_avg:144.48ms step:185/1480 train_time:25288ms step_avg:144.50ms step:186/1480 train_time:25433ms 
step_avg:144.51ms step:187/1480 train_time:25579ms step_avg:144.52ms step:188/1480 train_time:25727ms step_avg:144.53ms step:189/1480 train_time:25875ms step_avg:144.55ms step:190/1480 train_time:26023ms step_avg:144.57ms step:191/1480 train_time:26170ms step_avg:144.59ms step:192/1480 train_time:26316ms step_avg:144.59ms step:193/1480 train_time:26462ms step_avg:144.60ms step:194/1480 train_time:26609ms step_avg:144.62ms step:195/1480 train_time:26755ms step_avg:144.62ms step:196/1480 train_time:26900ms step_avg:144.62ms step:197/1480 train_time:27048ms step_avg:144.64ms step:198/1480 train_time:27194ms step_avg:144.65ms step:199/1480 train_time:27340ms step_avg:144.66ms step:200/1480 train_time:27489ms step_avg:144.68ms step:201/1480 train_time:27634ms step_avg:144.68ms step:202/1480 train_time:27780ms step_avg:144.69ms step:203/1480 train_time:27927ms step_avg:144.70ms step:204/1480 train_time:28075ms step_avg:144.72ms step:205/1480 train_time:28221ms step_avg:144.72ms step:206/1480 train_time:28368ms step_avg:144.73ms step:207/1480 train_time:28515ms step_avg:144.74ms step:208/1480 train_time:28661ms step_avg:144.75ms step:209/1480 train_time:28808ms step_avg:144.76ms step:210/1480 train_time:28956ms step_avg:144.78ms step:211/1480 train_time:29104ms step_avg:144.79ms step:212/1480 train_time:29251ms step_avg:144.80ms step:213/1480 train_time:29396ms step_avg:144.81ms step:214/1480 train_time:29542ms step_avg:144.81ms step:215/1480 train_time:29690ms step_avg:144.83ms step:216/1480 train_time:29835ms step_avg:144.83ms step:217/1480 train_time:29983ms step_avg:144.84ms step:218/1480 train_time:30130ms step_avg:144.86ms step:219/1480 train_time:30276ms step_avg:144.86ms step:220/1480 train_time:30422ms step_avg:144.86ms step:221/1480 train_time:30571ms step_avg:144.89ms step:222/1480 train_time:30721ms step_avg:144.91ms step:223/1480 train_time:30872ms step_avg:144.94ms step:224/1480 train_time:31023ms step_avg:144.97ms step:225/1480 train_time:31173ms 
step_avg:144.99ms step:226/1480 train_time:31323ms step_avg:145.01ms step:227/1480 train_time:31475ms step_avg:145.04ms step:228/1480 train_time:31625ms step_avg:145.07ms step:229/1480 train_time:31775ms step_avg:145.09ms step:230/1480 train_time:31925ms step_avg:145.11ms step:231/1480 train_time:32076ms step_avg:145.14ms step:232/1480 train_time:32226ms step_avg:145.16ms step:233/1480 train_time:32376ms step_avg:145.18ms step:234/1480 train_time:32528ms step_avg:145.21ms step:235/1480 train_time:32679ms step_avg:145.24ms step:236/1480 train_time:32830ms step_avg:145.26ms step:237/1480 train_time:32979ms step_avg:145.28ms step:238/1480 train_time:33130ms step_avg:145.31ms step:239/1480 train_time:33280ms step_avg:145.33ms step:240/1480 train_time:33431ms step_avg:145.35ms step:241/1480 train_time:33581ms step_avg:145.37ms step:242/1480 train_time:33732ms step_avg:145.39ms step:243/1480 train_time:33882ms step_avg:145.42ms step:244/1480 train_time:34032ms step_avg:145.44ms step:245/1480 train_time:34182ms step_avg:145.46ms step:246/1480 train_time:34332ms step_avg:145.48ms step:247/1480 train_time:34483ms step_avg:145.50ms step:248/1480 train_time:34633ms step_avg:145.52ms step:249/1480 train_time:34784ms step_avg:145.54ms step:250/1480 train_time:34935ms step_avg:145.56ms step:250/1480 val_loss:3.9983 train_time:34993ms step_avg:145.80ms step:251/1480 train_time:35089ms step_avg:145.60ms step:252/1480 train_time:35240ms step_avg:145.62ms step:253/1480 train_time:35390ms step_avg:145.64ms step:254/1480 train_time:35540ms step_avg:145.66ms step:255/1480 train_time:35689ms step_avg:145.67ms step:256/1480 train_time:35839ms step_avg:145.69ms step:257/1480 train_time:35989ms step_avg:145.70ms step:258/1480 train_time:36141ms step_avg:145.73ms step:259/1480 train_time:36294ms step_avg:145.76ms step:260/1480 train_time:36444ms step_avg:145.78ms step:261/1480 train_time:36594ms step_avg:145.79ms step:262/1480 train_time:36743ms step_avg:145.81ms step:263/1480 
train_time:36893ms step_avg:145.82ms step:264/1480 train_time:37043ms step_avg:145.84ms step:265/1480 train_time:37194ms step_avg:145.86ms step:266/1480 train_time:37345ms step_avg:145.88ms step:267/1480 train_time:37496ms step_avg:145.90ms step:268/1480 train_time:37645ms step_avg:145.91ms step:269/1480 train_time:37795ms step_avg:145.93ms step:270/1480 train_time:37945ms step_avg:145.94ms step:271/1480 train_time:38095ms step_avg:145.96ms step:272/1480 train_time:38245ms step_avg:145.97ms step:273/1480 train_time:38396ms step_avg:145.99ms step:274/1480 train_time:38547ms step_avg:146.01ms step:275/1480 train_time:38699ms step_avg:146.03ms step:276/1480 train_time:38849ms step_avg:146.05ms step:277/1480 train_time:38999ms step_avg:146.06ms step:278/1480 train_time:39149ms step_avg:146.08ms step:279/1480 train_time:39299ms step_avg:146.09ms step:280/1480 train_time:39450ms step_avg:146.11ms step:281/1480 train_time:39601ms step_avg:146.13ms step:282/1480 train_time:39752ms step_avg:146.15ms step:283/1480 train_time:39903ms step_avg:146.17ms step:284/1480 train_time:40052ms step_avg:146.18ms step:285/1480 train_time:40203ms step_avg:146.19ms step:286/1480 train_time:40353ms step_avg:146.21ms step:287/1480 train_time:40503ms step_avg:146.22ms step:288/1480 train_time:40653ms step_avg:146.24ms step:289/1480 train_time:40804ms step_avg:146.25ms step:290/1480 train_time:40954ms step_avg:146.26ms step:291/1480 train_time:41105ms step_avg:146.28ms step:292/1480 train_time:41255ms step_avg:146.29ms step:293/1480 train_time:41406ms step_avg:146.31ms step:294/1480 train_time:41556ms step_avg:146.32ms step:295/1480 train_time:41707ms step_avg:146.34ms step:296/1480 train_time:41859ms step_avg:146.36ms step:297/1480 train_time:42008ms step_avg:146.37ms step:298/1480 train_time:42160ms step_avg:146.39ms step:299/1480 train_time:42309ms step_avg:146.40ms step:300/1480 train_time:42460ms step_avg:146.41ms step:301/1480 train_time:42609ms step_avg:146.42ms step:302/1480 
train_time:42761ms step_avg:146.44ms step:303/1480 train_time:42910ms step_avg:146.45ms step:304/1480 train_time:43061ms step_avg:146.47ms step:305/1480 train_time:43210ms step_avg:146.48ms step:306/1480 train_time:43361ms step_avg:146.49ms step:307/1480 train_time:43511ms step_avg:146.50ms step:308/1480 train_time:43662ms step_avg:146.52ms step:309/1480 train_time:43812ms step_avg:146.53ms step:310/1480 train_time:43963ms step_avg:146.54ms step:311/1480 train_time:44113ms step_avg:146.55ms step:312/1480 train_time:44264ms step_avg:146.57ms step:313/1480 train_time:44414ms step_avg:146.58ms step:314/1480 train_time:44564ms step_avg:146.59ms step:315/1480 train_time:44714ms step_avg:146.60ms step:316/1480 train_time:44865ms step_avg:146.62ms step:317/1480 train_time:45015ms step_avg:146.63ms step:318/1480 train_time:45166ms step_avg:146.64ms step:319/1480 train_time:45316ms step_avg:146.65ms step:320/1480 train_time:45466ms step_avg:146.66ms step:321/1480 train_time:45615ms step_avg:146.67ms step:322/1480 train_time:45766ms step_avg:146.69ms step:323/1480 train_time:45916ms step_avg:146.70ms step:324/1480 train_time:46067ms step_avg:146.71ms step:325/1480 train_time:46216ms step_avg:146.72ms step:326/1480 train_time:46367ms step_avg:146.73ms step:327/1480 train_time:46516ms step_avg:146.74ms step:328/1480 train_time:46667ms step_avg:146.75ms step:329/1480 train_time:46817ms step_avg:146.76ms step:330/1480 train_time:46969ms step_avg:146.78ms step:331/1480 train_time:47122ms step_avg:146.80ms step:332/1480 train_time:47277ms step_avg:146.82ms step:333/1480 train_time:47430ms step_avg:146.84ms step:334/1480 train_time:47584ms step_avg:146.87ms step:335/1480 train_time:47738ms step_avg:146.89ms step:336/1480 train_time:47892ms step_avg:146.91ms step:337/1480 train_time:48046ms step_avg:146.93ms step:338/1480 train_time:48200ms step_avg:146.95ms step:339/1480 train_time:48354ms step_avg:146.97ms step:340/1480 train_time:48507ms step_avg:146.99ms step:341/1480 
train_time:48661ms step_avg:147.01ms step:342/1480 train_time:48815ms step_avg:147.03ms step:343/1480 train_time:48969ms step_avg:147.05ms step:344/1480 train_time:49123ms step_avg:147.07ms step:345/1480 train_time:49279ms step_avg:147.10ms step:346/1480 train_time:49432ms step_avg:147.12ms step:347/1480 train_time:49586ms step_avg:147.14ms step:348/1480 train_time:49740ms step_avg:147.16ms step:349/1480 train_time:49895ms step_avg:147.18ms step:350/1480 train_time:50048ms step_avg:147.20ms step:351/1480 train_time:50203ms step_avg:147.22ms step:352/1480 train_time:50356ms step_avg:147.24ms step:353/1480 train_time:50509ms step_avg:147.26ms step:354/1480 train_time:50662ms step_avg:147.27ms step:355/1480 train_time:50818ms step_avg:147.30ms step:356/1480 train_time:50972ms step_avg:147.32ms step:357/1480 train_time:51125ms step_avg:147.34ms step:358/1480 train_time:51279ms step_avg:147.35ms step:359/1480 train_time:51433ms step_avg:147.37ms step:360/1480 train_time:51588ms step_avg:147.39ms step:361/1480 train_time:51742ms step_avg:147.41ms step:362/1480 train_time:51897ms step_avg:147.43ms step:363/1480 train_time:52051ms step_avg:147.45ms step:364/1480 train_time:52204ms step_avg:147.47ms step:365/1480 train_time:52358ms step_avg:147.49ms step:366/1480 train_time:52511ms step_avg:147.50ms step:367/1480 train_time:52664ms step_avg:147.52ms step:368/1480 train_time:52819ms step_avg:147.54ms step:369/1480 train_time:52973ms step_avg:147.56ms step:370/1480 train_time:53126ms step_avg:147.57ms step:371/1480 train_time:53280ms step_avg:147.59ms step:372/1480 train_time:53434ms step_avg:147.61ms step:373/1480 train_time:53587ms step_avg:147.62ms step:374/1480 train_time:53740ms step_avg:147.64ms step:375/1480 train_time:53895ms step_avg:147.66ms step:375/1480 val_loss:3.8075 train_time:53956ms step_avg:147.82ms step:376/1480 train_time:54056ms step_avg:147.69ms step:377/1480 train_time:54210ms step_avg:147.71ms step:378/1480 train_time:54363ms step_avg:147.72ms 
step:379/1480 train_time:54515ms step_avg:147.74ms step:380/1480 train_time:54667ms step_avg:147.75ms step:381/1480 train_time:54819ms step_avg:147.76ms step:382/1480 train_time:54973ms step_avg:147.78ms step:383/1480 train_time:55128ms step_avg:147.80ms step:384/1480 train_time:55281ms step_avg:147.81ms step:385/1480 train_time:55435ms step_avg:147.83ms step:386/1480 train_time:55588ms step_avg:147.84ms step:387/1480 train_time:55741ms step_avg:147.85ms step:388/1480 train_time:55896ms step_avg:147.87ms step:389/1480 train_time:56048ms step_avg:147.88ms step:390/1480 train_time:56204ms step_avg:147.90ms step:391/1480 train_time:56358ms step_avg:147.92ms step:392/1480 train_time:56511ms step_avg:147.93ms step:393/1480 train_time:56664ms step_avg:147.95ms step:394/1480 train_time:56817ms step_avg:147.96ms step:395/1480 train_time:56970ms step_avg:147.97ms step:396/1480 train_time:57123ms step_avg:147.99ms step:397/1480 train_time:57277ms step_avg:148.00ms step:398/1480 train_time:57431ms step_avg:148.02ms step:399/1480 train_time:57585ms step_avg:148.03ms step:400/1480 train_time:57739ms step_avg:148.05ms step:401/1480 train_time:57893ms step_avg:148.06ms step:402/1480 train_time:58047ms step_avg:148.08ms step:403/1480 train_time:58201ms step_avg:148.09ms step:404/1480 train_time:58355ms step_avg:148.11ms step:405/1480 train_time:58508ms step_avg:148.12ms step:406/1480 train_time:58662ms step_avg:148.14ms step:407/1480 train_time:58815ms step_avg:148.15ms step:408/1480 train_time:58968ms step_avg:148.16ms step:409/1480 train_time:59122ms step_avg:148.18ms step:410/1480 train_time:59277ms step_avg:148.19ms step:411/1480 train_time:59431ms step_avg:148.21ms step:412/1480 train_time:59585ms step_avg:148.22ms step:413/1480 train_time:59738ms step_avg:148.23ms step:414/1480 train_time:59893ms step_avg:148.25ms step:415/1480 train_time:60046ms step_avg:148.26ms step:416/1480 train_time:60199ms step_avg:148.27ms step:417/1480 train_time:60353ms step_avg:148.29ms 
step:418/1480 train_time:60505ms step_avg:148.30ms step:419/1480 train_time:60661ms step_avg:148.31ms step:420/1480 train_time:60816ms step_avg:148.33ms step:421/1480 train_time:60969ms step_avg:148.34ms step:422/1480 train_time:61122ms step_avg:148.36ms step:423/1480 train_time:61277ms step_avg:148.37ms step:424/1480 train_time:61432ms step_avg:148.39ms step:425/1480 train_time:61585ms step_avg:148.40ms step:426/1480 train_time:61741ms step_avg:148.41ms step:427/1480 train_time:61897ms step_avg:148.43ms step:428/1480 train_time:62050ms step_avg:148.44ms step:429/1480 train_time:62203ms step_avg:148.45ms step:430/1480 train_time:62356ms step_avg:148.47ms step:431/1480 train_time:62509ms step_avg:148.48ms step:432/1480 train_time:62664ms step_avg:148.49ms step:433/1480 train_time:62817ms step_avg:148.50ms step:434/1480 train_time:62972ms step_avg:148.52ms step:435/1480 train_time:63124ms step_avg:148.53ms step:436/1480 train_time:63280ms step_avg:148.54ms step:437/1480 train_time:63433ms step_avg:148.55ms step:438/1480 train_time:63585ms step_avg:148.56ms step:439/1480 train_time:63739ms step_avg:148.58ms step:440/1480 train_time:63897ms step_avg:148.60ms step:441/1480 train_time:64054ms step_avg:148.62ms step:442/1480 train_time:64211ms step_avg:148.64ms step:443/1480 train_time:64366ms step_avg:148.65ms step:444/1480 train_time:64522ms step_avg:148.67ms step:445/1480 train_time:64678ms step_avg:148.68ms step:446/1480 train_time:64834ms step_avg:148.70ms step:447/1480 train_time:64989ms step_avg:148.72ms step:448/1480 train_time:65145ms step_avg:148.73ms step:449/1480 train_time:65303ms step_avg:148.75ms step:450/1480 train_time:65460ms step_avg:148.77ms step:451/1480 train_time:65618ms step_avg:148.79ms step:452/1480 train_time:65777ms step_avg:148.82ms step:453/1480 train_time:65931ms step_avg:148.83ms step:454/1480 train_time:66087ms step_avg:148.84ms step:455/1480 train_time:66242ms step_avg:148.86ms step:456/1480 train_time:66399ms step_avg:148.88ms 
step:457/1480 train_time:66557ms step_avg:148.90ms step:458/1480 train_time:66713ms step_avg:148.91ms step:459/1480 train_time:66870ms step_avg:148.93ms step:460/1480 train_time:67026ms step_avg:148.95ms step:461/1480 train_time:67184ms step_avg:148.97ms step:462/1480 train_time:67342ms step_avg:148.99ms step:463/1480 train_time:67499ms step_avg:149.00ms step:464/1480 train_time:67657ms step_avg:149.02ms step:465/1480 train_time:67812ms step_avg:149.04ms step:466/1480 train_time:67970ms step_avg:149.06ms step:467/1480 train_time:68128ms step_avg:149.08ms step:468/1480 train_time:68284ms step_avg:149.09ms step:469/1480 train_time:68440ms step_avg:149.11ms step:470/1480 train_time:68598ms step_avg:149.13ms step:471/1480 train_time:68755ms step_avg:149.14ms step:472/1480 train_time:68910ms step_avg:149.16ms step:473/1480 train_time:69067ms step_avg:149.17ms step:474/1480 train_time:69224ms step_avg:149.19ms step:475/1480 train_time:69380ms step_avg:149.21ms step:476/1480 train_time:69538ms step_avg:149.22ms step:477/1480 train_time:69696ms step_avg:149.24ms step:478/1480 train_time:69853ms step_avg:149.26ms step:479/1480 train_time:70009ms step_avg:149.27ms step:480/1480 train_time:70165ms step_avg:149.29ms step:481/1480 train_time:70323ms step_avg:149.31ms step:482/1480 train_time:70480ms step_avg:149.32ms step:483/1480 train_time:70636ms step_avg:149.34ms step:484/1480 train_time:70793ms step_avg:149.35ms step:485/1480 train_time:70950ms step_avg:149.37ms step:486/1480 train_time:71106ms step_avg:149.38ms step:487/1480 train_time:71264ms step_avg:149.40ms step:488/1480 train_time:71422ms step_avg:149.42ms step:489/1480 train_time:71578ms step_avg:149.43ms step:490/1480 train_time:71735ms step_avg:149.45ms step:491/1480 train_time:71890ms step_avg:149.46ms step:492/1480 train_time:72046ms step_avg:149.47ms step:493/1480 train_time:72202ms step_avg:149.49ms step:494/1480 train_time:72361ms step_avg:149.51ms step:495/1480 train_time:72520ms step_avg:149.53ms 
step:496/1480 train_time:72679ms step_avg:149.55ms step:497/1480 train_time:72837ms step_avg:149.56ms step:498/1480 train_time:72994ms step_avg:149.58ms step:499/1480 train_time:73152ms step_avg:149.60ms step:500/1480 train_time:73309ms step_avg:149.61ms step:500/1480 val_loss:3.6891 train_time:73371ms step_avg:149.74ms step:501/1480 train_time:73470ms step_avg:149.63ms step:502/1480 train_time:73628ms step_avg:149.65ms step:503/1480 train_time:73787ms step_avg:149.67ms step:504/1480 train_time:73941ms step_avg:149.68ms step:505/1480 train_time:74097ms step_avg:149.69ms step:506/1480 train_time:74255ms step_avg:149.71ms step:507/1480 train_time:74411ms step_avg:149.72ms step:508/1480 train_time:74570ms step_avg:149.74ms step:509/1480 train_time:74729ms step_avg:149.76ms step:510/1480 train_time:74888ms step_avg:149.78ms step:511/1480 train_time:75045ms step_avg:149.79ms step:512/1480 train_time:75202ms step_avg:149.81ms step:513/1480 train_time:75358ms step_avg:149.82ms step:514/1480 train_time:75514ms step_avg:149.83ms step:515/1480 train_time:75672ms step_avg:149.85ms step:516/1480 train_time:75830ms step_avg:149.86ms step:517/1480 train_time:75989ms step_avg:149.88ms step:518/1480 train_time:76147ms step_avg:149.90ms step:519/1480 train_time:76305ms step_avg:149.91ms step:520/1480 train_time:76462ms step_avg:149.93ms step:521/1480 train_time:76619ms step_avg:149.94ms step:522/1480 train_time:76775ms step_avg:149.95ms step:523/1480 train_time:76932ms step_avg:149.97ms step:524/1480 train_time:77088ms step_avg:149.98ms step:525/1480 train_time:77247ms step_avg:149.99ms step:526/1480 train_time:77403ms step_avg:150.01ms step:527/1480 train_time:77559ms step_avg:150.02ms step:528/1480 train_time:77714ms step_avg:150.03ms step:529/1480 train_time:77872ms step_avg:150.04ms step:530/1480 train_time:78028ms step_avg:150.05ms step:531/1480 train_time:78187ms step_avg:150.07ms step:532/1480 train_time:78344ms step_avg:150.08ms step:533/1480 train_time:78500ms 
step_avg:150.09ms step:534/1480 train_time:78656ms step_avg:150.11ms step:535/1480 train_time:78812ms step_avg:150.12ms step:536/1480 train_time:78970ms step_avg:150.13ms step:537/1480 train_time:79128ms step_avg:150.15ms step:538/1480 train_time:79285ms step_avg:150.16ms step:539/1480 train_time:79443ms step_avg:150.18ms step:540/1480 train_time:79600ms step_avg:150.19ms step:541/1480 train_time:79756ms step_avg:150.20ms step:542/1480 train_time:79912ms step_avg:150.21ms step:543/1480 train_time:80069ms step_avg:150.22ms step:544/1480 train_time:80226ms step_avg:150.24ms step:545/1480 train_time:80383ms step_avg:150.25ms step:546/1480 train_time:80540ms step_avg:150.26ms step:547/1480 train_time:80696ms step_avg:150.27ms step:548/1480 train_time:80853ms step_avg:150.28ms step:549/1480 train_time:81009ms step_avg:150.29ms step:550/1480 train_time:81167ms step_avg:150.31ms step:551/1480 train_time:81328ms step_avg:150.33ms step:552/1480 train_time:81488ms step_avg:150.35ms step:553/1480 train_time:81650ms step_avg:150.37ms step:554/1480 train_time:81811ms step_avg:150.39ms step:555/1480 train_time:81971ms step_avg:150.41ms step:556/1480 train_time:82129ms step_avg:150.42ms step:557/1480 train_time:82290ms step_avg:150.44ms step:558/1480 train_time:82451ms step_avg:150.46ms step:559/1480 train_time:82610ms step_avg:150.47ms step:560/1480 train_time:82772ms step_avg:150.49ms step:561/1480 train_time:82931ms step_avg:150.51ms step:562/1480 train_time:83091ms step_avg:150.53ms step:563/1480 train_time:83250ms step_avg:150.54ms step:564/1480 train_time:83410ms step_avg:150.56ms step:565/1480 train_time:83568ms step_avg:150.57ms step:566/1480 train_time:83729ms step_avg:150.59ms step:567/1480 train_time:83889ms step_avg:150.61ms step:568/1480 train_time:84049ms step_avg:150.62ms step:569/1480 train_time:84209ms step_avg:150.64ms step:570/1480 train_time:84370ms step_avg:150.66ms step:571/1480 train_time:84530ms step_avg:150.68ms step:572/1480 train_time:84690ms 
step_avg:150.69ms step:573/1480 train_time:84849ms step_avg:150.71ms step:574/1480 train_time:85010ms step_avg:150.73ms step:575/1480 train_time:85171ms step_avg:150.74ms step:576/1480 train_time:85330ms step_avg:150.76ms step:577/1480 train_time:85491ms step_avg:150.78ms step:578/1480 train_time:85650ms step_avg:150.79ms step:579/1480 train_time:85809ms step_avg:150.81ms step:580/1480 train_time:85969ms step_avg:150.82ms step:581/1480 train_time:86129ms step_avg:150.84ms step:582/1480 train_time:86289ms step_avg:150.86ms step:583/1480 train_time:86448ms step_avg:150.87ms step:584/1480 train_time:86609ms step_avg:150.89ms step:585/1480 train_time:86768ms step_avg:150.90ms step:586/1480 train_time:86928ms step_avg:150.92ms step:587/1480 train_time:87088ms step_avg:150.93ms step:588/1480 train_time:87248ms step_avg:150.95ms step:589/1480 train_time:87407ms step_avg:150.96ms step:590/1480 train_time:87569ms step_avg:150.98ms step:591/1480 train_time:87728ms step_avg:150.99ms step:592/1480 train_time:87889ms step_avg:151.01ms step:593/1480 train_time:88050ms step_avg:151.03ms step:594/1480 train_time:88211ms step_avg:151.05ms step:595/1480 train_time:88371ms step_avg:151.06ms step:596/1480 train_time:88532ms step_avg:151.08ms step:597/1480 train_time:88691ms step_avg:151.09ms step:598/1480 train_time:88849ms step_avg:151.10ms step:599/1480 train_time:89008ms step_avg:151.12ms step:600/1480 train_time:89169ms step_avg:151.13ms step:601/1480 train_time:89329ms step_avg:151.15ms step:602/1480 train_time:89490ms step_avg:151.17ms step:603/1480 train_time:89651ms step_avg:151.18ms step:604/1480 train_time:89810ms step_avg:151.20ms step:605/1480 train_time:89970ms step_avg:151.21ms step:606/1480 train_time:90131ms step_avg:151.23ms step:607/1480 train_time:90293ms step_avg:151.25ms step:608/1480 train_time:90453ms step_avg:151.26ms step:609/1480 train_time:90612ms step_avg:151.27ms step:610/1480 train_time:90770ms step_avg:151.28ms step:611/1480 train_time:90929ms 
step_avg:151.30ms step:612/1480 train_time:91090ms step_avg:151.31ms step:613/1480 train_time:91250ms step_avg:151.33ms step:614/1480 train_time:91409ms step_avg:151.34ms step:615/1480 train_time:91569ms step_avg:151.35ms step:616/1480 train_time:91728ms step_avg:151.37ms step:617/1480 train_time:91888ms step_avg:151.38ms step:618/1480 train_time:92048ms step_avg:151.40ms step:619/1480 train_time:92208ms step_avg:151.41ms step:620/1480 train_time:92368ms step_avg:151.42ms step:621/1480 train_time:92528ms step_avg:151.44ms step:622/1480 train_time:92688ms step_avg:151.45ms step:623/1480 train_time:92849ms step_avg:151.47ms step:624/1480 train_time:93009ms step_avg:151.48ms step:625/1480 train_time:93168ms step_avg:151.49ms step:625/1480 val_loss:3.6079 train_time:93232ms step_avg:151.60ms step:626/1480 train_time:93333ms step_avg:151.51ms step:627/1480 train_time:93492ms step_avg:151.53ms step:628/1480 train_time:93650ms step_avg:151.54ms step:629/1480 train_time:93809ms step_avg:151.55ms step:630/1480 train_time:93966ms step_avg:151.56ms step:631/1480 train_time:94124ms step_avg:151.57ms step:632/1480 train_time:94284ms step_avg:151.58ms step:633/1480 train_time:94444ms step_avg:151.60ms step:634/1480 train_time:94603ms step_avg:151.61ms step:635/1480 train_time:94763ms step_avg:151.62ms step:636/1480 train_time:94922ms step_avg:151.63ms step:637/1480 train_time:95081ms step_avg:151.64ms step:638/1480 train_time:95239ms step_avg:151.65ms step:639/1480 train_time:95398ms step_avg:151.67ms step:640/1480 train_time:95558ms step_avg:151.68ms step:641/1480 train_time:95717ms step_avg:151.69ms step:642/1480 train_time:95877ms step_avg:151.70ms step:643/1480 train_time:96037ms step_avg:151.72ms step:644/1480 train_time:96195ms step_avg:151.73ms step:645/1480 train_time:96356ms step_avg:151.74ms step:646/1480 train_time:96515ms step_avg:151.75ms step:647/1480 train_time:96675ms step_avg:151.77ms step:648/1480 train_time:96838ms step_avg:151.78ms step:649/1480 
train_time:96998ms step_avg:151.80ms step:650/1480 train_time:97157ms step_avg:151.81ms step:651/1480 train_time:97317ms step_avg:151.82ms step:652/1480 train_time:97477ms step_avg:151.83ms step:653/1480 train_time:97637ms step_avg:151.85ms step:654/1480 train_time:97797ms step_avg:151.86ms step:655/1480 train_time:97958ms step_avg:151.87ms step:656/1480 train_time:98117ms step_avg:151.88ms step:657/1480 train_time:98278ms step_avg:151.90ms step:658/1480 train_time:98437ms step_avg:151.91ms step:659/1480 train_time:98599ms step_avg:151.92ms step:660/1480 train_time:98761ms step_avg:151.94ms step:661/1480 train_time:98923ms step_avg:151.96ms step:662/1480 train_time:99082ms step_avg:151.97ms step:663/1480 train_time:99241ms step_avg:151.98ms step:664/1480 train_time:99402ms step_avg:151.99ms step:665/1480 train_time:99563ms step_avg:152.01ms step:666/1480 train_time:99724ms step_avg:152.02ms step:667/1480 train_time:99885ms step_avg:152.03ms step:668/1480 train_time:100047ms step_avg:152.05ms step:669/1480 train_time:100208ms step_avg:152.06ms step:670/1480 train_time:100367ms step_avg:152.07ms step:671/1480 train_time:100527ms step_avg:152.08ms step:672/1480 train_time:100688ms step_avg:152.10ms step:673/1480 train_time:100852ms step_avg:152.11ms step:674/1480 train_time:101014ms step_avg:152.13ms step:675/1480 train_time:101177ms step_avg:152.15ms step:676/1480 train_time:101341ms step_avg:152.16ms step:677/1480 train_time:101502ms step_avg:152.18ms step:678/1480 train_time:101662ms step_avg:152.19ms step:679/1480 train_time:101824ms step_avg:152.20ms step:680/1480 train_time:101985ms step_avg:152.22ms step:681/1480 train_time:102145ms step_avg:152.23ms step:682/1480 train_time:102306ms step_avg:152.24ms step:683/1480 train_time:102468ms step_avg:152.26ms step:684/1480 train_time:102630ms step_avg:152.27ms step:685/1480 train_time:102794ms step_avg:152.29ms step:686/1480 train_time:102957ms step_avg:152.30ms step:687/1480 train_time:103118ms step_avg:152.32ms 
step:688/1480 train_time:103282ms step_avg:152.33ms step:689/1480 train_time:103444ms step_avg:152.35ms step:690/1480 train_time:103608ms step_avg:152.36ms step:691/1480 train_time:103768ms step_avg:152.38ms step:692/1480 train_time:103928ms step_avg:152.39ms step:693/1480 train_time:104089ms step_avg:152.40ms step:694/1480 train_time:104252ms step_avg:152.41ms step:695/1480 train_time:104411ms step_avg:152.43ms step:696/1480 train_time:104574ms step_avg:152.44ms step:697/1480 train_time:104739ms step_avg:152.46ms step:698/1480 train_time:104899ms step_avg:152.47ms step:699/1480 train_time:105061ms step_avg:152.48ms step:700/1480 train_time:105223ms step_avg:152.50ms step:701/1480 train_time:105382ms step_avg:152.51ms step:702/1480 train_time:105543ms step_avg:152.52ms step:703/1480 train_time:105703ms step_avg:152.53ms step:704/1480 train_time:105863ms step_avg:152.54ms step:705/1480 train_time:106025ms step_avg:152.55ms step:706/1480 train_time:106188ms step_avg:152.57ms step:707/1480 train_time:106349ms step_avg:152.58ms step:708/1480 train_time:106508ms step_avg:152.59ms step:709/1480 train_time:106671ms step_avg:152.60ms step:710/1480 train_time:106832ms step_avg:152.62ms step:711/1480 train_time:106993ms step_avg:152.63ms step:712/1480 train_time:107161ms step_avg:152.65ms step:713/1480 train_time:107323ms step_avg:152.66ms step:714/1480 train_time:107483ms step_avg:152.67ms step:715/1480 train_time:107644ms step_avg:152.69ms step:716/1480 train_time:107802ms step_avg:152.69ms step:717/1480 train_time:107964ms step_avg:152.71ms step:718/1480 train_time:108122ms step_avg:152.72ms step:719/1480 train_time:108282ms step_avg:152.72ms step:720/1480 train_time:108444ms step_avg:152.74ms step:721/1480 train_time:108604ms step_avg:152.75ms step:722/1480 train_time:108765ms step_avg:152.76ms step:723/1480 train_time:108925ms step_avg:152.77ms step:724/1480 train_time:109087ms step_avg:152.78ms step:725/1480 train_time:109251ms step_avg:152.80ms step:726/1480 
train_time:109415ms step_avg:152.81ms step:727/1480 train_time:109578ms step_avg:152.83ms step:728/1480 train_time:109739ms step_avg:152.84ms step:729/1480 train_time:109900ms step_avg:152.85ms step:730/1480 train_time:110063ms step_avg:152.87ms step:731/1480 train_time:110224ms step_avg:152.88ms step:732/1480 train_time:110383ms step_avg:152.88ms step:733/1480 train_time:110544ms step_avg:152.90ms step:734/1480 train_time:110705ms step_avg:152.91ms step:735/1480 train_time:110865ms step_avg:152.92ms step:736/1480 train_time:111027ms step_avg:152.93ms step:737/1480 train_time:111190ms step_avg:152.94ms step:738/1480 train_time:111351ms step_avg:152.95ms step:739/1480 train_time:111512ms step_avg:152.97ms step:740/1480 train_time:111678ms step_avg:152.98ms step:741/1480 train_time:111841ms step_avg:153.00ms step:742/1480 train_time:112002ms step_avg:153.01ms step:743/1480 train_time:112163ms step_avg:153.02ms step:744/1480 train_time:112325ms step_avg:153.03ms step:745/1480 train_time:112488ms step_avg:153.04ms step:746/1480 train_time:112647ms step_avg:153.05ms step:747/1480 train_time:112807ms step_avg:153.06ms step:748/1480 train_time:112975ms step_avg:153.08ms step:749/1480 train_time:113139ms step_avg:153.10ms step:750/1480 train_time:113298ms step_avg:153.11ms step:750/1480 val_loss:3.5526 train_time:113362ms step_avg:153.19ms step:751/1480 train_time:113462ms step_avg:153.12ms step:752/1480 train_time:113622ms step_avg:153.13ms step:753/1480 train_time:113784ms step_avg:153.14ms step:754/1480 train_time:113944ms step_avg:153.15ms step:755/1480 train_time:114105ms step_avg:153.16ms step:756/1480 train_time:114266ms step_avg:153.17ms step:757/1480 train_time:114431ms step_avg:153.19ms step:758/1480 train_time:114592ms step_avg:153.20ms step:759/1480 train_time:114754ms step_avg:153.21ms step:760/1480 train_time:114917ms step_avg:153.22ms step:761/1480 train_time:115079ms step_avg:153.23ms step:762/1480 train_time:115240ms step_avg:153.24ms step:763/1480 
train_time:115402ms step_avg:153.26ms step:764/1480 train_time:115562ms step_avg:153.27ms step:765/1480 train_time:115723ms step_avg:153.28ms step:766/1480 train_time:115886ms step_avg:153.29ms step:767/1480 train_time:116047ms step_avg:153.30ms step:768/1480 train_time:116212ms step_avg:153.31ms step:769/1480 train_time:116375ms step_avg:153.33ms step:770/1480 train_time:116539ms step_avg:153.34ms step:771/1480 train_time:116702ms step_avg:153.35ms step:772/1480 train_time:116863ms step_avg:153.36ms step:773/1480 train_time:117025ms step_avg:153.37ms step:774/1480 train_time:117188ms step_avg:153.39ms step:775/1480 train_time:117352ms step_avg:153.40ms step:776/1480 train_time:117518ms step_avg:153.42ms step:777/1480 train_time:117684ms step_avg:153.43ms step:778/1480 train_time:117846ms step_avg:153.45ms step:779/1480 train_time:118008ms step_avg:153.46ms step:780/1480 train_time:118172ms step_avg:153.47ms step:781/1480 train_time:118335ms step_avg:153.48ms step:782/1480 train_time:118500ms step_avg:153.50ms step:783/1480 train_time:118662ms step_avg:153.51ms step:784/1480 train_time:118824ms step_avg:153.52ms step:785/1480 train_time:118985ms step_avg:153.53ms step:786/1480 train_time:119152ms step_avg:153.55ms step:787/1480 train_time:119316ms step_avg:153.56ms step:788/1480 train_time:119480ms step_avg:153.57ms step:789/1480 train_time:119642ms step_avg:153.58ms step:790/1480 train_time:119806ms step_avg:153.60ms step:791/1480 train_time:119973ms step_avg:153.61ms step:792/1480 train_time:120138ms step_avg:153.63ms step:793/1480 train_time:120300ms step_avg:153.64ms step:794/1480 train_time:120464ms step_avg:153.65ms step:795/1480 train_time:120630ms step_avg:153.67ms step:796/1480 train_time:120796ms step_avg:153.68ms step:797/1480 train_time:120960ms step_avg:153.70ms step:798/1480 train_time:121124ms step_avg:153.71ms step:799/1480 train_time:121291ms step_avg:153.73ms step:800/1480 train_time:121454ms step_avg:153.74ms step:801/1480 train_time:121619ms 
step_avg:153.75ms step:802/1480 train_time:121786ms step_avg:153.77ms step:803/1480 train_time:121947ms step_avg:153.78ms step:804/1480 train_time:122112ms step_avg:153.79ms step:805/1480 train_time:122277ms step_avg:153.81ms step:806/1480 train_time:122439ms step_avg:153.82ms step:807/1480 train_time:122600ms step_avg:153.83ms step:808/1480 train_time:122764ms step_avg:153.84ms step:809/1480 train_time:122925ms step_avg:153.85ms step:810/1480 train_time:123088ms step_avg:153.86ms step:811/1480 train_time:123251ms step_avg:153.87ms step:812/1480 train_time:123416ms step_avg:153.89ms step:813/1480 train_time:123578ms step_avg:153.90ms step:814/1480 train_time:123741ms step_avg:153.91ms step:815/1480 train_time:123904ms step_avg:153.92ms step:816/1480 train_time:124069ms step_avg:153.93ms step:817/1480 train_time:124232ms step_avg:153.94ms step:818/1480 train_time:124395ms step_avg:153.95ms step:819/1480 train_time:124558ms step_avg:153.97ms step:820/1480 train_time:124721ms step_avg:153.98ms step:821/1480 train_time:124882ms step_avg:153.99ms step:822/1480 train_time:125045ms step_avg:154.00ms step:823/1480 train_time:125206ms step_avg:154.01ms step:824/1480 train_time:125368ms step_avg:154.01ms step:825/1480 train_time:125531ms step_avg:154.03ms step:826/1480 train_time:125699ms step_avg:154.04ms step:827/1480 train_time:125862ms step_avg:154.05ms step:828/1480 train_time:126024ms step_avg:154.06ms step:829/1480 train_time:126187ms step_avg:154.07ms step:830/1480 train_time:126351ms step_avg:154.09ms step:831/1480 train_time:126516ms step_avg:154.10ms step:832/1480 train_time:126681ms step_avg:154.11ms step:833/1480 train_time:126845ms step_avg:154.13ms step:834/1480 train_time:127012ms step_avg:154.14ms step:835/1480 train_time:127176ms step_avg:154.15ms step:836/1480 train_time:127340ms step_avg:154.16ms step:837/1480 train_time:127503ms step_avg:154.18ms step:838/1480 train_time:127665ms step_avg:154.18ms step:839/1480 train_time:127825ms step_avg:154.19ms 
step:840/1480 train_time:127986ms step_avg:154.20ms step:841/1480 train_time:128146ms step_avg:154.21ms step:842/1480 train_time:128312ms step_avg:154.22ms step:843/1480 train_time:128476ms step_avg:154.23ms step:844/1480 train_time:128637ms step_avg:154.24ms step:845/1480 train_time:128801ms step_avg:154.25ms step:846/1480 train_time:128966ms step_avg:154.26ms step:847/1480 train_time:129128ms step_avg:154.27ms step:848/1480 train_time:129289ms step_avg:154.28ms step:849/1480 train_time:129452ms step_avg:154.29ms step:850/1480 train_time:129616ms step_avg:154.31ms step:851/1480 train_time:129780ms step_avg:154.32ms step:852/1480 train_time:129942ms step_avg:154.33ms step:853/1480 train_time:130103ms step_avg:154.33ms step:854/1480 train_time:130266ms step_avg:154.34ms step:855/1480 train_time:130430ms step_avg:154.36ms step:856/1480 train_time:130592ms step_avg:154.36ms step:857/1480 train_time:130757ms step_avg:154.38ms step:858/1480 train_time:130922ms step_avg:154.39ms step:859/1480 train_time:131086ms step_avg:154.40ms step:860/1480 train_time:131247ms step_avg:154.41ms step:861/1480 train_time:131414ms step_avg:154.42ms step:862/1480 train_time:131581ms step_avg:154.44ms step:863/1480 train_time:131749ms step_avg:154.45ms step:864/1480 train_time:131913ms step_avg:154.46ms step:865/1480 train_time:132076ms step_avg:154.47ms step:866/1480 train_time:132243ms step_avg:154.49ms step:867/1480 train_time:132406ms step_avg:154.50ms step:868/1480 train_time:132567ms step_avg:154.51ms step:869/1480 train_time:132728ms step_avg:154.51ms step:870/1480 train_time:132893ms step_avg:154.53ms step:871/1480 train_time:133057ms step_avg:154.54ms step:872/1480 train_time:133221ms step_avg:154.55ms step:873/1480 train_time:133384ms step_avg:154.56ms step:874/1480 train_time:133550ms step_avg:154.57ms step:875/1480 train_time:133715ms step_avg:154.58ms step:875/1480 val_loss:3.5077 train_time:133780ms step_avg:154.66ms step:876/1480 train_time:133881ms step_avg:154.60ms 
step:877/1480 train_time:134048ms step_avg:154.61ms step:878/1480 train_time:134210ms step_avg:154.62ms step:879/1480 train_time:134373ms step_avg:154.63ms step:880/1480 train_time:134536ms step_avg:154.64ms step:881/1480 train_time:134698ms step_avg:154.65ms step:882/1480 train_time:134863ms step_avg:154.66ms step:883/1480 train_time:135030ms step_avg:154.67ms step:884/1480 train_time:135196ms step_avg:154.69ms step:885/1480 train_time:135364ms step_avg:154.70ms step:886/1480 train_time:135529ms step_avg:154.71ms step:887/1480 train_time:135696ms step_avg:154.73ms step:888/1480 train_time:135869ms step_avg:154.75ms step:889/1480 train_time:136038ms step_avg:154.76ms step:890/1480 train_time:136201ms step_avg:154.77ms step:891/1480 train_time:136368ms step_avg:154.79ms step:892/1480 train_time:136532ms step_avg:154.80ms step:893/1480 train_time:136693ms step_avg:154.81ms step:894/1480 train_time:136860ms step_avg:154.82ms step:895/1480 train_time:137026ms step_avg:154.83ms step:896/1480 train_time:137191ms step_avg:154.84ms step:897/1480 train_time:137357ms step_avg:154.86ms step:898/1480 train_time:137526ms step_avg:154.87ms step:899/1480 train_time:137690ms step_avg:154.88ms step:900/1480 train_time:137854ms step_avg:154.89ms step:901/1480 train_time:138018ms step_avg:154.90ms step:902/1480 train_time:138182ms step_avg:154.91ms step:903/1480 train_time:138354ms step_avg:154.93ms step:904/1480 train_time:138520ms step_avg:154.94ms step:905/1480 train_time:138683ms step_avg:154.95ms step:906/1480 train_time:138851ms step_avg:154.97ms step:907/1480 train_time:139019ms step_avg:154.98ms step:908/1480 train_time:139182ms step_avg:154.99ms step:909/1480 train_time:139348ms step_avg:155.00ms step:910/1480 train_time:139519ms step_avg:155.02ms step:911/1480 train_time:139685ms step_avg:155.03ms step:912/1480 train_time:139851ms step_avg:155.05ms step:913/1480 train_time:140019ms step_avg:155.06ms step:914/1480 train_time:140187ms step_avg:155.07ms step:915/1480 
train_time:140360ms step_avg:155.09ms step:916/1480 train_time:140524ms step_avg:155.10ms step:917/1480 train_time:140687ms step_avg:155.11ms step:918/1480 train_time:140855ms step_avg:155.13ms step:919/1480 train_time:141026ms step_avg:155.14ms step:920/1480 train_time:141191ms step_avg:155.15ms step:921/1480 train_time:141357ms step_avg:155.17ms step:922/1480 train_time:141525ms step_avg:155.18ms step:923/1480 train_time:141687ms step_avg:155.19ms step:924/1480 train_time:141852ms step_avg:155.20ms step:925/1480 train_time:142018ms step_avg:155.21ms step:926/1480 train_time:142180ms step_avg:155.22ms step:927/1480 train_time:142345ms step_avg:155.23ms step:928/1480 train_time:142510ms step_avg:155.24ms step:929/1480 train_time:142673ms step_avg:155.25ms step:930/1480 train_time:142840ms step_avg:155.26ms step:931/1480 train_time:143003ms step_avg:155.27ms step:932/1480 train_time:143169ms step_avg:155.28ms step:933/1480 train_time:143336ms step_avg:155.29ms step:934/1480 train_time:143503ms step_avg:155.31ms step:935/1480 train_time:143675ms step_avg:155.32ms step:936/1480 train_time:143844ms step_avg:155.34ms step:937/1480 train_time:144012ms step_avg:155.35ms step:938/1480 train_time:144174ms step_avg:155.36ms step:939/1480 train_time:144345ms step_avg:155.38ms step:940/1480 train_time:144511ms step_avg:155.39ms step:941/1480 train_time:144674ms step_avg:155.40ms step:942/1480 train_time:144841ms step_avg:155.41ms step:943/1480 train_time:145010ms step_avg:155.42ms step:944/1480 train_time:145183ms step_avg:155.44ms step:945/1480 train_time:145346ms step_avg:155.45ms step:946/1480 train_time:145514ms step_avg:155.46ms step:947/1480 train_time:145681ms step_avg:155.48ms step:948/1480 train_time:145847ms step_avg:155.49ms step:949/1480 train_time:146012ms step_avg:155.50ms step:950/1480 train_time:146174ms step_avg:155.50ms step:951/1480 train_time:146344ms step_avg:155.52ms step:952/1480 train_time:146509ms step_avg:155.53ms step:953/1480 train_time:146677ms 
step_avg:155.54ms step:954/1480 train_time:146845ms step_avg:155.56ms step:955/1480 train_time:147008ms step_avg:155.56ms step:956/1480 train_time:147173ms step_avg:155.57ms step:957/1480 train_time:147343ms step_avg:155.59ms step:958/1480 train_time:147512ms step_avg:155.60ms step:959/1480 train_time:147677ms step_avg:155.61ms step:960/1480 train_time:147847ms step_avg:155.63ms step:961/1480 train_time:148012ms step_avg:155.64ms step:962/1480 train_time:148176ms step_avg:155.65ms step:963/1480 train_time:148343ms step_avg:155.66ms step:964/1480 train_time:148511ms step_avg:155.67ms step:965/1480 train_time:148674ms step_avg:155.68ms step:966/1480 train_time:148838ms step_avg:155.69ms step:967/1480 train_time:149003ms step_avg:155.70ms step:968/1480 train_time:149168ms step_avg:155.71ms step:969/1480 train_time:149334ms step_avg:155.72ms step:970/1480 train_time:149497ms step_avg:155.73ms step:971/1480 train_time:149661ms step_avg:155.73ms step:972/1480 train_time:149826ms step_avg:155.74ms step:973/1480 train_time:149989ms step_avg:155.75ms step:974/1480 train_time:150159ms step_avg:155.77ms step:975/1480 train_time:150325ms step_avg:155.78ms step:976/1480 train_time:150489ms step_avg:155.79ms step:977/1480 train_time:150653ms step_avg:155.79ms step:978/1480 train_time:150819ms step_avg:155.80ms step:979/1480 train_time:150985ms step_avg:155.82ms step:980/1480 train_time:151150ms step_avg:155.82ms step:981/1480 train_time:151320ms step_avg:155.84ms step:982/1480 train_time:151484ms step_avg:155.85ms step:983/1480 train_time:151649ms step_avg:155.86ms step:984/1480 train_time:151814ms step_avg:155.87ms step:985/1480 train_time:151981ms step_avg:155.88ms step:986/1480 train_time:152148ms step_avg:155.89ms step:987/1480 train_time:152311ms step_avg:155.90ms step:988/1480 train_time:152479ms step_avg:155.91ms step:989/1480 train_time:152646ms step_avg:155.92ms step:990/1480 train_time:152814ms step_avg:155.93ms step:991/1480 train_time:152981ms step_avg:155.94ms 
step:992/1480 train_time:153156ms step_avg:155.96ms step:993/1480 train_time:153332ms step_avg:155.98ms step:994/1480 train_time:153497ms step_avg:155.99ms step:995/1480 train_time:153662ms step_avg:156.00ms step:996/1480 train_time:153824ms step_avg:156.01ms step:997/1480 train_time:153988ms step_avg:156.02ms step:998/1480 train_time:154152ms step_avg:156.02ms step:999/1480 train_time:154317ms step_avg:156.03ms step:1000/1480 train_time:154486ms step_avg:156.05ms step:1000/1480 val_loss:3.4444 train_time:154552ms step_avg:156.11ms step:1001/1480 train_time:154656ms step_avg:156.06ms step:1002/1480 train_time:154822ms step_avg:156.07ms step:1003/1480 train_time:154996ms step_avg:156.09ms step:1004/1480 train_time:155165ms step_avg:156.10ms step:1005/1480 train_time:155332ms step_avg:156.11ms step:1006/1480 train_time:155499ms step_avg:156.12ms step:1007/1480 train_time:155665ms step_avg:156.13ms step:1008/1480 train_time:155831ms step_avg:156.14ms step:1009/1480 train_time:156004ms step_avg:156.16ms step:1010/1480 train_time:156169ms step_avg:156.17ms step:1011/1480 train_time:156335ms step_avg:156.18ms step:1012/1480 train_time:156501ms step_avg:156.19ms step:1013/1480 train_time:156670ms step_avg:156.20ms step:1014/1480 train_time:156836ms step_avg:156.21ms step:1015/1480 train_time:157005ms step_avg:156.22ms step:1016/1480 train_time:157173ms step_avg:156.24ms step:1017/1480 train_time:157344ms step_avg:156.25ms step:1018/1480 train_time:157513ms step_avg:156.26ms step:1019/1480 train_time:157681ms step_avg:156.27ms step:1020/1480 train_time:157852ms step_avg:156.29ms step:1021/1480 train_time:158017ms step_avg:156.30ms step:1022/1480 train_time:158184ms step_avg:156.31ms step:1023/1480 train_time:158353ms step_avg:156.32ms step:1024/1480 train_time:158520ms step_avg:156.33ms step:1025/1480 train_time:158690ms step_avg:156.34ms step:1026/1480 train_time:158857ms step_avg:156.36ms step:1027/1480 train_time:159022ms step_avg:156.36ms step:1028/1480 
train_time:159195ms step_avg:156.38ms step:1029/1480 train_time:159369ms step_avg:156.40ms step:1030/1480 train_time:159535ms step_avg:156.41ms step:1031/1480 train_time:159701ms step_avg:156.42ms step:1032/1480 train_time:159874ms step_avg:156.43ms step:1033/1480 train_time:160041ms step_avg:156.44ms step:1034/1480 train_time:160210ms step_avg:156.46ms step:1035/1480 train_time:160379ms step_avg:156.47ms step:1036/1480 train_time:160545ms step_avg:156.48ms step:1037/1480 train_time:160713ms step_avg:156.49ms step:1038/1480 train_time:160882ms step_avg:156.50ms step:1039/1480 train_time:161054ms step_avg:156.52ms step:1040/1480 train_time:161220ms step_avg:156.52ms step:1041/1480 train_time:161387ms step_avg:156.53ms step:1042/1480 train_time:161551ms step_avg:156.54ms step:1043/1480 train_time:161718ms step_avg:156.55ms step:1044/1480 train_time:161883ms step_avg:156.56ms step:1045/1480 train_time:162055ms step_avg:156.57ms step:1046/1480 train_time:162223ms step_avg:156.59ms step:1047/1480 train_time:162389ms step_avg:156.59ms step:1048/1480 train_time:162556ms step_avg:156.60ms step:1049/1480 train_time:162722ms step_avg:156.61ms step:1050/1480 train_time:162891ms step_avg:156.63ms step:1051/1480 train_time:163060ms step_avg:156.64ms step:1052/1480 train_time:163227ms step_avg:156.65ms step:1053/1480 train_time:163394ms step_avg:156.66ms step:1054/1480 train_time:163561ms step_avg:156.67ms step:1055/1480 train_time:163725ms step_avg:156.68ms step:1056/1480 train_time:163891ms step_avg:156.68ms step:1057/1480 train_time:164057ms step_avg:156.69ms step:1058/1480 train_time:164224ms step_avg:156.70ms step:1059/1480 train_time:164399ms step_avg:156.72ms step:1060/1480 train_time:164567ms step_avg:156.73ms step:1061/1480 train_time:164731ms step_avg:156.74ms step:1062/1480 train_time:164898ms step_avg:156.75ms step:1063/1480 train_time:165063ms step_avg:156.75ms step:1064/1480 train_time:165226ms step_avg:156.76ms step:1065/1480 train_time:165394ms step_avg:156.77ms 
step:1066/1480 train_time:165561ms step_avg:156.78ms step:1067/1480 train_time:165731ms step_avg:156.79ms step:1068/1480 train_time:165899ms step_avg:156.80ms step:1069/1480 train_time:166071ms step_avg:156.82ms step:1070/1480 train_time:166238ms step_avg:156.83ms step:1071/1480 train_time:166412ms step_avg:156.84ms step:1072/1480 train_time:166579ms step_avg:156.85ms step:1073/1480 train_time:166741ms step_avg:156.86ms step:1074/1480 train_time:166908ms step_avg:156.87ms step:1075/1480 train_time:167080ms step_avg:156.88ms step:1076/1480 train_time:167246ms step_avg:156.89ms step:1077/1480 train_time:167412ms step_avg:156.90ms step:1078/1480 train_time:167586ms step_avg:156.92ms step:1079/1480 train_time:167759ms step_avg:156.93ms step:1080/1480 train_time:167929ms step_avg:156.94ms step:1081/1480 train_time:168096ms step_avg:156.95ms step:1082/1480 train_time:168263ms step_avg:156.96ms step:1083/1480 train_time:168428ms step_avg:156.97ms step:1084/1480 train_time:168596ms step_avg:156.98ms step:1085/1480 train_time:168763ms step_avg:156.99ms step:1086/1480 train_time:168930ms step_avg:157.00ms step:1087/1480 train_time:169097ms step_avg:157.01ms step:1088/1480 train_time:169265ms step_avg:157.02ms step:1089/1480 train_time:169438ms step_avg:157.03ms step:1090/1480 train_time:169610ms step_avg:157.05ms step:1091/1480 train_time:169779ms step_avg:157.06ms step:1092/1480 train_time:169946ms step_avg:157.07ms step:1093/1480 train_time:170113ms step_avg:157.08ms step:1094/1480 train_time:170280ms step_avg:157.08ms step:1095/1480 train_time:170445ms step_avg:157.09ms step:1096/1480 train_time:170615ms step_avg:157.10ms step:1097/1480 train_time:170783ms step_avg:157.11ms step:1098/1480 train_time:170956ms step_avg:157.13ms step:1099/1480 train_time:171127ms step_avg:157.14ms step:1100/1480 train_time:171300ms step_avg:157.16ms step:1101/1480 train_time:171471ms step_avg:157.17ms step:1102/1480 train_time:171641ms step_avg:157.18ms step:1103/1480 train_time:171820ms 
step_avg:157.20ms step:1104/1480 train_time:171986ms step_avg:157.21ms step:1105/1480 train_time:172158ms step_avg:157.22ms step:1106/1480 train_time:172325ms step_avg:157.23ms step:1107/1480 train_time:172496ms step_avg:157.24ms step:1108/1480 train_time:172661ms step_avg:157.25ms step:1109/1480 train_time:172827ms step_avg:157.26ms step:1110/1480 train_time:172995ms step_avg:157.27ms step:1111/1480 train_time:173161ms step_avg:157.28ms step:1112/1480 train_time:173332ms step_avg:157.29ms step:1113/1480 train_time:173511ms step_avg:157.31ms step:1114/1480 train_time:173684ms step_avg:157.32ms step:1115/1480 train_time:173858ms step_avg:157.34ms step:1116/1480 train_time:174025ms step_avg:157.35ms step:1117/1480 train_time:174199ms step_avg:157.36ms step:1118/1480 train_time:174372ms step_avg:157.38ms step:1119/1480 train_time:174538ms step_avg:157.38ms step:1120/1480 train_time:174707ms step_avg:157.39ms step:1121/1480 train_time:174878ms step_avg:157.41ms step:1122/1480 train_time:175044ms step_avg:157.41ms step:1123/1480 train_time:175210ms step_avg:157.42ms step:1124/1480 train_time:175379ms step_avg:157.43ms step:1125/1480 train_time:175547ms step_avg:157.44ms step:1125/1480 val_loss:3.3885 train_time:175614ms step_avg:157.50ms step:1126/1480 train_time:175718ms step_avg:157.45ms step:1127/1480 train_time:175887ms step_avg:157.46ms step:1128/1480 train_time:176058ms step_avg:157.48ms step:1129/1480 train_time:176230ms step_avg:157.49ms step:1130/1480 train_time:176400ms step_avg:157.50ms step:1131/1480 train_time:176576ms step_avg:157.52ms step:1132/1480 train_time:176743ms step_avg:157.52ms step:1133/1480 train_time:176915ms step_avg:157.54ms step:1134/1480 train_time:177086ms step_avg:157.55ms step:1135/1480 train_time:177255ms step_avg:157.56ms step:1136/1480 train_time:177426ms step_avg:157.57ms step:1137/1480 train_time:177594ms step_avg:157.58ms step:1138/1480 train_time:177765ms step_avg:157.59ms step:1139/1480 train_time:177932ms step_avg:157.60ms 
step:1140/1480 train_time:178101ms step_avg:157.61ms step:1141/1480 train_time:178273ms step_avg:157.62ms step:1142/1480 train_time:178440ms step_avg:157.63ms step:1143/1480 train_time:178611ms step_avg:157.64ms step:1144/1480 train_time:178779ms step_avg:157.65ms step:1145/1480 train_time:178946ms step_avg:157.66ms step:1146/1480 train_time:179117ms step_avg:157.67ms step:1147/1480 train_time:179285ms step_avg:157.68ms step:1148/1480 train_time:179453ms step_avg:157.69ms step:1149/1480 train_time:179625ms step_avg:157.70ms step:1150/1480 train_time:179792ms step_avg:157.71ms step:1151/1480 train_time:179965ms step_avg:157.73ms step:1152/1480 train_time:180135ms step_avg:157.74ms step:1153/1480 train_time:180308ms step_avg:157.75ms step:1154/1480 train_time:180475ms step_avg:157.76ms step:1155/1480 train_time:180647ms step_avg:157.77ms step:1156/1480 train_time:180826ms step_avg:157.79ms step:1157/1480 train_time:180997ms step_avg:157.80ms step:1158/1480 train_time:181165ms step_avg:157.81ms step:1159/1480 train_time:181332ms step_avg:157.82ms step:1160/1480 train_time:181500ms step_avg:157.83ms step:1161/1480 train_time:181669ms step_avg:157.84ms step:1162/1480 train_time:181839ms step_avg:157.85ms step:1163/1480 train_time:182009ms step_avg:157.86ms step:1164/1480 train_time:182178ms step_avg:157.87ms step:1165/1480 train_time:182345ms step_avg:157.87ms step:1166/1480 train_time:182514ms step_avg:157.88ms step:1167/1480 train_time:182683ms step_avg:157.89ms step:1168/1480 train_time:182850ms step_avg:157.90ms step:1169/1480 train_time:183020ms step_avg:157.91ms step:1170/1480 train_time:183189ms step_avg:157.92ms step:1171/1480 train_time:183356ms step_avg:157.93ms step:1172/1480 train_time:183525ms step_avg:157.94ms step:1173/1480 train_time:183695ms step_avg:157.95ms step:1174/1480 train_time:183877ms step_avg:157.97ms step:1175/1480 train_time:184050ms step_avg:157.98ms step:1176/1480 train_time:184223ms step_avg:158.00ms step:1177/1480 train_time:184399ms 
step_avg:158.01ms step:1178/1480 train_time:184566ms step_avg:158.02ms step:1179/1480 train_time:184732ms step_avg:158.03ms step:1180/1480 train_time:184913ms step_avg:158.05ms step:1181/1480 train_time:185084ms step_avg:158.06ms step:1182/1480 train_time:185251ms step_avg:158.06ms step:1183/1480 train_time:185421ms step_avg:158.07ms step:1184/1480 train_time:185588ms step_avg:158.08ms step:1185/1480 train_time:185762ms step_avg:158.10ms step:1186/1480 train_time:185933ms step_avg:158.11ms step:1187/1480 train_time:186115ms step_avg:158.13ms step:1188/1480 train_time:186282ms step_avg:158.13ms step:1189/1480 train_time:186451ms step_avg:158.14ms step:1190/1480 train_time:186620ms step_avg:158.15ms step:1191/1480 train_time:186790ms step_avg:158.16ms step:1192/1480 train_time:186958ms step_avg:158.17ms step:1193/1480 train_time:187125ms step_avg:158.18ms step:1194/1480 train_time:187293ms step_avg:158.19ms step:1195/1480 train_time:187467ms step_avg:158.20ms step:1196/1480 train_time:187650ms step_avg:158.22ms step:1197/1480 train_time:187824ms step_avg:158.23ms step:1198/1480 train_time:188005ms step_avg:158.25ms step:1199/1480 train_time:188175ms step_avg:158.26ms step:1200/1480 train_time:188344ms step_avg:158.27ms step:1201/1480 train_time:188512ms step_avg:158.28ms step:1202/1480 train_time:188693ms step_avg:158.30ms step:1203/1480 train_time:188868ms step_avg:158.31ms step:1204/1480 train_time:189042ms step_avg:158.33ms step:1205/1480 train_time:189211ms step_avg:158.34ms step:1206/1480 train_time:189379ms step_avg:158.34ms step:1207/1480 train_time:189547ms step_avg:158.35ms step:1208/1480 train_time:189715ms step_avg:158.36ms step:1209/1480 train_time:189889ms step_avg:158.37ms step:1210/1480 train_time:190065ms step_avg:158.39ms step:1211/1480 train_time:190238ms step_avg:158.40ms step:1212/1480 train_time:190410ms step_avg:158.41ms step:1213/1480 train_time:190582ms step_avg:158.42ms step:1214/1480 train_time:190759ms step_avg:158.44ms step:1215/1480 
train_time:190931ms step_avg:158.45ms step:1216/1480 train_time:191100ms step_avg:158.46ms step:1217/1480 train_time:191272ms step_avg:158.47ms step:1218/1480 train_time:191443ms step_avg:158.48ms step:1219/1480 train_time:191624ms step_avg:158.50ms step:1220/1480 train_time:191792ms step_avg:158.51ms step:1221/1480 train_time:191961ms step_avg:158.51ms step:1222/1480 train_time:192129ms step_avg:158.52ms step:1223/1480 train_time:192300ms step_avg:158.53ms step:1224/1480 train_time:192475ms step_avg:158.55ms step:1225/1480 train_time:192646ms step_avg:158.56ms step:1226/1480 train_time:192821ms step_avg:158.57ms step:1227/1480 train_time:192993ms step_avg:158.58ms step:1228/1480 train_time:193164ms step_avg:158.59ms step:1229/1480 train_time:193336ms step_avg:158.60ms step:1230/1480 train_time:193515ms step_avg:158.62ms step:1231/1480 train_time:193690ms step_avg:158.63ms step:1232/1480 train_time:193865ms step_avg:158.65ms step:1233/1480 train_time:194033ms step_avg:158.65ms step:1234/1480 train_time:194205ms step_avg:158.66ms step:1235/1480 train_time:194381ms step_avg:158.68ms step:1236/1480 train_time:194550ms step_avg:158.69ms step:1237/1480 train_time:194721ms step_avg:158.70ms step:1238/1480 train_time:194905ms step_avg:158.72ms step:1239/1480 train_time:195075ms step_avg:158.73ms step:1240/1480 train_time:195246ms step_avg:158.74ms step:1241/1480 train_time:195420ms step_avg:158.75ms step:1242/1480 train_time:195590ms step_avg:158.76ms step:1243/1480 train_time:195765ms step_avg:158.77ms step:1244/1480 train_time:195931ms step_avg:158.78ms step:1245/1480 train_time:196101ms step_avg:158.79ms step:1246/1480 train_time:196269ms step_avg:158.79ms step:1247/1480 train_time:196438ms step_avg:158.80ms step:1248/1480 train_time:196608ms step_avg:158.81ms step:1249/1480 train_time:196777ms step_avg:158.82ms step:1250/1480 train_time:196946ms step_avg:158.83ms step:1250/1480 val_loss:3.3394 train_time:197017ms step_avg:158.88ms step:1251/1480 train_time:197126ms 
step_avg:158.84ms step:1252/1480 train_time:197295ms step_avg:158.85ms step:1253/1480 train_time:197463ms step_avg:158.86ms step:1254/1480 train_time:197636ms step_avg:158.87ms step:1255/1480 train_time:197824ms step_avg:158.89ms step:1256/1480 train_time:197997ms step_avg:158.91ms step:1257/1480 train_time:198168ms step_avg:158.92ms step:1258/1480 train_time:198344ms step_avg:158.93ms step:1259/1480 train_time:198514ms step_avg:158.94ms step:1260/1480 train_time:198682ms step_avg:158.95ms step:1261/1480 train_time:198855ms step_avg:158.96ms step:1262/1480 train_time:199030ms step_avg:158.97ms step:1263/1480 train_time:199204ms step_avg:158.98ms step:1264/1480 train_time:199370ms step_avg:158.99ms step:1265/1480 train_time:199537ms step_avg:158.99ms step:1266/1480 train_time:199709ms step_avg:159.00ms step:1267/1480 train_time:199880ms step_avg:159.01ms step:1268/1480 train_time:200051ms step_avg:159.02ms step:1269/1480 train_time:200227ms step_avg:159.04ms step:1270/1480 train_time:200396ms step_avg:159.04ms step:1271/1480 train_time:200567ms step_avg:159.05ms step:1272/1480 train_time:200734ms step_avg:159.06ms step:1273/1480 train_time:200905ms step_avg:159.07ms step:1274/1480 train_time:201076ms step_avg:159.08ms step:1275/1480 train_time:201245ms step_avg:159.09ms step:1276/1480 train_time:201411ms step_avg:159.09ms step:1277/1480 train_time:201582ms step_avg:159.10ms step:1278/1480 train_time:201749ms step_avg:159.11ms step:1279/1480 train_time:201920ms step_avg:159.12ms step:1280/1480 train_time:202100ms step_avg:159.13ms step:1281/1480 train_time:202268ms step_avg:159.14ms step:1282/1480 train_time:202435ms step_avg:159.15ms step:1283/1480 train_time:202606ms step_avg:159.16ms step:1284/1480 train_time:202776ms step_avg:159.16ms step:1285/1480 train_time:202948ms step_avg:159.17ms step:1286/1480 train_time:203117ms step_avg:159.18ms step:1287/1480 train_time:203289ms step_avg:159.19ms step:1288/1480 train_time:203460ms step_avg:159.20ms step:1289/1480 
train_time:203645ms step_avg:159.22ms step:1290/1480 train_time:203825ms step_avg:159.24ms step:1291/1480 train_time:203997ms step_avg:159.25ms step:1292/1480 train_time:204172ms step_avg:159.26ms step:1293/1480 train_time:204348ms step_avg:159.27ms step:1294/1480 train_time:204518ms step_avg:159.28ms step:1295/1480 train_time:204690ms step_avg:159.29ms step:1296/1480 train_time:204863ms step_avg:159.30ms step:1297/1480 train_time:205035ms step_avg:159.31ms step:1298/1480 train_time:205207ms step_avg:159.32ms step:1299/1480 train_time:205377ms step_avg:159.33ms step:1300/1480 train_time:205544ms step_avg:159.34ms step:1301/1480 train_time:205713ms step_avg:159.34ms step:1302/1480 train_time:205888ms step_avg:159.36ms step:1303/1480 train_time:206062ms step_avg:159.37ms step:1304/1480 train_time:206235ms step_avg:159.38ms step:1305/1480 train_time:206404ms step_avg:159.39ms step:1306/1480 train_time:206578ms step_avg:159.40ms step:1307/1480 train_time:206746ms step_avg:159.40ms step:1308/1480 train_time:206914ms step_avg:159.41ms step:1309/1480 train_time:207087ms step_avg:159.42ms step:1310/1480 train_time:207254ms step_avg:159.43ms step:1311/1480 train_time:207422ms step_avg:159.43ms step:1312/1480 train_time:207595ms step_avg:159.44ms step:1313/1480 train_time:207765ms step_avg:159.45ms step:1314/1480 train_time:207938ms step_avg:159.46ms step:1315/1480 train_time:208109ms step_avg:159.47ms step:1316/1480 train_time:208276ms step_avg:159.48ms step:1317/1480 train_time:208447ms step_avg:159.49ms step:1318/1480 train_time:208628ms step_avg:159.50ms step:1319/1480 train_time:208804ms step_avg:159.51ms step:1320/1480 train_time:208981ms step_avg:159.53ms step:1321/1480 train_time:209153ms step_avg:159.54ms step:1322/1480 train_time:209336ms step_avg:159.55ms step:1323/1480 train_time:209510ms step_avg:159.57ms step:1324/1480 train_time:209686ms step_avg:159.58ms step:1325/1480 train_time:209866ms step_avg:159.59ms step:1326/1480 train_time:210041ms step_avg:159.61ms 
step:1327/1480 train_time:210211ms step_avg:159.61ms step:1328/1480 train_time:210381ms step_avg:159.62ms step:1329/1480 train_time:210577ms step_avg:159.65ms step:1330/1480 train_time:210755ms step_avg:159.66ms step:1331/1480 train_time:210925ms step_avg:159.67ms step:1332/1480 train_time:211098ms step_avg:159.68ms step:1333/1480 train_time:211274ms step_avg:159.69ms step:1334/1480 train_time:211447ms step_avg:159.70ms step:1335/1480 train_time:211616ms step_avg:159.71ms step:1336/1480 train_time:211800ms step_avg:159.73ms step:1337/1480 train_time:211976ms step_avg:159.74ms step:1338/1480 train_time:212148ms step_avg:159.75ms step:1339/1480 train_time:212320ms step_avg:159.76ms step:1340/1480 train_time:212492ms step_avg:159.77ms step:1341/1480 train_time:212661ms step_avg:159.78ms step:1342/1480 train_time:212835ms step_avg:159.79ms step:1343/1480 train_time:213006ms step_avg:159.79ms step:1344/1480 train_time:213178ms step_avg:159.80ms step:1345/1480 train_time:213355ms step_avg:159.82ms step:1346/1480 train_time:213524ms step_avg:159.82ms step:1347/1480 train_time:213694ms step_avg:159.83ms step:1348/1480 train_time:213864ms step_avg:159.84ms step:1349/1480 train_time:214035ms step_avg:159.85ms step:1350/1480 train_time:214211ms step_avg:159.86ms step:1351/1480 train_time:214381ms step_avg:159.87ms step:1352/1480 train_time:214552ms step_avg:159.88ms step:1353/1480 train_time:214729ms step_avg:159.89ms step:1354/1480 train_time:214900ms step_avg:159.90ms step:1355/1480 train_time:215067ms step_avg:159.90ms step:1356/1480 train_time:215240ms step_avg:159.91ms step:1357/1480 train_time:215413ms step_avg:159.92ms step:1358/1480 train_time:215586ms step_avg:159.93ms step:1359/1480 train_time:215757ms step_avg:159.94ms step:1360/1480 train_time:215931ms step_avg:159.95ms step:1361/1480 train_time:216108ms step_avg:159.96ms step:1362/1480 train_time:216284ms step_avg:159.97ms step:1363/1480 train_time:216464ms step_avg:159.99ms step:1364/1480 train_time:216632ms 
step_avg:159.99ms step:1365/1480 train_time:216801ms step_avg:160.00ms step:1366/1480 train_time:216973ms step_avg:160.01ms step:1367/1480 train_time:217145ms step_avg:160.02ms step:1368/1480 train_time:217317ms step_avg:160.03ms step:1369/1480 train_time:217498ms step_avg:160.04ms step:1370/1480 train_time:217675ms step_avg:160.06ms step:1371/1480 train_time:217847ms step_avg:160.06ms step:1372/1480 train_time:218025ms step_avg:160.08ms step:1373/1480 train_time:218195ms step_avg:160.08ms step:1374/1480 train_time:218372ms step_avg:160.10ms step:1375/1480 train_time:218543ms step_avg:160.10ms step:1375/1480 val_loss:3.3006 train_time:218611ms step_avg:160.15ms step:1376/1480 train_time:218719ms step_avg:160.12ms step:1377/1480 train_time:218889ms step_avg:160.12ms step:1378/1480 train_time:219059ms step_avg:160.13ms step:1379/1480 train_time:219233ms step_avg:160.14ms step:1380/1480 train_time:219407ms step_avg:160.15ms step:1381/1480 train_time:219587ms step_avg:160.17ms step:1382/1480 train_time:219758ms step_avg:160.17ms step:1383/1480 train_time:219929ms step_avg:160.18ms step:1384/1480 train_time:220106ms step_avg:160.19ms step:1385/1480 train_time:220270ms step_avg:160.20ms step:1386/1480 train_time:220443ms step_avg:160.21ms step:1387/1480 train_time:220616ms step_avg:160.21ms step:1388/1480 train_time:220783ms step_avg:160.22ms step:1389/1480 train_time:220957ms step_avg:160.23ms step:1390/1480 train_time:221126ms step_avg:160.24ms step:1391/1480 train_time:221296ms step_avg:160.24ms step:1392/1480 train_time:221468ms step_avg:160.25ms step:1393/1480 train_time:221641ms step_avg:160.26ms step:1394/1480 train_time:221810ms step_avg:160.27ms step:1395/1480 train_time:221980ms step_avg:160.27ms step:1396/1480 train_time:222147ms step_avg:160.28ms step:1397/1480 train_time:222315ms step_avg:160.28ms step:1398/1480 train_time:222482ms step_avg:160.29ms step:1399/1480 train_time:222651ms step_avg:160.30ms step:1400/1480 train_time:222827ms step_avg:160.31ms 
step:1401/1480 train_time:222993ms step_avg:160.31ms step:1402/1480 train_time:223165ms step_avg:160.32ms step:1403/1480 train_time:223343ms step_avg:160.33ms step:1404/1480 train_time:223515ms step_avg:160.34ms step:1405/1480 train_time:223688ms step_avg:160.35ms step:1406/1480 train_time:223862ms step_avg:160.36ms step:1407/1480 train_time:224030ms step_avg:160.36ms step:1408/1480 train_time:224198ms step_avg:160.37ms step:1409/1480 train_time:224382ms step_avg:160.39ms step:1410/1480 train_time:224553ms step_avg:160.39ms step:1411/1480 train_time:224722ms step_avg:160.40ms step:1412/1480 train_time:224891ms step_avg:160.41ms step:1413/1480 train_time:225061ms step_avg:160.41ms step:1414/1480 train_time:225231ms step_avg:160.42ms step:1415/1480 train_time:225406ms step_avg:160.43ms step:1416/1480 train_time:225592ms step_avg:160.45ms step:1417/1480 train_time:225766ms step_avg:160.46ms step:1418/1480 train_time:225938ms step_avg:160.47ms step:1419/1480 train_time:226111ms step_avg:160.48ms step:1420/1480 train_time:226286ms step_avg:160.49ms step:1421/1480 train_time:226461ms step_avg:160.50ms step:1422/1480 train_time:226632ms step_avg:160.50ms step:1423/1480 train_time:226803ms step_avg:160.51ms step:1424/1480 train_time:226980ms step_avg:160.52ms step:1425/1480 train_time:227163ms step_avg:160.54ms step:1426/1480 train_time:227335ms step_avg:160.55ms step:1427/1480 train_time:227510ms step_avg:160.56ms step:1428/1480 train_time:227680ms step_avg:160.56ms step:1429/1480 train_time:227848ms step_avg:160.57ms step:1430/1480 train_time:228024ms step_avg:160.58ms step:1431/1480 train_time:228201ms step_avg:160.59ms step:1432/1480 train_time:228379ms step_avg:160.60ms step:1433/1480 train_time:228558ms step_avg:160.62ms step:1434/1480 train_time:228738ms step_avg:160.63ms step:1435/1480 train_time:228913ms step_avg:160.64ms step:1436/1480 train_time:229086ms step_avg:160.65ms step:1437/1480 train_time:229259ms step_avg:160.66ms step:1438/1480 train_time:229427ms 
step_avg:160.66ms step:1439/1480 train_time:229601ms step_avg:160.67ms step:1440/1480 train_time:229770ms step_avg:160.68ms step:1441/1480 train_time:229942ms step_avg:160.69ms step:1442/1480 train_time:230121ms step_avg:160.70ms step:1443/1480 train_time:230310ms step_avg:160.72ms step:1444/1480 train_time:230480ms step_avg:160.73ms step:1445/1480 train_time:230651ms step_avg:160.73ms step:1446/1480 train_time:230825ms step_avg:160.74ms step:1447/1480 train_time:231003ms step_avg:160.75ms step:1448/1480 train_time:231173ms step_avg:160.76ms step:1449/1480 train_time:231348ms step_avg:160.77ms step:1450/1480 train_time:231521ms step_avg:160.78ms step:1451/1480 train_time:231693ms step_avg:160.79ms step:1452/1480 train_time:231866ms step_avg:160.79ms step:1453/1480 train_time:232036ms step_avg:160.80ms step:1454/1480 train_time:232208ms step_avg:160.81ms step:1455/1480 train_time:232389ms step_avg:160.82ms step:1456/1480 train_time:232562ms step_avg:160.83ms step:1457/1480 train_time:232732ms step_avg:160.84ms step:1458/1480 train_time:232903ms step_avg:160.84ms step:1459/1480 train_time:233079ms step_avg:160.85ms step:1460/1480 train_time:233251ms step_avg:160.86ms step:1461/1480 train_time:233428ms step_avg:160.87ms step:1462/1480 train_time:233600ms step_avg:160.88ms step:1463/1480 train_time:233776ms step_avg:160.89ms step:1464/1480 train_time:233950ms step_avg:160.90ms step:1465/1480 train_time:234124ms step_avg:160.91ms step:1466/1480 train_time:234294ms step_avg:160.92ms step:1467/1480 train_time:234468ms step_avg:160.93ms step:1468/1480 train_time:234639ms step_avg:160.93ms step:1469/1480 train_time:234810ms step_avg:160.94ms step:1470/1480 train_time:234990ms step_avg:160.95ms step:1471/1480 train_time:235177ms step_avg:160.97ms step:1472/1480 train_time:235360ms step_avg:160.98ms step:1473/1480 train_time:235531ms step_avg:160.99ms step:1474/1480 train_time:235709ms step_avg:161.00ms step:1475/1480 train_time:235890ms step_avg:161.02ms step:1476/1480 
train_time:236063ms step_avg:161.02ms step:1477/1480 train_time:236244ms step_avg:161.04ms step:1478/1480 train_time:236426ms step_avg:161.05ms step:1479/1480 train_time:236601ms step_avg:161.06ms step:1480/1480 train_time:236773ms step_avg:161.07ms step:1480/1480 val_loss:3.2810 train_time:236845ms step_avg:161.12ms