import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time import contextlib from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB # ----------------------------------------------------------------------------- # Muon optimizer @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. """ assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) ns_steps: The number of Newton-Schulz iteration steps to use. """ def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5): self.world_size = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ['RANK']) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps) params = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { 'params': [p for p in params if p.numel() == size], 'update_buffer': [ torch.empty(size, device='cuda', dtype=torch.bfloat16) for _ in range(self.world_size) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr = group['lr'] momentum = group['momentum'] nesterov = group['nesterov'] ns_steps = group['ns_steps'] update_buffers = group['update_buffer'] # generate weight updates in distributed fashion params = group['params'] assert len(params) % self.world_size == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.world_size]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if 'momentum_buffer' not in state: state['momentum_buffer'] = torch.zeros_like(g) buf = state['momentum_buffer'] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.world_size] update_prev() # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, num_heads): super().__init__() assert dim % num_heads == 0 self.num_heads = num_heads self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x, vi, block_mask): B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q = self.c_q(x).view(B, T, self.num_heads, -1) k = self.c_k(x).view(B, T, self.num_heads, -1) v = self.c_v(x).view(B, T, self.num_heads, -1) v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977 q, k = norm(q), norm(k) # QK norm @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x): x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.model_dim, config.num_heads) self.mlp = MLP(config.model_dim) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x, vi, x0, block_mask): x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x class ValueEmbedding(nn.Module): def __init__(self, config: "GPTConfig"): super().__init__() self.__setattr__ self.embed = nn.ModuleList([ nn.Embedding(config.vocab_size, config.model_dim) for _ in range(6) ]) def forward(self, inputs) -> "list[torch.Tensor]": ve = [emb(inputs) for emb in self.embed] ve += reversed(ve) return ve # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 num_layers : int = 12 num_heads : int = 6 # head dim 128 suggested by @Grad62304977 model_dim : int = 768 class GPT(nn.Module): def __init__(self, config: GPTConfig): super().__init__() self.num_layers = config.num_layers # U-net design by @brendanh0gan self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection weights for decoder layers self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers)) self.embed = nn.Embedding(config.vocab_size, config.model_dim) self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)]) # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning # U-net structure on token value embeddings by @leloykun self.value_embeds = ValueEmbedding(config) self.lm_head = CastedLinear(config.model_dim, config.vocab_size) self.lm_head.weight.data.zero_() # @Grad62304977 def forward( self, inputs: torch.Tensor, targets: torch.Tensor, sliding_window_num_blocks: torch.Tensor, ): BLOCK_SIZE = 128 assert inputs.ndim == 1 docs = (inputs == 50256).cumsum(0) docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous() docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous() def document_causal(b, h, q_idx, kv_idx): causal_mask = q_idx >= kv_idx document_mask = docs[q_idx] == docs[kv_idx] return causal_mask & document_mask def dense_to_ordered(dense_mask: torch.Tensor): num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32) indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32) return num_blocks[None, None].contiguous(), indices[None, None].contiguous() def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor): kv_idx = block_idx = torch.arange(512, dtype=torch.int32, device="cuda") q_idx = block_idx[:, None] causal_bm = q_idx >= kv_idx causal_full_bm = q_idx > kv_idx window_bm = q_idx - kv_idx < sliding_window_num_blocks window_full_bm = window_bm # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx]) document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None]) document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None]) nonzero_bm = causal_bm & window_bm & document_bm full_bm = causal_full_bm & window_full_bm & document_full_bm kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm) full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm) return BlockMask.from_kv_blocks( kv_num_blocks, kv_indices, full_kv_num_blocks, full_kv_indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_causal, ) block_mask = create_doc_swc_block_mask(sliding_window_num_blocks) # forward the GPT model itself x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim) x = norm(x) # @Grad62304977 x0 = x ve = self.value_embeds(inputs) ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:] # Store outputs for U-Net skip connections skip_connections = [] # Encoder pass - process only the first half of the blocks for i in range(self.num_encoder_layers): x = self.blocks[i](x, ve_enc[i], x0, block_mask) skip_connections.append(x) # Decoder pass - process the remaining blocks with weighted skip connections for i in range(self.num_decoder_layers): x = x + self.skip_weights[i] * skip_connections.pop() # U-net structure on token value embeddings by @leloykun x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask) x = norm(x) logits = self.lm_head(x) logits = 30 * torch.tanh(logits / 30) # @Grad62304977 logits = logits.float() loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) return loss # ----------------------------------------------------------------------------- # Our own simple Distributed Data Loader def _peek_data_shard(file: Path): # only reads the header, returns header data # header is 256 int32 header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) assert header[0] == 20240520, "magic number mismatch in the data .bin file" assert header[1] == 1, "unsupported version" return int(header[2]) # number of tokens (claimed) def _load_data_shard(path: Path, num_tokens): with path.open("rb", buffering=0) as f: tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) f.seek(256 * 4) nbytes = f.readinto(tokens.numpy()) assert nbytes == 2 * num_tokens, "number of tokens read does not match header?" return tokens class DistributedDataLoader: def __init__(self, filename_pattern, seq_len, process_rank, num_processes): self.process_rank = process_rank self.num_processes = num_processes self.seq_len = seq_len # glob files that match the pattern self.files = sorted(Path.cwd().glob(filename_pattern)) assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" # load and validate all data shards, count number of tokens in total self.files_num_tokens = [_peek_data_shard(file) for file in self.files] assert min(self.files_num_tokens) >= num_processes * seq_len + 1 self.total_num_tokens = sum(self.files_num_tokens) self.reset() def reset(self): self.current_shard = -1 self.advance() def advance(self): # advance to next data shard self.current_shard = (self.current_shard + 1) % len(self.files) self.current_position = self.process_rank * self.seq_len self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard]) def next_batch(self): batch_size = self.seq_len * self.num_processes buf = self.tokens[self.current_position:self.current_position+self.seq_len+1] # host side async is sufficient; # no performance improvement was observed when introducing a separate stream. inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets # advance current position and load next shard if necessary self.current_position += batch_size if self.current_position + batch_size + 1 >= len(self.tokens): self.advance() return inputs, targets # ----------------------------------------------------------------------------- # int main @dataclass class Hyperparameters: # data hyperparams input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on # optimization hyperparams batch_size : int = 8 # batch size, in sequences, across all devices sequence_length : int = 64*1024 # sequence length, in tokens num_iterations : int = 1480 # number of iterations to run warmup_iters : int = 0 cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule weight_decay : float = 0 # evaluation and logging hyperparams val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end args = Hyperparameters() # set up DDP (distributed data parallel). torchrun sets this env variable ddp_rank = int(os.environ['RANK']) ddp_local_rank = int(os.environ['LOCAL_RANK']) ddp_world_size = int(os.environ['WORLD_SIZE']) assert torch.cuda.is_available() device = torch.device(f"cuda:{ddp_local_rank}") torch.cuda.set_device(device) print(f"using device: {device}") dist.init_process_group(backend='nccl', device_id=device) dist.barrier() master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc. # begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 09:57:37 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 125W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 28C P0 111W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 115W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:28851ms step_avg:nanms step:2/1480 train_time:28957ms step_avg:nanms step:3/1480 train_time:29082ms step_avg:nanms step:4/1480 train_time:29219ms step_avg:nanms step:5/1480 train_time:29361ms step_avg:nanms step:6/1480 train_time:29502ms step_avg:nanms step:7/1480 train_time:29645ms step_avg:nanms step:8/1480 train_time:29785ms step_avg:nanms step:9/1480 train_time:29933ms step_avg:nanms step:10/1480 train_time:30072ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:280ms step_avg:nanms step:13/1480 train_time:423ms step_avg:141.01ms step:14/1480 train_time:568ms step_avg:142.08ms step:15/1480 train_time:711ms step_avg:142.18ms step:16/1480 train_time:856ms step_avg:142.67ms step:17/1480 train_time:999ms step_avg:142.72ms step:18/1480 train_time:1141ms step_avg:142.67ms step:19/1480 train_time:1285ms step_avg:142.80ms step:20/1480 train_time:1429ms step_avg:142.86ms step:21/1480 train_time:1572ms step_avg:142.92ms step:22/1480 train_time:1714ms step_avg:142.83ms step:23/1480 train_time:1856ms step_avg:142.77ms step:24/1480 train_time:1997ms step_avg:142.65ms step:25/1480 train_time:2139ms step_avg:142.59ms step:26/1480 train_time:2281ms step_avg:142.59ms step:27/1480 train_time:2425ms step_avg:142.63ms step:28/1480 train_time:2569ms step_avg:142.73ms step:29/1480 train_time:2713ms step_avg:142.78ms step:30/1480 train_time:2856ms step_avg:142.81ms step:31/1480 train_time:2998ms step_avg:142.76ms step:32/1480 train_time:3140ms step_avg:142.71ms step:33/1480 train_time:3282ms step_avg:142.69ms step:34/1480 train_time:3424ms step_avg:142.68ms step:35/1480 train_time:3569ms step_avg:142.75ms step:36/1480 train_time:3713ms step_avg:142.80ms step:37/1480 train_time:3856ms step_avg:142.80ms step:38/1480 train_time:3998ms step_avg:142.78ms step:39/1480 train_time:4140ms step_avg:142.75ms step:40/1480 train_time:4283ms step_avg:142.78ms step:41/1480 train_time:4427ms step_avg:142.82ms step:42/1480 train_time:4570ms step_avg:142.82ms step:43/1480 train_time:4712ms step_avg:142.79ms step:44/1480 train_time:4856ms step_avg:142.83ms step:45/1480 train_time:4997ms step_avg:142.78ms step:46/1480 train_time:5140ms step_avg:142.77ms step:47/1480 train_time:5282ms step_avg:142.76ms step:48/1480 train_time:5424ms step_avg:142.74ms step:49/1480 train_time:5568ms step_avg:142.77ms step:50/1480 train_time:5713ms step_avg:142.82ms step:51/1480 train_time:5856ms step_avg:142.82ms step:52/1480 train_time:5997ms step_avg:142.79ms step:53/1480 train_time:6138ms step_avg:142.76ms step:54/1480 train_time:6281ms step_avg:142.74ms step:55/1480 train_time:6425ms step_avg:142.77ms step:56/1480 train_time:6568ms step_avg:142.78ms step:57/1480 train_time:6711ms step_avg:142.78ms step:58/1480 train_time:6853ms step_avg:142.78ms step:59/1480 train_time:6996ms step_avg:142.78ms step:60/1480 train_time:7137ms step_avg:142.75ms step:61/1480 train_time:7280ms step_avg:142.74ms step:62/1480 train_time:7423ms step_avg:142.75ms step:63/1480 train_time:7568ms step_avg:142.80ms step:64/1480 train_time:7713ms step_avg:142.83ms step:65/1480 train_time:7856ms step_avg:142.83ms step:66/1480 train_time:7997ms step_avg:142.81ms step:67/1480 train_time:8139ms step_avg:142.79ms step:68/1480 train_time:8283ms step_avg:142.80ms step:69/1480 train_time:8425ms step_avg:142.80ms step:70/1480 train_time:8568ms step_avg:142.81ms step:71/1480 train_time:8712ms step_avg:142.82ms step:72/1480 train_time:8854ms step_avg:142.81ms step:73/1480 train_time:8996ms step_avg:142.79ms step:74/1480 train_time:9136ms step_avg:142.75ms step:75/1480 train_time:9279ms step_avg:142.75ms step:76/1480 train_time:9421ms step_avg:142.75ms step:77/1480 train_time:9566ms step_avg:142.78ms step:78/1480 train_time:9710ms step_avg:142.79ms step:79/1480 train_time:9854ms step_avg:142.81ms step:80/1480 train_time:10383ms step_avg:148.33ms step:81/1480 train_time:10897ms step_avg:153.47ms step:82/1480 train_time:10999ms step_avg:152.77ms step:83/1480 train_time:11142ms step_avg:152.63ms step:84/1480 train_time:11285ms step_avg:152.49ms step:85/1480 train_time:11426ms step_avg:152.35ms step:86/1480 train_time:11569ms step_avg:152.22ms step:87/1480 train_time:11711ms step_avg:152.09ms step:88/1480 train_time:11855ms step_avg:151.98ms step:89/1480 train_time:11996ms step_avg:151.85ms step:90/1480 train_time:12138ms step_avg:151.72ms step:91/1480 train_time:12280ms step_avg:151.61ms step:92/1480 train_time:12423ms step_avg:151.50ms step:93/1480 train_time:12567ms step_avg:151.41ms step:94/1480 train_time:12711ms step_avg:151.32ms step:95/1480 train_time:12854ms step_avg:151.23ms step:96/1480 train_time:12997ms step_avg:151.13ms step:97/1480 train_time:13513ms step_avg:155.32ms step:98/1480 train_time:13615ms step_avg:154.71ms step:99/1480 train_time:13757ms step_avg:154.57ms step:100/1480 train_time:13898ms step_avg:154.42ms step:101/1480 train_time:14044ms step_avg:154.33ms step:102/1480 train_time:14181ms step_avg:154.15ms step:103/1480 train_time:14325ms step_avg:154.03ms step:104/1480 train_time:14468ms step_avg:153.91ms step:105/1480 train_time:14611ms step_avg:153.80ms step:106/1480 train_time:14754ms step_avg:153.69ms step:107/1480 train_time:14896ms step_avg:153.56ms step:108/1480 train_time:15037ms step_avg:153.44ms step:109/1480 train_time:15181ms step_avg:153.34ms step:110/1480 train_time:15324ms step_avg:153.24ms step:111/1480 train_time:15468ms step_avg:153.15ms step:112/1480 train_time:15614ms step_avg:153.08ms step:113/1480 train_time:15760ms step_avg:153.01ms step:114/1480 train_time:15906ms step_avg:152.94ms step:115/1480 train_time:16053ms step_avg:152.88ms step:116/1480 train_time:16198ms step_avg:152.81ms step:117/1480 train_time:16343ms step_avg:152.74ms step:118/1480 train_time:16491ms step_avg:152.70ms step:119/1480 train_time:16636ms step_avg:152.62ms step:120/1480 train_time:16781ms step_avg:152.55ms step:121/1480 train_time:16926ms step_avg:152.49ms step:122/1480 train_time:17073ms step_avg:152.43ms step:123/1480 train_time:17218ms step_avg:152.37ms step:124/1480 train_time:17364ms step_avg:152.31ms step:125/1480 train_time:17511ms step_avg:152.27ms step:125/1480 val_loss:4.4303 train_time:17576ms step_avg:152.83ms step:126/1480 train_time:17671ms step_avg:152.34ms step:127/1480 train_time:17811ms step_avg:152.23ms step:128/1480 train_time:17957ms step_avg:152.18ms step:129/1480 train_time:18103ms step_avg:152.13ms step:130/1480 train_time:18249ms step_avg:152.07ms step:131/1480 train_time:18393ms step_avg:152.01ms step:132/1480 train_time:18539ms step_avg:151.96ms step:133/1480 train_time:18685ms step_avg:151.91ms step:134/1480 train_time:18834ms step_avg:151.89ms step:135/1480 train_time:18980ms step_avg:151.84ms step:136/1480 train_time:19126ms step_avg:151.80ms step:137/1480 train_time:19270ms step_avg:151.73ms step:138/1480 train_time:19415ms step_avg:151.68ms step:139/1480 train_time:19562ms step_avg:151.64ms step:140/1480 train_time:19708ms step_avg:151.60ms step:141/1480 train_time:19853ms step_avg:151.55ms step:142/1480 train_time:19998ms step_avg:151.50ms step:143/1480 train_time:20145ms step_avg:151.47ms step:144/1480 train_time:20290ms step_avg:151.42ms step:145/1480 train_time:20435ms step_avg:151.37ms step:146/1480 train_time:20581ms step_avg:151.33ms step:147/1480 train_time:20728ms step_avg:151.30ms step:148/1480 train_time:20872ms step_avg:151.24ms step:149/1480 train_time:21020ms step_avg:151.22ms step:150/1480 train_time:21166ms step_avg:151.19ms step:151/1480 train_time:21312ms step_avg:151.15ms step:152/1480 train_time:21456ms step_avg:151.10ms step:153/1480 train_time:21603ms step_avg:151.07ms step:154/1480 train_time:21748ms step_avg:151.03ms step:155/1480 train_time:21892ms step_avg:150.98ms step:156/1480 train_time:22039ms step_avg:150.95ms step:157/1480 train_time:22185ms step_avg:150.92ms step:158/1480 train_time:22331ms step_avg:150.88ms step:159/1480 train_time:22477ms step_avg:150.85ms step:160/1480 train_time:22624ms step_avg:150.82ms step:161/1480 train_time:22769ms step_avg:150.79ms step:162/1480 train_time:22915ms step_avg:150.75ms step:163/1480 train_time:23061ms step_avg:150.73ms step:164/1480 train_time:23207ms step_avg:150.70ms step:165/1480 train_time:23352ms step_avg:150.65ms step:166/1480 train_time:23498ms step_avg:150.63ms step:167/1480 train_time:23645ms step_avg:150.60ms step:168/1480 train_time:23789ms step_avg:150.57ms step:169/1480 train_time:23934ms step_avg:150.53ms step:170/1480 train_time:24080ms step_avg:150.50ms step:171/1480 train_time:24227ms step_avg:150.48ms step:172/1480 train_time:24372ms step_avg:150.44ms step:173/1480 train_time:24518ms step_avg:150.42ms step:174/1480 train_time:24665ms step_avg:150.39ms step:175/1480 train_time:24810ms step_avg:150.36ms step:176/1480 train_time:24955ms step_avg:150.33ms step:177/1480 train_time:25101ms step_avg:150.31ms step:178/1480 train_time:25248ms step_avg:150.28ms step:179/1480 train_time:25393ms step_avg:150.25ms step:180/1480 train_time:25539ms step_avg:150.23ms step:181/1480 train_time:25685ms step_avg:150.21ms step:182/1480 train_time:25832ms step_avg:150.18ms step:183/1480 train_time:25978ms step_avg:150.16ms step:184/1480 train_time:26125ms step_avg:150.14ms step:185/1480 train_time:26270ms step_avg:150.11ms step:186/1480 train_time:26416ms step_avg:150.09ms step:187/1480 train_time:26563ms step_avg:150.07ms step:188/1480 train_time:26709ms step_avg:150.05ms step:189/1480 train_time:26871ms step_avg:150.11ms step:190/1480 train_time:27000ms step_avg:150.00ms step:191/1480 train_time:27147ms step_avg:149.98ms step:192/1480 train_time:27291ms step_avg:149.95ms step:193/1480 train_time:27437ms step_avg:149.93ms step:194/1480 train_time:27584ms step_avg:149.91ms step:195/1480 train_time:27730ms step_avg:149.89ms step:196/1480 train_time:27876ms step_avg:149.87ms step:197/1480 train_time:28022ms step_avg:149.85ms step:198/1480 train_time:28168ms step_avg:149.83ms step:199/1480 train_time:28313ms step_avg:149.80ms step:200/1480 train_time:28459ms step_avg:149.78ms step:201/1480 train_time:28605ms step_avg:149.77ms step:202/1480 train_time:28750ms step_avg:149.74ms step:203/1480 train_time:28896ms step_avg:149.72ms step:204/1480 train_time:29043ms step_avg:149.71ms step:205/1480 train_time:29188ms step_avg:149.68ms step:206/1480 train_time:29334ms step_avg:149.66ms step:207/1480 train_time:29481ms step_avg:149.65ms step:208/1480 train_time:29627ms step_avg:149.63ms step:209/1480 train_time:29772ms step_avg:149.61ms step:210/1480 train_time:29918ms step_avg:149.59ms step:211/1480 train_time:30065ms step_avg:149.58ms step:212/1480 train_time:30211ms step_avg:149.56ms step:213/1480 train_time:30357ms step_avg:149.54ms step:214/1480 train_time:30504ms step_avg:149.53ms step:215/1480 train_time:30649ms step_avg:149.51ms step:216/1480 train_time:30794ms step_avg:149.48ms step:217/1480 train_time:30940ms step_avg:149.47ms step:218/1480 train_time:31471ms step_avg:151.30ms step:219/1480 train_time:31579ms step_avg:151.09ms step:220/1480 train_time:31726ms step_avg:151.08ms step:221/1480 train_time:32276ms step_avg:152.97ms step:222/1480 train_time:32381ms step_avg:152.74ms step:223/1480 train_time:32529ms step_avg:152.72ms step:224/1480 train_time:32677ms step_avg:152.70ms step:225/1480 train_time:32826ms step_avg:152.68ms step:226/1480 train_time:32974ms step_avg:152.66ms step:227/1480 train_time:33123ms step_avg:152.64ms step:228/1480 train_time:33272ms step_avg:152.62ms step:229/1480 train_time:33422ms step_avg:152.61ms step:230/1480 train_time:33571ms step_avg:152.60ms step:231/1480 train_time:33721ms step_avg:152.59ms step:232/1480 train_time:33870ms step_avg:152.57ms step:233/1480 train_time:34018ms step_avg:152.55ms step:234/1480 train_time:34167ms step_avg:152.53ms step:235/1480 train_time:34316ms step_avg:152.52ms step:236/1480 train_time:34466ms step_avg:152.50ms step:237/1480 train_time:34614ms step_avg:152.48ms step:238/1480 train_time:34764ms step_avg:152.47ms step:239/1480 train_time:34912ms step_avg:152.45ms step:240/1480 train_time:35062ms step_avg:152.44ms step:241/1480 train_time:35210ms step_avg:152.42ms step:242/1480 train_time:35359ms step_avg:152.41ms step:243/1480 train_time:35508ms step_avg:152.40ms step:244/1480 train_time:35656ms step_avg:152.37ms step:245/1480 train_time:35806ms step_avg:152.36ms step:246/1480 train_time:35954ms step_avg:152.35ms step:247/1480 train_time:36103ms step_avg:152.33ms step:248/1480 train_time:36250ms step_avg:152.31ms step:249/1480 train_time:36398ms step_avg:152.29ms step:250/1480 train_time:36547ms step_avg:152.28ms step:250/1480 val_loss:3.9923 train_time:36613ms step_avg:152.55ms step:251/1480 train_time:36708ms step_avg:152.32ms step:252/1480 train_time:36850ms step_avg:152.27ms step:253/1480 train_time:37000ms step_avg:152.26ms step:254/1480 train_time:37148ms step_avg:152.24ms step:255/1480 train_time:37297ms step_avg:152.23ms step:256/1480 train_time:37444ms step_avg:152.21ms step:257/1480 train_time:37594ms step_avg:152.20ms step:258/1480 train_time:37743ms step_avg:152.19ms step:259/1480 train_time:37893ms step_avg:152.18ms step:260/1480 train_time:38040ms step_avg:152.16ms step:261/1480 train_time:38189ms step_avg:152.15ms step:262/1480 train_time:38338ms step_avg:152.14ms step:263/1480 train_time:38485ms step_avg:152.12ms step:264/1480 train_time:38635ms step_avg:152.11ms step:265/1480 train_time:38784ms step_avg:152.09ms step:266/1480 train_time:38933ms step_avg:152.08ms step:267/1480 train_time:39080ms step_avg:152.06ms step:268/1480 train_time:39228ms step_avg:152.05ms step:269/1480 train_time:39377ms step_avg:152.03ms step:270/1480 train_time:39524ms step_avg:152.02ms step:271/1480 train_time:39673ms step_avg:152.00ms step:272/1480 train_time:39821ms step_avg:151.99ms step:273/1480 train_time:39971ms step_avg:151.98ms step:274/1480 train_time:40119ms step_avg:151.96ms step:275/1480 train_time:40268ms step_avg:151.95ms step:276/1480 train_time:40416ms step_avg:151.94ms step:277/1480 train_time:40564ms step_avg:151.93ms step:278/1480 train_time:40713ms step_avg:151.91ms step:279/1480 train_time:40862ms step_avg:151.90ms step:280/1480 train_time:41011ms step_avg:151.89ms step:281/1480 train_time:41159ms step_avg:151.88ms step:282/1480 train_time:41307ms step_avg:151.86ms step:283/1480 train_time:41456ms step_avg:151.85ms step:284/1480 train_time:41604ms step_avg:151.84ms step:285/1480 train_time:41753ms step_avg:151.83ms step:286/1480 train_time:41900ms step_avg:151.81ms step:287/1480 train_time:42050ms step_avg:151.80ms step:288/1480 train_time:42198ms step_avg:151.79ms step:289/1480 train_time:42346ms step_avg:151.78ms step:290/1480 train_time:42495ms step_avg:151.77ms step:291/1480 train_time:42643ms step_avg:151.75ms step:292/1480 train_time:42793ms step_avg:151.75ms step:293/1480 train_time:42940ms step_avg:151.73ms step:294/1480 train_time:43088ms step_avg:151.72ms step:295/1480 train_time:43237ms step_avg:151.71ms step:296/1480 train_time:43384ms step_avg:151.69ms step:297/1480 train_time:43534ms step_avg:151.69ms step:298/1480 train_time:43681ms step_avg:151.67ms step:299/1480 train_time:43829ms step_avg:151.66ms step:300/1480 train_time:43979ms step_avg:151.65ms step:301/1480 train_time:44127ms step_avg:151.64ms step:302/1480 train_time:44276ms step_avg:151.63ms step:303/1480 train_time:44424ms step_avg:151.62ms step:304/1480 train_time:44573ms step_avg:151.61ms step:305/1480 train_time:44720ms step_avg:151.59ms step:306/1480 train_time:44870ms step_avg:151.59ms step:307/1480 train_time:45019ms step_avg:151.58ms step:308/1480 train_time:45168ms step_avg:151.57ms step:309/1480 train_time:45316ms step_avg:151.56ms step:310/1480 train_time:45465ms step_avg:151.55ms step:311/1480 train_time:45614ms step_avg:151.54ms step:312/1480 train_time:45763ms step_avg:151.53ms step:313/1480 train_time:45912ms step_avg:151.52ms step:314/1480 train_time:46059ms step_avg:151.51ms step:315/1480 train_time:46208ms step_avg:151.50ms step:316/1480 train_time:46357ms step_avg:151.49ms step:317/1480 train_time:46506ms step_avg:151.48ms step:318/1480 train_time:46655ms step_avg:151.48ms step:319/1480 train_time:46803ms step_avg:151.47ms step:320/1480 train_time:46952ms step_avg:151.46ms step:321/1480 train_time:47099ms step_avg:151.44ms step:322/1480 train_time:47248ms step_avg:151.44ms step:323/1480 train_time:47398ms step_avg:151.43ms step:324/1480 train_time:47546ms step_avg:151.42ms step:325/1480 train_time:47696ms step_avg:151.42ms step:326/1480 train_time:47844ms step_avg:151.40ms step:327/1480 train_time:47994ms step_avg:151.40ms step:328/1480 train_time:48141ms step_avg:151.39ms step:329/1480 train_time:48290ms step_avg:151.38ms step:330/1480 train_time:48439ms step_avg:151.37ms step:331/1480 train_time:48590ms step_avg:151.37ms step:332/1480 train_time:48741ms step_avg:151.37ms step:333/1480 train_time:48892ms step_avg:151.37ms step:334/1480 train_time:49042ms step_avg:151.37ms step:335/1480 train_time:49195ms step_avg:151.37ms step:336/1480 train_time:49344ms step_avg:151.36ms step:337/1480 train_time:49496ms step_avg:151.36ms step:338/1480 train_time:49647ms step_avg:151.36ms step:339/1480 train_time:49798ms step_avg:151.36ms step:340/1480 train_time:49950ms step_avg:151.36ms step:341/1480 train_time:50101ms step_avg:151.36ms step:342/1480 train_time:50253ms step_avg:151.36ms step:343/1480 train_time:50403ms step_avg:151.36ms step:344/1480 train_time:50555ms step_avg:151.36ms step:345/1480 train_time:50705ms step_avg:151.36ms step:346/1480 train_time:50856ms step_avg:151.36ms step:347/1480 train_time:51006ms step_avg:151.35ms step:348/1480 train_time:51158ms step_avg:151.36ms step:349/1480 train_time:51309ms step_avg:151.35ms step:350/1480 train_time:51460ms step_avg:151.35ms step:351/1480 train_time:51611ms step_avg:151.35ms step:352/1480 train_time:51762ms step_avg:151.35ms step:353/1480 train_time:51913ms step_avg:151.35ms step:354/1480 train_time:52064ms step_avg:151.35ms step:355/1480 train_time:52214ms step_avg:151.35ms step:356/1480 train_time:52365ms step_avg:151.34ms step:357/1480 train_time:52516ms step_avg:151.34ms step:358/1480 train_time:52666ms step_avg:151.34ms step:359/1480 train_time:52818ms step_avg:151.34ms step:360/1480 train_time:52969ms step_avg:151.34ms step:361/1480 train_time:53120ms step_avg:151.34ms step:362/1480 train_time:53272ms step_avg:151.34ms step:363/1480 train_time:53423ms step_avg:151.34ms step:364/1480 train_time:53575ms step_avg:151.34ms step:365/1480 train_time:53725ms step_avg:151.34ms step:366/1480 train_time:53877ms step_avg:151.34ms step:367/1480 train_time:54027ms step_avg:151.34ms step:368/1480 train_time:54178ms step_avg:151.34ms step:369/1480 train_time:54329ms step_avg:151.33ms step:370/1480 train_time:54479ms step_avg:151.33ms step:371/1480 train_time:54630ms step_avg:151.33ms step:372/1480 train_time:54781ms step_avg:151.33ms step:373/1480 train_time:54932ms step_avg:151.33ms step:374/1480 train_time:55083ms step_avg:151.33ms step:375/1480 train_time:55234ms step_avg:151.33ms step:375/1480 val_loss:3.8046 train_time:55302ms step_avg:151.51ms step:376/1480 train_time:55393ms step_avg:151.35ms step:377/1480 train_time:55543ms step_avg:151.34ms step:378/1480 train_time:55695ms step_avg:151.34ms step:379/1480 train_time:55860ms step_avg:151.38ms step:380/1480 train_time:55995ms step_avg:151.34ms step:381/1480 train_time:56145ms step_avg:151.33ms step:382/1480 train_time:56295ms step_avg:151.33ms step:383/1480 train_time:56446ms step_avg:151.33ms step:384/1480 train_time:56597ms step_avg:151.33ms step:385/1480 train_time:56748ms step_avg:151.33ms step:386/1480 train_time:56898ms step_avg:151.33ms step:387/1480 train_time:57050ms step_avg:151.33ms step:388/1480 train_time:57200ms step_avg:151.32ms step:389/1480 train_time:57351ms step_avg:151.32ms step:390/1480 train_time:57501ms step_avg:151.32ms step:391/1480 train_time:57652ms step_avg:151.32ms step:392/1480 train_time:57802ms step_avg:151.31ms step:393/1480 train_time:57954ms step_avg:151.32ms step:394/1480 train_time:58104ms step_avg:151.31ms step:395/1480 train_time:58255ms step_avg:151.31ms step:396/1480 train_time:58406ms step_avg:151.31ms step:397/1480 train_time:58557ms step_avg:151.31ms step:398/1480 train_time:58708ms step_avg:151.31ms step:399/1480 train_time:58858ms step_avg:151.31ms step:400/1480 train_time:59011ms step_avg:151.31ms step:401/1480 train_time:59161ms step_avg:151.31ms step:402/1480 train_time:59313ms step_avg:151.31ms step:403/1480 train_time:59463ms step_avg:151.30ms step:404/1480 train_time:59614ms step_avg:151.31ms step:405/1480 train_time:59765ms step_avg:151.30ms step:406/1480 train_time:59916ms step_avg:151.30ms step:407/1480 train_time:60068ms step_avg:151.30ms step:408/1480 train_time:60219ms step_avg:151.30ms step:409/1480 train_time:60370ms step_avg:151.30ms step:410/1480 train_time:60519ms step_avg:151.30ms step:411/1480 train_time:60671ms step_avg:151.30ms step:412/1480 train_time:60821ms step_avg:151.30ms step:413/1480 train_time:60973ms step_avg:151.30ms step:414/1480 train_time:61124ms step_avg:151.30ms step:415/1480 train_time:61275ms step_avg:151.30ms step:416/1480 train_time:61426ms step_avg:151.29ms step:417/1480 train_time:61578ms step_avg:151.30ms step:418/1480 train_time:61728ms step_avg:151.29ms step:419/1480 train_time:61878ms step_avg:151.29ms step:420/1480 train_time:62028ms step_avg:151.29ms step:421/1480 train_time:62179ms step_avg:151.29ms step:422/1480 train_time:62330ms step_avg:151.29ms step:423/1480 train_time:62481ms step_avg:151.29ms step:424/1480 train_time:62633ms step_avg:151.29ms step:425/1480 train_time:62784ms step_avg:151.29ms step:426/1480 train_time:62936ms step_avg:151.29ms step:427/1480 train_time:63087ms step_avg:151.29ms step:428/1480 train_time:63237ms step_avg:151.29ms step:429/1480 train_time:63389ms step_avg:151.29ms step:430/1480 train_time:63540ms step_avg:151.28ms step:431/1480 train_time:63691ms step_avg:151.28ms step:432/1480 train_time:63841ms step_avg:151.28ms step:433/1480 train_time:63993ms step_avg:151.28ms step:434/1480 train_time:64143ms step_avg:151.28ms step:435/1480 train_time:64294ms step_avg:151.28ms step:436/1480 train_time:64446ms step_avg:151.28ms step:437/1480 train_time:64597ms step_avg:151.28ms step:438/1480 train_time:64748ms step_avg:151.28ms step:439/1480 train_time:64899ms step_avg:151.28ms step:440/1480 train_time:65051ms step_avg:151.28ms step:441/1480 train_time:65203ms step_avg:151.28ms step:442/1480 train_time:65356ms step_avg:151.29ms step:443/1480 train_time:65509ms step_avg:151.29ms step:444/1480 train_time:65662ms step_avg:151.29ms step:445/1480 train_time:65815ms step_avg:151.30ms step:446/1480 train_time:65968ms step_avg:151.30ms step:447/1480 train_time:66120ms step_avg:151.30ms step:448/1480 train_time:66274ms step_avg:151.31ms step:449/1480 train_time:66427ms step_avg:151.31ms step:450/1480 train_time:66580ms step_avg:151.32ms step:451/1480 train_time:66733ms step_avg:151.32ms step:452/1480 train_time:66887ms step_avg:151.33ms step:453/1480 train_time:67040ms step_avg:151.33ms step:454/1480 train_time:67193ms step_avg:151.33ms step:455/1480 train_time:67345ms step_avg:151.34ms step:456/1480 train_time:67497ms step_avg:151.34ms step:457/1480 train_time:67651ms step_avg:151.34ms step:458/1480 train_time:67802ms step_avg:151.34ms step:459/1480 train_time:67956ms step_avg:151.35ms step:460/1480 train_time:68109ms step_avg:151.35ms step:461/1480 train_time:68262ms step_avg:151.36ms step:462/1480 train_time:68416ms step_avg:151.36ms step:463/1480 train_time:68570ms step_avg:151.37ms step:464/1480 train_time:68722ms step_avg:151.37ms step:465/1480 train_time:68875ms step_avg:151.37ms step:466/1480 train_time:69027ms step_avg:151.37ms step:467/1480 train_time:69180ms step_avg:151.38ms step:468/1480 train_time:69333ms step_avg:151.38ms step:469/1480 train_time:69487ms step_avg:151.39ms step:470/1480 train_time:69640ms step_avg:151.39ms step:471/1480 train_time:69795ms step_avg:151.40ms step:472/1480 train_time:69947ms step_avg:151.40ms step:473/1480 train_time:70101ms step_avg:151.41ms step:474/1480 train_time:70254ms step_avg:151.41ms step:475/1480 train_time:70406ms step_avg:151.41ms step:476/1480 train_time:70559ms step_avg:151.41ms step:477/1480 train_time:70714ms step_avg:151.42ms step:478/1480 train_time:70867ms step_avg:151.43ms step:479/1480 train_time:71020ms step_avg:151.43ms step:480/1480 train_time:71173ms step_avg:151.43ms step:481/1480 train_time:71325ms step_avg:151.43ms step:482/1480 train_time:71477ms step_avg:151.43ms step:483/1480 train_time:71630ms step_avg:151.44ms step:484/1480 train_time:71785ms step_avg:151.44ms step:485/1480 train_time:71938ms step_avg:151.45ms step:486/1480 train_time:72092ms step_avg:151.45ms step:487/1480 train_time:72244ms step_avg:151.46ms step:488/1480 train_time:72397ms step_avg:151.46ms step:489/1480 train_time:72550ms step_avg:151.46ms step:490/1480 train_time:72702ms step_avg:151.46ms step:491/1480 train_time:72854ms step_avg:151.46ms step:492/1480 train_time:73006ms step_avg:151.46ms step:493/1480 train_time:73159ms step_avg:151.47ms step:494/1480 train_time:73313ms step_avg:151.47ms step:495/1480 train_time:73466ms step_avg:151.48ms step:496/1480 train_time:73621ms step_avg:151.48ms step:497/1480 train_time:73774ms step_avg:151.49ms step:498/1480 train_time:73926ms step_avg:151.49ms step:499/1480 train_time:74079ms step_avg:151.49ms step:500/1480 train_time:74231ms step_avg:151.49ms step:500/1480 val_loss:3.6840 train_time:74301ms step_avg:151.64ms step:501/1480 train_time:74392ms step_avg:151.51ms step:502/1480 train_time:74544ms step_avg:151.51ms step:503/1480 train_time:74698ms step_avg:151.52ms step:504/1480 train_time:74850ms step_avg:151.52ms step:505/1480 train_time:75002ms step_avg:151.52ms step:506/1480 train_time:75155ms step_avg:151.52ms step:507/1480 train_time:75306ms step_avg:151.52ms step:508/1480 train_time:75461ms step_avg:151.53ms step:509/1480 train_time:75615ms step_avg:151.53ms step:510/1480 train_time:75767ms step_avg:151.53ms step:511/1480 train_time:75920ms step_avg:151.54ms step:512/1480 train_time:76075ms step_avg:151.54ms step:513/1480 train_time:76228ms step_avg:151.55ms step:514/1480 train_time:76381ms step_avg:151.55ms step:515/1480 train_time:76534ms step_avg:151.55ms step:516/1480 train_time:76688ms step_avg:151.56ms step:517/1480 train_time:76842ms step_avg:151.56ms step:518/1480 train_time:76995ms step_avg:151.57ms step:519/1480 train_time:77149ms step_avg:151.57ms step:520/1480 train_time:77302ms step_avg:151.57ms step:521/1480 train_time:77455ms step_avg:151.58ms step:522/1480 train_time:77608ms step_avg:151.58ms step:523/1480 train_time:77761ms step_avg:151.58ms step:524/1480 train_time:77914ms step_avg:151.58ms step:525/1480 train_time:78066ms step_avg:151.59ms step:526/1480 train_time:78220ms step_avg:151.59ms step:527/1480 train_time:78373ms step_avg:151.59ms step:528/1480 train_time:78527ms step_avg:151.60ms step:529/1480 train_time:78680ms step_avg:151.60ms step:530/1480 train_time:78833ms step_avg:151.60ms step:531/1480 train_time:78986ms step_avg:151.60ms step:532/1480 train_time:79139ms step_avg:151.61ms step:533/1480 train_time:79293ms step_avg:151.61ms step:534/1480 train_time:79446ms step_avg:151.61ms step:535/1480 train_time:79600ms step_avg:151.62ms step:536/1480 train_time:79752ms step_avg:151.62ms step:537/1480 train_time:79905ms step_avg:151.62ms step:538/1480 train_time:80058ms step_avg:151.62ms step:539/1480 train_time:80211ms step_avg:151.63ms step:540/1480 train_time:80364ms step_avg:151.63ms step:541/1480 train_time:80519ms step_avg:151.64ms step:542/1480 train_time:80671ms step_avg:151.64ms step:543/1480 train_time:80824ms step_avg:151.64ms step:544/1480 train_time:80977ms step_avg:151.64ms step:545/1480 train_time:81130ms step_avg:151.64ms step:546/1480 train_time:81282ms step_avg:151.65ms step:547/1480 train_time:81434ms step_avg:151.65ms step:548/1480 train_time:81588ms step_avg:151.65ms step:549/1480 train_time:81740ms step_avg:151.65ms step:550/1480 train_time:81896ms step_avg:151.66ms step:551/1480 train_time:82051ms step_avg:151.67ms step:552/1480 train_time:82205ms step_avg:151.67ms step:553/1480 train_time:82360ms step_avg:151.68ms step:554/1480 train_time:82514ms step_avg:151.68ms step:555/1480 train_time:82668ms step_avg:151.68ms step:556/1480 train_time:82822ms step_avg:151.69ms step:557/1480 train_time:82979ms step_avg:151.70ms step:558/1480 train_time:83133ms step_avg:151.70ms step:559/1480 train_time:83289ms step_avg:151.71ms step:560/1480 train_time:83443ms step_avg:151.71ms step:561/1480 train_time:83598ms step_avg:151.72ms step:562/1480 train_time:83753ms step_avg:151.73ms step:563/1480 train_time:83907ms step_avg:151.73ms step:564/1480 train_time:84062ms step_avg:151.74ms step:565/1480 train_time:84216ms step_avg:151.74ms step:566/1480 train_time:84372ms step_avg:151.75ms step:567/1480 train_time:84527ms step_avg:151.75ms step:568/1480 train_time:84682ms step_avg:151.76ms step:569/1480 train_time:84848ms step_avg:151.79ms step:570/1480 train_time:84991ms step_avg:151.77ms step:571/1480 train_time:85146ms step_avg:151.78ms step:572/1480 train_time:85301ms step_avg:151.78ms step:573/1480 train_time:85456ms step_avg:151.79ms step:574/1480 train_time:85611ms step_avg:151.79ms step:575/1480 train_time:85766ms step_avg:151.80ms step:576/1480 train_time:85922ms step_avg:151.80ms step:577/1480 train_time:86076ms step_avg:151.81ms step:578/1480 train_time:86230ms step_avg:151.81ms step:579/1480 train_time:86385ms step_avg:151.82ms step:580/1480 train_time:86538ms step_avg:151.82ms step:581/1480 train_time:86694ms step_avg:151.83ms step:582/1480 train_time:86849ms step_avg:151.83ms step:583/1480 train_time:87003ms step_avg:151.84ms step:584/1480 train_time:87159ms step_avg:151.84ms step:585/1480 train_time:87312ms step_avg:151.85ms step:586/1480 train_time:87467ms step_avg:151.85ms step:587/1480 train_time:87622ms step_avg:151.86ms step:588/1480 train_time:87777ms step_avg:151.86ms step:589/1480 train_time:87931ms step_avg:151.87ms step:590/1480 train_time:88087ms step_avg:151.87ms step:591/1480 train_time:88241ms step_avg:151.88ms step:592/1480 train_time:88397ms step_avg:151.88ms step:593/1480 train_time:88552ms step_avg:151.89ms step:594/1480 train_time:88707ms step_avg:151.90ms step:595/1480 train_time:88863ms step_avg:151.90ms step:596/1480 train_time:89019ms step_avg:151.91ms step:597/1480 train_time:89174ms step_avg:151.91ms step:598/1480 train_time:89328ms step_avg:151.92ms step:599/1480 train_time:89483ms step_avg:151.92ms step:600/1480 train_time:89637ms step_avg:151.93ms step:601/1480 train_time:89794ms step_avg:151.94ms step:602/1480 train_time:89950ms step_avg:151.94ms step:603/1480 train_time:90105ms step_avg:151.95ms step:604/1480 train_time:90260ms step_avg:151.95ms step:605/1480 train_time:90414ms step_avg:151.96ms step:606/1480 train_time:90570ms step_avg:151.96ms step:607/1480 train_time:90725ms step_avg:151.97ms step:608/1480 train_time:90881ms step_avg:151.97ms step:609/1480 train_time:91035ms step_avg:151.98ms step:610/1480 train_time:91189ms step_avg:151.98ms step:611/1480 train_time:91343ms step_avg:151.99ms step:612/1480 train_time:91499ms step_avg:151.99ms step:613/1480 train_time:91654ms step_avg:152.00ms step:614/1480 train_time:91810ms step_avg:152.00ms step:615/1480 train_time:91964ms step_avg:152.01ms step:616/1480 train_time:92119ms step_avg:152.01ms step:617/1480 train_time:92274ms step_avg:152.02ms step:618/1480 train_time:92429ms step_avg:152.02ms step:619/1480 train_time:92584ms step_avg:152.03ms step:620/1480 train_time:92739ms step_avg:152.03ms step:621/1480 train_time:92894ms step_avg:152.04ms step:622/1480 train_time:93051ms step_avg:152.04ms step:623/1480 train_time:93205ms step_avg:152.05ms step:624/1480 train_time:93360ms step_avg:152.05ms step:625/1480 train_time:93513ms step_avg:152.05ms step:625/1480 val_loss:3.6072 train_time:93584ms step_avg:152.17ms step:626/1480 train_time:93676ms step_avg:152.07ms step:627/1480 train_time:93831ms step_avg:152.08ms step:628/1480 train_time:93986ms step_avg:152.08ms step:629/1480 train_time:94140ms step_avg:152.08ms step:630/1480 train_time:94295ms step_avg:152.09ms step:631/1480 train_time:94449ms step_avg:152.09ms step:632/1480 train_time:94603ms step_avg:152.10ms step:633/1480 train_time:94759ms step_avg:152.10ms step:634/1480 train_time:94913ms step_avg:152.10ms step:635/1480 train_time:95068ms step_avg:152.11ms step:636/1480 train_time:95223ms step_avg:152.11ms step:637/1480 train_time:95378ms step_avg:152.12ms step:638/1480 train_time:95532ms step_avg:152.12ms step:639/1480 train_time:95686ms step_avg:152.12ms step:640/1480 train_time:95841ms step_avg:152.13ms step:641/1480 train_time:95995ms step_avg:152.13ms step:642/1480 train_time:96150ms step_avg:152.14ms step:643/1480 train_time:96305ms step_avg:152.14ms step:644/1480 train_time:96459ms step_avg:152.14ms step:645/1480 train_time:96614ms step_avg:152.15ms step:646/1480 train_time:96768ms step_avg:152.15ms step:647/1480 train_time:96924ms step_avg:152.16ms step:648/1480 train_time:97081ms step_avg:152.16ms step:649/1480 train_time:97235ms step_avg:152.17ms step:650/1480 train_time:97391ms step_avg:152.17ms step:651/1480 train_time:97546ms step_avg:152.18ms step:652/1480 train_time:97701ms step_avg:152.18ms step:653/1480 train_time:97856ms step_avg:152.19ms step:654/1480 train_time:98010ms step_avg:152.19ms step:655/1480 train_time:98167ms step_avg:152.20ms step:656/1480 train_time:98321ms step_avg:152.20ms step:657/1480 train_time:98476ms step_avg:152.20ms step:658/1480 train_time:98631ms step_avg:152.21ms step:659/1480 train_time:98786ms step_avg:152.21ms step:660/1480 train_time:98941ms step_avg:152.22ms step:661/1480 train_time:99099ms step_avg:152.23ms step:662/1480 train_time:99255ms step_avg:152.23ms step:663/1480 train_time:99410ms step_avg:152.24ms step:664/1480 train_time:99567ms step_avg:152.24ms step:665/1480 train_time:99723ms step_avg:152.25ms step:666/1480 train_time:99880ms step_avg:152.26ms step:667/1480 train_time:100035ms step_avg:152.26ms step:668/1480 train_time:100193ms step_avg:152.27ms step:669/1480 train_time:100351ms step_avg:152.28ms step:670/1480 train_time:100507ms step_avg:152.28ms step:671/1480 train_time:100662ms step_avg:152.29ms step:672/1480 train_time:100819ms step_avg:152.29ms step:673/1480 train_time:100976ms step_avg:152.30ms step:674/1480 train_time:101132ms step_avg:152.31ms step:675/1480 train_time:101290ms step_avg:152.32ms step:676/1480 train_time:101448ms step_avg:152.32ms step:677/1480 train_time:101604ms step_avg:152.33ms step:678/1480 train_time:101760ms step_avg:152.34ms step:679/1480 train_time:101917ms step_avg:152.34ms step:680/1480 train_time:102074ms step_avg:152.35ms step:681/1480 train_time:102229ms step_avg:152.35ms step:682/1480 train_time:102386ms step_avg:152.36ms step:683/1480 train_time:102543ms step_avg:152.37ms step:684/1480 train_time:102701ms step_avg:152.38ms step:685/1480 train_time:102858ms step_avg:152.38ms step:686/1480 train_time:103014ms step_avg:152.39ms step:687/1480 train_time:103169ms step_avg:152.39ms step:688/1480 train_time:103326ms step_avg:152.40ms step:689/1480 train_time:103484ms step_avg:152.41ms step:690/1480 train_time:103641ms step_avg:152.41ms step:691/1480 train_time:103799ms step_avg:152.42ms step:692/1480 train_time:103955ms step_avg:152.43ms step:693/1480 train_time:104111ms step_avg:152.43ms step:694/1480 train_time:104268ms step_avg:152.44ms step:695/1480 train_time:104424ms step_avg:152.44ms step:696/1480 train_time:104580ms step_avg:152.45ms step:697/1480 train_time:104736ms step_avg:152.45ms step:698/1480 train_time:104893ms step_avg:152.46ms step:699/1480 train_time:105050ms step_avg:152.47ms step:700/1480 train_time:105207ms step_avg:152.47ms step:701/1480 train_time:105362ms step_avg:152.48ms step:702/1480 train_time:105519ms step_avg:152.48ms step:703/1480 train_time:105676ms step_avg:152.49ms step:704/1480 train_time:105832ms step_avg:152.50ms step:705/1480 train_time:105988ms step_avg:152.50ms step:706/1480 train_time:106146ms step_avg:152.51ms step:707/1480 train_time:106304ms step_avg:152.52ms step:708/1480 train_time:106460ms step_avg:152.52ms step:709/1480 train_time:106616ms step_avg:152.53ms step:710/1480 train_time:106772ms step_avg:152.53ms step:711/1480 train_time:106927ms step_avg:152.54ms step:712/1480 train_time:107086ms step_avg:152.54ms step:713/1480 train_time:107243ms step_avg:152.55ms step:714/1480 train_time:107399ms step_avg:152.56ms step:715/1480 train_time:107554ms step_avg:152.56ms step:716/1480 train_time:107709ms step_avg:152.56ms step:717/1480 train_time:107864ms step_avg:152.57ms step:718/1480 train_time:108021ms step_avg:152.57ms step:719/1480 train_time:108176ms step_avg:152.58ms step:720/1480 train_time:108333ms step_avg:152.58ms step:721/1480 train_time:108491ms step_avg:152.59ms step:722/1480 train_time:108648ms step_avg:152.60ms step:723/1480 train_time:108803ms step_avg:152.60ms step:724/1480 train_time:108959ms step_avg:152.60ms step:725/1480 train_time:109117ms step_avg:152.61ms step:726/1480 train_time:109274ms step_avg:152.62ms step:727/1480 train_time:109432ms step_avg:152.62ms step:728/1480 train_time:109588ms step_avg:152.63ms step:729/1480 train_time:109744ms step_avg:152.63ms step:730/1480 train_time:109901ms step_avg:152.64ms step:731/1480 train_time:110058ms step_avg:152.65ms step:732/1480 train_time:110213ms step_avg:152.65ms step:733/1480 train_time:110371ms step_avg:152.66ms step:734/1480 train_time:110527ms step_avg:152.66ms step:735/1480 train_time:110684ms step_avg:152.67ms step:736/1480 train_time:110840ms step_avg:152.67ms step:737/1480 train_time:110996ms step_avg:152.68ms step:738/1480 train_time:111152ms step_avg:152.68ms step:739/1480 train_time:111307ms step_avg:152.68ms step:740/1480 train_time:111466ms step_avg:152.69ms step:741/1480 train_time:111623ms step_avg:152.70ms step:742/1480 train_time:111780ms step_avg:152.71ms step:743/1480 train_time:111935ms step_avg:152.71ms step:744/1480 train_time:112093ms step_avg:152.72ms step:745/1480 train_time:112251ms step_avg:152.72ms step:746/1480 train_time:112407ms step_avg:152.73ms step:747/1480 train_time:112565ms step_avg:152.73ms step:748/1480 train_time:112723ms step_avg:152.74ms step:749/1480 train_time:112880ms step_avg:152.75ms step:750/1480 train_time:113035ms step_avg:152.75ms step:750/1480 val_loss:3.5481 train_time:113108ms step_avg:152.85ms step:751/1480 train_time:113200ms step_avg:152.77ms step:752/1480 train_time:113353ms step_avg:152.77ms step:753/1480 train_time:113510ms step_avg:152.77ms step:754/1480 train_time:113666ms step_avg:152.78ms step:755/1480 train_time:113823ms step_avg:152.78ms step:756/1480 train_time:113979ms step_avg:152.79ms step:757/1480 train_time:114137ms step_avg:152.79ms step:758/1480 train_time:114292ms step_avg:152.80ms step:759/1480 train_time:114459ms step_avg:152.82ms step:760/1480 train_time:114605ms step_avg:152.81ms step:761/1480 train_time:114761ms step_avg:152.81ms step:762/1480 train_time:114918ms step_avg:152.82ms step:763/1480 train_time:115074ms step_avg:152.82ms step:764/1480 train_time:115232ms step_avg:152.83ms step:765/1480 train_time:115388ms step_avg:152.83ms step:766/1480 train_time:115547ms step_avg:152.84ms step:767/1480 train_time:115704ms step_avg:152.84ms step:768/1480 train_time:115860ms step_avg:152.85ms step:769/1480 train_time:116018ms step_avg:152.86ms step:770/1480 train_time:116175ms step_avg:152.86ms step:771/1480 train_time:116332ms step_avg:152.87ms step:772/1480 train_time:116490ms step_avg:152.87ms step:773/1480 train_time:116648ms step_avg:152.88ms step:774/1480 train_time:116806ms step_avg:152.89ms step:775/1480 train_time:116963ms step_avg:152.89ms step:776/1480 train_time:117120ms step_avg:152.90ms step:777/1480 train_time:117280ms step_avg:152.91ms step:778/1480 train_time:117439ms step_avg:152.92ms step:779/1480 train_time:117596ms step_avg:152.92ms step:780/1480 train_time:117758ms step_avg:152.93ms step:781/1480 train_time:117917ms step_avg:152.94ms step:782/1480 train_time:118076ms step_avg:152.95ms step:783/1480 train_time:118233ms step_avg:152.95ms step:784/1480 train_time:118391ms step_avg:152.96ms step:785/1480 train_time:118547ms step_avg:152.96ms step:786/1480 train_time:118705ms step_avg:152.97ms step:787/1480 train_time:118863ms step_avg:152.98ms step:788/1480 train_time:119020ms step_avg:152.98ms step:789/1480 train_time:119178ms step_avg:152.99ms step:790/1480 train_time:119336ms step_avg:153.00ms step:791/1480 train_time:119497ms step_avg:153.00ms step:792/1480 train_time:119657ms step_avg:153.01ms step:793/1480 train_time:119815ms step_avg:153.02ms step:794/1480 train_time:119973ms step_avg:153.03ms step:795/1480 train_time:120133ms step_avg:153.04ms step:796/1480 train_time:120292ms step_avg:153.04ms step:797/1480 train_time:120452ms step_avg:153.05ms step:798/1480 train_time:120612ms step_avg:153.06ms step:799/1480 train_time:120774ms step_avg:153.07ms step:800/1480 train_time:120933ms step_avg:153.08ms step:801/1480 train_time:121089ms step_avg:153.08ms step:802/1480 train_time:121249ms step_avg:153.09ms step:803/1480 train_time:121406ms step_avg:153.10ms step:804/1480 train_time:121563ms step_avg:153.10ms step:805/1480 train_time:121721ms step_avg:153.11ms step:806/1480 train_time:121879ms step_avg:153.11ms step:807/1480 train_time:122035ms step_avg:153.12ms step:808/1480 train_time:122194ms step_avg:153.13ms step:809/1480 train_time:122352ms step_avg:153.13ms step:810/1480 train_time:122512ms step_avg:153.14ms step:811/1480 train_time:122669ms step_avg:153.15ms step:812/1480 train_time:122827ms step_avg:153.15ms step:813/1480 train_time:122984ms step_avg:153.16ms step:814/1480 train_time:123141ms step_avg:153.16ms step:815/1480 train_time:123298ms step_avg:153.17ms step:816/1480 train_time:123459ms step_avg:153.18ms step:817/1480 train_time:123617ms step_avg:153.18ms step:818/1480 train_time:123774ms step_avg:153.19ms step:819/1480 train_time:123932ms step_avg:153.19ms step:820/1480 train_time:124091ms step_avg:153.20ms step:821/1480 train_time:124249ms step_avg:153.21ms step:822/1480 train_time:124406ms step_avg:153.21ms step:823/1480 train_time:124564ms step_avg:153.21ms step:824/1480 train_time:124720ms step_avg:153.22ms step:825/1480 train_time:124881ms step_avg:153.23ms step:826/1480 train_time:125040ms step_avg:153.23ms step:827/1480 train_time:125200ms step_avg:153.24ms step:828/1480 train_time:125359ms step_avg:153.25ms step:829/1480 train_time:125518ms step_avg:153.26ms step:830/1480 train_time:125678ms step_avg:153.27ms step:831/1480 train_time:125837ms step_avg:153.27ms step:832/1480 train_time:125994ms step_avg:153.28ms step:833/1480 train_time:126152ms step_avg:153.28ms step:834/1480 train_time:126312ms step_avg:153.29ms step:835/1480 train_time:126470ms step_avg:153.30ms step:836/1480 train_time:126628ms step_avg:153.30ms step:837/1480 train_time:126785ms step_avg:153.31ms step:838/1480 train_time:126942ms step_avg:153.31ms step:839/1480 train_time:127100ms step_avg:153.32ms step:840/1480 train_time:127257ms step_avg:153.32ms step:841/1480 train_time:127414ms step_avg:153.33ms step:842/1480 train_time:127573ms step_avg:153.33ms step:843/1480 train_time:127732ms step_avg:153.34ms step:844/1480 train_time:127888ms step_avg:153.34ms step:845/1480 train_time:128045ms step_avg:153.35ms step:846/1480 train_time:128205ms step_avg:153.36ms step:847/1480 train_time:128364ms step_avg:153.36ms step:848/1480 train_time:128520ms step_avg:153.36ms step:849/1480 train_time:128679ms step_avg:153.37ms step:850/1480 train_time:128836ms step_avg:153.38ms step:851/1480 train_time:128995ms step_avg:153.38ms step:852/1480 train_time:129154ms step_avg:153.39ms step:853/1480 train_time:129313ms step_avg:153.40ms step:854/1480 train_time:129470ms step_avg:153.40ms step:855/1480 train_time:129626ms step_avg:153.40ms step:856/1480 train_time:129785ms step_avg:153.41ms step:857/1480 train_time:129943ms step_avg:153.42ms step:858/1480 train_time:130103ms step_avg:153.42ms step:859/1480 train_time:130261ms step_avg:153.43ms step:860/1480 train_time:130417ms step_avg:153.43ms step:861/1480 train_time:130578ms step_avg:153.44ms step:862/1480 train_time:130741ms step_avg:153.45ms step:863/1480 train_time:130900ms step_avg:153.46ms step:864/1480 train_time:131059ms step_avg:153.47ms step:865/1480 train_time:131217ms step_avg:153.47ms step:866/1480 train_time:131376ms step_avg:153.48ms step:867/1480 train_time:131537ms step_avg:153.48ms step:868/1480 train_time:131693ms step_avg:153.49ms step:869/1480 train_time:131851ms step_avg:153.49ms step:870/1480 train_time:132010ms step_avg:153.50ms step:871/1480 train_time:132167ms step_avg:153.50ms step:872/1480 train_time:132324ms step_avg:153.51ms step:873/1480 train_time:132481ms step_avg:153.51ms step:874/1480 train_time:132642ms step_avg:153.52ms step:875/1480 train_time:132801ms step_avg:153.53ms step:875/1480 val_loss:3.5013 train_time:132872ms step_avg:153.61ms step:876/1480 train_time:132966ms step_avg:153.54ms step:877/1480 train_time:133118ms step_avg:153.54ms step:878/1480 train_time:133277ms step_avg:153.54ms step:879/1480 train_time:133435ms step_avg:153.55ms step:880/1480 train_time:133593ms step_avg:153.55ms step:881/1480 train_time:133751ms step_avg:153.56ms step:882/1480 train_time:133909ms step_avg:153.57ms step:883/1480 train_time:134068ms step_avg:153.57ms step:884/1480 train_time:134230ms step_avg:153.58ms step:885/1480 train_time:134392ms step_avg:153.59ms step:886/1480 train_time:134552ms step_avg:153.60ms step:887/1480 train_time:134712ms step_avg:153.61ms step:888/1480 train_time:134877ms step_avg:153.62ms step:889/1480 train_time:135037ms step_avg:153.63ms step:890/1480 train_time:135193ms step_avg:153.63ms step:891/1480 train_time:135353ms step_avg:153.64ms step:892/1480 train_time:135513ms step_avg:153.64ms step:893/1480 train_time:135671ms step_avg:153.65ms step:894/1480 train_time:135832ms step_avg:153.66ms step:895/1480 train_time:135993ms step_avg:153.66ms step:896/1480 train_time:136152ms step_avg:153.67ms step:897/1480 train_time:136312ms step_avg:153.68ms step:898/1480 train_time:136473ms step_avg:153.69ms step:899/1480 train_time:136633ms step_avg:153.69ms step:900/1480 train_time:136791ms step_avg:153.70ms step:901/1480 train_time:136949ms step_avg:153.70ms step:902/1480 train_time:137108ms step_avg:153.71ms step:903/1480 train_time:137269ms step_avg:153.72ms step:904/1480 train_time:137429ms step_avg:153.72ms step:905/1480 train_time:137587ms step_avg:153.73ms step:906/1480 train_time:137748ms step_avg:153.74ms step:907/1480 train_time:137910ms step_avg:153.75ms step:908/1480 train_time:138067ms step_avg:153.75ms step:909/1480 train_time:138228ms step_avg:153.76ms step:910/1480 train_time:138392ms step_avg:153.77ms step:911/1480 train_time:138551ms step_avg:153.77ms step:912/1480 train_time:138713ms step_avg:153.78ms step:913/1480 train_time:138874ms step_avg:153.79ms step:914/1480 train_time:139035ms step_avg:153.80ms step:915/1480 train_time:139197ms step_avg:153.81ms step:916/1480 train_time:139356ms step_avg:153.81ms step:917/1480 train_time:139515ms step_avg:153.82ms step:918/1480 train_time:139675ms step_avg:153.83ms step:919/1480 train_time:139837ms step_avg:153.84ms step:920/1480 train_time:139995ms step_avg:153.84ms step:921/1480 train_time:140156ms step_avg:153.85ms step:922/1480 train_time:140318ms step_avg:153.86ms step:923/1480 train_time:140476ms step_avg:153.86ms step:924/1480 train_time:140635ms step_avg:153.87ms step:925/1480 train_time:140793ms step_avg:153.87ms step:926/1480 train_time:140952ms step_avg:153.88ms step:927/1480 train_time:141110ms step_avg:153.88ms step:928/1480 train_time:141270ms step_avg:153.89ms step:929/1480 train_time:141432ms step_avg:153.90ms step:930/1480 train_time:141591ms step_avg:153.90ms step:931/1480 train_time:141751ms step_avg:153.91ms step:932/1480 train_time:141911ms step_avg:153.92ms step:933/1480 train_time:142069ms step_avg:153.92ms step:934/1480 train_time:142230ms step_avg:153.93ms step:935/1480 train_time:142391ms step_avg:153.94ms step:936/1480 train_time:142551ms step_avg:153.94ms step:937/1480 train_time:142713ms step_avg:153.95ms step:938/1480 train_time:142870ms step_avg:153.96ms step:939/1480 train_time:143033ms step_avg:153.96ms step:940/1480 train_time:143195ms step_avg:153.97ms step:941/1480 train_time:143356ms step_avg:153.98ms step:942/1480 train_time:143514ms step_avg:153.99ms step:943/1480 train_time:143675ms step_avg:153.99ms step:944/1480 train_time:143839ms step_avg:154.00ms step:945/1480 train_time:143996ms step_avg:154.01ms step:946/1480 train_time:144158ms step_avg:154.02ms step:947/1480 train_time:144319ms step_avg:154.02ms step:948/1480 train_time:144477ms step_avg:154.03ms step:949/1480 train_time:144645ms step_avg:154.04ms step:950/1480 train_time:144794ms step_avg:154.04ms step:951/1480 train_time:144956ms step_avg:154.04ms step:952/1480 train_time:145115ms step_avg:154.05ms step:953/1480 train_time:145274ms step_avg:154.06ms step:954/1480 train_time:145438ms step_avg:154.07ms step:955/1480 train_time:145595ms step_avg:154.07ms step:956/1480 train_time:145754ms step_avg:154.07ms step:957/1480 train_time:145914ms step_avg:154.08ms step:958/1480 train_time:146077ms step_avg:154.09ms step:959/1480 train_time:146235ms step_avg:154.09ms step:960/1480 train_time:146394ms step_avg:154.10ms step:961/1480 train_time:146553ms step_avg:154.10ms step:962/1480 train_time:146713ms step_avg:154.11ms step:963/1480 train_time:146873ms step_avg:154.12ms step:964/1480 train_time:147035ms step_avg:154.12ms step:965/1480 train_time:147195ms step_avg:154.13ms step:966/1480 train_time:147354ms step_avg:154.14ms step:967/1480 train_time:147513ms step_avg:154.14ms step:968/1480 train_time:147672ms step_avg:154.15ms step:969/1480 train_time:147832ms step_avg:154.15ms step:970/1480 train_time:147991ms step_avg:154.16ms step:971/1480 train_time:148150ms step_avg:154.16ms step:972/1480 train_time:148310ms step_avg:154.17ms step:973/1480 train_time:148468ms step_avg:154.17ms step:974/1480 train_time:148629ms step_avg:154.18ms step:975/1480 train_time:148791ms step_avg:154.19ms step:976/1480 train_time:148952ms step_avg:154.19ms step:977/1480 train_time:149112ms step_avg:154.20ms step:978/1480 train_time:149271ms step_avg:154.21ms step:979/1480 train_time:149433ms step_avg:154.21ms step:980/1480 train_time:149593ms step_avg:154.22ms step:981/1480 train_time:149756ms step_avg:154.23ms step:982/1480 train_time:149915ms step_avg:154.23ms step:983/1480 train_time:150075ms step_avg:154.24ms step:984/1480 train_time:150234ms step_avg:154.24ms step:985/1480 train_time:150396ms step_avg:154.25ms step:986/1480 train_time:150556ms step_avg:154.26ms step:987/1480 train_time:150714ms step_avg:154.26ms step:988/1480 train_time:150875ms step_avg:154.27ms step:989/1480 train_time:151035ms step_avg:154.27ms step:990/1480 train_time:151195ms step_avg:154.28ms step:991/1480 train_time:151356ms step_avg:154.29ms step:992/1480 train_time:151522ms step_avg:154.30ms step:993/1480 train_time:151691ms step_avg:154.31ms step:994/1480 train_time:151852ms step_avg:154.32ms step:995/1480 train_time:152012ms step_avg:154.33ms step:996/1480 train_time:152169ms step_avg:154.33ms step:997/1480 train_time:152329ms step_avg:154.33ms step:998/1480 train_time:152488ms step_avg:154.34ms step:999/1480 train_time:152647ms step_avg:154.35ms step:1000/1480 train_time:152812ms step_avg:154.36ms step:1000/1480 val_loss:3.4390 train_time:152885ms step_avg:154.43ms step:1001/1480 train_time:152975ms step_avg:154.36ms step:1002/1480 train_time:153135ms step_avg:154.37ms step:1003/1480 train_time:153298ms step_avg:154.38ms step:1004/1480 train_time:153459ms step_avg:154.38ms step:1005/1480 train_time:153620ms step_avg:154.39ms step:1006/1480 train_time:153781ms step_avg:154.40ms step:1007/1480 train_time:153941ms step_avg:154.40ms step:1008/1480 train_time:154101ms step_avg:154.41ms step:1009/1480 train_time:154264ms step_avg:154.42ms step:1010/1480 train_time:154424ms step_avg:154.42ms step:1011/1480 train_time:154583ms step_avg:154.43ms step:1012/1480 train_time:154742ms step_avg:154.43ms step:1013/1480 train_time:154903ms step_avg:154.44ms step:1014/1480 train_time:155062ms step_avg:154.44ms step:1015/1480 train_time:155224ms step_avg:154.45ms step:1016/1480 train_time:155384ms step_avg:154.46ms step:1017/1480 train_time:155544ms step_avg:154.46ms step:1018/1480 train_time:155704ms step_avg:154.47ms step:1019/1480 train_time:155865ms step_avg:154.47ms step:1020/1480 train_time:156024ms step_avg:154.48ms step:1021/1480 train_time:156183ms step_avg:154.48ms step:1022/1480 train_time:156342ms step_avg:154.49ms step:1023/1480 train_time:156502ms step_avg:154.49ms step:1024/1480 train_time:156660ms step_avg:154.50ms step:1025/1480 train_time:156823ms step_avg:154.51ms step:1026/1480 train_time:156982ms step_avg:154.51ms step:1027/1480 train_time:157141ms step_avg:154.51ms step:1028/1480 train_time:157303ms step_avg:154.52ms step:1029/1480 train_time:157465ms step_avg:154.53ms step:1030/1480 train_time:157625ms step_avg:154.53ms step:1031/1480 train_time:157784ms step_avg:154.54ms step:1032/1480 train_time:157946ms step_avg:154.55ms step:1033/1480 train_time:158105ms step_avg:154.55ms step:1034/1480 train_time:158264ms step_avg:154.55ms step:1035/1480 train_time:158425ms step_avg:154.56ms step:1036/1480 train_time:158586ms step_avg:154.57ms step:1037/1480 train_time:158747ms step_avg:154.57ms step:1038/1480 train_time:158905ms step_avg:154.58ms step:1039/1480 train_time:159068ms step_avg:154.58ms step:1040/1480 train_time:159228ms step_avg:154.59ms step:1041/1480 train_time:159391ms step_avg:154.60ms step:1042/1480 train_time:159550ms step_avg:154.60ms step:1043/1480 train_time:159710ms step_avg:154.61ms step:1044/1480 train_time:159868ms step_avg:154.61ms step:1045/1480 train_time:160030ms step_avg:154.62ms step:1046/1480 train_time:160192ms step_avg:154.63ms step:1047/1480 train_time:160352ms step_avg:154.63ms step:1048/1480 train_time:160514ms step_avg:154.64ms step:1049/1480 train_time:160675ms step_avg:154.64ms step:1050/1480 train_time:160836ms step_avg:154.65ms step:1051/1480 train_time:160998ms step_avg:154.66ms step:1052/1480 train_time:161159ms step_avg:154.66ms step:1053/1480 train_time:161321ms step_avg:154.67ms step:1054/1480 train_time:161482ms step_avg:154.68ms step:1055/1480 train_time:161641ms step_avg:154.68ms step:1056/1480 train_time:161800ms step_avg:154.68ms step:1057/1480 train_time:161960ms step_avg:154.69ms step:1058/1480 train_time:162122ms step_avg:154.70ms step:1059/1480 train_time:162284ms step_avg:154.70ms step:1060/1480 train_time:162445ms step_avg:154.71ms step:1061/1480 train_time:162603ms step_avg:154.71ms step:1062/1480 train_time:162761ms step_avg:154.72ms step:1063/1480 train_time:162921ms step_avg:154.72ms step:1064/1480 train_time:163079ms step_avg:154.72ms step:1065/1480 train_time:163240ms step_avg:154.73ms step:1066/1480 train_time:163402ms step_avg:154.74ms step:1067/1480 train_time:163563ms step_avg:154.74ms step:1068/1480 train_time:163723ms step_avg:154.75ms step:1069/1480 train_time:163886ms step_avg:154.76ms step:1070/1480 train_time:164046ms step_avg:154.76ms step:1071/1480 train_time:164210ms step_avg:154.77ms step:1072/1480 train_time:164369ms step_avg:154.77ms step:1073/1480 train_time:164528ms step_avg:154.78ms step:1074/1480 train_time:164689ms step_avg:154.78ms step:1075/1480 train_time:164849ms step_avg:154.79ms step:1076/1480 train_time:165011ms step_avg:154.79ms step:1077/1480 train_time:165172ms step_avg:154.80ms step:1078/1480 train_time:165339ms step_avg:154.81ms step:1079/1480 train_time:165502ms step_avg:154.82ms step:1080/1480 train_time:165662ms step_avg:154.82ms step:1081/1480 train_time:165822ms step_avg:154.83ms step:1082/1480 train_time:165982ms step_avg:154.83ms step:1083/1480 train_time:166141ms step_avg:154.84ms step:1084/1480 train_time:166301ms step_avg:154.84ms step:1085/1480 train_time:166459ms step_avg:154.85ms step:1086/1480 train_time:166621ms step_avg:154.85ms step:1087/1480 train_time:166781ms step_avg:154.86ms step:1088/1480 train_time:166942ms step_avg:154.86ms step:1089/1480 train_time:167106ms step_avg:154.87ms step:1090/1480 train_time:167269ms step_avg:154.88ms step:1091/1480 train_time:167429ms step_avg:154.88ms step:1092/1480 train_time:167591ms step_avg:154.89ms step:1093/1480 train_time:167753ms step_avg:154.90ms step:1094/1480 train_time:167913ms step_avg:154.90ms step:1095/1480 train_time:168073ms step_avg:154.91ms step:1096/1480 train_time:168236ms step_avg:154.91ms step:1097/1480 train_time:168401ms step_avg:154.92ms step:1098/1480 train_time:168562ms step_avg:154.93ms step:1099/1480 train_time:168724ms step_avg:154.94ms step:1100/1480 train_time:168886ms step_avg:154.94ms step:1101/1480 train_time:169046ms step_avg:154.95ms step:1102/1480 train_time:169209ms step_avg:154.95ms step:1103/1480 train_time:169375ms step_avg:154.96ms step:1104/1480 train_time:169538ms step_avg:154.97ms step:1105/1480 train_time:169701ms step_avg:154.98ms step:1106/1480 train_time:169862ms step_avg:154.98ms step:1107/1480 train_time:170024ms step_avg:154.99ms step:1108/1480 train_time:170183ms step_avg:154.99ms step:1109/1480 train_time:170343ms step_avg:155.00ms step:1110/1480 train_time:170502ms step_avg:155.00ms step:1111/1480 train_time:170661ms step_avg:155.01ms step:1112/1480 train_time:170824ms step_avg:155.01ms step:1113/1480 train_time:170992ms step_avg:155.02ms step:1114/1480 train_time:171156ms step_avg:155.03ms step:1115/1480 train_time:171319ms step_avg:155.04ms step:1116/1480 train_time:171479ms step_avg:155.04ms step:1117/1480 train_time:171642ms step_avg:155.05ms step:1118/1480 train_time:171807ms step_avg:155.06ms step:1119/1480 train_time:171968ms step_avg:155.07ms step:1120/1480 train_time:172128ms step_avg:155.07ms step:1121/1480 train_time:172290ms step_avg:155.08ms step:1122/1480 train_time:172449ms step_avg:155.08ms step:1123/1480 train_time:172611ms step_avg:155.09ms step:1124/1480 train_time:172777ms step_avg:155.10ms step:1125/1480 train_time:172939ms step_avg:155.10ms step:1125/1480 val_loss:3.3830 train_time:173014ms step_avg:155.17ms step:1126/1480 train_time:173110ms step_avg:155.12ms step:1127/1480 train_time:173265ms step_avg:155.12ms step:1128/1480 train_time:173425ms step_avg:155.12ms step:1129/1480 train_time:173590ms step_avg:155.13ms step:1130/1480 train_time:173752ms step_avg:155.14ms step:1131/1480 train_time:173919ms step_avg:155.15ms step:1132/1480 train_time:174078ms step_avg:155.15ms step:1133/1480 train_time:174241ms step_avg:155.16ms step:1134/1480 train_time:174403ms step_avg:155.16ms step:1135/1480 train_time:174565ms step_avg:155.17ms step:1136/1480 train_time:174729ms step_avg:155.18ms step:1137/1480 train_time:174889ms step_avg:155.18ms step:1138/1480 train_time:175054ms step_avg:155.19ms step:1139/1480 train_time:175224ms step_avg:155.20ms step:1140/1480 train_time:175377ms step_avg:155.20ms step:1141/1480 train_time:175542ms step_avg:155.21ms step:1142/1480 train_time:175702ms step_avg:155.21ms step:1143/1480 train_time:175868ms step_avg:155.22ms step:1144/1480 train_time:176030ms step_avg:155.23ms step:1145/1480 train_time:176189ms step_avg:155.23ms step:1146/1480 train_time:176353ms step_avg:155.24ms step:1147/1480 train_time:176514ms step_avg:155.25ms step:1148/1480 train_time:176676ms step_avg:155.25ms step:1149/1480 train_time:176837ms step_avg:155.26ms step:1150/1480 train_time:176997ms step_avg:155.26ms step:1151/1480 train_time:177161ms step_avg:155.27ms step:1152/1480 train_time:177324ms step_avg:155.27ms step:1153/1480 train_time:177489ms step_avg:155.28ms step:1154/1480 train_time:177652ms step_avg:155.29ms step:1155/1480 train_time:177814ms step_avg:155.30ms step:1156/1480 train_time:177982ms step_avg:155.31ms step:1157/1480 train_time:178145ms step_avg:155.31ms step:1158/1480 train_time:178308ms step_avg:155.32ms step:1159/1480 train_time:178471ms step_avg:155.33ms step:1160/1480 train_time:178631ms step_avg:155.33ms step:1161/1480 train_time:178793ms step_avg:155.34ms step:1162/1480 train_time:178956ms step_avg:155.34ms step:1163/1480 train_time:179117ms step_avg:155.35ms step:1164/1480 train_time:179278ms step_avg:155.35ms step:1165/1480 train_time:179438ms step_avg:155.36ms step:1166/1480 train_time:179599ms step_avg:155.36ms step:1167/1480 train_time:179758ms step_avg:155.37ms step:1168/1480 train_time:179920ms step_avg:155.37ms step:1169/1480 train_time:180083ms step_avg:155.38ms step:1170/1480 train_time:180243ms step_avg:155.38ms step:1171/1480 train_time:180408ms step_avg:155.39ms step:1172/1480 train_time:180569ms step_avg:155.39ms step:1173/1480 train_time:180731ms step_avg:155.40ms step:1174/1480 train_time:180900ms step_avg:155.41ms step:1175/1480 train_time:181062ms step_avg:155.42ms step:1176/1480 train_time:181225ms step_avg:155.42ms step:1177/1480 train_time:181393ms step_avg:155.44ms step:1178/1480 train_time:181553ms step_avg:155.44ms step:1179/1480 train_time:181713ms step_avg:155.44ms step:1180/1480 train_time:181882ms step_avg:155.46ms step:1181/1480 train_time:182046ms step_avg:155.46ms step:1182/1480 train_time:182208ms step_avg:155.47ms step:1183/1480 train_time:182370ms step_avg:155.47ms step:1184/1480 train_time:182531ms step_avg:155.48ms step:1185/1480 train_time:182695ms step_avg:155.49ms step:1186/1480 train_time:182858ms step_avg:155.49ms step:1187/1480 train_time:183029ms step_avg:155.50ms step:1188/1480 train_time:183189ms step_avg:155.51ms step:1189/1480 train_time:183352ms step_avg:155.52ms step:1190/1480 train_time:183514ms step_avg:155.52ms step:1191/1480 train_time:183677ms step_avg:155.53ms step:1192/1480 train_time:183837ms step_avg:155.53ms step:1193/1480 train_time:183998ms step_avg:155.53ms step:1194/1480 train_time:184158ms step_avg:155.54ms step:1195/1480 train_time:184319ms step_avg:155.54ms step:1196/1480 train_time:184490ms step_avg:155.56ms step:1197/1480 train_time:184654ms step_avg:155.56ms step:1198/1480 train_time:184821ms step_avg:155.57ms step:1199/1480 train_time:184983ms step_avg:155.58ms step:1200/1480 train_time:185143ms step_avg:155.58ms step:1201/1480 train_time:185305ms step_avg:155.59ms step:1202/1480 train_time:185477ms step_avg:155.60ms step:1203/1480 train_time:185642ms step_avg:155.61ms step:1204/1480 train_time:185806ms step_avg:155.62ms step:1205/1480 train_time:185968ms step_avg:155.62ms step:1206/1480 train_time:186129ms step_avg:155.63ms step:1207/1480 train_time:186290ms step_avg:155.63ms step:1208/1480 train_time:186452ms step_avg:155.64ms step:1209/1480 train_time:186614ms step_avg:155.64ms step:1210/1480 train_time:186780ms step_avg:155.65ms step:1211/1480 train_time:186945ms step_avg:155.66ms step:1212/1480 train_time:187110ms step_avg:155.67ms step:1213/1480 train_time:187275ms step_avg:155.67ms step:1214/1480 train_time:187440ms step_avg:155.68ms step:1215/1480 train_time:187603ms step_avg:155.69ms step:1216/1480 train_time:187763ms step_avg:155.69ms step:1217/1480 train_time:187926ms step_avg:155.70ms step:1218/1480 train_time:188087ms step_avg:155.70ms step:1219/1480 train_time:188256ms step_avg:155.71ms step:1220/1480 train_time:188417ms step_avg:155.72ms step:1221/1480 train_time:188578ms step_avg:155.72ms step:1222/1480 train_time:188739ms step_avg:155.73ms step:1223/1480 train_time:188902ms step_avg:155.73ms step:1224/1480 train_time:189069ms step_avg:155.74ms step:1225/1480 train_time:189234ms step_avg:155.75ms step:1226/1480 train_time:189399ms step_avg:155.76ms step:1227/1480 train_time:189563ms step_avg:155.76ms step:1228/1480 train_time:189726ms step_avg:155.77ms step:1229/1480 train_time:189888ms step_avg:155.77ms step:1230/1480 train_time:190058ms step_avg:155.79ms step:1231/1480 train_time:190224ms step_avg:155.79ms step:1232/1480 train_time:190389ms step_avg:155.80ms step:1233/1480 train_time:190550ms step_avg:155.81ms step:1234/1480 train_time:190712ms step_avg:155.81ms step:1235/1480 train_time:190878ms step_avg:155.82ms step:1236/1480 train_time:191039ms step_avg:155.82ms step:1237/1480 train_time:191200ms step_avg:155.83ms step:1238/1480 train_time:191375ms step_avg:155.84ms step:1239/1480 train_time:191538ms step_avg:155.85ms step:1240/1480 train_time:191701ms step_avg:155.85ms step:1241/1480 train_time:191869ms step_avg:155.86ms step:1242/1480 train_time:192032ms step_avg:155.87ms step:1243/1480 train_time:192195ms step_avg:155.88ms step:1244/1480 train_time:192356ms step_avg:155.88ms step:1245/1480 train_time:192517ms step_avg:155.88ms step:1246/1480 train_time:192679ms step_avg:155.89ms step:1247/1480 train_time:192841ms step_avg:155.89ms step:1248/1480 train_time:193004ms step_avg:155.90ms step:1249/1480 train_time:193166ms step_avg:155.90ms step:1250/1480 train_time:193329ms step_avg:155.91ms step:1250/1480 val_loss:3.3334 train_time:193404ms step_avg:155.97ms step:1251/1480 train_time:193498ms step_avg:155.92ms step:1252/1480 train_time:193661ms step_avg:155.93ms step:1253/1480 train_time:193822ms step_avg:155.93ms step:1254/1480 train_time:193983ms step_avg:155.94ms step:1255/1480 train_time:194153ms step_avg:155.95ms step:1256/1480 train_time:194317ms step_avg:155.95ms step:1257/1480 train_time:194478ms step_avg:155.96ms step:1258/1480 train_time:194642ms step_avg:155.96ms step:1259/1480 train_time:194807ms step_avg:155.97ms step:1260/1480 train_time:194967ms step_avg:155.97ms step:1261/1480 train_time:195130ms step_avg:155.98ms step:1262/1480 train_time:195295ms step_avg:155.99ms step:1263/1480 train_time:195460ms step_avg:155.99ms step:1264/1480 train_time:195618ms step_avg:156.00ms step:1265/1480 train_time:195778ms step_avg:156.00ms step:1266/1480 train_time:195944ms step_avg:156.01ms step:1267/1480 train_time:196107ms step_avg:156.01ms step:1268/1480 train_time:196269ms step_avg:156.02ms step:1269/1480 train_time:196435ms step_avg:156.02ms step:1270/1480 train_time:196597ms step_avg:156.03ms step:1271/1480 train_time:196759ms step_avg:156.03ms step:1272/1480 train_time:196920ms step_avg:156.04ms step:1273/1480 train_time:197083ms step_avg:156.04ms step:1274/1480 train_time:197248ms step_avg:156.05ms step:1275/1480 train_time:197410ms step_avg:156.06ms step:1276/1480 train_time:197569ms step_avg:156.06ms step:1277/1480 train_time:197733ms step_avg:156.06ms step:1278/1480 train_time:197893ms step_avg:156.07ms step:1279/1480 train_time:198054ms step_avg:156.07ms step:1280/1480 train_time:198221ms step_avg:156.08ms step:1281/1480 train_time:198384ms step_avg:156.08ms step:1282/1480 train_time:198544ms step_avg:156.09ms step:1283/1480 train_time:198709ms step_avg:156.09ms step:1284/1480 train_time:198871ms step_avg:156.10ms step:1285/1480 train_time:199033ms step_avg:156.10ms step:1286/1480 train_time:199194ms step_avg:156.11ms step:1287/1480 train_time:199356ms step_avg:156.11ms step:1288/1480 train_time:199518ms step_avg:156.12ms step:1289/1480 train_time:199688ms step_avg:156.13ms step:1290/1480 train_time:199854ms step_avg:156.14ms step:1291/1480 train_time:200018ms step_avg:156.14ms step:1292/1480 train_time:200182ms step_avg:156.15ms step:1293/1480 train_time:200351ms step_avg:156.16ms step:1294/1480 train_time:200515ms step_avg:156.16ms step:1295/1480 train_time:200677ms step_avg:156.17ms step:1296/1480 train_time:200840ms step_avg:156.17ms step:1297/1480 train_time:201004ms step_avg:156.18ms step:1298/1480 train_time:201167ms step_avg:156.19ms step:1299/1480 train_time:201331ms step_avg:156.19ms step:1300/1480 train_time:201492ms step_avg:156.19ms step:1301/1480 train_time:201652ms step_avg:156.20ms step:1302/1480 train_time:201820ms step_avg:156.21ms step:1303/1480 train_time:201988ms step_avg:156.22ms step:1304/1480 train_time:202153ms step_avg:156.22ms step:1305/1480 train_time:202313ms step_avg:156.23ms step:1306/1480 train_time:202478ms step_avg:156.23ms step:1307/1480 train_time:202639ms step_avg:156.24ms step:1308/1480 train_time:202801ms step_avg:156.24ms step:1309/1480 train_time:202967ms step_avg:156.25ms step:1310/1480 train_time:203132ms step_avg:156.26ms step:1311/1480 train_time:203292ms step_avg:156.26ms step:1312/1480 train_time:203456ms step_avg:156.26ms step:1313/1480 train_time:203619ms step_avg:156.27ms step:1314/1480 train_time:203784ms step_avg:156.28ms step:1315/1480 train_time:203949ms step_avg:156.28ms step:1316/1480 train_time:204109ms step_avg:156.29ms step:1317/1480 train_time:204270ms step_avg:156.29ms step:1318/1480 train_time:204438ms step_avg:156.30ms step:1319/1480 train_time:204604ms step_avg:156.31ms step:1320/1480 train_time:204771ms step_avg:156.31ms step:1321/1480 train_time:204934ms step_avg:156.32ms step:1322/1480 train_time:205107ms step_avg:156.33ms step:1323/1480 train_time:205270ms step_avg:156.34ms step:1324/1480 train_time:205434ms step_avg:156.34ms step:1325/1480 train_time:205605ms step_avg:156.35ms step:1326/1480 train_time:205771ms step_avg:156.36ms step:1327/1480 train_time:205933ms step_avg:156.37ms step:1328/1480 train_time:206095ms step_avg:156.37ms step:1329/1480 train_time:206282ms step_avg:156.39ms step:1330/1480 train_time:206444ms step_avg:156.40ms step:1331/1480 train_time:206609ms step_avg:156.40ms step:1332/1480 train_time:206772ms step_avg:156.41ms step:1333/1480 train_time:206936ms step_avg:156.41ms step:1334/1480 train_time:207101ms step_avg:156.42ms step:1335/1480 train_time:207267ms step_avg:156.43ms step:1336/1480 train_time:207438ms step_avg:156.44ms step:1337/1480 train_time:207607ms step_avg:156.45ms step:1338/1480 train_time:207771ms step_avg:156.45ms step:1339/1480 train_time:207936ms step_avg:156.46ms step:1340/1480 train_time:208100ms step_avg:156.47ms step:1341/1480 train_time:208261ms step_avg:156.47ms step:1342/1480 train_time:208427ms step_avg:156.48ms step:1343/1480 train_time:208590ms step_avg:156.48ms step:1344/1480 train_time:208753ms step_avg:156.49ms step:1345/1480 train_time:208923ms step_avg:156.50ms step:1346/1480 train_time:209084ms step_avg:156.50ms step:1347/1480 train_time:209247ms step_avg:156.51ms step:1348/1480 train_time:209412ms step_avg:156.51ms step:1349/1480 train_time:209572ms step_avg:156.51ms step:1350/1480 train_time:209738ms step_avg:156.52ms step:1351/1480 train_time:209901ms step_avg:156.53ms step:1352/1480 train_time:210063ms step_avg:156.53ms step:1353/1480 train_time:210230ms step_avg:156.54ms step:1354/1480 train_time:210393ms step_avg:156.54ms step:1355/1480 train_time:210554ms step_avg:156.55ms step:1356/1480 train_time:210719ms step_avg:156.55ms step:1357/1480 train_time:210883ms step_avg:156.56ms step:1358/1480 train_time:211046ms step_avg:156.56ms step:1359/1480 train_time:211212ms step_avg:156.57ms step:1360/1480 train_time:211377ms step_avg:156.58ms step:1361/1480 train_time:211544ms step_avg:156.58ms step:1362/1480 train_time:211710ms step_avg:156.59ms step:1363/1480 train_time:211878ms step_avg:156.60ms step:1364/1480 train_time:212039ms step_avg:156.60ms step:1365/1480 train_time:212199ms step_avg:156.60ms step:1366/1480 train_time:212361ms step_avg:156.61ms step:1367/1480 train_time:212524ms step_avg:156.61ms step:1368/1480 train_time:212689ms step_avg:156.62ms step:1369/1480 train_time:212858ms step_avg:156.63ms step:1370/1480 train_time:213025ms step_avg:156.64ms step:1371/1480 train_time:213188ms step_avg:156.64ms step:1372/1480 train_time:213355ms step_avg:156.65ms step:1373/1480 train_time:213516ms step_avg:156.65ms step:1374/1480 train_time:213683ms step_avg:156.66ms step:1375/1480 train_time:213845ms step_avg:156.66ms step:1375/1480 val_loss:3.2950 train_time:213920ms step_avg:156.72ms step:1376/1480 train_time:214011ms step_avg:156.67ms step:1377/1480 train_time:214175ms step_avg:156.68ms step:1378/1480 train_time:214337ms step_avg:156.68ms step:1379/1480 train_time:214503ms step_avg:156.69ms step:1380/1480 train_time:214667ms step_avg:156.69ms step:1381/1480 train_time:214836ms step_avg:156.70ms step:1382/1480 train_time:214998ms step_avg:156.70ms step:1383/1480 train_time:215161ms step_avg:156.71ms step:1384/1480 train_time:215327ms step_avg:156.72ms step:1385/1480 train_time:215488ms step_avg:156.72ms step:1386/1480 train_time:215651ms step_avg:156.72ms step:1387/1480 train_time:215814ms step_avg:156.73ms step:1388/1480 train_time:215975ms step_avg:156.73ms step:1389/1480 train_time:216140ms step_avg:156.74ms step:1390/1480 train_time:216301ms step_avg:156.74ms step:1391/1480 train_time:216465ms step_avg:156.75ms step:1392/1480 train_time:216629ms step_avg:156.75ms step:1393/1480 train_time:216791ms step_avg:156.75ms step:1394/1480 train_time:216954ms step_avg:156.76ms step:1395/1480 train_time:217116ms step_avg:156.76ms step:1396/1480 train_time:217278ms step_avg:156.77ms step:1397/1480 train_time:217439ms step_avg:156.77ms step:1398/1480 train_time:217599ms step_avg:156.77ms step:1399/1480 train_time:217760ms step_avg:156.77ms step:1400/1480 train_time:217930ms step_avg:156.78ms step:1401/1480 train_time:218091ms step_avg:156.79ms step:1402/1480 train_time:218253ms step_avg:156.79ms step:1403/1480 train_time:218419ms step_avg:156.80ms step:1404/1480 train_time:218581ms step_avg:156.80ms step:1405/1480 train_time:218747ms step_avg:156.81ms step:1406/1480 train_time:218912ms step_avg:156.81ms step:1407/1480 train_time:219075ms step_avg:156.82ms step:1408/1480 train_time:219236ms step_avg:156.82ms step:1409/1480 train_time:219408ms step_avg:156.83ms step:1410/1480 train_time:219571ms step_avg:156.84ms step:1411/1480 train_time:219732ms step_avg:156.84ms step:1412/1480 train_time:219894ms step_avg:156.84ms step:1413/1480 train_time:220058ms step_avg:156.85ms step:1414/1480 train_time:220221ms step_avg:156.85ms step:1415/1480 train_time:220386ms step_avg:156.86ms step:1416/1480 train_time:220560ms step_avg:156.87ms step:1417/1480 train_time:220725ms step_avg:156.88ms step:1418/1480 train_time:220889ms step_avg:156.88ms step:1419/1480 train_time:221055ms step_avg:156.89ms step:1420/1480 train_time:221220ms step_avg:156.89ms step:1421/1480 train_time:221385ms step_avg:156.90ms step:1422/1480 train_time:221553ms step_avg:156.91ms step:1423/1480 train_time:221715ms step_avg:156.91ms step:1424/1480 train_time:221882ms step_avg:156.92ms step:1425/1480 train_time:222053ms step_avg:156.93ms step:1426/1480 train_time:222217ms step_avg:156.93ms step:1427/1480 train_time:222382ms step_avg:156.94ms step:1428/1480 train_time:222544ms step_avg:156.94ms step:1429/1480 train_time:222706ms step_avg:156.95ms step:1430/1480 train_time:222871ms step_avg:156.95ms step:1431/1480 train_time:223037ms step_avg:156.96ms step:1432/1480 train_time:223205ms step_avg:156.97ms step:1433/1480 train_time:223376ms step_avg:156.98ms step:1434/1480 train_time:223545ms step_avg:156.98ms step:1435/1480 train_time:223712ms step_avg:156.99ms step:1436/1480 train_time:223878ms step_avg:157.00ms step:1437/1480 train_time:224039ms step_avg:157.00ms step:1438/1480 train_time:224200ms step_avg:157.00ms step:1439/1480 train_time:224366ms step_avg:157.01ms step:1440/1480 train_time:224530ms step_avg:157.01ms step:1441/1480 train_time:224695ms step_avg:157.02ms step:1442/1480 train_time:224860ms step_avg:157.02ms step:1443/1480 train_time:225033ms step_avg:157.04ms step:1444/1480 train_time:225196ms step_avg:157.04ms step:1445/1480 train_time:225359ms step_avg:157.04ms step:1446/1480 train_time:225524ms step_avg:157.05ms step:1447/1480 train_time:225692ms step_avg:157.06ms step:1448/1480 train_time:225856ms step_avg:157.06ms step:1449/1480 train_time:226018ms step_avg:157.07ms step:1450/1480 train_time:226182ms step_avg:157.07ms step:1451/1480 train_time:226346ms step_avg:157.08ms step:1452/1480 train_time:226511ms step_avg:157.08ms step:1453/1480 train_time:226674ms step_avg:157.09ms step:1454/1480 train_time:226836ms step_avg:157.09ms step:1455/1480 train_time:227002ms step_avg:157.09ms step:1456/1480 train_time:227167ms step_avg:157.10ms step:1457/1480 train_time:227330ms step_avg:157.10ms step:1458/1480 train_time:227494ms step_avg:157.11ms step:1459/1480 train_time:227660ms step_avg:157.11ms step:1460/1480 train_time:227821ms step_avg:157.12ms step:1461/1480 train_time:227985ms step_avg:157.12ms step:1462/1480 train_time:228152ms step_avg:157.13ms step:1463/1480 train_time:228318ms step_avg:157.14ms step:1464/1480 train_time:228483ms step_avg:157.14ms step:1465/1480 train_time:228648ms step_avg:157.15ms step:1466/1480 train_time:228811ms step_avg:157.15ms step:1467/1480 train_time:228978ms step_avg:157.16ms step:1468/1480 train_time:229140ms step_avg:157.16ms step:1469/1480 train_time:229303ms step_avg:157.16ms step:1470/1480 train_time:229473ms step_avg:157.17ms step:1471/1480 train_time:229644ms step_avg:157.18ms step:1472/1480 train_time:229815ms step_avg:157.19ms step:1473/1480 train_time:229977ms step_avg:157.20ms step:1474/1480 train_time:230142ms step_avg:157.20ms step:1475/1480 train_time:230311ms step_avg:157.21ms step:1476/1480 train_time:230475ms step_avg:157.21ms step:1477/1480 train_time:230644ms step_avg:157.22ms step:1478/1480 train_time:230814ms step_avg:157.23ms step:1479/1480 train_time:230980ms step_avg:157.24ms step:1480/1480 train_time:231142ms step_avg:157.24ms step:1480/1480 val_loss:3.2759 train_time:231219ms step_avg:157.29ms peak memory consumption: 34240 MiB