import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: 2D tensor to orthogonalize. It is never modified.
        steps: number of Newton-Schulz iterations to run.
        eps: added to the norm before dividing, so an all-zero G does not divide by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # Out-of-place on purpose: when G is already bfloat16, `G.bfloat16()` returns G itself, and an
    # in-place `X /= ...` would silently rescale the caller's gradient / momentum buffer.
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    if G.size(0) > G.size(1):
        X = X.T # iterate on the wide orientation so that X @ X.T is the smaller Gram matrix
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Distributed notes (as implemented below):
    - The constructor reads the WORLD_SIZE and RANK environment variables.
    - Parameters are grouped by element count; every group owns `world_size` flat bfloat16 CUDA
    buffers that receive the all-gathered updates.
    - `step()` asserts that each group holds a multiple of `world_size` parameters.

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct numel, so a single set of flat buffers fits every member
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are walked in chunks of `world_size`. Each rank
        orthogonalizes the update of the one parameter at its own offset in the chunk, the results
        are exchanged with an asynchronous all_gather, and the gathered updates of the previous
        chunk are applied while the current chunk is still in flight.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # wait for the pending all_gather (if any) and apply it to the chunk it belongs to
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # tall matrices get a larger step: lr * sqrt(max(1, rows / cols))
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank] # the parameter this rank is responsible for
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                # note: the Nesterov branch overwrites p.grad in place
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # the buffers are reused, so the previous chunk must be consumed before gathering again
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev() # apply the final chunk

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward pass."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding.

    The cos/sin tables are rebuilt only when the sequence length (x.shape[1]) differs from the
    cached one. The rotation pairs the two halves of x along dimension 3.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention computed with FlexAttention.

    Queries and keys are RMS-normalized and rotary-embedded. The values are a learned mix
    (`lambdas`) of the projected values and an external value embedding `vi`.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    One transformer layer: a learned mix of the running activation `x` and the initial
    embedding `x0`, followed by pre-norm attention and pre-norm MLP, each with a residual add.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-embedding tables whose outputs are returned forwards and then mirrored."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        ve += reversed(ve) # 12 entries: e0..e5 followed by e5..e0
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """
    GPT-style language model with a U-net layout: the outputs of the first half of the blocks
    are added back, with learned weights, in front of the second half.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the cross-entropy loss of one flat token sequence.

        Arguments:
            inputs: 1D tensor of token ids; its length must be a multiple of BLOCK_SIZE (128).
            targets: token ids to predict, flattened to 1D before the loss.
            sliding_window_num_blocks: tensor holding the attention window, in 128-token blocks.

        Attention is restricted to be causal, within the sliding window, and within a single
        document. Document boundaries are the positions where the token id equals 50256
        (presumably the GPT-2 end-of-text token - confirm against the tokenizer).
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        docs = (inputs == 50256).cumsum(0) # running document index of every token
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous() # document index at the start of each block
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous() # document index at the end of each block

        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # convert a dense block mask into (count, indices) with the active blocks sorted first
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            # Number of 128-token blocks in this sequence. This used to be a hard-coded 512, which is
            # only right for 64*1024 tokens; deriving it keeps the masks below shape-compatible.
            kv_idx = block_idx = torch.arange(docs_low.size(0), dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # partial blocks (need mask_mod) are the non-empty blocks that are not completely full
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256 * 4 byte header into a pinned tensor."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy()) # fills the tensor's memory directly
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Iterates over token shards, handing each process its own `seq_len` slice of every batch.

    Shards are the files matching `filename_pattern` relative to the current working directory,
    visited in sorted order and wrapping around after the last one.
    """

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return this rank's (inputs, targets) CUDA tensors; targets are inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. 
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 07:59:49 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 125W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 115W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 28C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 117W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29082ms step_avg:nanms step:2/1480 train_time:29699ms step_avg:nanms step:3/1480 train_time:29823ms step_avg:nanms step:4/1480 train_time:29959ms step_avg:nanms step:5/1480 train_time:30100ms step_avg:nanms step:6/1480 train_time:30241ms step_avg:nanms step:7/1480 train_time:30383ms step_avg:nanms step:8/1480 train_time:30528ms step_avg:nanms step:9/1480 train_time:30672ms step_avg:nanms step:10/1480 train_time:30813ms step_avg:nanms step:11/1480 train_time:145ms step_avg:nanms step:12/1480 train_time:281ms step_avg:nanms step:13/1480 train_time:423ms step_avg:140.90ms step:14/1480 train_time:566ms step_avg:141.45ms step:15/1480 train_time:708ms step_avg:141.60ms step:16/1480 train_time:851ms step_avg:141.87ms step:17/1480 train_time:996ms step_avg:142.30ms step:18/1480 train_time:1138ms step_avg:142.23ms step:19/1480 train_time:1281ms step_avg:142.32ms step:20/1480 train_time:1424ms step_avg:142.38ms step:21/1480 train_time:1565ms step_avg:142.30ms step:22/1480 train_time:1708ms step_avg:142.36ms step:23/1480 train_time:1853ms step_avg:142.53ms step:24/1480 train_time:1998ms step_avg:142.72ms step:25/1480 train_time:2139ms step_avg:142.63ms step:26/1480 train_time:2282ms step_avg:142.65ms step:27/1480 train_time:2424ms step_avg:142.57ms step:28/1480 train_time:2567ms step_avg:142.62ms 
step:29/1480 train_time:2710ms step_avg:142.65ms step:30/1480 train_time:3245ms step_avg:162.23ms step:31/1480 train_time:3344ms step_avg:159.22ms step:32/1480 train_time:3487ms step_avg:158.50ms step:33/1480 train_time:3630ms step_avg:157.84ms step:34/1480 train_time:3773ms step_avg:157.21ms step:35/1480 train_time:3916ms step_avg:156.62ms step:36/1480 train_time:4058ms step_avg:156.08ms step:37/1480 train_time:4201ms step_avg:155.58ms step:38/1480 train_time:4343ms step_avg:155.10ms step:39/1480 train_time:4487ms step_avg:154.73ms step:40/1480 train_time:4629ms step_avg:154.31ms step:41/1480 train_time:4772ms step_avg:153.94ms step:42/1480 train_time:4916ms step_avg:153.62ms step:43/1480 train_time:5059ms step_avg:153.32ms step:44/1480 train_time:5202ms step_avg:152.99ms step:45/1480 train_time:5343ms step_avg:152.67ms step:46/1480 train_time:5486ms step_avg:152.40ms step:47/1480 train_time:5629ms step_avg:152.15ms step:48/1480 train_time:5771ms step_avg:151.88ms step:49/1480 train_time:5915ms step_avg:151.67ms step:50/1480 train_time:6058ms step_avg:151.45ms step:51/1480 train_time:6199ms step_avg:151.19ms step:52/1480 train_time:6340ms step_avg:150.96ms step:53/1480 train_time:6483ms step_avg:150.77ms step:54/1480 train_time:6626ms step_avg:150.59ms step:55/1480 train_time:6769ms step_avg:150.42ms step:56/1480 train_time:6913ms step_avg:150.27ms step:57/1480 train_time:7055ms step_avg:150.12ms step:58/1480 train_time:7198ms step_avg:149.96ms step:59/1480 train_time:7340ms step_avg:149.79ms step:60/1480 train_time:7481ms step_avg:149.62ms step:61/1480 train_time:7623ms step_avg:149.47ms step:62/1480 train_time:7767ms step_avg:149.37ms step:63/1480 train_time:7912ms step_avg:149.29ms step:64/1480 train_time:8056ms step_avg:149.18ms step:65/1480 train_time:8197ms step_avg:149.04ms step:66/1480 train_time:8339ms step_avg:148.90ms step:67/1480 train_time:8481ms step_avg:148.78ms step:68/1480 train_time:8624ms step_avg:148.69ms step:69/1480 train_time:8767ms 
step_avg:148.59ms step:70/1480 train_time:8910ms step_avg:148.50ms step:71/1480 train_time:9052ms step_avg:148.39ms step:72/1480 train_time:9195ms step_avg:148.31ms step:73/1480 train_time:9337ms step_avg:148.21ms step:74/1480 train_time:9480ms step_avg:148.13ms step:75/1480 train_time:9622ms step_avg:148.03ms step:76/1480 train_time:9765ms step_avg:147.96ms step:77/1480 train_time:9909ms step_avg:147.90ms step:78/1480 train_time:10053ms step_avg:147.84ms step:79/1480 train_time:10197ms step_avg:147.78ms step:80/1480 train_time:10338ms step_avg:147.68ms step:81/1480 train_time:10876ms step_avg:153.18ms step:82/1480 train_time:10974ms step_avg:152.42ms step:83/1480 train_time:11117ms step_avg:152.28ms step:84/1480 train_time:11258ms step_avg:152.14ms step:85/1480 train_time:11399ms step_avg:151.98ms step:86/1480 train_time:11540ms step_avg:151.84ms step:87/1480 train_time:11682ms step_avg:151.71ms step:88/1480 train_time:11825ms step_avg:151.61ms step:89/1480 train_time:11971ms step_avg:151.53ms step:90/1480 train_time:12113ms step_avg:151.41ms step:91/1480 train_time:12256ms step_avg:151.31ms step:92/1480 train_time:12398ms step_avg:151.20ms step:93/1480 train_time:12540ms step_avg:151.08ms step:94/1480 train_time:12683ms step_avg:150.99ms step:95/1480 train_time:13227ms step_avg:155.61ms step:96/1480 train_time:13328ms step_avg:154.98ms step:97/1480 train_time:13471ms step_avg:154.84ms step:98/1480 train_time:13613ms step_avg:154.69ms step:99/1480 train_time:13755ms step_avg:154.55ms step:100/1480 train_time:13897ms step_avg:154.41ms step:101/1480 train_time:14042ms step_avg:154.31ms step:102/1480 train_time:14179ms step_avg:154.12ms step:103/1480 train_time:14324ms step_avg:154.02ms step:104/1480 train_time:14468ms step_avg:153.91ms step:105/1480 train_time:14613ms step_avg:153.82ms step:106/1480 train_time:14756ms step_avg:153.71ms step:107/1480 train_time:14899ms step_avg:153.60ms step:108/1480 train_time:15041ms step_avg:153.48ms step:109/1480 
train_time:15184ms step_avg:153.37ms step:110/1480 train_time:15327ms step_avg:153.27ms step:111/1480 train_time:15472ms step_avg:153.19ms step:112/1480 train_time:15618ms step_avg:153.12ms step:113/1480 train_time:15763ms step_avg:153.04ms step:114/1480 train_time:15907ms step_avg:152.96ms step:115/1480 train_time:16053ms step_avg:152.89ms step:116/1480 train_time:16199ms step_avg:152.82ms step:117/1480 train_time:16343ms step_avg:152.74ms step:118/1480 train_time:16490ms step_avg:152.69ms step:119/1480 train_time:16637ms step_avg:152.63ms step:120/1480 train_time:16783ms step_avg:152.57ms step:121/1480 train_time:16928ms step_avg:152.51ms step:122/1480 train_time:17075ms step_avg:152.46ms step:123/1480 train_time:17220ms step_avg:152.39ms step:124/1480 train_time:17366ms step_avg:152.33ms step:125/1480 train_time:17512ms step_avg:152.28ms step:125/1480 val_loss:4.4274 train_time:17577ms step_avg:152.84ms step:126/1480 train_time:17683ms step_avg:152.44ms step:127/1480 train_time:17814ms step_avg:152.26ms step:128/1480 train_time:17962ms step_avg:152.22ms step:129/1480 train_time:18107ms step_avg:152.16ms step:130/1480 train_time:18251ms step_avg:152.09ms step:131/1480 train_time:18397ms step_avg:152.04ms step:132/1480 train_time:18542ms step_avg:151.99ms step:133/1480 train_time:18687ms step_avg:151.93ms step:134/1480 train_time:18835ms step_avg:151.90ms step:135/1480 train_time:18982ms step_avg:151.85ms step:136/1480 train_time:19127ms step_avg:151.80ms step:137/1480 train_time:19272ms step_avg:151.75ms step:138/1480 train_time:19419ms step_avg:151.71ms step:139/1480 train_time:19565ms step_avg:151.67ms step:140/1480 train_time:19710ms step_avg:151.61ms step:141/1480 train_time:19855ms step_avg:151.57ms step:142/1480 train_time:20001ms step_avg:151.53ms step:143/1480 train_time:20147ms step_avg:151.48ms step:144/1480 train_time:20292ms step_avg:151.43ms step:145/1480 train_time:20440ms step_avg:151.41ms step:146/1480 train_time:20585ms step_avg:151.36ms 
step:147/1480 train_time:20732ms step_avg:151.33ms step:148/1480 train_time:20878ms step_avg:151.29ms step:149/1480 train_time:21025ms step_avg:151.26ms step:150/1480 train_time:21169ms step_avg:151.21ms step:151/1480 train_time:21314ms step_avg:151.17ms step:152/1480 train_time:21460ms step_avg:151.13ms step:153/1480 train_time:21606ms step_avg:151.09ms step:154/1480 train_time:21752ms step_avg:151.06ms step:155/1480 train_time:21898ms step_avg:151.02ms step:156/1480 train_time:22044ms step_avg:150.99ms step:157/1480 train_time:22189ms step_avg:150.94ms step:158/1480 train_time:22334ms step_avg:150.91ms step:159/1480 train_time:22481ms step_avg:150.88ms step:160/1480 train_time:22627ms step_avg:150.85ms step:161/1480 train_time:22773ms step_avg:150.81ms step:162/1480 train_time:22921ms step_avg:150.80ms step:163/1480 train_time:23067ms step_avg:150.76ms step:164/1480 train_time:23213ms step_avg:150.73ms step:165/1480 train_time:23359ms step_avg:150.71ms step:166/1480 train_time:23505ms step_avg:150.68ms step:167/1480 train_time:23650ms step_avg:150.64ms step:168/1480 train_time:23796ms step_avg:150.61ms step:169/1480 train_time:23943ms step_avg:150.58ms step:170/1480 train_time:24088ms step_avg:150.55ms step:171/1480 train_time:24233ms step_avg:150.52ms step:172/1480 train_time:24380ms step_avg:150.49ms step:173/1480 train_time:24525ms step_avg:150.46ms step:174/1480 train_time:24670ms step_avg:150.43ms step:175/1480 train_time:24816ms step_avg:150.40ms step:176/1480 train_time:24963ms step_avg:150.38ms step:177/1480 train_time:25107ms step_avg:150.34ms step:178/1480 train_time:25253ms step_avg:150.31ms step:179/1480 train_time:25399ms step_avg:150.29ms step:180/1480 train_time:25544ms step_avg:150.26ms step:181/1480 train_time:25689ms step_avg:150.23ms step:182/1480 train_time:25834ms step_avg:150.20ms step:183/1480 train_time:25980ms step_avg:150.17ms step:184/1480 train_time:26126ms step_avg:150.15ms step:185/1480 train_time:26270ms step_avg:150.11ms 
step:186/1480 train_time:26417ms step_avg:150.10ms step:187/1480 train_time:26563ms step_avg:150.08ms step:188/1480 train_time:26709ms step_avg:150.05ms step:189/1480 train_time:26879ms step_avg:150.16ms step:190/1480 train_time:27000ms step_avg:150.00ms step:191/1480 train_time:27146ms step_avg:149.98ms step:192/1480 train_time:27290ms step_avg:149.95ms step:193/1480 train_time:27437ms step_avg:149.93ms step:194/1480 train_time:27583ms step_avg:149.91ms step:195/1480 train_time:27730ms step_avg:149.89ms step:196/1480 train_time:27876ms step_avg:149.87ms step:197/1480 train_time:28022ms step_avg:149.85ms step:198/1480 train_time:28167ms step_avg:149.82ms step:199/1480 train_time:28312ms step_avg:149.80ms step:200/1480 train_time:28459ms step_avg:149.78ms step:201/1480 train_time:28605ms step_avg:149.77ms step:202/1480 train_time:28749ms step_avg:149.73ms step:203/1480 train_time:28895ms step_avg:149.71ms step:204/1480 train_time:29041ms step_avg:149.70ms step:205/1480 train_time:29186ms step_avg:149.67ms step:206/1480 train_time:29332ms step_avg:149.65ms step:207/1480 train_time:29478ms step_avg:149.63ms step:208/1480 train_time:29624ms step_avg:149.62ms step:209/1480 train_time:29769ms step_avg:149.59ms step:210/1480 train_time:29916ms step_avg:149.58ms step:211/1480 train_time:30063ms step_avg:149.57ms step:212/1480 train_time:30207ms step_avg:149.54ms step:213/1480 train_time:30352ms step_avg:149.52ms step:214/1480 train_time:30497ms step_avg:149.50ms step:215/1480 train_time:30643ms step_avg:149.48ms step:216/1480 train_time:30788ms step_avg:149.46ms step:217/1480 train_time:30935ms step_avg:149.45ms step:218/1480 train_time:31082ms step_avg:149.43ms step:219/1480 train_time:31227ms step_avg:149.41ms step:220/1480 train_time:31372ms step_avg:149.39ms step:221/1480 train_time:32008ms step_avg:151.70ms step:222/1480 train_time:32523ms step_avg:153.41ms step:223/1480 train_time:32627ms step_avg:153.18ms step:224/1480 train_time:32775ms step_avg:153.15ms 
step:225/1480 train_time:32923ms step_avg:153.13ms step:226/1480 train_time:33070ms step_avg:153.10ms step:227/1480 train_time:33218ms step_avg:153.08ms step:228/1480 train_time:33367ms step_avg:153.06ms step:229/1480 train_time:33517ms step_avg:153.05ms step:230/1480 train_time:33667ms step_avg:153.03ms step:231/1480 train_time:33814ms step_avg:153.00ms step:232/1480 train_time:33963ms step_avg:152.99ms step:233/1480 train_time:34110ms step_avg:152.96ms step:234/1480 train_time:34258ms step_avg:152.94ms step:235/1480 train_time:34407ms step_avg:152.92ms step:236/1480 train_time:34555ms step_avg:152.90ms step:237/1480 train_time:34704ms step_avg:152.88ms step:238/1480 train_time:34851ms step_avg:152.86ms step:239/1480 train_time:34999ms step_avg:152.84ms step:240/1480 train_time:35148ms step_avg:152.82ms step:241/1480 train_time:35295ms step_avg:152.79ms step:242/1480 train_time:35446ms step_avg:152.78ms step:243/1480 train_time:35592ms step_avg:152.76ms step:244/1480 train_time:35742ms step_avg:152.74ms step:245/1480 train_time:35889ms step_avg:152.72ms step:246/1480 train_time:36039ms step_avg:152.71ms step:247/1480 train_time:36188ms step_avg:152.69ms step:248/1480 train_time:36336ms step_avg:152.67ms step:249/1480 train_time:36484ms step_avg:152.65ms step:250/1480 train_time:36634ms step_avg:152.64ms step:250/1480 val_loss:4.0046 train_time:36702ms step_avg:152.92ms step:251/1480 train_time:36794ms step_avg:152.67ms step:252/1480 train_time:36943ms step_avg:152.66ms step:253/1480 train_time:37092ms step_avg:152.64ms step:254/1480 train_time:37239ms step_avg:152.62ms step:255/1480 train_time:37387ms step_avg:152.60ms step:256/1480 train_time:37535ms step_avg:152.58ms step:257/1480 train_time:37682ms step_avg:152.56ms step:258/1480 train_time:37834ms step_avg:152.56ms step:259/1480 train_time:37981ms step_avg:152.54ms step:260/1480 train_time:38131ms step_avg:152.52ms step:261/1480 train_time:38278ms step_avg:152.50ms step:262/1480 train_time:38427ms 
step_avg:152.49ms step:263/1480 train_time:38575ms step_avg:152.47ms step:264/1480 train_time:38723ms step_avg:152.45ms step:265/1480 train_time:38873ms step_avg:152.44ms step:266/1480 train_time:39021ms step_avg:152.42ms step:267/1480 train_time:39171ms step_avg:152.42ms step:268/1480 train_time:39317ms step_avg:152.39ms step:269/1480 train_time:39467ms step_avg:152.38ms step:270/1480 train_time:39615ms step_avg:152.37ms step:271/1480 train_time:39763ms step_avg:152.35ms step:272/1480 train_time:39912ms step_avg:152.33ms step:273/1480 train_time:40061ms step_avg:152.32ms step:274/1480 train_time:40210ms step_avg:152.31ms step:275/1480 train_time:40357ms step_avg:152.29ms step:276/1480 train_time:40506ms step_avg:152.28ms step:277/1480 train_time:40655ms step_avg:152.26ms step:278/1480 train_time:40803ms step_avg:152.25ms step:279/1480 train_time:40952ms step_avg:152.24ms step:280/1480 train_time:41100ms step_avg:152.22ms step:281/1480 train_time:41249ms step_avg:152.21ms step:282/1480 train_time:41397ms step_avg:152.19ms step:283/1480 train_time:41545ms step_avg:152.18ms step:284/1480 train_time:41694ms step_avg:152.17ms step:285/1480 train_time:41842ms step_avg:152.15ms step:286/1480 train_time:41991ms step_avg:152.14ms step:287/1480 train_time:42138ms step_avg:152.12ms step:288/1480 train_time:42287ms step_avg:152.11ms step:289/1480 train_time:42436ms step_avg:152.10ms step:290/1480 train_time:42583ms step_avg:152.08ms step:291/1480 train_time:42733ms step_avg:152.07ms step:292/1480 train_time:42880ms step_avg:152.06ms step:293/1480 train_time:43029ms step_avg:152.05ms step:294/1480 train_time:43177ms step_avg:152.03ms step:295/1480 train_time:43325ms step_avg:152.02ms step:296/1480 train_time:43474ms step_avg:152.01ms step:297/1480 train_time:43622ms step_avg:151.99ms step:298/1480 train_time:43771ms step_avg:151.98ms step:299/1480 train_time:43918ms step_avg:151.97ms step:300/1480 train_time:44068ms step_avg:151.96ms step:301/1480 train_time:44217ms 
step_avg:151.95ms step:302/1480 train_time:44366ms step_avg:151.94ms step:303/1480 train_time:44515ms step_avg:151.93ms step:304/1480 train_time:44664ms step_avg:151.92ms step:305/1480 train_time:44812ms step_avg:151.91ms step:306/1480 train_time:44961ms step_avg:151.89ms step:307/1480 train_time:45109ms step_avg:151.88ms step:308/1480 train_time:45258ms step_avg:151.87ms step:309/1480 train_time:45408ms step_avg:151.86ms step:310/1480 train_time:45556ms step_avg:151.85ms step:311/1480 train_time:45704ms step_avg:151.84ms step:312/1480 train_time:45853ms step_avg:151.83ms step:313/1480 train_time:46001ms step_avg:151.82ms step:314/1480 train_time:46150ms step_avg:151.81ms step:315/1480 train_time:46298ms step_avg:151.80ms step:316/1480 train_time:46448ms step_avg:151.79ms step:317/1480 train_time:46596ms step_avg:151.78ms step:318/1480 train_time:46745ms step_avg:151.77ms step:319/1480 train_time:46894ms step_avg:151.76ms step:320/1480 train_time:47042ms step_avg:151.75ms step:321/1480 train_time:47192ms step_avg:151.74ms step:322/1480 train_time:47339ms step_avg:151.73ms step:323/1480 train_time:47488ms step_avg:151.72ms step:324/1480 train_time:47636ms step_avg:151.71ms step:325/1480 train_time:47784ms step_avg:151.70ms step:326/1480 train_time:47933ms step_avg:151.69ms step:327/1480 train_time:48080ms step_avg:151.67ms step:328/1480 train_time:48229ms step_avg:151.66ms step:329/1480 train_time:48377ms step_avg:151.65ms step:330/1480 train_time:48527ms step_avg:151.65ms step:331/1480 train_time:48678ms step_avg:151.65ms step:332/1480 train_time:48830ms step_avg:151.64ms step:333/1480 train_time:48979ms step_avg:151.64ms step:334/1480 train_time:49131ms step_avg:151.64ms step:335/1480 train_time:49280ms step_avg:151.63ms step:336/1480 train_time:49431ms step_avg:151.63ms step:337/1480 train_time:49582ms step_avg:151.63ms step:338/1480 train_time:49733ms step_avg:151.63ms step:339/1480 train_time:49883ms step_avg:151.62ms step:340/1480 train_time:50034ms 
step_avg:151.62ms step:341/1480 train_time:50184ms step_avg:151.61ms step:342/1480 train_time:50335ms step_avg:151.61ms step:343/1480 train_time:50485ms step_avg:151.61ms step:344/1480 train_time:50636ms step_avg:151.60ms step:345/1480 train_time:50787ms step_avg:151.60ms step:346/1480 train_time:50938ms step_avg:151.60ms step:347/1480 train_time:51089ms step_avg:151.60ms step:348/1480 train_time:51240ms step_avg:151.60ms step:349/1480 train_time:51392ms step_avg:151.60ms step:350/1480 train_time:51541ms step_avg:151.59ms step:351/1480 train_time:51693ms step_avg:151.59ms step:352/1480 train_time:51844ms step_avg:151.59ms step:353/1480 train_time:51995ms step_avg:151.59ms step:354/1480 train_time:52147ms step_avg:151.59ms step:355/1480 train_time:52297ms step_avg:151.59ms step:356/1480 train_time:52449ms step_avg:151.59ms step:357/1480 train_time:52600ms step_avg:151.58ms step:358/1480 train_time:52751ms step_avg:151.58ms step:359/1480 train_time:52901ms step_avg:151.58ms step:360/1480 train_time:53054ms step_avg:151.58ms step:361/1480 train_time:53205ms step_avg:151.58ms step:362/1480 train_time:53357ms step_avg:151.58ms step:363/1480 train_time:53508ms step_avg:151.58ms step:364/1480 train_time:53659ms step_avg:151.58ms step:365/1480 train_time:53811ms step_avg:151.58ms step:366/1480 train_time:53963ms step_avg:151.58ms step:367/1480 train_time:54114ms step_avg:151.58ms step:368/1480 train_time:54265ms step_avg:151.58ms step:369/1480 train_time:54415ms step_avg:151.57ms step:370/1480 train_time:54565ms step_avg:151.57ms step:371/1480 train_time:54716ms step_avg:151.57ms step:372/1480 train_time:54868ms step_avg:151.57ms step:373/1480 train_time:55019ms step_avg:151.57ms step:374/1480 train_time:55170ms step_avg:151.57ms step:375/1480 train_time:55320ms step_avg:151.56ms step:375/1480 val_loss:3.8185 train_time:55387ms step_avg:151.75ms step:376/1480 train_time:55488ms step_avg:151.61ms step:377/1480 train_time:55627ms step_avg:151.57ms step:378/1480 
train_time:55778ms step_avg:151.57ms step:379/1480 train_time:55946ms step_avg:151.61ms step:380/1480 train_time:56080ms step_avg:151.57ms step:381/1480 train_time:56230ms step_avg:151.56ms step:382/1480 train_time:56381ms step_avg:151.56ms step:383/1480 train_time:56532ms step_avg:151.56ms step:384/1480 train_time:56684ms step_avg:151.56ms step:385/1480 train_time:56834ms step_avg:151.56ms step:386/1480 train_time:56986ms step_avg:151.56ms step:387/1480 train_time:57137ms step_avg:151.56ms step:388/1480 train_time:57287ms step_avg:151.55ms step:389/1480 train_time:57438ms step_avg:151.55ms step:390/1480 train_time:57589ms step_avg:151.55ms step:391/1480 train_time:57740ms step_avg:151.55ms step:392/1480 train_time:57890ms step_avg:151.54ms step:393/1480 train_time:58042ms step_avg:151.54ms step:394/1480 train_time:58191ms step_avg:151.54ms step:395/1480 train_time:58343ms step_avg:151.54ms step:396/1480 train_time:58493ms step_avg:151.54ms step:397/1480 train_time:58645ms step_avg:151.54ms step:398/1480 train_time:58795ms step_avg:151.53ms step:399/1480 train_time:58946ms step_avg:151.53ms step:400/1480 train_time:59098ms step_avg:151.53ms step:401/1480 train_time:59248ms step_avg:151.53ms step:402/1480 train_time:59400ms step_avg:151.53ms step:403/1480 train_time:59550ms step_avg:151.53ms step:404/1480 train_time:59702ms step_avg:151.53ms step:405/1480 train_time:59853ms step_avg:151.53ms step:406/1480 train_time:60004ms step_avg:151.52ms step:407/1480 train_time:60156ms step_avg:151.53ms step:408/1480 train_time:60307ms step_avg:151.52ms step:409/1480 train_time:60459ms step_avg:151.53ms step:410/1480 train_time:60608ms step_avg:151.52ms step:411/1480 train_time:60760ms step_avg:151.52ms step:412/1480 train_time:60911ms step_avg:151.52ms step:413/1480 train_time:61062ms step_avg:151.52ms step:414/1480 train_time:61212ms step_avg:151.52ms step:415/1480 train_time:61364ms step_avg:151.52ms step:416/1480 train_time:61514ms step_avg:151.51ms step:417/1480 
train_time:61665ms step_avg:151.51ms step:418/1480 train_time:61816ms step_avg:151.51ms step:419/1480 train_time:61968ms step_avg:151.51ms step:420/1480 train_time:62119ms step_avg:151.51ms step:421/1480 train_time:62270ms step_avg:151.51ms step:422/1480 train_time:62422ms step_avg:151.51ms step:423/1480 train_time:62574ms step_avg:151.51ms step:424/1480 train_time:62725ms step_avg:151.51ms step:425/1480 train_time:62876ms step_avg:151.51ms step:426/1480 train_time:63028ms step_avg:151.51ms step:427/1480 train_time:63179ms step_avg:151.51ms step:428/1480 train_time:63329ms step_avg:151.51ms step:429/1480 train_time:63480ms step_avg:151.50ms step:430/1480 train_time:63630ms step_avg:151.50ms step:431/1480 train_time:63782ms step_avg:151.50ms step:432/1480 train_time:63932ms step_avg:151.50ms step:433/1480 train_time:64083ms step_avg:151.50ms step:434/1480 train_time:64234ms step_avg:151.50ms step:435/1480 train_time:64385ms step_avg:151.49ms step:436/1480 train_time:64535ms step_avg:151.49ms step:437/1480 train_time:64686ms step_avg:151.49ms step:438/1480 train_time:64837ms step_avg:151.49ms step:439/1480 train_time:64988ms step_avg:151.49ms step:440/1480 train_time:65140ms step_avg:151.49ms step:441/1480 train_time:65292ms step_avg:151.49ms step:442/1480 train_time:65445ms step_avg:151.49ms step:443/1480 train_time:65597ms step_avg:151.50ms step:444/1480 train_time:65750ms step_avg:151.50ms step:445/1480 train_time:65903ms step_avg:151.50ms step:446/1480 train_time:66057ms step_avg:151.51ms step:447/1480 train_time:66212ms step_avg:151.51ms step:448/1480 train_time:66364ms step_avg:151.52ms step:449/1480 train_time:66516ms step_avg:151.52ms step:450/1480 train_time:66670ms step_avg:151.52ms step:451/1480 train_time:66822ms step_avg:151.52ms step:452/1480 train_time:66977ms step_avg:151.53ms step:453/1480 train_time:67130ms step_avg:151.54ms step:454/1480 train_time:67284ms step_avg:151.54ms step:455/1480 train_time:67436ms step_avg:151.54ms step:456/1480 
train_time:67589ms step_avg:151.54ms step:457/1480 train_time:67742ms step_avg:151.55ms step:458/1480 train_time:67894ms step_avg:151.55ms step:459/1480 train_time:68047ms step_avg:151.55ms step:460/1480 train_time:68201ms step_avg:151.56ms step:461/1480 train_time:68354ms step_avg:151.56ms step:462/1480 train_time:68508ms step_avg:151.57ms step:463/1480 train_time:68661ms step_avg:151.57ms step:464/1480 train_time:68814ms step_avg:151.57ms step:465/1480 train_time:68966ms step_avg:151.57ms step:466/1480 train_time:69119ms step_avg:151.58ms step:467/1480 train_time:69271ms step_avg:151.58ms step:468/1480 train_time:69424ms step_avg:151.58ms step:469/1480 train_time:69578ms step_avg:151.59ms step:470/1480 train_time:69731ms step_avg:151.59ms step:471/1480 train_time:69885ms step_avg:151.59ms step:472/1480 train_time:70037ms step_avg:151.60ms step:473/1480 train_time:70189ms step_avg:151.60ms step:474/1480 train_time:70342ms step_avg:151.60ms step:475/1480 train_time:70494ms step_avg:151.60ms step:476/1480 train_time:70647ms step_avg:151.60ms step:477/1480 train_time:70801ms step_avg:151.61ms step:478/1480 train_time:70955ms step_avg:151.61ms step:479/1480 train_time:71107ms step_avg:151.61ms step:480/1480 train_time:71261ms step_avg:151.62ms step:481/1480 train_time:71413ms step_avg:151.62ms step:482/1480 train_time:71566ms step_avg:151.62ms step:483/1480 train_time:71718ms step_avg:151.62ms step:484/1480 train_time:71871ms step_avg:151.63ms step:485/1480 train_time:72025ms step_avg:151.63ms step:486/1480 train_time:72179ms step_avg:151.64ms step:487/1480 train_time:72332ms step_avg:151.64ms step:488/1480 train_time:72485ms step_avg:151.64ms step:489/1480 train_time:72637ms step_avg:151.64ms step:490/1480 train_time:72789ms step_avg:151.64ms step:491/1480 train_time:72943ms step_avg:151.65ms step:492/1480 train_time:73095ms step_avg:151.65ms step:493/1480 train_time:73247ms step_avg:151.65ms step:494/1480 train_time:73401ms step_avg:151.66ms step:495/1480 
train_time:73555ms step_avg:151.66ms step:496/1480 train_time:73708ms step_avg:151.66ms step:497/1480 train_time:73862ms step_avg:151.67ms step:498/1480 train_time:74014ms step_avg:151.67ms step:499/1480 train_time:74167ms step_avg:151.67ms step:500/1480 train_time:74320ms step_avg:151.67ms step:500/1480 val_loss:3.6936 train_time:74388ms step_avg:151.81ms step:501/1480 train_time:74480ms step_avg:151.69ms step:502/1480 train_time:74632ms step_avg:151.69ms step:503/1480 train_time:74785ms step_avg:151.69ms step:504/1480 train_time:74937ms step_avg:151.69ms step:505/1480 train_time:75089ms step_avg:151.70ms step:506/1480 train_time:75242ms step_avg:151.70ms step:507/1480 train_time:75393ms step_avg:151.70ms step:508/1480 train_time:75547ms step_avg:151.70ms step:509/1480 train_time:75701ms step_avg:151.71ms step:510/1480 train_time:75853ms step_avg:151.71ms step:511/1480 train_time:76006ms step_avg:151.71ms step:512/1480 train_time:76162ms step_avg:151.72ms step:513/1480 train_time:76314ms step_avg:151.72ms step:514/1480 train_time:76467ms step_avg:151.72ms step:515/1480 train_time:76620ms step_avg:151.72ms step:516/1480 train_time:76773ms step_avg:151.73ms step:517/1480 train_time:76927ms step_avg:151.73ms step:518/1480 train_time:77081ms step_avg:151.73ms step:519/1480 train_time:77233ms step_avg:151.74ms step:520/1480 train_time:77386ms step_avg:151.74ms step:521/1480 train_time:77539ms step_avg:151.74ms step:522/1480 train_time:77691ms step_avg:151.74ms step:523/1480 train_time:77845ms step_avg:151.74ms step:524/1480 train_time:77997ms step_avg:151.75ms step:525/1480 train_time:78150ms step_avg:151.75ms step:526/1480 train_time:78304ms step_avg:151.75ms step:527/1480 train_time:78457ms step_avg:151.75ms step:528/1480 train_time:78610ms step_avg:151.76ms step:529/1480 train_time:78764ms step_avg:151.76ms step:530/1480 train_time:78917ms step_avg:151.76ms step:531/1480 train_time:79069ms step_avg:151.76ms step:532/1480 train_time:79222ms step_avg:151.77ms 
step:533/1480 train_time:79375ms step_avg:151.77ms step:534/1480 train_time:79528ms step_avg:151.77ms step:535/1480 train_time:79683ms step_avg:151.78ms step:536/1480 train_time:79834ms step_avg:151.78ms step:537/1480 train_time:79987ms step_avg:151.78ms step:538/1480 train_time:80141ms step_avg:151.78ms step:539/1480 train_time:80294ms step_avg:151.78ms step:540/1480 train_time:80447ms step_avg:151.79ms step:541/1480 train_time:80599ms step_avg:151.79ms step:542/1480 train_time:80751ms step_avg:151.79ms step:543/1480 train_time:80904ms step_avg:151.79ms step:544/1480 train_time:81058ms step_avg:151.79ms step:545/1480 train_time:81212ms step_avg:151.80ms step:546/1480 train_time:81366ms step_avg:151.80ms step:547/1480 train_time:81517ms step_avg:151.80ms step:548/1480 train_time:81669ms step_avg:151.80ms step:549/1480 train_time:81822ms step_avg:151.80ms step:550/1480 train_time:81978ms step_avg:151.81ms step:551/1480 train_time:82133ms step_avg:151.82ms step:552/1480 train_time:82288ms step_avg:151.82ms step:553/1480 train_time:82442ms step_avg:151.83ms step:554/1480 train_time:82597ms step_avg:151.83ms step:555/1480 train_time:82752ms step_avg:151.84ms step:556/1480 train_time:82907ms step_avg:151.84ms step:557/1480 train_time:83063ms step_avg:151.85ms step:558/1480 train_time:83216ms step_avg:151.85ms step:559/1480 train_time:83370ms step_avg:151.86ms step:560/1480 train_time:83526ms step_avg:151.86ms step:561/1480 train_time:83680ms step_avg:151.87ms step:562/1480 train_time:83835ms step_avg:151.87ms step:563/1480 train_time:83988ms step_avg:151.88ms step:564/1480 train_time:84144ms step_avg:151.88ms step:565/1480 train_time:84298ms step_avg:151.89ms step:566/1480 train_time:84453ms step_avg:151.89ms step:567/1480 train_time:84611ms step_avg:151.90ms step:568/1480 train_time:84765ms step_avg:151.91ms step:569/1480 train_time:84936ms step_avg:151.94ms step:570/1480 train_time:85075ms step_avg:151.92ms step:571/1480 train_time:85229ms step_avg:151.92ms 
step:572/1480 train_time:85384ms step_avg:151.93ms step:573/1480 train_time:85539ms step_avg:151.93ms step:574/1480 train_time:85696ms step_avg:151.94ms step:575/1480 train_time:85850ms step_avg:151.95ms step:576/1480 train_time:86005ms step_avg:151.95ms step:577/1480 train_time:86159ms step_avg:151.96ms step:578/1480 train_time:86314ms step_avg:151.96ms step:579/1480 train_time:86469ms step_avg:151.97ms step:580/1480 train_time:86624ms step_avg:151.97ms step:581/1480 train_time:86780ms step_avg:151.98ms step:582/1480 train_time:86935ms step_avg:151.98ms step:583/1480 train_time:87089ms step_avg:151.99ms step:584/1480 train_time:87244ms step_avg:151.99ms step:585/1480 train_time:87399ms step_avg:152.00ms step:586/1480 train_time:87553ms step_avg:152.00ms step:587/1480 train_time:87708ms step_avg:152.01ms step:588/1480 train_time:87862ms step_avg:152.01ms step:589/1480 train_time:88018ms step_avg:152.02ms step:590/1480 train_time:88173ms step_avg:152.02ms step:591/1480 train_time:88328ms step_avg:152.03ms step:592/1480 train_time:88484ms step_avg:152.03ms step:593/1480 train_time:88640ms step_avg:152.04ms step:594/1480 train_time:88796ms step_avg:152.05ms step:595/1480 train_time:88951ms step_avg:152.05ms step:596/1480 train_time:89107ms step_avg:152.06ms step:597/1480 train_time:89260ms step_avg:152.06ms step:598/1480 train_time:89415ms step_avg:152.07ms step:599/1480 train_time:89570ms step_avg:152.07ms step:600/1480 train_time:89724ms step_avg:152.07ms step:601/1480 train_time:89881ms step_avg:152.08ms step:602/1480 train_time:90036ms step_avg:152.09ms step:603/1480 train_time:90190ms step_avg:152.09ms step:604/1480 train_time:90345ms step_avg:152.10ms step:605/1480 train_time:90499ms step_avg:152.10ms step:606/1480 train_time:90655ms step_avg:152.11ms step:607/1480 train_time:90810ms step_avg:152.11ms step:608/1480 train_time:90966ms step_avg:152.12ms step:609/1480 train_time:91120ms step_avg:152.12ms step:610/1480 train_time:91274ms step_avg:152.12ms 
step:611/1480 train_time:91429ms step_avg:152.13ms step:612/1480 train_time:91584ms step_avg:152.13ms step:613/1480 train_time:91739ms step_avg:152.14ms step:614/1480 train_time:91894ms step_avg:152.14ms step:615/1480 train_time:92048ms step_avg:152.15ms step:616/1480 train_time:92203ms step_avg:152.15ms step:617/1480 train_time:92358ms step_avg:152.15ms step:618/1480 train_time:92512ms step_avg:152.16ms step:619/1480 train_time:92667ms step_avg:152.16ms step:620/1480 train_time:92821ms step_avg:152.16ms step:621/1480 train_time:92978ms step_avg:152.17ms step:622/1480 train_time:93132ms step_avg:152.18ms step:623/1480 train_time:93286ms step_avg:152.18ms step:624/1480 train_time:93441ms step_avg:152.18ms step:625/1480 train_time:93594ms step_avg:152.18ms step:625/1480 val_loss:3.6127 train_time:93666ms step_avg:152.30ms step:626/1480 train_time:93762ms step_avg:152.21ms step:627/1480 train_time:93910ms step_avg:152.21ms step:628/1480 train_time:94066ms step_avg:152.21ms step:629/1480 train_time:94220ms step_avg:152.21ms step:630/1480 train_time:94374ms step_avg:152.22ms step:631/1480 train_time:94529ms step_avg:152.22ms step:632/1480 train_time:94682ms step_avg:152.22ms step:633/1480 train_time:94838ms step_avg:152.23ms step:634/1480 train_time:94992ms step_avg:152.23ms step:635/1480 train_time:95147ms step_avg:152.24ms step:636/1480 train_time:95301ms step_avg:152.24ms step:637/1480 train_time:95456ms step_avg:152.24ms step:638/1480 train_time:95610ms step_avg:152.25ms step:639/1480 train_time:95766ms step_avg:152.25ms step:640/1480 train_time:95921ms step_avg:152.25ms step:641/1480 train_time:96075ms step_avg:152.26ms step:642/1480 train_time:96229ms step_avg:152.26ms step:643/1480 train_time:96384ms step_avg:152.26ms step:644/1480 train_time:96539ms step_avg:152.27ms step:645/1480 train_time:96693ms step_avg:152.27ms step:646/1480 train_time:96849ms step_avg:152.28ms step:647/1480 train_time:97004ms step_avg:152.28ms step:648/1480 train_time:97160ms 
step_avg:152.29ms step:649/1480 train_time:97315ms step_avg:152.29ms step:650/1480 train_time:97471ms step_avg:152.30ms step:651/1480 train_time:97625ms step_avg:152.30ms step:652/1480 train_time:97779ms step_avg:152.30ms step:653/1480 train_time:97933ms step_avg:152.31ms step:654/1480 train_time:98088ms step_avg:152.31ms step:655/1480 train_time:98242ms step_avg:152.31ms step:656/1480 train_time:98396ms step_avg:152.32ms step:657/1480 train_time:98552ms step_avg:152.32ms step:658/1480 train_time:98705ms step_avg:152.32ms step:659/1480 train_time:98861ms step_avg:152.33ms step:660/1480 train_time:99017ms step_avg:152.33ms step:661/1480 train_time:99174ms step_avg:152.34ms step:662/1480 train_time:99330ms step_avg:152.35ms step:663/1480 train_time:99486ms step_avg:152.35ms step:664/1480 train_time:99642ms step_avg:152.36ms step:665/1480 train_time:99799ms step_avg:152.36ms step:666/1480 train_time:99955ms step_avg:152.37ms step:667/1480 train_time:100110ms step_avg:152.37ms step:668/1480 train_time:100266ms step_avg:152.38ms step:669/1480 train_time:100425ms step_avg:152.39ms step:670/1480 train_time:100580ms step_avg:152.39ms step:671/1480 train_time:100736ms step_avg:152.40ms step:672/1480 train_time:100893ms step_avg:152.41ms step:673/1480 train_time:101049ms step_avg:152.41ms step:674/1480 train_time:101205ms step_avg:152.42ms step:675/1480 train_time:101363ms step_avg:152.43ms step:676/1480 train_time:101521ms step_avg:152.43ms step:677/1480 train_time:101678ms step_avg:152.44ms step:678/1480 train_time:101834ms step_avg:152.45ms step:679/1480 train_time:101991ms step_avg:152.45ms step:680/1480 train_time:102148ms step_avg:152.46ms step:681/1480 train_time:102303ms step_avg:152.46ms step:682/1480 train_time:102461ms step_avg:152.47ms step:683/1480 train_time:102619ms step_avg:152.48ms step:684/1480 train_time:102776ms step_avg:152.49ms step:685/1480 train_time:102932ms step_avg:152.49ms step:686/1480 train_time:103090ms step_avg:152.50ms step:687/1480 
train_time:103245ms step_avg:152.50ms step:688/1480 train_time:103403ms step_avg:152.51ms step:689/1480 train_time:103561ms step_avg:152.52ms step:690/1480 train_time:103718ms step_avg:152.53ms step:691/1480 train_time:103874ms step_avg:152.53ms step:692/1480 train_time:104030ms step_avg:152.54ms step:693/1480 train_time:104187ms step_avg:152.54ms step:694/1480 train_time:104344ms step_avg:152.55ms step:695/1480 train_time:104500ms step_avg:152.55ms step:696/1480 train_time:104656ms step_avg:152.56ms step:697/1480 train_time:104812ms step_avg:152.57ms step:698/1480 train_time:104969ms step_avg:152.57ms step:699/1480 train_time:105125ms step_avg:152.58ms step:700/1480 train_time:105281ms step_avg:152.58ms step:701/1480 train_time:105438ms step_avg:152.59ms step:702/1480 train_time:105593ms step_avg:152.59ms step:703/1480 train_time:105750ms step_avg:152.60ms step:704/1480 train_time:105905ms step_avg:152.60ms step:705/1480 train_time:106062ms step_avg:152.61ms step:706/1480 train_time:106220ms step_avg:152.62ms step:707/1480 train_time:106376ms step_avg:152.62ms step:708/1480 train_time:106533ms step_avg:152.63ms step:709/1480 train_time:106689ms step_avg:152.63ms step:710/1480 train_time:106844ms step_avg:152.63ms step:711/1480 train_time:107001ms step_avg:152.64ms step:712/1480 train_time:107158ms step_avg:152.65ms step:713/1480 train_time:107314ms step_avg:152.65ms step:714/1480 train_time:107471ms step_avg:152.66ms step:715/1480 train_time:107627ms step_avg:152.66ms step:716/1480 train_time:107782ms step_avg:152.67ms step:717/1480 train_time:107938ms step_avg:152.67ms step:718/1480 train_time:108094ms step_avg:152.67ms step:719/1480 train_time:108249ms step_avg:152.68ms step:720/1480 train_time:108406ms step_avg:152.68ms step:721/1480 train_time:108563ms step_avg:152.69ms step:722/1480 train_time:108720ms step_avg:152.70ms step:723/1480 train_time:108876ms step_avg:152.70ms step:724/1480 train_time:109033ms step_avg:152.71ms step:725/1480 train_time:109190ms 
step_avg:152.71ms step:726/1480 train_time:109346ms step_avg:152.72ms step:727/1480 train_time:109504ms step_avg:152.73ms step:728/1480 train_time:109661ms step_avg:152.73ms step:729/1480 train_time:109818ms step_avg:152.74ms step:730/1480 train_time:109974ms step_avg:152.74ms step:731/1480 train_time:110131ms step_avg:152.75ms step:732/1480 train_time:110287ms step_avg:152.75ms step:733/1480 train_time:110444ms step_avg:152.76ms step:734/1480 train_time:110601ms step_avg:152.76ms step:735/1480 train_time:110758ms step_avg:152.77ms step:736/1480 train_time:110914ms step_avg:152.77ms step:737/1480 train_time:111070ms step_avg:152.78ms step:738/1480 train_time:111225ms step_avg:152.78ms step:739/1480 train_time:111381ms step_avg:152.79ms step:740/1480 train_time:111540ms step_avg:152.79ms step:741/1480 train_time:111699ms step_avg:152.80ms step:742/1480 train_time:111855ms step_avg:152.81ms step:743/1480 train_time:112011ms step_avg:152.81ms step:744/1480 train_time:112167ms step_avg:152.82ms step:745/1480 train_time:112325ms step_avg:152.82ms step:746/1480 train_time:112481ms step_avg:152.83ms step:747/1480 train_time:112637ms step_avg:152.83ms step:748/1480 train_time:112794ms step_avg:152.84ms step:749/1480 train_time:112952ms step_avg:152.84ms step:750/1480 train_time:113107ms step_avg:152.85ms step:750/1480 val_loss:3.5565 train_time:113179ms step_avg:152.94ms step:751/1480 train_time:113272ms step_avg:152.86ms step:752/1480 train_time:113425ms step_avg:152.86ms step:753/1480 train_time:113581ms step_avg:152.87ms step:754/1480 train_time:113737ms step_avg:152.87ms step:755/1480 train_time:113892ms step_avg:152.87ms step:756/1480 train_time:114049ms step_avg:152.88ms step:757/1480 train_time:114206ms step_avg:152.89ms step:758/1480 train_time:114362ms step_avg:152.89ms step:759/1480 train_time:114534ms step_avg:152.92ms step:760/1480 train_time:114677ms step_avg:152.90ms step:761/1480 train_time:114833ms step_avg:152.91ms step:762/1480 train_time:114990ms 
step_avg:152.91ms step:763/1480 train_time:115146ms step_avg:152.92ms step:764/1480 train_time:115304ms step_avg:152.92ms step:765/1480 train_time:115460ms step_avg:152.93ms step:766/1480 train_time:115618ms step_avg:152.93ms step:767/1480 train_time:115775ms step_avg:152.94ms step:768/1480 train_time:115931ms step_avg:152.94ms step:769/1480 train_time:116088ms step_avg:152.95ms step:770/1480 train_time:116246ms step_avg:152.95ms step:771/1480 train_time:116401ms step_avg:152.96ms step:772/1480 train_time:116557ms step_avg:152.96ms step:773/1480 train_time:116715ms step_avg:152.97ms step:774/1480 train_time:116873ms step_avg:152.98ms step:775/1480 train_time:117031ms step_avg:152.98ms step:776/1480 train_time:117189ms step_avg:152.99ms step:777/1480 train_time:117349ms step_avg:153.00ms step:778/1480 train_time:117507ms step_avg:153.00ms step:779/1480 train_time:117663ms step_avg:153.01ms step:780/1480 train_time:117824ms step_avg:153.02ms step:781/1480 train_time:117982ms step_avg:153.02ms step:782/1480 train_time:118141ms step_avg:153.03ms step:783/1480 train_time:118300ms step_avg:153.04ms step:784/1480 train_time:118458ms step_avg:153.05ms step:785/1480 train_time:118616ms step_avg:153.05ms step:786/1480 train_time:118773ms step_avg:153.06ms step:787/1480 train_time:118930ms step_avg:153.06ms step:788/1480 train_time:119088ms step_avg:153.07ms step:789/1480 train_time:119246ms step_avg:153.08ms step:790/1480 train_time:119403ms step_avg:153.08ms step:791/1480 train_time:119564ms step_avg:153.09ms step:792/1480 train_time:119722ms step_avg:153.10ms step:793/1480 train_time:119879ms step_avg:153.10ms step:794/1480 train_time:120038ms step_avg:153.11ms step:795/1480 train_time:120197ms step_avg:153.12ms step:796/1480 train_time:120357ms step_avg:153.13ms step:797/1480 train_time:120517ms step_avg:153.13ms step:798/1480 train_time:120676ms step_avg:153.14ms step:799/1480 train_time:120837ms step_avg:153.15ms step:800/1480 train_time:120995ms step_avg:153.16ms 
step:801/1480 train_time:121152ms step_avg:153.16ms step:802/1480 train_time:121313ms step_avg:153.17ms step:803/1480 train_time:121470ms step_avg:153.18ms step:804/1480 train_time:121627ms step_avg:153.18ms step:805/1480 train_time:121785ms step_avg:153.19ms step:806/1480 train_time:121942ms step_avg:153.19ms step:807/1480 train_time:122098ms step_avg:153.20ms step:808/1480 train_time:122255ms step_avg:153.20ms step:809/1480 train_time:122412ms step_avg:153.21ms step:810/1480 train_time:122569ms step_avg:153.21ms step:811/1480 train_time:122727ms step_avg:153.22ms step:812/1480 train_time:122883ms step_avg:153.22ms step:813/1480 train_time:123041ms step_avg:153.23ms step:814/1480 train_time:123201ms step_avg:153.24ms step:815/1480 train_time:123359ms step_avg:153.24ms step:816/1480 train_time:123518ms step_avg:153.25ms step:817/1480 train_time:123676ms step_avg:153.25ms step:818/1480 train_time:123833ms step_avg:153.26ms step:819/1480 train_time:123993ms step_avg:153.27ms step:820/1480 train_time:124150ms step_avg:153.27ms step:821/1480 train_time:124307ms step_avg:153.28ms step:822/1480 train_time:124464ms step_avg:153.28ms step:823/1480 train_time:124624ms step_avg:153.29ms step:824/1480 train_time:124781ms step_avg:153.29ms step:825/1480 train_time:124941ms step_avg:153.30ms step:826/1480 train_time:125102ms step_avg:153.31ms step:827/1480 train_time:125262ms step_avg:153.32ms step:828/1480 train_time:125421ms step_avg:153.33ms step:829/1480 train_time:125580ms step_avg:153.33ms step:830/1480 train_time:125740ms step_avg:153.34ms step:831/1480 train_time:125898ms step_avg:153.35ms step:832/1480 train_time:126056ms step_avg:153.35ms step:833/1480 train_time:126215ms step_avg:153.36ms step:834/1480 train_time:126374ms step_avg:153.37ms step:835/1480 train_time:126532ms step_avg:153.37ms step:836/1480 train_time:126693ms step_avg:153.38ms step:837/1480 train_time:126850ms step_avg:153.39ms step:838/1480 train_time:127008ms step_avg:153.39ms step:839/1480 
train_time:127166ms step_avg:153.40ms step:840/1480 train_time:127324ms step_avg:153.40ms step:841/1480 train_time:127480ms step_avg:153.41ms step:842/1480 train_time:127641ms step_avg:153.41ms step:843/1480 train_time:127798ms step_avg:153.42ms step:844/1480 train_time:127954ms step_avg:153.42ms step:845/1480 train_time:128112ms step_avg:153.43ms step:846/1480 train_time:128271ms step_avg:153.43ms step:847/1480 train_time:128429ms step_avg:153.44ms step:848/1480 train_time:128586ms step_avg:153.44ms step:849/1480 train_time:128745ms step_avg:153.45ms step:850/1480 train_time:128904ms step_avg:153.46ms step:851/1480 train_time:129063ms step_avg:153.46ms step:852/1480 train_time:129221ms step_avg:153.47ms step:853/1480 train_time:129379ms step_avg:153.48ms step:854/1480 train_time:129537ms step_avg:153.48ms step:855/1480 train_time:129694ms step_avg:153.48ms step:856/1480 train_time:129853ms step_avg:153.49ms step:857/1480 train_time:130010ms step_avg:153.50ms step:858/1480 train_time:130169ms step_avg:153.50ms step:859/1480 train_time:130327ms step_avg:153.51ms step:860/1480 train_time:130483ms step_avg:153.51ms step:861/1480 train_time:130644ms step_avg:153.52ms step:862/1480 train_time:130806ms step_avg:153.53ms step:863/1480 train_time:130964ms step_avg:153.53ms step:864/1480 train_time:131123ms step_avg:153.54ms step:865/1480 train_time:131282ms step_avg:153.55ms step:866/1480 train_time:131441ms step_avg:153.55ms step:867/1480 train_time:131602ms step_avg:153.56ms step:868/1480 train_time:131760ms step_avg:153.57ms step:869/1480 train_time:131919ms step_avg:153.57ms step:870/1480 train_time:132076ms step_avg:153.58ms step:871/1480 train_time:132233ms step_avg:153.58ms step:872/1480 train_time:132391ms step_avg:153.59ms step:873/1480 train_time:132549ms step_avg:153.59ms step:874/1480 train_time:132709ms step_avg:153.60ms step:875/1480 train_time:132867ms step_avg:153.60ms step:875/1480 val_loss:3.5098 train_time:132939ms step_avg:153.69ms step:876/1480 
train_time:133030ms step_avg:153.61ms step:877/1480 train_time:133186ms step_avg:153.62ms step:878/1480 train_time:133343ms step_avg:153.62ms step:879/1480 train_time:133501ms step_avg:153.63ms step:880/1480 train_time:133658ms step_avg:153.63ms step:881/1480 train_time:133818ms step_avg:153.64ms step:882/1480 train_time:133977ms step_avg:153.64ms step:883/1480 train_time:134136ms step_avg:153.65ms step:884/1480 train_time:134298ms step_avg:153.66ms step:885/1480 train_time:134458ms step_avg:153.67ms step:886/1480 train_time:134617ms step_avg:153.67ms step:887/1480 train_time:134777ms step_avg:153.68ms step:888/1480 train_time:134941ms step_avg:153.69ms step:889/1480 train_time:135102ms step_avg:153.70ms step:890/1480 train_time:135258ms step_avg:153.70ms step:891/1480 train_time:135417ms step_avg:153.71ms step:892/1480 train_time:135577ms step_avg:153.72ms step:893/1480 train_time:135736ms step_avg:153.72ms step:894/1480 train_time:135896ms step_avg:153.73ms step:895/1480 train_time:136058ms step_avg:153.74ms step:896/1480 train_time:136215ms step_avg:153.74ms step:897/1480 train_time:136377ms step_avg:153.75ms step:898/1480 train_time:136538ms step_avg:153.76ms step:899/1480 train_time:136697ms step_avg:153.76ms step:900/1480 train_time:136855ms step_avg:153.77ms step:901/1480 train_time:137014ms step_avg:153.78ms step:902/1480 train_time:137173ms step_avg:153.78ms step:903/1480 train_time:137334ms step_avg:153.79ms step:904/1480 train_time:137494ms step_avg:153.80ms step:905/1480 train_time:137652ms step_avg:153.80ms step:906/1480 train_time:137814ms step_avg:153.81ms step:907/1480 train_time:137977ms step_avg:153.82ms step:908/1480 train_time:138135ms step_avg:153.83ms step:909/1480 train_time:138294ms step_avg:153.83ms step:910/1480 train_time:138458ms step_avg:153.84ms step:911/1480 train_time:138617ms step_avg:153.85ms step:912/1480 train_time:138778ms step_avg:153.86ms step:913/1480 train_time:138939ms step_avg:153.86ms step:914/1480 train_time:139100ms 
step_avg:153.87ms step:915/1480 train_time:139260ms step_avg:153.88ms step:916/1480 train_time:139420ms step_avg:153.88ms step:917/1480 train_time:139577ms step_avg:153.89ms step:918/1480 train_time:139737ms step_avg:153.90ms step:919/1480 train_time:139900ms step_avg:153.91ms step:920/1480 train_time:140058ms step_avg:153.91ms step:921/1480 train_time:140218ms step_avg:153.92ms step:922/1480 train_time:140378ms step_avg:153.92ms step:923/1480 train_time:140535ms step_avg:153.93ms step:924/1480 train_time:140694ms step_avg:153.93ms step:925/1480 train_time:140854ms step_avg:153.94ms step:926/1480 train_time:141013ms step_avg:153.94ms step:927/1480 train_time:141172ms step_avg:153.95ms step:928/1480 train_time:141333ms step_avg:153.96ms step:929/1480 train_time:141493ms step_avg:153.96ms step:930/1480 train_time:141653ms step_avg:153.97ms step:931/1480 train_time:141812ms step_avg:153.98ms step:932/1480 train_time:141971ms step_avg:153.98ms step:933/1480 train_time:142130ms step_avg:153.99ms step:934/1480 train_time:142290ms step_avg:153.99ms step:935/1480 train_time:142451ms step_avg:154.00ms step:936/1480 train_time:142610ms step_avg:154.01ms step:937/1480 train_time:142770ms step_avg:154.01ms step:938/1480 train_time:142927ms step_avg:154.02ms step:939/1480 train_time:143089ms step_avg:154.02ms step:940/1480 train_time:143250ms step_avg:154.03ms step:941/1480 train_time:143408ms step_avg:154.04ms step:942/1480 train_time:143566ms step_avg:154.04ms step:943/1480 train_time:143726ms step_avg:154.05ms step:944/1480 train_time:143888ms step_avg:154.06ms step:945/1480 train_time:144048ms step_avg:154.06ms step:946/1480 train_time:144209ms step_avg:154.07ms step:947/1480 train_time:144370ms step_avg:154.08ms step:948/1480 train_time:144530ms step_avg:154.08ms step:949/1480 train_time:144704ms step_avg:154.10ms step:950/1480 train_time:144850ms step_avg:154.10ms step:951/1480 train_time:145014ms step_avg:154.11ms step:952/1480 train_time:145175ms step_avg:154.11ms 
step:953/1480 train_time:145334ms step_avg:154.12ms step:954/1480 train_time:145497ms step_avg:154.13ms step:955/1480 train_time:145656ms step_avg:154.13ms step:956/1480 train_time:145815ms step_avg:154.14ms step:957/1480 train_time:145976ms step_avg:154.15ms step:958/1480 train_time:146140ms step_avg:154.16ms step:959/1480 train_time:146300ms step_avg:154.16ms step:960/1480 train_time:146460ms step_avg:154.17ms step:961/1480 train_time:146619ms step_avg:154.17ms step:962/1480 train_time:146777ms step_avg:154.18ms step:963/1480 train_time:146937ms step_avg:154.18ms step:964/1480 train_time:147099ms step_avg:154.19ms step:965/1480 train_time:147259ms step_avg:154.20ms step:966/1480 train_time:147418ms step_avg:154.20ms step:967/1480 train_time:147577ms step_avg:154.21ms step:968/1480 train_time:147735ms step_avg:154.21ms step:969/1480 train_time:147895ms step_avg:154.22ms step:970/1480 train_time:148053ms step_avg:154.22ms step:971/1480 train_time:148214ms step_avg:154.23ms step:972/1480 train_time:148372ms step_avg:154.23ms step:973/1480 train_time:148531ms step_avg:154.24ms step:974/1480 train_time:148692ms step_avg:154.24ms step:975/1480 train_time:148853ms step_avg:154.25ms step:976/1480 train_time:149014ms step_avg:154.26ms step:977/1480 train_time:149174ms step_avg:154.26ms step:978/1480 train_time:149333ms step_avg:154.27ms step:979/1480 train_time:149495ms step_avg:154.28ms step:980/1480 train_time:149656ms step_avg:154.28ms step:981/1480 train_time:149816ms step_avg:154.29ms step:982/1480 train_time:149974ms step_avg:154.29ms step:983/1480 train_time:150134ms step_avg:154.30ms step:984/1480 train_time:150292ms step_avg:154.30ms step:985/1480 train_time:150454ms step_avg:154.31ms step:986/1480 train_time:150615ms step_avg:154.32ms step:987/1480 train_time:150775ms step_avg:154.32ms step:988/1480 train_time:150933ms step_avg:154.33ms step:989/1480 train_time:151092ms step_avg:154.33ms step:990/1480 train_time:151255ms step_avg:154.34ms step:991/1480 
train_time:151416ms step_avg:154.35ms step:992/1480 train_time:151580ms step_avg:154.36ms step:993/1480 train_time:151748ms step_avg:154.37ms step:994/1480 train_time:151907ms step_avg:154.38ms step:995/1480 train_time:152067ms step_avg:154.38ms step:996/1480 train_time:152224ms step_avg:154.39ms step:997/1480 train_time:152384ms step_avg:154.39ms step:998/1480 train_time:152542ms step_avg:154.39ms step:999/1480 train_time:152703ms step_avg:154.40ms step:1000/1480 train_time:152864ms step_avg:154.41ms step:1000/1480 val_loss:3.4461 train_time:152937ms step_avg:154.48ms step:1001/1480 train_time:153033ms step_avg:154.42ms step:1002/1480 train_time:153191ms step_avg:154.43ms step:1003/1480 train_time:153353ms step_avg:154.43ms step:1004/1480 train_time:153515ms step_avg:154.44ms step:1005/1480 train_time:153675ms step_avg:154.45ms step:1006/1480 train_time:153836ms step_avg:154.45ms step:1007/1480 train_time:153996ms step_avg:154.46ms step:1008/1480 train_time:154157ms step_avg:154.47ms step:1009/1480 train_time:154321ms step_avg:154.48ms step:1010/1480 train_time:154480ms step_avg:154.48ms step:1011/1480 train_time:154640ms step_avg:154.49ms step:1012/1480 train_time:154798ms step_avg:154.49ms step:1013/1480 train_time:154959ms step_avg:154.50ms step:1014/1480 train_time:155119ms step_avg:154.50ms step:1015/1480 train_time:155281ms step_avg:154.51ms step:1016/1480 train_time:155442ms step_avg:154.51ms step:1017/1480 train_time:155605ms step_avg:154.52ms step:1018/1480 train_time:155767ms step_avg:154.53ms step:1019/1480 train_time:155928ms step_avg:154.54ms step:1020/1480 train_time:156089ms step_avg:154.54ms step:1021/1480 train_time:156249ms step_avg:154.55ms step:1022/1480 train_time:156408ms step_avg:154.55ms step:1023/1480 train_time:156572ms step_avg:154.56ms step:1024/1480 train_time:156731ms step_avg:154.57ms step:1025/1480 train_time:156893ms step_avg:154.57ms step:1026/1480 train_time:157054ms step_avg:154.58ms step:1027/1480 train_time:157214ms 
step_avg:154.59ms step:1028/1480 train_time:157377ms step_avg:154.59ms step:1029/1480 train_time:157540ms step_avg:154.60ms step:1030/1480 train_time:157701ms step_avg:154.61ms step:1031/1480 train_time:157859ms step_avg:154.61ms step:1032/1480 train_time:158023ms step_avg:154.62ms step:1033/1480 train_time:158182ms step_avg:154.63ms step:1034/1480 train_time:158344ms step_avg:154.63ms step:1035/1480 train_time:158505ms step_avg:154.64ms step:1036/1480 train_time:158666ms step_avg:154.65ms step:1037/1480 train_time:158827ms step_avg:154.65ms step:1038/1480 train_time:158989ms step_avg:154.66ms step:1039/1480 train_time:159151ms step_avg:154.67ms step:1040/1480 train_time:159311ms step_avg:154.67ms step:1041/1480 train_time:159471ms step_avg:154.68ms step:1042/1480 train_time:159631ms step_avg:154.68ms step:1043/1480 train_time:159789ms step_avg:154.68ms step:1044/1480 train_time:159950ms step_avg:154.69ms step:1045/1480 train_time:160112ms step_avg:154.70ms step:1046/1480 train_time:160272ms step_avg:154.70ms step:1047/1480 train_time:160431ms step_avg:154.71ms step:1048/1480 train_time:160592ms step_avg:154.71ms step:1049/1480 train_time:160753ms step_avg:154.72ms step:1050/1480 train_time:160914ms step_avg:154.72ms step:1051/1480 train_time:161076ms step_avg:154.73ms step:1052/1480 train_time:161237ms step_avg:154.74ms step:1053/1480 train_time:161397ms step_avg:154.74ms step:1054/1480 train_time:161558ms step_avg:154.75ms step:1055/1480 train_time:161718ms step_avg:154.75ms step:1056/1480 train_time:161878ms step_avg:154.76ms step:1057/1480 train_time:162037ms step_avg:154.76ms step:1058/1480 train_time:162197ms step_avg:154.77ms step:1059/1480 train_time:162359ms step_avg:154.78ms step:1060/1480 train_time:162520ms step_avg:154.78ms step:1061/1480 train_time:162678ms step_avg:154.78ms step:1062/1480 train_time:162837ms step_avg:154.79ms step:1063/1480 train_time:162995ms step_avg:154.79ms step:1064/1480 train_time:163153ms step_avg:154.79ms step:1065/1480 
train_time:163313ms step_avg:154.80ms step:1066/1480 train_time:163475ms step_avg:154.81ms step:1067/1480 train_time:163636ms step_avg:154.81ms step:1068/1480 train_time:163796ms step_avg:154.82ms step:1069/1480 train_time:163958ms step_avg:154.82ms step:1070/1480 train_time:164118ms step_avg:154.83ms step:1071/1480 train_time:164282ms step_avg:154.84ms step:1072/1480 train_time:164442ms step_avg:154.84ms step:1073/1480 train_time:164601ms step_avg:154.85ms step:1074/1480 train_time:164762ms step_avg:154.85ms step:1075/1480 train_time:164925ms step_avg:154.86ms step:1076/1480 train_time:165083ms step_avg:154.86ms step:1077/1480 train_time:165244ms step_avg:154.87ms step:1078/1480 train_time:165411ms step_avg:154.88ms step:1079/1480 train_time:165576ms step_avg:154.89ms step:1080/1480 train_time:165736ms step_avg:154.89ms step:1081/1480 train_time:165895ms step_avg:154.90ms step:1082/1480 train_time:166055ms step_avg:154.90ms step:1083/1480 train_time:166214ms step_avg:154.91ms step:1084/1480 train_time:166374ms step_avg:154.91ms step:1085/1480 train_time:166533ms step_avg:154.91ms step:1086/1480 train_time:166693ms step_avg:154.92ms step:1087/1480 train_time:166853ms step_avg:154.92ms step:1088/1480 train_time:167012ms step_avg:154.93ms step:1089/1480 train_time:167175ms step_avg:154.93ms step:1090/1480 train_time:167337ms step_avg:154.94ms step:1091/1480 train_time:167497ms step_avg:154.95ms step:1092/1480 train_time:167659ms step_avg:154.95ms step:1093/1480 train_time:167819ms step_avg:154.96ms step:1094/1480 train_time:167978ms step_avg:154.96ms step:1095/1480 train_time:168137ms step_avg:154.97ms step:1096/1480 train_time:168298ms step_avg:154.97ms step:1097/1480 train_time:168460ms step_avg:154.98ms step:1098/1480 train_time:168622ms step_avg:154.98ms step:1099/1480 train_time:168785ms step_avg:154.99ms step:1100/1480 train_time:168950ms step_avg:155.00ms step:1101/1480 train_time:169112ms step_avg:155.01ms step:1102/1480 train_time:169274ms step_avg:155.01ms 
step:1103/1480 train_time:169440ms step_avg:155.02ms step:1104/1480 train_time:169600ms step_avg:155.03ms step:1105/1480 train_time:169763ms step_avg:155.03ms step:1106/1480 train_time:169924ms step_avg:155.04ms step:1107/1480 train_time:170085ms step_avg:155.05ms step:1108/1480 train_time:170245ms step_avg:155.05ms step:1109/1480 train_time:170405ms step_avg:155.06ms step:1110/1480 train_time:170569ms step_avg:155.06ms step:1111/1480 train_time:170732ms step_avg:155.07ms step:1112/1480 train_time:170894ms step_avg:155.08ms step:1113/1480 train_time:171064ms step_avg:155.09ms step:1114/1480 train_time:171225ms step_avg:155.10ms step:1115/1480 train_time:171388ms step_avg:155.10ms step:1116/1480 train_time:171549ms step_avg:155.11ms step:1117/1480 train_time:171714ms step_avg:155.12ms step:1118/1480 train_time:171877ms step_avg:155.12ms step:1119/1480 train_time:172037ms step_avg:155.13ms step:1120/1480 train_time:172197ms step_avg:155.13ms step:1121/1480 train_time:172357ms step_avg:155.14ms step:1122/1480 train_time:172517ms step_avg:155.14ms step:1123/1480 train_time:172677ms step_avg:155.15ms step:1124/1480 train_time:172839ms step_avg:155.15ms step:1125/1480 train_time:172999ms step_avg:155.16ms step:1125/1480 val_loss:3.3909 train_time:173074ms step_avg:155.22ms step:1126/1480 train_time:173169ms step_avg:155.17ms step:1127/1480 train_time:173324ms step_avg:155.17ms step:1128/1480 train_time:173485ms step_avg:155.17ms step:1129/1480 train_time:173648ms step_avg:155.18ms step:1130/1480 train_time:173808ms step_avg:155.19ms step:1131/1480 train_time:173973ms step_avg:155.19ms step:1132/1480 train_time:174133ms step_avg:155.20ms step:1133/1480 train_time:174299ms step_avg:155.21ms step:1134/1480 train_time:174463ms step_avg:155.22ms step:1135/1480 train_time:174624ms step_avg:155.22ms step:1136/1480 train_time:174786ms step_avg:155.23ms step:1137/1480 train_time:174946ms step_avg:155.23ms step:1138/1480 train_time:175110ms step_avg:155.24ms step:1139/1480 
train_time:175285ms step_avg:155.26ms step:1140/1480 train_time:175430ms step_avg:155.25ms step:1141/1480 train_time:175595ms step_avg:155.26ms step:1142/1480 train_time:175757ms step_avg:155.26ms step:1143/1480 train_time:175923ms step_avg:155.27ms step:1144/1480 train_time:176085ms step_avg:155.28ms step:1145/1480 train_time:176244ms step_avg:155.28ms step:1146/1480 train_time:176406ms step_avg:155.29ms step:1147/1480 train_time:176566ms step_avg:155.29ms step:1148/1480 train_time:176726ms step_avg:155.30ms step:1149/1480 train_time:176888ms step_avg:155.30ms step:1150/1480 train_time:177048ms step_avg:155.31ms step:1151/1480 train_time:177214ms step_avg:155.31ms step:1152/1480 train_time:177378ms step_avg:155.32ms step:1153/1480 train_time:177544ms step_avg:155.33ms step:1154/1480 train_time:177705ms step_avg:155.34ms step:1155/1480 train_time:177865ms step_avg:155.34ms step:1156/1480 train_time:178031ms step_avg:155.35ms step:1157/1480 train_time:178195ms step_avg:155.36ms step:1158/1480 train_time:178355ms step_avg:155.36ms step:1159/1480 train_time:178517ms step_avg:155.37ms step:1160/1480 train_time:178679ms step_avg:155.37ms step:1161/1480 train_time:178843ms step_avg:155.38ms step:1162/1480 train_time:179006ms step_avg:155.39ms step:1163/1480 train_time:179167ms step_avg:155.39ms step:1164/1480 train_time:179328ms step_avg:155.40ms step:1165/1480 train_time:179487ms step_avg:155.40ms step:1166/1480 train_time:179648ms step_avg:155.40ms step:1167/1480 train_time:179808ms step_avg:155.41ms step:1168/1480 train_time:179967ms step_avg:155.41ms step:1169/1480 train_time:180129ms step_avg:155.42ms step:1170/1480 train_time:180290ms step_avg:155.42ms step:1171/1480 train_time:180451ms step_avg:155.43ms step:1172/1480 train_time:180611ms step_avg:155.43ms step:1173/1480 train_time:180776ms step_avg:155.44ms step:1174/1480 train_time:180946ms step_avg:155.45ms step:1175/1480 train_time:181109ms step_avg:155.46ms step:1176/1480 train_time:181271ms step_avg:155.46ms 
step:1177/1480 train_time:181440ms step_avg:155.48ms step:1178/1480 train_time:181602ms step_avg:155.48ms step:1179/1480 train_time:181761ms step_avg:155.48ms step:1180/1480 train_time:181928ms step_avg:155.49ms step:1181/1480 train_time:182089ms step_avg:155.50ms step:1182/1480 train_time:182250ms step_avg:155.50ms step:1183/1480 train_time:182412ms step_avg:155.51ms step:1184/1480 train_time:182573ms step_avg:155.51ms step:1185/1480 train_time:182739ms step_avg:155.52ms step:1186/1480 train_time:182903ms step_avg:155.53ms step:1187/1480 train_time:183073ms step_avg:155.54ms step:1188/1480 train_time:183232ms step_avg:155.54ms step:1189/1480 train_time:183394ms step_avg:155.55ms step:1190/1480 train_time:183557ms step_avg:155.56ms step:1191/1480 train_time:183721ms step_avg:155.56ms step:1192/1480 train_time:183881ms step_avg:155.57ms step:1193/1480 train_time:184041ms step_avg:155.57ms step:1194/1480 train_time:184203ms step_avg:155.58ms step:1195/1480 train_time:184365ms step_avg:155.58ms step:1196/1480 train_time:184539ms step_avg:155.60ms step:1197/1480 train_time:184701ms step_avg:155.60ms step:1198/1480 train_time:184868ms step_avg:155.61ms step:1199/1480 train_time:185029ms step_avg:155.62ms step:1200/1480 train_time:185190ms step_avg:155.62ms step:1201/1480 train_time:185352ms step_avg:155.63ms step:1202/1480 train_time:185522ms step_avg:155.64ms step:1203/1480 train_time:185688ms step_avg:155.65ms step:1204/1480 train_time:185852ms step_avg:155.65ms step:1205/1480 train_time:186013ms step_avg:155.66ms step:1206/1480 train_time:186175ms step_avg:155.66ms step:1207/1480 train_time:186336ms step_avg:155.67ms step:1208/1480 train_time:186497ms step_avg:155.67ms step:1209/1480 train_time:186660ms step_avg:155.68ms step:1210/1480 train_time:186826ms step_avg:155.69ms step:1211/1480 train_time:186989ms step_avg:155.69ms step:1212/1480 train_time:187152ms step_avg:155.70ms step:1213/1480 train_time:187316ms step_avg:155.71ms step:1214/1480 train_time:187483ms 
step_avg:155.72ms step:1215/1480 train_time:187645ms step_avg:155.72ms step:1216/1480 train_time:187806ms step_avg:155.73ms step:1217/1480 train_time:187968ms step_avg:155.73ms step:1218/1480 train_time:188130ms step_avg:155.74ms step:1219/1480 train_time:188300ms step_avg:155.75ms step:1220/1480 train_time:188462ms step_avg:155.75ms step:1221/1480 train_time:188624ms step_avg:155.76ms step:1222/1480 train_time:188784ms step_avg:155.76ms step:1223/1480 train_time:188946ms step_avg:155.77ms step:1224/1480 train_time:189111ms step_avg:155.77ms step:1225/1480 train_time:189275ms step_avg:155.78ms step:1226/1480 train_time:189442ms step_avg:155.79ms step:1227/1480 train_time:189606ms step_avg:155.80ms step:1228/1480 train_time:189767ms step_avg:155.80ms step:1229/1480 train_time:189929ms step_avg:155.81ms step:1230/1480 train_time:190098ms step_avg:155.82ms step:1231/1480 train_time:190265ms step_avg:155.83ms step:1232/1480 train_time:190429ms step_avg:155.83ms step:1233/1480 train_time:190589ms step_avg:155.84ms step:1234/1480 train_time:190751ms step_avg:155.84ms step:1235/1480 train_time:190919ms step_avg:155.85ms step:1236/1480 train_time:191080ms step_avg:155.86ms step:1237/1480 train_time:191242ms step_avg:155.86ms step:1238/1480 train_time:191414ms step_avg:155.87ms step:1239/1480 train_time:191577ms step_avg:155.88ms step:1240/1480 train_time:191742ms step_avg:155.89ms step:1241/1480 train_time:191907ms step_avg:155.89ms step:1242/1480 train_time:192068ms step_avg:155.90ms step:1243/1480 train_time:192230ms step_avg:155.90ms step:1244/1480 train_time:192391ms step_avg:155.91ms step:1245/1480 train_time:192555ms step_avg:155.91ms step:1246/1480 train_time:192720ms step_avg:155.92ms step:1247/1480 train_time:192883ms step_avg:155.93ms step:1248/1480 train_time:193044ms step_avg:155.93ms step:1249/1480 train_time:193205ms step_avg:155.94ms step:1250/1480 train_time:193366ms step_avg:155.94ms step:1250/1480 val_loss:3.3402 train_time:193442ms step_avg:156.00ms 
step:1251/1480 train_time:193534ms step_avg:155.95ms step:1252/1480 train_time:193696ms step_avg:155.96ms step:1253/1480 train_time:193857ms step_avg:155.96ms step:1254/1480 train_time:194018ms step_avg:155.96ms step:1255/1480 train_time:194191ms step_avg:155.98ms step:1256/1480 train_time:194356ms step_avg:155.98ms step:1257/1480 train_time:194518ms step_avg:155.99ms step:1258/1480 train_time:194682ms step_avg:156.00ms step:1259/1480 train_time:194847ms step_avg:156.00ms step:1260/1480 train_time:195007ms step_avg:156.01ms step:1261/1480 train_time:195171ms step_avg:156.01ms step:1262/1480 train_time:195334ms step_avg:156.02ms step:1263/1480 train_time:195500ms step_avg:156.03ms step:1264/1480 train_time:195660ms step_avg:156.03ms step:1265/1480 train_time:195822ms step_avg:156.03ms step:1266/1480 train_time:195987ms step_avg:156.04ms step:1267/1480 train_time:196149ms step_avg:156.05ms step:1268/1480 train_time:196311ms step_avg:156.05ms step:1269/1480 train_time:196476ms step_avg:156.06ms step:1270/1480 train_time:196639ms step_avg:156.06ms step:1271/1480 train_time:196804ms step_avg:156.07ms step:1272/1480 train_time:196966ms step_avg:156.07ms step:1273/1480 train_time:197129ms step_avg:156.08ms step:1274/1480 train_time:197293ms step_avg:156.09ms step:1275/1480 train_time:197455ms step_avg:156.09ms step:1276/1480 train_time:197614ms step_avg:156.09ms step:1277/1480 train_time:197776ms step_avg:156.10ms step:1278/1480 train_time:197936ms step_avg:156.10ms step:1279/1480 train_time:198098ms step_avg:156.11ms step:1280/1480 train_time:198267ms step_avg:156.12ms step:1281/1480 train_time:198429ms step_avg:156.12ms step:1282/1480 train_time:198588ms step_avg:156.12ms step:1283/1480 train_time:198751ms step_avg:156.13ms step:1284/1480 train_time:198913ms step_avg:156.13ms step:1285/1480 train_time:199074ms step_avg:156.14ms step:1286/1480 train_time:199235ms step_avg:156.14ms step:1287/1480 train_time:199399ms step_avg:156.15ms step:1288/1480 train_time:199563ms 
step_avg:156.15ms step:1289/1480 train_time:199732ms step_avg:156.16ms step:1290/1480 train_time:199901ms step_avg:156.17ms step:1291/1480 train_time:200067ms step_avg:156.18ms step:1292/1480 train_time:200231ms step_avg:156.19ms step:1293/1480 train_time:200397ms step_avg:156.19ms step:1294/1480 train_time:200560ms step_avg:156.20ms step:1295/1480 train_time:200722ms step_avg:156.20ms step:1296/1480 train_time:200886ms step_avg:156.21ms step:1297/1480 train_time:201049ms step_avg:156.22ms step:1298/1480 train_time:201211ms step_avg:156.22ms step:1299/1480 train_time:201374ms step_avg:156.22ms step:1300/1480 train_time:201534ms step_avg:156.23ms step:1301/1480 train_time:201695ms step_avg:156.23ms step:1302/1480 train_time:201860ms step_avg:156.24ms step:1303/1480 train_time:202029ms step_avg:156.25ms step:1304/1480 train_time:202194ms step_avg:156.26ms step:1305/1480 train_time:202355ms step_avg:156.26ms step:1306/1480 train_time:202521ms step_avg:156.27ms step:1307/1480 train_time:202681ms step_avg:156.27ms step:1308/1480 train_time:202844ms step_avg:156.27ms step:1309/1480 train_time:203010ms step_avg:156.28ms step:1310/1480 train_time:203173ms step_avg:156.29ms step:1311/1480 train_time:203334ms step_avg:156.29ms step:1312/1480 train_time:203498ms step_avg:156.30ms step:1313/1480 train_time:203664ms step_avg:156.30ms step:1314/1480 train_time:203828ms step_avg:156.31ms step:1315/1480 train_time:203993ms step_avg:156.32ms step:1316/1480 train_time:204152ms step_avg:156.32ms step:1317/1480 train_time:204312ms step_avg:156.32ms step:1318/1480 train_time:204479ms step_avg:156.33ms step:1319/1480 train_time:204644ms step_avg:156.34ms step:1320/1480 train_time:204811ms step_avg:156.34ms step:1321/1480 train_time:204975ms step_avg:156.35ms step:1322/1480 train_time:205146ms step_avg:156.36ms step:1323/1480 train_time:205308ms step_avg:156.37ms step:1324/1480 train_time:205472ms step_avg:156.37ms step:1325/1480 train_time:205642ms step_avg:156.38ms step:1326/1480 
train_time:205809ms step_avg:156.39ms step:1327/1480 train_time:205971ms step_avg:156.39ms step:1328/1480 train_time:206133ms step_avg:156.40ms step:1329/1480 train_time:206325ms step_avg:156.43ms step:1330/1480 train_time:206480ms step_avg:156.42ms step:1331/1480 train_time:206643ms step_avg:156.43ms step:1332/1480 train_time:206806ms step_avg:156.43ms step:1333/1480 train_time:206972ms step_avg:156.44ms step:1334/1480 train_time:207135ms step_avg:156.45ms step:1335/1480 train_time:207295ms step_avg:156.45ms step:1336/1480 train_time:207466ms step_avg:156.46ms step:1337/1480 train_time:207632ms step_avg:156.47ms step:1338/1480 train_time:207795ms step_avg:156.47ms step:1339/1480 train_time:207960ms step_avg:156.48ms step:1340/1480 train_time:208124ms step_avg:156.48ms step:1341/1480 train_time:208285ms step_avg:156.49ms step:1342/1480 train_time:208451ms step_avg:156.49ms step:1343/1480 train_time:208612ms step_avg:156.50ms step:1344/1480 train_time:208775ms step_avg:156.50ms step:1345/1480 train_time:208944ms step_avg:156.51ms step:1346/1480 train_time:209106ms step_avg:156.52ms step:1347/1480 train_time:209270ms step_avg:156.52ms step:1348/1480 train_time:209431ms step_avg:156.53ms step:1349/1480 train_time:209593ms step_avg:156.53ms step:1350/1480 train_time:209760ms step_avg:156.54ms step:1351/1480 train_time:209923ms step_avg:156.54ms step:1352/1480 train_time:210086ms step_avg:156.55ms step:1353/1480 train_time:210252ms step_avg:156.55ms step:1354/1480 train_time:210415ms step_avg:156.56ms step:1355/1480 train_time:210576ms step_avg:156.56ms step:1356/1480 train_time:210741ms step_avg:156.57ms step:1357/1480 train_time:210906ms step_avg:156.57ms step:1358/1480 train_time:211070ms step_avg:156.58ms step:1359/1480 train_time:211235ms step_avg:156.59ms step:1360/1480 train_time:211399ms step_avg:156.59ms step:1361/1480 train_time:211567ms step_avg:156.60ms step:1362/1480 train_time:211733ms step_avg:156.61ms step:1363/1480 train_time:211900ms step_avg:156.61ms 
step:1364/1480 train_time:212064ms step_avg:156.62ms step:1365/1480 train_time:212225ms step_avg:156.62ms step:1366/1480 train_time:212388ms step_avg:156.63ms step:1367/1480 train_time:212551ms step_avg:156.63ms step:1368/1480 train_time:212713ms step_avg:156.64ms step:1369/1480 train_time:212884ms step_avg:156.65ms step:1370/1480 train_time:213051ms step_avg:156.66ms step:1371/1480 train_time:213213ms step_avg:156.66ms step:1372/1480 train_time:213381ms step_avg:156.67ms step:1373/1480 train_time:213544ms step_avg:156.67ms step:1374/1480 train_time:213711ms step_avg:156.68ms step:1375/1480 train_time:213874ms step_avg:156.68ms step:1375/1480 val_loss:3.3016 train_time:213948ms step_avg:156.74ms step:1376/1480 train_time:214040ms step_avg:156.69ms step:1377/1480 train_time:214203ms step_avg:156.70ms step:1378/1480 train_time:214366ms step_avg:156.70ms step:1379/1480 train_time:214530ms step_avg:156.71ms step:1380/1480 train_time:214693ms step_avg:156.71ms step:1381/1480 train_time:214862ms step_avg:156.72ms step:1382/1480 train_time:215026ms step_avg:156.72ms step:1383/1480 train_time:215189ms step_avg:156.73ms step:1384/1480 train_time:215355ms step_avg:156.74ms step:1385/1480 train_time:215514ms step_avg:156.74ms step:1386/1480 train_time:215678ms step_avg:156.74ms step:1387/1480 train_time:215844ms step_avg:156.75ms step:1388/1480 train_time:216006ms step_avg:156.75ms step:1389/1480 train_time:216171ms step_avg:156.76ms step:1390/1480 train_time:216331ms step_avg:156.76ms step:1391/1480 train_time:216492ms step_avg:156.76ms step:1392/1480 train_time:216658ms step_avg:156.77ms step:1393/1480 train_time:216821ms step_avg:156.78ms step:1394/1480 train_time:216986ms step_avg:156.78ms step:1395/1480 train_time:217148ms step_avg:156.79ms step:1396/1480 train_time:217311ms step_avg:156.79ms step:1397/1480 train_time:217472ms step_avg:156.79ms step:1398/1480 train_time:217633ms step_avg:156.80ms step:1399/1480 train_time:217794ms step_avg:156.80ms step:1400/1480 
train_time:217963ms step_avg:156.81ms step:1401/1480 train_time:218124ms step_avg:156.81ms step:1402/1480 train_time:218287ms step_avg:156.82ms step:1403/1480 train_time:218453ms step_avg:156.82ms step:1404/1480 train_time:218615ms step_avg:156.83ms step:1405/1480 train_time:218780ms step_avg:156.83ms step:1406/1480 train_time:218945ms step_avg:156.84ms step:1407/1480 train_time:219107ms step_avg:156.84ms step:1408/1480 train_time:219268ms step_avg:156.84ms step:1409/1480 train_time:219439ms step_avg:156.85ms step:1410/1480 train_time:219604ms step_avg:156.86ms step:1411/1480 train_time:219766ms step_avg:156.86ms step:1412/1480 train_time:219928ms step_avg:156.87ms step:1413/1480 train_time:220090ms step_avg:156.87ms step:1414/1480 train_time:220252ms step_avg:156.87ms step:1415/1480 train_time:220416ms step_avg:156.88ms step:1416/1480 train_time:220590ms step_avg:156.89ms step:1417/1480 train_time:220754ms step_avg:156.90ms step:1418/1480 train_time:220919ms step_avg:156.90ms step:1419/1480 train_time:221084ms step_avg:156.91ms step:1420/1480 train_time:221248ms step_avg:156.91ms step:1421/1480 train_time:221412ms step_avg:156.92ms step:1422/1480 train_time:221577ms step_avg:156.92ms step:1423/1480 train_time:221740ms step_avg:156.93ms step:1424/1480 train_time:221908ms step_avg:156.94ms step:1425/1480 train_time:222077ms step_avg:156.94ms step:1426/1480 train_time:222242ms step_avg:156.95ms step:1427/1480 train_time:222408ms step_avg:156.96ms step:1428/1480 train_time:222570ms step_avg:156.96ms step:1429/1480 train_time:222731ms step_avg:156.96ms step:1430/1480 train_time:222895ms step_avg:156.97ms step:1431/1480 train_time:223061ms step_avg:156.97ms step:1432/1480 train_time:223229ms step_avg:156.98ms step:1433/1480 train_time:223397ms step_avg:156.99ms step:1434/1480 train_time:223566ms step_avg:157.00ms step:1435/1480 train_time:223732ms step_avg:157.00ms step:1436/1480 train_time:223898ms step_avg:157.01ms step:1437/1480 train_time:224061ms step_avg:157.02ms 
step:1438/1480 train_time:224222ms step_avg:157.02ms step:1439/1480 train_time:224389ms step_avg:157.03ms step:1440/1480 train_time:224551ms step_avg:157.03ms step:1441/1480 train_time:224714ms step_avg:157.03ms step:1442/1480 train_time:224882ms step_avg:157.04ms step:1443/1480 train_time:225054ms step_avg:157.05ms step:1444/1480 train_time:225217ms step_avg:157.05ms step:1445/1480 train_time:225381ms step_avg:157.06ms step:1446/1480 train_time:225548ms step_avg:157.07ms step:1447/1480 train_time:225715ms step_avg:157.07ms step:1448/1480 train_time:225878ms step_avg:157.08ms step:1449/1480 train_time:226043ms step_avg:157.08ms step:1450/1480 train_time:226208ms step_avg:157.09ms step:1451/1480 train_time:226370ms step_avg:157.09ms step:1452/1480 train_time:226534ms step_avg:157.10ms step:1453/1480 train_time:226696ms step_avg:157.10ms step:1454/1480 train_time:226859ms step_avg:157.10ms step:1455/1480 train_time:227031ms step_avg:157.11ms step:1456/1480 train_time:227194ms step_avg:157.12ms step:1457/1480 train_time:227356ms step_avg:157.12ms step:1458/1480 train_time:227519ms step_avg:157.13ms step:1459/1480 train_time:227686ms step_avg:157.13ms step:1460/1480 train_time:227849ms step_avg:157.14ms step:1461/1480 train_time:228013ms step_avg:157.14ms step:1462/1480 train_time:228177ms step_avg:157.15ms step:1463/1480 train_time:228343ms step_avg:157.15ms step:1464/1480 train_time:228509ms step_avg:157.16ms step:1465/1480 train_time:228673ms step_avg:157.16ms step:1466/1480 train_time:228835ms step_avg:157.17ms step:1467/1480 train_time:229002ms step_avg:157.17ms step:1468/1480 train_time:229167ms step_avg:157.18ms step:1469/1480 train_time:229329ms step_avg:157.18ms step:1470/1480 train_time:229497ms step_avg:157.19ms step:1471/1480 train_time:229670ms step_avg:157.20ms step:1472/1480 train_time:229839ms step_avg:157.21ms step:1473/1480 train_time:230003ms step_avg:157.21ms step:1474/1480 train_time:230169ms step_avg:157.22ms step:1475/1480 train_time:230337ms 
step_avg:157.23ms step:1476/1480 train_time:230501ms step_avg:157.23ms step:1477/1480 train_time:230670ms step_avg:157.24ms step:1478/1480 train_time:230840ms step_avg:157.25ms step:1479/1480 train_time:231007ms step_avg:157.25ms step:1480/1480 train_time:231169ms step_avg:157.26ms step:1480/1480 val_loss:3.2828 train_time:231244ms step_avg:157.31ms peak memory consumption: 34239 MiB