import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Args:
        G: a 2D tensor (asserted).
        steps: number of Newton-Schulz iterations to run.
        eps: added to the norm of G before dividing, to avoid division by zero.

    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # iterate on the wide orientation so that X @ X.T is the smaller of the two Gram matrices
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X


class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """

    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        # the distributed layout is read from the environment (set by the launcher)
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct element count, so that every parameter in a group
        # fits the group's fixed-size all_gather buffers (one buffer per rank)
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are processed in chunks of `world_size`: this rank
        orthogonalizes the momentum update of the parameter at offset `rank` within the chunk,
        and an async all_gather collects every rank's update. The gathered updates of a chunk
        are applied right before the next chunk's all_gather is launched, so the Newton-Schulz
        work on the next chunk overlaps with the communication of the previous one.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None

            def update_prev():
                # wait for the in-flight all_gather (if any) and apply the gathered updates
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # step size is scaled up by sqrt(rows / cols) for tall matrices
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )

            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # the buffers are reused by every chunk: consume the previous gather first
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))


class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))


class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq_len, heads, head_dim): dim 1 is the sequence, dim 3 is rotated
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # rebuild the tables only when the sequence length changes
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # learnable mixing weights between this layer's values and the token value embedding
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y


class MLP(nn.Module):
    """Feed-forward block: 4x expansion, squared-ReLU activation, zero-initialized output projection."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    """Transformer block: learned mix of the stream with the initial embedding, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # [weight on the running stream x, weight on the initial embedding x0]
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x


class ValueEmbedding(nn.Module):
    """Six token-value embedding tables, served to twelve layers in mirrored (U-net) order."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        # entries 0..5 are the tables in order, entries 6..11 are the same tensors reversed
        return ve + ve[::-1]


# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size: int = 50304
    num_layers: int = 12
    num_heads: int = 6 # head dim 128 suggested by @Grad62304977
    model_dim: int = 768


class GPT(nn.Module):
    """GPT model with U-net style skip connections, token value embeddings and document-aware sliding-window attention."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the cross-entropy loss of one flat token sequence.

        Args:
            inputs: 1-D tensor of token ids; its length must be a multiple of 128.
            targets: token ids the logits are scored against.
            sliding_window_num_blocks: attention window size, in blocks of 128 tokens.

        Returns:
            The scalar cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        assert len(inputs) % BLOCK_SIZE == 0, "sequence length must be a multiple of BLOCK_SIZE"
        # derived from the input instead of hard-coding 512, which was only valid for 64K-token sequences
        total_num_blocks = len(inputs) // BLOCK_SIZE
        # running document index per token: token id 50256 (GPT-2's end-of-text) starts a new document
        docs = (inputs == 50256).cumsum(0)
        # first / last document index seen inside each block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal and restricted to the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # per-row count of active blocks, plus block indices ordered with the active ones first
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_num_blocks, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # blocks that are active but not fully active still need the token-level mask_mod
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a data shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)


def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256-int32 header into a pinned CPU tensor."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens


class DistributedDataLoader:
    """Serves `seq_len`-token (inputs, targets) pairs from sharded .bin files, strided across processes."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Rewind to the start of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        # each process starts at its own seq_len-sized offset into the shard
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return the next (inputs, targets) pair on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin: str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin: str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size: int = 8 # batch size, in sequences, across all devices
    sequence_length: int = 64*1024 # sequence length, in tokens
    num_iterations: int = 1480 # number of iterations to run
    warmup_iters: int = 0
    cooldown_iters: int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay: float = 0
    # evaluation and logging hyperparams
    val_loss_every: int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every: int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    logdir = Path("logs") / f"{run_id}"
    # parents=True: on a fresh checkout "logs/" itself may not exist yet
    logdir.mkdir(parents=True, exist_ok=True)
    logfile = Path("logs") / f"{run_id}.txt"
    print(logfile.stem)
    # create the log file
    with logfile.open("w") as f:
        # begin the log by printing this file (the Python code)
        print(code, file=f)
        print("=" * 100, file=f)

def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, echo it to stdout."""
    if master_process:
        with logfile.open("a") as f:
            if not logonly:
                print(s)
            print(s, file=f)

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running python {sys.version}")
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# calculate the number of steps to take in the val loop.
assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0
val_steps = args.val_tokens // (args.sequence_length * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# prefetch the first training batch
inputs_train, targets_train = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 08:16:44 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 115W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29409ms step_avg:nanms step:2/1480 train_time:29518ms step_avg:nanms step:3/1480 train_time:29638ms step_avg:nanms step:4/1480 train_time:29778ms step_avg:nanms step:5/1480 train_time:29919ms step_avg:nanms step:6/1480 train_time:30061ms step_avg:nanms step:7/1480 train_time:30204ms step_avg:nanms step:8/1480 train_time:30352ms step_avg:nanms step:9/1480 train_time:30489ms step_avg:nanms step:10/1480 train_time:30634ms step_avg:nanms step:11/1480 train_time:149ms step_avg:nanms step:12/1480 train_time:276ms step_avg:nanms step:13/1480 train_time:421ms step_avg:140.17ms step:14/1480 train_time:564ms step_avg:140.90ms step:15/1480 train_time:705ms step_avg:141.08ms step:16/1480 train_time:848ms step_avg:141.27ms step:17/1480 train_time:990ms step_avg:141.40ms step:18/1480 train_time:1131ms step_avg:141.36ms step:19/1480 train_time:1273ms step_avg:141.48ms step:20/1480 train_time:1417ms step_avg:141.70ms step:21/1480 train_time:1562ms step_avg:141.97ms step:22/1480 train_time:1704ms step_avg:142.01ms step:23/1480 train_time:1846ms step_avg:142.02ms step:24/1480 train_time:1989ms step_avg:142.08ms step:25/1480 train_time:2132ms step_avg:142.13ms step:26/1480 train_time:2273ms step_avg:142.06ms step:27/1480 train_time:2414ms step_avg:142.02ms step:28/1480 train_time:2557ms step_avg:142.06ms 
step:29/1480 train_time:2701ms step_avg:142.13ms step:30/1480 train_time:2843ms step_avg:142.16ms step:31/1480 train_time:2986ms step_avg:142.18ms step:32/1480 train_time:3128ms step_avg:142.19ms step:33/1480 train_time:3270ms step_avg:142.19ms step:34/1480 train_time:3412ms step_avg:142.19ms step:35/1480 train_time:3556ms step_avg:142.24ms step:36/1480 train_time:3700ms step_avg:142.31ms step:37/1480 train_time:3844ms step_avg:142.38ms step:38/1480 train_time:3987ms step_avg:142.40ms step:39/1480 train_time:4130ms step_avg:142.41ms step:40/1480 train_time:4695ms step_avg:156.49ms step:41/1480 train_time:4790ms step_avg:154.53ms step:42/1480 train_time:4932ms step_avg:154.12ms step:43/1480 train_time:5073ms step_avg:153.74ms step:44/1480 train_time:5215ms step_avg:153.39ms step:45/1480 train_time:5358ms step_avg:153.09ms step:46/1480 train_time:5500ms step_avg:152.78ms step:47/1480 train_time:5644ms step_avg:152.54ms step:48/1480 train_time:5787ms step_avg:152.28ms step:49/1480 train_time:5930ms step_avg:152.05ms step:50/1480 train_time:6071ms step_avg:151.78ms step:51/1480 train_time:6214ms step_avg:151.56ms step:52/1480 train_time:6357ms step_avg:151.35ms step:53/1480 train_time:6501ms step_avg:151.18ms step:54/1480 train_time:6646ms step_avg:151.05ms step:55/1480 train_time:6788ms step_avg:150.84ms step:56/1480 train_time:6930ms step_avg:150.65ms step:57/1480 train_time:7071ms step_avg:150.45ms step:58/1480 train_time:7213ms step_avg:150.27ms step:59/1480 train_time:7355ms step_avg:150.09ms step:60/1480 train_time:7497ms step_avg:149.95ms step:61/1480 train_time:7642ms step_avg:149.83ms step:62/1480 train_time:7786ms step_avg:149.73ms step:63/1480 train_time:7929ms step_avg:149.61ms step:64/1480 train_time:8071ms step_avg:149.46ms step:65/1480 train_time:8213ms step_avg:149.33ms step:66/1480 train_time:8356ms step_avg:149.21ms step:67/1480 train_time:8499ms step_avg:149.11ms step:68/1480 train_time:8643ms step_avg:149.02ms step:69/1480 train_time:8785ms 
step_avg:148.89ms step:70/1480 train_time:8927ms step_avg:148.78ms step:71/1480 train_time:9070ms step_avg:148.69ms step:72/1480 train_time:9212ms step_avg:148.58ms step:73/1480 train_time:9354ms step_avg:148.47ms step:74/1480 train_time:9497ms step_avg:148.39ms step:75/1480 train_time:9639ms step_avg:148.30ms step:76/1480 train_time:9786ms step_avg:148.28ms step:77/1480 train_time:9928ms step_avg:148.18ms step:78/1480 train_time:10069ms step_avg:148.07ms step:79/1480 train_time:10210ms step_avg:147.98ms step:80/1480 train_time:10751ms step_avg:153.59ms step:81/1480 train_time:11251ms step_avg:158.46ms step:82/1480 train_time:11353ms step_avg:157.68ms step:83/1480 train_time:11495ms step_avg:157.46ms step:84/1480 train_time:11638ms step_avg:157.27ms step:85/1480 train_time:11780ms step_avg:157.07ms step:86/1480 train_time:11924ms step_avg:156.90ms step:87/1480 train_time:12068ms step_avg:156.72ms step:88/1480 train_time:12209ms step_avg:156.53ms step:89/1480 train_time:12356ms step_avg:156.40ms step:90/1480 train_time:12500ms step_avg:156.25ms step:91/1480 train_time:12645ms step_avg:156.11ms step:92/1480 train_time:12788ms step_avg:155.95ms step:93/1480 train_time:12928ms step_avg:155.76ms step:94/1480 train_time:13070ms step_avg:155.60ms step:95/1480 train_time:13212ms step_avg:155.43ms step:96/1480 train_time:13354ms step_avg:155.28ms step:97/1480 train_time:13497ms step_avg:155.14ms step:98/1480 train_time:13641ms step_avg:155.01ms step:99/1480 train_time:13784ms step_avg:154.87ms step:100/1480 train_time:13927ms step_avg:154.75ms step:101/1480 train_time:14081ms step_avg:154.73ms step:102/1480 train_time:14209ms step_avg:154.45ms step:103/1480 train_time:14352ms step_avg:154.33ms step:104/1480 train_time:14497ms step_avg:154.22ms step:105/1480 train_time:14641ms step_avg:154.12ms step:106/1480 train_time:14784ms step_avg:153.99ms step:107/1480 train_time:14927ms step_avg:153.89ms step:108/1480 train_time:15068ms step_avg:153.76ms step:109/1480 
train_time:15211ms step_avg:153.65ms step:110/1480 train_time:15354ms step_avg:153.54ms step:111/1480 train_time:15498ms step_avg:153.44ms step:112/1480 train_time:15645ms step_avg:153.38ms step:113/1480 train_time:15790ms step_avg:153.30ms step:114/1480 train_time:15936ms step_avg:153.23ms step:115/1480 train_time:16084ms step_avg:153.18ms step:116/1480 train_time:16229ms step_avg:153.10ms step:117/1480 train_time:16373ms step_avg:153.02ms step:118/1480 train_time:16519ms step_avg:152.96ms step:119/1480 train_time:16666ms step_avg:152.89ms step:120/1480 train_time:16809ms step_avg:152.81ms step:121/1480 train_time:16955ms step_avg:152.75ms step:122/1480 train_time:17101ms step_avg:152.69ms step:123/1480 train_time:17247ms step_avg:152.63ms step:124/1480 train_time:17392ms step_avg:152.56ms step:125/1480 train_time:17539ms step_avg:152.51ms step:125/1480 val_loss:4.4046 train_time:17604ms step_avg:153.08ms step:126/1480 train_time:17695ms step_avg:152.55ms step:127/1480 train_time:17841ms step_avg:152.49ms step:128/1480 train_time:17988ms step_avg:152.44ms step:129/1480 train_time:18133ms step_avg:152.38ms step:130/1480 train_time:18278ms step_avg:152.32ms step:131/1480 train_time:18424ms step_avg:152.26ms step:132/1480 train_time:18570ms step_avg:152.22ms step:133/1480 train_time:18716ms step_avg:152.16ms step:134/1480 train_time:18861ms step_avg:152.11ms step:135/1480 train_time:19007ms step_avg:152.06ms step:136/1480 train_time:19154ms step_avg:152.01ms step:137/1480 train_time:19298ms step_avg:151.95ms step:138/1480 train_time:19443ms step_avg:151.90ms step:139/1480 train_time:19590ms step_avg:151.86ms step:140/1480 train_time:19735ms step_avg:151.81ms step:141/1480 train_time:19882ms step_avg:151.77ms step:142/1480 train_time:20028ms step_avg:151.73ms step:143/1480 train_time:20174ms step_avg:151.68ms step:144/1480 train_time:20318ms step_avg:151.63ms step:145/1480 train_time:20465ms step_avg:151.59ms step:146/1480 train_time:20612ms step_avg:151.56ms 
step:147/1480 train_time:20759ms step_avg:151.52ms step:148/1480 train_time:20904ms step_avg:151.48ms step:149/1480 train_time:21051ms step_avg:151.45ms step:150/1480 train_time:21196ms step_avg:151.40ms step:151/1480 train_time:21341ms step_avg:151.35ms step:152/1480 train_time:21487ms step_avg:151.32ms step:153/1480 train_time:21633ms step_avg:151.28ms step:154/1480 train_time:21779ms step_avg:151.25ms step:155/1480 train_time:21925ms step_avg:151.20ms step:156/1480 train_time:22071ms step_avg:151.17ms step:157/1480 train_time:22216ms step_avg:151.13ms step:158/1480 train_time:22360ms step_avg:151.08ms step:159/1480 train_time:22506ms step_avg:151.05ms step:160/1480 train_time:22652ms step_avg:151.01ms step:161/1480 train_time:22797ms step_avg:150.97ms step:162/1480 train_time:22942ms step_avg:150.94ms step:163/1480 train_time:23090ms step_avg:150.92ms step:164/1480 train_time:23235ms step_avg:150.88ms step:165/1480 train_time:23381ms step_avg:150.84ms step:166/1480 train_time:23527ms step_avg:150.81ms step:167/1480 train_time:23673ms step_avg:150.78ms step:168/1480 train_time:23817ms step_avg:150.74ms step:169/1480 train_time:23964ms step_avg:150.72ms step:170/1480 train_time:24111ms step_avg:150.69ms step:171/1480 train_time:24256ms step_avg:150.66ms step:172/1480 train_time:24400ms step_avg:150.62ms step:173/1480 train_time:24546ms step_avg:150.59ms step:174/1480 train_time:24692ms step_avg:150.56ms step:175/1480 train_time:24837ms step_avg:150.52ms step:176/1480 train_time:24981ms step_avg:150.49ms step:177/1480 train_time:25127ms step_avg:150.46ms step:178/1480 train_time:25274ms step_avg:150.44ms step:179/1480 train_time:25419ms step_avg:150.41ms step:180/1480 train_time:25960ms step_avg:152.71ms step:181/1480 train_time:26065ms step_avg:152.43ms step:182/1480 train_time:26212ms step_avg:152.39ms step:183/1480 train_time:26356ms step_avg:152.35ms step:184/1480 train_time:26500ms step_avg:152.30ms step:185/1480 train_time:26645ms step_avg:152.26ms 
step:186/1480 train_time:26791ms step_avg:152.22ms step:187/1480 train_time:26938ms step_avg:152.19ms step:188/1480 train_time:27084ms step_avg:152.16ms step:189/1480 train_time:27263ms step_avg:152.31ms step:190/1480 train_time:27375ms step_avg:152.08ms step:191/1480 train_time:27519ms step_avg:152.04ms step:192/1480 train_time:27665ms step_avg:152.01ms step:193/1480 train_time:27811ms step_avg:151.97ms step:194/1480 train_time:27957ms step_avg:151.94ms step:195/1480 train_time:28102ms step_avg:151.90ms step:196/1480 train_time:28248ms step_avg:151.87ms step:197/1480 train_time:28395ms step_avg:151.84ms step:198/1480 train_time:28540ms step_avg:151.81ms step:199/1480 train_time:28687ms step_avg:151.78ms step:200/1480 train_time:28832ms step_avg:151.75ms step:201/1480 train_time:28982ms step_avg:151.74ms step:202/1480 train_time:29124ms step_avg:151.69ms step:203/1480 train_time:29271ms step_avg:151.66ms step:204/1480 train_time:29416ms step_avg:151.63ms step:205/1480 train_time:29561ms step_avg:151.59ms step:206/1480 train_time:29706ms step_avg:151.56ms step:207/1480 train_time:29854ms step_avg:151.54ms step:208/1480 train_time:29998ms step_avg:151.50ms step:209/1480 train_time:30143ms step_avg:151.47ms step:210/1480 train_time:30289ms step_avg:151.44ms step:211/1480 train_time:30434ms step_avg:151.41ms step:212/1480 train_time:30582ms step_avg:151.39ms step:213/1480 train_time:30727ms step_avg:151.37ms step:214/1480 train_time:30874ms step_avg:151.34ms step:215/1480 train_time:31018ms step_avg:151.31ms step:216/1480 train_time:31164ms step_avg:151.28ms step:217/1480 train_time:31310ms step_avg:151.26ms step:218/1480 train_time:31455ms step_avg:151.23ms step:219/1480 train_time:31599ms step_avg:151.19ms step:220/1480 train_time:31745ms step_avg:151.16ms step:221/1480 train_time:32274ms step_avg:152.96ms step:222/1480 train_time:32780ms step_avg:154.62ms step:223/1480 train_time:32889ms step_avg:154.41ms step:224/1480 train_time:33037ms step_avg:154.38ms 
step:225/1480 train_time:33186ms step_avg:154.35ms step:226/1480 train_time:33334ms step_avg:154.32ms step:227/1480 train_time:33481ms step_avg:154.29ms step:228/1480 train_time:33629ms step_avg:154.26ms step:229/1480 train_time:33778ms step_avg:154.24ms step:230/1480 train_time:33927ms step_avg:154.21ms step:231/1480 train_time:34077ms step_avg:154.19ms step:232/1480 train_time:34226ms step_avg:154.17ms step:233/1480 train_time:34375ms step_avg:154.15ms step:234/1480 train_time:34522ms step_avg:154.12ms step:235/1480 train_time:34671ms step_avg:154.09ms step:236/1480 train_time:34818ms step_avg:154.06ms step:237/1480 train_time:34967ms step_avg:154.04ms step:238/1480 train_time:35117ms step_avg:154.02ms step:239/1480 train_time:35266ms step_avg:154.00ms step:240/1480 train_time:35416ms step_avg:153.98ms step:241/1480 train_time:35564ms step_avg:153.96ms step:242/1480 train_time:35712ms step_avg:153.93ms step:243/1480 train_time:35860ms step_avg:153.91ms step:244/1480 train_time:36009ms step_avg:153.88ms step:245/1480 train_time:36157ms step_avg:153.86ms step:246/1480 train_time:36306ms step_avg:153.84ms step:247/1480 train_time:36455ms step_avg:153.82ms step:248/1480 train_time:36602ms step_avg:153.79ms step:249/1480 train_time:36752ms step_avg:153.78ms step:250/1480 train_time:36900ms step_avg:153.75ms step:250/1480 val_loss:3.9869 train_time:36966ms step_avg:154.03ms step:251/1480 train_time:37059ms step_avg:153.77ms step:252/1480 train_time:37205ms step_avg:153.74ms step:253/1480 train_time:37354ms step_avg:153.72ms step:254/1480 train_time:37503ms step_avg:153.70ms step:255/1480 train_time:37650ms step_avg:153.67ms step:256/1480 train_time:37798ms step_avg:153.65ms step:257/1480 train_time:37946ms step_avg:153.63ms step:258/1480 train_time:38096ms step_avg:153.61ms step:259/1480 train_time:38245ms step_avg:153.59ms step:260/1480 train_time:38394ms step_avg:153.58ms step:261/1480 train_time:38544ms step_avg:153.56ms step:262/1480 train_time:38692ms 
step_avg:153.54ms step:263/1480 train_time:38841ms step_avg:153.52ms step:264/1480 train_time:38989ms step_avg:153.50ms step:265/1480 train_time:39139ms step_avg:153.49ms step:266/1480 train_time:39287ms step_avg:153.47ms step:267/1480 train_time:39437ms step_avg:153.45ms step:268/1480 train_time:39585ms step_avg:153.43ms step:269/1480 train_time:39732ms step_avg:153.41ms step:270/1480 train_time:39881ms step_avg:153.39ms step:271/1480 train_time:40029ms step_avg:153.37ms step:272/1480 train_time:40179ms step_avg:153.35ms step:273/1480 train_time:40326ms step_avg:153.33ms step:274/1480 train_time:40475ms step_avg:153.31ms step:275/1480 train_time:40623ms step_avg:153.30ms step:276/1480 train_time:40772ms step_avg:153.28ms step:277/1480 train_time:40921ms step_avg:153.26ms step:278/1480 train_time:41069ms step_avg:153.24ms step:279/1480 train_time:41218ms step_avg:153.23ms step:280/1480 train_time:41366ms step_avg:153.21ms step:281/1480 train_time:41516ms step_avg:153.19ms step:282/1480 train_time:41663ms step_avg:153.17ms step:283/1480 train_time:41810ms step_avg:153.15ms step:284/1480 train_time:41959ms step_avg:153.14ms step:285/1480 train_time:42107ms step_avg:153.12ms step:286/1480 train_time:42256ms step_avg:153.10ms step:287/1480 train_time:42404ms step_avg:153.08ms step:288/1480 train_time:42553ms step_avg:153.07ms step:289/1480 train_time:42702ms step_avg:153.06ms step:290/1480 train_time:42850ms step_avg:153.03ms step:291/1480 train_time:42999ms step_avg:153.02ms step:292/1480 train_time:43146ms step_avg:153.00ms step:293/1480 train_time:43295ms step_avg:152.99ms step:294/1480 train_time:43443ms step_avg:152.97ms step:295/1480 train_time:43592ms step_avg:152.96ms step:296/1480 train_time:43741ms step_avg:152.94ms step:297/1480 train_time:43889ms step_avg:152.92ms step:298/1480 train_time:44037ms step_avg:152.91ms step:299/1480 train_time:44185ms step_avg:152.89ms step:300/1480 train_time:44334ms step_avg:152.87ms step:301/1480 train_time:44482ms 
step_avg:152.86ms step:302/1480 train_time:44630ms step_avg:152.84ms step:303/1480 train_time:44779ms step_avg:152.83ms step:304/1480 train_time:44927ms step_avg:152.81ms step:305/1480 train_time:45076ms step_avg:152.80ms step:306/1480 train_time:45224ms step_avg:152.78ms step:307/1480 train_time:45372ms step_avg:152.77ms step:308/1480 train_time:45520ms step_avg:152.75ms step:309/1480 train_time:45669ms step_avg:152.74ms step:310/1480 train_time:45818ms step_avg:152.73ms step:311/1480 train_time:45966ms step_avg:152.71ms step:312/1480 train_time:46114ms step_avg:152.70ms step:313/1480 train_time:46262ms step_avg:152.68ms step:314/1480 train_time:46410ms step_avg:152.67ms step:315/1480 train_time:46560ms step_avg:152.65ms step:316/1480 train_time:46707ms step_avg:152.64ms step:317/1480 train_time:46856ms step_avg:152.62ms step:318/1480 train_time:47004ms step_avg:152.61ms step:319/1480 train_time:47152ms step_avg:152.60ms step:320/1480 train_time:47301ms step_avg:152.58ms step:321/1480 train_time:47449ms step_avg:152.57ms step:322/1480 train_time:47599ms step_avg:152.56ms step:323/1480 train_time:47746ms step_avg:152.54ms step:324/1480 train_time:47896ms step_avg:152.53ms step:325/1480 train_time:48044ms step_avg:152.52ms step:326/1480 train_time:48193ms step_avg:152.51ms step:327/1480 train_time:48341ms step_avg:152.50ms step:328/1480 train_time:48490ms step_avg:152.48ms step:329/1480 train_time:48639ms step_avg:152.47ms step:330/1480 train_time:48789ms step_avg:152.47ms step:331/1480 train_time:48939ms step_avg:152.46ms step:332/1480 train_time:49091ms step_avg:152.46ms step:333/1480 train_time:49242ms step_avg:152.45ms step:334/1480 train_time:49393ms step_avg:152.45ms step:335/1480 train_time:49544ms step_avg:152.44ms step:336/1480 train_time:49696ms step_avg:152.44ms step:337/1480 train_time:49846ms step_avg:152.43ms step:338/1480 train_time:49997ms step_avg:152.43ms step:339/1480 train_time:50148ms step_avg:152.42ms step:340/1480 train_time:50299ms 
step_avg:152.42ms step:341/1480 train_time:50449ms step_avg:152.41ms step:342/1480 train_time:50601ms step_avg:152.41ms step:343/1480 train_time:50752ms step_avg:152.41ms step:344/1480 train_time:50903ms step_avg:152.40ms step:345/1480 train_time:51054ms step_avg:152.40ms step:346/1480 train_time:51205ms step_avg:152.39ms step:347/1480 train_time:51356ms step_avg:152.39ms step:348/1480 train_time:51507ms step_avg:152.39ms step:349/1480 train_time:51658ms step_avg:152.38ms step:350/1480 train_time:51808ms step_avg:152.38ms step:351/1480 train_time:51960ms step_avg:152.37ms step:352/1480 train_time:52110ms step_avg:152.37ms step:353/1480 train_time:52262ms step_avg:152.37ms step:354/1480 train_time:52412ms step_avg:152.36ms step:355/1480 train_time:52563ms step_avg:152.36ms step:356/1480 train_time:52713ms step_avg:152.35ms step:357/1480 train_time:52864ms step_avg:152.34ms step:358/1480 train_time:53014ms step_avg:152.34ms step:359/1480 train_time:53165ms step_avg:152.33ms step:360/1480 train_time:53316ms step_avg:152.33ms step:361/1480 train_time:53466ms step_avg:152.33ms step:362/1480 train_time:53618ms step_avg:152.32ms step:363/1480 train_time:53767ms step_avg:152.32ms step:364/1480 train_time:53919ms step_avg:152.31ms step:365/1480 train_time:54070ms step_avg:152.31ms step:366/1480 train_time:54221ms step_avg:152.31ms step:367/1480 train_time:54372ms step_avg:152.30ms step:368/1480 train_time:54522ms step_avg:152.30ms step:369/1480 train_time:54673ms step_avg:152.29ms step:370/1480 train_time:54824ms step_avg:152.29ms step:371/1480 train_time:54975ms step_avg:152.29ms step:372/1480 train_time:55126ms step_avg:152.28ms step:373/1480 train_time:55278ms step_avg:152.28ms step:374/1480 train_time:55428ms step_avg:152.27ms step:375/1480 train_time:55580ms step_avg:152.27ms step:375/1480 val_loss:3.8025 train_time:55647ms step_avg:152.46ms step:376/1480 train_time:55742ms step_avg:152.30ms step:377/1480 train_time:55889ms step_avg:152.29ms step:378/1480 
train_time:56040ms step_avg:152.28ms step:379/1480 train_time:56219ms step_avg:152.35ms step:380/1480 train_time:56340ms step_avg:152.27ms step:381/1480 train_time:56491ms step_avg:152.27ms step:382/1480 train_time:56640ms step_avg:152.26ms step:383/1480 train_time:56793ms step_avg:152.26ms step:384/1480 train_time:56943ms step_avg:152.25ms step:385/1480 train_time:57095ms step_avg:152.25ms step:386/1480 train_time:57246ms step_avg:152.25ms step:387/1480 train_time:57396ms step_avg:152.24ms step:388/1480 train_time:57547ms step_avg:152.24ms step:389/1480 train_time:57698ms step_avg:152.24ms step:390/1480 train_time:57848ms step_avg:152.23ms step:391/1480 train_time:57999ms step_avg:152.23ms step:392/1480 train_time:58150ms step_avg:152.22ms step:393/1480 train_time:58300ms step_avg:152.22ms step:394/1480 train_time:58451ms step_avg:152.22ms step:395/1480 train_time:58600ms step_avg:152.21ms step:396/1480 train_time:58752ms step_avg:152.21ms step:397/1480 train_time:58902ms step_avg:152.20ms step:398/1480 train_time:59054ms step_avg:152.20ms step:399/1480 train_time:59203ms step_avg:152.19ms step:400/1480 train_time:59356ms step_avg:152.19ms step:401/1480 train_time:59505ms step_avg:152.19ms step:402/1480 train_time:59656ms step_avg:152.18ms step:403/1480 train_time:59807ms step_avg:152.18ms step:404/1480 train_time:59957ms step_avg:152.18ms step:405/1480 train_time:60108ms step_avg:152.17ms step:406/1480 train_time:60259ms step_avg:152.17ms step:407/1480 train_time:60410ms step_avg:152.17ms step:408/1480 train_time:60560ms step_avg:152.16ms step:409/1480 train_time:60712ms step_avg:152.16ms step:410/1480 train_time:60861ms step_avg:152.15ms step:411/1480 train_time:61013ms step_avg:152.15ms step:412/1480 train_time:61164ms step_avg:152.15ms step:413/1480 train_time:61315ms step_avg:152.15ms step:414/1480 train_time:61466ms step_avg:152.14ms step:415/1480 train_time:61617ms step_avg:152.14ms step:416/1480 train_time:61768ms step_avg:152.14ms step:417/1480 
train_time:61919ms step_avg:152.13ms step:418/1480 train_time:62071ms step_avg:152.13ms step:419/1480 train_time:62221ms step_avg:152.13ms step:420/1480 train_time:62372ms step_avg:152.13ms step:421/1480 train_time:62523ms step_avg:152.12ms step:422/1480 train_time:62674ms step_avg:152.12ms step:423/1480 train_time:62824ms step_avg:152.12ms step:424/1480 train_time:62975ms step_avg:152.11ms step:425/1480 train_time:63126ms step_avg:152.11ms step:426/1480 train_time:63276ms step_avg:152.11ms step:427/1480 train_time:63427ms step_avg:152.10ms step:428/1480 train_time:63578ms step_avg:152.10ms step:429/1480 train_time:63729ms step_avg:152.10ms step:430/1480 train_time:63879ms step_avg:152.09ms step:431/1480 train_time:64031ms step_avg:152.09ms step:432/1480 train_time:64181ms step_avg:152.09ms step:433/1480 train_time:64332ms step_avg:152.08ms step:434/1480 train_time:64483ms step_avg:152.08ms step:435/1480 train_time:64634ms step_avg:152.08ms step:436/1480 train_time:64785ms step_avg:152.08ms step:437/1480 train_time:64937ms step_avg:152.08ms step:438/1480 train_time:65088ms step_avg:152.07ms step:439/1480 train_time:65239ms step_avg:152.07ms step:440/1480 train_time:65391ms step_avg:152.07ms step:441/1480 train_time:65542ms step_avg:152.07ms step:442/1480 train_time:65695ms step_avg:152.07ms step:443/1480 train_time:65847ms step_avg:152.07ms step:444/1480 train_time:66000ms step_avg:152.07ms step:445/1480 train_time:66154ms step_avg:152.08ms step:446/1480 train_time:66306ms step_avg:152.08ms step:447/1480 train_time:66459ms step_avg:152.08ms step:448/1480 train_time:66612ms step_avg:152.08ms step:449/1480 train_time:66765ms step_avg:152.08ms step:450/1480 train_time:66918ms step_avg:152.09ms step:451/1480 train_time:67072ms step_avg:152.09ms step:452/1480 train_time:67224ms step_avg:152.09ms step:453/1480 train_time:67376ms step_avg:152.09ms step:454/1480 train_time:67529ms step_avg:152.09ms step:455/1480 train_time:67681ms step_avg:152.09ms step:456/1480 
train_time:67833ms step_avg:152.09ms step:457/1480 train_time:67987ms step_avg:152.10ms step:458/1480 train_time:68140ms step_avg:152.10ms step:459/1480 train_time:68293ms step_avg:152.10ms step:460/1480 train_time:68446ms step_avg:152.10ms step:461/1480 train_time:68599ms step_avg:152.10ms step:462/1480 train_time:68752ms step_avg:152.11ms step:463/1480 train_time:68904ms step_avg:152.11ms step:464/1480 train_time:69057ms step_avg:152.11ms step:465/1480 train_time:69209ms step_avg:152.11ms step:466/1480 train_time:69361ms step_avg:152.11ms step:467/1480 train_time:69515ms step_avg:152.11ms step:468/1480 train_time:69668ms step_avg:152.11ms step:469/1480 train_time:69820ms step_avg:152.11ms step:470/1480 train_time:69974ms step_avg:152.12ms step:471/1480 train_time:70126ms step_avg:152.12ms step:472/1480 train_time:70279ms step_avg:152.12ms step:473/1480 train_time:70431ms step_avg:152.12ms step:474/1480 train_time:70583ms step_avg:152.12ms step:475/1480 train_time:70736ms step_avg:152.12ms step:476/1480 train_time:70890ms step_avg:152.12ms step:477/1480 train_time:71044ms step_avg:152.13ms step:478/1480 train_time:71197ms step_avg:152.13ms step:479/1480 train_time:71350ms step_avg:152.13ms step:480/1480 train_time:71502ms step_avg:152.13ms step:481/1480 train_time:71655ms step_avg:152.13ms step:482/1480 train_time:71807ms step_avg:152.13ms step:483/1480 train_time:71961ms step_avg:152.14ms step:484/1480 train_time:72115ms step_avg:152.14ms step:485/1480 train_time:72267ms step_avg:152.14ms step:486/1480 train_time:72421ms step_avg:152.14ms step:487/1480 train_time:72574ms step_avg:152.15ms step:488/1480 train_time:72726ms step_avg:152.15ms step:489/1480 train_time:72878ms step_avg:152.15ms step:490/1480 train_time:73031ms step_avg:152.15ms step:491/1480 train_time:73184ms step_avg:152.15ms step:492/1480 train_time:73337ms step_avg:152.15ms step:493/1480 train_time:73491ms step_avg:152.16ms step:494/1480 train_time:73644ms step_avg:152.16ms step:495/1480 
train_time:73797ms step_avg:152.16ms step:496/1480 train_time:73951ms step_avg:152.16ms step:497/1480 train_time:74103ms step_avg:152.16ms step:498/1480 train_time:74257ms step_avg:152.17ms step:499/1480 train_time:74409ms step_avg:152.17ms step:500/1480 train_time:74562ms step_avg:152.17ms step:500/1480 val_loss:3.6854 train_time:74631ms step_avg:152.31ms step:501/1480 train_time:74723ms step_avg:152.19ms step:502/1480 train_time:74874ms step_avg:152.18ms step:503/1480 train_time:75027ms step_avg:152.18ms step:504/1480 train_time:75179ms step_avg:152.18ms step:505/1480 train_time:75332ms step_avg:152.19ms step:506/1480 train_time:75483ms step_avg:152.18ms step:507/1480 train_time:75637ms step_avg:152.19ms step:508/1480 train_time:75790ms step_avg:152.19ms step:509/1480 train_time:75944ms step_avg:152.19ms step:510/1480 train_time:76097ms step_avg:152.19ms step:511/1480 train_time:76250ms step_avg:152.20ms step:512/1480 train_time:76403ms step_avg:152.20ms step:513/1480 train_time:76556ms step_avg:152.20ms step:514/1480 train_time:76709ms step_avg:152.20ms step:515/1480 train_time:76863ms step_avg:152.20ms step:516/1480 train_time:77017ms step_avg:152.21ms step:517/1480 train_time:77170ms step_avg:152.21ms step:518/1480 train_time:77323ms step_avg:152.21ms step:519/1480 train_time:77477ms step_avg:152.21ms step:520/1480 train_time:77629ms step_avg:152.21ms step:521/1480 train_time:77783ms step_avg:152.22ms step:522/1480 train_time:77936ms step_avg:152.22ms step:523/1480 train_time:78089ms step_avg:152.22ms step:524/1480 train_time:78242ms step_avg:152.22ms step:525/1480 train_time:78395ms step_avg:152.22ms step:526/1480 train_time:78547ms step_avg:152.22ms step:527/1480 train_time:78700ms step_avg:152.22ms step:528/1480 train_time:78853ms step_avg:152.23ms step:529/1480 train_time:79007ms step_avg:152.23ms step:530/1480 train_time:79159ms step_avg:152.23ms step:531/1480 train_time:79312ms step_avg:152.23ms step:532/1480 train_time:79463ms step_avg:152.23ms 
step:533/1480 train_time:79616ms step_avg:152.23ms step:534/1480 train_time:79770ms step_avg:152.23ms step:535/1480 train_time:79923ms step_avg:152.23ms step:536/1480 train_time:80077ms step_avg:152.24ms step:537/1480 train_time:80229ms step_avg:152.24ms step:538/1480 train_time:80383ms step_avg:152.24ms step:539/1480 train_time:80537ms step_avg:152.24ms step:540/1480 train_time:80689ms step_avg:152.24ms step:541/1480 train_time:80842ms step_avg:152.24ms step:542/1480 train_time:80995ms step_avg:152.25ms step:543/1480 train_time:81149ms step_avg:152.25ms step:544/1480 train_time:81301ms step_avg:152.25ms step:545/1480 train_time:81454ms step_avg:152.25ms step:546/1480 train_time:81607ms step_avg:152.25ms step:547/1480 train_time:81759ms step_avg:152.25ms step:548/1480 train_time:81912ms step_avg:152.25ms step:549/1480 train_time:82064ms step_avg:152.25ms step:550/1480 train_time:82218ms step_avg:152.26ms step:551/1480 train_time:82373ms step_avg:152.26ms step:552/1480 train_time:82528ms step_avg:152.26ms step:553/1480 train_time:82683ms step_avg:152.27ms step:554/1480 train_time:82837ms step_avg:152.27ms step:555/1480 train_time:82991ms step_avg:152.28ms step:556/1480 train_time:83146ms step_avg:152.28ms step:557/1480 train_time:83300ms step_avg:152.29ms step:558/1480 train_time:83455ms step_avg:152.29ms step:559/1480 train_time:83609ms step_avg:152.29ms step:560/1480 train_time:83763ms step_avg:152.30ms step:561/1480 train_time:83917ms step_avg:152.30ms step:562/1480 train_time:84072ms step_avg:152.30ms step:563/1480 train_time:84226ms step_avg:152.31ms step:564/1480 train_time:84380ms step_avg:152.31ms step:565/1480 train_time:84534ms step_avg:152.31ms step:566/1480 train_time:84689ms step_avg:152.32ms step:567/1480 train_time:84845ms step_avg:152.32ms step:568/1480 train_time:84998ms step_avg:152.33ms step:569/1480 train_time:85180ms step_avg:152.38ms step:570/1480 train_time:85307ms step_avg:152.33ms step:571/1480 train_time:85462ms step_avg:152.34ms 
step:572/1480 train_time:85617ms step_avg:152.34ms step:573/1480 train_time:85772ms step_avg:152.35ms step:574/1480 train_time:85927ms step_avg:152.35ms step:575/1480 train_time:86082ms step_avg:152.36ms step:576/1480 train_time:86237ms step_avg:152.36ms step:577/1480 train_time:86390ms step_avg:152.36ms step:578/1480 train_time:86546ms step_avg:152.37ms step:579/1480 train_time:86700ms step_avg:152.37ms step:580/1480 train_time:86854ms step_avg:152.38ms step:581/1480 train_time:87009ms step_avg:152.38ms step:582/1480 train_time:87165ms step_avg:152.39ms step:583/1480 train_time:87319ms step_avg:152.39ms step:584/1480 train_time:87474ms step_avg:152.39ms step:585/1480 train_time:87628ms step_avg:152.40ms step:586/1480 train_time:87783ms step_avg:152.40ms step:587/1480 train_time:87938ms step_avg:152.41ms step:588/1480 train_time:88092ms step_avg:152.41ms step:589/1480 train_time:88246ms step_avg:152.41ms step:590/1480 train_time:88401ms step_avg:152.42ms step:591/1480 train_time:88556ms step_avg:152.42ms step:592/1480 train_time:88711ms step_avg:152.42ms step:593/1480 train_time:88866ms step_avg:152.43ms step:594/1480 train_time:89020ms step_avg:152.43ms step:595/1480 train_time:89176ms step_avg:152.44ms step:596/1480 train_time:89332ms step_avg:152.44ms step:597/1480 train_time:89487ms step_avg:152.45ms step:598/1480 train_time:89641ms step_avg:152.45ms step:599/1480 train_time:89796ms step_avg:152.45ms step:600/1480 train_time:89951ms step_avg:152.46ms step:601/1480 train_time:90106ms step_avg:152.46ms step:602/1480 train_time:90261ms step_avg:152.47ms step:603/1480 train_time:90415ms step_avg:152.47ms step:604/1480 train_time:90572ms step_avg:152.48ms step:605/1480 train_time:90727ms step_avg:152.48ms step:606/1480 train_time:90882ms step_avg:152.49ms step:607/1480 train_time:91039ms step_avg:152.49ms step:608/1480 train_time:91194ms step_avg:152.50ms step:609/1480 train_time:91350ms step_avg:152.50ms step:610/1480 train_time:91504ms step_avg:152.51ms 
step:611/1480 train_time:91659ms step_avg:152.51ms step:612/1480 train_time:91813ms step_avg:152.51ms step:613/1480 train_time:91971ms step_avg:152.52ms step:614/1480 train_time:92126ms step_avg:152.53ms step:615/1480 train_time:92281ms step_avg:152.53ms step:616/1480 train_time:92435ms step_avg:152.53ms step:617/1480 train_time:92589ms step_avg:152.54ms step:618/1480 train_time:92744ms step_avg:152.54ms step:619/1480 train_time:92899ms step_avg:152.54ms step:620/1480 train_time:93054ms step_avg:152.55ms step:621/1480 train_time:93210ms step_avg:152.55ms step:622/1480 train_time:93366ms step_avg:152.56ms step:623/1480 train_time:93522ms step_avg:152.56ms step:624/1480 train_time:93677ms step_avg:152.57ms step:625/1480 train_time:93831ms step_avg:152.57ms step:625/1480 val_loss:3.6038 train_time:93902ms step_avg:152.69ms step:626/1480 train_time:93997ms step_avg:152.59ms step:627/1480 train_time:94146ms step_avg:152.59ms step:628/1480 train_time:94301ms step_avg:152.59ms step:629/1480 train_time:94455ms step_avg:152.59ms step:630/1480 train_time:94609ms step_avg:152.59ms step:631/1480 train_time:94762ms step_avg:152.60ms step:632/1480 train_time:94917ms step_avg:152.60ms step:633/1480 train_time:95072ms step_avg:152.60ms step:634/1480 train_time:95229ms step_avg:152.61ms step:635/1480 train_time:95383ms step_avg:152.61ms step:636/1480 train_time:95537ms step_avg:152.61ms step:637/1480 train_time:95692ms step_avg:152.62ms step:638/1480 train_time:95847ms step_avg:152.62ms step:639/1480 train_time:96001ms step_avg:152.63ms step:640/1480 train_time:96155ms step_avg:152.63ms step:641/1480 train_time:96310ms step_avg:152.63ms step:642/1480 train_time:96463ms step_avg:152.63ms step:643/1480 train_time:96618ms step_avg:152.63ms step:644/1480 train_time:96773ms step_avg:152.64ms step:645/1480 train_time:96929ms step_avg:152.64ms step:646/1480 train_time:97083ms step_avg:152.65ms step:647/1480 train_time:97239ms step_avg:152.65ms step:648/1480 train_time:97396ms 
step_avg:152.66ms step:649/1480 train_time:97551ms step_avg:152.66ms step:650/1480 train_time:97706ms step_avg:152.67ms step:651/1480 train_time:97860ms step_avg:152.67ms step:652/1480 train_time:98016ms step_avg:152.67ms step:653/1480 train_time:98170ms step_avg:152.67ms step:654/1480 train_time:98325ms step_avg:152.68ms step:655/1480 train_time:98479ms step_avg:152.68ms step:656/1480 train_time:98633ms step_avg:152.68ms step:657/1480 train_time:98788ms step_avg:152.69ms step:658/1480 train_time:98944ms step_avg:152.69ms step:659/1480 train_time:99099ms step_avg:152.70ms step:660/1480 train_time:99256ms step_avg:152.70ms step:661/1480 train_time:99411ms step_avg:152.71ms step:662/1480 train_time:99568ms step_avg:152.71ms step:663/1480 train_time:99724ms step_avg:152.72ms step:664/1480 train_time:99880ms step_avg:152.72ms step:665/1480 train_time:100036ms step_avg:152.73ms step:666/1480 train_time:100193ms step_avg:152.73ms step:667/1480 train_time:100350ms step_avg:152.74ms step:668/1480 train_time:100507ms step_avg:152.75ms step:669/1480 train_time:100663ms step_avg:152.75ms step:670/1480 train_time:100819ms step_avg:152.76ms step:671/1480 train_time:100975ms step_avg:152.76ms step:672/1480 train_time:101130ms step_avg:152.76ms step:673/1480 train_time:101285ms step_avg:152.77ms step:674/1480 train_time:101443ms step_avg:152.78ms step:675/1480 train_time:101600ms step_avg:152.78ms step:676/1480 train_time:101758ms step_avg:152.79ms step:677/1480 train_time:101914ms step_avg:152.79ms step:678/1480 train_time:102070ms step_avg:152.80ms step:679/1480 train_time:102227ms step_avg:152.81ms step:680/1480 train_time:102383ms step_avg:152.81ms step:681/1480 train_time:102540ms step_avg:152.82ms step:682/1480 train_time:102698ms step_avg:152.82ms step:683/1480 train_time:102855ms step_avg:152.83ms step:684/1480 train_time:103012ms step_avg:152.84ms step:685/1480 train_time:103168ms step_avg:152.84ms step:686/1480 train_time:103325ms step_avg:152.85ms step:687/1480 
train_time:103481ms step_avg:152.85ms step:688/1480 train_time:103639ms step_avg:152.86ms step:689/1480 train_time:103798ms step_avg:152.87ms step:690/1480 train_time:103956ms step_avg:152.88ms step:691/1480 train_time:104113ms step_avg:152.88ms step:692/1480 train_time:104270ms step_avg:152.89ms step:693/1480 train_time:104426ms step_avg:152.89ms step:694/1480 train_time:104582ms step_avg:152.90ms step:695/1480 train_time:104739ms step_avg:152.90ms step:696/1480 train_time:104895ms step_avg:152.91ms step:697/1480 train_time:105050ms step_avg:152.91ms step:698/1480 train_time:105205ms step_avg:152.91ms step:699/1480 train_time:105362ms step_avg:152.92ms step:700/1480 train_time:105521ms step_avg:152.93ms step:701/1480 train_time:105676ms step_avg:152.93ms step:702/1480 train_time:105833ms step_avg:152.94ms step:703/1480 train_time:105988ms step_avg:152.94ms step:704/1480 train_time:106145ms step_avg:152.95ms step:705/1480 train_time:106300ms step_avg:152.95ms step:706/1480 train_time:106460ms step_avg:152.96ms step:707/1480 train_time:106618ms step_avg:152.97ms step:708/1480 train_time:106774ms step_avg:152.97ms step:709/1480 train_time:106930ms step_avg:152.98ms step:710/1480 train_time:107085ms step_avg:152.98ms step:711/1480 train_time:107242ms step_avg:152.98ms step:712/1480 train_time:107400ms step_avg:152.99ms step:713/1480 train_time:107556ms step_avg:153.00ms step:714/1480 train_time:107712ms step_avg:153.00ms step:715/1480 train_time:107869ms step_avg:153.01ms step:716/1480 train_time:108024ms step_avg:153.01ms step:717/1480 train_time:108180ms step_avg:153.01ms step:718/1480 train_time:108336ms step_avg:153.02ms step:719/1480 train_time:108491ms step_avg:153.02ms step:720/1480 train_time:108649ms step_avg:153.03ms step:721/1480 train_time:108805ms step_avg:153.03ms step:722/1480 train_time:108962ms step_avg:153.04ms step:723/1480 train_time:109119ms step_avg:153.04ms step:724/1480 train_time:109276ms step_avg:153.05ms step:725/1480 train_time:109433ms 
step_avg:153.05ms step:726/1480 train_time:109590ms step_avg:153.06ms step:727/1480 train_time:109747ms step_avg:153.06ms step:728/1480 train_time:109904ms step_avg:153.07ms step:729/1480 train_time:110060ms step_avg:153.07ms step:730/1480 train_time:110217ms step_avg:153.08ms step:731/1480 train_time:110374ms step_avg:153.09ms step:732/1480 train_time:110530ms step_avg:153.09ms step:733/1480 train_time:110686ms step_avg:153.09ms step:734/1480 train_time:110845ms step_avg:153.10ms step:735/1480 train_time:111002ms step_avg:153.11ms step:736/1480 train_time:111157ms step_avg:153.11ms step:737/1480 train_time:111313ms step_avg:153.11ms step:738/1480 train_time:111468ms step_avg:153.12ms step:739/1480 train_time:111625ms step_avg:153.12ms step:740/1480 train_time:111783ms step_avg:153.13ms step:741/1480 train_time:111942ms step_avg:153.14ms step:742/1480 train_time:112098ms step_avg:153.14ms step:743/1480 train_time:112254ms step_avg:153.14ms step:744/1480 train_time:112410ms step_avg:153.15ms step:745/1480 train_time:112567ms step_avg:153.15ms step:746/1480 train_time:112723ms step_avg:153.16ms step:747/1480 train_time:112879ms step_avg:153.16ms step:748/1480 train_time:113039ms step_avg:153.17ms step:749/1480 train_time:113197ms step_avg:153.18ms step:750/1480 train_time:113352ms step_avg:153.18ms step:750/1480 val_loss:3.5489 train_time:113425ms step_avg:153.28ms step:751/1480 train_time:113516ms step_avg:153.19ms step:752/1480 train_time:113669ms step_avg:153.19ms step:753/1480 train_time:113824ms step_avg:153.20ms step:754/1480 train_time:113981ms step_avg:153.20ms step:755/1480 train_time:114138ms step_avg:153.20ms step:756/1480 train_time:114294ms step_avg:153.21ms step:757/1480 train_time:114451ms step_avg:153.21ms step:758/1480 train_time:114606ms step_avg:153.22ms step:759/1480 train_time:114792ms step_avg:153.26ms step:760/1480 train_time:114922ms step_avg:153.23ms step:761/1480 train_time:115078ms step_avg:153.23ms step:762/1480 train_time:115236ms 
step_avg:153.24ms step:763/1480 train_time:115394ms step_avg:153.25ms step:764/1480 train_time:115550ms step_avg:153.25ms step:765/1480 train_time:115706ms step_avg:153.25ms step:766/1480 train_time:115864ms step_avg:153.26ms step:767/1480 train_time:116019ms step_avg:153.26ms step:768/1480 train_time:116174ms step_avg:153.26ms step:769/1480 train_time:116331ms step_avg:153.27ms step:770/1480 train_time:116488ms step_avg:153.27ms step:771/1480 train_time:116645ms step_avg:153.28ms step:772/1480 train_time:116802ms step_avg:153.28ms step:773/1480 train_time:116960ms step_avg:153.29ms step:774/1480 train_time:117117ms step_avg:153.30ms step:775/1480 train_time:117275ms step_avg:153.30ms step:776/1480 train_time:117434ms step_avg:153.31ms step:777/1480 train_time:117594ms step_avg:153.32ms step:778/1480 train_time:117753ms step_avg:153.32ms step:779/1480 train_time:117909ms step_avg:153.33ms step:780/1480 train_time:118069ms step_avg:153.34ms step:781/1480 train_time:118226ms step_avg:153.34ms step:782/1480 train_time:118384ms step_avg:153.35ms step:783/1480 train_time:118540ms step_avg:153.35ms step:784/1480 train_time:118700ms step_avg:153.36ms step:785/1480 train_time:118859ms step_avg:153.37ms step:786/1480 train_time:119016ms step_avg:153.37ms step:787/1480 train_time:119175ms step_avg:153.38ms step:788/1480 train_time:119333ms step_avg:153.38ms step:789/1480 train_time:119489ms step_avg:153.39ms step:790/1480 train_time:119647ms step_avg:153.39ms step:791/1480 train_time:119808ms step_avg:153.40ms step:792/1480 train_time:119965ms step_avg:153.41ms step:793/1480 train_time:120122ms step_avg:153.41ms step:794/1480 train_time:120280ms step_avg:153.42ms step:795/1480 train_time:120440ms step_avg:153.43ms step:796/1480 train_time:120600ms step_avg:153.44ms step:797/1480 train_time:120760ms step_avg:153.44ms step:798/1480 train_time:120919ms step_avg:153.45ms step:799/1480 train_time:121080ms step_avg:153.46ms step:800/1480 train_time:121238ms step_avg:153.47ms 
step:801/1480 train_time:121397ms step_avg:153.47ms step:802/1480 train_time:121556ms step_avg:153.48ms step:803/1480 train_time:121714ms step_avg:153.49ms step:804/1480 train_time:121870ms step_avg:153.49ms step:805/1480 train_time:122030ms step_avg:153.50ms step:806/1480 train_time:122187ms step_avg:153.50ms step:807/1480 train_time:122343ms step_avg:153.50ms step:808/1480 train_time:122502ms step_avg:153.51ms step:809/1480 train_time:122658ms step_avg:153.51ms step:810/1480 train_time:122815ms step_avg:153.52ms step:811/1480 train_time:122974ms step_avg:153.53ms step:812/1480 train_time:123131ms step_avg:153.53ms step:813/1480 train_time:123288ms step_avg:153.53ms step:814/1480 train_time:123444ms step_avg:153.54ms step:815/1480 train_time:123602ms step_avg:153.54ms step:816/1480 train_time:123761ms step_avg:153.55ms step:817/1480 train_time:123918ms step_avg:153.55ms step:818/1480 train_time:124075ms step_avg:153.56ms step:819/1480 train_time:124234ms step_avg:153.57ms step:820/1480 train_time:124392ms step_avg:153.57ms step:821/1480 train_time:124549ms step_avg:153.57ms step:822/1480 train_time:124706ms step_avg:153.58ms step:823/1480 train_time:124865ms step_avg:153.59ms step:824/1480 train_time:125021ms step_avg:153.59ms step:825/1480 train_time:125180ms step_avg:153.60ms step:826/1480 train_time:125340ms step_avg:153.60ms step:827/1480 train_time:125499ms step_avg:153.61ms step:828/1480 train_time:125657ms step_avg:153.62ms step:829/1480 train_time:125816ms step_avg:153.62ms step:830/1480 train_time:125976ms step_avg:153.63ms step:831/1480 train_time:126134ms step_avg:153.63ms step:832/1480 train_time:126294ms step_avg:153.64ms step:833/1480 train_time:126451ms step_avg:153.65ms step:834/1480 train_time:126612ms step_avg:153.66ms step:835/1480 train_time:126770ms step_avg:153.66ms step:836/1480 train_time:126928ms step_avg:153.67ms step:837/1480 train_time:127086ms step_avg:153.67ms step:838/1480 train_time:127242ms step_avg:153.67ms step:839/1480 
train_time:127400ms step_avg:153.68ms step:840/1480 train_time:127558ms step_avg:153.68ms step:841/1480 train_time:127715ms step_avg:153.69ms step:842/1480 train_time:127872ms step_avg:153.69ms step:843/1480 train_time:128030ms step_avg:153.70ms step:844/1480 train_time:128187ms step_avg:153.70ms step:845/1480 train_time:128343ms step_avg:153.70ms step:846/1480 train_time:128503ms step_avg:153.71ms step:847/1480 train_time:128661ms step_avg:153.72ms step:848/1480 train_time:128818ms step_avg:153.72ms step:849/1480 train_time:128977ms step_avg:153.73ms step:850/1480 train_time:129135ms step_avg:153.73ms step:851/1480 train_time:129296ms step_avg:153.74ms step:852/1480 train_time:129455ms step_avg:153.75ms step:853/1480 train_time:129612ms step_avg:153.75ms step:854/1480 train_time:129770ms step_avg:153.76ms step:855/1480 train_time:129928ms step_avg:153.76ms step:856/1480 train_time:130085ms step_avg:153.76ms step:857/1480 train_time:130243ms step_avg:153.77ms step:858/1480 train_time:130402ms step_avg:153.78ms step:859/1480 train_time:130561ms step_avg:153.78ms step:860/1480 train_time:130719ms step_avg:153.79ms step:861/1480 train_time:130879ms step_avg:153.79ms step:862/1480 train_time:131041ms step_avg:153.80ms step:863/1480 train_time:131201ms step_avg:153.81ms step:864/1480 train_time:131361ms step_avg:153.82ms step:865/1480 train_time:131518ms step_avg:153.82ms step:866/1480 train_time:131676ms step_avg:153.83ms step:867/1480 train_time:131835ms step_avg:153.83ms step:868/1480 train_time:131991ms step_avg:153.84ms step:869/1480 train_time:132148ms step_avg:153.84ms step:870/1480 train_time:132306ms step_avg:153.84ms step:871/1480 train_time:132463ms step_avg:153.85ms step:872/1480 train_time:132621ms step_avg:153.85ms step:873/1480 train_time:132778ms step_avg:153.86ms step:874/1480 train_time:132938ms step_avg:153.86ms step:875/1480 train_time:133099ms step_avg:153.87ms step:875/1480 val_loss:3.5027 train_time:133171ms step_avg:153.96ms step:876/1480 
train_time:133266ms step_avg:153.89ms step:877/1480 train_time:133418ms step_avg:153.88ms step:878/1480 train_time:133575ms step_avg:153.89ms step:879/1480 train_time:133733ms step_avg:153.89ms step:880/1480 train_time:133891ms step_avg:153.90ms step:881/1480 train_time:134049ms step_avg:153.90ms step:882/1480 train_time:134210ms step_avg:153.91ms step:883/1480 train_time:134369ms step_avg:153.92ms step:884/1480 train_time:134530ms step_avg:153.92ms step:885/1480 train_time:134691ms step_avg:153.93ms step:886/1480 train_time:134852ms step_avg:153.94ms step:887/1480 train_time:135011ms step_avg:153.95ms step:888/1480 train_time:135176ms step_avg:153.96ms step:889/1480 train_time:135338ms step_avg:153.97ms step:890/1480 train_time:135494ms step_avg:153.97ms step:891/1480 train_time:135653ms step_avg:153.98ms step:892/1480 train_time:135813ms step_avg:153.98ms step:893/1480 train_time:135971ms step_avg:153.99ms step:894/1480 train_time:136132ms step_avg:154.00ms step:895/1480 train_time:136293ms step_avg:154.00ms step:896/1480 train_time:136453ms step_avg:154.01ms step:897/1480 train_time:136614ms step_avg:154.02ms step:898/1480 train_time:136774ms step_avg:154.03ms step:899/1480 train_time:136934ms step_avg:154.03ms step:900/1480 train_time:137092ms step_avg:154.04ms step:901/1480 train_time:137250ms step_avg:154.04ms step:902/1480 train_time:137408ms step_avg:154.05ms step:903/1480 train_time:137570ms step_avg:154.05ms step:904/1480 train_time:137731ms step_avg:154.06ms step:905/1480 train_time:137890ms step_avg:154.07ms step:906/1480 train_time:138050ms step_avg:154.07ms step:907/1480 train_time:138212ms step_avg:154.08ms step:908/1480 train_time:138370ms step_avg:154.09ms step:909/1480 train_time:138530ms step_avg:154.09ms step:910/1480 train_time:138693ms step_avg:154.10ms step:911/1480 train_time:138853ms step_avg:154.11ms step:912/1480 train_time:139013ms step_avg:154.12ms step:913/1480 train_time:139174ms step_avg:154.12ms step:914/1480 train_time:139335ms 
step_avg:154.13ms step:915/1480 train_time:139496ms step_avg:154.14ms step:916/1480 train_time:139656ms step_avg:154.15ms step:917/1480 train_time:139814ms step_avg:154.15ms step:918/1480 train_time:139976ms step_avg:154.16ms step:919/1480 train_time:140137ms step_avg:154.17ms step:920/1480 train_time:140295ms step_avg:154.17ms step:921/1480 train_time:140455ms step_avg:154.18ms step:922/1480 train_time:140617ms step_avg:154.18ms step:923/1480 train_time:140774ms step_avg:154.19ms step:924/1480 train_time:140933ms step_avg:154.19ms step:925/1480 train_time:141093ms step_avg:154.20ms step:926/1480 train_time:141251ms step_avg:154.20ms step:927/1480 train_time:141409ms step_avg:154.21ms step:928/1480 train_time:141570ms step_avg:154.22ms step:929/1480 train_time:141730ms step_avg:154.22ms step:930/1480 train_time:141890ms step_avg:154.23ms step:931/1480 train_time:142050ms step_avg:154.23ms step:932/1480 train_time:142209ms step_avg:154.24ms step:933/1480 train_time:142368ms step_avg:154.24ms step:934/1480 train_time:142529ms step_avg:154.25ms step:935/1480 train_time:142689ms step_avg:154.26ms step:936/1480 train_time:142849ms step_avg:154.26ms step:937/1480 train_time:143010ms step_avg:154.27ms step:938/1480 train_time:143168ms step_avg:154.28ms step:939/1480 train_time:143330ms step_avg:154.28ms step:940/1480 train_time:143492ms step_avg:154.29ms step:941/1480 train_time:143652ms step_avg:154.30ms step:942/1480 train_time:143811ms step_avg:154.30ms step:943/1480 train_time:143972ms step_avg:154.31ms step:944/1480 train_time:144136ms step_avg:154.32ms step:945/1480 train_time:144294ms step_avg:154.33ms step:946/1480 train_time:144457ms step_avg:154.33ms step:947/1480 train_time:144618ms step_avg:154.34ms step:948/1480 train_time:144776ms step_avg:154.35ms step:949/1480 train_time:144960ms step_avg:154.38ms step:950/1480 train_time:145094ms step_avg:154.35ms step:951/1480 train_time:145257ms step_avg:154.36ms step:952/1480 train_time:145415ms step_avg:154.37ms 
step:953/1480 train_time:145574ms step_avg:154.37ms step:954/1480 train_time:145737ms step_avg:154.38ms step:955/1480 train_time:145894ms step_avg:154.39ms step:956/1480 train_time:146054ms step_avg:154.39ms step:957/1480 train_time:146215ms step_avg:154.40ms step:958/1480 train_time:146378ms step_avg:154.41ms step:959/1480 train_time:146537ms step_avg:154.41ms step:960/1480 train_time:146697ms step_avg:154.42ms step:961/1480 train_time:146856ms step_avg:154.42ms step:962/1480 train_time:147016ms step_avg:154.43ms step:963/1480 train_time:147176ms step_avg:154.43ms step:964/1480 train_time:147337ms step_avg:154.44ms step:965/1480 train_time:147495ms step_avg:154.44ms step:966/1480 train_time:147654ms step_avg:154.45ms step:967/1480 train_time:147812ms step_avg:154.45ms step:968/1480 train_time:147971ms step_avg:154.46ms step:969/1480 train_time:148133ms step_avg:154.47ms step:970/1480 train_time:148291ms step_avg:154.47ms step:971/1480 train_time:148450ms step_avg:154.47ms step:972/1480 train_time:148609ms step_avg:154.48ms step:973/1480 train_time:148767ms step_avg:154.48ms step:974/1480 train_time:148928ms step_avg:154.49ms step:975/1480 train_time:149089ms step_avg:154.50ms step:976/1480 train_time:149250ms step_avg:154.50ms step:977/1480 train_time:149411ms step_avg:154.51ms step:978/1480 train_time:149570ms step_avg:154.51ms step:979/1480 train_time:149732ms step_avg:154.52ms step:980/1480 train_time:149892ms step_avg:154.53ms step:981/1480 train_time:150054ms step_avg:154.54ms step:982/1480 train_time:150213ms step_avg:154.54ms step:983/1480 train_time:150373ms step_avg:154.55ms step:984/1480 train_time:150533ms step_avg:154.55ms step:985/1480 train_time:150695ms step_avg:154.56ms step:986/1480 train_time:150855ms step_avg:154.57ms step:987/1480 train_time:151013ms step_avg:154.57ms step:988/1480 train_time:151171ms step_avg:154.57ms step:989/1480 train_time:151330ms step_avg:154.58ms step:990/1480 train_time:151493ms step_avg:154.58ms step:991/1480 
train_time:151654ms step_avg:154.59ms step:992/1480 train_time:151818ms step_avg:154.60ms step:993/1480 train_time:151986ms step_avg:154.61ms step:994/1480 train_time:152146ms step_avg:154.62ms step:995/1480 train_time:152306ms step_avg:154.63ms step:996/1480 train_time:152463ms step_avg:154.63ms step:997/1480 train_time:152621ms step_avg:154.63ms step:998/1480 train_time:152780ms step_avg:154.64ms step:999/1480 train_time:152938ms step_avg:154.64ms step:1000/1480 train_time:153098ms step_avg:154.64ms step:1000/1480 val_loss:3.4398 train_time:153173ms step_avg:154.72ms step:1001/1480 train_time:153264ms step_avg:154.66ms step:1002/1480 train_time:153427ms step_avg:154.66ms step:1003/1480 train_time:153591ms step_avg:154.67ms step:1004/1480 train_time:153753ms step_avg:154.68ms step:1005/1480 train_time:153912ms step_avg:154.69ms step:1006/1480 train_time:154074ms step_avg:154.69ms step:1007/1480 train_time:154233ms step_avg:154.70ms step:1008/1480 train_time:154393ms step_avg:154.70ms step:1009/1480 train_time:154557ms step_avg:154.71ms step:1010/1480 train_time:154717ms step_avg:154.72ms step:1011/1480 train_time:154876ms step_avg:154.72ms step:1012/1480 train_time:155033ms step_avg:154.72ms step:1013/1480 train_time:155194ms step_avg:154.73ms step:1014/1480 train_time:155356ms step_avg:154.74ms step:1015/1480 train_time:155518ms step_avg:154.74ms step:1016/1480 train_time:155679ms step_avg:154.75ms step:1017/1480 train_time:155840ms step_avg:154.76ms step:1018/1480 train_time:156000ms step_avg:154.76ms step:1019/1480 train_time:156160ms step_avg:154.77ms step:1020/1480 train_time:156321ms step_avg:154.77ms step:1021/1480 train_time:156480ms step_avg:154.78ms step:1022/1480 train_time:156640ms step_avg:154.78ms step:1023/1480 train_time:156802ms step_avg:154.79ms step:1024/1480 train_time:156963ms step_avg:154.80ms step:1025/1480 train_time:157127ms step_avg:154.81ms step:1026/1480 train_time:157288ms step_avg:154.81ms step:1027/1480 train_time:157449ms 
step_avg:154.82ms step:1028/1480 train_time:157613ms step_avg:154.83ms step:1029/1480 train_time:157776ms step_avg:154.83ms step:1030/1480 train_time:157936ms step_avg:154.84ms step:1031/1480 train_time:158096ms step_avg:154.84ms step:1032/1480 train_time:158260ms step_avg:154.85ms step:1033/1480 train_time:158419ms step_avg:154.86ms step:1034/1480 train_time:158579ms step_avg:154.86ms step:1035/1480 train_time:158739ms step_avg:154.87ms step:1036/1480 train_time:158899ms step_avg:154.87ms step:1037/1480 train_time:159059ms step_avg:154.88ms step:1038/1480 train_time:159217ms step_avg:154.88ms step:1039/1480 train_time:159379ms step_avg:154.89ms step:1040/1480 train_time:159539ms step_avg:154.89ms step:1041/1480 train_time:159700ms step_avg:154.90ms step:1042/1480 train_time:159858ms step_avg:154.90ms step:1043/1480 train_time:160017ms step_avg:154.91ms step:1044/1480 train_time:160176ms step_avg:154.91ms step:1045/1480 train_time:160337ms step_avg:154.91ms step:1046/1480 train_time:160496ms step_avg:154.92ms step:1047/1480 train_time:160656ms step_avg:154.92ms step:1048/1480 train_time:160816ms step_avg:154.93ms step:1049/1480 train_time:160976ms step_avg:154.93ms step:1050/1480 train_time:161136ms step_avg:154.94ms step:1051/1480 train_time:161298ms step_avg:154.94ms step:1052/1480 train_time:161458ms step_avg:154.95ms step:1053/1480 train_time:161617ms step_avg:154.95ms step:1054/1480 train_time:161778ms step_avg:154.96ms step:1055/1480 train_time:161936ms step_avg:154.96ms step:1056/1480 train_time:162095ms step_avg:154.97ms step:1057/1480 train_time:162254ms step_avg:154.97ms step:1058/1480 train_time:162416ms step_avg:154.98ms step:1059/1480 train_time:162578ms step_avg:154.98ms step:1060/1480 train_time:162739ms step_avg:154.99ms step:1061/1480 train_time:162897ms step_avg:154.99ms step:1062/1480 train_time:163055ms step_avg:155.00ms step:1063/1480 train_time:163215ms step_avg:155.00ms step:1064/1480 train_time:163372ms step_avg:155.00ms step:1065/1480 
train_time:163533ms step_avg:155.01ms step:1066/1480 train_time:163695ms step_avg:155.01ms step:1067/1480 train_time:163856ms step_avg:155.02ms step:1068/1480 train_time:164016ms step_avg:155.02ms step:1069/1480 train_time:164178ms step_avg:155.03ms step:1070/1480 train_time:164336ms step_avg:155.03ms step:1071/1480 train_time:164501ms step_avg:155.04ms step:1072/1480 train_time:164661ms step_avg:155.05ms step:1073/1480 train_time:164818ms step_avg:155.05ms step:1074/1480 train_time:164976ms step_avg:155.05ms step:1075/1480 train_time:165136ms step_avg:155.06ms step:1076/1480 train_time:165296ms step_avg:155.06ms step:1077/1480 train_time:165456ms step_avg:155.07ms step:1078/1480 train_time:165619ms step_avg:155.07ms step:1079/1480 train_time:165782ms step_avg:155.08ms step:1080/1480 train_time:165943ms step_avg:155.09ms step:1081/1480 train_time:166102ms step_avg:155.09ms step:1082/1480 train_time:166263ms step_avg:155.10ms step:1083/1480 train_time:166425ms step_avg:155.10ms step:1084/1480 train_time:166586ms step_avg:155.11ms step:1085/1480 train_time:166747ms step_avg:155.11ms step:1086/1480 train_time:166908ms step_avg:155.12ms step:1087/1480 train_time:167069ms step_avg:155.12ms step:1088/1480 train_time:167230ms step_avg:155.13ms step:1089/1480 train_time:167395ms step_avg:155.14ms step:1090/1480 train_time:167558ms step_avg:155.15ms step:1091/1480 train_time:167717ms step_avg:155.15ms step:1092/1480 train_time:167879ms step_avg:155.16ms step:1093/1480 train_time:168039ms step_avg:155.16ms step:1094/1480 train_time:168198ms step_avg:155.16ms step:1095/1480 train_time:168358ms step_avg:155.17ms step:1096/1480 train_time:168519ms step_avg:155.17ms step:1097/1480 train_time:168680ms step_avg:155.18ms step:1098/1480 train_time:168843ms step_avg:155.19ms step:1099/1480 train_time:169005ms step_avg:155.19ms step:1100/1480 train_time:169170ms step_avg:155.20ms step:1101/1480 train_time:169332ms step_avg:155.21ms step:1102/1480 train_time:169495ms step_avg:155.22ms 
step:1103/1480 train_time:169660ms step_avg:155.22ms step:1104/1480 train_time:169822ms step_avg:155.23ms step:1105/1480 train_time:169984ms step_avg:155.24ms step:1106/1480 train_time:170147ms step_avg:155.24ms step:1107/1480 train_time:170309ms step_avg:155.25ms step:1108/1480 train_time:170470ms step_avg:155.25ms step:1109/1480 train_time:170630ms step_avg:155.26ms step:1110/1480 train_time:170792ms step_avg:155.27ms step:1111/1480 train_time:170953ms step_avg:155.27ms step:1112/1480 train_time:171116ms step_avg:155.28ms step:1113/1480 train_time:171286ms step_avg:155.29ms step:1114/1480 train_time:171449ms step_avg:155.30ms step:1115/1480 train_time:171612ms step_avg:155.31ms step:1116/1480 train_time:171773ms step_avg:155.31ms step:1117/1480 train_time:171935ms step_avg:155.32ms step:1118/1480 train_time:172098ms step_avg:155.32ms step:1119/1480 train_time:172258ms step_avg:155.33ms step:1120/1480 train_time:172419ms step_avg:155.33ms step:1121/1480 train_time:172581ms step_avg:155.34ms step:1122/1480 train_time:172740ms step_avg:155.34ms step:1123/1480 train_time:172899ms step_avg:155.35ms step:1124/1480 train_time:173062ms step_avg:155.35ms step:1125/1480 train_time:173226ms step_avg:155.36ms step:1125/1480 val_loss:3.3847 train_time:173301ms step_avg:155.43ms step:1126/1480 train_time:173396ms step_avg:155.37ms step:1127/1480 train_time:173550ms step_avg:155.37ms step:1128/1480 train_time:173711ms step_avg:155.38ms step:1129/1480 train_time:173872ms step_avg:155.38ms step:1130/1480 train_time:174032ms step_avg:155.39ms step:1131/1480 train_time:174201ms step_avg:155.40ms step:1132/1480 train_time:174362ms step_avg:155.40ms step:1133/1480 train_time:174527ms step_avg:155.41ms step:1134/1480 train_time:174688ms step_avg:155.42ms step:1135/1480 train_time:174850ms step_avg:155.42ms step:1136/1480 train_time:175013ms step_avg:155.43ms step:1137/1480 train_time:175173ms step_avg:155.43ms step:1138/1480 train_time:175340ms step_avg:155.44ms step:1139/1480 
train_time:175524ms step_avg:155.47ms step:1140/1480 train_time:175663ms step_avg:155.45ms step:1141/1480 train_time:175830ms step_avg:155.46ms step:1142/1480 train_time:175991ms step_avg:155.47ms step:1143/1480 train_time:176155ms step_avg:155.48ms step:1144/1480 train_time:176317ms step_avg:155.48ms step:1145/1480 train_time:176476ms step_avg:155.49ms step:1146/1480 train_time:176642ms step_avg:155.49ms step:1147/1480 train_time:176803ms step_avg:155.50ms step:1148/1480 train_time:176963ms step_avg:155.50ms step:1149/1480 train_time:177127ms step_avg:155.51ms step:1150/1480 train_time:177286ms step_avg:155.51ms step:1151/1480 train_time:177450ms step_avg:155.52ms step:1152/1480 train_time:177613ms step_avg:155.53ms step:1153/1480 train_time:177778ms step_avg:155.54ms step:1154/1480 train_time:177939ms step_avg:155.54ms step:1155/1480 train_time:178101ms step_avg:155.55ms step:1156/1480 train_time:178266ms step_avg:155.56ms step:1157/1480 train_time:178429ms step_avg:155.56ms step:1158/1480 train_time:178588ms step_avg:155.56ms step:1159/1480 train_time:178748ms step_avg:155.57ms step:1160/1480 train_time:178908ms step_avg:155.57ms step:1161/1480 train_time:179069ms step_avg:155.58ms step:1162/1480 train_time:179230ms step_avg:155.58ms step:1163/1480 train_time:179390ms step_avg:155.59ms step:1164/1480 train_time:179552ms step_avg:155.59ms step:1165/1480 train_time:179712ms step_avg:155.59ms step:1166/1480 train_time:179873ms step_avg:155.60ms step:1167/1480 train_time:180032ms step_avg:155.60ms step:1168/1480 train_time:180198ms step_avg:155.61ms step:1169/1480 train_time:180360ms step_avg:155.62ms step:1170/1480 train_time:180522ms step_avg:155.62ms step:1171/1480 train_time:180683ms step_avg:155.63ms step:1172/1480 train_time:180844ms step_avg:155.63ms step:1173/1480 train_time:181006ms step_avg:155.64ms step:1174/1480 train_time:181175ms step_avg:155.65ms step:1175/1480 train_time:181339ms step_avg:155.66ms step:1176/1480 train_time:181503ms step_avg:155.66ms 
step:1177/1480 train_time:181669ms step_avg:155.67ms step:1178/1480 train_time:181828ms step_avg:155.67ms step:1179/1480 train_time:181987ms step_avg:155.68ms step:1180/1480 train_time:182156ms step_avg:155.69ms step:1181/1480 train_time:182318ms step_avg:155.69ms step:1182/1480 train_time:182479ms step_avg:155.70ms step:1183/1480 train_time:182642ms step_avg:155.70ms step:1184/1480 train_time:182804ms step_avg:155.71ms step:1185/1480 train_time:182968ms step_avg:155.72ms step:1186/1480 train_time:183130ms step_avg:155.72ms step:1187/1480 train_time:183303ms step_avg:155.74ms step:1188/1480 train_time:183463ms step_avg:155.74ms step:1189/1480 train_time:183625ms step_avg:155.75ms step:1190/1480 train_time:183786ms step_avg:155.75ms step:1191/1480 train_time:183949ms step_avg:155.76ms step:1192/1480 train_time:184109ms step_avg:155.76ms step:1193/1480 train_time:184269ms step_avg:155.76ms step:1194/1480 train_time:184430ms step_avg:155.77ms step:1195/1480 train_time:184593ms step_avg:155.77ms step:1196/1480 train_time:184761ms step_avg:155.79ms step:1197/1480 train_time:184924ms step_avg:155.79ms step:1198/1480 train_time:185090ms step_avg:155.80ms step:1199/1480 train_time:185251ms step_avg:155.80ms step:1200/1480 train_time:185410ms step_avg:155.81ms step:1201/1480 train_time:185572ms step_avg:155.81ms step:1202/1480 train_time:185742ms step_avg:155.82ms step:1203/1480 train_time:185908ms step_avg:155.83ms step:1204/1480 train_time:186069ms step_avg:155.84ms step:1205/1480 train_time:186230ms step_avg:155.84ms step:1206/1480 train_time:186390ms step_avg:155.84ms step:1207/1480 train_time:186551ms step_avg:155.85ms step:1208/1480 train_time:186712ms step_avg:155.85ms step:1209/1480 train_time:186876ms step_avg:155.86ms step:1210/1480 train_time:187045ms step_avg:155.87ms step:1211/1480 train_time:187208ms step_avg:155.88ms step:1212/1480 train_time:187370ms step_avg:155.88ms step:1213/1480 train_time:187533ms step_avg:155.89ms step:1214/1480 train_time:187700ms 
step_avg:155.90ms step:1215/1480 train_time:187864ms step_avg:155.90ms step:1216/1480 train_time:188026ms step_avg:155.91ms step:1217/1480 train_time:188189ms step_avg:155.91ms step:1218/1480 train_time:188350ms step_avg:155.92ms step:1219/1480 train_time:188518ms step_avg:155.93ms step:1220/1480 train_time:188681ms step_avg:155.93ms step:1221/1480 train_time:188841ms step_avg:155.94ms step:1222/1480 train_time:189003ms step_avg:155.94ms step:1223/1480 train_time:189165ms step_avg:155.95ms step:1224/1480 train_time:189332ms step_avg:155.96ms step:1225/1480 train_time:189495ms step_avg:155.96ms step:1226/1480 train_time:189659ms step_avg:155.97ms step:1227/1480 train_time:189825ms step_avg:155.98ms step:1228/1480 train_time:189987ms step_avg:155.98ms step:1229/1480 train_time:190149ms step_avg:155.99ms step:1230/1480 train_time:190318ms step_avg:156.00ms step:1231/1480 train_time:190485ms step_avg:156.01ms step:1232/1480 train_time:190649ms step_avg:156.01ms step:1233/1480 train_time:190809ms step_avg:156.02ms step:1234/1480 train_time:190970ms step_avg:156.02ms step:1235/1480 train_time:191137ms step_avg:156.03ms step:1236/1480 train_time:191299ms step_avg:156.04ms step:1237/1480 train_time:191462ms step_avg:156.04ms step:1238/1480 train_time:191631ms step_avg:156.05ms step:1239/1480 train_time:191796ms step_avg:156.06ms step:1240/1480 train_time:191961ms step_avg:156.07ms step:1241/1480 train_time:192126ms step_avg:156.07ms step:1242/1480 train_time:192287ms step_avg:156.08ms step:1243/1480 train_time:192450ms step_avg:156.08ms step:1244/1480 train_time:192609ms step_avg:156.09ms step:1245/1480 train_time:192771ms step_avg:156.09ms step:1246/1480 train_time:192936ms step_avg:156.10ms step:1247/1480 train_time:193100ms step_avg:156.10ms step:1248/1480 train_time:193262ms step_avg:156.11ms step:1249/1480 train_time:193425ms step_avg:156.11ms step:1250/1480 train_time:193586ms step_avg:156.12ms step:1250/1480 val_loss:3.3350 train_time:193662ms step_avg:156.18ms 
step:1251/1480 train_time:193755ms step_avg:156.13ms step:1252/1480 train_time:193917ms step_avg:156.13ms step:1253/1480 train_time:194077ms step_avg:156.14ms step:1254/1480 train_time:194237ms step_avg:156.14ms step:1255/1480 train_time:194408ms step_avg:156.15ms step:1256/1480 train_time:194573ms step_avg:156.16ms step:1257/1480 train_time:194734ms step_avg:156.16ms step:1258/1480 train_time:194898ms step_avg:156.17ms step:1259/1480 train_time:195060ms step_avg:156.17ms step:1260/1480 train_time:195221ms step_avg:156.18ms step:1261/1480 train_time:195384ms step_avg:156.18ms step:1262/1480 train_time:195549ms step_avg:156.19ms step:1263/1480 train_time:195715ms step_avg:156.20ms step:1264/1480 train_time:195873ms step_avg:156.20ms step:1265/1480 train_time:196033ms step_avg:156.20ms step:1266/1480 train_time:196196ms step_avg:156.21ms step:1267/1480 train_time:196356ms step_avg:156.21ms step:1268/1480 train_time:196520ms step_avg:156.22ms step:1269/1480 train_time:196686ms step_avg:156.22ms step:1270/1480 train_time:196850ms step_avg:156.23ms step:1271/1480 train_time:197013ms step_avg:156.24ms step:1272/1480 train_time:197173ms step_avg:156.24ms step:1273/1480 train_time:197336ms step_avg:156.24ms step:1274/1480 train_time:197501ms step_avg:156.25ms step:1275/1480 train_time:197662ms step_avg:156.25ms step:1276/1480 train_time:197822ms step_avg:156.26ms step:1277/1480 train_time:197986ms step_avg:156.26ms step:1278/1480 train_time:198147ms step_avg:156.27ms step:1279/1480 train_time:198309ms step_avg:156.27ms step:1280/1480 train_time:198476ms step_avg:156.28ms step:1281/1480 train_time:198638ms step_avg:156.28ms step:1282/1480 train_time:198796ms step_avg:156.29ms step:1283/1480 train_time:198959ms step_avg:156.29ms step:1284/1480 train_time:199122ms step_avg:156.30ms step:1285/1480 train_time:199284ms step_avg:156.30ms step:1286/1480 train_time:199446ms step_avg:156.31ms step:1287/1480 train_time:199609ms step_avg:156.31ms step:1288/1480 train_time:199772ms 
step_avg:156.32ms step:1289/1480 train_time:199942ms step_avg:156.33ms step:1290/1480 train_time:200112ms step_avg:156.34ms step:1291/1480 train_time:200275ms step_avg:156.34ms step:1292/1480 train_time:200438ms step_avg:156.35ms step:1293/1480 train_time:200605ms step_avg:156.36ms step:1294/1480 train_time:200769ms step_avg:156.36ms step:1295/1480 train_time:200932ms step_avg:156.37ms step:1296/1480 train_time:201093ms step_avg:156.37ms step:1297/1480 train_time:201256ms step_avg:156.38ms step:1298/1480 train_time:201418ms step_avg:156.38ms step:1299/1480 train_time:201581ms step_avg:156.39ms step:1300/1480 train_time:201743ms step_avg:156.39ms step:1301/1480 train_time:201904ms step_avg:156.39ms step:1302/1480 train_time:202070ms step_avg:156.40ms step:1303/1480 train_time:202235ms step_avg:156.41ms step:1304/1480 train_time:202400ms step_avg:156.41ms step:1305/1480 train_time:202561ms step_avg:156.42ms step:1306/1480 train_time:202727ms step_avg:156.43ms step:1307/1480 train_time:202890ms step_avg:156.43ms step:1308/1480 train_time:203051ms step_avg:156.43ms step:1309/1480 train_time:203215ms step_avg:156.44ms step:1310/1480 train_time:203375ms step_avg:156.44ms step:1311/1480 train_time:203535ms step_avg:156.45ms step:1312/1480 train_time:203702ms step_avg:156.45ms step:1313/1480 train_time:203865ms step_avg:156.46ms step:1314/1480 train_time:204031ms step_avg:156.47ms step:1315/1480 train_time:204194ms step_avg:156.47ms step:1316/1480 train_time:204353ms step_avg:156.47ms step:1317/1480 train_time:204514ms step_avg:156.48ms step:1318/1480 train_time:204681ms step_avg:156.48ms step:1319/1480 train_time:204847ms step_avg:156.49ms step:1320/1480 train_time:205015ms step_avg:156.50ms step:1321/1480 train_time:205178ms step_avg:156.51ms step:1322/1480 train_time:205351ms step_avg:156.52ms step:1323/1480 train_time:205514ms step_avg:156.52ms step:1324/1480 train_time:205677ms step_avg:156.53ms step:1325/1480 train_time:205847ms step_avg:156.54ms step:1326/1480 
train_time:206014ms step_avg:156.55ms step:1327/1480 train_time:206176ms step_avg:156.55ms step:1328/1480 train_time:206338ms step_avg:156.55ms step:1329/1480 train_time:206538ms step_avg:156.59ms step:1330/1480 train_time:206688ms step_avg:156.58ms step:1331/1480 train_time:206852ms step_avg:156.59ms step:1332/1480 train_time:207014ms step_avg:156.59ms step:1333/1480 train_time:207179ms step_avg:156.60ms step:1334/1480 train_time:207343ms step_avg:156.60ms step:1335/1480 train_time:207504ms step_avg:156.61ms step:1336/1480 train_time:207674ms step_avg:156.62ms step:1337/1480 train_time:207841ms step_avg:156.62ms step:1338/1480 train_time:208004ms step_avg:156.63ms step:1339/1480 train_time:208169ms step_avg:156.64ms step:1340/1480 train_time:208332ms step_avg:156.64ms step:1341/1480 train_time:208493ms step_avg:156.64ms step:1342/1480 train_time:208659ms step_avg:156.65ms step:1343/1480 train_time:208821ms step_avg:156.66ms step:1344/1480 train_time:208983ms step_avg:156.66ms step:1345/1480 train_time:209152ms step_avg:156.67ms step:1346/1480 train_time:209313ms step_avg:156.67ms step:1347/1480 train_time:209476ms step_avg:156.68ms step:1348/1480 train_time:209638ms step_avg:156.68ms step:1349/1480 train_time:209803ms step_avg:156.69ms step:1350/1480 train_time:209970ms step_avg:156.69ms step:1351/1480 train_time:210133ms step_avg:156.70ms step:1352/1480 train_time:210294ms step_avg:156.70ms step:1353/1480 train_time:210460ms step_avg:156.71ms step:1354/1480 train_time:210627ms step_avg:156.72ms step:1355/1480 train_time:210788ms step_avg:156.72ms step:1356/1480 train_time:210956ms step_avg:156.73ms step:1357/1480 train_time:211119ms step_avg:156.73ms step:1358/1480 train_time:211283ms step_avg:156.74ms step:1359/1480 train_time:211447ms step_avg:156.74ms step:1360/1480 train_time:211615ms step_avg:156.75ms step:1361/1480 train_time:211780ms step_avg:156.76ms step:1362/1480 train_time:211945ms step_avg:156.76ms step:1363/1480 train_time:212113ms step_avg:156.77ms 
step:1364/1480 train_time:212275ms step_avg:156.78ms step:1365/1480 train_time:212436ms step_avg:156.78ms step:1366/1480 train_time:212598ms step_avg:156.78ms step:1367/1480 train_time:212760ms step_avg:156.79ms step:1368/1480 train_time:212928ms step_avg:156.80ms step:1369/1480 train_time:213097ms step_avg:156.80ms step:1370/1480 train_time:213263ms step_avg:156.81ms step:1371/1480 train_time:213428ms step_avg:156.82ms step:1372/1480 train_time:213596ms step_avg:156.83ms step:1373/1480 train_time:213756ms step_avg:156.83ms step:1374/1480 train_time:213925ms step_avg:156.84ms step:1375/1480 train_time:214087ms step_avg:156.84ms step:1375/1480 val_loss:3.2960 train_time:214162ms step_avg:156.89ms step:1376/1480 train_time:214253ms step_avg:156.85ms step:1377/1480 train_time:214415ms step_avg:156.85ms step:1378/1480 train_time:214577ms step_avg:156.85ms step:1379/1480 train_time:214741ms step_avg:156.86ms step:1380/1480 train_time:214905ms step_avg:156.87ms step:1381/1480 train_time:215076ms step_avg:156.88ms step:1382/1480 train_time:215238ms step_avg:156.88ms step:1383/1480 train_time:215402ms step_avg:156.88ms step:1384/1480 train_time:215569ms step_avg:156.89ms step:1385/1480 train_time:215729ms step_avg:156.89ms step:1386/1480 train_time:215892ms step_avg:156.90ms step:1387/1480 train_time:216055ms step_avg:156.90ms step:1388/1480 train_time:216216ms step_avg:156.91ms step:1389/1480 train_time:216381ms step_avg:156.91ms step:1390/1480 train_time:216542ms step_avg:156.91ms step:1391/1480 train_time:216705ms step_avg:156.92ms step:1392/1480 train_time:216870ms step_avg:156.92ms step:1393/1480 train_time:217034ms step_avg:156.93ms step:1394/1480 train_time:217195ms step_avg:156.93ms step:1395/1480 train_time:217358ms step_avg:156.94ms step:1396/1480 train_time:217520ms step_avg:156.94ms step:1397/1480 train_time:217680ms step_avg:156.94ms step:1398/1480 train_time:217840ms step_avg:156.94ms step:1399/1480 train_time:218003ms step_avg:156.95ms step:1400/1480 
train_time:218172ms step_avg:156.96ms step:1401/1480 train_time:218332ms step_avg:156.96ms step:1402/1480 train_time:218494ms step_avg:156.96ms step:1403/1480 train_time:218658ms step_avg:156.97ms step:1404/1480 train_time:218821ms step_avg:156.97ms step:1405/1480 train_time:218987ms step_avg:156.98ms step:1406/1480 train_time:219152ms step_avg:156.99ms step:1407/1480 train_time:219312ms step_avg:156.99ms step:1408/1480 train_time:219474ms step_avg:156.99ms step:1409/1480 train_time:219645ms step_avg:157.00ms step:1410/1480 train_time:219809ms step_avg:157.01ms step:1411/1480 train_time:219969ms step_avg:157.01ms step:1412/1480 train_time:220130ms step_avg:157.01ms step:1413/1480 train_time:220293ms step_avg:157.02ms step:1414/1480 train_time:220457ms step_avg:157.02ms step:1415/1480 train_time:220623ms step_avg:157.03ms step:1416/1480 train_time:220797ms step_avg:157.04ms step:1417/1480 train_time:220964ms step_avg:157.05ms step:1418/1480 train_time:221128ms step_avg:157.05ms step:1419/1480 train_time:221293ms step_avg:157.06ms step:1420/1480 train_time:221457ms step_avg:157.06ms step:1421/1480 train_time:221621ms step_avg:157.07ms step:1422/1480 train_time:221787ms step_avg:157.07ms step:1423/1480 train_time:221950ms step_avg:157.08ms step:1424/1480 train_time:222116ms step_avg:157.08ms step:1425/1480 train_time:222287ms step_avg:157.09ms step:1426/1480 train_time:222452ms step_avg:157.10ms step:1427/1480 train_time:222616ms step_avg:157.10ms step:1428/1480 train_time:222778ms step_avg:157.11ms step:1429/1480 train_time:222939ms step_avg:157.11ms step:1430/1480 train_time:223103ms step_avg:157.11ms step:1431/1480 train_time:223269ms step_avg:157.12ms step:1432/1480 train_time:223436ms step_avg:157.13ms step:1433/1480 train_time:223606ms step_avg:157.14ms step:1434/1480 train_time:223775ms step_avg:157.15ms step:1435/1480 train_time:223940ms step_avg:157.15ms step:1436/1480 train_time:224105ms step_avg:157.16ms step:1437/1480 train_time:224269ms step_avg:157.16ms 
step:1438/1480 train_time:224432ms step_avg:157.17ms step:1439/1480 train_time:224598ms step_avg:157.17ms step:1440/1480 train_time:224759ms step_avg:157.17ms step:1441/1480 train_time:224925ms step_avg:157.18ms step:1442/1480 train_time:225090ms step_avg:157.19ms step:1443/1480 train_time:225262ms step_avg:157.20ms step:1444/1480 train_time:225426ms step_avg:157.20ms step:1445/1480 train_time:225589ms step_avg:157.20ms step:1446/1480 train_time:225754ms step_avg:157.21ms step:1447/1480 train_time:225921ms step_avg:157.22ms step:1448/1480 train_time:226084ms step_avg:157.22ms step:1449/1480 train_time:226249ms step_avg:157.23ms step:1450/1480 train_time:226413ms step_avg:157.23ms step:1451/1480 train_time:226576ms step_avg:157.24ms step:1452/1480 train_time:226740ms step_avg:157.24ms step:1453/1480 train_time:226903ms step_avg:157.24ms step:1454/1480 train_time:227067ms step_avg:157.25ms step:1455/1480 train_time:227233ms step_avg:157.25ms step:1456/1480 train_time:227395ms step_avg:157.26ms step:1457/1480 train_time:227558ms step_avg:157.26ms step:1458/1480 train_time:227723ms step_avg:157.27ms step:1459/1480 train_time:227890ms step_avg:157.27ms step:1460/1480 train_time:228054ms step_avg:157.28ms step:1461/1480 train_time:228217ms step_avg:157.28ms step:1462/1480 train_time:228380ms step_avg:157.29ms step:1463/1480 train_time:228548ms step_avg:157.29ms step:1464/1480 train_time:228713ms step_avg:157.30ms step:1465/1480 train_time:228878ms step_avg:157.30ms step:1466/1480 train_time:229040ms step_avg:157.31ms step:1467/1480 train_time:229206ms step_avg:157.31ms step:1468/1480 train_time:229368ms step_avg:157.32ms step:1469/1480 train_time:229532ms step_avg:157.32ms step:1470/1480 train_time:229699ms step_avg:157.33ms step:1471/1480 train_time:229873ms step_avg:157.34ms step:1472/1480 train_time:230042ms step_avg:157.35ms step:1473/1480 train_time:230205ms step_avg:157.35ms step:1474/1480 train_time:230373ms step_avg:157.36ms step:1475/1480 train_time:230543ms 
step_avg:157.37ms step:1476/1480 train_time:230707ms step_avg:157.37ms step:1477/1480 train_time:230875ms step_avg:157.38ms step:1478/1480 train_time:231046ms step_avg:157.39ms step:1479/1480 train_time:231210ms step_avg:157.39ms step:1480/1480 train_time:231373ms step_avg:157.40ms step:1480/1480 val_loss:3.2765 train_time:231448ms step_avg:157.45ms peak memory consumption: 34239 MiB