import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import time import contextlib from dataclasses import dataclass from pathlib import Path import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB # ----------------------------------------------------------------------------- # Muon optimizer @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. """ assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) ns_steps: The number of Newton-Schulz iteration steps to use. """ def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5): self.world_size = int(os.environ['WORLD_SIZE']) self.rank = int(os.environ['RANK']) defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps) params = list(params) assert all(isinstance(p, torch.Tensor) for p in params) sizes = {p.numel() for p in params} param_groups = [ { 'params': [p for p in params if p.numel() == size], 'update_buffer': [ torch.empty(size, device='cuda', dtype=torch.bfloat16) for _ in range(self.world_size) ], } for size in sizes ] super().__init__(param_groups, defaults) def step(self): for group in self.param_groups: lr = group['lr'] momentum = group['momentum'] nesterov = group['nesterov'] ns_steps = group['ns_steps'] update_buffers = group['update_buffer'] # generate weight updates in distributed fashion params = group['params'] assert len(params) % self.world_size == 0 handle = None params_world = None def update_prev(): if params_world is None: return assert handle is not None handle.wait() for p_world, g_world in zip(params_world, update_buffers): p_world.data.add_( g_world.view_as(p_world), alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5, ) for base_i in range(len(params))[::self.world_size]: p = params[base_i + self.rank] g = p.grad assert g is not None state = self.state[p] if 'momentum_buffer' not in state: state['momentum_buffer'] = torch.zeros_like(g) buf = state['momentum_buffer'] buf.lerp_(g, 1 - momentum) g = g.lerp_(buf, momentum) if nesterov else buf g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten() update_prev() handle = dist.all_gather(update_buffers, g, async_op=True) params_world = params[base_i : base_i + self.world_size] update_prev() # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, num_heads): super().__init__() assert dim % num_heads == 0 self.num_heads = num_heads self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x, vi, block_mask): B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q = self.c_q(x).view(B, T, self.num_heads, -1) k = self.c_k(x).view(B, T, self.num_heads, -1) v = self.c_v(x).view(B, T, self.num_heads, -1) v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977 q, k = norm(q), norm(k) # QK norm @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x): x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.model_dim, config.num_heads) self.mlp = MLP(config.model_dim) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x, vi, x0, block_mask): x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x class ValueEmbedding(nn.Module): def __init__(self, config: "GPTConfig"): super().__init__() self.__setattr__ self.embed = nn.ModuleList([ nn.Embedding(config.vocab_size, config.model_dim) for _ in range(6) ]) def forward(self, inputs) -> "list[torch.Tensor]": ve = [emb(inputs) for emb in self.embed] ve += reversed(ve) return ve # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 num_layers : int = 12 num_heads : int = 6 # head dim 128 suggested by @Grad62304977 model_dim : int = 768 class GPT(nn.Module): def __init__(self, config: GPTConfig): super().__init__() self.num_layers = config.num_layers # U-net design by @brendanh0gan self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection weights for decoder layers self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers)) self.embed = nn.Embedding(config.vocab_size, config.model_dim) self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)]) # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning # U-net structure on token value embeddings by @leloykun self.value_embeds = ValueEmbedding(config) self.lm_head = CastedLinear(config.model_dim, config.vocab_size) self.lm_head.weight.data.zero_() # @Grad62304977 def forward( self, inputs: torch.Tensor, targets: torch.Tensor, sliding_window_num_blocks: torch.Tensor, ): BLOCK_SIZE = 128 assert inputs.ndim == 1 docs = (inputs == 50256).cumsum(0) docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous() docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous() def document_causal(b, h, q_idx, kv_idx): causal_mask = q_idx >= kv_idx document_mask = docs[q_idx] == docs[kv_idx] return causal_mask & document_mask def dense_to_ordered(dense_mask: torch.Tensor): num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32) indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32) return num_blocks[None, None].contiguous(), indices[None, None].contiguous() def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor): kv_idx = block_idx = torch.arange(512, dtype=torch.int32, device="cuda") q_idx = block_idx[:, None] causal_bm = q_idx >= kv_idx causal_full_bm = q_idx > kv_idx window_bm = q_idx - kv_idx < sliding_window_num_blocks window_full_bm = window_bm # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx]) document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None]) document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None]) nonzero_bm = causal_bm & window_bm & document_bm full_bm = causal_full_bm & window_full_bm & document_full_bm kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm) full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm) return BlockMask.from_kv_blocks( kv_num_blocks, kv_indices, full_kv_num_blocks, full_kv_indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=document_causal, ) block_mask = create_doc_swc_block_mask(sliding_window_num_blocks) # forward the GPT model itself x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim) x = norm(x) # @Grad62304977 x0 = x ve = self.value_embeds(inputs) ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:] # Store outputs for U-Net skip connections skip_connections = [] # Encoder pass - process only the first half of the blocks for i in range(self.num_encoder_layers): x = self.blocks[i](x, ve_enc[i], x0, block_mask) skip_connections.append(x) # Decoder pass - process the remaining blocks with weighted skip connections for i in range(self.num_decoder_layers): x = x + self.skip_weights[i] * skip_connections.pop() # U-net structure on token value embeddings by @leloykun x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask) x = norm(x) logits = self.lm_head(x) logits = 30 * torch.tanh(logits / 30) # @Grad62304977 logits = logits.float() loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) return loss # ----------------------------------------------------------------------------- # Our own simple Distributed Data Loader def _peek_data_shard(file: Path): # only reads the header, returns header data # header is 256 int32 header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) assert header[0] == 20240520, "magic number mismatch in the data .bin file" assert header[1] == 1, "unsupported version" return int(header[2]) # number of tokens (claimed) def _load_data_shard(path: Path, num_tokens): with path.open("rb", buffering=0) as f: tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) f.seek(256 * 4) nbytes = f.readinto(tokens.numpy()) assert nbytes == 2 * num_tokens, "number of tokens read does not match header?" return tokens class DistributedDataLoader: def __init__(self, filename_pattern, seq_len, process_rank, num_processes): self.process_rank = process_rank self.num_processes = num_processes self.seq_len = seq_len # glob files that match the pattern self.files = sorted(Path.cwd().glob(filename_pattern)) assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" # load and validate all data shards, count number of tokens in total self.files_num_tokens = [_peek_data_shard(file) for file in self.files] assert min(self.files_num_tokens) >= num_processes * seq_len + 1 self.total_num_tokens = sum(self.files_num_tokens) self.reset() def reset(self): self.current_shard = -1 self.advance() def advance(self): # advance to next data shard self.current_shard = (self.current_shard + 1) % len(self.files) self.current_position = self.process_rank * self.seq_len self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard]) def next_batch(self): batch_size = self.seq_len * self.num_processes buf = self.tokens[self.current_position:self.current_position+self.seq_len+1] # host side async is sufficient; # no performance improvement was observed when introducing a separate stream. inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets # advance current position and load next shard if necessary self.current_position += batch_size if self.current_position + batch_size + 1 >= len(self.tokens): self.advance() return inputs, targets # ----------------------------------------------------------------------------- # int main @dataclass class Hyperparameters: # data hyperparams input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on # optimization hyperparams batch_size : int = 8 # batch size, in sequences, across all devices sequence_length : int = 64*1024 # sequence length, in tokens num_iterations : int = 1480 # number of iterations to run warmup_iters : int = 0 cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule weight_decay : float = 0 # evaluation and logging hyperparams val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end args = Hyperparameters() # set up DDP (distributed data parallel). torchrun sets this env variable ddp_rank = int(os.environ['RANK']) ddp_local_rank = int(os.environ['LOCAL_RANK']) ddp_world_size = int(os.environ['WORLD_SIZE']) assert torch.cuda.is_available() device = torch.device(f"cuda:{ddp_local_rank}") torch.cuda.set_device(device) print(f"using device: {device}") dist.init_process_group(backend='nccl', device_id=device) dist.barrier() master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc. # begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 10:03:14 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 37C P0 115W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29376ms step_avg:nanms step:2/1480 train_time:29485ms step_avg:nanms step:3/1480 train_time:29607ms step_avg:nanms step:4/1480 train_time:29748ms step_avg:nanms step:5/1480 train_time:29890ms step_avg:nanms step:6/1480 train_time:30030ms step_avg:nanms step:7/1480 train_time:30171ms step_avg:nanms step:8/1480 train_time:30313ms step_avg:nanms step:9/1480 train_time:30456ms step_avg:nanms step:10/1480 train_time:30600ms step_avg:nanms step:11/1480 train_time:143ms step_avg:nanms step:12/1480 train_time:281ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.81ms step:14/1480 train_time:570ms step_avg:142.40ms step:15/1480 train_time:711ms step_avg:142.30ms step:16/1480 train_time:853ms step_avg:142.08ms step:17/1480 train_time:995ms step_avg:142.21ms step:18/1480 train_time:1137ms step_avg:142.14ms step:19/1480 train_time:1278ms step_avg:142.00ms step:20/1480 train_time:1420ms step_avg:142.00ms step:21/1480 train_time:1565ms step_avg:142.29ms step:22/1480 train_time:1708ms step_avg:142.37ms step:23/1480 train_time:1851ms step_avg:142.39ms step:24/1480 train_time:1994ms step_avg:142.44ms step:25/1480 train_time:2137ms step_avg:142.45ms step:26/1480 train_time:2278ms step_avg:142.36ms step:27/1480 train_time:2421ms step_avg:142.40ms step:28/1480 train_time:2563ms step_avg:142.39ms step:29/1480 train_time:2708ms step_avg:142.51ms step:30/1480 train_time:2852ms step_avg:142.60ms step:31/1480 train_time:2996ms step_avg:142.68ms step:32/1480 train_time:3139ms step_avg:142.67ms step:33/1480 train_time:3281ms step_avg:142.64ms step:34/1480 train_time:3424ms step_avg:142.66ms step:35/1480 train_time:3568ms step_avg:142.73ms step:36/1480 train_time:3712ms step_avg:142.78ms step:37/1480 train_time:3853ms step_avg:142.71ms step:38/1480 train_time:3995ms step_avg:142.69ms step:39/1480 train_time:4138ms step_avg:142.68ms step:40/1480 train_time:4280ms step_avg:142.66ms step:41/1480 train_time:4422ms step_avg:142.65ms step:42/1480 train_time:4567ms step_avg:142.71ms step:43/1480 train_time:4710ms step_avg:142.73ms step:44/1480 train_time:4853ms step_avg:142.75ms step:45/1480 train_time:4995ms step_avg:142.71ms step:46/1480 train_time:5137ms step_avg:142.70ms step:47/1480 train_time:5278ms step_avg:142.66ms step:48/1480 train_time:5421ms step_avg:142.67ms step:49/1480 train_time:5564ms step_avg:142.68ms step:50/1480 train_time:5709ms step_avg:142.72ms step:51/1480 train_time:5851ms step_avg:142.70ms step:52/1480 train_time:5993ms step_avg:142.70ms step:53/1480 train_time:6135ms step_avg:142.69ms step:54/1480 train_time:6277ms step_avg:142.67ms step:55/1480 train_time:6420ms step_avg:142.67ms step:56/1480 train_time:6562ms step_avg:142.65ms step:57/1480 train_time:6708ms step_avg:142.73ms step:58/1480 train_time:6854ms step_avg:142.79ms step:59/1480 train_time:6996ms step_avg:142.77ms step:60/1480 train_time:7138ms step_avg:142.77ms step:61/1480 train_time:7280ms step_avg:142.74ms step:62/1480 train_time:7423ms step_avg:142.75ms step:63/1480 train_time:7566ms step_avg:142.76ms step:64/1480 train_time:7710ms step_avg:142.78ms step:65/1480 train_time:7852ms step_avg:142.77ms step:66/1480 train_time:7995ms step_avg:142.77ms step:67/1480 train_time:8137ms step_avg:142.75ms step:68/1480 train_time:8278ms step_avg:142.73ms step:69/1480 train_time:8421ms step_avg:142.72ms step:70/1480 train_time:8564ms step_avg:142.73ms step:71/1480 train_time:8706ms step_avg:142.73ms step:72/1480 train_time:8850ms step_avg:142.74ms step:73/1480 train_time:8991ms step_avg:142.72ms step:74/1480 train_time:9134ms step_avg:142.71ms step:75/1480 train_time:9276ms step_avg:142.70ms step:76/1480 train_time:9419ms step_avg:142.71ms step:77/1480 train_time:9560ms step_avg:142.68ms step:78/1480 train_time:9703ms step_avg:142.69ms step:79/1480 train_time:9845ms step_avg:142.68ms step:80/1480 train_time:10387ms step_avg:148.38ms step:81/1480 train_time:10488ms step_avg:147.72ms step:82/1480 train_time:10630ms step_avg:147.65ms step:83/1480 train_time:10772ms step_avg:147.57ms step:84/1480 train_time:10915ms step_avg:147.50ms step:85/1480 train_time:11056ms step_avg:147.41ms step:86/1480 train_time:11199ms step_avg:147.35ms step:87/1480 train_time:11341ms step_avg:147.29ms step:88/1480 train_time:11484ms step_avg:147.24ms step:89/1480 train_time:11628ms step_avg:147.20ms step:90/1480 train_time:11770ms step_avg:147.13ms step:91/1480 train_time:11913ms step_avg:147.07ms step:92/1480 train_time:12056ms step_avg:147.03ms step:93/1480 train_time:12198ms step_avg:146.97ms step:94/1480 train_time:12340ms step_avg:146.91ms step:95/1480 train_time:12483ms step_avg:146.86ms step:96/1480 train_time:12628ms step_avg:146.83ms step:97/1480 train_time:13151ms step_avg:151.17ms step:98/1480 train_time:13654ms step_avg:155.16ms step:99/1480 train_time:13752ms step_avg:154.52ms step:100/1480 train_time:13895ms step_avg:154.39ms step:101/1480 train_time:14041ms step_avg:154.30ms step:102/1480 train_time:14177ms step_avg:154.10ms step:103/1480 train_time:14320ms step_avg:153.98ms step:104/1480 train_time:14462ms step_avg:153.86ms step:105/1480 train_time:14607ms step_avg:153.75ms step:106/1480 train_time:14753ms step_avg:153.68ms step:107/1480 train_time:14897ms step_avg:153.58ms step:108/1480 train_time:15040ms step_avg:153.46ms step:109/1480 train_time:15182ms step_avg:153.35ms step:110/1480 train_time:15328ms step_avg:153.28ms step:111/1480 train_time:15471ms step_avg:153.18ms step:112/1480 train_time:15617ms step_avg:153.10ms step:113/1480 train_time:15762ms step_avg:153.03ms step:114/1480 train_time:15908ms step_avg:152.96ms step:115/1480 train_time:16054ms step_avg:152.90ms step:116/1480 train_time:16200ms step_avg:152.83ms step:117/1480 train_time:16346ms step_avg:152.76ms step:118/1480 train_time:16492ms step_avg:152.70ms step:119/1480 train_time:16636ms step_avg:152.63ms step:120/1480 train_time:16782ms step_avg:152.56ms step:121/1480 train_time:16928ms step_avg:152.51ms step:122/1480 train_time:17073ms step_avg:152.44ms step:123/1480 train_time:17219ms step_avg:152.38ms step:124/1480 train_time:17365ms step_avg:152.33ms step:125/1480 train_time:17512ms step_avg:152.28ms step:125/1480 val_loss:4.3985 train_time:17576ms step_avg:152.84ms step:126/1480 train_time:17681ms step_avg:152.43ms step:127/1480 train_time:17814ms step_avg:152.25ms step:128/1480 train_time:17959ms step_avg:152.20ms step:129/1480 train_time:18104ms step_avg:152.14ms step:130/1480 train_time:18251ms step_avg:152.09ms step:131/1480 train_time:18396ms step_avg:152.03ms step:132/1480 train_time:18540ms step_avg:151.97ms step:133/1480 train_time:18686ms step_avg:151.92ms step:134/1480 train_time:18832ms step_avg:151.87ms step:135/1480 train_time:18978ms step_avg:151.82ms step:136/1480 train_time:19123ms step_avg:151.77ms step:137/1480 train_time:19270ms step_avg:151.73ms step:138/1480 train_time:19416ms step_avg:151.69ms step:139/1480 train_time:19561ms step_avg:151.64ms step:140/1480 train_time:19706ms step_avg:151.59ms step:141/1480 train_time:19853ms step_avg:151.55ms step:142/1480 train_time:19998ms step_avg:151.50ms step:143/1480 train_time:20143ms step_avg:151.45ms step:144/1480 train_time:20290ms step_avg:151.41ms step:145/1480 train_time:20436ms step_avg:151.38ms step:146/1480 train_time:20581ms step_avg:151.33ms step:147/1480 train_time:20726ms step_avg:151.29ms step:148/1480 train_time:20873ms step_avg:151.26ms step:149/1480 train_time:21018ms step_avg:151.21ms step:150/1480 train_time:21164ms step_avg:151.17ms step:151/1480 train_time:21310ms step_avg:151.14ms step:152/1480 train_time:21457ms step_avg:151.11ms step:153/1480 train_time:21601ms step_avg:151.06ms step:154/1480 train_time:21748ms step_avg:151.03ms step:155/1480 train_time:21894ms step_avg:150.99ms step:156/1480 train_time:22039ms step_avg:150.95ms step:157/1480 train_time:22184ms step_avg:150.91ms step:158/1480 train_time:22330ms step_avg:150.88ms step:159/1480 train_time:22476ms step_avg:150.85ms step:160/1480 train_time:22622ms step_avg:150.81ms step:161/1480 train_time:22770ms step_avg:150.79ms step:162/1480 train_time:22916ms step_avg:150.76ms step:163/1480 train_time:23062ms step_avg:150.73ms step:164/1480 train_time:23208ms step_avg:150.70ms step:165/1480 train_time:23355ms step_avg:150.68ms step:166/1480 train_time:23501ms step_avg:150.64ms step:167/1480 train_time:23647ms step_avg:150.62ms step:168/1480 train_time:23794ms step_avg:150.59ms step:169/1480 train_time:23939ms step_avg:150.56ms step:170/1480 train_time:24084ms step_avg:150.52ms step:171/1480 train_time:24230ms step_avg:150.50ms step:172/1480 train_time:24376ms step_avg:150.47ms step:173/1480 train_time:24521ms step_avg:150.43ms step:174/1480 train_time:24666ms step_avg:150.40ms step:175/1480 train_time:24812ms step_avg:150.38ms step:176/1480 train_time:24959ms step_avg:150.35ms step:177/1480 train_time:25104ms step_avg:150.32ms step:178/1480 train_time:25251ms step_avg:150.31ms step:179/1480 train_time:25397ms step_avg:150.28ms step:180/1480 train_time:25542ms step_avg:150.25ms step:181/1480 train_time:25689ms step_avg:150.23ms step:182/1480 train_time:25836ms step_avg:150.21ms step:183/1480 train_time:25980ms step_avg:150.17ms step:184/1480 train_time:26126ms step_avg:150.15ms step:185/1480 train_time:26273ms step_avg:150.13ms step:186/1480 train_time:26418ms step_avg:150.10ms step:187/1480 train_time:26564ms step_avg:150.08ms step:188/1480 train_time:26710ms step_avg:150.05ms step:189/1480 train_time:26875ms step_avg:150.14ms step:190/1480 train_time:27002ms step_avg:150.01ms step:191/1480 train_time:27147ms step_avg:149.99ms step:192/1480 train_time:27294ms step_avg:149.96ms step:193/1480 train_time:27439ms step_avg:149.94ms step:194/1480 train_time:27583ms step_avg:149.91ms step:195/1480 train_time:27730ms step_avg:149.89ms step:196/1480 train_time:27876ms step_avg:149.87ms step:197/1480 train_time:28021ms step_avg:149.85ms step:198/1480 train_time:28167ms step_avg:149.83ms step:199/1480 train_time:28313ms step_avg:149.80ms step:200/1480 train_time:28459ms step_avg:149.78ms step:201/1480 train_time:28606ms step_avg:149.77ms step:202/1480 train_time:28750ms step_avg:149.74ms step:203/1480 train_time:28896ms step_avg:149.72ms step:204/1480 train_time:29041ms step_avg:149.70ms step:205/1480 train_time:29187ms step_avg:149.68ms step:206/1480 train_time:29333ms step_avg:149.66ms step:207/1480 train_time:29477ms step_avg:149.63ms step:208/1480 train_time:29623ms step_avg:149.61ms step:209/1480 train_time:29770ms step_avg:149.60ms step:210/1480 train_time:29916ms step_avg:149.58ms step:211/1480 train_time:30062ms step_avg:149.56ms step:212/1480 train_time:30209ms step_avg:149.55ms step:213/1480 train_time:30356ms step_avg:149.54ms step:214/1480 train_time:30500ms step_avg:149.51ms step:215/1480 train_time:30646ms step_avg:149.49ms step:216/1480 train_time:30793ms step_avg:149.48ms step:217/1480 train_time:30937ms step_avg:149.45ms step:218/1480 train_time:31082ms step_avg:149.43ms step:219/1480 train_time:31228ms step_avg:149.42ms step:220/1480 train_time:31374ms step_avg:149.40ms step:221/1480 train_time:32006ms step_avg:151.69ms step:222/1480 train_time:32114ms step_avg:151.48ms step:223/1480 train_time:32264ms step_avg:151.47ms step:224/1480 train_time:32412ms step_avg:151.46ms step:225/1480 train_time:32561ms step_avg:151.44ms step:226/1480 train_time:32709ms step_avg:151.43ms step:227/1480 train_time:32857ms step_avg:151.42ms step:228/1480 train_time:33005ms step_avg:151.40ms step:229/1480 train_time:33155ms step_avg:151.39ms step:230/1480 train_time:33303ms step_avg:151.38ms step:231/1480 train_time:33453ms step_avg:151.37ms step:232/1480 train_time:33600ms step_avg:151.35ms step:233/1480 train_time:33750ms step_avg:151.34ms step:234/1480 train_time:33898ms step_avg:151.33ms step:235/1480 train_time:34047ms step_avg:151.32ms step:236/1480 train_time:34196ms step_avg:151.31ms step:237/1480 train_time:34345ms step_avg:151.30ms step:238/1480 train_time:34494ms step_avg:151.29ms step:239/1480 train_time:34642ms step_avg:151.28ms step:240/1480 train_time:34791ms step_avg:151.27ms step:241/1480 train_time:34939ms step_avg:151.25ms step:242/1480 train_time:35088ms step_avg:151.24ms step:243/1480 train_time:35237ms step_avg:151.23ms step:244/1480 train_time:35385ms step_avg:151.22ms step:245/1480 train_time:35534ms step_avg:151.21ms step:246/1480 train_time:35682ms step_avg:151.20ms step:247/1480 train_time:35832ms step_avg:151.19ms step:248/1480 train_time:35980ms step_avg:151.18ms step:249/1480 train_time:36128ms step_avg:151.16ms step:250/1480 train_time:36277ms step_avg:151.16ms step:250/1480 val_loss:3.9821 train_time:36343ms step_avg:151.43ms step:251/1480 train_time:36435ms step_avg:151.18ms step:252/1480 train_time:36584ms step_avg:151.17ms step:253/1480 train_time:36732ms step_avg:151.16ms step:254/1480 train_time:36881ms step_avg:151.15ms step:255/1480 train_time:37029ms step_avg:151.14ms step:256/1480 train_time:37178ms step_avg:151.13ms step:257/1480 train_time:37326ms step_avg:151.12ms step:258/1480 train_time:37477ms step_avg:151.12ms step:259/1480 train_time:37626ms step_avg:151.11ms step:260/1480 train_time:37776ms step_avg:151.10ms step:261/1480 train_time:37924ms step_avg:151.09ms step:262/1480 train_time:38072ms step_avg:151.08ms step:263/1480 train_time:38221ms step_avg:151.07ms step:264/1480 train_time:38368ms step_avg:151.06ms step:265/1480 train_time:38518ms step_avg:151.05ms step:266/1480 train_time:38665ms step_avg:151.03ms step:267/1480 train_time:38814ms step_avg:151.03ms step:268/1480 train_time:38962ms step_avg:151.02ms step:269/1480 train_time:39110ms step_avg:151.00ms step:270/1480 train_time:39259ms step_avg:151.00ms step:271/1480 train_time:39406ms step_avg:150.98ms step:272/1480 train_time:39555ms step_avg:150.98ms step:273/1480 train_time:39704ms step_avg:150.96ms step:274/1480 train_time:39852ms step_avg:150.96ms step:275/1480 train_time:40002ms step_avg:150.95ms step:276/1480 train_time:40150ms step_avg:150.94ms step:277/1480 train_time:40299ms step_avg:150.93ms step:278/1480 train_time:40449ms step_avg:150.93ms step:279/1480 train_time:40598ms step_avg:150.92ms step:280/1480 train_time:40746ms step_avg:150.91ms step:281/1480 train_time:40895ms step_avg:150.91ms step:282/1480 train_time:41043ms step_avg:150.89ms step:283/1480 train_time:41192ms step_avg:150.89ms step:284/1480 train_time:41341ms step_avg:150.88ms step:285/1480 train_time:41490ms step_avg:150.87ms step:286/1480 train_time:41639ms step_avg:150.86ms step:287/1480 train_time:41787ms step_avg:150.85ms step:288/1480 train_time:41935ms step_avg:150.85ms step:289/1480 train_time:42083ms step_avg:150.84ms step:290/1480 train_time:42231ms step_avg:150.83ms step:291/1480 train_time:42380ms step_avg:150.82ms step:292/1480 train_time:42528ms step_avg:150.81ms step:293/1480 train_time:42677ms step_avg:150.80ms step:294/1480 train_time:42824ms step_avg:150.79ms step:295/1480 train_time:42974ms step_avg:150.78ms step:296/1480 train_time:43122ms step_avg:150.78ms step:297/1480 train_time:43270ms step_avg:150.77ms step:298/1480 train_time:43420ms step_avg:150.76ms step:299/1480 train_time:43569ms step_avg:150.76ms step:300/1480 train_time:43719ms step_avg:150.76ms step:301/1480 train_time:43868ms step_avg:150.75ms step:302/1480 train_time:44018ms step_avg:150.75ms step:303/1480 train_time:44165ms step_avg:150.73ms step:304/1480 train_time:44314ms step_avg:150.73ms step:305/1480 train_time:44463ms step_avg:150.72ms step:306/1480 train_time:44609ms step_avg:150.71ms step:307/1480 train_time:44760ms step_avg:150.71ms step:308/1480 train_time:44907ms step_avg:150.69ms step:309/1480 train_time:45057ms step_avg:150.69ms step:310/1480 train_time:45204ms step_avg:150.68ms step:311/1480 train_time:45353ms step_avg:150.67ms step:312/1480 train_time:45502ms step_avg:150.67ms step:313/1480 train_time:45650ms step_avg:150.66ms step:314/1480 train_time:45799ms step_avg:150.66ms step:315/1480 train_time:45947ms step_avg:150.65ms step:316/1480 train_time:46098ms step_avg:150.65ms step:317/1480 train_time:46247ms step_avg:150.64ms step:318/1480 train_time:46395ms step_avg:150.63ms step:319/1480 train_time:46543ms step_avg:150.63ms step:320/1480 train_time:46692ms step_avg:150.62ms step:321/1480 train_time:46841ms step_avg:150.61ms step:322/1480 train_time:46990ms step_avg:150.61ms step:323/1480 train_time:47138ms step_avg:150.60ms step:324/1480 train_time:47287ms step_avg:150.60ms step:325/1480 train_time:47437ms step_avg:150.59ms step:326/1480 train_time:47585ms step_avg:150.58ms step:327/1480 train_time:47733ms step_avg:150.58ms step:328/1480 train_time:47881ms step_avg:150.57ms step:329/1480 train_time:48030ms step_avg:150.56ms step:330/1480 train_time:48180ms step_avg:150.56ms step:331/1480 train_time:48331ms step_avg:150.56ms step:332/1480 train_time:48482ms step_avg:150.57ms step:333/1480 train_time:48632ms step_avg:150.56ms step:334/1480 train_time:48783ms step_avg:150.56ms step:335/1480 train_time:48933ms step_avg:150.56ms step:336/1480 train_time:49083ms step_avg:150.56ms step:337/1480 train_time:49234ms step_avg:150.56ms step:338/1480 train_time:49386ms step_avg:150.57ms step:339/1480 train_time:49536ms step_avg:150.57ms step:340/1480 train_time:49687ms step_avg:150.57ms step:341/1480 train_time:49838ms step_avg:150.57ms step:342/1480 train_time:49988ms step_avg:150.57ms step:343/1480 train_time:50139ms step_avg:150.57ms step:344/1480 train_time:50290ms step_avg:150.57ms step:345/1480 train_time:50441ms step_avg:150.57ms step:346/1480 train_time:50593ms step_avg:150.57ms step:347/1480 train_time:50744ms step_avg:150.58ms step:348/1480 train_time:50895ms step_avg:150.58ms step:349/1480 train_time:51045ms step_avg:150.58ms step:350/1480 train_time:51197ms step_avg:150.58ms step:351/1480 train_time:51347ms step_avg:150.58ms step:352/1480 train_time:51499ms step_avg:150.58ms step:353/1480 train_time:51650ms step_avg:150.58ms step:354/1480 train_time:51801ms step_avg:150.59ms step:355/1480 train_time:51952ms step_avg:150.58ms step:356/1480 train_time:52103ms step_avg:150.59ms step:357/1480 train_time:52253ms step_avg:150.59ms step:358/1480 train_time:52404ms step_avg:150.59ms step:359/1480 train_time:52555ms step_avg:150.59ms step:360/1480 train_time:52706ms step_avg:150.59ms step:361/1480 train_time:52858ms step_avg:150.59ms step:362/1480 train_time:53009ms step_avg:150.59ms step:363/1480 train_time:53160ms step_avg:150.59ms step:364/1480 train_time:53309ms step_avg:150.59ms step:365/1480 train_time:53461ms step_avg:150.59ms step:366/1480 train_time:53610ms step_avg:150.59ms step:367/1480 train_time:53762ms step_avg:150.59ms step:368/1480 train_time:53911ms step_avg:150.59ms step:369/1480 train_time:54062ms step_avg:150.59ms step:370/1480 train_time:54211ms step_avg:150.59ms step:371/1480 train_time:54362ms step_avg:150.59ms step:372/1480 train_time:54512ms step_avg:150.59ms step:373/1480 train_time:54664ms step_avg:150.59ms step:374/1480 train_time:54814ms step_avg:150.59ms step:375/1480 train_time:54965ms step_avg:150.59ms step:375/1480 val_loss:3.8004 train_time:55032ms step_avg:150.77ms step:376/1480 train_time:55127ms step_avg:150.62ms step:377/1480 train_time:55272ms step_avg:150.60ms step:378/1480 train_time:55423ms step_avg:150.61ms step:379/1480 train_time:55591ms step_avg:150.65ms step:380/1480 train_time:55724ms step_avg:150.61ms step:381/1480 train_time:55873ms step_avg:150.60ms step:382/1480 train_time:56024ms step_avg:150.60ms step:383/1480 train_time:56174ms step_avg:150.60ms step:384/1480 train_time:56327ms step_avg:150.61ms step:385/1480 train_time:56478ms step_avg:150.61ms step:386/1480 train_time:56630ms step_avg:150.61ms step:387/1480 train_time:56780ms step_avg:150.61ms step:388/1480 train_time:56931ms step_avg:150.61ms step:389/1480 train_time:57082ms step_avg:150.61ms step:390/1480 train_time:57233ms step_avg:150.61ms step:391/1480 train_time:57384ms step_avg:150.61ms step:392/1480 train_time:57534ms step_avg:150.61ms step:393/1480 train_time:57685ms step_avg:150.61ms step:394/1480 train_time:57836ms step_avg:150.61ms step:395/1480 train_time:57988ms step_avg:150.62ms step:396/1480 train_time:58138ms step_avg:150.62ms step:397/1480 train_time:58289ms step_avg:150.62ms step:398/1480 train_time:58439ms step_avg:150.62ms step:399/1480 train_time:58590ms step_avg:150.62ms step:400/1480 train_time:58740ms step_avg:150.62ms step:401/1480 train_time:58891ms step_avg:150.62ms step:402/1480 train_time:59043ms step_avg:150.62ms step:403/1480 train_time:59193ms step_avg:150.62ms step:404/1480 train_time:59344ms step_avg:150.62ms step:405/1480 train_time:59495ms step_avg:150.62ms step:406/1480 train_time:59646ms step_avg:150.62ms step:407/1480 train_time:59798ms step_avg:150.62ms step:408/1480 train_time:59949ms step_avg:150.63ms step:409/1480 train_time:60100ms step_avg:150.63ms step:410/1480 train_time:60251ms step_avg:150.63ms step:411/1480 train_time:60402ms step_avg:150.63ms step:412/1480 train_time:60553ms step_avg:150.63ms step:413/1480 train_time:60706ms step_avg:150.64ms step:414/1480 train_time:60856ms step_avg:150.63ms step:415/1480 train_time:61008ms step_avg:150.64ms step:416/1480 train_time:61158ms step_avg:150.64ms step:417/1480 train_time:61309ms step_avg:150.64ms step:418/1480 train_time:61459ms step_avg:150.64ms step:419/1480 train_time:61611ms step_avg:150.64ms step:420/1480 train_time:61761ms step_avg:150.64ms step:421/1480 train_time:61912ms step_avg:150.64ms step:422/1480 train_time:62063ms step_avg:150.64ms step:423/1480 train_time:62213ms step_avg:150.64ms step:424/1480 train_time:62364ms step_avg:150.64ms step:425/1480 train_time:62513ms step_avg:150.63ms step:426/1480 train_time:62665ms step_avg:150.64ms step:427/1480 train_time:62815ms step_avg:150.64ms step:428/1480 train_time:62967ms step_avg:150.64ms step:429/1480 train_time:63118ms step_avg:150.64ms step:430/1480 train_time:63269ms step_avg:150.64ms step:431/1480 train_time:63419ms step_avg:150.64ms step:432/1480 train_time:63570ms step_avg:150.64ms step:433/1480 train_time:63719ms step_avg:150.64ms step:434/1480 train_time:63870ms step_avg:150.64ms step:435/1480 train_time:64022ms step_avg:150.64ms step:436/1480 train_time:64172ms step_avg:150.64ms step:437/1480 train_time:64324ms step_avg:150.64ms step:438/1480 train_time:64473ms step_avg:150.64ms step:439/1480 train_time:64625ms step_avg:150.64ms step:440/1480 train_time:64776ms step_avg:150.64ms step:441/1480 train_time:64929ms step_avg:150.65ms step:442/1480 train_time:65083ms step_avg:150.65ms step:443/1480 train_time:65236ms step_avg:150.66ms step:444/1480 train_time:65389ms step_avg:150.67ms step:445/1480 train_time:65541ms step_avg:150.67ms step:446/1480 train_time:65693ms step_avg:150.67ms step:447/1480 train_time:65845ms step_avg:150.68ms step:448/1480 train_time:65999ms step_avg:150.68ms step:449/1480 train_time:66152ms step_avg:150.69ms step:450/1480 train_time:66305ms step_avg:150.69ms step:451/1480 train_time:66458ms step_avg:150.70ms step:452/1480 train_time:66611ms step_avg:150.70ms step:453/1480 train_time:66764ms step_avg:150.71ms step:454/1480 train_time:66915ms step_avg:150.71ms step:455/1480 train_time:67069ms step_avg:150.72ms step:456/1480 train_time:67221ms step_avg:150.72ms step:457/1480 train_time:67374ms step_avg:150.73ms step:458/1480 train_time:67528ms step_avg:150.73ms step:459/1480 train_time:67683ms step_avg:150.74ms step:460/1480 train_time:67835ms step_avg:150.74ms step:461/1480 train_time:67989ms step_avg:150.75ms step:462/1480 train_time:68140ms step_avg:150.75ms step:463/1480 train_time:68293ms step_avg:150.76ms step:464/1480 train_time:68446ms step_avg:150.76ms step:465/1480 train_time:68600ms step_avg:150.77ms step:466/1480 train_time:68753ms step_avg:150.77ms step:467/1480 train_time:68907ms step_avg:150.78ms step:468/1480 train_time:69059ms step_avg:150.78ms step:469/1480 train_time:69211ms step_avg:150.79ms step:470/1480 train_time:69364ms step_avg:150.79ms step:471/1480 train_time:69516ms step_avg:150.79ms step:472/1480 train_time:69669ms step_avg:150.80ms step:473/1480 train_time:69821ms step_avg:150.80ms step:474/1480 train_time:69974ms step_avg:150.81ms step:475/1480 train_time:70127ms step_avg:150.81ms step:476/1480 train_time:70279ms step_avg:150.81ms step:477/1480 train_time:70432ms step_avg:150.82ms step:478/1480 train_time:70585ms step_avg:150.82ms step:479/1480 train_time:70737ms step_avg:150.83ms step:480/1480 train_time:70891ms step_avg:150.83ms step:481/1480 train_time:71044ms step_avg:150.84ms step:482/1480 train_time:71197ms step_avg:150.84ms step:483/1480 train_time:71350ms step_avg:150.85ms step:484/1480 train_time:71504ms step_avg:150.85ms step:485/1480 train_time:71657ms step_avg:150.86ms step:486/1480 train_time:71810ms step_avg:150.86ms step:487/1480 train_time:71962ms step_avg:150.86ms step:488/1480 train_time:72115ms step_avg:150.87ms step:489/1480 train_time:72269ms step_avg:150.87ms step:490/1480 train_time:72420ms step_avg:150.88ms step:491/1480 train_time:72573ms step_avg:150.88ms step:492/1480 train_time:72727ms step_avg:150.89ms step:493/1480 train_time:72882ms step_avg:150.89ms step:494/1480 train_time:73034ms step_avg:150.90ms step:495/1480 train_time:73188ms step_avg:150.90ms step:496/1480 train_time:73341ms step_avg:150.91ms step:497/1480 train_time:73493ms step_avg:150.91ms step:498/1480 train_time:73646ms step_avg:150.91ms step:499/1480 train_time:73799ms step_avg:150.92ms step:500/1480 train_time:73952ms step_avg:150.92ms step:500/1480 val_loss:3.6818 train_time:74022ms step_avg:151.06ms step:501/1480 train_time:74113ms step_avg:150.94ms step:502/1480 train_time:74262ms step_avg:150.94ms step:503/1480 train_time:74416ms step_avg:150.94ms step:504/1480 train_time:74567ms step_avg:150.95ms step:505/1480 train_time:74720ms step_avg:150.95ms step:506/1480 train_time:74873ms step_avg:150.95ms step:507/1480 train_time:75025ms step_avg:150.96ms step:508/1480 train_time:75180ms step_avg:150.96ms step:509/1480 train_time:75333ms step_avg:150.97ms step:510/1480 train_time:75486ms step_avg:150.97ms step:511/1480 train_time:75638ms step_avg:150.97ms step:512/1480 train_time:75795ms step_avg:150.99ms step:513/1480 train_time:75947ms step_avg:150.99ms step:514/1480 train_time:76101ms step_avg:150.99ms step:515/1480 train_time:76254ms step_avg:151.00ms step:516/1480 train_time:76407ms step_avg:151.00ms step:517/1480 train_time:76561ms step_avg:151.01ms step:518/1480 train_time:76713ms step_avg:151.01ms step:519/1480 train_time:76866ms step_avg:151.01ms step:520/1480 train_time:77019ms step_avg:151.02ms step:521/1480 train_time:77172ms step_avg:151.02ms step:522/1480 train_time:77325ms step_avg:151.02ms step:523/1480 train_time:77478ms step_avg:151.03ms step:524/1480 train_time:77630ms step_avg:151.03ms step:525/1480 train_time:77783ms step_avg:151.03ms step:526/1480 train_time:77936ms step_avg:151.04ms step:527/1480 train_time:78089ms step_avg:151.04ms step:528/1480 train_time:78242ms step_avg:151.05ms step:529/1480 train_time:78395ms step_avg:151.05ms step:530/1480 train_time:78549ms step_avg:151.06ms step:531/1480 train_time:78703ms step_avg:151.06ms step:532/1480 train_time:78856ms step_avg:151.06ms step:533/1480 train_time:79008ms step_avg:151.07ms step:534/1480 train_time:79161ms step_avg:151.07ms step:535/1480 train_time:79313ms step_avg:151.07ms step:536/1480 train_time:79465ms step_avg:151.07ms step:537/1480 train_time:79620ms step_avg:151.08ms step:538/1480 train_time:79774ms step_avg:151.09ms step:539/1480 train_time:79928ms step_avg:151.09ms step:540/1480 train_time:80082ms step_avg:151.10ms step:541/1480 train_time:80234ms step_avg:151.10ms step:542/1480 train_time:80386ms step_avg:151.10ms step:543/1480 train_time:80538ms step_avg:151.10ms step:544/1480 train_time:80691ms step_avg:151.11ms step:545/1480 train_time:80844ms step_avg:151.11ms step:546/1480 train_time:80998ms step_avg:151.12ms step:547/1480 train_time:81151ms step_avg:151.12ms step:548/1480 train_time:81303ms step_avg:151.12ms step:549/1480 train_time:81456ms step_avg:151.12ms step:550/1480 train_time:81608ms step_avg:151.13ms step:551/1480 train_time:81763ms step_avg:151.13ms step:552/1480 train_time:81918ms step_avg:151.14ms step:553/1480 train_time:82074ms step_avg:151.15ms step:554/1480 train_time:82229ms step_avg:151.16ms step:555/1480 train_time:82385ms step_avg:151.17ms step:556/1480 train_time:82539ms step_avg:151.17ms step:557/1480 train_time:82696ms step_avg:151.18ms step:558/1480 train_time:82851ms step_avg:151.19ms step:559/1480 train_time:83005ms step_avg:151.19ms step:560/1480 train_time:83159ms step_avg:151.20ms step:561/1480 train_time:83313ms step_avg:151.20ms step:562/1480 train_time:83467ms step_avg:151.21ms step:563/1480 train_time:83622ms step_avg:151.22ms step:564/1480 train_time:83778ms step_avg:151.22ms step:565/1480 train_time:83933ms step_avg:151.23ms step:566/1480 train_time:84088ms step_avg:151.24ms step:567/1480 train_time:84242ms step_avg:151.24ms step:568/1480 train_time:84398ms step_avg:151.25ms step:569/1480 train_time:84569ms step_avg:151.29ms step:570/1480 train_time:84707ms step_avg:151.26ms step:571/1480 train_time:84863ms step_avg:151.27ms step:572/1480 train_time:85018ms step_avg:151.28ms step:573/1480 train_time:85172ms step_avg:151.28ms step:574/1480 train_time:85328ms step_avg:151.29ms step:575/1480 train_time:85484ms step_avg:151.30ms step:576/1480 train_time:85637ms step_avg:151.30ms step:577/1480 train_time:85792ms step_avg:151.31ms step:578/1480 train_time:85946ms step_avg:151.31ms step:579/1480 train_time:86101ms step_avg:151.32ms step:580/1480 train_time:86255ms step_avg:151.33ms step:581/1480 train_time:86410ms step_avg:151.33ms step:582/1480 train_time:86564ms step_avg:151.33ms step:583/1480 train_time:86718ms step_avg:151.34ms step:584/1480 train_time:86873ms step_avg:151.35ms step:585/1480 train_time:87028ms step_avg:151.35ms step:586/1480 train_time:87184ms step_avg:151.36ms step:587/1480 train_time:87338ms step_avg:151.37ms step:588/1480 train_time:87493ms step_avg:151.37ms step:589/1480 train_time:87647ms step_avg:151.38ms step:590/1480 train_time:87802ms step_avg:151.38ms step:591/1480 train_time:87956ms step_avg:151.39ms step:592/1480 train_time:88110ms step_avg:151.39ms step:593/1480 train_time:88265ms step_avg:151.40ms step:594/1480 train_time:88422ms step_avg:151.41ms step:595/1480 train_time:88578ms step_avg:151.41ms step:596/1480 train_time:88734ms step_avg:151.42ms step:597/1480 train_time:88890ms step_avg:151.43ms step:598/1480 train_time:89043ms step_avg:151.43ms step:599/1480 train_time:89198ms step_avg:151.44ms step:600/1480 train_time:89353ms step_avg:151.45ms step:601/1480 train_time:89507ms step_avg:151.45ms step:602/1480 train_time:89661ms step_avg:151.45ms step:603/1480 train_time:89817ms step_avg:151.46ms step:604/1480 train_time:89973ms step_avg:151.47ms step:605/1480 train_time:90128ms step_avg:151.48ms step:606/1480 train_time:90284ms step_avg:151.48ms step:607/1480 train_time:90439ms step_avg:151.49ms step:608/1480 train_time:90593ms step_avg:151.49ms step:609/1480 train_time:90748ms step_avg:151.50ms step:610/1480 train_time:90903ms step_avg:151.51ms step:611/1480 train_time:91058ms step_avg:151.51ms step:612/1480 train_time:91213ms step_avg:151.52ms step:613/1480 train_time:91369ms step_avg:151.52ms step:614/1480 train_time:91523ms step_avg:151.53ms step:615/1480 train_time:91678ms step_avg:151.53ms step:616/1480 train_time:91832ms step_avg:151.54ms step:617/1480 train_time:91987ms step_avg:151.54ms step:618/1480 train_time:92140ms step_avg:151.55ms step:619/1480 train_time:92297ms step_avg:151.55ms step:620/1480 train_time:92452ms step_avg:151.56ms step:621/1480 train_time:92607ms step_avg:151.57ms step:622/1480 train_time:92762ms step_avg:151.57ms step:623/1480 train_time:92917ms step_avg:151.58ms step:624/1480 train_time:93073ms step_avg:151.58ms step:625/1480 train_time:93227ms step_avg:151.59ms step:625/1480 val_loss:3.6025 train_time:93298ms step_avg:151.70ms step:626/1480 train_time:93395ms step_avg:151.61ms step:627/1480 train_time:93544ms step_avg:151.61ms step:628/1480 train_time:93698ms step_avg:151.61ms step:629/1480 train_time:93851ms step_avg:151.62ms step:630/1480 train_time:94006ms step_avg:151.62ms step:631/1480 train_time:94160ms step_avg:151.63ms step:632/1480 train_time:94314ms step_avg:151.63ms step:633/1480 train_time:94469ms step_avg:151.64ms step:634/1480 train_time:94622ms step_avg:151.64ms step:635/1480 train_time:94778ms step_avg:151.64ms step:636/1480 train_time:94933ms step_avg:151.65ms step:637/1480 train_time:95088ms step_avg:151.66ms step:638/1480 train_time:95242ms step_avg:151.66ms step:639/1480 train_time:95396ms step_avg:151.66ms step:640/1480 train_time:95550ms step_avg:151.67ms step:641/1480 train_time:95706ms step_avg:151.67ms step:642/1480 train_time:95860ms step_avg:151.68ms step:643/1480 train_time:96014ms step_avg:151.68ms step:644/1480 train_time:96169ms step_avg:151.69ms step:645/1480 train_time:96322ms step_avg:151.69ms step:646/1480 train_time:96478ms step_avg:151.70ms step:647/1480 train_time:96633ms step_avg:151.70ms step:648/1480 train_time:96790ms step_avg:151.71ms step:649/1480 train_time:96944ms step_avg:151.71ms step:650/1480 train_time:97099ms step_avg:151.72ms step:651/1480 train_time:97255ms step_avg:151.72ms step:652/1480 train_time:97409ms step_avg:151.73ms step:653/1480 train_time:97565ms step_avg:151.73ms step:654/1480 train_time:97719ms step_avg:151.74ms step:655/1480 train_time:97874ms step_avg:151.74ms step:656/1480 train_time:98028ms step_avg:151.75ms step:657/1480 train_time:98184ms step_avg:151.75ms step:658/1480 train_time:98338ms step_avg:151.76ms step:659/1480 train_time:98492ms step_avg:151.76ms step:660/1480 train_time:98648ms step_avg:151.77ms step:661/1480 train_time:98806ms step_avg:151.78ms step:662/1480 train_time:98962ms step_avg:151.78ms step:663/1480 train_time:99117ms step_avg:151.79ms step:664/1480 train_time:99273ms step_avg:151.79ms step:665/1480 train_time:99431ms step_avg:151.80ms step:666/1480 train_time:99587ms step_avg:151.81ms step:667/1480 train_time:99744ms step_avg:151.82ms step:668/1480 train_time:99899ms step_avg:151.82ms step:669/1480 train_time:100056ms step_avg:151.83ms step:670/1480 train_time:100212ms step_avg:151.84ms step:671/1480 train_time:100368ms step_avg:151.84ms step:672/1480 train_time:100524ms step_avg:151.85ms step:673/1480 train_time:100681ms step_avg:151.86ms step:674/1480 train_time:100838ms step_avg:151.86ms step:675/1480 train_time:100995ms step_avg:151.87ms step:676/1480 train_time:101152ms step_avg:151.88ms step:677/1480 train_time:101308ms step_avg:151.89ms step:678/1480 train_time:101465ms step_avg:151.89ms step:679/1480 train_time:101620ms step_avg:151.90ms step:680/1480 train_time:101777ms step_avg:151.91ms step:681/1480 train_time:101933ms step_avg:151.91ms step:682/1480 train_time:102089ms step_avg:151.92ms step:683/1480 train_time:102246ms step_avg:151.93ms step:684/1480 train_time:102405ms step_avg:151.94ms step:685/1480 train_time:102562ms step_avg:151.94ms step:686/1480 train_time:102718ms step_avg:151.95ms step:687/1480 train_time:102875ms step_avg:151.96ms step:688/1480 train_time:103032ms step_avg:151.96ms step:689/1480 train_time:103189ms step_avg:151.97ms step:690/1480 train_time:103348ms step_avg:151.98ms step:691/1480 train_time:103504ms step_avg:151.99ms step:692/1480 train_time:103660ms step_avg:151.99ms step:693/1480 train_time:103816ms step_avg:152.00ms step:694/1480 train_time:103974ms step_avg:152.01ms step:695/1480 train_time:104129ms step_avg:152.01ms step:696/1480 train_time:104286ms step_avg:152.02ms step:697/1480 train_time:104443ms step_avg:152.03ms step:698/1480 train_time:104598ms step_avg:152.03ms step:699/1480 train_time:104754ms step_avg:152.04ms step:700/1480 train_time:104911ms step_avg:152.04ms step:701/1480 train_time:105066ms step_avg:152.05ms step:702/1480 train_time:105223ms step_avg:152.06ms step:703/1480 train_time:105379ms step_avg:152.06ms step:704/1480 train_time:105536ms step_avg:152.07ms step:705/1480 train_time:105692ms step_avg:152.07ms step:706/1480 train_time:105850ms step_avg:152.08ms step:707/1480 train_time:106007ms step_avg:152.09ms step:708/1480 train_time:106163ms step_avg:152.10ms step:709/1480 train_time:106320ms step_avg:152.10ms step:710/1480 train_time:106474ms step_avg:152.11ms step:711/1480 train_time:106630ms step_avg:152.11ms step:712/1480 train_time:106788ms step_avg:152.12ms step:713/1480 train_time:106945ms step_avg:152.13ms step:714/1480 train_time:107101ms step_avg:152.13ms step:715/1480 train_time:107256ms step_avg:152.14ms step:716/1480 train_time:107411ms step_avg:152.14ms step:717/1480 train_time:107568ms step_avg:152.15ms step:718/1480 train_time:107723ms step_avg:152.15ms step:719/1480 train_time:107880ms step_avg:152.16ms step:720/1480 train_time:108037ms step_avg:152.17ms step:721/1480 train_time:108194ms step_avg:152.17ms step:722/1480 train_time:108350ms step_avg:152.18ms step:723/1480 train_time:108506ms step_avg:152.18ms step:724/1480 train_time:108664ms step_avg:152.19ms step:725/1480 train_time:108820ms step_avg:152.20ms step:726/1480 train_time:108977ms step_avg:152.20ms step:727/1480 train_time:109133ms step_avg:152.21ms step:728/1480 train_time:109288ms step_avg:152.21ms step:729/1480 train_time:109445ms step_avg:152.22ms step:730/1480 train_time:109603ms step_avg:152.23ms step:731/1480 train_time:109759ms step_avg:152.23ms step:732/1480 train_time:109915ms step_avg:152.24ms step:733/1480 train_time:110071ms step_avg:152.24ms step:734/1480 train_time:110227ms step_avg:152.25ms step:735/1480 train_time:110386ms step_avg:152.26ms step:736/1480 train_time:110542ms step_avg:152.26ms step:737/1480 train_time:110698ms step_avg:152.27ms step:738/1480 train_time:110853ms step_avg:152.27ms step:739/1480 train_time:111010ms step_avg:152.28ms step:740/1480 train_time:111169ms step_avg:152.29ms step:741/1480 train_time:111326ms step_avg:152.29ms step:742/1480 train_time:111483ms step_avg:152.30ms step:743/1480 train_time:111640ms step_avg:152.31ms step:744/1480 train_time:111795ms step_avg:152.31ms step:745/1480 train_time:111952ms step_avg:152.32ms step:746/1480 train_time:112109ms step_avg:152.32ms step:747/1480 train_time:112266ms step_avg:152.33ms step:748/1480 train_time:112423ms step_avg:152.33ms step:749/1480 train_time:112582ms step_avg:152.34ms step:750/1480 train_time:112737ms step_avg:152.35ms step:750/1480 val_loss:3.5477 train_time:112809ms step_avg:152.44ms step:751/1480 train_time:112907ms step_avg:152.37ms step:752/1480 train_time:113057ms step_avg:152.37ms step:753/1480 train_time:113214ms step_avg:152.37ms step:754/1480 train_time:113369ms step_avg:152.38ms step:755/1480 train_time:113525ms step_avg:152.38ms step:756/1480 train_time:113680ms step_avg:152.39ms step:757/1480 train_time:113838ms step_avg:152.39ms step:758/1480 train_time:113994ms step_avg:152.40ms step:759/1480 train_time:114166ms step_avg:152.43ms step:760/1480 train_time:114308ms step_avg:152.41ms step:761/1480 train_time:114465ms step_avg:152.42ms step:762/1480 train_time:114621ms step_avg:152.42ms step:763/1480 train_time:114777ms step_avg:152.43ms step:764/1480 train_time:114934ms step_avg:152.43ms step:765/1480 train_time:115091ms step_avg:152.44ms step:766/1480 train_time:115250ms step_avg:152.45ms step:767/1480 train_time:115406ms step_avg:152.45ms step:768/1480 train_time:115563ms step_avg:152.46ms step:769/1480 train_time:115721ms step_avg:152.47ms step:770/1480 train_time:115878ms step_avg:152.47ms step:771/1480 train_time:116036ms step_avg:152.48ms step:772/1480 train_time:116194ms step_avg:152.49ms step:773/1480 train_time:116352ms step_avg:152.49ms step:774/1480 train_time:116509ms step_avg:152.50ms step:775/1480 train_time:116667ms step_avg:152.51ms step:776/1480 train_time:116825ms step_avg:152.51ms step:777/1480 train_time:116984ms step_avg:152.52ms step:778/1480 train_time:117142ms step_avg:152.53ms step:779/1480 train_time:117300ms step_avg:152.54ms step:780/1480 train_time:117459ms step_avg:152.54ms step:781/1480 train_time:117618ms step_avg:152.55ms step:782/1480 train_time:117776ms step_avg:152.56ms step:783/1480 train_time:117933ms step_avg:152.57ms step:784/1480 train_time:118091ms step_avg:152.57ms step:785/1480 train_time:118248ms step_avg:152.58ms step:786/1480 train_time:118404ms step_avg:152.58ms step:787/1480 train_time:118563ms step_avg:152.59ms step:788/1480 train_time:118722ms step_avg:152.60ms step:789/1480 train_time:118879ms step_avg:152.61ms step:790/1480 train_time:119039ms step_avg:152.61ms step:791/1480 train_time:119199ms step_avg:152.62ms step:792/1480 train_time:119357ms step_avg:152.63ms step:793/1480 train_time:119515ms step_avg:152.64ms step:794/1480 train_time:119672ms step_avg:152.64ms step:795/1480 train_time:119835ms step_avg:152.66ms step:796/1480 train_time:119995ms step_avg:152.67ms step:797/1480 train_time:120155ms step_avg:152.67ms step:798/1480 train_time:120313ms step_avg:152.68ms step:799/1480 train_time:120476ms step_avg:152.69ms step:800/1480 train_time:120634ms step_avg:152.70ms step:801/1480 train_time:120791ms step_avg:152.71ms step:802/1480 train_time:120951ms step_avg:152.72ms step:803/1480 train_time:121108ms step_avg:152.72ms step:804/1480 train_time:121265ms step_avg:152.73ms step:805/1480 train_time:121426ms step_avg:152.74ms step:806/1480 train_time:121583ms step_avg:152.74ms step:807/1480 train_time:121740ms step_avg:152.75ms step:808/1480 train_time:121899ms step_avg:152.76ms step:809/1480 train_time:122057ms step_avg:152.76ms step:810/1480 train_time:122215ms step_avg:152.77ms step:811/1480 train_time:122373ms step_avg:152.77ms step:812/1480 train_time:122530ms step_avg:152.78ms step:813/1480 train_time:122686ms step_avg:152.79ms step:814/1480 train_time:122845ms step_avg:152.79ms step:815/1480 train_time:123002ms step_avg:152.80ms step:816/1480 train_time:123164ms step_avg:152.81ms step:817/1480 train_time:123321ms step_avg:152.81ms step:818/1480 train_time:123478ms step_avg:152.82ms step:819/1480 train_time:123636ms step_avg:152.83ms step:820/1480 train_time:123795ms step_avg:152.83ms step:821/1480 train_time:123953ms step_avg:152.84ms step:822/1480 train_time:124109ms step_avg:152.84ms step:823/1480 train_time:124267ms step_avg:152.85ms step:824/1480 train_time:124424ms step_avg:152.86ms step:825/1480 train_time:124583ms step_avg:152.86ms step:826/1480 train_time:124742ms step_avg:152.87ms step:827/1480 train_time:124901ms step_avg:152.88ms step:828/1480 train_time:125059ms step_avg:152.88ms step:829/1480 train_time:125219ms step_avg:152.89ms step:830/1480 train_time:125380ms step_avg:152.90ms step:831/1480 train_time:125539ms step_avg:152.91ms step:832/1480 train_time:125696ms step_avg:152.92ms step:833/1480 train_time:125854ms step_avg:152.92ms step:834/1480 train_time:126013ms step_avg:152.93ms step:835/1480 train_time:126171ms step_avg:152.93ms step:836/1480 train_time:126329ms step_avg:152.94ms step:837/1480 train_time:126486ms step_avg:152.95ms step:838/1480 train_time:126645ms step_avg:152.95ms step:839/1480 train_time:126802ms step_avg:152.96ms step:840/1480 train_time:126962ms step_avg:152.97ms step:841/1480 train_time:127119ms step_avg:152.97ms step:842/1480 train_time:127277ms step_avg:152.98ms step:843/1480 train_time:127434ms step_avg:152.98ms step:844/1480 train_time:127592ms step_avg:152.99ms step:845/1480 train_time:127749ms step_avg:152.99ms step:846/1480 train_time:127909ms step_avg:153.00ms step:847/1480 train_time:128067ms step_avg:153.01ms step:848/1480 train_time:128225ms step_avg:153.01ms step:849/1480 train_time:128382ms step_avg:153.02ms step:850/1480 train_time:128539ms step_avg:153.02ms step:851/1480 train_time:128698ms step_avg:153.03ms step:852/1480 train_time:128857ms step_avg:153.04ms step:853/1480 train_time:129014ms step_avg:153.04ms step:854/1480 train_time:129172ms step_avg:153.05ms step:855/1480 train_time:129329ms step_avg:153.05ms step:856/1480 train_time:129486ms step_avg:153.06ms step:857/1480 train_time:129644ms step_avg:153.06ms step:858/1480 train_time:129803ms step_avg:153.07ms step:859/1480 train_time:129962ms step_avg:153.08ms step:860/1480 train_time:130120ms step_avg:153.08ms step:861/1480 train_time:130278ms step_avg:153.09ms step:862/1480 train_time:130442ms step_avg:153.10ms step:863/1480 train_time:130602ms step_avg:153.11ms step:864/1480 train_time:130760ms step_avg:153.12ms step:865/1480 train_time:130917ms step_avg:153.12ms step:866/1480 train_time:131075ms step_avg:153.13ms step:867/1480 train_time:131235ms step_avg:153.13ms step:868/1480 train_time:131392ms step_avg:153.14ms step:869/1480 train_time:131549ms step_avg:153.14ms step:870/1480 train_time:131708ms step_avg:153.15ms step:871/1480 train_time:131864ms step_avg:153.15ms step:872/1480 train_time:132023ms step_avg:153.16ms step:873/1480 train_time:132181ms step_avg:153.16ms step:874/1480 train_time:132341ms step_avg:153.17ms step:875/1480 train_time:132501ms step_avg:153.18ms step:875/1480 val_loss:3.5036 train_time:132574ms step_avg:153.26ms step:876/1480 train_time:132666ms step_avg:153.19ms step:877/1480 train_time:132820ms step_avg:153.19ms step:878/1480 train_time:132977ms step_avg:153.20ms step:879/1480 train_time:133136ms step_avg:153.21ms step:880/1480 train_time:133295ms step_avg:153.21ms step:881/1480 train_time:133452ms step_avg:153.22ms step:882/1480 train_time:133610ms step_avg:153.22ms step:883/1480 train_time:133770ms step_avg:153.23ms step:884/1480 train_time:133932ms step_avg:153.24ms step:885/1480 train_time:134092ms step_avg:153.25ms step:886/1480 train_time:134251ms step_avg:153.25ms step:887/1480 train_time:134413ms step_avg:153.26ms step:888/1480 train_time:134576ms step_avg:153.28ms step:889/1480 train_time:134736ms step_avg:153.28ms step:890/1480 train_time:134894ms step_avg:153.29ms step:891/1480 train_time:135052ms step_avg:153.29ms step:892/1480 train_time:135212ms step_avg:153.30ms step:893/1480 train_time:135370ms step_avg:153.31ms step:894/1480 train_time:135530ms step_avg:153.31ms step:895/1480 train_time:135692ms step_avg:153.32ms step:896/1480 train_time:135851ms step_avg:153.33ms step:897/1480 train_time:136013ms step_avg:153.34ms step:898/1480 train_time:136171ms step_avg:153.35ms step:899/1480 train_time:136331ms step_avg:153.35ms step:900/1480 train_time:136489ms step_avg:153.36ms step:901/1480 train_time:136648ms step_avg:153.36ms step:902/1480 train_time:136805ms step_avg:153.37ms step:903/1480 train_time:136968ms step_avg:153.38ms step:904/1480 train_time:137129ms step_avg:153.39ms step:905/1480 train_time:137288ms step_avg:153.39ms step:906/1480 train_time:137446ms step_avg:153.40ms step:907/1480 train_time:137609ms step_avg:153.41ms step:908/1480 train_time:137767ms step_avg:153.42ms step:909/1480 train_time:137926ms step_avg:153.42ms step:910/1480 train_time:138090ms step_avg:153.43ms step:911/1480 train_time:138249ms step_avg:153.44ms step:912/1480 train_time:138409ms step_avg:153.45ms step:913/1480 train_time:138570ms step_avg:153.46ms step:914/1480 train_time:138731ms step_avg:153.46ms step:915/1480 train_time:138895ms step_avg:153.47ms step:916/1480 train_time:139055ms step_avg:153.48ms step:917/1480 train_time:139213ms step_avg:153.49ms step:918/1480 train_time:139373ms step_avg:153.49ms step:919/1480 train_time:139534ms step_avg:153.50ms step:920/1480 train_time:139693ms step_avg:153.51ms step:921/1480 train_time:139852ms step_avg:153.52ms step:922/1480 train_time:140015ms step_avg:153.53ms step:923/1480 train_time:140173ms step_avg:153.53ms step:924/1480 train_time:140333ms step_avg:153.54ms step:925/1480 train_time:140492ms step_avg:153.54ms step:926/1480 train_time:140649ms step_avg:153.55ms step:927/1480 train_time:140807ms step_avg:153.55ms step:928/1480 train_time:140967ms step_avg:153.56ms step:929/1480 train_time:141128ms step_avg:153.57ms step:930/1480 train_time:141287ms step_avg:153.57ms step:931/1480 train_time:141445ms step_avg:153.58ms step:932/1480 train_time:141605ms step_avg:153.58ms step:933/1480 train_time:141764ms step_avg:153.59ms step:934/1480 train_time:141922ms step_avg:153.60ms step:935/1480 train_time:142082ms step_avg:153.60ms step:936/1480 train_time:142242ms step_avg:153.61ms step:937/1480 train_time:142403ms step_avg:153.62ms step:938/1480 train_time:142561ms step_avg:153.62ms step:939/1480 train_time:142723ms step_avg:153.63ms step:940/1480 train_time:142884ms step_avg:153.64ms step:941/1480 train_time:143042ms step_avg:153.64ms step:942/1480 train_time:143199ms step_avg:153.65ms step:943/1480 train_time:143358ms step_avg:153.65ms step:944/1480 train_time:143520ms step_avg:153.66ms step:945/1480 train_time:143678ms step_avg:153.67ms step:946/1480 train_time:143841ms step_avg:153.68ms step:947/1480 train_time:144001ms step_avg:153.68ms step:948/1480 train_time:144164ms step_avg:153.69ms step:949/1480 train_time:144337ms step_avg:153.71ms step:950/1480 train_time:144484ms step_avg:153.71ms step:951/1480 train_time:144646ms step_avg:153.72ms step:952/1480 train_time:144806ms step_avg:153.72ms step:953/1480 train_time:144966ms step_avg:153.73ms step:954/1480 train_time:145129ms step_avg:153.74ms step:955/1480 train_time:145288ms step_avg:153.74ms step:956/1480 train_time:145446ms step_avg:153.75ms step:957/1480 train_time:145609ms step_avg:153.76ms step:958/1480 train_time:145773ms step_avg:153.77ms step:959/1480 train_time:145933ms step_avg:153.78ms step:960/1480 train_time:146093ms step_avg:153.78ms step:961/1480 train_time:146252ms step_avg:153.79ms step:962/1480 train_time:146411ms step_avg:153.79ms step:963/1480 train_time:146571ms step_avg:153.80ms step:964/1480 train_time:146734ms step_avg:153.81ms step:965/1480 train_time:146893ms step_avg:153.81ms step:966/1480 train_time:147052ms step_avg:153.82ms step:967/1480 train_time:147210ms step_avg:153.82ms step:968/1480 train_time:147370ms step_avg:153.83ms step:969/1480 train_time:147531ms step_avg:153.84ms step:970/1480 train_time:147690ms step_avg:153.84ms step:971/1480 train_time:147848ms step_avg:153.85ms step:972/1480 train_time:148008ms step_avg:153.85ms step:973/1480 train_time:148166ms step_avg:153.86ms step:974/1480 train_time:148327ms step_avg:153.87ms step:975/1480 train_time:148489ms step_avg:153.87ms step:976/1480 train_time:148649ms step_avg:153.88ms step:977/1480 train_time:148808ms step_avg:153.89ms step:978/1480 train_time:148968ms step_avg:153.89ms step:979/1480 train_time:149129ms step_avg:153.90ms step:980/1480 train_time:149289ms step_avg:153.91ms step:981/1480 train_time:149451ms step_avg:153.91ms step:982/1480 train_time:149610ms step_avg:153.92ms step:983/1480 train_time:149770ms step_avg:153.93ms step:984/1480 train_time:149929ms step_avg:153.93ms step:985/1480 train_time:150091ms step_avg:153.94ms step:986/1480 train_time:150250ms step_avg:153.94ms step:987/1480 train_time:150409ms step_avg:153.95ms step:988/1480 train_time:150569ms step_avg:153.96ms step:989/1480 train_time:150730ms step_avg:153.96ms step:990/1480 train_time:150891ms step_avg:153.97ms step:991/1480 train_time:151052ms step_avg:153.98ms step:992/1480 train_time:151218ms step_avg:153.99ms step:993/1480 train_time:151387ms step_avg:154.01ms step:994/1480 train_time:151547ms step_avg:154.01ms step:995/1480 train_time:151707ms step_avg:154.02ms step:996/1480 train_time:151865ms step_avg:154.02ms step:997/1480 train_time:152024ms step_avg:154.03ms step:998/1480 train_time:152182ms step_avg:154.03ms step:999/1480 train_time:152343ms step_avg:154.04ms step:1000/1480 train_time:152503ms step_avg:154.04ms step:1000/1480 val_loss:3.4391 train_time:152577ms step_avg:154.12ms step:1001/1480 train_time:152668ms step_avg:154.05ms step:1002/1480 train_time:152826ms step_avg:154.06ms step:1003/1480 train_time:152990ms step_avg:154.07ms step:1004/1480 train_time:153151ms step_avg:154.08ms step:1005/1480 train_time:153311ms step_avg:154.08ms step:1006/1480 train_time:153472ms step_avg:154.09ms step:1007/1480 train_time:153631ms step_avg:154.09ms step:1008/1480 train_time:153793ms step_avg:154.10ms step:1009/1480 train_time:153956ms step_avg:154.11ms step:1010/1480 train_time:154115ms step_avg:154.11ms step:1011/1480 train_time:154274ms step_avg:154.12ms step:1012/1480 train_time:154432ms step_avg:154.12ms step:1013/1480 train_time:154594ms step_avg:154.13ms step:1014/1480 train_time:154754ms step_avg:154.14ms step:1015/1480 train_time:154916ms step_avg:154.14ms step:1016/1480 train_time:155075ms step_avg:154.15ms step:1017/1480 train_time:155236ms step_avg:154.16ms step:1018/1480 train_time:155396ms step_avg:154.16ms step:1019/1480 train_time:155558ms step_avg:154.17ms step:1020/1480 train_time:155720ms step_avg:154.18ms step:1021/1480 train_time:155880ms step_avg:154.18ms step:1022/1480 train_time:156039ms step_avg:154.19ms step:1023/1480 train_time:156203ms step_avg:154.20ms step:1024/1480 train_time:156364ms step_avg:154.20ms step:1025/1480 train_time:156526ms step_avg:154.21ms step:1026/1480 train_time:156687ms step_avg:154.22ms step:1027/1480 train_time:156847ms step_avg:154.23ms step:1028/1480 train_time:157009ms step_avg:154.23ms step:1029/1480 train_time:157174ms step_avg:154.24ms step:1030/1480 train_time:157334ms step_avg:154.25ms step:1031/1480 train_time:157493ms step_avg:154.25ms step:1032/1480 train_time:157656ms step_avg:154.26ms step:1033/1480 train_time:157815ms step_avg:154.27ms step:1034/1480 train_time:157975ms step_avg:154.27ms step:1035/1480 train_time:158136ms step_avg:154.28ms step:1036/1480 train_time:158298ms step_avg:154.29ms step:1037/1480 train_time:158460ms step_avg:154.29ms step:1038/1480 train_time:158620ms step_avg:154.30ms step:1039/1480 train_time:158783ms step_avg:154.31ms step:1040/1480 train_time:158943ms step_avg:154.31ms step:1041/1480 train_time:159103ms step_avg:154.32ms step:1042/1480 train_time:159263ms step_avg:154.32ms step:1043/1480 train_time:159422ms step_avg:154.33ms step:1044/1480 train_time:159581ms step_avg:154.33ms step:1045/1480 train_time:159741ms step_avg:154.34ms step:1046/1480 train_time:159903ms step_avg:154.35ms step:1047/1480 train_time:160064ms step_avg:154.35ms step:1048/1480 train_time:160225ms step_avg:154.36ms step:1049/1480 train_time:160385ms step_avg:154.37ms step:1050/1480 train_time:160547ms step_avg:154.37ms step:1051/1480 train_time:160709ms step_avg:154.38ms step:1052/1480 train_time:160870ms step_avg:154.39ms step:1053/1480 train_time:161030ms step_avg:154.39ms step:1054/1480 train_time:161192ms step_avg:154.40ms step:1055/1480 train_time:161352ms step_avg:154.40ms step:1056/1480 train_time:161510ms step_avg:154.41ms step:1057/1480 train_time:161671ms step_avg:154.41ms step:1058/1480 train_time:161831ms step_avg:154.42ms step:1059/1480 train_time:161994ms step_avg:154.43ms step:1060/1480 train_time:162155ms step_avg:154.43ms step:1061/1480 train_time:162313ms step_avg:154.44ms step:1062/1480 train_time:162473ms step_avg:154.44ms step:1063/1480 train_time:162632ms step_avg:154.45ms step:1064/1480 train_time:162791ms step_avg:154.45ms step:1065/1480 train_time:162951ms step_avg:154.46ms step:1066/1480 train_time:163113ms step_avg:154.46ms step:1067/1480 train_time:163275ms step_avg:154.47ms step:1068/1480 train_time:163434ms step_avg:154.47ms step:1069/1480 train_time:163600ms step_avg:154.49ms step:1070/1480 train_time:163759ms step_avg:154.49ms step:1071/1480 train_time:163925ms step_avg:154.50ms step:1072/1480 train_time:164085ms step_avg:154.51ms step:1073/1480 train_time:164244ms step_avg:154.51ms step:1074/1480 train_time:164402ms step_avg:154.51ms step:1075/1480 train_time:164565ms step_avg:154.52ms step:1076/1480 train_time:164725ms step_avg:154.53ms step:1077/1480 train_time:164886ms step_avg:154.53ms step:1078/1480 train_time:165051ms step_avg:154.54ms step:1079/1480 train_time:165214ms step_avg:154.55ms step:1080/1480 train_time:165375ms step_avg:154.56ms step:1081/1480 train_time:165534ms step_avg:154.56ms step:1082/1480 train_time:165693ms step_avg:154.56ms step:1083/1480 train_time:165853ms step_avg:154.57ms step:1084/1480 train_time:166013ms step_avg:154.57ms step:1085/1480 train_time:166172ms step_avg:154.58ms step:1086/1480 train_time:166331ms step_avg:154.58ms step:1087/1480 train_time:166492ms step_avg:154.59ms step:1088/1480 train_time:166653ms step_avg:154.59ms step:1089/1480 train_time:166816ms step_avg:154.60ms step:1090/1480 train_time:166980ms step_avg:154.61ms step:1091/1480 train_time:167141ms step_avg:154.62ms step:1092/1480 train_time:167300ms step_avg:154.62ms step:1093/1480 train_time:167461ms step_avg:154.63ms step:1094/1480 train_time:167623ms step_avg:154.63ms step:1095/1480 train_time:167785ms step_avg:154.64ms step:1096/1480 train_time:167945ms step_avg:154.65ms step:1097/1480 train_time:168107ms step_avg:154.65ms step:1098/1480 train_time:168269ms step_avg:154.66ms step:1099/1480 train_time:168428ms step_avg:154.66ms step:1100/1480 train_time:168594ms step_avg:154.67ms step:1101/1480 train_time:168756ms step_avg:154.68ms step:1102/1480 train_time:168918ms step_avg:154.69ms step:1103/1480 train_time:169085ms step_avg:154.70ms step:1104/1480 train_time:169248ms step_avg:154.71ms step:1105/1480 train_time:169410ms step_avg:154.71ms step:1106/1480 train_time:169571ms step_avg:154.72ms step:1107/1480 train_time:169732ms step_avg:154.72ms step:1108/1480 train_time:169892ms step_avg:154.73ms step:1109/1480 train_time:170053ms step_avg:154.73ms step:1110/1480 train_time:170213ms step_avg:154.74ms step:1111/1480 train_time:170375ms step_avg:154.75ms step:1112/1480 train_time:170537ms step_avg:154.75ms step:1113/1480 train_time:170705ms step_avg:154.76ms step:1114/1480 train_time:170869ms step_avg:154.77ms step:1115/1480 train_time:171030ms step_avg:154.78ms step:1116/1480 train_time:171191ms step_avg:154.78ms step:1117/1480 train_time:171354ms step_avg:154.79ms step:1118/1480 train_time:171518ms step_avg:154.80ms step:1119/1480 train_time:171680ms step_avg:154.81ms step:1120/1480 train_time:171840ms step_avg:154.81ms step:1121/1480 train_time:172002ms step_avg:154.82ms step:1122/1480 train_time:172163ms step_avg:154.82ms step:1123/1480 train_time:172324ms step_avg:154.83ms step:1124/1480 train_time:172487ms step_avg:154.84ms step:1125/1480 train_time:172649ms step_avg:154.84ms step:1125/1480 val_loss:3.3836 train_time:172724ms step_avg:154.91ms step:1126/1480 train_time:172815ms step_avg:154.85ms step:1127/1480 train_time:172976ms step_avg:154.86ms step:1128/1480 train_time:173138ms step_avg:154.86ms step:1129/1480 train_time:173301ms step_avg:154.87ms step:1130/1480 train_time:173461ms step_avg:154.88ms step:1131/1480 train_time:173628ms step_avg:154.89ms step:1132/1480 train_time:173788ms step_avg:154.89ms step:1133/1480 train_time:173951ms step_avg:154.90ms step:1134/1480 train_time:174115ms step_avg:154.91ms step:1135/1480 train_time:174276ms step_avg:154.91ms step:1136/1480 train_time:174439ms step_avg:154.92ms step:1137/1480 train_time:174600ms step_avg:154.92ms step:1138/1480 train_time:174764ms step_avg:154.93ms step:1139/1480 train_time:174938ms step_avg:154.95ms step:1140/1480 train_time:175086ms step_avg:154.94ms step:1141/1480 train_time:175250ms step_avg:154.95ms step:1142/1480 train_time:175411ms step_avg:154.96ms step:1143/1480 train_time:175575ms step_avg:154.97ms step:1144/1480 train_time:175738ms step_avg:154.97ms step:1145/1480 train_time:175897ms step_avg:154.98ms step:1146/1480 train_time:176060ms step_avg:154.98ms step:1147/1480 train_time:176222ms step_avg:154.99ms step:1148/1480 train_time:176382ms step_avg:154.99ms step:1149/1480 train_time:176546ms step_avg:155.00ms step:1150/1480 train_time:176706ms step_avg:155.01ms step:1151/1480 train_time:176874ms step_avg:155.02ms step:1152/1480 train_time:177037ms step_avg:155.02ms step:1153/1480 train_time:177202ms step_avg:155.03ms step:1154/1480 train_time:177362ms step_avg:155.04ms step:1155/1480 train_time:177523ms step_avg:155.04ms step:1156/1480 train_time:177690ms step_avg:155.05ms step:1157/1480 train_time:177854ms step_avg:155.06ms step:1158/1480 train_time:178016ms step_avg:155.07ms step:1159/1480 train_time:178176ms step_avg:155.07ms step:1160/1480 train_time:178338ms step_avg:155.08ms step:1161/1480 train_time:178500ms step_avg:155.08ms step:1162/1480 train_time:178662ms step_avg:155.09ms step:1163/1480 train_time:178824ms step_avg:155.09ms step:1164/1480 train_time:178985ms step_avg:155.10ms step:1165/1480 train_time:179144ms step_avg:155.10ms step:1166/1480 train_time:179308ms step_avg:155.11ms step:1167/1480 train_time:179470ms step_avg:155.12ms step:1168/1480 train_time:179633ms step_avg:155.12ms step:1169/1480 train_time:179794ms step_avg:155.13ms step:1170/1480 train_time:179955ms step_avg:155.13ms step:1171/1480 train_time:180115ms step_avg:155.14ms step:1172/1480 train_time:180275ms step_avg:155.14ms step:1173/1480 train_time:180439ms step_avg:155.15ms step:1174/1480 train_time:180610ms step_avg:155.16ms step:1175/1480 train_time:180773ms step_avg:155.17ms step:1176/1480 train_time:180936ms step_avg:155.18ms step:1177/1480 train_time:181102ms step_avg:155.19ms step:1178/1480 train_time:181263ms step_avg:155.19ms step:1179/1480 train_time:181423ms step_avg:155.20ms step:1180/1480 train_time:181593ms step_avg:155.21ms step:1181/1480 train_time:181757ms step_avg:155.22ms step:1182/1480 train_time:181918ms step_avg:155.22ms step:1183/1480 train_time:182079ms step_avg:155.23ms step:1184/1480 train_time:182240ms step_avg:155.23ms step:1185/1480 train_time:182405ms step_avg:155.24ms step:1186/1480 train_time:182567ms step_avg:155.24ms step:1187/1480 train_time:182741ms step_avg:155.26ms step:1188/1480 train_time:182900ms step_avg:155.26ms step:1189/1480 train_time:183062ms step_avg:155.27ms step:1190/1480 train_time:183224ms step_avg:155.27ms step:1191/1480 train_time:183386ms step_avg:155.28ms step:1192/1480 train_time:183547ms step_avg:155.28ms step:1193/1480 train_time:183707ms step_avg:155.29ms step:1194/1480 train_time:183871ms step_avg:155.30ms step:1195/1480 train_time:184036ms step_avg:155.30ms step:1196/1480 train_time:184207ms step_avg:155.32ms step:1197/1480 train_time:184368ms step_avg:155.32ms step:1198/1480 train_time:184538ms step_avg:155.33ms step:1199/1480 train_time:184700ms step_avg:155.34ms step:1200/1480 train_time:184862ms step_avg:155.35ms step:1201/1480 train_time:185023ms step_avg:155.35ms step:1202/1480 train_time:185192ms step_avg:155.36ms step:1203/1480 train_time:185359ms step_avg:155.37ms step:1204/1480 train_time:185522ms step_avg:155.38ms step:1205/1480 train_time:185683ms step_avg:155.38ms step:1206/1480 train_time:185844ms step_avg:155.39ms step:1207/1480 train_time:186004ms step_avg:155.39ms step:1208/1480 train_time:186164ms step_avg:155.40ms step:1209/1480 train_time:186326ms step_avg:155.40ms step:1210/1480 train_time:186495ms step_avg:155.41ms step:1211/1480 train_time:186658ms step_avg:155.42ms step:1212/1480 train_time:186820ms step_avg:155.42ms step:1213/1480 train_time:186983ms step_avg:155.43ms step:1214/1480 train_time:187149ms step_avg:155.44ms step:1215/1480 train_time:187315ms step_avg:155.45ms step:1216/1480 train_time:187476ms step_avg:155.45ms step:1217/1480 train_time:187641ms step_avg:155.46ms step:1218/1480 train_time:187802ms step_avg:155.47ms step:1219/1480 train_time:187970ms step_avg:155.48ms step:1220/1480 train_time:188134ms step_avg:155.48ms step:1221/1480 train_time:188295ms step_avg:155.49ms step:1222/1480 train_time:188455ms step_avg:155.49ms step:1223/1480 train_time:188619ms step_avg:155.50ms step:1224/1480 train_time:188785ms step_avg:155.51ms step:1225/1480 train_time:188948ms step_avg:155.51ms step:1226/1480 train_time:189114ms step_avg:155.52ms step:1227/1480 train_time:189279ms step_avg:155.53ms step:1228/1480 train_time:189440ms step_avg:155.53ms step:1229/1480 train_time:189603ms step_avg:155.54ms step:1230/1480 train_time:189773ms step_avg:155.55ms step:1231/1480 train_time:189940ms step_avg:155.56ms step:1232/1480 train_time:190106ms step_avg:155.57ms step:1233/1480 train_time:190266ms step_avg:155.57ms step:1234/1480 train_time:190430ms step_avg:155.58ms step:1235/1480 train_time:190597ms step_avg:155.59ms step:1236/1480 train_time:190758ms step_avg:155.59ms step:1237/1480 train_time:190919ms step_avg:155.60ms step:1238/1480 train_time:191092ms step_avg:155.61ms step:1239/1480 train_time:191255ms step_avg:155.62ms step:1240/1480 train_time:191417ms step_avg:155.62ms step:1241/1480 train_time:191580ms step_avg:155.63ms step:1242/1480 train_time:191741ms step_avg:155.63ms step:1243/1480 train_time:191904ms step_avg:155.64ms step:1244/1480 train_time:192064ms step_avg:155.64ms step:1245/1480 train_time:192228ms step_avg:155.65ms step:1246/1480 train_time:192392ms step_avg:155.66ms step:1247/1480 train_time:192554ms step_avg:155.66ms step:1248/1480 train_time:192717ms step_avg:155.67ms step:1249/1480 train_time:192877ms step_avg:155.67ms step:1250/1480 train_time:193040ms step_avg:155.68ms step:1250/1480 val_loss:3.3344 train_time:193115ms step_avg:155.74ms step:1251/1480 train_time:193208ms step_avg:155.69ms step:1252/1480 train_time:193371ms step_avg:155.69ms step:1253/1480 train_time:193532ms step_avg:155.70ms step:1254/1480 train_time:193693ms step_avg:155.70ms step:1255/1480 train_time:193864ms step_avg:155.71ms step:1256/1480 train_time:194028ms step_avg:155.72ms step:1257/1480 train_time:194190ms step_avg:155.73ms step:1258/1480 train_time:194356ms step_avg:155.73ms step:1259/1480 train_time:194519ms step_avg:155.74ms step:1260/1480 train_time:194679ms step_avg:155.74ms step:1261/1480 train_time:194842ms step_avg:155.75ms step:1262/1480 train_time:195006ms step_avg:155.76ms step:1263/1480 train_time:195171ms step_avg:155.76ms step:1264/1480 train_time:195332ms step_avg:155.77ms step:1265/1480 train_time:195491ms step_avg:155.77ms step:1266/1480 train_time:195655ms step_avg:155.78ms step:1267/1480 train_time:195816ms step_avg:155.78ms step:1268/1480 train_time:195980ms step_avg:155.79ms step:1269/1480 train_time:196146ms step_avg:155.79ms step:1270/1480 train_time:196307ms step_avg:155.80ms step:1271/1480 train_time:196471ms step_avg:155.81ms step:1272/1480 train_time:196631ms step_avg:155.81ms step:1273/1480 train_time:196794ms step_avg:155.81ms step:1274/1480 train_time:196960ms step_avg:155.82ms step:1275/1480 train_time:197120ms step_avg:155.83ms step:1276/1480 train_time:197280ms step_avg:155.83ms step:1277/1480 train_time:197443ms step_avg:155.84ms step:1278/1480 train_time:197603ms step_avg:155.84ms step:1279/1480 train_time:197764ms step_avg:155.84ms step:1280/1480 train_time:197931ms step_avg:155.85ms step:1281/1480 train_time:198092ms step_avg:155.86ms step:1282/1480 train_time:198252ms step_avg:155.86ms step:1283/1480 train_time:198415ms step_avg:155.86ms step:1284/1480 train_time:198579ms step_avg:155.87ms step:1285/1480 train_time:198741ms step_avg:155.88ms step:1286/1480 train_time:198902ms step_avg:155.88ms step:1287/1480 train_time:199064ms step_avg:155.88ms step:1288/1480 train_time:199227ms step_avg:155.89ms step:1289/1480 train_time:199396ms step_avg:155.90ms step:1290/1480 train_time:199564ms step_avg:155.91ms step:1291/1480 train_time:199727ms step_avg:155.92ms step:1292/1480 train_time:199893ms step_avg:155.92ms step:1293/1480 train_time:200060ms step_avg:155.93ms step:1294/1480 train_time:200222ms step_avg:155.94ms step:1295/1480 train_time:200384ms step_avg:155.94ms step:1296/1480 train_time:200546ms step_avg:155.95ms step:1297/1480 train_time:200712ms step_avg:155.95ms step:1298/1480 train_time:200875ms step_avg:155.96ms step:1299/1480 train_time:201039ms step_avg:155.97ms step:1300/1480 train_time:201200ms step_avg:155.97ms step:1301/1480 train_time:201360ms step_avg:155.97ms step:1302/1480 train_time:201526ms step_avg:155.98ms step:1303/1480 train_time:201690ms step_avg:155.99ms step:1304/1480 train_time:201857ms step_avg:155.99ms step:1305/1480 train_time:202019ms step_avg:156.00ms step:1306/1480 train_time:202184ms step_avg:156.01ms step:1307/1480 train_time:202345ms step_avg:156.01ms step:1308/1480 train_time:202509ms step_avg:156.02ms step:1309/1480 train_time:202675ms step_avg:156.02ms step:1310/1480 train_time:202839ms step_avg:156.03ms step:1311/1480 train_time:203000ms step_avg:156.03ms step:1312/1480 train_time:203166ms step_avg:156.04ms step:1313/1480 train_time:203328ms step_avg:156.05ms step:1314/1480 train_time:203492ms step_avg:156.05ms step:1315/1480 train_time:203656ms step_avg:156.06ms step:1316/1480 train_time:203816ms step_avg:156.06ms step:1317/1480 train_time:203979ms step_avg:156.07ms step:1318/1480 train_time:204146ms step_avg:156.07ms step:1319/1480 train_time:204312ms step_avg:156.08ms step:1320/1480 train_time:204479ms step_avg:156.09ms step:1321/1480 train_time:204642ms step_avg:156.10ms step:1322/1480 train_time:204813ms step_avg:156.11ms step:1323/1480 train_time:204978ms step_avg:156.11ms step:1324/1480 train_time:205142ms step_avg:156.12ms step:1325/1480 train_time:205312ms step_avg:156.13ms step:1326/1480 train_time:205479ms step_avg:156.14ms step:1327/1480 train_time:205641ms step_avg:156.14ms step:1328/1480 train_time:205802ms step_avg:156.15ms step:1329/1480 train_time:205993ms step_avg:156.17ms step:1330/1480 train_time:206152ms step_avg:156.18ms step:1331/1480 train_time:206315ms step_avg:156.18ms step:1332/1480 train_time:206479ms step_avg:156.19ms step:1333/1480 train_time:206645ms step_avg:156.19ms step:1334/1480 train_time:206808ms step_avg:156.20ms step:1335/1480 train_time:206967ms step_avg:156.20ms step:1336/1480 train_time:207137ms step_avg:156.21ms step:1337/1480 train_time:207303ms step_avg:156.22ms step:1338/1480 train_time:207467ms step_avg:156.22ms step:1339/1480 train_time:207631ms step_avg:156.23ms step:1340/1480 train_time:207795ms step_avg:156.24ms step:1341/1480 train_time:207957ms step_avg:156.24ms step:1342/1480 train_time:208121ms step_avg:156.25ms step:1343/1480 train_time:208282ms step_avg:156.25ms step:1344/1480 train_time:208445ms step_avg:156.26ms step:1345/1480 train_time:208612ms step_avg:156.26ms step:1346/1480 train_time:208776ms step_avg:156.27ms step:1347/1480 train_time:208939ms step_avg:156.27ms step:1348/1480 train_time:209101ms step_avg:156.28ms step:1349/1480 train_time:209263ms step_avg:156.28ms step:1350/1480 train_time:209429ms step_avg:156.29ms step:1351/1480 train_time:209592ms step_avg:156.29ms step:1352/1480 train_time:209755ms step_avg:156.30ms step:1353/1480 train_time:209920ms step_avg:156.31ms step:1354/1480 train_time:210083ms step_avg:156.31ms step:1355/1480 train_time:210247ms step_avg:156.32ms step:1356/1480 train_time:210412ms step_avg:156.32ms step:1357/1480 train_time:210577ms step_avg:156.33ms step:1358/1480 train_time:210741ms step_avg:156.34ms step:1359/1480 train_time:210904ms step_avg:156.34ms step:1360/1480 train_time:211070ms step_avg:156.35ms step:1361/1480 train_time:211239ms step_avg:156.36ms step:1362/1480 train_time:211404ms step_avg:156.36ms step:1363/1480 train_time:211572ms step_avg:156.37ms step:1364/1480 train_time:211735ms step_avg:156.38ms step:1365/1480 train_time:211895ms step_avg:156.38ms step:1366/1480 train_time:212060ms step_avg:156.39ms step:1367/1480 train_time:212223ms step_avg:156.39ms step:1368/1480 train_time:212388ms step_avg:156.40ms step:1369/1480 train_time:212558ms step_avg:156.41ms step:1370/1480 train_time:212723ms step_avg:156.41ms step:1371/1480 train_time:212885ms step_avg:156.42ms step:1372/1480 train_time:213054ms step_avg:156.43ms step:1373/1480 train_time:213216ms step_avg:156.43ms step:1374/1480 train_time:213382ms step_avg:156.44ms step:1375/1480 train_time:213544ms step_avg:156.44ms step:1375/1480 val_loss:3.2959 train_time:213618ms step_avg:156.50ms step:1376/1480 train_time:213713ms step_avg:156.45ms step:1377/1480 train_time:213876ms step_avg:156.46ms step:1378/1480 train_time:214037ms step_avg:156.46ms step:1379/1480 train_time:214202ms step_avg:156.47ms step:1380/1480 train_time:214365ms step_avg:156.47ms step:1381/1480 train_time:214533ms step_avg:156.48ms step:1382/1480 train_time:214697ms step_avg:156.48ms step:1383/1480 train_time:214859ms step_avg:156.49ms step:1384/1480 train_time:215027ms step_avg:156.50ms step:1385/1480 train_time:215187ms step_avg:156.50ms step:1386/1480 train_time:215352ms step_avg:156.51ms step:1387/1480 train_time:215517ms step_avg:156.51ms step:1388/1480 train_time:215678ms step_avg:156.52ms step:1389/1480 train_time:215844ms step_avg:156.52ms step:1390/1480 train_time:216005ms step_avg:156.53ms step:1391/1480 train_time:216166ms step_avg:156.53ms step:1392/1480 train_time:216330ms step_avg:156.53ms step:1393/1480 train_time:216493ms step_avg:156.54ms step:1394/1480 train_time:216655ms step_avg:156.54ms step:1395/1480 train_time:216821ms step_avg:156.55ms step:1396/1480 train_time:216982ms step_avg:156.55ms step:1397/1480 train_time:217143ms step_avg:156.56ms step:1398/1480 train_time:217304ms step_avg:156.56ms step:1399/1480 train_time:217464ms step_avg:156.56ms step:1400/1480 train_time:217636ms step_avg:156.57ms step:1401/1480 train_time:217796ms step_avg:156.57ms step:1402/1480 train_time:217957ms step_avg:156.58ms step:1403/1480 train_time:218124ms step_avg:156.59ms step:1404/1480 train_time:218286ms step_avg:156.59ms step:1405/1480 train_time:218452ms step_avg:156.60ms step:1406/1480 train_time:218618ms step_avg:156.60ms step:1407/1480 train_time:218780ms step_avg:156.61ms step:1408/1480 train_time:218941ms step_avg:156.61ms step:1409/1480 train_time:219114ms step_avg:156.62ms step:1410/1480 train_time:219276ms step_avg:156.63ms step:1411/1480 train_time:219436ms step_avg:156.63ms step:1412/1480 train_time:219599ms step_avg:156.63ms step:1413/1480 train_time:219761ms step_avg:156.64ms step:1414/1480 train_time:219925ms step_avg:156.64ms step:1415/1480 train_time:220092ms step_avg:156.65ms step:1416/1480 train_time:220264ms step_avg:156.66ms step:1417/1480 train_time:220428ms step_avg:156.67ms step:1418/1480 train_time:220593ms step_avg:156.67ms step:1419/1480 train_time:220758ms step_avg:156.68ms step:1420/1480 train_time:220923ms step_avg:156.68ms step:1421/1480 train_time:221089ms step_avg:156.69ms step:1422/1480 train_time:221255ms step_avg:156.70ms step:1423/1480 train_time:221417ms step_avg:156.70ms step:1424/1480 train_time:221585ms step_avg:156.71ms step:1425/1480 train_time:221756ms step_avg:156.72ms step:1426/1480 train_time:221921ms step_avg:156.72ms step:1427/1480 train_time:222086ms step_avg:156.73ms step:1428/1480 train_time:222249ms step_avg:156.73ms step:1429/1480 train_time:222410ms step_avg:156.74ms step:1430/1480 train_time:222575ms step_avg:156.74ms step:1431/1480 train_time:222741ms step_avg:156.75ms step:1432/1480 train_time:222913ms step_avg:156.76ms step:1433/1480 train_time:223082ms step_avg:156.77ms step:1434/1480 train_time:223251ms step_avg:156.78ms step:1435/1480 train_time:223418ms step_avg:156.78ms step:1436/1480 train_time:223583ms step_avg:156.79ms step:1437/1480 train_time:223745ms step_avg:156.79ms step:1438/1480 train_time:223905ms step_avg:156.80ms step:1439/1480 train_time:224071ms step_avg:156.80ms step:1440/1480 train_time:224234ms step_avg:156.81ms step:1441/1480 train_time:224399ms step_avg:156.81ms step:1442/1480 train_time:224565ms step_avg:156.82ms step:1443/1480 train_time:224739ms step_avg:156.83ms step:1444/1480 train_time:224902ms step_avg:156.84ms step:1445/1480 train_time:225063ms step_avg:156.84ms step:1446/1480 train_time:225231ms step_avg:156.85ms step:1447/1480 train_time:225399ms step_avg:156.85ms step:1448/1480 train_time:225560ms step_avg:156.86ms step:1449/1480 train_time:225724ms step_avg:156.86ms step:1450/1480 train_time:225889ms step_avg:156.87ms step:1451/1480 train_time:226054ms step_avg:156.87ms step:1452/1480 train_time:226219ms step_avg:156.88ms step:1453/1480 train_time:226381ms step_avg:156.88ms step:1454/1480 train_time:226542ms step_avg:156.89ms step:1455/1480 train_time:226713ms step_avg:156.89ms step:1456/1480 train_time:226877ms step_avg:156.90ms step:1457/1480 train_time:227039ms step_avg:156.90ms step:1458/1480 train_time:227202ms step_avg:156.91ms step:1459/1480 train_time:227367ms step_avg:156.91ms step:1460/1480 train_time:227529ms step_avg:156.92ms step:1461/1480 train_time:227695ms step_avg:156.92ms step:1462/1480 train_time:227858ms step_avg:156.93ms step:1463/1480 train_time:228025ms step_avg:156.93ms step:1464/1480 train_time:228191ms step_avg:156.94ms step:1465/1480 train_time:228355ms step_avg:156.95ms step:1466/1480 train_time:228518ms step_avg:156.95ms step:1467/1480 train_time:228683ms step_avg:156.95ms step:1468/1480 train_time:228846ms step_avg:156.96ms step:1469/1480 train_time:229010ms step_avg:156.96ms step:1470/1480 train_time:229179ms step_avg:156.97ms step:1471/1480 train_time:229350ms step_avg:156.98ms step:1472/1480 train_time:229521ms step_avg:156.99ms step:1473/1480 train_time:229683ms step_avg:156.99ms step:1474/1480 train_time:229849ms step_avg:157.00ms step:1475/1480 train_time:230019ms step_avg:157.01ms step:1476/1480 train_time:230181ms step_avg:157.01ms step:1477/1480 train_time:230349ms step_avg:157.02ms step:1478/1480 train_time:230520ms step_avg:157.03ms step:1479/1480 train_time:230685ms step_avg:157.04ms step:1480/1480 train_time:230847ms step_avg:157.04ms step:1480/1480 val_loss:3.2771 train_time:230924ms step_avg:157.09ms peak memory consumption: 34239 MiB