import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: 2D tensor to orthogonalize. It is never modified.
        steps: Number of Newton-Schulz iterations to run.
        eps: Added to the norm before dividing, so an all-zero G does not divide by zero.

    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # Out-of-place on purpose: `G.bfloat16()` returns G itself when G is already bfloat16,
    # so an in-place `/=` would silently rescale the caller's gradient / momentum buffer.
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    if G.size(0) > G.size(1):
        # work on the wide orientation so that X @ X.T is the smaller square matrix
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        # WORLD_SIZE and RANK are read from the environment; both must be set.
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One param group per distinct element count, so that every parameter in a group
        # can share the same set of flat all_gather receive buffers.
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are processed in chunks of `world_size`: this rank
        orthogonalizes the update of the parameter at offset `rank` in the chunk, the updates
        are exchanged with an asynchronous all_gather, and the previous chunk's gathered
        updates are applied while the next chunk is being computed.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # Applies the updates gathered for the previous chunk (no-op before the first one).
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # step size is scaled by sqrt(rows / cols) for tall matrices
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # the buffers are reused, so the previous chunk must be consumed before gathering again
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; the cos/sin tables are cached for the last seen sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): dim 1 is the sequence, dim 3 is split in halves
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Self-attention over FlexAttention with QK-norm, rotary embeddings and a value-embedding mix."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # learnable weights for mixing the projected values with the token value embedding
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: 4x expansion, squared-ReLU activation, zero-initialised output projection."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of the stream with the initial embedding `x0`, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # initialised to [1, 0]: at init the block sees only x and ignores x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-value embedding tables, returned mirrored so that twelve layers each receive one."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        # NOTE: the count 6 is fixed; mirrored it yields 12 entries, matching GPTConfig's default num_layers.
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        # append the same tensors in reverse order: [e0..e5, e5..e0]
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections, per-layer value embeddings and document-aware sliding-window attention."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the cross-entropy loss of the model on one sequence.

        Arguments:
            inputs: 1-D tensor of token ids whose length is a multiple of 128.
            targets: Token ids to predict, flattened before the loss.
            sliding_window_num_blocks: Scalar tensor; attention window size in 128-token blocks.

        Returns:
            The scalar cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # Token id 50256 starts a new document: `docs` is the running document index of each position.
        docs = (inputs == 50256).cumsum(0)
        # document index at the first / last position of every block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()
        # Number of blocks follows from the input length (512 for a 64*1024-token sequence);
        # the block-level masks below only broadcast against docs_low/docs_high for this value.
        total_blocks = inputs.size(0) // BLOCK_SIZE

        def document_causal(b, h, q_idx, kv_idx):
            # token-level mask: attend only backwards and only within the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # per row: how many blocks are set, and the column indices with the set ones first
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_blocks, dtype=torch.int32, device=inputs.device)
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # partial blocks (need mask_mod) are the non-empty blocks that are not fully unmasked
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256 x int32 header into pinned host memory."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Streams (inputs, targets) sequences from .bin shards; each process reads its own slice of every batch."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Rewind to the start of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return this process's next (inputs, targets) pair on the GPU; targets are inputs shifted by one."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. 
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 07:03:12 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 33C P0 122W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 28C P0 115W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 27C P0 111W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 32C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 34C P0 117W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 28C P0 117W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 32C P0 117W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 28C P0 117W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:397097ms step_avg:nanms step:2/1480 train_time:397891ms step_avg:nanms step:3/1480 train_time:398012ms step_avg:nanms step:4/1480 train_time:398155ms step_avg:nanms step:5/1480 train_time:398293ms step_avg:nanms step:6/1480 train_time:398433ms step_avg:nanms step:7/1480 train_time:398575ms step_avg:nanms step:8/1480 train_time:398717ms step_avg:nanms step:9/1480 train_time:398861ms step_avg:nanms step:10/1480 train_time:399005ms step_avg:nanms step:11/1480 train_time:145ms step_avg:nanms step:12/1480 train_time:287ms step_avg:nanms step:13/1480 train_time:429ms step_avg:143.05ms step:14/1480 train_time:571ms step_avg:142.69ms step:15/1480 train_time:712ms step_avg:142.32ms step:16/1480 train_time:853ms step_avg:142.24ms step:17/1480 train_time:995ms step_avg:142.13ms step:18/1480 train_time:1139ms step_avg:142.33ms step:19/1480 train_time:1282ms step_avg:142.47ms step:20/1480 train_time:1424ms step_avg:142.44ms step:21/1480 train_time:1568ms step_avg:142.56ms step:22/1480 train_time:1709ms step_avg:142.45ms step:23/1480 train_time:1853ms step_avg:142.51ms step:24/1480 train_time:1996ms step_avg:142.58ms step:25/1480 train_time:2140ms step_avg:142.65ms step:26/1480 train_time:2283ms step_avg:142.70ms step:27/1480 train_time:2425ms step_avg:142.67ms step:28/1480 train_time:2566ms 
step_avg:142.58ms step:29/1480 train_time:2709ms step_avg:142.56ms step:30/1480 train_time:2852ms step_avg:142.60ms step:31/1480 train_time:2994ms step_avg:142.56ms step:32/1480 train_time:3136ms step_avg:142.55ms step:33/1480 train_time:3279ms step_avg:142.56ms step:34/1480 train_time:3423ms step_avg:142.61ms step:35/1480 train_time:3565ms step_avg:142.62ms step:36/1480 train_time:3707ms step_avg:142.57ms step:37/1480 train_time:3851ms step_avg:142.62ms step:38/1480 train_time:3993ms step_avg:142.62ms step:39/1480 train_time:4135ms step_avg:142.60ms step:40/1480 train_time:4279ms step_avg:142.63ms step:41/1480 train_time:4422ms step_avg:142.66ms step:42/1480 train_time:4566ms step_avg:142.70ms step:43/1480 train_time:4708ms step_avg:142.66ms step:44/1480 train_time:4849ms step_avg:142.61ms step:45/1480 train_time:4991ms step_avg:142.61ms step:46/1480 train_time:5135ms step_avg:142.63ms step:47/1480 train_time:5279ms step_avg:142.66ms step:48/1480 train_time:5421ms step_avg:142.66ms step:49/1480 train_time:5566ms step_avg:142.71ms step:50/1480 train_time:5707ms step_avg:142.67ms step:51/1480 train_time:5850ms step_avg:142.67ms step:52/1480 train_time:5991ms step_avg:142.64ms step:53/1480 train_time:6134ms step_avg:142.66ms step:54/1480 train_time:6279ms step_avg:142.70ms step:55/1480 train_time:6422ms step_avg:142.70ms step:56/1480 train_time:6565ms step_avg:142.72ms step:57/1480 train_time:6707ms step_avg:142.69ms step:58/1480 train_time:6850ms step_avg:142.72ms step:59/1480 train_time:6993ms step_avg:142.71ms step:60/1480 train_time:7134ms step_avg:142.69ms step:61/1480 train_time:7277ms step_avg:142.68ms step:62/1480 train_time:7421ms step_avg:142.70ms step:63/1480 train_time:7567ms step_avg:142.77ms step:64/1480 train_time:7708ms step_avg:142.73ms step:65/1480 train_time:7849ms step_avg:142.71ms step:66/1480 train_time:7991ms step_avg:142.70ms step:67/1480 train_time:8133ms step_avg:142.68ms step:68/1480 train_time:8275ms step_avg:142.67ms step:69/1480 
train_time:8417ms step_avg:142.66ms step:70/1480 train_time:8560ms step_avg:142.67ms step:71/1480 train_time:8702ms step_avg:142.66ms step:72/1480 train_time:8846ms step_avg:142.68ms step:73/1480 train_time:8989ms step_avg:142.68ms step:74/1480 train_time:9130ms step_avg:142.66ms step:75/1480 train_time:9273ms step_avg:142.66ms step:76/1480 train_time:9416ms step_avg:142.67ms step:77/1480 train_time:9558ms step_avg:142.66ms step:78/1480 train_time:9701ms step_avg:142.67ms step:79/1480 train_time:9846ms step_avg:142.69ms step:80/1480 train_time:9988ms step_avg:142.68ms step:81/1480 train_time:10129ms step_avg:142.66ms step:82/1480 train_time:10272ms step_avg:142.66ms step:83/1480 train_time:10414ms step_avg:142.66ms step:84/1480 train_time:10559ms step_avg:142.69ms step:85/1480 train_time:10702ms step_avg:142.70ms step:86/1480 train_time:10845ms step_avg:142.69ms step:87/1480 train_time:10987ms step_avg:142.68ms step:88/1480 train_time:11129ms step_avg:142.68ms step:89/1480 train_time:11271ms step_avg:142.67ms step:90/1480 train_time:11414ms step_avg:142.67ms step:91/1480 train_time:11557ms step_avg:142.67ms step:92/1480 train_time:11701ms step_avg:142.69ms step:93/1480 train_time:11844ms step_avg:142.70ms step:94/1480 train_time:11987ms step_avg:142.70ms step:95/1480 train_time:12128ms step_avg:142.69ms step:96/1480 train_time:12271ms step_avg:142.69ms step:97/1480 train_time:12413ms step_avg:142.68ms step:98/1480 train_time:12556ms step_avg:142.68ms step:99/1480 train_time:12699ms step_avg:142.69ms step:100/1480 train_time:12842ms step_avg:142.69ms step:101/1480 train_time:12987ms step_avg:142.72ms step:102/1480 train_time:13128ms step_avg:142.70ms step:103/1480 train_time:13269ms step_avg:142.68ms step:104/1480 train_time:13411ms step_avg:142.67ms step:105/1480 train_time:13554ms step_avg:142.68ms step:106/1480 train_time:13697ms step_avg:142.68ms step:107/1480 train_time:13840ms step_avg:142.68ms step:108/1480 train_time:13983ms step_avg:142.69ms step:109/1480 
train_time:14125ms step_avg:142.68ms step:110/1480 train_time:14268ms step_avg:142.68ms step:111/1480 train_time:14411ms step_avg:142.68ms step:112/1480 train_time:14556ms step_avg:142.71ms step:113/1480 train_time:14703ms step_avg:142.75ms step:114/1480 train_time:14848ms step_avg:142.77ms step:115/1480 train_time:14993ms step_avg:142.79ms step:116/1480 train_time:15140ms step_avg:142.83ms step:117/1480 train_time:15286ms step_avg:142.86ms step:118/1480 train_time:15430ms step_avg:142.87ms step:119/1480 train_time:15576ms step_avg:142.90ms step:120/1480 train_time:15723ms step_avg:142.93ms step:121/1480 train_time:15868ms step_avg:142.96ms step:122/1480 train_time:16013ms step_avg:142.97ms step:123/1480 train_time:16159ms step_avg:143.00ms step:124/1480 train_time:16306ms step_avg:143.03ms step:125/1480 train_time:16450ms step_avg:143.04ms step:125/1480 val_loss:4.4081 train_time:16515ms step_avg:143.61ms step:126/1480 train_time:16607ms step_avg:143.16ms step:127/1480 train_time:16750ms step_avg:143.16ms step:128/1480 train_time:16895ms step_avg:143.18ms step:129/1480 train_time:17041ms step_avg:143.20ms step:130/1480 train_time:17187ms step_avg:143.22ms step:131/1480 train_time:17331ms step_avg:143.23ms step:132/1480 train_time:17477ms step_avg:143.25ms step:133/1480 train_time:17623ms step_avg:143.28ms step:134/1480 train_time:17769ms step_avg:143.29ms step:135/1480 train_time:17915ms step_avg:143.32ms step:136/1480 train_time:18062ms step_avg:143.35ms step:137/1480 train_time:18208ms step_avg:143.37ms step:138/1480 train_time:18353ms step_avg:143.38ms step:139/1480 train_time:18499ms step_avg:143.41ms step:140/1480 train_time:18645ms step_avg:143.42ms step:141/1480 train_time:18791ms step_avg:143.44ms step:142/1480 train_time:18937ms step_avg:143.46ms step:143/1480 train_time:19084ms step_avg:143.49ms step:144/1480 train_time:19229ms step_avg:143.50ms step:145/1480 train_time:19375ms step_avg:143.52ms step:146/1480 train_time:19521ms step_avg:143.54ms 
step:147/1480 train_time:19667ms step_avg:143.56ms step:148/1480 train_time:19812ms step_avg:143.57ms step:149/1480 train_time:19958ms step_avg:143.58ms step:150/1480 train_time:20105ms step_avg:143.61ms step:151/1480 train_time:20250ms step_avg:143.62ms step:152/1480 train_time:20396ms step_avg:143.63ms step:153/1480 train_time:20542ms step_avg:143.65ms step:154/1480 train_time:20687ms step_avg:143.66ms step:155/1480 train_time:20832ms step_avg:143.67ms step:156/1480 train_time:20978ms step_avg:143.68ms step:157/1480 train_time:21125ms step_avg:143.71ms step:158/1480 train_time:21270ms step_avg:143.71ms step:159/1480 train_time:21415ms step_avg:143.73ms step:160/1480 train_time:21562ms step_avg:143.74ms step:161/1480 train_time:21707ms step_avg:143.76ms step:162/1480 train_time:21853ms step_avg:143.77ms step:163/1480 train_time:22000ms step_avg:143.79ms step:164/1480 train_time:22147ms step_avg:143.81ms step:165/1480 train_time:22292ms step_avg:143.82ms step:166/1480 train_time:22438ms step_avg:143.84ms step:167/1480 train_time:22585ms step_avg:143.86ms step:168/1480 train_time:22730ms step_avg:143.86ms step:169/1480 train_time:22876ms step_avg:143.87ms step:170/1480 train_time:23022ms step_avg:143.89ms step:171/1480 train_time:23167ms step_avg:143.89ms step:172/1480 train_time:23312ms step_avg:143.90ms step:173/1480 train_time:23458ms step_avg:143.91ms step:174/1480 train_time:23605ms step_avg:143.93ms step:175/1480 train_time:23749ms step_avg:143.93ms step:176/1480 train_time:23895ms step_avg:143.95ms step:177/1480 train_time:24042ms step_avg:143.97ms step:178/1480 train_time:24188ms step_avg:143.98ms step:179/1480 train_time:24332ms step_avg:143.98ms step:180/1480 train_time:24479ms step_avg:144.00ms step:181/1480 train_time:24625ms step_avg:144.01ms step:182/1480 train_time:24770ms step_avg:144.01ms step:183/1480 train_time:24915ms step_avg:144.02ms step:184/1480 train_time:25061ms step_avg:144.03ms step:185/1480 train_time:25207ms step_avg:144.04ms 
step:186/1480 train_time:25351ms step_avg:144.04ms step:187/1480 train_time:25499ms step_avg:144.06ms step:188/1480 train_time:25644ms step_avg:144.07ms step:189/1480 train_time:25816ms step_avg:144.22ms step:190/1480 train_time:25936ms step_avg:144.09ms step:191/1480 train_time:26082ms step_avg:144.10ms step:192/1480 train_time:26227ms step_avg:144.10ms step:193/1480 train_time:26372ms step_avg:144.11ms step:194/1480 train_time:26518ms step_avg:144.12ms step:195/1480 train_time:26664ms step_avg:144.13ms step:196/1480 train_time:26808ms step_avg:144.13ms step:197/1480 train_time:26955ms step_avg:144.14ms step:198/1480 train_time:27101ms step_avg:144.16ms step:199/1480 train_time:27246ms step_avg:144.16ms step:200/1480 train_time:27391ms step_avg:144.16ms step:201/1480 train_time:27538ms step_avg:144.18ms step:202/1480 train_time:27683ms step_avg:144.18ms step:203/1480 train_time:27828ms step_avg:144.19ms step:204/1480 train_time:27974ms step_avg:144.20ms step:205/1480 train_time:28121ms step_avg:144.21ms step:206/1480 train_time:28266ms step_avg:144.22ms step:207/1480 train_time:28411ms step_avg:144.22ms step:208/1480 train_time:28556ms step_avg:144.22ms step:209/1480 train_time:28704ms step_avg:144.24ms step:210/1480 train_time:28848ms step_avg:144.24ms step:211/1480 train_time:28994ms step_avg:144.25ms step:212/1480 train_time:29140ms step_avg:144.26ms step:213/1480 train_time:29286ms step_avg:144.27ms step:214/1480 train_time:29430ms step_avg:144.27ms step:215/1480 train_time:29576ms step_avg:144.27ms step:216/1480 train_time:29723ms step_avg:144.28ms step:217/1480 train_time:29867ms step_avg:144.29ms step:218/1480 train_time:30012ms step_avg:144.29ms step:219/1480 train_time:30158ms step_avg:144.30ms step:220/1480 train_time:30305ms step_avg:144.31ms step:221/1480 train_time:30450ms step_avg:144.31ms step:222/1480 train_time:30600ms step_avg:144.34ms step:223/1480 train_time:30748ms step_avg:144.36ms step:224/1480 train_time:30896ms step_avg:144.38ms 
step:225/1480 train_time:31045ms step_avg:144.40ms step:226/1480 train_time:31193ms step_avg:144.41ms step:227/1480 train_time:31342ms step_avg:144.43ms step:228/1480 train_time:31490ms step_avg:144.45ms step:229/1480 train_time:31639ms step_avg:144.47ms step:230/1480 train_time:31788ms step_avg:144.49ms step:231/1480 train_time:31935ms step_avg:144.50ms step:232/1480 train_time:32085ms step_avg:144.53ms step:233/1480 train_time:32232ms step_avg:144.54ms step:234/1480 train_time:32382ms step_avg:144.56ms step:235/1480 train_time:32529ms step_avg:144.58ms step:236/1480 train_time:32677ms step_avg:144.59ms step:237/1480 train_time:32827ms step_avg:144.61ms step:238/1480 train_time:32975ms step_avg:144.63ms step:239/1480 train_time:33123ms step_avg:144.64ms step:240/1480 train_time:33270ms step_avg:144.65ms step:241/1480 train_time:33419ms step_avg:144.67ms step:242/1480 train_time:33568ms step_avg:144.69ms step:243/1480 train_time:33716ms step_avg:144.70ms step:244/1480 train_time:33866ms step_avg:144.72ms step:245/1480 train_time:34012ms step_avg:144.73ms step:246/1480 train_time:34161ms step_avg:144.75ms step:247/1480 train_time:34309ms step_avg:144.77ms step:248/1480 train_time:34458ms step_avg:144.78ms step:249/1480 train_time:34607ms step_avg:144.80ms step:250/1480 train_time:34754ms step_avg:144.81ms step:250/1480 val_loss:4.0113 train_time:34821ms step_avg:145.09ms step:251/1480 train_time:34913ms step_avg:144.87ms step:252/1480 train_time:35062ms step_avg:144.88ms step:253/1480 train_time:35211ms step_avg:144.90ms step:254/1480 train_time:35358ms step_avg:144.91ms step:255/1480 train_time:35507ms step_avg:144.93ms step:256/1480 train_time:35655ms step_avg:144.94ms step:257/1480 train_time:35803ms step_avg:144.95ms step:258/1480 train_time:35952ms step_avg:144.97ms step:259/1480 train_time:36099ms step_avg:144.98ms step:260/1480 train_time:36250ms step_avg:145.00ms step:261/1480 train_time:36397ms step_avg:145.01ms step:262/1480 train_time:36546ms 
step_avg:145.03ms step:263/1480 train_time:36694ms step_avg:145.04ms step:264/1480 train_time:36842ms step_avg:145.05ms step:265/1480 train_time:36991ms step_avg:145.06ms step:266/1480 train_time:37138ms step_avg:145.07ms step:267/1480 train_time:37287ms step_avg:145.09ms step:268/1480 train_time:37435ms step_avg:145.10ms step:269/1480 train_time:37585ms step_avg:145.11ms step:270/1480 train_time:37734ms step_avg:145.13ms step:271/1480 train_time:37883ms step_avg:145.14ms step:272/1480 train_time:38032ms step_avg:145.16ms step:273/1480 train_time:38179ms step_avg:145.17ms step:274/1480 train_time:38328ms step_avg:145.18ms step:275/1480 train_time:38475ms step_avg:145.19ms step:276/1480 train_time:38624ms step_avg:145.20ms step:277/1480 train_time:38773ms step_avg:145.22ms step:278/1480 train_time:38922ms step_avg:145.23ms step:279/1480 train_time:39071ms step_avg:145.25ms step:280/1480 train_time:39221ms step_avg:145.26ms step:281/1480 train_time:39370ms step_avg:145.28ms step:282/1480 train_time:39518ms step_avg:145.29ms step:283/1480 train_time:39667ms step_avg:145.30ms step:284/1480 train_time:39814ms step_avg:145.31ms step:285/1480 train_time:39962ms step_avg:145.32ms step:286/1480 train_time:40112ms step_avg:145.33ms step:287/1480 train_time:40261ms step_avg:145.35ms step:288/1480 train_time:40411ms step_avg:145.36ms step:289/1480 train_time:40558ms step_avg:145.37ms step:290/1480 train_time:40707ms step_avg:145.38ms step:291/1480 train_time:40854ms step_avg:145.39ms step:292/1480 train_time:41003ms step_avg:145.40ms step:293/1480 train_time:41151ms step_avg:145.41ms step:294/1480 train_time:41300ms step_avg:145.42ms step:295/1480 train_time:41450ms step_avg:145.44ms step:296/1480 train_time:41597ms step_avg:145.44ms step:297/1480 train_time:41746ms step_avg:145.46ms step:298/1480 train_time:41894ms step_avg:145.46ms step:299/1480 train_time:42042ms step_avg:145.48ms step:300/1480 train_time:42192ms step_avg:145.49ms step:301/1480 train_time:42339ms 
step_avg:145.49ms step:302/1480 train_time:42488ms step_avg:145.51ms step:303/1480 train_time:42636ms step_avg:145.51ms step:304/1480 train_time:42785ms step_avg:145.53ms step:305/1480 train_time:42934ms step_avg:145.54ms step:306/1480 train_time:43083ms step_avg:145.55ms step:307/1480 train_time:43232ms step_avg:145.56ms step:308/1480 train_time:43379ms step_avg:145.57ms step:309/1480 train_time:43529ms step_avg:145.58ms step:310/1480 train_time:43676ms step_avg:145.59ms step:311/1480 train_time:43824ms step_avg:145.60ms step:312/1480 train_time:43973ms step_avg:145.61ms step:313/1480 train_time:44122ms step_avg:145.62ms step:314/1480 train_time:44271ms step_avg:145.63ms step:315/1480 train_time:44419ms step_avg:145.64ms step:316/1480 train_time:44568ms step_avg:145.65ms step:317/1480 train_time:44717ms step_avg:145.66ms step:318/1480 train_time:44866ms step_avg:145.67ms step:319/1480 train_time:45014ms step_avg:145.68ms step:320/1480 train_time:45164ms step_avg:145.69ms step:321/1480 train_time:45312ms step_avg:145.70ms step:322/1480 train_time:45460ms step_avg:145.70ms step:323/1480 train_time:45610ms step_avg:145.72ms step:324/1480 train_time:45757ms step_avg:145.72ms step:325/1480 train_time:45907ms step_avg:145.74ms step:326/1480 train_time:46055ms step_avg:145.74ms step:327/1480 train_time:46204ms step_avg:145.75ms step:328/1480 train_time:46353ms step_avg:145.76ms step:329/1480 train_time:46501ms step_avg:145.77ms step:330/1480 train_time:46652ms step_avg:145.79ms step:331/1480 train_time:46802ms step_avg:145.80ms step:332/1480 train_time:46953ms step_avg:145.82ms step:333/1480 train_time:47103ms step_avg:145.83ms step:334/1480 train_time:47254ms step_avg:145.85ms step:335/1480 train_time:47406ms step_avg:145.86ms step:336/1480 train_time:47556ms step_avg:145.88ms step:337/1480 train_time:47708ms step_avg:145.90ms step:338/1480 train_time:47858ms step_avg:145.91ms step:339/1480 train_time:48008ms step_avg:145.92ms step:340/1480 train_time:48158ms 
step_avg:145.93ms step:341/1480 train_time:48309ms step_avg:145.95ms step:342/1480 train_time:48458ms step_avg:145.96ms step:343/1480 train_time:48610ms step_avg:145.98ms step:344/1480 train_time:48761ms step_avg:145.99ms step:345/1480 train_time:48912ms step_avg:146.01ms step:346/1480 train_time:49061ms step_avg:146.02ms step:347/1480 train_time:49212ms step_avg:146.03ms step:348/1480 train_time:49363ms step_avg:146.04ms step:349/1480 train_time:49513ms step_avg:146.06ms step:350/1480 train_time:49664ms step_avg:146.07ms step:351/1480 train_time:49814ms step_avg:146.08ms step:352/1480 train_time:49966ms step_avg:146.10ms step:353/1480 train_time:50116ms step_avg:146.11ms step:354/1480 train_time:50267ms step_avg:146.13ms step:355/1480 train_time:50417ms step_avg:146.14ms step:356/1480 train_time:50568ms step_avg:146.15ms step:357/1480 train_time:50719ms step_avg:146.16ms step:358/1480 train_time:50870ms step_avg:146.18ms step:359/1480 train_time:51024ms step_avg:146.20ms step:360/1480 train_time:51176ms step_avg:146.22ms step:361/1480 train_time:51327ms step_avg:146.23ms step:362/1480 train_time:51478ms step_avg:146.24ms step:363/1480 train_time:51630ms step_avg:146.26ms step:364/1480 train_time:51779ms step_avg:146.27ms step:365/1480 train_time:51931ms step_avg:146.28ms step:366/1480 train_time:52082ms step_avg:146.30ms step:367/1480 train_time:52233ms step_avg:146.31ms step:368/1480 train_time:52384ms step_avg:146.32ms step:369/1480 train_time:52535ms step_avg:146.34ms step:370/1480 train_time:52686ms step_avg:146.35ms step:371/1480 train_time:52836ms step_avg:146.36ms step:372/1480 train_time:52988ms step_avg:146.38ms step:373/1480 train_time:53138ms step_avg:146.39ms step:374/1480 train_time:53289ms step_avg:146.40ms step:375/1480 train_time:53438ms step_avg:146.41ms step:375/1480 val_loss:3.8083 train_time:53507ms step_avg:146.59ms step:376/1480 train_time:53598ms step_avg:146.44ms step:377/1480 train_time:53748ms step_avg:146.45ms step:378/1480 
train_time:53898ms step_avg:146.46ms step:379/1480 train_time:54070ms step_avg:146.53ms step:380/1480 train_time:54199ms step_avg:146.48ms step:381/1480 train_time:54349ms step_avg:146.49ms step:382/1480 train_time:54499ms step_avg:146.50ms step:383/1480 train_time:54651ms step_avg:146.52ms step:384/1480 train_time:54801ms step_avg:146.53ms step:385/1480 train_time:54952ms step_avg:146.54ms step:386/1480 train_time:55102ms step_avg:146.55ms step:387/1480 train_time:55253ms step_avg:146.56ms step:388/1480 train_time:55403ms step_avg:146.57ms step:389/1480 train_time:55554ms step_avg:146.58ms step:390/1480 train_time:55704ms step_avg:146.59ms step:391/1480 train_time:55856ms step_avg:146.60ms step:392/1480 train_time:56007ms step_avg:146.62ms step:393/1480 train_time:56158ms step_avg:146.63ms step:394/1480 train_time:56310ms step_avg:146.64ms step:395/1480 train_time:56460ms step_avg:146.65ms step:396/1480 train_time:56611ms step_avg:146.66ms step:397/1480 train_time:56761ms step_avg:146.67ms step:398/1480 train_time:56912ms step_avg:146.68ms step:399/1480 train_time:57063ms step_avg:146.69ms step:400/1480 train_time:57214ms step_avg:146.70ms step:401/1480 train_time:57364ms step_avg:146.71ms step:402/1480 train_time:57516ms step_avg:146.73ms step:403/1480 train_time:57667ms step_avg:146.74ms step:404/1480 train_time:57819ms step_avg:146.75ms step:405/1480 train_time:57970ms step_avg:146.76ms step:406/1480 train_time:58120ms step_avg:146.77ms step:407/1480 train_time:58271ms step_avg:146.78ms step:408/1480 train_time:58421ms step_avg:146.79ms step:409/1480 train_time:58572ms step_avg:146.80ms step:410/1480 train_time:58722ms step_avg:146.81ms step:411/1480 train_time:58873ms step_avg:146.82ms step:412/1480 train_time:59024ms step_avg:146.82ms step:413/1480 train_time:59175ms step_avg:146.84ms step:414/1480 train_time:59326ms step_avg:146.85ms step:415/1480 train_time:59478ms step_avg:146.86ms step:416/1480 train_time:59629ms step_avg:146.87ms step:417/1480 
train_time:59779ms step_avg:146.88ms step:418/1480 train_time:59931ms step_avg:146.89ms step:419/1480 train_time:60080ms step_avg:146.90ms step:420/1480 train_time:60232ms step_avg:146.91ms step:421/1480 train_time:60381ms step_avg:146.91ms step:422/1480 train_time:60533ms step_avg:146.92ms step:423/1480 train_time:60682ms step_avg:146.93ms step:424/1480 train_time:60834ms step_avg:146.94ms step:425/1480 train_time:60985ms step_avg:146.95ms step:426/1480 train_time:61136ms step_avg:146.96ms step:427/1480 train_time:61288ms step_avg:146.97ms step:428/1480 train_time:61438ms step_avg:146.98ms step:429/1480 train_time:61589ms step_avg:146.99ms step:430/1480 train_time:61739ms step_avg:147.00ms step:431/1480 train_time:61890ms step_avg:147.01ms step:432/1480 train_time:62040ms step_avg:147.01ms step:433/1480 train_time:62192ms step_avg:147.03ms step:434/1480 train_time:62341ms step_avg:147.03ms step:435/1480 train_time:62493ms step_avg:147.04ms step:436/1480 train_time:62642ms step_avg:147.05ms step:437/1480 train_time:62794ms step_avg:147.06ms step:438/1480 train_time:62944ms step_avg:147.06ms step:439/1480 train_time:63096ms step_avg:147.08ms step:440/1480 train_time:63247ms step_avg:147.09ms step:441/1480 train_time:63399ms step_avg:147.10ms step:442/1480 train_time:63552ms step_avg:147.11ms step:443/1480 train_time:63704ms step_avg:147.12ms step:444/1480 train_time:63857ms step_avg:147.14ms step:445/1480 train_time:64010ms step_avg:147.15ms step:446/1480 train_time:64163ms step_avg:147.16ms step:447/1480 train_time:64316ms step_avg:147.18ms step:448/1480 train_time:64469ms step_avg:147.19ms step:449/1480 train_time:64621ms step_avg:147.20ms step:450/1480 train_time:64774ms step_avg:147.21ms step:451/1480 train_time:64928ms step_avg:147.23ms step:452/1480 train_time:65081ms step_avg:147.24ms step:453/1480 train_time:65234ms step_avg:147.26ms step:454/1480 train_time:65387ms step_avg:147.27ms step:455/1480 train_time:65541ms step_avg:147.28ms step:456/1480 
train_time:65695ms step_avg:147.30ms step:457/1480 train_time:65847ms step_avg:147.31ms step:458/1480 train_time:66000ms step_avg:147.32ms step:459/1480 train_time:66154ms step_avg:147.34ms step:460/1480 train_time:66306ms step_avg:147.35ms step:461/1480 train_time:66459ms step_avg:147.36ms step:462/1480 train_time:66613ms step_avg:147.37ms step:463/1480 train_time:66766ms step_avg:147.39ms step:464/1480 train_time:66919ms step_avg:147.40ms step:465/1480 train_time:67071ms step_avg:147.41ms step:466/1480 train_time:67225ms step_avg:147.42ms step:467/1480 train_time:67377ms step_avg:147.43ms step:468/1480 train_time:67531ms step_avg:147.45ms step:469/1480 train_time:67683ms step_avg:147.46ms step:470/1480 train_time:67836ms step_avg:147.47ms step:471/1480 train_time:67991ms step_avg:147.49ms step:472/1480 train_time:68143ms step_avg:147.50ms step:473/1480 train_time:68296ms step_avg:147.51ms step:474/1480 train_time:68449ms step_avg:147.52ms step:475/1480 train_time:68602ms step_avg:147.53ms step:476/1480 train_time:68756ms step_avg:147.54ms step:477/1480 train_time:68910ms step_avg:147.56ms step:478/1480 train_time:69064ms step_avg:147.57ms step:479/1480 train_time:69217ms step_avg:147.58ms step:480/1480 train_time:69370ms step_avg:147.59ms step:481/1480 train_time:69521ms step_avg:147.60ms step:482/1480 train_time:69674ms step_avg:147.61ms step:483/1480 train_time:69827ms step_avg:147.63ms step:484/1480 train_time:69980ms step_avg:147.64ms step:485/1480 train_time:70135ms step_avg:147.65ms step:486/1480 train_time:70288ms step_avg:147.66ms step:487/1480 train_time:70441ms step_avg:147.67ms step:488/1480 train_time:70595ms step_avg:147.69ms step:489/1480 train_time:70747ms step_avg:147.70ms step:490/1480 train_time:70900ms step_avg:147.71ms step:491/1480 train_time:71053ms step_avg:147.72ms step:492/1480 train_time:71206ms step_avg:147.73ms step:493/1480 train_time:71359ms step_avg:147.74ms step:494/1480 train_time:71513ms step_avg:147.75ms step:495/1480 
train_time:71665ms step_avg:147.76ms step:496/1480 train_time:71818ms step_avg:147.77ms step:497/1480 train_time:71970ms step_avg:147.78ms step:498/1480 train_time:72123ms step_avg:147.79ms step:499/1480 train_time:72276ms step_avg:147.80ms step:500/1480 train_time:72430ms step_avg:147.82ms step:500/1480 val_loss:3.6901 train_time:72499ms step_avg:147.96ms step:501/1480 train_time:72590ms step_avg:147.84ms step:502/1480 train_time:72742ms step_avg:147.85ms step:503/1480 train_time:72894ms step_avg:147.86ms step:504/1480 train_time:73046ms step_avg:147.87ms step:505/1480 train_time:73199ms step_avg:147.88ms step:506/1480 train_time:73351ms step_avg:147.89ms step:507/1480 train_time:73504ms step_avg:147.90ms step:508/1480 train_time:73658ms step_avg:147.91ms step:509/1480 train_time:73812ms step_avg:147.92ms step:510/1480 train_time:73965ms step_avg:147.93ms step:511/1480 train_time:74118ms step_avg:147.94ms step:512/1480 train_time:74272ms step_avg:147.95ms step:513/1480 train_time:74426ms step_avg:147.96ms step:514/1480 train_time:74579ms step_avg:147.97ms step:515/1480 train_time:74732ms step_avg:147.98ms step:516/1480 train_time:74886ms step_avg:148.00ms step:517/1480 train_time:75038ms step_avg:148.00ms step:518/1480 train_time:75191ms step_avg:148.01ms step:519/1480 train_time:75346ms step_avg:148.03ms step:520/1480 train_time:75500ms step_avg:148.04ms step:521/1480 train_time:75653ms step_avg:148.05ms step:522/1480 train_time:75806ms step_avg:148.06ms step:523/1480 train_time:75959ms step_avg:148.07ms step:524/1480 train_time:76112ms step_avg:148.08ms step:525/1480 train_time:76264ms step_avg:148.09ms step:526/1480 train_time:76419ms step_avg:148.10ms step:527/1480 train_time:76572ms step_avg:148.11ms step:528/1480 train_time:76726ms step_avg:148.12ms step:529/1480 train_time:76879ms step_avg:148.13ms step:530/1480 train_time:77032ms step_avg:148.14ms step:531/1480 train_time:77185ms step_avg:148.15ms step:532/1480 train_time:77337ms step_avg:148.15ms 
step:533/1480 train_time:77489ms step_avg:148.16ms step:534/1480 train_time:77644ms step_avg:148.18ms step:535/1480 train_time:77796ms step_avg:148.18ms step:536/1480 train_time:77949ms step_avg:148.19ms step:537/1480 train_time:78104ms step_avg:148.20ms step:538/1480 train_time:78257ms step_avg:148.21ms step:539/1480 train_time:78410ms step_avg:148.22ms step:540/1480 train_time:78563ms step_avg:148.23ms step:541/1480 train_time:78715ms step_avg:148.24ms step:542/1480 train_time:78868ms step_avg:148.25ms step:543/1480 train_time:79022ms step_avg:148.26ms step:544/1480 train_time:79175ms step_avg:148.27ms step:545/1480 train_time:79328ms step_avg:148.28ms step:546/1480 train_time:79482ms step_avg:148.29ms step:547/1480 train_time:79634ms step_avg:148.29ms step:548/1480 train_time:79788ms step_avg:148.30ms step:549/1480 train_time:79941ms step_avg:148.31ms step:550/1480 train_time:80095ms step_avg:148.32ms step:551/1480 train_time:80249ms step_avg:148.33ms step:552/1480 train_time:80404ms step_avg:148.35ms step:553/1480 train_time:80559ms step_avg:148.36ms step:554/1480 train_time:80714ms step_avg:148.37ms step:555/1480 train_time:80868ms step_avg:148.38ms step:556/1480 train_time:81022ms step_avg:148.39ms step:557/1480 train_time:81177ms step_avg:148.40ms step:558/1480 train_time:81332ms step_avg:148.42ms step:559/1480 train_time:81487ms step_avg:148.43ms step:560/1480 train_time:81642ms step_avg:148.44ms step:561/1480 train_time:81796ms step_avg:148.45ms step:562/1480 train_time:81951ms step_avg:148.46ms step:563/1480 train_time:82106ms step_avg:148.47ms step:564/1480 train_time:82261ms step_avg:148.48ms step:565/1480 train_time:82416ms step_avg:148.50ms step:566/1480 train_time:82571ms step_avg:148.51ms step:567/1480 train_time:82726ms step_avg:148.52ms step:568/1480 train_time:82881ms step_avg:148.53ms step:569/1480 train_time:83054ms step_avg:148.58ms step:570/1480 train_time:83190ms step_avg:148.55ms step:571/1480 train_time:83344ms step_avg:148.56ms 
step:572/1480 train_time:83499ms step_avg:148.57ms step:573/1480 train_time:83654ms step_avg:148.59ms step:574/1480 train_time:83809ms step_avg:148.60ms step:575/1480 train_time:83964ms step_avg:148.61ms step:576/1480 train_time:84119ms step_avg:148.62ms step:577/1480 train_time:84273ms step_avg:148.63ms step:578/1480 train_time:84428ms step_avg:148.64ms step:579/1480 train_time:84583ms step_avg:148.65ms step:580/1480 train_time:84737ms step_avg:148.66ms step:581/1480 train_time:84891ms step_avg:148.67ms step:582/1480 train_time:85046ms step_avg:148.68ms step:583/1480 train_time:85202ms step_avg:148.69ms step:584/1480 train_time:85357ms step_avg:148.71ms step:585/1480 train_time:85511ms step_avg:148.72ms step:586/1480 train_time:85665ms step_avg:148.72ms step:587/1480 train_time:85820ms step_avg:148.73ms step:588/1480 train_time:85975ms step_avg:148.75ms step:589/1480 train_time:86129ms step_avg:148.75ms step:590/1480 train_time:86284ms step_avg:148.77ms step:591/1480 train_time:86438ms step_avg:148.77ms step:592/1480 train_time:86594ms step_avg:148.79ms step:593/1480 train_time:86749ms step_avg:148.80ms step:594/1480 train_time:86904ms step_avg:148.81ms step:595/1480 train_time:87059ms step_avg:148.82ms step:596/1480 train_time:87215ms step_avg:148.83ms step:597/1480 train_time:87369ms step_avg:148.84ms step:598/1480 train_time:87524ms step_avg:148.85ms step:599/1480 train_time:87678ms step_avg:148.86ms step:600/1480 train_time:87833ms step_avg:148.87ms step:601/1480 train_time:87988ms step_avg:148.88ms step:602/1480 train_time:88143ms step_avg:148.89ms step:603/1480 train_time:88298ms step_avg:148.90ms step:604/1480 train_time:88452ms step_avg:148.91ms step:605/1480 train_time:88607ms step_avg:148.92ms step:606/1480 train_time:88763ms step_avg:148.93ms step:607/1480 train_time:88918ms step_avg:148.94ms step:608/1480 train_time:89073ms step_avg:148.95ms step:609/1480 train_time:89228ms step_avg:148.96ms step:610/1480 train_time:89383ms step_avg:148.97ms 
step:611/1480 train_time:89537ms step_avg:148.98ms step:612/1480 train_time:89691ms step_avg:148.99ms step:613/1480 train_time:89847ms step_avg:149.00ms step:614/1480 train_time:90003ms step_avg:149.01ms step:615/1480 train_time:90158ms step_avg:149.02ms step:616/1480 train_time:90312ms step_avg:149.03ms step:617/1480 train_time:90468ms step_avg:149.04ms step:618/1480 train_time:90622ms step_avg:149.05ms step:619/1480 train_time:90777ms step_avg:149.06ms step:620/1480 train_time:90931ms step_avg:149.07ms step:621/1480 train_time:91087ms step_avg:149.08ms step:622/1480 train_time:91242ms step_avg:149.09ms step:623/1480 train_time:91397ms step_avg:149.10ms step:624/1480 train_time:91553ms step_avg:149.11ms step:625/1480 train_time:91707ms step_avg:149.12ms step:625/1480 val_loss:3.6103 train_time:91778ms step_avg:149.23ms step:626/1480 train_time:91869ms step_avg:149.14ms step:627/1480 train_time:92021ms step_avg:149.14ms step:628/1480 train_time:92176ms step_avg:149.15ms step:629/1480 train_time:92329ms step_avg:149.16ms step:630/1480 train_time:92484ms step_avg:149.17ms step:631/1480 train_time:92638ms step_avg:149.18ms step:632/1480 train_time:92793ms step_avg:149.18ms step:633/1480 train_time:92947ms step_avg:149.19ms step:634/1480 train_time:93103ms step_avg:149.20ms step:635/1480 train_time:93258ms step_avg:149.21ms step:636/1480 train_time:93413ms step_avg:149.22ms step:637/1480 train_time:93569ms step_avg:149.23ms step:638/1480 train_time:93722ms step_avg:149.24ms step:639/1480 train_time:93876ms step_avg:149.25ms step:640/1480 train_time:94030ms step_avg:149.25ms step:641/1480 train_time:94185ms step_avg:149.26ms step:642/1480 train_time:94340ms step_avg:149.27ms step:643/1480 train_time:94495ms step_avg:149.28ms step:644/1480 train_time:94649ms step_avg:149.29ms step:645/1480 train_time:94805ms step_avg:149.30ms step:646/1480 train_time:94961ms step_avg:149.31ms step:647/1480 train_time:95116ms step_avg:149.32ms step:648/1480 train_time:95270ms 
step_avg:149.33ms step:649/1480 train_time:95426ms step_avg:149.34ms step:650/1480 train_time:95581ms step_avg:149.34ms step:651/1480 train_time:95736ms step_avg:149.35ms step:652/1480 train_time:95891ms step_avg:149.36ms step:653/1480 train_time:96044ms step_avg:149.37ms step:654/1480 train_time:96201ms step_avg:149.38ms step:655/1480 train_time:96357ms step_avg:149.39ms step:656/1480 train_time:96512ms step_avg:149.40ms step:657/1480 train_time:96667ms step_avg:149.41ms step:658/1480 train_time:96820ms step_avg:149.41ms step:659/1480 train_time:96976ms step_avg:149.42ms step:660/1480 train_time:97131ms step_avg:149.43ms step:661/1480 train_time:97289ms step_avg:149.45ms step:662/1480 train_time:97444ms step_avg:149.45ms step:663/1480 train_time:97599ms step_avg:149.46ms step:664/1480 train_time:97755ms step_avg:149.47ms step:665/1480 train_time:97912ms step_avg:149.48ms step:666/1480 train_time:98068ms step_avg:149.49ms step:667/1480 train_time:98226ms step_avg:149.51ms step:668/1480 train_time:98381ms step_avg:149.52ms step:669/1480 train_time:98539ms step_avg:149.53ms step:670/1480 train_time:98695ms step_avg:149.54ms step:671/1480 train_time:98850ms step_avg:149.55ms step:672/1480 train_time:99007ms step_avg:149.56ms step:673/1480 train_time:99164ms step_avg:149.57ms step:674/1480 train_time:99320ms step_avg:149.58ms step:675/1480 train_time:99478ms step_avg:149.59ms step:676/1480 train_time:99635ms step_avg:149.60ms step:677/1480 train_time:99791ms step_avg:149.61ms step:678/1480 train_time:99947ms step_avg:149.62ms step:679/1480 train_time:100105ms step_avg:149.63ms step:680/1480 train_time:100262ms step_avg:149.64ms step:681/1480 train_time:100416ms step_avg:149.65ms step:682/1480 train_time:100573ms step_avg:149.66ms step:683/1480 train_time:100730ms step_avg:149.67ms step:684/1480 train_time:100887ms step_avg:149.68ms step:685/1480 train_time:101043ms step_avg:149.69ms step:686/1480 train_time:101200ms step_avg:149.70ms step:687/1480 train_time:101356ms 
step_avg:149.71ms step:688/1480 train_time:101512ms step_avg:149.72ms step:689/1480 train_time:101669ms step_avg:149.73ms step:690/1480 train_time:101827ms step_avg:149.75ms step:691/1480 train_time:101983ms step_avg:149.76ms step:692/1480 train_time:102139ms step_avg:149.76ms step:693/1480 train_time:102295ms step_avg:149.77ms step:694/1480 train_time:102452ms step_avg:149.78ms step:695/1480 train_time:102608ms step_avg:149.79ms step:696/1480 train_time:102764ms step_avg:149.80ms step:697/1480 train_time:102921ms step_avg:149.81ms step:698/1480 train_time:103078ms step_avg:149.82ms step:699/1480 train_time:103235ms step_avg:149.83ms step:700/1480 train_time:103392ms step_avg:149.84ms step:701/1480 train_time:103547ms step_avg:149.85ms step:702/1480 train_time:103705ms step_avg:149.86ms step:703/1480 train_time:103861ms step_avg:149.87ms step:704/1480 train_time:104016ms step_avg:149.88ms step:705/1480 train_time:104175ms step_avg:149.89ms step:706/1480 train_time:104333ms step_avg:149.90ms step:707/1480 train_time:104490ms step_avg:149.91ms step:708/1480 train_time:104645ms step_avg:149.92ms step:709/1480 train_time:104802ms step_avg:149.93ms step:710/1480 train_time:104958ms step_avg:149.94ms step:711/1480 train_time:105113ms step_avg:149.95ms step:712/1480 train_time:105270ms step_avg:149.96ms step:713/1480 train_time:105428ms step_avg:149.97ms step:714/1480 train_time:105585ms step_avg:149.98ms step:715/1480 train_time:105739ms step_avg:149.98ms step:716/1480 train_time:105895ms step_avg:149.99ms step:717/1480 train_time:106051ms step_avg:150.00ms step:718/1480 train_time:106208ms step_avg:150.01ms step:719/1480 train_time:106365ms step_avg:150.02ms step:720/1480 train_time:106522ms step_avg:150.03ms step:721/1480 train_time:106679ms step_avg:150.04ms step:722/1480 train_time:106836ms step_avg:150.05ms step:723/1480 train_time:106991ms step_avg:150.06ms step:724/1480 train_time:107147ms step_avg:150.07ms step:725/1480 train_time:107305ms step_avg:150.08ms 
step:726/1480 train_time:107461ms step_avg:150.08ms step:727/1480 train_time:107619ms step_avg:150.10ms step:728/1480 train_time:107776ms step_avg:150.11ms step:729/1480 train_time:107932ms step_avg:150.11ms step:730/1480 train_time:108091ms step_avg:150.13ms step:731/1480 train_time:108247ms step_avg:150.13ms step:732/1480 train_time:108403ms step_avg:150.14ms step:733/1480 train_time:108559ms step_avg:150.15ms step:734/1480 train_time:108715ms step_avg:150.16ms step:735/1480 train_time:108871ms step_avg:150.17ms step:736/1480 train_time:109027ms step_avg:150.18ms step:737/1480 train_time:109183ms step_avg:150.18ms step:738/1480 train_time:109339ms step_avg:150.19ms step:739/1480 train_time:109494ms step_avg:150.20ms step:740/1480 train_time:109652ms step_avg:150.21ms step:741/1480 train_time:109809ms step_avg:150.22ms step:742/1480 train_time:109966ms step_avg:150.23ms step:743/1480 train_time:110122ms step_avg:150.23ms step:744/1480 train_time:110278ms step_avg:150.24ms step:745/1480 train_time:110435ms step_avg:150.25ms step:746/1480 train_time:110591ms step_avg:150.26ms step:747/1480 train_time:110748ms step_avg:150.27ms step:748/1480 train_time:110907ms step_avg:150.28ms step:749/1480 train_time:111063ms step_avg:150.29ms step:750/1480 train_time:111218ms step_avg:150.29ms step:750/1480 val_loss:3.5504 train_time:111291ms step_avg:150.39ms step:751/1480 train_time:111381ms step_avg:150.31ms step:752/1480 train_time:111539ms step_avg:150.32ms step:753/1480 train_time:111695ms step_avg:150.33ms step:754/1480 train_time:111851ms step_avg:150.34ms step:755/1480 train_time:112008ms step_avg:150.35ms step:756/1480 train_time:112164ms step_avg:150.35ms step:757/1480 train_time:112320ms step_avg:150.36ms step:758/1480 train_time:112477ms step_avg:150.37ms step:759/1480 train_time:112650ms step_avg:150.40ms step:760/1480 train_time:112790ms step_avg:150.39ms step:761/1480 train_time:112946ms step_avg:150.39ms step:762/1480 train_time:113104ms step_avg:150.40ms 
step:763/1480 train_time:113261ms step_avg:150.41ms step:764/1480 train_time:113417ms step_avg:150.42ms step:765/1480 train_time:113574ms step_avg:150.43ms step:766/1480 train_time:113731ms step_avg:150.44ms step:767/1480 train_time:113888ms step_avg:150.45ms step:768/1480 train_time:114045ms step_avg:150.46ms step:769/1480 train_time:114202ms step_avg:150.46ms step:770/1480 train_time:114360ms step_avg:150.47ms step:771/1480 train_time:114517ms step_avg:150.48ms step:772/1480 train_time:114675ms step_avg:150.49ms step:773/1480 train_time:114832ms step_avg:150.50ms step:774/1480 train_time:114989ms step_avg:150.51ms step:775/1480 train_time:115146ms step_avg:150.52ms step:776/1480 train_time:115305ms step_avg:150.53ms step:777/1480 train_time:115465ms step_avg:150.54ms step:778/1480 train_time:115623ms step_avg:150.55ms step:779/1480 train_time:115780ms step_avg:150.56ms step:780/1480 train_time:115940ms step_avg:150.57ms step:781/1480 train_time:116096ms step_avg:150.58ms step:782/1480 train_time:116254ms step_avg:150.59ms step:783/1480 train_time:116412ms step_avg:150.60ms step:784/1480 train_time:116570ms step_avg:150.61ms step:785/1480 train_time:116726ms step_avg:150.61ms step:786/1480 train_time:116884ms step_avg:150.62ms step:787/1480 train_time:117042ms step_avg:150.63ms step:788/1480 train_time:117201ms step_avg:150.64ms step:789/1480 train_time:117361ms step_avg:150.66ms step:790/1480 train_time:117516ms step_avg:150.66ms step:791/1480 train_time:117677ms step_avg:150.67ms step:792/1480 train_time:117835ms step_avg:150.68ms step:793/1480 train_time:117992ms step_avg:150.69ms step:794/1480 train_time:118152ms step_avg:150.70ms step:795/1480 train_time:118314ms step_avg:150.72ms step:796/1480 train_time:118475ms step_avg:150.73ms step:797/1480 train_time:118636ms step_avg:150.74ms step:798/1480 train_time:118796ms step_avg:150.76ms step:799/1480 train_time:118956ms step_avg:150.77ms step:800/1480 train_time:119115ms step_avg:150.78ms step:801/1480 
train_time:119272ms step_avg:150.79ms step:802/1480 train_time:119432ms step_avg:150.80ms step:803/1480 train_time:119589ms step_avg:150.81ms step:804/1480 train_time:119748ms step_avg:150.82ms step:805/1480 train_time:119908ms step_avg:150.83ms step:806/1480 train_time:120066ms step_avg:150.84ms step:807/1480 train_time:120222ms step_avg:150.84ms step:808/1480 train_time:120381ms step_avg:150.85ms step:809/1480 train_time:120538ms step_avg:150.86ms step:810/1480 train_time:120694ms step_avg:150.87ms step:811/1480 train_time:120853ms step_avg:150.88ms step:812/1480 train_time:121010ms step_avg:150.89ms step:813/1480 train_time:121166ms step_avg:150.89ms step:814/1480 train_time:121323ms step_avg:150.90ms step:815/1480 train_time:121480ms step_avg:150.91ms step:816/1480 train_time:121639ms step_avg:150.92ms step:817/1480 train_time:121796ms step_avg:150.92ms step:818/1480 train_time:121954ms step_avg:150.93ms step:819/1480 train_time:122112ms step_avg:150.94ms step:820/1480 train_time:122269ms step_avg:150.95ms step:821/1480 train_time:122425ms step_avg:150.96ms step:822/1480 train_time:122584ms step_avg:150.97ms step:823/1480 train_time:122742ms step_avg:150.97ms step:824/1480 train_time:122899ms step_avg:150.98ms step:825/1480 train_time:123059ms step_avg:150.99ms step:826/1480 train_time:123219ms step_avg:151.00ms step:827/1480 train_time:123379ms step_avg:151.01ms step:828/1480 train_time:123538ms step_avg:151.02ms step:829/1480 train_time:123698ms step_avg:151.03ms step:830/1480 train_time:123858ms step_avg:151.05ms step:831/1480 train_time:124016ms step_avg:151.05ms step:832/1480 train_time:124177ms step_avg:151.07ms step:833/1480 train_time:124335ms step_avg:151.08ms step:834/1480 train_time:124494ms step_avg:151.09ms step:835/1480 train_time:124652ms step_avg:151.09ms step:836/1480 train_time:124810ms step_avg:151.10ms step:837/1480 train_time:124968ms step_avg:151.11ms step:838/1480 train_time:125125ms step_avg:151.12ms step:839/1480 train_time:125283ms 
step_avg:151.13ms step:840/1480 train_time:125441ms step_avg:151.13ms step:841/1480 train_time:125599ms step_avg:151.14ms step:842/1480 train_time:125760ms step_avg:151.15ms step:843/1480 train_time:125916ms step_avg:151.16ms step:844/1480 train_time:126073ms step_avg:151.17ms step:845/1480 train_time:126229ms step_avg:151.17ms step:846/1480 train_time:126390ms step_avg:151.18ms step:847/1480 train_time:126548ms step_avg:151.19ms step:848/1480 train_time:126704ms step_avg:151.20ms step:849/1480 train_time:126863ms step_avg:151.21ms step:850/1480 train_time:127021ms step_avg:151.22ms step:851/1480 train_time:127181ms step_avg:151.23ms step:852/1480 train_time:127339ms step_avg:151.23ms step:853/1480 train_time:127495ms step_avg:151.24ms step:854/1480 train_time:127652ms step_avg:151.25ms step:855/1480 train_time:127809ms step_avg:151.25ms step:856/1480 train_time:127967ms step_avg:151.26ms step:857/1480 train_time:128124ms step_avg:151.27ms step:858/1480 train_time:128285ms step_avg:151.28ms step:859/1480 train_time:128444ms step_avg:151.29ms step:860/1480 train_time:128601ms step_avg:151.30ms step:861/1480 train_time:128760ms step_avg:151.30ms step:862/1480 train_time:128920ms step_avg:151.31ms step:863/1480 train_time:129080ms step_avg:151.32ms step:864/1480 train_time:129239ms step_avg:151.33ms step:865/1480 train_time:129396ms step_avg:151.34ms step:866/1480 train_time:129554ms step_avg:151.35ms step:867/1480 train_time:129713ms step_avg:151.36ms step:868/1480 train_time:129870ms step_avg:151.36ms step:869/1480 train_time:130028ms step_avg:151.37ms step:870/1480 train_time:130187ms step_avg:151.38ms step:871/1480 train_time:130345ms step_avg:151.39ms step:872/1480 train_time:130503ms step_avg:151.40ms step:873/1480 train_time:130660ms step_avg:151.40ms step:874/1480 train_time:130819ms step_avg:151.41ms step:875/1480 train_time:130978ms step_avg:151.42ms step:875/1480 val_loss:3.5055 train_time:131049ms step_avg:151.50ms step:876/1480 train_time:131140ms 
step_avg:151.43ms step:877/1480 train_time:131296ms step_avg:151.44ms step:878/1480 train_time:131455ms step_avg:151.45ms step:879/1480 train_time:131613ms step_avg:151.45ms step:880/1480 train_time:131771ms step_avg:151.46ms step:881/1480 train_time:131928ms step_avg:151.47ms step:882/1480 train_time:132088ms step_avg:151.48ms step:883/1480 train_time:132247ms step_avg:151.49ms step:884/1480 train_time:132407ms step_avg:151.50ms step:885/1480 train_time:132566ms step_avg:151.50ms step:886/1480 train_time:132728ms step_avg:151.52ms step:887/1480 train_time:132887ms step_avg:151.52ms step:888/1480 train_time:133050ms step_avg:151.54ms step:889/1480 train_time:133210ms step_avg:151.55ms step:890/1480 train_time:133368ms step_avg:151.55ms step:891/1480 train_time:133526ms step_avg:151.56ms step:892/1480 train_time:133686ms step_avg:151.57ms step:893/1480 train_time:133845ms step_avg:151.58ms step:894/1480 train_time:134005ms step_avg:151.59ms step:895/1480 train_time:134166ms step_avg:151.60ms step:896/1480 train_time:134326ms step_avg:151.61ms step:897/1480 train_time:134484ms step_avg:151.62ms step:898/1480 train_time:134644ms step_avg:151.63ms step:899/1480 train_time:134803ms step_avg:151.63ms step:900/1480 train_time:134961ms step_avg:151.64ms step:901/1480 train_time:135120ms step_avg:151.65ms step:902/1480 train_time:135279ms step_avg:151.66ms step:903/1480 train_time:135440ms step_avg:151.67ms step:904/1480 train_time:135600ms step_avg:151.68ms step:905/1480 train_time:135759ms step_avg:151.69ms step:906/1480 train_time:135919ms step_avg:151.70ms step:907/1480 train_time:136081ms step_avg:151.71ms step:908/1480 train_time:136239ms step_avg:151.71ms step:909/1480 train_time:136398ms step_avg:151.72ms step:910/1480 train_time:136562ms step_avg:151.74ms step:911/1480 train_time:136721ms step_avg:151.74ms step:912/1480 train_time:136881ms step_avg:151.75ms step:913/1480 train_time:137042ms step_avg:151.76ms step:914/1480 train_time:137203ms step_avg:151.77ms 
step:915/1480 train_time:137364ms step_avg:151.78ms step:916/1480 train_time:137523ms step_avg:151.79ms step:917/1480 train_time:137681ms step_avg:151.80ms step:918/1480 train_time:137843ms step_avg:151.81ms step:919/1480 train_time:138006ms step_avg:151.82ms step:920/1480 train_time:138165ms step_avg:151.83ms step:921/1480 train_time:138324ms step_avg:151.84ms step:922/1480 train_time:138484ms step_avg:151.85ms step:923/1480 train_time:138642ms step_avg:151.85ms step:924/1480 train_time:138801ms step_avg:151.86ms step:925/1480 train_time:138961ms step_avg:151.87ms step:926/1480 train_time:139119ms step_avg:151.88ms step:927/1480 train_time:139279ms step_avg:151.89ms step:928/1480 train_time:139439ms step_avg:151.89ms step:929/1480 train_time:139599ms step_avg:151.90ms step:930/1480 train_time:139757ms step_avg:151.91ms step:931/1480 train_time:139916ms step_avg:151.92ms step:932/1480 train_time:140075ms step_avg:151.93ms step:933/1480 train_time:140236ms step_avg:151.93ms step:934/1480 train_time:140396ms step_avg:151.94ms step:935/1480 train_time:140556ms step_avg:151.95ms step:936/1480 train_time:140716ms step_avg:151.96ms step:937/1480 train_time:140876ms step_avg:151.97ms step:938/1480 train_time:141034ms step_avg:151.98ms step:939/1480 train_time:141196ms step_avg:151.99ms step:940/1480 train_time:141358ms step_avg:152.00ms step:941/1480 train_time:141517ms step_avg:152.01ms step:942/1480 train_time:141676ms step_avg:152.01ms step:943/1480 train_time:141835ms step_avg:152.02ms step:944/1480 train_time:141999ms step_avg:152.03ms step:945/1480 train_time:142160ms step_avg:152.04ms step:946/1480 train_time:142322ms step_avg:152.05ms step:947/1480 train_time:142483ms step_avg:152.06ms step:948/1480 train_time:142643ms step_avg:152.07ms step:949/1480 train_time:142817ms step_avg:152.09ms step:950/1480 train_time:142962ms step_avg:152.09ms step:951/1480 train_time:143123ms step_avg:152.10ms step:952/1480 train_time:143282ms step_avg:152.10ms step:953/1480 
train_time:143441ms step_avg:152.11ms step:954/1480 train_time:143604ms step_avg:152.12ms step:955/1480 train_time:143762ms step_avg:152.13ms step:956/1480 train_time:143921ms step_avg:152.14ms step:957/1480 train_time:144082ms step_avg:152.15ms step:958/1480 train_time:144248ms step_avg:152.16ms step:959/1480 train_time:144407ms step_avg:152.17ms step:960/1480 train_time:144567ms step_avg:152.18ms step:961/1480 train_time:144726ms step_avg:152.18ms step:962/1480 train_time:144885ms step_avg:152.19ms step:963/1480 train_time:145046ms step_avg:152.20ms step:964/1480 train_time:145207ms step_avg:152.21ms step:965/1480 train_time:145365ms step_avg:152.21ms step:966/1480 train_time:145524ms step_avg:152.22ms step:967/1480 train_time:145681ms step_avg:152.23ms step:968/1480 train_time:145840ms step_avg:152.23ms step:969/1480 train_time:146001ms step_avg:152.24ms step:970/1480 train_time:146160ms step_avg:152.25ms step:971/1480 train_time:146319ms step_avg:152.26ms step:972/1480 train_time:146478ms step_avg:152.26ms step:973/1480 train_time:146636ms step_avg:152.27ms step:974/1480 train_time:146797ms step_avg:152.28ms step:975/1480 train_time:146958ms step_avg:152.29ms step:976/1480 train_time:147120ms step_avg:152.30ms step:977/1480 train_time:147279ms step_avg:152.31ms step:978/1480 train_time:147439ms step_avg:152.31ms step:979/1480 train_time:147602ms step_avg:152.32ms step:980/1480 train_time:147761ms step_avg:152.33ms step:981/1480 train_time:147922ms step_avg:152.34ms step:982/1480 train_time:148080ms step_avg:152.35ms step:983/1480 train_time:148241ms step_avg:152.35ms step:984/1480 train_time:148401ms step_avg:152.36ms step:985/1480 train_time:148563ms step_avg:152.37ms step:986/1480 train_time:148724ms step_avg:152.38ms step:987/1480 train_time:148882ms step_avg:152.39ms step:988/1480 train_time:149042ms step_avg:152.39ms step:989/1480 train_time:149201ms step_avg:152.40ms step:990/1480 train_time:149363ms step_avg:152.41ms step:991/1480 train_time:149524ms 
step_avg:152.42ms step:992/1480 train_time:149687ms step_avg:152.43ms step:993/1480 train_time:149855ms step_avg:152.45ms step:994/1480 train_time:150016ms step_avg:152.46ms step:995/1480 train_time:150176ms step_avg:152.46ms step:996/1480 train_time:150334ms step_avg:152.47ms step:997/1480 train_time:150494ms step_avg:152.48ms step:998/1480 train_time:150652ms step_avg:152.48ms step:999/1480 train_time:150814ms step_avg:152.49ms step:1000/1480 train_time:150977ms step_avg:152.50ms step:1000/1480 val_loss:3.4442 train_time:151050ms step_avg:152.58ms step:1001/1480 train_time:151141ms step_avg:152.51ms step:1002/1480 train_time:151299ms step_avg:152.52ms step:1003/1480 train_time:151463ms step_avg:152.53ms step:1004/1480 train_time:151624ms step_avg:152.54ms step:1005/1480 train_time:151785ms step_avg:152.55ms step:1006/1480 train_time:151946ms step_avg:152.56ms step:1007/1480 train_time:152107ms step_avg:152.57ms step:1008/1480 train_time:152267ms step_avg:152.57ms step:1009/1480 train_time:152432ms step_avg:152.58ms step:1010/1480 train_time:152592ms step_avg:152.59ms step:1011/1480 train_time:152750ms step_avg:152.60ms step:1012/1480 train_time:152908ms step_avg:152.60ms step:1013/1480 train_time:153069ms step_avg:152.61ms step:1014/1480 train_time:153228ms step_avg:152.62ms step:1015/1480 train_time:153394ms step_avg:152.63ms step:1016/1480 train_time:153554ms step_avg:152.64ms step:1017/1480 train_time:153715ms step_avg:152.65ms step:1018/1480 train_time:153876ms step_avg:152.65ms step:1019/1480 train_time:154036ms step_avg:152.66ms step:1020/1480 train_time:154197ms step_avg:152.67ms step:1021/1480 train_time:154358ms step_avg:152.68ms step:1022/1480 train_time:154518ms step_avg:152.69ms step:1023/1480 train_time:154678ms step_avg:152.69ms step:1024/1480 train_time:154837ms step_avg:152.70ms step:1025/1480 train_time:154999ms step_avg:152.71ms step:1026/1480 train_time:155158ms step_avg:152.71ms step:1027/1480 train_time:155316ms step_avg:152.72ms 
step:1028/1480 train_time:155479ms step_avg:152.73ms step:1029/1480 train_time:155644ms step_avg:152.74ms step:1030/1480 train_time:155805ms step_avg:152.75ms step:1031/1480 train_time:155965ms step_avg:152.76ms step:1032/1480 train_time:156128ms step_avg:152.77ms step:1033/1480 train_time:156289ms step_avg:152.78ms step:1034/1480 train_time:156449ms step_avg:152.78ms step:1035/1480 train_time:156609ms step_avg:152.79ms step:1036/1480 train_time:156769ms step_avg:152.80ms step:1037/1480 train_time:156930ms step_avg:152.80ms step:1038/1480 train_time:157091ms step_avg:152.81ms step:1039/1480 train_time:157253ms step_avg:152.82ms step:1040/1480 train_time:157414ms step_avg:152.83ms step:1041/1480 train_time:157575ms step_avg:152.84ms step:1042/1480 train_time:157732ms step_avg:152.84ms step:1043/1480 train_time:157892ms step_avg:152.85ms step:1044/1480 train_time:158053ms step_avg:152.86ms step:1045/1480 train_time:158214ms step_avg:152.86ms step:1046/1480 train_time:158373ms step_avg:152.87ms step:1047/1480 train_time:158533ms step_avg:152.88ms step:1048/1480 train_time:158695ms step_avg:152.89ms step:1049/1480 train_time:158856ms step_avg:152.89ms step:1050/1480 train_time:159016ms step_avg:152.90ms step:1051/1480 train_time:159178ms step_avg:152.91ms step:1052/1480 train_time:159339ms step_avg:152.92ms step:1053/1480 train_time:159499ms step_avg:152.92ms step:1054/1480 train_time:159660ms step_avg:152.93ms step:1055/1480 train_time:159819ms step_avg:152.94ms step:1056/1480 train_time:159979ms step_avg:152.94ms step:1057/1480 train_time:160139ms step_avg:152.95ms step:1058/1480 train_time:160301ms step_avg:152.96ms step:1059/1480 train_time:160464ms step_avg:152.97ms step:1060/1480 train_time:160628ms step_avg:152.98ms step:1061/1480 train_time:160786ms step_avg:152.98ms step:1062/1480 train_time:160944ms step_avg:152.99ms step:1063/1480 train_time:161104ms step_avg:153.00ms step:1064/1480 train_time:161262ms step_avg:153.00ms step:1065/1480 train_time:161421ms 
step_avg:153.01ms step:1066/1480 train_time:161584ms step_avg:153.01ms step:1067/1480 train_time:161747ms step_avg:153.02ms step:1068/1480 train_time:161909ms step_avg:153.03ms step:1069/1480 train_time:162072ms step_avg:153.04ms step:1070/1480 train_time:162232ms step_avg:153.05ms step:1071/1480 train_time:162396ms step_avg:153.06ms step:1072/1480 train_time:162556ms step_avg:153.07ms step:1073/1480 train_time:162713ms step_avg:153.07ms step:1074/1480 train_time:162872ms step_avg:153.08ms step:1075/1480 train_time:163032ms step_avg:153.08ms step:1076/1480 train_time:163192ms step_avg:153.09ms step:1077/1480 train_time:163353ms step_avg:153.10ms step:1078/1480 train_time:163517ms step_avg:153.11ms step:1079/1480 train_time:163680ms step_avg:153.12ms step:1080/1480 train_time:163841ms step_avg:153.12ms step:1081/1480 train_time:164000ms step_avg:153.13ms step:1082/1480 train_time:164159ms step_avg:153.13ms step:1083/1480 train_time:164319ms step_avg:153.14ms step:1084/1480 train_time:164478ms step_avg:153.15ms step:1085/1480 train_time:164639ms step_avg:153.15ms step:1086/1480 train_time:164799ms step_avg:153.16ms step:1087/1480 train_time:164960ms step_avg:153.17ms step:1088/1480 train_time:165122ms step_avg:153.17ms step:1089/1480 train_time:165289ms step_avg:153.19ms step:1090/1480 train_time:165452ms step_avg:153.20ms step:1091/1480 train_time:165613ms step_avg:153.20ms step:1092/1480 train_time:165774ms step_avg:153.21ms step:1093/1480 train_time:165934ms step_avg:153.22ms step:1094/1480 train_time:166096ms step_avg:153.22ms step:1095/1480 train_time:166256ms step_avg:153.23ms step:1096/1480 train_time:166421ms step_avg:153.24ms step:1097/1480 train_time:166581ms step_avg:153.25ms step:1098/1480 train_time:166743ms step_avg:153.26ms step:1099/1480 train_time:166905ms step_avg:153.26ms step:1100/1480 train_time:167071ms step_avg:153.28ms step:1101/1480 train_time:167233ms step_avg:153.28ms step:1102/1480 train_time:167395ms step_avg:153.29ms step:1103/1480 
train_time:167561ms step_avg:153.30ms step:1104/1480 train_time:167723ms step_avg:153.31ms step:1105/1480 train_time:167889ms step_avg:153.32ms step:1106/1480 train_time:168052ms step_avg:153.33ms step:1107/1480 train_time:168213ms step_avg:153.34ms step:1108/1480 train_time:168373ms step_avg:153.35ms step:1109/1480 train_time:168534ms step_avg:153.35ms step:1110/1480 train_time:168695ms step_avg:153.36ms step:1111/1480 train_time:168857ms step_avg:153.37ms step:1112/1480 train_time:169019ms step_avg:153.37ms step:1113/1480 train_time:169188ms step_avg:153.39ms step:1114/1480 train_time:169352ms step_avg:153.40ms step:1115/1480 train_time:169514ms step_avg:153.41ms step:1116/1480 train_time:169675ms step_avg:153.41ms step:1117/1480 train_time:169838ms step_avg:153.42ms step:1118/1480 train_time:170003ms step_avg:153.43ms step:1119/1480 train_time:170164ms step_avg:153.44ms step:1120/1480 train_time:170327ms step_avg:153.45ms step:1121/1480 train_time:170490ms step_avg:153.46ms step:1122/1480 train_time:170651ms step_avg:153.46ms step:1123/1480 train_time:170811ms step_avg:153.47ms step:1124/1480 train_time:170974ms step_avg:153.48ms step:1125/1480 train_time:171136ms step_avg:153.48ms step:1125/1480 val_loss:3.3873 train_time:171210ms step_avg:153.55ms step:1126/1480 train_time:171300ms step_avg:153.49ms step:1127/1480 train_time:171460ms step_avg:153.50ms step:1128/1480 train_time:171621ms step_avg:153.51ms step:1129/1480 train_time:171782ms step_avg:153.51ms step:1130/1480 train_time:171942ms step_avg:153.52ms step:1131/1480 train_time:172112ms step_avg:153.53ms step:1132/1480 train_time:172273ms step_avg:153.54ms step:1133/1480 train_time:172437ms step_avg:153.55ms step:1134/1480 train_time:172601ms step_avg:153.56ms step:1135/1480 train_time:172761ms step_avg:153.57ms step:1136/1480 train_time:172922ms step_avg:153.57ms step:1137/1480 train_time:173084ms step_avg:153.58ms step:1138/1480 train_time:173250ms step_avg:153.59ms step:1139/1480 train_time:173426ms 
step_avg:153.61ms step:1140/1480 train_time:173573ms step_avg:153.60ms step:1141/1480 train_time:173738ms step_avg:153.61ms step:1142/1480 train_time:173899ms step_avg:153.62ms step:1143/1480 train_time:174062ms step_avg:153.63ms step:1144/1480 train_time:174224ms step_avg:153.64ms step:1145/1480 train_time:174383ms step_avg:153.64ms step:1146/1480 train_time:174546ms step_avg:153.65ms step:1147/1480 train_time:174710ms step_avg:153.66ms step:1148/1480 train_time:174872ms step_avg:153.67ms step:1149/1480 train_time:175036ms step_avg:153.68ms step:1150/1480 train_time:175197ms step_avg:153.68ms step:1151/1480 train_time:175360ms step_avg:153.69ms step:1152/1480 train_time:175523ms step_avg:153.70ms step:1153/1480 train_time:175687ms step_avg:153.71ms step:1154/1480 train_time:175848ms step_avg:153.71ms step:1155/1480 train_time:176011ms step_avg:153.72ms step:1156/1480 train_time:176178ms step_avg:153.73ms step:1157/1480 train_time:176341ms step_avg:153.74ms step:1158/1480 train_time:176501ms step_avg:153.75ms step:1159/1480 train_time:176661ms step_avg:153.75ms step:1160/1480 train_time:176820ms step_avg:153.76ms step:1161/1480 train_time:176983ms step_avg:153.76ms step:1162/1480 train_time:177144ms step_avg:153.77ms step:1163/1480 train_time:177308ms step_avg:153.78ms step:1164/1480 train_time:177471ms step_avg:153.79ms step:1165/1480 train_time:177632ms step_avg:153.79ms step:1166/1480 train_time:177794ms step_avg:153.80ms step:1167/1480 train_time:177955ms step_avg:153.81ms step:1168/1480 train_time:178118ms step_avg:153.82ms step:1169/1480 train_time:178279ms step_avg:153.82ms step:1170/1480 train_time:178440ms step_avg:153.83ms step:1171/1480 train_time:178601ms step_avg:153.83ms step:1172/1480 train_time:178762ms step_avg:153.84ms step:1173/1480 train_time:178924ms step_avg:153.85ms step:1174/1480 train_time:179095ms step_avg:153.86ms step:1175/1480 train_time:179258ms step_avg:153.87ms step:1176/1480 train_time:179421ms step_avg:153.88ms step:1177/1480 
train_time:179587ms step_avg:153.89ms step:1178/1480 train_time:179749ms step_avg:153.89ms step:1179/1480 train_time:179909ms step_avg:153.90ms step:1180/1480 train_time:180077ms step_avg:153.91ms step:1181/1480 train_time:180241ms step_avg:153.92ms step:1182/1480 train_time:180401ms step_avg:153.93ms step:1183/1480 train_time:180563ms step_avg:153.93ms step:1184/1480 train_time:180725ms step_avg:153.94ms step:1185/1480 train_time:180890ms step_avg:153.95ms step:1186/1480 train_time:181053ms step_avg:153.96ms step:1187/1480 train_time:181225ms step_avg:153.97ms step:1188/1480 train_time:181383ms step_avg:153.98ms step:1189/1480 train_time:181545ms step_avg:153.98ms step:1190/1480 train_time:181708ms step_avg:153.99ms step:1191/1480 train_time:181871ms step_avg:154.00ms step:1192/1480 train_time:182033ms step_avg:154.00ms step:1193/1480 train_time:182193ms step_avg:154.01ms step:1194/1480 train_time:182355ms step_avg:154.02ms step:1195/1480 train_time:182516ms step_avg:154.02ms step:1196/1480 train_time:182688ms step_avg:154.04ms step:1197/1480 train_time:182852ms step_avg:154.05ms step:1198/1480 train_time:183020ms step_avg:154.06ms step:1199/1480 train_time:183182ms step_avg:154.06ms step:1200/1480 train_time:183342ms step_avg:154.07ms step:1201/1480 train_time:183503ms step_avg:154.07ms step:1202/1480 train_time:183672ms step_avg:154.09ms step:1203/1480 train_time:183838ms step_avg:154.10ms step:1204/1480 train_time:184002ms step_avg:154.11ms step:1205/1480 train_time:184162ms step_avg:154.11ms step:1206/1480 train_time:184324ms step_avg:154.12ms step:1207/1480 train_time:184483ms step_avg:154.12ms step:1208/1480 train_time:184643ms step_avg:154.13ms step:1209/1480 train_time:184809ms step_avg:154.14ms step:1210/1480 train_time:184974ms step_avg:154.15ms step:1211/1480 train_time:185137ms step_avg:154.15ms step:1212/1480 train_time:185299ms step_avg:154.16ms step:1213/1480 train_time:185463ms step_avg:154.17ms step:1214/1480 train_time:185629ms step_avg:154.18ms 
step:1215/1480 train_time:185795ms step_avg:154.19ms step:1216/1480 train_time:185956ms step_avg:154.19ms step:1217/1480 train_time:186120ms step_avg:154.20ms step:1218/1480 train_time:186282ms step_avg:154.21ms step:1219/1480 train_time:186449ms step_avg:154.22ms step:1220/1480 train_time:186613ms step_avg:154.23ms step:1221/1480 train_time:186774ms step_avg:154.23ms step:1222/1480 train_time:186935ms step_avg:154.24ms step:1223/1480 train_time:187098ms step_avg:154.24ms step:1224/1480 train_time:187263ms step_avg:154.25ms step:1225/1480 train_time:187425ms step_avg:154.26ms step:1226/1480 train_time:187590ms step_avg:154.27ms step:1227/1480 train_time:187756ms step_avg:154.28ms step:1228/1480 train_time:187918ms step_avg:154.28ms step:1229/1480 train_time:188080ms step_avg:154.29ms step:1230/1480 train_time:188248ms step_avg:154.30ms step:1231/1480 train_time:188414ms step_avg:154.31ms step:1232/1480 train_time:188578ms step_avg:154.32ms step:1233/1480 train_time:188740ms step_avg:154.33ms step:1234/1480 train_time:188901ms step_avg:154.33ms step:1235/1480 train_time:189065ms step_avg:154.34ms step:1236/1480 train_time:189228ms step_avg:154.35ms step:1237/1480 train_time:189389ms step_avg:154.35ms step:1238/1480 train_time:189563ms step_avg:154.37ms step:1239/1480 train_time:189725ms step_avg:154.37ms step:1240/1480 train_time:189891ms step_avg:154.38ms step:1241/1480 train_time:190057ms step_avg:154.39ms step:1242/1480 train_time:190218ms step_avg:154.40ms step:1243/1480 train_time:190381ms step_avg:154.40ms step:1244/1480 train_time:190541ms step_avg:154.41ms step:1245/1480 train_time:190703ms step_avg:154.42ms step:1246/1480 train_time:190866ms step_avg:154.42ms step:1247/1480 train_time:191029ms step_avg:154.43ms step:1248/1480 train_time:191192ms step_avg:154.44ms step:1249/1480 train_time:191353ms step_avg:154.44ms step:1250/1480 train_time:191516ms step_avg:154.45ms step:1250/1480 val_loss:3.3380 train_time:191591ms step_avg:154.51ms step:1251/1480 
train_time:191683ms step_avg:154.46ms step:1252/1480 train_time:191847ms step_avg:154.47ms step:1253/1480 train_time:192007ms step_avg:154.47ms step:1254/1480 train_time:192169ms step_avg:154.48ms step:1255/1480 train_time:192340ms step_avg:154.49ms step:1256/1480 train_time:192504ms step_avg:154.50ms step:1257/1480 train_time:192666ms step_avg:154.50ms step:1258/1480 train_time:192831ms step_avg:154.51ms step:1259/1480 train_time:192995ms step_avg:154.52ms step:1260/1480 train_time:193156ms step_avg:154.52ms step:1261/1480 train_time:193317ms step_avg:154.53ms step:1262/1480 train_time:193481ms step_avg:154.54ms step:1263/1480 train_time:193648ms step_avg:154.55ms step:1264/1480 train_time:193807ms step_avg:154.55ms step:1265/1480 train_time:193968ms step_avg:154.56ms step:1266/1480 train_time:194132ms step_avg:154.56ms step:1267/1480 train_time:194295ms step_avg:154.57ms step:1268/1480 train_time:194458ms step_avg:154.58ms step:1269/1480 train_time:194622ms step_avg:154.58ms step:1270/1480 train_time:194784ms step_avg:154.59ms step:1271/1480 train_time:194947ms step_avg:154.60ms step:1272/1480 train_time:195108ms step_avg:154.60ms step:1273/1480 train_time:195271ms step_avg:154.61ms step:1274/1480 train_time:195435ms step_avg:154.62ms step:1275/1480 train_time:195595ms step_avg:154.62ms step:1276/1480 train_time:195755ms step_avg:154.62ms step:1277/1480 train_time:195916ms step_avg:154.63ms step:1278/1480 train_time:196077ms step_avg:154.63ms step:1279/1480 train_time:196240ms step_avg:154.64ms step:1280/1480 train_time:196407ms step_avg:154.65ms step:1281/1480 train_time:196571ms step_avg:154.66ms step:1282/1480 train_time:196731ms step_avg:154.66ms step:1283/1480 train_time:196894ms step_avg:154.67ms step:1284/1480 train_time:197057ms step_avg:154.68ms step:1285/1480 train_time:197217ms step_avg:154.68ms step:1286/1480 train_time:197379ms step_avg:154.69ms step:1287/1480 train_time:197541ms step_avg:154.69ms step:1288/1480 train_time:197704ms step_avg:154.70ms 
step:1289/1480 train_time:197874ms step_avg:154.71ms step:1290/1480 train_time:198042ms step_avg:154.72ms step:1291/1480 train_time:198207ms step_avg:154.73ms step:1292/1480 train_time:198371ms step_avg:154.74ms step:1293/1480 train_time:198538ms step_avg:154.75ms step:1294/1480 train_time:198700ms step_avg:154.75ms step:1295/1480 train_time:198862ms step_avg:154.76ms step:1296/1480 train_time:199024ms step_avg:154.76ms step:1297/1480 train_time:199188ms step_avg:154.77ms step:1298/1480 train_time:199353ms step_avg:154.78ms step:1299/1480 train_time:199516ms step_avg:154.78ms step:1300/1480 train_time:199677ms step_avg:154.79ms step:1301/1480 train_time:199837ms step_avg:154.79ms step:1302/1480 train_time:200002ms step_avg:154.80ms step:1303/1480 train_time:200168ms step_avg:154.81ms step:1304/1480 train_time:200334ms step_avg:154.82ms step:1305/1480 train_time:200496ms step_avg:154.82ms step:1306/1480 train_time:200659ms step_avg:154.83ms step:1307/1480 train_time:200819ms step_avg:154.83ms step:1308/1480 train_time:200980ms step_avg:154.84ms step:1309/1480 train_time:201145ms step_avg:154.85ms step:1310/1480 train_time:201306ms step_avg:154.85ms step:1311/1480 train_time:201468ms step_avg:154.86ms step:1312/1480 train_time:201633ms step_avg:154.86ms step:1313/1480 train_time:201796ms step_avg:154.87ms step:1314/1480 train_time:201960ms step_avg:154.88ms step:1315/1480 train_time:202122ms step_avg:154.88ms step:1316/1480 train_time:202282ms step_avg:154.89ms step:1317/1480 train_time:202444ms step_avg:154.89ms step:1318/1480 train_time:202611ms step_avg:154.90ms step:1319/1480 train_time:202779ms step_avg:154.91ms step:1320/1480 train_time:202946ms step_avg:154.92ms step:1321/1480 train_time:203111ms step_avg:154.93ms step:1322/1480 train_time:203281ms step_avg:154.94ms step:1323/1480 train_time:203445ms step_avg:154.95ms step:1324/1480 train_time:203609ms step_avg:154.95ms step:1325/1480 train_time:203779ms step_avg:154.96ms step:1326/1480 train_time:203944ms 
step_avg:154.97ms step:1327/1480 train_time:204108ms step_avg:154.98ms step:1328/1480 train_time:204270ms step_avg:154.99ms step:1329/1480 train_time:204459ms step_avg:155.01ms step:1330/1480 train_time:204619ms step_avg:155.01ms step:1331/1480 train_time:204780ms step_avg:155.02ms step:1332/1480 train_time:204943ms step_avg:155.02ms step:1333/1480 train_time:205110ms step_avg:155.03ms step:1334/1480 train_time:205274ms step_avg:155.04ms step:1335/1480 train_time:205436ms step_avg:155.05ms step:1336/1480 train_time:205605ms step_avg:155.06ms step:1337/1480 train_time:205774ms step_avg:155.07ms step:1338/1480 train_time:205937ms step_avg:155.07ms step:1339/1480 train_time:206100ms step_avg:155.08ms step:1340/1480 train_time:206265ms step_avg:155.09ms step:1341/1480 train_time:206426ms step_avg:155.09ms step:1342/1480 train_time:206593ms step_avg:155.10ms step:1343/1480 train_time:206756ms step_avg:155.11ms step:1344/1480 train_time:206918ms step_avg:155.11ms step:1345/1480 train_time:207088ms step_avg:155.12ms step:1346/1480 train_time:207251ms step_avg:155.13ms step:1347/1480 train_time:207415ms step_avg:155.13ms step:1348/1480 train_time:207578ms step_avg:155.14ms step:1349/1480 train_time:207741ms step_avg:155.15ms step:1350/1480 train_time:207907ms step_avg:155.15ms step:1351/1480 train_time:208069ms step_avg:155.16ms step:1352/1480 train_time:208233ms step_avg:155.17ms step:1353/1480 train_time:208398ms step_avg:155.17ms step:1354/1480 train_time:208564ms step_avg:155.18ms step:1355/1480 train_time:208725ms step_avg:155.19ms step:1356/1480 train_time:208889ms step_avg:155.19ms step:1357/1480 train_time:209055ms step_avg:155.20ms step:1358/1480 train_time:209218ms step_avg:155.21ms step:1359/1480 train_time:209382ms step_avg:155.21ms step:1360/1480 train_time:209549ms step_avg:155.22ms step:1361/1480 train_time:209717ms step_avg:155.23ms step:1362/1480 train_time:209882ms step_avg:155.24ms step:1363/1480 train_time:210049ms step_avg:155.25ms step:1364/1480 
train_time:210213ms step_avg:155.25ms step:1365/1480 train_time:210374ms step_avg:155.26ms step:1366/1480 train_time:210536ms step_avg:155.26ms step:1367/1480 train_time:210698ms step_avg:155.27ms step:1368/1480 train_time:210862ms step_avg:155.27ms step:1369/1480 train_time:211031ms step_avg:155.28ms step:1370/1480 train_time:211198ms step_avg:155.29ms step:1371/1480 train_time:211362ms step_avg:155.30ms step:1372/1480 train_time:211531ms step_avg:155.31ms step:1373/1480 train_time:211692ms step_avg:155.31ms step:1374/1480 train_time:211858ms step_avg:155.32ms step:1375/1480 train_time:212019ms step_avg:155.33ms step:1375/1480 val_loss:3.2996 train_time:212093ms step_avg:155.38ms step:1376/1480 train_time:212185ms step_avg:155.33ms step:1377/1480 train_time:212350ms step_avg:155.34ms step:1378/1480 train_time:212512ms step_avg:155.35ms step:1379/1480 train_time:212677ms step_avg:155.35ms step:1380/1480 train_time:212841ms step_avg:155.36ms step:1381/1480 train_time:213009ms step_avg:155.37ms step:1382/1480 train_time:213173ms step_avg:155.37ms step:1383/1480 train_time:213337ms step_avg:155.38ms step:1384/1480 train_time:213505ms step_avg:155.39ms step:1385/1480 train_time:213666ms step_avg:155.39ms step:1386/1480 train_time:213828ms step_avg:155.40ms step:1387/1480 train_time:213992ms step_avg:155.40ms step:1388/1480 train_time:214155ms step_avg:155.41ms step:1389/1480 train_time:214321ms step_avg:155.42ms step:1390/1480 train_time:214483ms step_avg:155.42ms step:1391/1480 train_time:214647ms step_avg:155.43ms step:1392/1480 train_time:214810ms step_avg:155.43ms step:1393/1480 train_time:214972ms step_avg:155.44ms step:1394/1480 train_time:215135ms step_avg:155.44ms step:1395/1480 train_time:215296ms step_avg:155.45ms step:1396/1480 train_time:215459ms step_avg:155.45ms step:1397/1480 train_time:215621ms step_avg:155.46ms step:1398/1480 train_time:215782ms step_avg:155.46ms step:1399/1480 train_time:215943ms step_avg:155.47ms step:1400/1480 train_time:216110ms 
step_avg:155.47ms step:1401/1480 train_time:216271ms step_avg:155.48ms step:1402/1480 train_time:216432ms step_avg:155.48ms step:1403/1480 train_time:216597ms step_avg:155.49ms step:1404/1480 train_time:216761ms step_avg:155.50ms step:1405/1480 train_time:216927ms step_avg:155.50ms step:1406/1480 train_time:217092ms step_avg:155.51ms step:1407/1480 train_time:217252ms step_avg:155.51ms step:1408/1480 train_time:217413ms step_avg:155.52ms step:1409/1480 train_time:217586ms step_avg:155.53ms step:1410/1480 train_time:217748ms step_avg:155.53ms step:1411/1480 train_time:217911ms step_avg:155.54ms step:1412/1480 train_time:218074ms step_avg:155.54ms step:1413/1480 train_time:218239ms step_avg:155.55ms step:1414/1480 train_time:218402ms step_avg:155.56ms step:1415/1480 train_time:218567ms step_avg:155.56ms step:1416/1480 train_time:218740ms step_avg:155.58ms step:1417/1480 train_time:218906ms step_avg:155.58ms step:1418/1480 train_time:219070ms step_avg:155.59ms step:1419/1480 train_time:219235ms step_avg:155.60ms step:1420/1480 train_time:219399ms step_avg:155.60ms step:1421/1480 train_time:219565ms step_avg:155.61ms step:1422/1480 train_time:219730ms step_avg:155.62ms step:1423/1480 train_time:219891ms step_avg:155.62ms step:1424/1480 train_time:220057ms step_avg:155.63ms step:1425/1480 train_time:220226ms step_avg:155.64ms step:1426/1480 train_time:220391ms step_avg:155.64ms step:1427/1480 train_time:220555ms step_avg:155.65ms step:1428/1480 train_time:220717ms step_avg:155.65ms step:1429/1480 train_time:220877ms step_avg:155.66ms step:1430/1480 train_time:221042ms step_avg:155.66ms step:1431/1480 train_time:221208ms step_avg:155.67ms step:1432/1480 train_time:221375ms step_avg:155.68ms step:1433/1480 train_time:221546ms step_avg:155.69ms step:1434/1480 train_time:221714ms step_avg:155.70ms step:1435/1480 train_time:221880ms step_avg:155.71ms step:1436/1480 train_time:222046ms step_avg:155.71ms step:1437/1480 train_time:222208ms step_avg:155.72ms step:1438/1480 
train_time:222370ms step_avg:155.72ms step:1439/1480 train_time:222536ms step_avg:155.73ms step:1440/1480 train_time:222699ms step_avg:155.73ms step:1441/1480 train_time:222864ms step_avg:155.74ms step:1442/1480 train_time:223030ms step_avg:155.75ms step:1443/1480 train_time:223205ms step_avg:155.76ms step:1444/1480 train_time:223370ms step_avg:155.77ms step:1445/1480 train_time:223531ms step_avg:155.77ms step:1446/1480 train_time:223698ms step_avg:155.78ms step:1447/1480 train_time:223867ms step_avg:155.79ms step:1448/1480 train_time:224029ms step_avg:155.79ms step:1449/1480 train_time:224192ms step_avg:155.80ms step:1450/1480 train_time:224355ms step_avg:155.80ms step:1451/1480 train_time:224516ms step_avg:155.81ms step:1452/1480 train_time:224684ms step_avg:155.81ms step:1453/1480 train_time:224847ms step_avg:155.82ms step:1454/1480 train_time:225010ms step_avg:155.82ms step:1455/1480 train_time:225177ms step_avg:155.83ms step:1456/1480 train_time:225341ms step_avg:155.84ms step:1457/1480 train_time:225505ms step_avg:155.84ms step:1458/1480 train_time:225668ms step_avg:155.85ms step:1459/1480 train_time:225833ms step_avg:155.85ms step:1460/1480 train_time:225995ms step_avg:155.86ms step:1461/1480 train_time:226159ms step_avg:155.86ms step:1462/1480 train_time:226324ms step_avg:155.87ms step:1463/1480 train_time:226491ms step_avg:155.88ms step:1464/1480 train_time:226654ms step_avg:155.88ms step:1465/1480 train_time:226818ms step_avg:155.89ms step:1466/1480 train_time:226984ms step_avg:155.90ms step:1467/1480 train_time:227149ms step_avg:155.90ms step:1468/1480 train_time:227312ms step_avg:155.91ms step:1469/1480 train_time:227478ms step_avg:155.91ms step:1470/1480 train_time:227649ms step_avg:155.92ms step:1471/1480 train_time:227819ms step_avg:155.93ms step:1472/1480 train_time:227990ms step_avg:155.94ms step:1473/1480 train_time:228152ms step_avg:155.95ms step:1474/1480 train_time:228318ms step_avg:155.96ms step:1475/1480 train_time:228489ms step_avg:155.97ms 
step:1476/1480 train_time:228652ms step_avg:155.97ms step:1477/1480 train_time:228821ms step_avg:155.98ms step:1478/1480 train_time:228990ms step_avg:155.99ms step:1479/1480 train_time:229155ms step_avg:155.99ms step:1480/1480 train_time:229317ms step_avg:156.00ms step:1480/1480 val_loss:3.2808 train_time:229395ms step_avg:156.05ms peak memory consumption: 34239 MiB