import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor. It is never modified.
        steps: Number of Newton-Schulz iterations to run.
        eps: Added to the norm before dividing, so an all-zero G does not divide by zero.

    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # Out-of-place on purpose: `G.bfloat16()` returns G itself when G is already bfloat16, so an
    # in-place `X /= ...` would silently rescale the caller's tensor (e.g. a momentum buffer).
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    # Work on the wide orientation so that X @ X.T is the smaller of the two Gram matrices.
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.

    The constructor reads the WORLD_SIZE and RANK environment variables, and `step()` issues
    `dist.all_gather` calls, so a process group must be initialised before stepping.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One param group per distinct element count, so every parameter of a group fits the
        # group's flat bfloat16 gather buffers (one buffer per rank).
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one update to every parameter.

        Within a group, parameters are handled in chunks of `world_size`: this rank
        orthogonalizes the update of the parameter at offset `rank` in the chunk, and an
        asynchronous all_gather collects every rank's update. The gathered updates of one chunk
        are applied only after the next chunk's update has been computed.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # Reads `handle` / `params_world` from the enclosing scope at call time, i.e. it
                # always refers to the most recently launched all_gather.
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # tall matrices get a step scaled up by sqrt(rows / cols)
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(0, len(params), self.world_size):
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # The gather buffers are reused, so the previous chunk must be consumed before
                # the next all_gather overwrites them.
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding over the last dimension of a (B, T, heads, head_dim) tensor.

    The cos/sin tables are cached and recomputed only when the sequence length changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention using FlexAttention, with QK-norm, rotary embeddings and a learned
    mix (`lambdas`) between the layer's own values and an external value embedding `vi`.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    Transformer block: learned mix of the residual stream `x` with the initial embedding `x0`,
    followed by pre-norm attention and pre-norm MLP residual updates.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """
    Six token-embedding tables; `forward` returns their outputs followed by the same outputs in
    reverse order (12 tensors in total), which GPT.forward splits between encoder and decoder.
    """

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """
    GPT model with a U-net style layout: the first half of the blocks ("encoder") store their
    outputs, which are added back with learned weights in front of the second half ("decoder").
    `forward` returns the cross-entropy loss directly.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the training loss for one flat token sequence.

        Arguments:
            inputs: 1D tensor of token ids; its length must be a multiple of 128.
            targets: Token ids to predict, flattened for the cross-entropy.
            sliding_window_num_blocks: Scalar tensor; attention window size in 128-token blocks.

        Returns:
            The scalar cross-entropy loss.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # Derived from the input instead of the former hard-coded 512 (= 64*1024 / 128), so the
        # block-level masks always match the shape of docs_low / docs_high below.
        total_blocks = inputs.size(0) // BLOCK_SIZE
        # Document id per token: running count of token 50256
        # (presumably the GPT-2 end-of-text id - TODO confirm).
        docs = (inputs == 50256).cumsum(0)
        # First / last document id inside every 128-token block.
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # Token-level mask: attend only backwards and only within the same document.
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # Per row: number of set entries, and column indices ordered so set entries come first.
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_blocks, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # Blocks in nonzero_bm but not in full_bm are the partial ones that still need the
            # token-level mask_mod.
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256-int32 header into pinned memory."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Cycles through the shards matching `filename_pattern` (relative to the working directory).

    Each process reads a window of `seq_len + 1` tokens starting at `process_rank * seq_len`
    and advances by `seq_len * num_processes` tokens per batch.
    """

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return `(inputs, targets)` on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        # NOTE(review): current_position includes the per-rank offset, so ranks can cross this
        # threshold on different steps and move to the next shard at different times - confirm
        # this is acceptable before relying on ranks reading aligned windows.
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    logdir = Path("logs") / f"{run_id}"
    # parents=True: on a fresh checkout "logs/" itself may not exist yet, and a plain mkdir
    # would raise FileNotFoundError before anything has been logged.
    logdir.mkdir(parents=True, exist_ok=True)
    logfile = Path("logs") / f"{run_id}.txt"
    print(logfile.stem)
    # create the log file
    with logfile.open("w") as f:
        # begin the log by printing this file (the Python code)
        print(code, file=f)
        print("=" * 100, file=f)
def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, also print it."""
    if master_process:
        with logfile.open("a") as f:
            if not logonly:
                print(s)
            print(s, file=f)
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running python {sys.version}")
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# calculate the number of steps to take in the val loop.
assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0
val_steps = args.val_tokens // (args.sequence_length * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
inputs_train, targets_train = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768))
model = model.cuda().bfloat16()
# keep the CastedLinear weights in float32; they are cast to the activation dtype in forward
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()]
optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.blocks.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]

# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the learning-rate multiplier for iteration `it` (warmup, constant, then cooldown)."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size lives in a device tensor that is updated in place whenever it changes
sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda")
sw_num_blocks_prev = 1
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social
    frac_done = step / args.num_iterations # training progress
    sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128)
    if sw_num_blocks != sw_num_blocks_prev:
        sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True)
        sw_num_blocks_prev = sw_num_blocks

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                inputs_val, targets_val = val_loader.next_batch()
                val_loss += model(inputs_val, targets_val, sliding_window_num_blocks)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for i in range(1, train_accumulation_steps + 1):
        with contextlib.ExitStack() as stack:
            if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step
                stack.enter_context(model.no_sync())
            if step >= 5:
                stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True))
            model(inputs_train, targets_train, sliding_window_num_blocks).backward()
            inputs_train, targets_train = train_loader.next_batch()
    if train_accumulation_steps != 1:
        for p in model.parameters():
            p.grad /= train_accumulation_steps
    # momentum warmup for Muon
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 08:55:58 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 125W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 28C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 117W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 35C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:28943ms step_avg:nanms step:2/1480 train_time:29131ms step_avg:nanms step:3/1480 train_time:29254ms step_avg:nanms step:4/1480 train_time:29396ms step_avg:nanms step:5/1480 train_time:29537ms step_avg:nanms step:6/1480 train_time:29678ms step_avg:nanms step:7/1480 train_time:29819ms step_avg:nanms step:8/1480 train_time:29962ms step_avg:nanms step:9/1480 train_time:30105ms step_avg:nanms step:10/1480 train_time:30253ms step_avg:nanms step:11/1480 train_time:141ms step_avg:nanms step:12/1480 train_time:282ms step_avg:nanms step:13/1480 train_time:423ms step_avg:141.08ms step:14/1480 train_time:564ms step_avg:141.12ms step:15/1480 train_time:706ms step_avg:141.14ms step:16/1480 train_time:851ms step_avg:141.77ms step:17/1480 train_time:996ms step_avg:142.32ms step:18/1480 train_time:1141ms step_avg:142.58ms step:19/1480 train_time:1284ms step_avg:142.64ms step:20/1480 train_time:1425ms step_avg:142.54ms step:21/1480 train_time:1568ms step_avg:142.53ms step:22/1480 train_time:1710ms step_avg:142.48ms step:23/1480 train_time:1854ms step_avg:142.58ms step:24/1480 train_time:1998ms step_avg:142.68ms step:25/1480 train_time:2140ms step_avg:142.65ms step:26/1480 train_time:2282ms step_avg:142.62ms step:27/1480 train_time:2423ms step_avg:142.53ms step:28/1480 train_time:2565ms step_avg:142.49ms 
step:29/1480 train_time:2708ms step_avg:142.51ms step:30/1480 train_time:3234ms step_avg:161.72ms step:31/1480 train_time:3336ms step_avg:158.86ms step:32/1480 train_time:3479ms step_avg:158.15ms step:33/1480 train_time:3622ms step_avg:157.47ms step:34/1480 train_time:3762ms step_avg:156.77ms step:35/1480 train_time:3904ms step_avg:156.16ms step:36/1480 train_time:4046ms step_avg:155.63ms step:37/1480 train_time:4191ms step_avg:155.23ms step:38/1480 train_time:4337ms step_avg:154.88ms step:39/1480 train_time:4481ms step_avg:154.51ms step:40/1480 train_time:4623ms step_avg:154.10ms step:41/1480 train_time:4765ms step_avg:153.72ms step:42/1480 train_time:4908ms step_avg:153.38ms step:43/1480 train_time:5050ms step_avg:153.03ms step:44/1480 train_time:5195ms step_avg:152.79ms step:45/1480 train_time:5339ms step_avg:152.53ms step:46/1480 train_time:5482ms step_avg:152.27ms step:47/1480 train_time:5624ms step_avg:151.99ms step:48/1480 train_time:5764ms step_avg:151.68ms step:49/1480 train_time:5908ms step_avg:151.48ms step:50/1480 train_time:6050ms step_avg:151.24ms step:51/1480 train_time:6193ms step_avg:151.04ms step:52/1480 train_time:6336ms step_avg:150.85ms step:53/1480 train_time:6481ms step_avg:150.71ms step:54/1480 train_time:6623ms step_avg:150.52ms step:55/1480 train_time:6764ms step_avg:150.31ms step:56/1480 train_time:6907ms step_avg:150.14ms step:57/1480 train_time:7050ms step_avg:149.99ms step:58/1480 train_time:7194ms step_avg:149.87ms step:59/1480 train_time:7336ms step_avg:149.72ms step:60/1480 train_time:7478ms step_avg:149.56ms step:61/1480 train_time:7620ms step_avg:149.42ms step:62/1480 train_time:7763ms step_avg:149.28ms step:63/1480 train_time:7904ms step_avg:149.14ms step:64/1480 train_time:8046ms step_avg:149.00ms step:65/1480 train_time:8190ms step_avg:148.91ms step:66/1480 train_time:8333ms step_avg:148.81ms step:67/1480 train_time:8477ms step_avg:148.71ms step:68/1480 train_time:8621ms step_avg:148.64ms step:69/1480 train_time:8762ms 
step_avg:148.51ms step:70/1480 train_time:8904ms step_avg:148.41ms step:71/1480 train_time:9046ms step_avg:148.30ms step:72/1480 train_time:9189ms step_avg:148.21ms step:73/1480 train_time:9332ms step_avg:148.13ms step:74/1480 train_time:9477ms step_avg:148.07ms step:75/1480 train_time:9621ms step_avg:148.01ms step:76/1480 train_time:9763ms step_avg:147.92ms step:77/1480 train_time:9905ms step_avg:147.83ms step:78/1480 train_time:10048ms step_avg:147.76ms step:79/1480 train_time:10190ms step_avg:147.68ms step:80/1480 train_time:10332ms step_avg:147.60ms step:81/1480 train_time:10854ms step_avg:152.88ms step:82/1480 train_time:10953ms step_avg:152.12ms step:83/1480 train_time:11097ms step_avg:152.01ms step:84/1480 train_time:11239ms step_avg:151.88ms step:85/1480 train_time:11380ms step_avg:151.73ms step:86/1480 train_time:11523ms step_avg:151.61ms step:87/1480 train_time:11663ms step_avg:151.47ms step:88/1480 train_time:11807ms step_avg:151.37ms step:89/1480 train_time:11952ms step_avg:151.29ms step:90/1480 train_time:12096ms step_avg:151.20ms step:91/1480 train_time:12239ms step_avg:151.10ms step:92/1480 train_time:12382ms step_avg:151.00ms step:93/1480 train_time:12525ms step_avg:150.90ms step:94/1480 train_time:12667ms step_avg:150.79ms step:95/1480 train_time:12809ms step_avg:150.69ms step:96/1480 train_time:12955ms step_avg:150.64ms step:97/1480 train_time:13487ms step_avg:155.03ms step:98/1480 train_time:13590ms step_avg:154.43ms step:99/1480 train_time:13732ms step_avg:154.29ms step:100/1480 train_time:13875ms step_avg:154.17ms step:101/1480 train_time:14020ms step_avg:154.06ms step:102/1480 train_time:14161ms step_avg:153.93ms step:103/1480 train_time:14303ms step_avg:153.80ms step:104/1480 train_time:14445ms step_avg:153.67ms step:105/1480 train_time:14589ms step_avg:153.56ms step:106/1480 train_time:14732ms step_avg:153.45ms step:107/1480 train_time:14874ms step_avg:153.34ms step:108/1480 train_time:15018ms step_avg:153.24ms step:109/1480 
train_time:15160ms step_avg:153.13ms step:110/1480 train_time:15303ms step_avg:153.03ms step:111/1480 train_time:15446ms step_avg:152.93ms step:112/1480 train_time:15592ms step_avg:152.86ms step:113/1480 train_time:15738ms step_avg:152.80ms step:114/1480 train_time:15883ms step_avg:152.72ms step:115/1480 train_time:16029ms step_avg:152.66ms step:116/1480 train_time:16175ms step_avg:152.60ms step:117/1480 train_time:16321ms step_avg:152.54ms step:118/1480 train_time:16467ms step_avg:152.47ms step:119/1480 train_time:16613ms step_avg:152.41ms step:120/1480 train_time:16760ms step_avg:152.36ms step:121/1480 train_time:16905ms step_avg:152.29ms step:122/1480 train_time:17050ms step_avg:152.23ms step:123/1480 train_time:17196ms step_avg:152.18ms step:124/1480 train_time:17342ms step_avg:152.12ms step:125/1480 train_time:17487ms step_avg:152.06ms step:125/1480 val_loss:4.4080 train_time:17553ms step_avg:152.63ms step:126/1480 train_time:17645ms step_avg:152.12ms step:127/1480 train_time:17789ms step_avg:152.04ms step:128/1480 train_time:17936ms step_avg:152.00ms step:129/1480 train_time:18081ms step_avg:151.94ms step:130/1480 train_time:18227ms step_avg:151.89ms step:131/1480 train_time:18372ms step_avg:151.84ms step:132/1480 train_time:18516ms step_avg:151.77ms step:133/1480 train_time:18662ms step_avg:151.72ms step:134/1480 train_time:18810ms step_avg:151.70ms step:135/1480 train_time:18955ms step_avg:151.64ms step:136/1480 train_time:19101ms step_avg:151.59ms step:137/1480 train_time:19248ms step_avg:151.56ms step:138/1480 train_time:19392ms step_avg:151.50ms step:139/1480 train_time:19538ms step_avg:151.45ms step:140/1480 train_time:19685ms step_avg:151.42ms step:141/1480 train_time:19831ms step_avg:151.38ms step:142/1480 train_time:19975ms step_avg:151.32ms step:143/1480 train_time:20120ms step_avg:151.28ms step:144/1480 train_time:20268ms step_avg:151.25ms step:145/1480 train_time:20413ms step_avg:151.21ms step:146/1480 train_time:20559ms step_avg:151.17ms 
step:147/1480 train_time:20706ms step_avg:151.14ms step:148/1480 train_time:20852ms step_avg:151.10ms step:149/1480 train_time:20996ms step_avg:151.05ms step:150/1480 train_time:21142ms step_avg:151.01ms step:151/1480 train_time:21288ms step_avg:150.98ms step:152/1480 train_time:21433ms step_avg:150.94ms step:153/1480 train_time:21579ms step_avg:150.90ms step:154/1480 train_time:21726ms step_avg:150.87ms step:155/1480 train_time:21872ms step_avg:150.84ms step:156/1480 train_time:22017ms step_avg:150.80ms step:157/1480 train_time:22162ms step_avg:150.76ms step:158/1480 train_time:22309ms step_avg:150.73ms step:159/1480 train_time:22453ms step_avg:150.69ms step:160/1480 train_time:22598ms step_avg:150.65ms step:161/1480 train_time:22745ms step_avg:150.63ms step:162/1480 train_time:22891ms step_avg:150.60ms step:163/1480 train_time:23037ms step_avg:150.57ms step:164/1480 train_time:23183ms step_avg:150.54ms step:165/1480 train_time:23330ms step_avg:150.52ms step:166/1480 train_time:23474ms step_avg:150.48ms step:167/1480 train_time:23620ms step_avg:150.45ms step:168/1480 train_time:23766ms step_avg:150.42ms step:169/1480 train_time:23912ms step_avg:150.39ms step:170/1480 train_time:24057ms step_avg:150.36ms step:171/1480 train_time:24203ms step_avg:150.33ms step:172/1480 train_time:24351ms step_avg:150.31ms step:173/1480 train_time:24495ms step_avg:150.27ms step:174/1480 train_time:24641ms step_avg:150.25ms step:175/1480 train_time:24787ms step_avg:150.22ms step:176/1480 train_time:24932ms step_avg:150.19ms step:177/1480 train_time:25077ms step_avg:150.16ms step:178/1480 train_time:25225ms step_avg:150.15ms step:179/1480 train_time:25372ms step_avg:150.13ms step:180/1480 train_time:25518ms step_avg:150.10ms step:181/1480 train_time:25664ms step_avg:150.08ms step:182/1480 train_time:25811ms step_avg:150.06ms step:183/1480 train_time:25955ms step_avg:150.03ms step:184/1480 train_time:26100ms step_avg:150.00ms step:185/1480 train_time:26247ms step_avg:149.98ms 
step:186/1480 train_time:26392ms step_avg:149.95ms step:187/1480 train_time:26538ms step_avg:149.93ms step:188/1480 train_time:26685ms step_avg:149.91ms step:189/1480 train_time:26851ms step_avg:150.01ms step:190/1480 train_time:26976ms step_avg:149.87ms step:191/1480 train_time:27121ms step_avg:149.84ms step:192/1480 train_time:27268ms step_avg:149.82ms step:193/1480 train_time:27413ms step_avg:149.80ms step:194/1480 train_time:27558ms step_avg:149.77ms step:195/1480 train_time:27705ms step_avg:149.76ms step:196/1480 train_time:27852ms step_avg:149.74ms step:197/1480 train_time:27997ms step_avg:149.71ms step:198/1480 train_time:28143ms step_avg:149.70ms step:199/1480 train_time:28289ms step_avg:149.68ms step:200/1480 train_time:28434ms step_avg:149.65ms step:201/1480 train_time:28586ms step_avg:149.66ms step:202/1480 train_time:28727ms step_avg:149.62ms step:203/1480 train_time:28873ms step_avg:149.60ms step:204/1480 train_time:29019ms step_avg:149.58ms step:205/1480 train_time:29165ms step_avg:149.57ms step:206/1480 train_time:29311ms step_avg:149.55ms step:207/1480 train_time:29456ms step_avg:149.52ms step:208/1480 train_time:29602ms step_avg:149.50ms step:209/1480 train_time:29749ms step_avg:149.49ms step:210/1480 train_time:29894ms step_avg:149.47ms step:211/1480 train_time:30040ms step_avg:149.45ms step:212/1480 train_time:30186ms step_avg:149.43ms step:213/1480 train_time:30332ms step_avg:149.42ms step:214/1480 train_time:30477ms step_avg:149.40ms step:215/1480 train_time:30625ms step_avg:149.39ms step:216/1480 train_time:30771ms step_avg:149.37ms step:217/1480 train_time:30918ms step_avg:149.36ms step:218/1480 train_time:31064ms step_avg:149.35ms step:219/1480 train_time:31211ms step_avg:149.33ms step:220/1480 train_time:31355ms step_avg:149.31ms step:221/1480 train_time:31903ms step_avg:151.20ms step:222/1480 train_time:32011ms step_avg:151.00ms step:223/1480 train_time:32160ms step_avg:150.99ms step:224/1480 train_time:32310ms step_avg:150.98ms 
step:225/1480 train_time:32457ms step_avg:150.96ms step:226/1480 train_time:32605ms step_avg:150.95ms step:227/1480 train_time:32753ms step_avg:150.94ms step:228/1480 train_time:32903ms step_avg:150.93ms step:229/1480 train_time:33052ms step_avg:150.92ms step:230/1480 train_time:33199ms step_avg:150.90ms step:231/1480 train_time:33349ms step_avg:150.90ms step:232/1480 train_time:33496ms step_avg:150.88ms step:233/1480 train_time:33644ms step_avg:150.87ms step:234/1480 train_time:33792ms step_avg:150.86ms step:235/1480 train_time:33941ms step_avg:150.85ms step:236/1480 train_time:34089ms step_avg:150.84ms step:237/1480 train_time:34238ms step_avg:150.83ms step:238/1480 train_time:34387ms step_avg:150.82ms step:239/1480 train_time:34534ms step_avg:150.80ms step:240/1480 train_time:34683ms step_avg:150.79ms step:241/1480 train_time:34831ms step_avg:150.78ms step:242/1480 train_time:34979ms step_avg:150.77ms step:243/1480 train_time:35129ms step_avg:150.77ms step:244/1480 train_time:35276ms step_avg:150.75ms step:245/1480 train_time:35425ms step_avg:150.74ms step:246/1480 train_time:35573ms step_avg:150.73ms step:247/1480 train_time:35720ms step_avg:150.72ms step:248/1480 train_time:35870ms step_avg:150.71ms step:249/1480 train_time:36018ms step_avg:150.70ms step:250/1480 train_time:36167ms step_avg:150.70ms step:250/1480 val_loss:3.9850 train_time:36233ms step_avg:150.97ms step:251/1480 train_time:36324ms step_avg:150.72ms step:252/1480 train_time:36471ms step_avg:150.71ms step:253/1480 train_time:36620ms step_avg:150.70ms step:254/1480 train_time:36767ms step_avg:150.69ms step:255/1480 train_time:36915ms step_avg:150.67ms step:256/1480 train_time:37064ms step_avg:150.66ms step:257/1480 train_time:37210ms step_avg:150.65ms step:258/1480 train_time:37359ms step_avg:150.64ms step:259/1480 train_time:37507ms step_avg:150.63ms step:260/1480 train_time:37656ms step_avg:150.63ms step:261/1480 train_time:37805ms step_avg:150.62ms step:262/1480 train_time:37953ms 
step_avg:150.61ms step:263/1480 train_time:38102ms step_avg:150.60ms step:264/1480 train_time:38250ms step_avg:150.59ms step:265/1480 train_time:38399ms step_avg:150.58ms step:266/1480 train_time:38546ms step_avg:150.57ms step:267/1480 train_time:38695ms step_avg:150.57ms step:268/1480 train_time:38844ms step_avg:150.56ms step:269/1480 train_time:38991ms step_avg:150.54ms step:270/1480 train_time:39140ms step_avg:150.54ms step:271/1480 train_time:39287ms step_avg:150.52ms step:272/1480 train_time:39436ms step_avg:150.52ms step:273/1480 train_time:39585ms step_avg:150.51ms step:274/1480 train_time:39732ms step_avg:150.50ms step:275/1480 train_time:39882ms step_avg:150.50ms step:276/1480 train_time:40028ms step_avg:150.48ms step:277/1480 train_time:40178ms step_avg:150.48ms step:278/1480 train_time:40326ms step_avg:150.47ms step:279/1480 train_time:40474ms step_avg:150.46ms step:280/1480 train_time:40624ms step_avg:150.46ms step:281/1480 train_time:40772ms step_avg:150.45ms step:282/1480 train_time:40922ms step_avg:150.45ms step:283/1480 train_time:41070ms step_avg:150.44ms step:284/1480 train_time:41219ms step_avg:150.44ms step:285/1480 train_time:41367ms step_avg:150.43ms step:286/1480 train_time:41515ms step_avg:150.42ms step:287/1480 train_time:41664ms step_avg:150.41ms step:288/1480 train_time:41812ms step_avg:150.40ms step:289/1480 train_time:41961ms step_avg:150.40ms step:290/1480 train_time:42108ms step_avg:150.39ms step:291/1480 train_time:42256ms step_avg:150.38ms step:292/1480 train_time:42405ms step_avg:150.37ms step:293/1480 train_time:42552ms step_avg:150.36ms step:294/1480 train_time:42702ms step_avg:150.36ms step:295/1480 train_time:42850ms step_avg:150.35ms step:296/1480 train_time:43000ms step_avg:150.35ms step:297/1480 train_time:43147ms step_avg:150.34ms step:298/1480 train_time:43296ms step_avg:150.33ms step:299/1480 train_time:43445ms step_avg:150.33ms step:300/1480 train_time:43593ms step_avg:150.32ms step:301/1480 train_time:43743ms 
step_avg:150.32ms step:302/1480 train_time:43890ms step_avg:150.31ms step:303/1480 train_time:44040ms step_avg:150.31ms step:304/1480 train_time:44188ms step_avg:150.30ms step:305/1480 train_time:44337ms step_avg:150.30ms step:306/1480 train_time:44487ms step_avg:150.29ms step:307/1480 train_time:44635ms step_avg:150.29ms step:308/1480 train_time:44784ms step_avg:150.28ms step:309/1480 train_time:44932ms step_avg:150.27ms step:310/1480 train_time:45082ms step_avg:150.27ms step:311/1480 train_time:45229ms step_avg:150.26ms step:312/1480 train_time:45377ms step_avg:150.26ms step:313/1480 train_time:45526ms step_avg:150.25ms step:314/1480 train_time:45675ms step_avg:150.25ms step:315/1480 train_time:45824ms step_avg:150.24ms step:316/1480 train_time:45972ms step_avg:150.23ms step:317/1480 train_time:46121ms step_avg:150.23ms step:318/1480 train_time:46269ms step_avg:150.22ms step:319/1480 train_time:46417ms step_avg:150.22ms step:320/1480 train_time:46566ms step_avg:150.21ms step:321/1480 train_time:46714ms step_avg:150.21ms step:322/1480 train_time:46864ms step_avg:150.20ms step:323/1480 train_time:47011ms step_avg:150.19ms step:324/1480 train_time:47161ms step_avg:150.19ms step:325/1480 train_time:47308ms step_avg:150.18ms step:326/1480 train_time:47457ms step_avg:150.18ms step:327/1480 train_time:47605ms step_avg:150.17ms step:328/1480 train_time:47754ms step_avg:150.17ms step:329/1480 train_time:47903ms step_avg:150.17ms step:330/1480 train_time:48051ms step_avg:150.16ms step:331/1480 train_time:48202ms step_avg:150.16ms step:332/1480 train_time:48353ms step_avg:150.17ms step:333/1480 train_time:48505ms step_avg:150.17ms step:334/1480 train_time:48654ms step_avg:150.17ms step:335/1480 train_time:48805ms step_avg:150.17ms step:336/1480 train_time:48956ms step_avg:150.17ms step:337/1480 train_time:49106ms step_avg:150.17ms step:338/1480 train_time:49258ms step_avg:150.18ms step:339/1480 train_time:49408ms step_avg:150.18ms step:340/1480 train_time:49559ms 
step_avg:150.18ms step:341/1480 train_time:49709ms step_avg:150.18ms step:342/1480 train_time:49861ms step_avg:150.18ms step:343/1480 train_time:50011ms step_avg:150.18ms step:344/1480 train_time:50163ms step_avg:150.19ms step:345/1480 train_time:50314ms step_avg:150.19ms step:346/1480 train_time:50465ms step_avg:150.19ms step:347/1480 train_time:50616ms step_avg:150.20ms step:348/1480 train_time:50768ms step_avg:150.20ms step:349/1480 train_time:50919ms step_avg:150.20ms step:350/1480 train_time:51069ms step_avg:150.20ms step:351/1480 train_time:51220ms step_avg:150.21ms step:352/1480 train_time:51372ms step_avg:150.21ms step:353/1480 train_time:51522ms step_avg:150.21ms step:354/1480 train_time:51672ms step_avg:150.21ms step:355/1480 train_time:51823ms step_avg:150.21ms step:356/1480 train_time:51973ms step_avg:150.21ms step:357/1480 train_time:52125ms step_avg:150.22ms step:358/1480 train_time:52276ms step_avg:150.22ms step:359/1480 train_time:52428ms step_avg:150.22ms step:360/1480 train_time:52579ms step_avg:150.23ms step:361/1480 train_time:52730ms step_avg:150.23ms step:362/1480 train_time:52881ms step_avg:150.23ms step:363/1480 train_time:53030ms step_avg:150.23ms step:364/1480 train_time:53182ms step_avg:150.23ms step:365/1480 train_time:53332ms step_avg:150.23ms step:366/1480 train_time:53483ms step_avg:150.23ms step:367/1480 train_time:53633ms step_avg:150.23ms step:368/1480 train_time:53785ms step_avg:150.24ms step:369/1480 train_time:53934ms step_avg:150.23ms step:370/1480 train_time:54085ms step_avg:150.24ms step:371/1480 train_time:54235ms step_avg:150.24ms step:372/1480 train_time:54387ms step_avg:150.24ms step:373/1480 train_time:54538ms step_avg:150.24ms step:374/1480 train_time:54688ms step_avg:150.24ms step:375/1480 train_time:54840ms step_avg:150.25ms step:375/1480 val_loss:3.8032 train_time:54907ms step_avg:150.43ms step:376/1480 train_time:55000ms step_avg:150.27ms step:377/1480 train_time:55148ms step_avg:150.27ms step:378/1480 
train_time:55300ms step_avg:150.27ms step:379/1480 train_time:55467ms step_avg:150.32ms step:380/1480 train_time:55601ms step_avg:150.27ms step:381/1480 train_time:55750ms step_avg:150.27ms step:382/1480 train_time:55902ms step_avg:150.27ms step:383/1480 train_time:56052ms step_avg:150.27ms step:384/1480 train_time:56204ms step_avg:150.28ms step:385/1480 train_time:56356ms step_avg:150.28ms step:386/1480 train_time:56506ms step_avg:150.28ms step:387/1480 train_time:56658ms step_avg:150.29ms step:388/1480 train_time:56808ms step_avg:150.29ms step:389/1480 train_time:56959ms step_avg:150.29ms step:390/1480 train_time:57109ms step_avg:150.29ms step:391/1480 train_time:57262ms step_avg:150.29ms step:392/1480 train_time:57412ms step_avg:150.29ms step:393/1480 train_time:57563ms step_avg:150.30ms step:394/1480 train_time:57715ms step_avg:150.30ms step:395/1480 train_time:57866ms step_avg:150.30ms step:396/1480 train_time:58017ms step_avg:150.30ms step:397/1480 train_time:58167ms step_avg:150.30ms step:398/1480 train_time:58319ms step_avg:150.31ms step:399/1480 train_time:58470ms step_avg:150.31ms step:400/1480 train_time:58622ms step_avg:150.31ms step:401/1480 train_time:58772ms step_avg:150.31ms step:402/1480 train_time:58923ms step_avg:150.31ms step:403/1480 train_time:59074ms step_avg:150.32ms step:404/1480 train_time:59225ms step_avg:150.32ms step:405/1480 train_time:59376ms step_avg:150.32ms step:406/1480 train_time:59527ms step_avg:150.32ms step:407/1480 train_time:59679ms step_avg:150.33ms step:408/1480 train_time:59829ms step_avg:150.32ms step:409/1480 train_time:59981ms step_avg:150.33ms step:410/1480 train_time:60130ms step_avg:150.33ms step:411/1480 train_time:60281ms step_avg:150.33ms step:412/1480 train_time:60432ms step_avg:150.33ms step:413/1480 train_time:60583ms step_avg:150.33ms step:414/1480 train_time:60735ms step_avg:150.33ms step:415/1480 train_time:60886ms step_avg:150.34ms step:416/1480 train_time:61038ms step_avg:150.34ms step:417/1480 
train_time:61188ms step_avg:150.34ms step:418/1480 train_time:61339ms step_avg:150.34ms step:419/1480 train_time:61492ms step_avg:150.35ms step:420/1480 train_time:61644ms step_avg:150.35ms step:421/1480 train_time:61795ms step_avg:150.35ms step:422/1480 train_time:61945ms step_avg:150.35ms step:423/1480 train_time:62098ms step_avg:150.36ms step:424/1480 train_time:62248ms step_avg:150.36ms step:425/1480 train_time:62399ms step_avg:150.36ms step:426/1480 train_time:62548ms step_avg:150.36ms step:427/1480 train_time:62700ms step_avg:150.36ms step:428/1480 train_time:62850ms step_avg:150.36ms step:429/1480 train_time:63002ms step_avg:150.36ms step:430/1480 train_time:63151ms step_avg:150.36ms step:431/1480 train_time:63303ms step_avg:150.36ms step:432/1480 train_time:63453ms step_avg:150.36ms step:433/1480 train_time:63603ms step_avg:150.36ms step:434/1480 train_time:63754ms step_avg:150.36ms step:435/1480 train_time:63905ms step_avg:150.36ms step:436/1480 train_time:64056ms step_avg:150.37ms step:437/1480 train_time:64206ms step_avg:150.36ms step:438/1480 train_time:64357ms step_avg:150.37ms step:439/1480 train_time:64508ms step_avg:150.37ms step:440/1480 train_time:64660ms step_avg:150.37ms step:441/1480 train_time:64812ms step_avg:150.38ms step:442/1480 train_time:64966ms step_avg:150.38ms step:443/1480 train_time:65119ms step_avg:150.39ms step:444/1480 train_time:65271ms step_avg:150.39ms step:445/1480 train_time:65424ms step_avg:150.40ms step:446/1480 train_time:65576ms step_avg:150.40ms step:447/1480 train_time:65729ms step_avg:150.41ms step:448/1480 train_time:65882ms step_avg:150.42ms step:449/1480 train_time:66035ms step_avg:150.42ms step:450/1480 train_time:66187ms step_avg:150.43ms step:451/1480 train_time:66340ms step_avg:150.43ms step:452/1480 train_time:66493ms step_avg:150.44ms step:453/1480 train_time:66646ms step_avg:150.44ms step:454/1480 train_time:66800ms step_avg:150.45ms step:455/1480 train_time:66953ms step_avg:150.46ms step:456/1480 
train_time:67105ms step_avg:150.46ms step:457/1480 train_time:67258ms step_avg:150.46ms step:458/1480 train_time:67409ms step_avg:150.47ms step:459/1480 train_time:67563ms step_avg:150.47ms step:460/1480 train_time:67716ms step_avg:150.48ms step:461/1480 train_time:67869ms step_avg:150.49ms step:462/1480 train_time:68022ms step_avg:150.49ms step:463/1480 train_time:68176ms step_avg:150.50ms step:464/1480 train_time:68329ms step_avg:150.50ms step:465/1480 train_time:68482ms step_avg:150.51ms step:466/1480 train_time:68635ms step_avg:150.52ms step:467/1480 train_time:68788ms step_avg:150.52ms step:468/1480 train_time:68941ms step_avg:150.53ms step:469/1480 train_time:69096ms step_avg:150.53ms step:470/1480 train_time:69248ms step_avg:150.54ms step:471/1480 train_time:69401ms step_avg:150.54ms step:472/1480 train_time:69553ms step_avg:150.55ms step:473/1480 train_time:69705ms step_avg:150.55ms step:474/1480 train_time:69859ms step_avg:150.56ms step:475/1480 train_time:70012ms step_avg:150.56ms step:476/1480 train_time:70166ms step_avg:150.57ms step:477/1480 train_time:70320ms step_avg:150.58ms step:478/1480 train_time:70473ms step_avg:150.58ms step:479/1480 train_time:70625ms step_avg:150.59ms step:480/1480 train_time:70779ms step_avg:150.59ms step:481/1480 train_time:70932ms step_avg:150.60ms step:482/1480 train_time:71085ms step_avg:150.60ms step:483/1480 train_time:71237ms step_avg:150.61ms step:484/1480 train_time:71393ms step_avg:150.62ms step:485/1480 train_time:71546ms step_avg:150.62ms step:486/1480 train_time:71700ms step_avg:150.63ms step:487/1480 train_time:71851ms step_avg:150.63ms step:488/1480 train_time:72004ms step_avg:150.64ms step:489/1480 train_time:72157ms step_avg:150.64ms step:490/1480 train_time:72309ms step_avg:150.64ms step:491/1480 train_time:72463ms step_avg:150.65ms step:492/1480 train_time:72618ms step_avg:150.66ms step:493/1480 train_time:72770ms step_avg:150.66ms step:494/1480 train_time:72923ms step_avg:150.67ms step:495/1480 
train_time:73077ms step_avg:150.67ms step:496/1480 train_time:73231ms step_avg:150.68ms step:497/1480 train_time:73384ms step_avg:150.69ms step:498/1480 train_time:73537ms step_avg:150.69ms step:499/1480 train_time:73690ms step_avg:150.70ms step:500/1480 train_time:73843ms step_avg:150.70ms step:500/1480 val_loss:3.6824 train_time:73913ms step_avg:150.84ms step:501/1480 train_time:74011ms step_avg:150.73ms step:502/1480 train_time:74156ms step_avg:150.72ms step:503/1480 train_time:74308ms step_avg:150.73ms step:504/1480 train_time:74460ms step_avg:150.73ms step:505/1480 train_time:74612ms step_avg:150.73ms step:506/1480 train_time:74765ms step_avg:150.74ms step:507/1480 train_time:74917ms step_avg:150.74ms step:508/1480 train_time:75071ms step_avg:150.75ms step:509/1480 train_time:75226ms step_avg:150.75ms step:510/1480 train_time:75380ms step_avg:150.76ms step:511/1480 train_time:75533ms step_avg:150.76ms step:512/1480 train_time:75686ms step_avg:150.77ms step:513/1480 train_time:75837ms step_avg:150.77ms step:514/1480 train_time:75990ms step_avg:150.77ms step:515/1480 train_time:76142ms step_avg:150.78ms step:516/1480 train_time:76296ms step_avg:150.78ms step:517/1480 train_time:76449ms step_avg:150.79ms step:518/1480 train_time:76604ms step_avg:150.80ms step:519/1480 train_time:76757ms step_avg:150.80ms step:520/1480 train_time:76909ms step_avg:150.80ms step:521/1480 train_time:77063ms step_avg:150.81ms step:522/1480 train_time:77215ms step_avg:150.81ms step:523/1480 train_time:77368ms step_avg:150.82ms step:524/1480 train_time:77522ms step_avg:150.82ms step:525/1480 train_time:77675ms step_avg:150.82ms step:526/1480 train_time:77828ms step_avg:150.83ms step:527/1480 train_time:77981ms step_avg:150.83ms step:528/1480 train_time:78133ms step_avg:150.84ms step:529/1480 train_time:78286ms step_avg:150.84ms step:530/1480 train_time:78439ms step_avg:150.84ms step:531/1480 train_time:78592ms step_avg:150.85ms step:532/1480 train_time:78745ms step_avg:150.85ms 
step:533/1480 train_time:78899ms step_avg:150.86ms step:534/1480 train_time:79052ms step_avg:150.86ms step:535/1480 train_time:79205ms step_avg:150.87ms step:536/1480 train_time:79357ms step_avg:150.87ms step:537/1480 train_time:79510ms step_avg:150.87ms step:538/1480 train_time:79664ms step_avg:150.88ms step:539/1480 train_time:79816ms step_avg:150.88ms step:540/1480 train_time:79969ms step_avg:150.89ms step:541/1480 train_time:80123ms step_avg:150.89ms step:542/1480 train_time:80276ms step_avg:150.90ms step:543/1480 train_time:80428ms step_avg:150.90ms step:544/1480 train_time:80581ms step_avg:150.90ms step:545/1480 train_time:80734ms step_avg:150.90ms step:546/1480 train_time:80887ms step_avg:150.91ms step:547/1480 train_time:81040ms step_avg:150.91ms step:548/1480 train_time:81194ms step_avg:150.92ms step:549/1480 train_time:81347ms step_avg:150.92ms step:550/1480 train_time:81503ms step_avg:150.93ms step:551/1480 train_time:81657ms step_avg:150.94ms step:552/1480 train_time:81812ms step_avg:150.95ms step:553/1480 train_time:81968ms step_avg:150.95ms step:554/1480 train_time:82123ms step_avg:150.96ms step:555/1480 train_time:82277ms step_avg:150.97ms step:556/1480 train_time:82433ms step_avg:150.98ms step:557/1480 train_time:82588ms step_avg:150.98ms step:558/1480 train_time:82743ms step_avg:150.99ms step:559/1480 train_time:82897ms step_avg:151.00ms step:560/1480 train_time:83051ms step_avg:151.00ms step:561/1480 train_time:83206ms step_avg:151.01ms step:562/1480 train_time:83360ms step_avg:151.01ms step:563/1480 train_time:83513ms step_avg:151.02ms step:564/1480 train_time:83669ms step_avg:151.03ms step:565/1480 train_time:83824ms step_avg:151.03ms step:566/1480 train_time:83978ms step_avg:151.04ms step:567/1480 train_time:84134ms step_avg:151.05ms step:568/1480 train_time:84288ms step_avg:151.05ms step:569/1480 train_time:84459ms step_avg:151.09ms step:570/1480 train_time:84599ms step_avg:151.07ms step:571/1480 train_time:84753ms step_avg:151.08ms 
step:572/1480 train_time:84909ms step_avg:151.08ms step:573/1480 train_time:85063ms step_avg:151.09ms step:574/1480 train_time:85220ms step_avg:151.10ms step:575/1480 train_time:85375ms step_avg:151.11ms step:576/1480 train_time:85529ms step_avg:151.11ms step:577/1480 train_time:85684ms step_avg:151.12ms step:578/1480 train_time:85838ms step_avg:151.12ms step:579/1480 train_time:85994ms step_avg:151.13ms step:580/1480 train_time:86148ms step_avg:151.14ms step:581/1480 train_time:86303ms step_avg:151.14ms step:582/1480 train_time:86457ms step_avg:151.15ms step:583/1480 train_time:86610ms step_avg:151.15ms step:584/1480 train_time:86766ms step_avg:151.16ms step:585/1480 train_time:86921ms step_avg:151.17ms step:586/1480 train_time:87077ms step_avg:151.18ms step:587/1480 train_time:87233ms step_avg:151.18ms step:588/1480 train_time:87387ms step_avg:151.19ms step:589/1480 train_time:87541ms step_avg:151.19ms step:590/1480 train_time:87696ms step_avg:151.20ms step:591/1480 train_time:87850ms step_avg:151.21ms step:592/1480 train_time:88007ms step_avg:151.21ms step:593/1480 train_time:88162ms step_avg:151.22ms step:594/1480 train_time:88317ms step_avg:151.23ms step:595/1480 train_time:88474ms step_avg:151.24ms step:596/1480 train_time:88630ms step_avg:151.25ms step:597/1480 train_time:88784ms step_avg:151.25ms step:598/1480 train_time:88939ms step_avg:151.26ms step:599/1480 train_time:89093ms step_avg:151.26ms step:600/1480 train_time:89247ms step_avg:151.27ms step:601/1480 train_time:89403ms step_avg:151.27ms step:602/1480 train_time:89557ms step_avg:151.28ms step:603/1480 train_time:89711ms step_avg:151.28ms step:604/1480 train_time:89866ms step_avg:151.29ms step:605/1480 train_time:90021ms step_avg:151.30ms step:606/1480 train_time:90176ms step_avg:151.30ms step:607/1480 train_time:90332ms step_avg:151.31ms step:608/1480 train_time:90487ms step_avg:151.32ms step:609/1480 train_time:90641ms step_avg:151.32ms step:610/1480 train_time:90795ms step_avg:151.33ms 
step:611/1480 train_time:90950ms step_avg:151.33ms step:612/1480 train_time:91104ms step_avg:151.34ms step:613/1480 train_time:91259ms step_avg:151.34ms step:614/1480 train_time:91415ms step_avg:151.35ms step:615/1480 train_time:91569ms step_avg:151.35ms step:616/1480 train_time:91724ms step_avg:151.36ms step:617/1480 train_time:91879ms step_avg:151.37ms step:618/1480 train_time:92033ms step_avg:151.37ms step:619/1480 train_time:92189ms step_avg:151.38ms step:620/1480 train_time:92342ms step_avg:151.38ms step:621/1480 train_time:92500ms step_avg:151.39ms step:622/1480 train_time:92654ms step_avg:151.40ms step:623/1480 train_time:92810ms step_avg:151.40ms step:624/1480 train_time:92965ms step_avg:151.41ms step:625/1480 train_time:93119ms step_avg:151.41ms step:625/1480 val_loss:3.6019 train_time:93190ms step_avg:151.53ms step:626/1480 train_time:93288ms step_avg:151.44ms step:627/1480 train_time:93434ms step_avg:151.43ms step:628/1480 train_time:93588ms step_avg:151.44ms step:629/1480 train_time:93741ms step_avg:151.44ms step:630/1480 train_time:93896ms step_avg:151.45ms step:631/1480 train_time:94050ms step_avg:151.45ms step:632/1480 train_time:94204ms step_avg:151.45ms step:633/1480 train_time:94359ms step_avg:151.46ms step:634/1480 train_time:94513ms step_avg:151.46ms step:635/1480 train_time:94668ms step_avg:151.47ms step:636/1480 train_time:94823ms step_avg:151.47ms step:637/1480 train_time:94978ms step_avg:151.48ms step:638/1480 train_time:95133ms step_avg:151.48ms step:639/1480 train_time:95288ms step_avg:151.49ms step:640/1480 train_time:95443ms step_avg:151.50ms step:641/1480 train_time:95597ms step_avg:151.50ms step:642/1480 train_time:95752ms step_avg:151.51ms step:643/1480 train_time:95906ms step_avg:151.51ms step:644/1480 train_time:96061ms step_avg:151.52ms step:645/1480 train_time:96216ms step_avg:151.52ms step:646/1480 train_time:96372ms step_avg:151.53ms step:647/1480 train_time:96527ms step_avg:151.53ms step:648/1480 train_time:96681ms 
step_avg:151.54ms step:649/1480 train_time:96836ms step_avg:151.54ms step:650/1480 train_time:96992ms step_avg:151.55ms step:651/1480 train_time:97148ms step_avg:151.56ms step:652/1480 train_time:97303ms step_avg:151.56ms step:653/1480 train_time:97458ms step_avg:151.57ms step:654/1480 train_time:97614ms step_avg:151.57ms step:655/1480 train_time:97769ms step_avg:151.58ms step:656/1480 train_time:97923ms step_avg:151.58ms step:657/1480 train_time:98077ms step_avg:151.59ms step:658/1480 train_time:98232ms step_avg:151.59ms step:659/1480 train_time:98387ms step_avg:151.60ms step:660/1480 train_time:98544ms step_avg:151.61ms step:661/1480 train_time:98700ms step_avg:151.61ms step:662/1480 train_time:98856ms step_avg:151.62ms step:663/1480 train_time:99010ms step_avg:151.62ms step:664/1480 train_time:99168ms step_avg:151.63ms step:665/1480 train_time:99324ms step_avg:151.64ms step:666/1480 train_time:99480ms step_avg:151.65ms step:667/1480 train_time:99636ms step_avg:151.65ms step:668/1480 train_time:99795ms step_avg:151.66ms step:669/1480 train_time:99953ms step_avg:151.67ms step:670/1480 train_time:100108ms step_avg:151.68ms step:671/1480 train_time:100266ms step_avg:151.69ms step:672/1480 train_time:100421ms step_avg:151.69ms step:673/1480 train_time:100578ms step_avg:151.70ms step:674/1480 train_time:100733ms step_avg:151.71ms step:675/1480 train_time:100890ms step_avg:151.71ms step:676/1480 train_time:101047ms step_avg:151.72ms step:677/1480 train_time:101202ms step_avg:151.73ms step:678/1480 train_time:101359ms step_avg:151.73ms step:679/1480 train_time:101516ms step_avg:151.74ms step:680/1480 train_time:101674ms step_avg:151.75ms step:681/1480 train_time:101829ms step_avg:151.76ms step:682/1480 train_time:101987ms step_avg:151.77ms step:683/1480 train_time:102144ms step_avg:151.77ms step:684/1480 train_time:102301ms step_avg:151.78ms step:685/1480 train_time:102458ms step_avg:151.79ms step:686/1480 train_time:102615ms step_avg:151.80ms step:687/1480 
train_time:102771ms step_avg:151.80ms step:688/1480 train_time:102929ms step_avg:151.81ms step:689/1480 train_time:103087ms step_avg:151.82ms step:690/1480 train_time:103247ms step_avg:151.83ms step:691/1480 train_time:103403ms step_avg:151.84ms step:692/1480 train_time:103559ms step_avg:151.85ms step:693/1480 train_time:103717ms step_avg:151.85ms step:694/1480 train_time:103873ms step_avg:151.86ms step:695/1480 train_time:104029ms step_avg:151.87ms step:696/1480 train_time:104185ms step_avg:151.87ms step:697/1480 train_time:104341ms step_avg:151.88ms step:698/1480 train_time:104497ms step_avg:151.89ms step:699/1480 train_time:104654ms step_avg:151.89ms step:700/1480 train_time:104810ms step_avg:151.90ms step:701/1480 train_time:104966ms step_avg:151.90ms step:702/1480 train_time:105121ms step_avg:151.91ms step:703/1480 train_time:105278ms step_avg:151.92ms step:704/1480 train_time:105434ms step_avg:151.92ms step:705/1480 train_time:105591ms step_avg:151.93ms step:706/1480 train_time:105750ms step_avg:151.94ms step:707/1480 train_time:105905ms step_avg:151.94ms step:708/1480 train_time:106060ms step_avg:151.95ms step:709/1480 train_time:106217ms step_avg:151.96ms step:710/1480 train_time:106372ms step_avg:151.96ms step:711/1480 train_time:106528ms step_avg:151.97ms step:712/1480 train_time:106686ms step_avg:151.97ms step:713/1480 train_time:106844ms step_avg:151.98ms step:714/1480 train_time:107000ms step_avg:151.99ms step:715/1480 train_time:107156ms step_avg:151.99ms step:716/1480 train_time:107310ms step_avg:152.00ms step:717/1480 train_time:107466ms step_avg:152.00ms step:718/1480 train_time:107622ms step_avg:152.01ms step:719/1480 train_time:107778ms step_avg:152.01ms step:720/1480 train_time:107935ms step_avg:152.02ms step:721/1480 train_time:108091ms step_avg:152.03ms step:722/1480 train_time:108249ms step_avg:152.04ms step:723/1480 train_time:108405ms step_avg:152.04ms step:724/1480 train_time:108562ms step_avg:152.05ms step:725/1480 train_time:108718ms 
step_avg:152.05ms step:726/1480 train_time:108874ms step_avg:152.06ms step:727/1480 train_time:109032ms step_avg:152.07ms step:728/1480 train_time:109189ms step_avg:152.07ms step:729/1480 train_time:109345ms step_avg:152.08ms step:730/1480 train_time:109503ms step_avg:152.09ms step:731/1480 train_time:109659ms step_avg:152.09ms step:732/1480 train_time:109815ms step_avg:152.10ms step:733/1480 train_time:109971ms step_avg:152.10ms step:734/1480 train_time:110128ms step_avg:152.11ms step:735/1480 train_time:110283ms step_avg:152.12ms step:736/1480 train_time:110439ms step_avg:152.12ms step:737/1480 train_time:110596ms step_avg:152.13ms step:738/1480 train_time:110751ms step_avg:152.13ms step:739/1480 train_time:110906ms step_avg:152.13ms step:740/1480 train_time:111065ms step_avg:152.14ms step:741/1480 train_time:111222ms step_avg:152.15ms step:742/1480 train_time:111377ms step_avg:152.15ms step:743/1480 train_time:111533ms step_avg:152.16ms step:744/1480 train_time:111690ms step_avg:152.17ms step:745/1480 train_time:111849ms step_avg:152.17ms step:746/1480 train_time:112005ms step_avg:152.18ms step:747/1480 train_time:112162ms step_avg:152.19ms step:748/1480 train_time:112321ms step_avg:152.20ms step:749/1480 train_time:112477ms step_avg:152.20ms step:750/1480 train_time:112632ms step_avg:152.21ms step:750/1480 val_loss:3.5459 train_time:112704ms step_avg:152.30ms step:751/1480 train_time:112802ms step_avg:152.23ms step:752/1480 train_time:112951ms step_avg:152.22ms step:753/1480 train_time:113106ms step_avg:152.23ms step:754/1480 train_time:113263ms step_avg:152.23ms step:755/1480 train_time:113418ms step_avg:152.24ms step:756/1480 train_time:113573ms step_avg:152.24ms step:757/1480 train_time:113730ms step_avg:152.25ms step:758/1480 train_time:113885ms step_avg:152.25ms step:759/1480 train_time:114057ms step_avg:152.28ms step:760/1480 train_time:114200ms step_avg:152.27ms step:761/1480 train_time:114355ms step_avg:152.27ms step:762/1480 train_time:114512ms 
step_avg:152.28ms step:763/1480 train_time:114669ms step_avg:152.28ms step:764/1480 train_time:114825ms step_avg:152.29ms step:765/1480 train_time:114983ms step_avg:152.29ms step:766/1480 train_time:115141ms step_avg:152.30ms step:767/1480 train_time:115297ms step_avg:152.31ms step:768/1480 train_time:115453ms step_avg:152.31ms step:769/1480 train_time:115610ms step_avg:152.32ms step:770/1480 train_time:115767ms step_avg:152.33ms step:771/1480 train_time:115925ms step_avg:152.33ms step:772/1480 train_time:116083ms step_avg:152.34ms step:773/1480 train_time:116240ms step_avg:152.35ms step:774/1480 train_time:116396ms step_avg:152.35ms step:775/1480 train_time:116553ms step_avg:152.36ms step:776/1480 train_time:116710ms step_avg:152.36ms step:777/1480 train_time:116870ms step_avg:152.37ms step:778/1480 train_time:117029ms step_avg:152.38ms step:779/1480 train_time:117186ms step_avg:152.39ms step:780/1480 train_time:117344ms step_avg:152.40ms step:781/1480 train_time:117503ms step_avg:152.40ms step:782/1480 train_time:117660ms step_avg:152.41ms step:783/1480 train_time:117817ms step_avg:152.41ms step:784/1480 train_time:117976ms step_avg:152.42ms step:785/1480 train_time:118135ms step_avg:152.43ms step:786/1480 train_time:118293ms step_avg:152.44ms step:787/1480 train_time:118453ms step_avg:152.45ms step:788/1480 train_time:118611ms step_avg:152.46ms step:789/1480 train_time:118767ms step_avg:152.46ms step:790/1480 train_time:118924ms step_avg:152.47ms step:791/1480 train_time:119084ms step_avg:152.48ms step:792/1480 train_time:119241ms step_avg:152.48ms step:793/1480 train_time:119399ms step_avg:152.49ms step:794/1480 train_time:119558ms step_avg:152.50ms step:795/1480 train_time:119717ms step_avg:152.51ms step:796/1480 train_time:119877ms step_avg:152.52ms step:797/1480 train_time:120038ms step_avg:152.53ms step:798/1480 train_time:120195ms step_avg:152.53ms step:799/1480 train_time:120356ms step_avg:152.54ms step:800/1480 train_time:120515ms step_avg:152.55ms 
step:801/1480 train_time:120673ms step_avg:152.56ms step:802/1480 train_time:120834ms step_avg:152.57ms step:803/1480 train_time:120992ms step_avg:152.58ms step:804/1480 train_time:121148ms step_avg:152.58ms step:805/1480 train_time:121309ms step_avg:152.59ms step:806/1480 train_time:121466ms step_avg:152.60ms step:807/1480 train_time:121622ms step_avg:152.60ms step:808/1480 train_time:121780ms step_avg:152.61ms step:809/1480 train_time:121938ms step_avg:152.61ms step:810/1480 train_time:122094ms step_avg:152.62ms step:811/1480 train_time:122253ms step_avg:152.63ms step:812/1480 train_time:122410ms step_avg:152.63ms step:813/1480 train_time:122567ms step_avg:152.64ms step:814/1480 train_time:122724ms step_avg:152.64ms step:815/1480 train_time:122881ms step_avg:152.65ms step:816/1480 train_time:123040ms step_avg:152.65ms step:817/1480 train_time:123196ms step_avg:152.66ms step:818/1480 train_time:123355ms step_avg:152.67ms step:819/1480 train_time:123514ms step_avg:152.67ms step:820/1480 train_time:123671ms step_avg:152.68ms step:821/1480 train_time:123828ms step_avg:152.69ms step:822/1480 train_time:123986ms step_avg:152.69ms step:823/1480 train_time:124144ms step_avg:152.70ms step:824/1480 train_time:124301ms step_avg:152.70ms step:825/1480 train_time:124461ms step_avg:152.71ms step:826/1480 train_time:124619ms step_avg:152.72ms step:827/1480 train_time:124777ms step_avg:152.73ms step:828/1480 train_time:124936ms step_avg:152.73ms step:829/1480 train_time:125095ms step_avg:152.74ms step:830/1480 train_time:125254ms step_avg:152.75ms step:831/1480 train_time:125412ms step_avg:152.76ms step:832/1480 train_time:125572ms step_avg:152.76ms step:833/1480 train_time:125730ms step_avg:152.77ms step:834/1480 train_time:125891ms step_avg:152.78ms step:835/1480 train_time:126048ms step_avg:152.79ms step:836/1480 train_time:126208ms step_avg:152.79ms step:837/1480 train_time:126366ms step_avg:152.80ms step:838/1480 train_time:126522ms step_avg:152.80ms step:839/1480 
train_time:126681ms step_avg:152.81ms step:840/1480 train_time:126839ms step_avg:152.82ms step:841/1480 train_time:126997ms step_avg:152.82ms step:842/1480 train_time:127155ms step_avg:152.83ms step:843/1480 train_time:127312ms step_avg:152.84ms step:844/1480 train_time:127468ms step_avg:152.84ms step:845/1480 train_time:127625ms step_avg:152.84ms step:846/1480 train_time:127784ms step_avg:152.85ms step:847/1480 train_time:127943ms step_avg:152.86ms step:848/1480 train_time:128101ms step_avg:152.87ms step:849/1480 train_time:128260ms step_avg:152.87ms step:850/1480 train_time:128419ms step_avg:152.88ms step:851/1480 train_time:128578ms step_avg:152.89ms step:852/1480 train_time:128736ms step_avg:152.89ms step:853/1480 train_time:128894ms step_avg:152.90ms step:854/1480 train_time:129053ms step_avg:152.91ms step:855/1480 train_time:129211ms step_avg:152.91ms step:856/1480 train_time:129369ms step_avg:152.92ms step:857/1480 train_time:129526ms step_avg:152.92ms step:858/1480 train_time:129686ms step_avg:152.93ms step:859/1480 train_time:129844ms step_avg:152.94ms step:860/1480 train_time:130002ms step_avg:152.94ms step:861/1480 train_time:130161ms step_avg:152.95ms step:862/1480 train_time:130323ms step_avg:152.96ms step:863/1480 train_time:130483ms step_avg:152.97ms step:864/1480 train_time:130641ms step_avg:152.98ms step:865/1480 train_time:130797ms step_avg:152.98ms step:866/1480 train_time:130956ms step_avg:152.99ms step:867/1480 train_time:131116ms step_avg:152.99ms step:868/1480 train_time:131273ms step_avg:153.00ms step:869/1480 train_time:131430ms step_avg:153.00ms step:870/1480 train_time:131590ms step_avg:153.01ms step:871/1480 train_time:131747ms step_avg:153.02ms step:872/1480 train_time:131905ms step_avg:153.02ms step:873/1480 train_time:132061ms step_avg:153.03ms step:874/1480 train_time:132222ms step_avg:153.03ms step:875/1480 train_time:132381ms step_avg:153.04ms step:875/1480 val_loss:3.5027 train_time:132452ms step_avg:153.12ms step:876/1480 
train_time:132543ms step_avg:153.05ms step:877/1480 train_time:132700ms step_avg:153.06ms step:878/1480 train_time:132858ms step_avg:153.06ms step:879/1480 train_time:133016ms step_avg:153.07ms step:880/1480 train_time:133174ms step_avg:153.07ms step:881/1480 train_time:133331ms step_avg:153.08ms step:882/1480 train_time:133490ms step_avg:153.08ms step:883/1480 train_time:133650ms step_avg:153.09ms step:884/1480 train_time:133811ms step_avg:153.10ms step:885/1480 train_time:133972ms step_avg:153.11ms step:886/1480 train_time:134131ms step_avg:153.12ms step:887/1480 train_time:134291ms step_avg:153.13ms step:888/1480 train_time:134456ms step_avg:153.14ms step:889/1480 train_time:134619ms step_avg:153.15ms step:890/1480 train_time:134775ms step_avg:153.15ms step:891/1480 train_time:134934ms step_avg:153.16ms step:892/1480 train_time:135094ms step_avg:153.17ms step:893/1480 train_time:135253ms step_avg:153.17ms step:894/1480 train_time:135413ms step_avg:153.18ms step:895/1480 train_time:135576ms step_avg:153.19ms step:896/1480 train_time:135733ms step_avg:153.20ms step:897/1480 train_time:135893ms step_avg:153.21ms step:898/1480 train_time:136054ms step_avg:153.21ms step:899/1480 train_time:136214ms step_avg:153.22ms step:900/1480 train_time:136372ms step_avg:153.23ms step:901/1480 train_time:136531ms step_avg:153.23ms step:902/1480 train_time:136690ms step_avg:153.24ms step:903/1480 train_time:136854ms step_avg:153.25ms step:904/1480 train_time:137014ms step_avg:153.26ms step:905/1480 train_time:137172ms step_avg:153.26ms step:906/1480 train_time:137331ms step_avg:153.27ms step:907/1480 train_time:137493ms step_avg:153.28ms step:908/1480 train_time:137651ms step_avg:153.29ms step:909/1480 train_time:137811ms step_avg:153.29ms step:910/1480 train_time:137974ms step_avg:153.30ms step:911/1480 train_time:138134ms step_avg:153.31ms step:912/1480 train_time:138294ms step_avg:153.32ms step:913/1480 train_time:138455ms step_avg:153.33ms step:914/1480 train_time:138617ms 
step_avg:153.34ms step:915/1480 train_time:138779ms step_avg:153.35ms step:916/1480 train_time:138937ms step_avg:153.35ms step:917/1480 train_time:139094ms step_avg:153.36ms step:918/1480 train_time:139257ms step_avg:153.37ms step:919/1480 train_time:139419ms step_avg:153.38ms step:920/1480 train_time:139579ms step_avg:153.38ms step:921/1480 train_time:139738ms step_avg:153.39ms step:922/1480 train_time:139898ms step_avg:153.40ms step:923/1480 train_time:140057ms step_avg:153.40ms step:924/1480 train_time:140216ms step_avg:153.41ms step:925/1480 train_time:140375ms step_avg:153.42ms step:926/1480 train_time:140535ms step_avg:153.42ms step:927/1480 train_time:140693ms step_avg:153.43ms step:928/1480 train_time:140852ms step_avg:153.43ms step:929/1480 train_time:141012ms step_avg:153.44ms step:930/1480 train_time:141171ms step_avg:153.45ms step:931/1480 train_time:141332ms step_avg:153.45ms step:932/1480 train_time:141492ms step_avg:153.46ms step:933/1480 train_time:141653ms step_avg:153.47ms step:934/1480 train_time:141813ms step_avg:153.48ms step:935/1480 train_time:141974ms step_avg:153.49ms step:936/1480 train_time:142134ms step_avg:153.49ms step:937/1480 train_time:142294ms step_avg:153.50ms step:938/1480 train_time:142452ms step_avg:153.50ms step:939/1480 train_time:142615ms step_avg:153.52ms step:940/1480 train_time:142778ms step_avg:153.52ms step:941/1480 train_time:142937ms step_avg:153.53ms step:942/1480 train_time:143095ms step_avg:153.54ms step:943/1480 train_time:143256ms step_avg:153.54ms step:944/1480 train_time:143418ms step_avg:153.55ms step:945/1480 train_time:143577ms step_avg:153.56ms step:946/1480 train_time:143740ms step_avg:153.57ms step:947/1480 train_time:143899ms step_avg:153.57ms step:948/1480 train_time:144059ms step_avg:153.58ms step:949/1480 train_time:144231ms step_avg:153.60ms step:950/1480 train_time:144378ms step_avg:153.59ms step:951/1480 train_time:144540ms step_avg:153.60ms step:952/1480 train_time:144698ms step_avg:153.61ms 
step:953/1480 train_time:144858ms step_avg:153.61ms step:954/1480 train_time:145021ms step_avg:153.62ms step:955/1480 train_time:145178ms step_avg:153.63ms step:956/1480 train_time:145337ms step_avg:153.63ms step:957/1480 train_time:145499ms step_avg:153.64ms step:958/1480 train_time:145663ms step_avg:153.65ms step:959/1480 train_time:145822ms step_avg:153.66ms step:960/1480 train_time:145982ms step_avg:153.67ms step:961/1480 train_time:146141ms step_avg:153.67ms step:962/1480 train_time:146299ms step_avg:153.68ms step:963/1480 train_time:146460ms step_avg:153.68ms step:964/1480 train_time:146621ms step_avg:153.69ms step:965/1480 train_time:146779ms step_avg:153.70ms step:966/1480 train_time:146939ms step_avg:153.70ms step:967/1480 train_time:147096ms step_avg:153.71ms step:968/1480 train_time:147257ms step_avg:153.71ms step:969/1480 train_time:147416ms step_avg:153.72ms step:970/1480 train_time:147574ms step_avg:153.72ms step:971/1480 train_time:147733ms step_avg:153.73ms step:972/1480 train_time:147891ms step_avg:153.73ms step:973/1480 train_time:148049ms step_avg:153.74ms step:974/1480 train_time:148212ms step_avg:153.75ms step:975/1480 train_time:148373ms step_avg:153.75ms step:976/1480 train_time:148533ms step_avg:153.76ms step:977/1480 train_time:148693ms step_avg:153.77ms step:978/1480 train_time:148853ms step_avg:153.77ms step:979/1480 train_time:149015ms step_avg:153.78ms step:980/1480 train_time:149174ms step_avg:153.79ms step:981/1480 train_time:149335ms step_avg:153.80ms step:982/1480 train_time:149493ms step_avg:153.80ms step:983/1480 train_time:149652ms step_avg:153.80ms step:984/1480 train_time:149812ms step_avg:153.81ms step:985/1480 train_time:149973ms step_avg:153.82ms step:986/1480 train_time:150135ms step_avg:153.83ms step:987/1480 train_time:150294ms step_avg:153.83ms step:988/1480 train_time:150455ms step_avg:153.84ms step:989/1480 train_time:150614ms step_avg:153.84ms step:990/1480 train_time:150776ms step_avg:153.85ms step:991/1480 
train_time:150937ms step_avg:153.86ms step:992/1480 train_time:151100ms step_avg:153.87ms step:993/1480 train_time:151267ms step_avg:153.88ms step:994/1480 train_time:151428ms step_avg:153.89ms step:995/1480 train_time:151588ms step_avg:153.90ms step:996/1480 train_time:151745ms step_avg:153.90ms step:997/1480 train_time:151905ms step_avg:153.91ms step:998/1480 train_time:152063ms step_avg:153.91ms step:999/1480 train_time:152223ms step_avg:153.92ms step:1000/1480 train_time:152385ms step_avg:153.92ms step:1000/1480 val_loss:3.4393 train_time:152457ms step_avg:154.00ms step:1001/1480 train_time:152556ms step_avg:153.94ms step:1002/1480 train_time:152707ms step_avg:153.94ms step:1003/1480 train_time:152872ms step_avg:153.95ms step:1004/1480 train_time:153033ms step_avg:153.96ms step:1005/1480 train_time:153195ms step_avg:153.96ms step:1006/1480 train_time:153355ms step_avg:153.97ms step:1007/1480 train_time:153515ms step_avg:153.98ms step:1008/1480 train_time:153677ms step_avg:153.99ms step:1009/1480 train_time:153841ms step_avg:154.00ms step:1010/1480 train_time:154000ms step_avg:154.00ms step:1011/1480 train_time:154158ms step_avg:154.00ms step:1012/1480 train_time:154318ms step_avg:154.01ms step:1013/1480 train_time:154479ms step_avg:154.02ms step:1014/1480 train_time:154639ms step_avg:154.02ms step:1015/1480 train_time:154802ms step_avg:154.03ms step:1016/1480 train_time:154963ms step_avg:154.04ms step:1017/1480 train_time:155123ms step_avg:154.05ms step:1018/1480 train_time:155284ms step_avg:154.05ms step:1019/1480 train_time:155446ms step_avg:154.06ms step:1020/1480 train_time:155607ms step_avg:154.07ms step:1021/1480 train_time:155765ms step_avg:154.07ms step:1022/1480 train_time:155925ms step_avg:154.08ms step:1023/1480 train_time:156086ms step_avg:154.08ms step:1024/1480 train_time:156249ms step_avg:154.09ms step:1025/1480 train_time:156413ms step_avg:154.10ms step:1026/1480 train_time:156574ms step_avg:154.11ms step:1027/1480 train_time:156733ms 
step_avg:154.11ms step:1028/1480 train_time:156897ms step_avg:154.12ms step:1029/1480 train_time:157059ms step_avg:154.13ms step:1030/1480 train_time:157221ms step_avg:154.14ms step:1031/1480 train_time:157380ms step_avg:154.14ms step:1032/1480 train_time:157543ms step_avg:154.15ms step:1033/1480 train_time:157703ms step_avg:154.16ms step:1034/1480 train_time:157862ms step_avg:154.16ms step:1035/1480 train_time:158025ms step_avg:154.17ms step:1036/1480 train_time:158184ms step_avg:154.18ms step:1037/1480 train_time:158343ms step_avg:154.18ms step:1038/1480 train_time:158502ms step_avg:154.19ms step:1039/1480 train_time:158663ms step_avg:154.19ms step:1040/1480 train_time:158822ms step_avg:154.20ms step:1041/1480 train_time:158982ms step_avg:154.20ms step:1042/1480 train_time:159140ms step_avg:154.21ms step:1043/1480 train_time:159299ms step_avg:154.21ms step:1044/1480 train_time:159458ms step_avg:154.22ms step:1045/1480 train_time:159621ms step_avg:154.22ms step:1046/1480 train_time:159781ms step_avg:154.23ms step:1047/1480 train_time:159941ms step_avg:154.23ms step:1048/1480 train_time:160102ms step_avg:154.24ms step:1049/1480 train_time:160263ms step_avg:154.25ms step:1050/1480 train_time:160424ms step_avg:154.25ms step:1051/1480 train_time:160587ms step_avg:154.26ms step:1052/1480 train_time:160746ms step_avg:154.27ms step:1053/1480 train_time:160907ms step_avg:154.27ms step:1054/1480 train_time:161067ms step_avg:154.28ms step:1055/1480 train_time:161225ms step_avg:154.28ms step:1056/1480 train_time:161386ms step_avg:154.29ms step:1057/1480 train_time:161547ms step_avg:154.30ms step:1058/1480 train_time:161709ms step_avg:154.30ms step:1059/1480 train_time:161873ms step_avg:154.31ms step:1060/1480 train_time:162036ms step_avg:154.32ms step:1061/1480 train_time:162194ms step_avg:154.32ms step:1062/1480 train_time:162353ms step_avg:154.33ms step:1063/1480 train_time:162513ms step_avg:154.33ms step:1064/1480 train_time:162673ms step_avg:154.34ms step:1065/1480 
train_time:162835ms step_avg:154.35ms step:1066/1480 train_time:162996ms step_avg:154.35ms step:1067/1480 train_time:163158ms step_avg:154.36ms step:1068/1480 train_time:163318ms step_avg:154.36ms step:1069/1480 train_time:163483ms step_avg:154.37ms step:1070/1480 train_time:163642ms step_avg:154.38ms step:1071/1480 train_time:163805ms step_avg:154.39ms step:1072/1480 train_time:163963ms step_avg:154.39ms step:1073/1480 train_time:164121ms step_avg:154.39ms step:1074/1480 train_time:164280ms step_avg:154.40ms step:1075/1480 train_time:164440ms step_avg:154.40ms step:1076/1480 train_time:164599ms step_avg:154.41ms step:1077/1480 train_time:164758ms step_avg:154.41ms step:1078/1480 train_time:164924ms step_avg:154.42ms step:1079/1480 train_time:165088ms step_avg:154.43ms step:1080/1480 train_time:165249ms step_avg:154.44ms step:1081/1480 train_time:165410ms step_avg:154.44ms step:1082/1480 train_time:165571ms step_avg:154.45ms step:1083/1480 train_time:165732ms step_avg:154.46ms step:1084/1480 train_time:165893ms step_avg:154.46ms step:1085/1480 train_time:166055ms step_avg:154.47ms step:1086/1480 train_time:166216ms step_avg:154.48ms step:1087/1480 train_time:166377ms step_avg:154.48ms step:1088/1480 train_time:166536ms step_avg:154.49ms step:1089/1480 train_time:166700ms step_avg:154.49ms step:1090/1480 train_time:166861ms step_avg:154.50ms step:1091/1480 train_time:167025ms step_avg:154.51ms step:1092/1480 train_time:167186ms step_avg:154.52ms step:1093/1480 train_time:167347ms step_avg:154.52ms step:1094/1480 train_time:167506ms step_avg:154.53ms step:1095/1480 train_time:167664ms step_avg:154.53ms step:1096/1480 train_time:167826ms step_avg:154.54ms step:1097/1480 train_time:167988ms step_avg:154.54ms step:1098/1480 train_time:168153ms step_avg:154.55ms step:1099/1480 train_time:168316ms step_avg:154.56ms step:1100/1480 train_time:168479ms step_avg:154.57ms step:1101/1480 train_time:168641ms step_avg:154.57ms step:1102/1480 train_time:168802ms step_avg:154.58ms 
step:1103/1480 train_time:168967ms step_avg:154.59ms step:1104/1480 train_time:169128ms step_avg:154.60ms step:1105/1480 train_time:169292ms step_avg:154.60ms step:1106/1480 train_time:169455ms step_avg:154.61ms step:1107/1480 train_time:169617ms step_avg:154.62ms step:1108/1480 train_time:169778ms step_avg:154.62ms step:1109/1480 train_time:169938ms step_avg:154.63ms step:1110/1480 train_time:170097ms step_avg:154.63ms step:1111/1480 train_time:170259ms step_avg:154.64ms step:1112/1480 train_time:170423ms step_avg:154.65ms step:1113/1480 train_time:170592ms step_avg:154.66ms step:1114/1480 train_time:170754ms step_avg:154.67ms step:1115/1480 train_time:170917ms step_avg:154.68ms step:1116/1480 train_time:171077ms step_avg:154.68ms step:1117/1480 train_time:171242ms step_avg:154.69ms step:1118/1480 train_time:171406ms step_avg:154.70ms step:1119/1480 train_time:171566ms step_avg:154.70ms step:1120/1480 train_time:171727ms step_avg:154.71ms step:1121/1480 train_time:171890ms step_avg:154.72ms step:1122/1480 train_time:172051ms step_avg:154.72ms step:1123/1480 train_time:172213ms step_avg:154.73ms step:1124/1480 train_time:172375ms step_avg:154.74ms step:1125/1480 train_time:172537ms step_avg:154.74ms step:1125/1480 val_loss:3.3838 train_time:172613ms step_avg:154.81ms step:1126/1480 train_time:172709ms step_avg:154.76ms step:1127/1480 train_time:172862ms step_avg:154.76ms step:1128/1480 train_time:173022ms step_avg:154.76ms step:1129/1480 train_time:173185ms step_avg:154.77ms step:1130/1480 train_time:173346ms step_avg:154.77ms step:1131/1480 train_time:173514ms step_avg:154.79ms step:1132/1480 train_time:173674ms step_avg:154.79ms step:1133/1480 train_time:173837ms step_avg:154.80ms step:1134/1480 train_time:173999ms step_avg:154.80ms step:1135/1480 train_time:174159ms step_avg:154.81ms step:1136/1480 train_time:174319ms step_avg:154.81ms step:1137/1480 train_time:174480ms step_avg:154.82ms step:1138/1480 train_time:174645ms step_avg:154.83ms step:1139/1480 
train_time:174820ms step_avg:154.84ms step:1140/1480 train_time:174970ms step_avg:154.84ms step:1141/1480 train_time:175134ms step_avg:154.85ms step:1142/1480 train_time:175294ms step_avg:154.85ms step:1143/1480 train_time:175459ms step_avg:154.86ms step:1144/1480 train_time:175622ms step_avg:154.87ms step:1145/1480 train_time:175780ms step_avg:154.87ms step:1146/1480 train_time:175944ms step_avg:154.88ms step:1147/1480 train_time:176105ms step_avg:154.89ms step:1148/1480 train_time:176268ms step_avg:154.89ms step:1149/1480 train_time:176432ms step_avg:154.90ms step:1150/1480 train_time:176593ms step_avg:154.91ms step:1151/1480 train_time:176757ms step_avg:154.91ms step:1152/1480 train_time:176919ms step_avg:154.92ms step:1153/1480 train_time:177083ms step_avg:154.93ms step:1154/1480 train_time:177244ms step_avg:154.93ms step:1155/1480 train_time:177405ms step_avg:154.94ms step:1156/1480 train_time:177575ms step_avg:154.95ms step:1157/1480 train_time:177737ms step_avg:154.96ms step:1158/1480 train_time:177897ms step_avg:154.96ms step:1159/1480 train_time:178058ms step_avg:154.97ms step:1160/1480 train_time:178216ms step_avg:154.97ms step:1161/1480 train_time:178379ms step_avg:154.98ms step:1162/1480 train_time:178541ms step_avg:154.98ms step:1163/1480 train_time:178704ms step_avg:154.99ms step:1164/1480 train_time:178869ms step_avg:155.00ms step:1165/1480 train_time:179033ms step_avg:155.01ms step:1166/1480 train_time:179193ms step_avg:155.01ms step:1167/1480 train_time:179354ms step_avg:155.02ms step:1168/1480 train_time:179514ms step_avg:155.02ms step:1169/1480 train_time:179676ms step_avg:155.03ms step:1170/1480 train_time:179836ms step_avg:155.03ms step:1171/1480 train_time:179997ms step_avg:155.04ms step:1172/1480 train_time:180158ms step_avg:155.04ms step:1173/1480 train_time:180320ms step_avg:155.05ms step:1174/1480 train_time:180492ms step_avg:155.06ms step:1175/1480 train_time:180655ms step_avg:155.07ms step:1176/1480 train_time:180817ms step_avg:155.07ms 
step:1177/1480 train_time:180983ms step_avg:155.08ms step:1178/1480 train_time:181145ms step_avg:155.09ms step:1179/1480 train_time:181304ms step_avg:155.09ms step:1180/1480 train_time:181475ms step_avg:155.11ms step:1181/1480 train_time:181638ms step_avg:155.11ms step:1182/1480 train_time:181797ms step_avg:155.12ms step:1183/1480 train_time:181959ms step_avg:155.12ms step:1184/1480 train_time:182119ms step_avg:155.13ms step:1185/1480 train_time:182283ms step_avg:155.13ms step:1186/1480 train_time:182444ms step_avg:155.14ms step:1187/1480 train_time:182620ms step_avg:155.16ms step:1188/1480 train_time:182780ms step_avg:155.16ms step:1189/1480 train_time:182941ms step_avg:155.17ms step:1190/1480 train_time:183103ms step_avg:155.17ms step:1191/1480 train_time:183266ms step_avg:155.18ms step:1192/1480 train_time:183426ms step_avg:155.18ms step:1193/1480 train_time:183586ms step_avg:155.19ms step:1194/1480 train_time:183750ms step_avg:155.19ms step:1195/1480 train_time:183913ms step_avg:155.20ms step:1196/1480 train_time:184083ms step_avg:155.21ms step:1197/1480 train_time:184246ms step_avg:155.22ms step:1198/1480 train_time:184414ms step_avg:155.23ms step:1199/1480 train_time:184577ms step_avg:155.24ms step:1200/1480 train_time:184737ms step_avg:155.24ms step:1201/1480 train_time:184897ms step_avg:155.25ms step:1202/1480 train_time:185067ms step_avg:155.26ms step:1203/1480 train_time:185233ms step_avg:155.27ms step:1204/1480 train_time:185396ms step_avg:155.27ms step:1205/1480 train_time:185557ms step_avg:155.28ms step:1206/1480 train_time:185717ms step_avg:155.28ms step:1207/1480 train_time:185879ms step_avg:155.29ms step:1208/1480 train_time:186039ms step_avg:155.29ms step:1209/1480 train_time:186201ms step_avg:155.30ms step:1210/1480 train_time:186367ms step_avg:155.31ms step:1211/1480 train_time:186532ms step_avg:155.31ms step:1212/1480 train_time:186695ms step_avg:155.32ms step:1213/1480 train_time:186858ms step_avg:155.33ms step:1214/1480 train_time:187023ms 
step_avg:155.34ms step:1215/1480 train_time:187186ms step_avg:155.34ms step:1216/1480 train_time:187348ms step_avg:155.35ms step:1217/1480 train_time:187512ms step_avg:155.35ms step:1218/1480 train_time:187674ms step_avg:155.36ms step:1219/1480 train_time:187840ms step_avg:155.37ms step:1220/1480 train_time:188002ms step_avg:155.37ms step:1221/1480 train_time:188163ms step_avg:155.38ms step:1222/1480 train_time:188324ms step_avg:155.38ms step:1223/1480 train_time:188487ms step_avg:155.39ms step:1224/1480 train_time:188655ms step_avg:155.40ms step:1225/1480 train_time:188818ms step_avg:155.41ms step:1226/1480 train_time:188983ms step_avg:155.41ms step:1227/1480 train_time:189148ms step_avg:155.42ms step:1228/1480 train_time:189311ms step_avg:155.43ms step:1229/1480 train_time:189475ms step_avg:155.43ms step:1230/1480 train_time:189642ms step_avg:155.44ms step:1231/1480 train_time:189806ms step_avg:155.45ms step:1232/1480 train_time:189973ms step_avg:155.46ms step:1233/1480 train_time:190134ms step_avg:155.47ms step:1234/1480 train_time:190295ms step_avg:155.47ms step:1235/1480 train_time:190459ms step_avg:155.48ms step:1236/1480 train_time:190620ms step_avg:155.48ms step:1237/1480 train_time:190782ms step_avg:155.49ms step:1238/1480 train_time:190957ms step_avg:155.50ms step:1239/1480 train_time:191119ms step_avg:155.51ms step:1240/1480 train_time:191283ms step_avg:155.51ms step:1241/1480 train_time:191450ms step_avg:155.52ms step:1242/1480 train_time:191612ms step_avg:155.53ms step:1243/1480 train_time:191777ms step_avg:155.54ms step:1244/1480 train_time:191937ms step_avg:155.54ms step:1245/1480 train_time:192099ms step_avg:155.55ms step:1246/1480 train_time:192261ms step_avg:155.55ms step:1247/1480 train_time:192424ms step_avg:155.56ms step:1248/1480 train_time:192584ms step_avg:155.56ms step:1249/1480 train_time:192746ms step_avg:155.57ms step:1250/1480 train_time:192910ms step_avg:155.57ms step:1250/1480 val_loss:3.3344 train_time:192985ms step_avg:155.63ms 
step:1251/1480 train_time:193083ms step_avg:155.59ms step:1252/1480 train_time:193242ms step_avg:155.59ms step:1253/1480 train_time:193403ms step_avg:155.59ms step:1254/1480 train_time:193564ms step_avg:155.60ms step:1255/1480 train_time:193734ms step_avg:155.61ms step:1256/1480 train_time:193899ms step_avg:155.62ms step:1257/1480 train_time:194062ms step_avg:155.62ms step:1258/1480 train_time:194228ms step_avg:155.63ms step:1259/1480 train_time:194391ms step_avg:155.64ms step:1260/1480 train_time:194551ms step_avg:155.64ms step:1261/1480 train_time:194713ms step_avg:155.65ms step:1262/1480 train_time:194879ms step_avg:155.65ms step:1263/1480 train_time:195044ms step_avg:155.66ms step:1264/1480 train_time:195205ms step_avg:155.67ms step:1265/1480 train_time:195364ms step_avg:155.67ms step:1266/1480 train_time:195527ms step_avg:155.67ms step:1267/1480 train_time:195687ms step_avg:155.68ms step:1268/1480 train_time:195849ms step_avg:155.68ms step:1269/1480 train_time:196015ms step_avg:155.69ms step:1270/1480 train_time:196179ms step_avg:155.70ms step:1271/1480 train_time:196343ms step_avg:155.70ms step:1272/1480 train_time:196503ms step_avg:155.71ms step:1273/1480 train_time:196667ms step_avg:155.71ms step:1274/1480 train_time:196831ms step_avg:155.72ms step:1275/1480 train_time:196992ms step_avg:155.72ms step:1276/1480 train_time:197153ms step_avg:155.73ms step:1277/1480 train_time:197316ms step_avg:155.74ms step:1278/1480 train_time:197478ms step_avg:155.74ms step:1279/1480 train_time:197641ms step_avg:155.75ms step:1280/1480 train_time:197809ms step_avg:155.75ms step:1281/1480 train_time:197969ms step_avg:155.76ms step:1282/1480 train_time:198129ms step_avg:155.76ms step:1283/1480 train_time:198291ms step_avg:155.77ms step:1284/1480 train_time:198454ms step_avg:155.77ms step:1285/1480 train_time:198616ms step_avg:155.78ms step:1286/1480 train_time:198780ms step_avg:155.78ms step:1287/1480 train_time:198942ms step_avg:155.79ms step:1288/1480 train_time:199105ms 
step_avg:155.79ms step:1289/1480 train_time:199273ms step_avg:155.80ms step:1290/1480 train_time:199444ms step_avg:155.82ms step:1291/1480 train_time:199608ms step_avg:155.82ms step:1292/1480 train_time:199769ms step_avg:155.83ms step:1293/1480 train_time:199939ms step_avg:155.84ms step:1294/1480 train_time:200104ms step_avg:155.84ms step:1295/1480 train_time:200267ms step_avg:155.85ms step:1296/1480 train_time:200429ms step_avg:155.85ms step:1297/1480 train_time:200592ms step_avg:155.86ms step:1298/1480 train_time:200755ms step_avg:155.87ms step:1299/1480 train_time:200920ms step_avg:155.87ms step:1300/1480 train_time:201082ms step_avg:155.88ms step:1301/1480 train_time:201243ms step_avg:155.88ms step:1302/1480 train_time:201409ms step_avg:155.89ms step:1303/1480 train_time:201576ms step_avg:155.90ms step:1304/1480 train_time:201742ms step_avg:155.91ms step:1305/1480 train_time:201903ms step_avg:155.91ms step:1306/1480 train_time:202067ms step_avg:155.92ms step:1307/1480 train_time:202227ms step_avg:155.92ms step:1308/1480 train_time:202387ms step_avg:155.92ms step:1309/1480 train_time:202553ms step_avg:155.93ms step:1310/1480 train_time:202714ms step_avg:155.93ms step:1311/1480 train_time:202877ms step_avg:155.94ms step:1312/1480 train_time:203041ms step_avg:155.95ms step:1313/1480 train_time:203203ms step_avg:155.95ms step:1314/1480 train_time:203367ms step_avg:155.96ms step:1315/1480 train_time:203529ms step_avg:155.96ms step:1316/1480 train_time:203688ms step_avg:155.96ms step:1317/1480 train_time:203849ms step_avg:155.97ms step:1318/1480 train_time:204016ms step_avg:155.98ms step:1319/1480 train_time:204183ms step_avg:155.98ms step:1320/1480 train_time:204349ms step_avg:155.99ms step:1321/1480 train_time:204513ms step_avg:156.00ms step:1322/1480 train_time:204684ms step_avg:156.01ms step:1323/1480 train_time:204847ms step_avg:156.01ms step:1324/1480 train_time:205010ms step_avg:156.02ms step:1325/1480 train_time:205180ms step_avg:156.03ms step:1326/1480 
train_time:205346ms step_avg:156.04ms step:1327/1480 train_time:205508ms step_avg:156.04ms step:1328/1480 train_time:205670ms step_avg:156.05ms step:1329/1480 train_time:205858ms step_avg:156.07ms step:1330/1480 train_time:206018ms step_avg:156.07ms step:1331/1480 train_time:206183ms step_avg:156.08ms step:1332/1480 train_time:206346ms step_avg:156.09ms step:1333/1480 train_time:206509ms step_avg:156.09ms step:1334/1480 train_time:206672ms step_avg:156.10ms step:1335/1480 train_time:206833ms step_avg:156.10ms step:1336/1480 train_time:207003ms step_avg:156.11ms step:1337/1480 train_time:207168ms step_avg:156.12ms step:1338/1480 train_time:207331ms step_avg:156.12ms step:1339/1480 train_time:207496ms step_avg:156.13ms step:1340/1480 train_time:207660ms step_avg:156.14ms step:1341/1480 train_time:207823ms step_avg:156.14ms step:1342/1480 train_time:207989ms step_avg:156.15ms step:1343/1480 train_time:208149ms step_avg:156.15ms step:1344/1480 train_time:208311ms step_avg:156.16ms step:1345/1480 train_time:208481ms step_avg:156.17ms step:1346/1480 train_time:208642ms step_avg:156.17ms step:1347/1480 train_time:208806ms step_avg:156.18ms step:1348/1480 train_time:208969ms step_avg:156.18ms step:1349/1480 train_time:209131ms step_avg:156.18ms step:1350/1480 train_time:209299ms step_avg:156.19ms step:1351/1480 train_time:209462ms step_avg:156.20ms step:1352/1480 train_time:209624ms step_avg:156.20ms step:1353/1480 train_time:209790ms step_avg:156.21ms step:1354/1480 train_time:209953ms step_avg:156.21ms step:1355/1480 train_time:210114ms step_avg:156.22ms step:1356/1480 train_time:210279ms step_avg:156.22ms step:1357/1480 train_time:210444ms step_avg:156.23ms step:1358/1480 train_time:210607ms step_avg:156.24ms step:1359/1480 train_time:210771ms step_avg:156.24ms step:1360/1480 train_time:210938ms step_avg:156.25ms step:1361/1480 train_time:211107ms step_avg:156.26ms step:1362/1480 train_time:211270ms step_avg:156.27ms step:1363/1480 train_time:211439ms step_avg:156.27ms 
step:1364/1480 train_time:211602ms step_avg:156.28ms step:1365/1480 train_time:211762ms step_avg:156.28ms step:1366/1480 train_time:211928ms step_avg:156.29ms step:1367/1480 train_time:212090ms step_avg:156.29ms step:1368/1480 train_time:212256ms step_avg:156.30ms step:1369/1480 train_time:212427ms step_avg:156.31ms step:1370/1480 train_time:212592ms step_avg:156.32ms step:1371/1480 train_time:212754ms step_avg:156.32ms step:1372/1480 train_time:212923ms step_avg:156.33ms step:1373/1480 train_time:213083ms step_avg:156.33ms step:1374/1480 train_time:213248ms step_avg:156.34ms step:1375/1480 train_time:213411ms step_avg:156.34ms step:1375/1480 val_loss:3.2963 train_time:213486ms step_avg:156.40ms step:1376/1480 train_time:213579ms step_avg:156.35ms step:1377/1480 train_time:213742ms step_avg:156.36ms step:1378/1480 train_time:213904ms step_avg:156.36ms step:1379/1480 train_time:214069ms step_avg:156.37ms step:1380/1480 train_time:214232ms step_avg:156.37ms step:1381/1480 train_time:214400ms step_avg:156.38ms step:1382/1480 train_time:214565ms step_avg:156.39ms step:1383/1480 train_time:214728ms step_avg:156.39ms step:1384/1480 train_time:214893ms step_avg:156.40ms step:1385/1480 train_time:215053ms step_avg:156.40ms step:1386/1480 train_time:215216ms step_avg:156.41ms step:1387/1480 train_time:215381ms step_avg:156.41ms step:1388/1480 train_time:215542ms step_avg:156.42ms step:1389/1480 train_time:215707ms step_avg:156.42ms step:1390/1480 train_time:215869ms step_avg:156.43ms step:1391/1480 train_time:216031ms step_avg:156.43ms step:1392/1480 train_time:216194ms step_avg:156.44ms step:1393/1480 train_time:216355ms step_avg:156.44ms step:1394/1480 train_time:216519ms step_avg:156.44ms step:1395/1480 train_time:216683ms step_avg:156.45ms step:1396/1480 train_time:216846ms step_avg:156.45ms step:1397/1480 train_time:217006ms step_avg:156.46ms step:1398/1480 train_time:217167ms step_avg:156.46ms step:1399/1480 train_time:217330ms step_avg:156.47ms step:1400/1480 
train_time:217499ms step_avg:156.47ms step:1401/1480 train_time:217660ms step_avg:156.48ms step:1402/1480 train_time:217823ms step_avg:156.48ms step:1403/1480 train_time:217990ms step_avg:156.49ms step:1404/1480 train_time:218152ms step_avg:156.49ms step:1405/1480 train_time:218316ms step_avg:156.50ms step:1406/1480 train_time:218481ms step_avg:156.50ms step:1407/1480 train_time:218644ms step_avg:156.51ms step:1408/1480 train_time:218806ms step_avg:156.51ms step:1409/1480 train_time:218977ms step_avg:156.52ms step:1410/1480 train_time:219140ms step_avg:156.53ms step:1411/1480 train_time:219300ms step_avg:156.53ms step:1412/1480 train_time:219463ms step_avg:156.54ms step:1413/1480 train_time:219626ms step_avg:156.54ms step:1414/1480 train_time:219790ms step_avg:156.55ms step:1415/1480 train_time:219955ms step_avg:156.55ms step:1416/1480 train_time:220129ms step_avg:156.56ms step:1417/1480 train_time:220293ms step_avg:156.57ms step:1418/1480 train_time:220457ms step_avg:156.57ms step:1419/1480 train_time:220622ms step_avg:156.58ms step:1420/1480 train_time:220787ms step_avg:156.59ms step:1421/1480 train_time:220951ms step_avg:156.59ms step:1422/1480 train_time:221114ms step_avg:156.60ms step:1423/1480 train_time:221276ms step_avg:156.60ms step:1424/1480 train_time:221446ms step_avg:156.61ms step:1425/1480 train_time:221615ms step_avg:156.62ms step:1426/1480 train_time:221780ms step_avg:156.62ms step:1427/1480 train_time:221947ms step_avg:156.63ms step:1428/1480 train_time:222108ms step_avg:156.63ms step:1429/1480 train_time:222270ms step_avg:156.64ms step:1430/1480 train_time:222435ms step_avg:156.64ms step:1431/1480 train_time:222600ms step_avg:156.65ms step:1432/1480 train_time:222769ms step_avg:156.66ms step:1433/1480 train_time:222937ms step_avg:156.67ms step:1434/1480 train_time:223106ms step_avg:156.68ms step:1435/1480 train_time:223271ms step_avg:156.68ms step:1436/1480 train_time:223436ms step_avg:156.69ms step:1437/1480 train_time:223599ms step_avg:156.69ms 
step:1438/1480 train_time:223761ms step_avg:156.70ms step:1439/1480 train_time:223927ms step_avg:156.70ms step:1440/1480 train_time:224089ms step_avg:156.71ms step:1441/1480 train_time:224253ms step_avg:156.71ms step:1442/1480 train_time:224420ms step_avg:156.72ms step:1443/1480 train_time:224593ms step_avg:156.73ms step:1444/1480 train_time:224756ms step_avg:156.73ms step:1445/1480 train_time:224917ms step_avg:156.74ms step:1446/1480 train_time:225084ms step_avg:156.74ms step:1447/1480 train_time:225252ms step_avg:156.75ms step:1448/1480 train_time:225413ms step_avg:156.75ms step:1449/1480 train_time:225576ms step_avg:156.76ms step:1450/1480 train_time:225741ms step_avg:156.76ms step:1451/1480 train_time:225905ms step_avg:156.77ms step:1452/1480 train_time:226070ms step_avg:156.78ms step:1453/1480 train_time:226232ms step_avg:156.78ms step:1454/1480 train_time:226395ms step_avg:156.78ms step:1455/1480 train_time:226566ms step_avg:156.79ms step:1456/1480 train_time:226729ms step_avg:156.80ms step:1457/1480 train_time:226891ms step_avg:156.80ms step:1458/1480 train_time:227054ms step_avg:156.81ms step:1459/1480 train_time:227219ms step_avg:156.81ms step:1460/1480 train_time:227383ms step_avg:156.82ms step:1461/1480 train_time:227549ms step_avg:156.82ms step:1462/1480 train_time:227712ms step_avg:156.83ms step:1463/1480 train_time:227878ms step_avg:156.83ms step:1464/1480 train_time:228045ms step_avg:156.84ms step:1465/1480 train_time:228208ms step_avg:156.84ms step:1466/1480 train_time:228370ms step_avg:156.85ms step:1467/1480 train_time:228536ms step_avg:156.85ms step:1468/1480 train_time:228699ms step_avg:156.86ms step:1469/1480 train_time:228864ms step_avg:156.86ms step:1470/1480 train_time:229032ms step_avg:156.87ms step:1471/1480 train_time:229204ms step_avg:156.88ms step:1472/1480 train_time:229374ms step_avg:156.89ms step:1473/1480 train_time:229536ms step_avg:156.89ms step:1474/1480 train_time:229704ms step_avg:156.90ms step:1475/1480 train_time:229874ms 
step_avg:156.91ms step:1476/1480 train_time:230037ms step_avg:156.91ms step:1477/1480 train_time:230206ms step_avg:156.92ms step:1478/1480 train_time:230376ms step_avg:156.93ms step:1479/1480 train_time:230542ms step_avg:156.94ms step:1480/1480 train_time:230705ms step_avg:156.94ms step:1480/1480 val_loss:3.2772 train_time:230781ms step_avg:156.99ms peak memory consumption: 34239 MiB