import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor (asserted).
        steps: Number of Newton-Schulz iterations to run.
        eps: Added to the norm before dividing, to avoid division by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # work on the wide orientation so that X @ X.T is the smaller Gram matrix
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose applied above
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        # rank / world size are read from the environment (set by the launcher)
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct element count, so that a single set of flat
        # all_gather buffers (one per rank) can be reused for every parameter in the group
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are processed in chunks of `world_size`: each rank
        orthogonalizes the update of the parameter at its own offset in the chunk, the flat
        updates are exchanged with an async all_gather, and the gathered updates of the
        previous chunk are applied while the next chunk is being computed.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # wait for the in-flight all_gather (if any) and apply its updates
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # the buffers are about to be overwritten: consume the previous chunk first
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            # apply the last chunk of this group
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight is passed)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        """Rotate `x`; dim 1 is taken as the sequence axis and dim 3 is split into two halves."""
        seq_len = x.shape[1]
        # rebuild the tables only when the sequence length changes
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, QK-norm, rotary embeddings and a
    learned mix between the layer's own values and an external value embedding `vi`."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # mixing weights for (own value, value embedding)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: expand to 4*dim, squared ReLU, project back to dim."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of the residual stream with the initial embedding
    `x0`, followed by pre-norm attention and pre-norm MLP residual updates."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # mixing weights for (current stream, x0); starts as identity on the stream
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-embedding tables whose outputs are returned followed by the same outputs
    in reverse order (12 tensors in total)."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        # mirror the list: [e0..e5, e5..e0]
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections between the first and second half of
    the blocks, per-layer value embeddings, and document-aware sliding-window attention."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the cross-entropy loss of `inputs` against `targets`.

        Arguments:
            inputs: 1D tensor of token ids (asserted); its length must be a multiple of
                BLOCK_SIZE (128). Token id 50256 is treated as the document delimiter.
            targets: Token ids compared against the logits, flattened.
            sliding_window_num_blocks: Attention window, counted in blocks of BLOCK_SIZE tokens.

        Returns the scalar loss tensor.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # running document index per token, plus its value at the first/last token of each block
        docs = (inputs == 50256).cumsum(0)
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal and within the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # convert a dense boolean block mask into (count, indices-with-True-first) form
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            # number of BLOCK_SIZE blocks in the sequence (512 for 64*1024 tokens);
            # derived from the input so the mask shapes always agree with docs_low/docs_high
            num_seq_blocks = docs_low.size(0)
            kv_idx = block_idx = torch.arange(num_seq_blocks, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # partial blocks still need mask_mod; full blocks do not
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Read only the header of a data shard and return the token count it declares."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256*4-byte header into a pinned tensor."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Iterates over token shards; each process reads its own `seq_len` slice of every
    global batch of `seq_len * num_processes` tokens."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return `(inputs, targets)` on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    logdir = Path("logs") / f"{run_id}"
    # parents=True: also create "logs/" itself on a fresh checkout instead of raising
    logdir.mkdir(parents=True, exist_ok=True)
    logfile = Path("logs") / f"{run_id}.txt"
    print(logfile.stem)
    # create the log file
    with logfile.open("w") as f:
        # begin the log by printing this file (the Python code)
        print(code, file=f)
        print("=" * 100, file=f)

def print0(s, logonly=False):
    """On the master process, append `s` to the log file and, unless `logonly`, also print it."""
    if master_process:
        with logfile.open("a") as f:
            if not logonly:
                print(s)
            print(s, file=f)

# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running python {sys.version}")
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# calculate the number of steps to take in the val loop.
assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0
val_steps = args.val_tokens // (args.sequence_length * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
inputs_train, targets_train = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768))
model = model.cuda().bfloat16()
# the CastedLinear modules are converted back to float32 after the bfloat16 cast
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()]
optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.blocks.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]

# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the learning-rate multiplier for iteration `it` (0 <= it <= num_iterations)."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        # with no cooldown configured this branch is only reached at it == num_iterations
        # (the scheduler step after the final update); avoid dividing by zero there
        if args.cooldown_iters <= 0:
            return 1.0
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size lives in a GPU tensor that is updated in place as training progresses
sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda")
sw_num_blocks_prev = 1
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social
    frac_done = step / args.num_iterations # training progress
    sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128)
    if sw_num_blocks != sw_num_blocks_prev:
        sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True)
        sw_num_blocks_prev = sw_num_blocks

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                inputs_val, targets_val = val_loader.next_batch()
                val_loss += model(inputs_val, targets_val, sliding_window_num_blocks)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for i in range(1, train_accumulation_steps + 1):
        with contextlib.ExitStack() as stack:
            if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step
                stack.enter_context(model.no_sync())
            if step >= 5:
                stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True))
            model(inputs_train, targets_train, sliding_window_num_blocks).backward()
            inputs_train, targets_train = train_loader.next_batch()
    if train_accumulation_steps != 1:
        for p in model.parameters():
            p.grad /= train_accumulation_steps
    # momentum warmup for Muon
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # NOTE: the remaining statements of this loop body (the per-step timing print) and the
    # post-loop cleanup follow on the next line of this dump, which also carries log output.
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 08:11:07 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 28C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 119W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29032ms step_avg:nanms step:2/1480 train_time:29669ms step_avg:nanms step:3/1480 train_time:29792ms step_avg:nanms step:4/1480 train_time:29931ms step_avg:nanms step:5/1480 train_time:30072ms step_avg:nanms step:6/1480 train_time:30217ms step_avg:nanms step:7/1480 train_time:30358ms step_avg:nanms step:8/1480 train_time:30501ms step_avg:nanms step:9/1480 train_time:30643ms step_avg:nanms step:10/1480 train_time:30786ms step_avg:nanms step:11/1480 train_time:142ms step_avg:nanms step:12/1480 train_time:281ms step_avg:nanms step:13/1480 train_time:423ms step_avg:141.09ms step:14/1480 train_time:565ms step_avg:141.20ms step:15/1480 train_time:708ms step_avg:141.66ms step:16/1480 train_time:852ms step_avg:142.00ms step:17/1480 train_time:993ms step_avg:141.93ms step:18/1480 train_time:1136ms step_avg:142.03ms step:19/1480 train_time:1278ms step_avg:141.98ms step:20/1480 train_time:1420ms step_avg:141.99ms step:21/1480 train_time:1563ms step_avg:142.09ms step:22/1480 train_time:1705ms step_avg:142.08ms step:23/1480 train_time:1849ms step_avg:142.24ms step:24/1480 train_time:1992ms step_avg:142.29ms step:25/1480 train_time:2136ms step_avg:142.43ms step:26/1480 train_time:2278ms step_avg:142.40ms step:27/1480 train_time:2419ms step_avg:142.31ms step:28/1480 train_time:2561ms step_avg:142.25ms 
step:29/1480 train_time:2705ms step_avg:142.36ms step:30/1480 train_time:3225ms step_avg:161.27ms step:31/1480 train_time:3325ms step_avg:158.33ms step:32/1480 train_time:3468ms step_avg:157.62ms step:33/1480 train_time:3611ms step_avg:156.98ms step:34/1480 train_time:3753ms step_avg:156.36ms step:35/1480 train_time:3894ms step_avg:155.75ms step:36/1480 train_time:4036ms step_avg:155.21ms step:37/1480 train_time:4179ms step_avg:154.77ms step:38/1480 train_time:4321ms step_avg:154.30ms step:39/1480 train_time:4463ms step_avg:153.89ms step:40/1480 train_time:4605ms step_avg:153.51ms step:41/1480 train_time:4748ms step_avg:153.17ms step:42/1480 train_time:4891ms step_avg:152.85ms step:43/1480 train_time:5034ms step_avg:152.56ms step:44/1480 train_time:5177ms step_avg:152.28ms step:45/1480 train_time:5319ms step_avg:151.97ms step:46/1480 train_time:5462ms step_avg:151.72ms step:47/1480 train_time:5605ms step_avg:151.49ms step:48/1480 train_time:5750ms step_avg:151.30ms step:49/1480 train_time:5892ms step_avg:151.08ms step:50/1480 train_time:6034ms step_avg:150.84ms step:51/1480 train_time:6175ms step_avg:150.61ms step:52/1480 train_time:6318ms step_avg:150.43ms step:53/1480 train_time:6459ms step_avg:150.21ms step:54/1480 train_time:6603ms step_avg:150.06ms step:55/1480 train_time:6747ms step_avg:149.92ms step:56/1480 train_time:6891ms step_avg:149.80ms step:57/1480 train_time:7036ms step_avg:149.69ms step:58/1480 train_time:7177ms step_avg:149.52ms step:59/1480 train_time:7318ms step_avg:149.35ms step:60/1480 train_time:7460ms step_avg:149.20ms step:61/1480 train_time:7602ms step_avg:149.06ms step:62/1480 train_time:7747ms step_avg:148.99ms step:63/1480 train_time:7890ms step_avg:148.88ms step:64/1480 train_time:8032ms step_avg:148.75ms step:65/1480 train_time:8174ms step_avg:148.62ms step:66/1480 train_time:8316ms step_avg:148.50ms step:67/1480 train_time:8459ms step_avg:148.41ms step:68/1480 train_time:8600ms step_avg:148.28ms step:69/1480 train_time:8743ms 
step_avg:148.19ms step:70/1480 train_time:8887ms step_avg:148.11ms step:71/1480 train_time:9030ms step_avg:148.03ms step:72/1480 train_time:9172ms step_avg:147.93ms step:73/1480 train_time:9314ms step_avg:147.85ms step:74/1480 train_time:9457ms step_avg:147.77ms step:75/1480 train_time:9600ms step_avg:147.69ms step:76/1480 train_time:9743ms step_avg:147.62ms step:77/1480 train_time:9885ms step_avg:147.54ms step:78/1480 train_time:10031ms step_avg:147.51ms step:79/1480 train_time:10174ms step_avg:147.45ms step:80/1480 train_time:10316ms step_avg:147.38ms step:81/1480 train_time:10458ms step_avg:147.30ms step:82/1480 train_time:10600ms step_avg:147.23ms step:83/1480 train_time:10743ms step_avg:147.17ms step:84/1480 train_time:10887ms step_avg:147.12ms step:85/1480 train_time:11031ms step_avg:147.08ms step:86/1480 train_time:11173ms step_avg:147.01ms step:87/1480 train_time:11316ms step_avg:146.96ms step:88/1480 train_time:11457ms step_avg:146.88ms step:89/1480 train_time:11597ms step_avg:146.80ms step:90/1480 train_time:11740ms step_avg:146.75ms step:91/1480 train_time:11882ms step_avg:146.69ms step:92/1480 train_time:12026ms step_avg:146.66ms step:93/1480 train_time:12170ms step_avg:146.63ms step:94/1480 train_time:12314ms step_avg:146.59ms step:95/1480 train_time:12457ms step_avg:146.55ms step:96/1480 train_time:12598ms step_avg:146.49ms step:97/1480 train_time:13119ms step_avg:150.80ms step:98/1480 train_time:13620ms step_avg:154.78ms step:99/1480 train_time:13720ms step_avg:154.16ms step:100/1480 train_time:13862ms step_avg:154.02ms step:101/1480 train_time:14006ms step_avg:153.92ms step:102/1480 train_time:14147ms step_avg:153.77ms step:103/1480 train_time:14289ms step_avg:153.64ms step:104/1480 train_time:14432ms step_avg:153.53ms step:105/1480 train_time:14575ms step_avg:153.42ms step:106/1480 train_time:14718ms step_avg:153.31ms step:107/1480 train_time:14860ms step_avg:153.19ms step:108/1480 train_time:15002ms step_avg:153.08ms step:109/1480 
train_time:15145ms step_avg:152.98ms step:110/1480 train_time:15288ms step_avg:152.88ms step:111/1480 train_time:15433ms step_avg:152.80ms step:112/1480 train_time:15577ms step_avg:152.72ms step:113/1480 train_time:15722ms step_avg:152.64ms step:114/1480 train_time:15867ms step_avg:152.57ms step:115/1480 train_time:16014ms step_avg:152.52ms step:116/1480 train_time:16159ms step_avg:152.44ms step:117/1480 train_time:16304ms step_avg:152.37ms step:118/1480 train_time:16451ms step_avg:152.32ms step:119/1480 train_time:16596ms step_avg:152.26ms step:120/1480 train_time:16741ms step_avg:152.19ms step:121/1480 train_time:16887ms step_avg:152.13ms step:122/1480 train_time:17034ms step_avg:152.09ms step:123/1480 train_time:17178ms step_avg:152.02ms step:124/1480 train_time:17324ms step_avg:151.96ms step:125/1480 train_time:17471ms step_avg:151.92ms step:125/1480 val_loss:4.4279 train_time:17536ms step_avg:152.48ms step:126/1480 train_time:17631ms step_avg:151.99ms step:127/1480 train_time:17773ms step_avg:151.91ms step:128/1480 train_time:17920ms step_avg:151.86ms step:129/1480 train_time:18066ms step_avg:151.82ms step:130/1480 train_time:18211ms step_avg:151.76ms step:131/1480 train_time:18356ms step_avg:151.70ms step:132/1480 train_time:18502ms step_avg:151.65ms step:133/1480 train_time:18648ms step_avg:151.61ms step:134/1480 train_time:18792ms step_avg:151.55ms step:135/1480 train_time:18938ms step_avg:151.50ms step:136/1480 train_time:19085ms step_avg:151.47ms step:137/1480 train_time:19229ms step_avg:151.41ms step:138/1480 train_time:19374ms step_avg:151.36ms step:139/1480 train_time:19520ms step_avg:151.32ms step:140/1480 train_time:19666ms step_avg:151.28ms step:141/1480 train_time:19810ms step_avg:151.22ms step:142/1480 train_time:19956ms step_avg:151.19ms step:143/1480 train_time:20104ms step_avg:151.15ms step:144/1480 train_time:20248ms step_avg:151.11ms step:145/1480 train_time:20392ms step_avg:151.06ms step:146/1480 train_time:20539ms step_avg:151.02ms 
step:147/1480 train_time:20686ms step_avg:150.99ms step:148/1480 train_time:20830ms step_avg:150.94ms step:149/1480 train_time:20977ms step_avg:150.92ms step:150/1480 train_time:21124ms step_avg:150.89ms step:151/1480 train_time:21271ms step_avg:150.86ms step:152/1480 train_time:21416ms step_avg:150.81ms step:153/1480 train_time:21562ms step_avg:150.79ms step:154/1480 train_time:21708ms step_avg:150.75ms step:155/1480 train_time:21854ms step_avg:150.71ms step:156/1480 train_time:22000ms step_avg:150.68ms step:157/1480 train_time:22146ms step_avg:150.66ms step:158/1480 train_time:22291ms step_avg:150.62ms step:159/1480 train_time:22436ms step_avg:150.58ms step:160/1480 train_time:22584ms step_avg:150.56ms step:161/1480 train_time:22729ms step_avg:150.52ms step:162/1480 train_time:22874ms step_avg:150.49ms step:163/1480 train_time:23020ms step_avg:150.46ms step:164/1480 train_time:23166ms step_avg:150.43ms step:165/1480 train_time:23310ms step_avg:150.39ms step:166/1480 train_time:23455ms step_avg:150.35ms step:167/1480 train_time:23601ms step_avg:150.33ms step:168/1480 train_time:23747ms step_avg:150.30ms step:169/1480 train_time:23891ms step_avg:150.26ms step:170/1480 train_time:24037ms step_avg:150.23ms step:171/1480 train_time:24184ms step_avg:150.21ms step:172/1480 train_time:24328ms step_avg:150.17ms step:173/1480 train_time:24474ms step_avg:150.15ms step:174/1480 train_time:24621ms step_avg:150.13ms step:175/1480 train_time:24767ms step_avg:150.10ms step:176/1480 train_time:24912ms step_avg:150.07ms step:177/1480 train_time:25057ms step_avg:150.04ms step:178/1480 train_time:25203ms step_avg:150.02ms step:179/1480 train_time:25348ms step_avg:149.99ms step:180/1480 train_time:25493ms step_avg:149.96ms step:181/1480 train_time:25639ms step_avg:149.94ms step:182/1480 train_time:25786ms step_avg:149.92ms step:183/1480 train_time:25929ms step_avg:149.88ms step:184/1480 train_time:26074ms step_avg:149.85ms step:185/1480 train_time:26220ms step_avg:149.83ms 
step:186/1480 train_time:26366ms step_avg:149.81ms step:187/1480 train_time:26510ms step_avg:149.78ms step:188/1480 train_time:26656ms step_avg:149.76ms step:189/1480 train_time:26827ms step_avg:149.87ms step:190/1480 train_time:26948ms step_avg:149.71ms step:191/1480 train_time:27093ms step_avg:149.68ms step:192/1480 train_time:27238ms step_avg:149.66ms step:193/1480 train_time:27385ms step_avg:149.64ms step:194/1480 train_time:27529ms step_avg:149.61ms step:195/1480 train_time:27673ms step_avg:149.58ms step:196/1480 train_time:27819ms step_avg:149.57ms step:197/1480 train_time:27966ms step_avg:149.55ms step:198/1480 train_time:28110ms step_avg:149.52ms step:199/1480 train_time:28257ms step_avg:149.51ms step:200/1480 train_time:28404ms step_avg:149.50ms step:201/1480 train_time:28550ms step_avg:149.48ms step:202/1480 train_time:28693ms step_avg:149.45ms step:203/1480 train_time:28839ms step_avg:149.42ms step:204/1480 train_time:28986ms step_avg:149.41ms step:205/1480 train_time:29129ms step_avg:149.38ms step:206/1480 train_time:29275ms step_avg:149.36ms step:207/1480 train_time:29421ms step_avg:149.35ms step:208/1480 train_time:29567ms step_avg:149.33ms step:209/1480 train_time:29712ms step_avg:149.31ms step:210/1480 train_time:29858ms step_avg:149.29ms step:211/1480 train_time:30005ms step_avg:149.28ms step:212/1480 train_time:30150ms step_avg:149.26ms step:213/1480 train_time:30294ms step_avg:149.23ms step:214/1480 train_time:30440ms step_avg:149.22ms step:215/1480 train_time:30587ms step_avg:149.21ms step:216/1480 train_time:30732ms step_avg:149.18ms step:217/1480 train_time:30877ms step_avg:149.17ms step:218/1480 train_time:31023ms step_avg:149.15ms step:219/1480 train_time:31169ms step_avg:149.13ms step:220/1480 train_time:31313ms step_avg:149.11ms step:221/1480 train_time:31873ms step_avg:151.06ms step:222/1480 train_time:31982ms step_avg:150.86ms step:223/1480 train_time:32130ms step_avg:150.84ms step:224/1480 train_time:32278ms step_avg:150.83ms 
step:225/1480 train_time:32427ms step_avg:150.82ms step:226/1480 train_time:32574ms step_avg:150.81ms step:227/1480 train_time:32723ms step_avg:150.80ms step:228/1480 train_time:32874ms step_avg:150.80ms step:229/1480 train_time:33023ms step_avg:150.79ms step:230/1480 train_time:33171ms step_avg:150.78ms step:231/1480 train_time:33321ms step_avg:150.78ms step:232/1480 train_time:33470ms step_avg:150.77ms step:233/1480 train_time:33618ms step_avg:150.75ms step:234/1480 train_time:33767ms step_avg:150.75ms step:235/1480 train_time:33915ms step_avg:150.73ms step:236/1480 train_time:34065ms step_avg:150.73ms step:237/1480 train_time:34212ms step_avg:150.71ms step:238/1480 train_time:34361ms step_avg:150.71ms step:239/1480 train_time:34510ms step_avg:150.70ms step:240/1480 train_time:34660ms step_avg:150.69ms step:241/1480 train_time:34809ms step_avg:150.69ms step:242/1480 train_time:34958ms step_avg:150.68ms step:243/1480 train_time:35107ms step_avg:150.67ms step:244/1480 train_time:35256ms step_avg:150.67ms step:245/1480 train_time:35406ms step_avg:150.66ms step:246/1480 train_time:35554ms step_avg:150.65ms step:247/1480 train_time:35703ms step_avg:150.65ms step:248/1480 train_time:35851ms step_avg:150.64ms step:249/1480 train_time:35999ms step_avg:150.63ms step:250/1480 train_time:36149ms step_avg:150.62ms step:250/1480 val_loss:3.9912 train_time:36215ms step_avg:150.90ms step:251/1480 train_time:36311ms step_avg:150.67ms step:252/1480 train_time:36455ms step_avg:150.64ms step:253/1480 train_time:36603ms step_avg:150.63ms step:254/1480 train_time:36752ms step_avg:150.62ms step:255/1480 train_time:36900ms step_avg:150.61ms step:256/1480 train_time:37048ms step_avg:150.60ms step:257/1480 train_time:37197ms step_avg:150.59ms step:258/1480 train_time:37345ms step_avg:150.58ms step:259/1480 train_time:37494ms step_avg:150.58ms step:260/1480 train_time:37643ms step_avg:150.57ms step:261/1480 train_time:37793ms step_avg:150.57ms step:262/1480 train_time:37941ms 
step_avg:150.56ms step:263/1480 train_time:38090ms step_avg:150.55ms step:264/1480 train_time:38238ms step_avg:150.54ms step:265/1480 train_time:38387ms step_avg:150.54ms step:266/1480 train_time:38536ms step_avg:150.53ms step:267/1480 train_time:38686ms step_avg:150.53ms step:268/1480 train_time:38836ms step_avg:150.53ms step:269/1480 train_time:38985ms step_avg:150.52ms step:270/1480 train_time:39134ms step_avg:150.52ms step:271/1480 train_time:39282ms step_avg:150.51ms step:272/1480 train_time:39431ms step_avg:150.50ms step:273/1480 train_time:39578ms step_avg:150.49ms step:274/1480 train_time:39727ms step_avg:150.48ms step:275/1480 train_time:39876ms step_avg:150.47ms step:276/1480 train_time:40023ms step_avg:150.46ms step:277/1480 train_time:40173ms step_avg:150.46ms step:278/1480 train_time:40320ms step_avg:150.45ms step:279/1480 train_time:40470ms step_avg:150.44ms step:280/1480 train_time:40618ms step_avg:150.44ms step:281/1480 train_time:40766ms step_avg:150.43ms step:282/1480 train_time:40916ms step_avg:150.43ms step:283/1480 train_time:41063ms step_avg:150.41ms step:284/1480 train_time:41213ms step_avg:150.41ms step:285/1480 train_time:41360ms step_avg:150.40ms step:286/1480 train_time:41509ms step_avg:150.39ms step:287/1480 train_time:41657ms step_avg:150.39ms step:288/1480 train_time:41805ms step_avg:150.38ms step:289/1480 train_time:41953ms step_avg:150.37ms step:290/1480 train_time:42100ms step_avg:150.36ms step:291/1480 train_time:42249ms step_avg:150.35ms step:292/1480 train_time:42397ms step_avg:150.34ms step:293/1480 train_time:42545ms step_avg:150.33ms step:294/1480 train_time:42694ms step_avg:150.33ms step:295/1480 train_time:42843ms step_avg:150.32ms step:296/1480 train_time:42992ms step_avg:150.32ms step:297/1480 train_time:43138ms step_avg:150.31ms step:298/1480 train_time:43288ms step_avg:150.31ms step:299/1480 train_time:43436ms step_avg:150.30ms step:300/1480 train_time:43584ms step_avg:150.29ms step:301/1480 train_time:43734ms 
step_avg:150.29ms step:302/1480 train_time:43882ms step_avg:150.28ms step:303/1480 train_time:44031ms step_avg:150.28ms step:304/1480 train_time:44179ms step_avg:150.27ms step:305/1480 train_time:44328ms step_avg:150.26ms step:306/1480 train_time:44476ms step_avg:150.26ms step:307/1480 train_time:44624ms step_avg:150.25ms step:308/1480 train_time:44772ms step_avg:150.24ms step:309/1480 train_time:44919ms step_avg:150.23ms step:310/1480 train_time:45068ms step_avg:150.23ms step:311/1480 train_time:45217ms step_avg:150.22ms step:312/1480 train_time:45366ms step_avg:150.22ms step:313/1480 train_time:45514ms step_avg:150.21ms step:314/1480 train_time:45663ms step_avg:150.21ms step:315/1480 train_time:45812ms step_avg:150.20ms step:316/1480 train_time:45960ms step_avg:150.20ms step:317/1480 train_time:46108ms step_avg:150.19ms step:318/1480 train_time:46257ms step_avg:150.19ms step:319/1480 train_time:46405ms step_avg:150.18ms step:320/1480 train_time:46555ms step_avg:150.18ms step:321/1480 train_time:46702ms step_avg:150.17ms step:322/1480 train_time:46850ms step_avg:150.16ms step:323/1480 train_time:46999ms step_avg:150.16ms step:324/1480 train_time:47147ms step_avg:150.15ms step:325/1480 train_time:47297ms step_avg:150.15ms step:326/1480 train_time:47445ms step_avg:150.14ms step:327/1480 train_time:47594ms step_avg:150.14ms step:328/1480 train_time:47741ms step_avg:150.13ms step:329/1480 train_time:47890ms step_avg:150.13ms step:330/1480 train_time:48039ms step_avg:150.12ms step:331/1480 train_time:48191ms step_avg:150.13ms step:332/1480 train_time:48341ms step_avg:150.13ms step:333/1480 train_time:48493ms step_avg:150.13ms step:334/1480 train_time:48643ms step_avg:150.13ms step:335/1480 train_time:48795ms step_avg:150.14ms step:336/1480 train_time:48945ms step_avg:150.14ms step:337/1480 train_time:49097ms step_avg:150.14ms step:338/1480 train_time:49247ms step_avg:150.14ms step:339/1480 train_time:49398ms step_avg:150.15ms step:340/1480 train_time:49549ms 
step_avg:150.15ms step:341/1480 train_time:49699ms step_avg:150.15ms step:342/1480 train_time:49850ms step_avg:150.15ms step:343/1480 train_time:50001ms step_avg:150.15ms step:344/1480 train_time:50152ms step_avg:150.16ms step:345/1480 train_time:50302ms step_avg:150.16ms step:346/1480 train_time:50454ms step_avg:150.16ms step:347/1480 train_time:50604ms step_avg:150.16ms step:348/1480 train_time:50755ms step_avg:150.16ms step:349/1480 train_time:50905ms step_avg:150.16ms step:350/1480 train_time:51057ms step_avg:150.17ms step:351/1480 train_time:51208ms step_avg:150.17ms step:352/1480 train_time:51358ms step_avg:150.17ms step:353/1480 train_time:51510ms step_avg:150.18ms step:354/1480 train_time:51660ms step_avg:150.17ms step:355/1480 train_time:51811ms step_avg:150.18ms step:356/1480 train_time:51961ms step_avg:150.18ms step:357/1480 train_time:52113ms step_avg:150.18ms step:358/1480 train_time:52262ms step_avg:150.18ms step:359/1480 train_time:52415ms step_avg:150.18ms step:360/1480 train_time:52565ms step_avg:150.19ms step:361/1480 train_time:52716ms step_avg:150.19ms step:362/1480 train_time:52868ms step_avg:150.19ms step:363/1480 train_time:53018ms step_avg:150.19ms step:364/1480 train_time:53171ms step_avg:150.20ms step:365/1480 train_time:53321ms step_avg:150.20ms step:366/1480 train_time:53472ms step_avg:150.20ms step:367/1480 train_time:53623ms step_avg:150.20ms step:368/1480 train_time:53774ms step_avg:150.21ms step:369/1480 train_time:53923ms step_avg:150.20ms step:370/1480 train_time:54074ms step_avg:150.21ms step:371/1480 train_time:54224ms step_avg:150.20ms step:372/1480 train_time:54375ms step_avg:150.21ms step:373/1480 train_time:54526ms step_avg:150.21ms step:374/1480 train_time:54676ms step_avg:150.21ms step:375/1480 train_time:54827ms step_avg:150.21ms step:375/1480 val_loss:3.8082 train_time:54895ms step_avg:150.40ms step:376/1480 train_time:54987ms step_avg:150.24ms step:377/1480 train_time:55140ms step_avg:150.25ms step:378/1480 
train_time:55292ms step_avg:150.25ms step:379/1480 train_time:55474ms step_avg:150.34ms step:380/1480 train_time:55592ms step_avg:150.25ms step:381/1480 train_time:55742ms step_avg:150.25ms step:382/1480 train_time:55893ms step_avg:150.25ms step:383/1480 train_time:56044ms step_avg:150.25ms step:384/1480 train_time:56196ms step_avg:150.26ms step:385/1480 train_time:56346ms step_avg:150.26ms step:386/1480 train_time:56498ms step_avg:150.26ms step:387/1480 train_time:56647ms step_avg:150.26ms step:388/1480 train_time:56798ms step_avg:150.26ms step:389/1480 train_time:56948ms step_avg:150.26ms step:390/1480 train_time:57099ms step_avg:150.26ms step:391/1480 train_time:57250ms step_avg:150.26ms step:392/1480 train_time:57400ms step_avg:150.26ms step:393/1480 train_time:57552ms step_avg:150.27ms step:394/1480 train_time:57703ms step_avg:150.27ms step:395/1480 train_time:57854ms step_avg:150.27ms step:396/1480 train_time:58004ms step_avg:150.27ms step:397/1480 train_time:58155ms step_avg:150.27ms step:398/1480 train_time:58305ms step_avg:150.27ms step:399/1480 train_time:58456ms step_avg:150.27ms step:400/1480 train_time:58607ms step_avg:150.27ms step:401/1480 train_time:58759ms step_avg:150.28ms step:402/1480 train_time:58910ms step_avg:150.28ms step:403/1480 train_time:59061ms step_avg:150.28ms step:404/1480 train_time:59213ms step_avg:150.29ms step:405/1480 train_time:59363ms step_avg:150.29ms step:406/1480 train_time:59514ms step_avg:150.29ms step:407/1480 train_time:59665ms step_avg:150.29ms step:408/1480 train_time:59816ms step_avg:150.29ms step:409/1480 train_time:59967ms step_avg:150.29ms step:410/1480 train_time:60119ms step_avg:150.30ms step:411/1480 train_time:60269ms step_avg:150.30ms step:412/1480 train_time:60420ms step_avg:150.30ms step:413/1480 train_time:60572ms step_avg:150.30ms step:414/1480 train_time:60722ms step_avg:150.30ms step:415/1480 train_time:60874ms step_avg:150.31ms step:416/1480 train_time:61024ms step_avg:150.31ms step:417/1480 
train_time:61175ms step_avg:150.31ms step:418/1480 train_time:61326ms step_avg:150.31ms step:419/1480 train_time:61477ms step_avg:150.31ms step:420/1480 train_time:61627ms step_avg:150.31ms step:421/1480 train_time:61778ms step_avg:150.31ms step:422/1480 train_time:61928ms step_avg:150.31ms step:423/1480 train_time:62079ms step_avg:150.31ms step:424/1480 train_time:62230ms step_avg:150.31ms step:425/1480 train_time:62381ms step_avg:150.32ms step:426/1480 train_time:62532ms step_avg:150.32ms step:427/1480 train_time:62682ms step_avg:150.32ms step:428/1480 train_time:62834ms step_avg:150.32ms step:429/1480 train_time:62984ms step_avg:150.32ms step:430/1480 train_time:63135ms step_avg:150.32ms step:431/1480 train_time:63286ms step_avg:150.32ms step:432/1480 train_time:63437ms step_avg:150.33ms step:433/1480 train_time:63589ms step_avg:150.33ms step:434/1480 train_time:63740ms step_avg:150.33ms step:435/1480 train_time:63890ms step_avg:150.33ms step:436/1480 train_time:64040ms step_avg:150.33ms step:437/1480 train_time:64192ms step_avg:150.33ms step:438/1480 train_time:64342ms step_avg:150.33ms step:439/1480 train_time:64494ms step_avg:150.33ms step:440/1480 train_time:64644ms step_avg:150.33ms step:441/1480 train_time:64797ms step_avg:150.34ms step:442/1480 train_time:64950ms step_avg:150.35ms step:443/1480 train_time:65103ms step_avg:150.35ms step:444/1480 train_time:65256ms step_avg:150.36ms step:445/1480 train_time:65408ms step_avg:150.36ms step:446/1480 train_time:65561ms step_avg:150.37ms step:447/1480 train_time:65715ms step_avg:150.38ms step:448/1480 train_time:65867ms step_avg:150.38ms step:449/1480 train_time:66020ms step_avg:150.39ms step:450/1480 train_time:66173ms step_avg:150.39ms step:451/1480 train_time:66325ms step_avg:150.40ms step:452/1480 train_time:66479ms step_avg:150.41ms step:453/1480 train_time:66632ms step_avg:150.41ms step:454/1480 train_time:66784ms step_avg:150.41ms step:455/1480 train_time:66937ms step_avg:150.42ms step:456/1480 
train_time:67090ms step_avg:150.43ms step:457/1480 train_time:67243ms step_avg:150.43ms step:458/1480 train_time:67397ms step_avg:150.44ms step:459/1480 train_time:67549ms step_avg:150.44ms step:460/1480 train_time:67701ms step_avg:150.45ms step:461/1480 train_time:67854ms step_avg:150.45ms step:462/1480 train_time:68006ms step_avg:150.46ms step:463/1480 train_time:68160ms step_avg:150.46ms step:464/1480 train_time:68313ms step_avg:150.47ms step:465/1480 train_time:68465ms step_avg:150.47ms step:466/1480 train_time:68617ms step_avg:150.48ms step:467/1480 train_time:68772ms step_avg:150.49ms step:468/1480 train_time:68924ms step_avg:150.49ms step:469/1480 train_time:69078ms step_avg:150.50ms step:470/1480 train_time:69230ms step_avg:150.50ms step:471/1480 train_time:69383ms step_avg:150.50ms step:472/1480 train_time:69536ms step_avg:150.51ms step:473/1480 train_time:69688ms step_avg:150.51ms step:474/1480 train_time:69841ms step_avg:150.52ms step:475/1480 train_time:69994ms step_avg:150.53ms step:476/1480 train_time:70147ms step_avg:150.53ms step:477/1480 train_time:70301ms step_avg:150.54ms step:478/1480 train_time:70453ms step_avg:150.54ms step:479/1480 train_time:70604ms step_avg:150.54ms step:480/1480 train_time:70758ms step_avg:150.55ms step:481/1480 train_time:70910ms step_avg:150.55ms step:482/1480 train_time:71062ms step_avg:150.56ms step:483/1480 train_time:71216ms step_avg:150.56ms step:484/1480 train_time:71370ms step_avg:150.57ms step:485/1480 train_time:71523ms step_avg:150.57ms step:486/1480 train_time:71676ms step_avg:150.58ms step:487/1480 train_time:71829ms step_avg:150.58ms step:488/1480 train_time:71981ms step_avg:150.59ms step:489/1480 train_time:72133ms step_avg:150.59ms step:490/1480 train_time:72286ms step_avg:150.60ms step:491/1480 train_time:72439ms step_avg:150.60ms step:492/1480 train_time:72592ms step_avg:150.61ms step:493/1480 train_time:72745ms step_avg:150.61ms step:494/1480 train_time:72899ms step_avg:150.62ms step:495/1480 
train_time:73052ms step_avg:150.62ms step:496/1480 train_time:73204ms step_avg:150.63ms step:497/1480 train_time:73357ms step_avg:150.63ms step:498/1480 train_time:73508ms step_avg:150.63ms step:499/1480 train_time:73661ms step_avg:150.64ms step:500/1480 train_time:73814ms step_avg:150.64ms step:500/1480 val_loss:3.6913 train_time:73882ms step_avg:150.78ms step:501/1480 train_time:73978ms step_avg:150.67ms step:502/1480 train_time:74124ms step_avg:150.66ms step:503/1480 train_time:74277ms step_avg:150.66ms step:504/1480 train_time:74429ms step_avg:150.67ms step:505/1480 train_time:74580ms step_avg:150.67ms step:506/1480 train_time:74733ms step_avg:150.67ms step:507/1480 train_time:74884ms step_avg:150.67ms step:508/1480 train_time:75039ms step_avg:150.68ms step:509/1480 train_time:75193ms step_avg:150.69ms step:510/1480 train_time:75346ms step_avg:150.69ms step:511/1480 train_time:75499ms step_avg:150.70ms step:512/1480 train_time:75652ms step_avg:150.70ms step:513/1480 train_time:75804ms step_avg:150.70ms step:514/1480 train_time:75957ms step_avg:150.71ms step:515/1480 train_time:76112ms step_avg:150.72ms step:516/1480 train_time:76266ms step_avg:150.72ms step:517/1480 train_time:76420ms step_avg:150.73ms step:518/1480 train_time:76574ms step_avg:150.74ms step:519/1480 train_time:76726ms step_avg:150.74ms step:520/1480 train_time:76878ms step_avg:150.74ms step:521/1480 train_time:77032ms step_avg:150.75ms step:522/1480 train_time:77186ms step_avg:150.75ms step:523/1480 train_time:77340ms step_avg:150.76ms step:524/1480 train_time:77493ms step_avg:150.77ms step:525/1480 train_time:77645ms step_avg:150.77ms step:526/1480 train_time:77797ms step_avg:150.77ms step:527/1480 train_time:77950ms step_avg:150.77ms step:528/1480 train_time:78102ms step_avg:150.78ms step:529/1480 train_time:78254ms step_avg:150.78ms step:530/1480 train_time:78409ms step_avg:150.79ms step:531/1480 train_time:78562ms step_avg:150.79ms step:532/1480 train_time:78714ms step_avg:150.79ms 
step:533/1480 train_time:78867ms step_avg:150.80ms step:534/1480 train_time:79021ms step_avg:150.80ms step:535/1480 train_time:79173ms step_avg:150.81ms step:536/1480 train_time:79326ms step_avg:150.81ms step:537/1480 train_time:79479ms step_avg:150.81ms step:538/1480 train_time:79632ms step_avg:150.82ms step:539/1480 train_time:79786ms step_avg:150.83ms step:540/1480 train_time:79940ms step_avg:150.83ms step:541/1480 train_time:80094ms step_avg:150.84ms step:542/1480 train_time:80246ms step_avg:150.84ms step:543/1480 train_time:80399ms step_avg:150.84ms step:544/1480 train_time:80551ms step_avg:150.84ms step:545/1480 train_time:80702ms step_avg:150.85ms step:546/1480 train_time:80855ms step_avg:150.85ms step:547/1480 train_time:81009ms step_avg:150.85ms step:548/1480 train_time:81162ms step_avg:150.86ms step:549/1480 train_time:81314ms step_avg:150.86ms step:550/1480 train_time:81468ms step_avg:150.87ms step:551/1480 train_time:81623ms step_avg:150.87ms step:552/1480 train_time:81777ms step_avg:150.88ms step:553/1480 train_time:81932ms step_avg:150.89ms step:554/1480 train_time:82087ms step_avg:150.89ms step:555/1480 train_time:82242ms step_avg:150.90ms step:556/1480 train_time:82396ms step_avg:150.91ms step:557/1480 train_time:82551ms step_avg:150.92ms step:558/1480 train_time:82705ms step_avg:150.92ms step:559/1480 train_time:82861ms step_avg:150.93ms step:560/1480 train_time:83015ms step_avg:150.94ms step:561/1480 train_time:83170ms step_avg:150.94ms step:562/1480 train_time:83325ms step_avg:150.95ms step:563/1480 train_time:83479ms step_avg:150.96ms step:564/1480 train_time:83634ms step_avg:150.96ms step:565/1480 train_time:83789ms step_avg:150.97ms step:566/1480 train_time:83945ms step_avg:150.98ms step:567/1480 train_time:84100ms step_avg:150.99ms step:568/1480 train_time:84254ms step_avg:150.99ms step:569/1480 train_time:84436ms step_avg:151.05ms step:570/1480 train_time:84563ms step_avg:151.00ms step:571/1480 train_time:84717ms step_avg:151.01ms 
step:572/1480 train_time:84872ms step_avg:151.02ms step:573/1480 train_time:85026ms step_avg:151.02ms step:574/1480 train_time:85183ms step_avg:151.03ms step:575/1480 train_time:85337ms step_avg:151.04ms step:576/1480 train_time:85491ms step_avg:151.04ms step:577/1480 train_time:85646ms step_avg:151.05ms step:578/1480 train_time:85801ms step_avg:151.06ms step:579/1480 train_time:85955ms step_avg:151.06ms step:580/1480 train_time:86108ms step_avg:151.07ms step:581/1480 train_time:86263ms step_avg:151.07ms step:582/1480 train_time:86417ms step_avg:151.08ms step:583/1480 train_time:86572ms step_avg:151.09ms step:584/1480 train_time:86725ms step_avg:151.09ms step:585/1480 train_time:86880ms step_avg:151.10ms step:586/1480 train_time:87036ms step_avg:151.10ms step:587/1480 train_time:87191ms step_avg:151.11ms step:588/1480 train_time:87345ms step_avg:151.12ms step:589/1480 train_time:87499ms step_avg:151.12ms step:590/1480 train_time:87654ms step_avg:151.13ms step:591/1480 train_time:87807ms step_avg:151.13ms step:592/1480 train_time:87962ms step_avg:151.14ms step:593/1480 train_time:88118ms step_avg:151.15ms step:594/1480 train_time:88273ms step_avg:151.15ms step:595/1480 train_time:88428ms step_avg:151.16ms step:596/1480 train_time:88585ms step_avg:151.17ms step:597/1480 train_time:88740ms step_avg:151.18ms step:598/1480 train_time:88894ms step_avg:151.18ms step:599/1480 train_time:89049ms step_avg:151.19ms step:600/1480 train_time:89203ms step_avg:151.19ms step:601/1480 train_time:89358ms step_avg:151.20ms step:602/1480 train_time:89512ms step_avg:151.20ms step:603/1480 train_time:89667ms step_avg:151.21ms step:604/1480 train_time:89822ms step_avg:151.22ms step:605/1480 train_time:89977ms step_avg:151.22ms step:606/1480 train_time:90132ms step_avg:151.23ms step:607/1480 train_time:90288ms step_avg:151.24ms step:608/1480 train_time:90443ms step_avg:151.24ms step:609/1480 train_time:90598ms step_avg:151.25ms step:610/1480 train_time:90752ms step_avg:151.25ms 
step:611/1480 train_time:90906ms step_avg:151.26ms step:612/1480 train_time:91061ms step_avg:151.26ms step:613/1480 train_time:91216ms step_avg:151.27ms step:614/1480 train_time:91372ms step_avg:151.28ms step:615/1480 train_time:91526ms step_avg:151.28ms step:616/1480 train_time:91680ms step_avg:151.29ms step:617/1480 train_time:91834ms step_avg:151.29ms step:618/1480 train_time:91988ms step_avg:151.30ms step:619/1480 train_time:92142ms step_avg:151.30ms step:620/1480 train_time:92297ms step_avg:151.31ms step:621/1480 train_time:92452ms step_avg:151.31ms step:622/1480 train_time:92606ms step_avg:151.32ms step:623/1480 train_time:92761ms step_avg:151.32ms step:624/1480 train_time:92916ms step_avg:151.33ms step:625/1480 train_time:93071ms step_avg:151.34ms step:625/1480 val_loss:3.6081 train_time:93140ms step_avg:151.45ms step:626/1480 train_time:93232ms step_avg:151.35ms step:627/1480 train_time:93384ms step_avg:151.35ms step:628/1480 train_time:93539ms step_avg:151.36ms step:629/1480 train_time:93693ms step_avg:151.36ms step:630/1480 train_time:93848ms step_avg:151.37ms step:631/1480 train_time:94002ms step_avg:151.37ms step:632/1480 train_time:94156ms step_avg:151.38ms step:633/1480 train_time:94311ms step_avg:151.38ms step:634/1480 train_time:94466ms step_avg:151.39ms step:635/1480 train_time:94620ms step_avg:151.39ms step:636/1480 train_time:94774ms step_avg:151.40ms step:637/1480 train_time:94929ms step_avg:151.40ms step:638/1480 train_time:95083ms step_avg:151.41ms step:639/1480 train_time:95237ms step_avg:151.41ms step:640/1480 train_time:95392ms step_avg:151.42ms step:641/1480 train_time:95547ms step_avg:151.42ms step:642/1480 train_time:95700ms step_avg:151.42ms step:643/1480 train_time:95854ms step_avg:151.43ms step:644/1480 train_time:96008ms step_avg:151.43ms step:645/1480 train_time:96164ms step_avg:151.44ms step:646/1480 train_time:96319ms step_avg:151.45ms step:647/1480 train_time:96474ms step_avg:151.45ms step:648/1480 train_time:96629ms 
step_avg:151.46ms step:649/1480 train_time:96784ms step_avg:151.46ms step:650/1480 train_time:96938ms step_avg:151.47ms step:651/1480 train_time:97094ms step_avg:151.47ms step:652/1480 train_time:97249ms step_avg:151.48ms step:653/1480 train_time:97402ms step_avg:151.48ms step:654/1480 train_time:97557ms step_avg:151.49ms step:655/1480 train_time:97713ms step_avg:151.49ms step:656/1480 train_time:97867ms step_avg:151.50ms step:657/1480 train_time:98020ms step_avg:151.50ms step:658/1480 train_time:98174ms step_avg:151.50ms step:659/1480 train_time:98330ms step_avg:151.51ms step:660/1480 train_time:98487ms step_avg:151.52ms step:661/1480 train_time:98644ms step_avg:151.53ms step:662/1480 train_time:98800ms step_avg:151.53ms step:663/1480 train_time:98956ms step_avg:151.54ms step:664/1480 train_time:99113ms step_avg:151.55ms step:665/1480 train_time:99269ms step_avg:151.56ms step:666/1480 train_time:99425ms step_avg:151.56ms step:667/1480 train_time:99581ms step_avg:151.57ms step:668/1480 train_time:99738ms step_avg:151.58ms step:669/1480 train_time:99896ms step_avg:151.59ms step:670/1480 train_time:100052ms step_avg:151.59ms step:671/1480 train_time:100207ms step_avg:151.60ms step:672/1480 train_time:100365ms step_avg:151.61ms step:673/1480 train_time:100521ms step_avg:151.62ms step:674/1480 train_time:100677ms step_avg:151.62ms step:675/1480 train_time:100834ms step_avg:151.63ms step:676/1480 train_time:100991ms step_avg:151.64ms step:677/1480 train_time:101148ms step_avg:151.65ms step:678/1480 train_time:101302ms step_avg:151.65ms step:679/1480 train_time:101459ms step_avg:151.66ms step:680/1480 train_time:101616ms step_avg:151.67ms step:681/1480 train_time:101772ms step_avg:151.67ms step:682/1480 train_time:101929ms step_avg:151.68ms step:683/1480 train_time:102087ms step_avg:151.69ms step:684/1480 train_time:102244ms step_avg:151.70ms step:685/1480 train_time:102400ms step_avg:151.70ms step:686/1480 train_time:102556ms step_avg:151.71ms step:687/1480 
train_time:102712ms step_avg:151.72ms step:688/1480 train_time:102869ms step_avg:151.72ms step:689/1480 train_time:103027ms step_avg:151.73ms step:690/1480 train_time:103184ms step_avg:151.74ms step:691/1480 train_time:103341ms step_avg:151.75ms step:692/1480 train_time:103497ms step_avg:151.76ms step:693/1480 train_time:103654ms step_avg:151.76ms step:694/1480 train_time:103810ms step_avg:151.77ms step:695/1480 train_time:103965ms step_avg:151.77ms step:696/1480 train_time:104121ms step_avg:151.78ms step:697/1480 train_time:104277ms step_avg:151.79ms step:698/1480 train_time:104432ms step_avg:151.79ms step:699/1480 train_time:104588ms step_avg:151.80ms step:700/1480 train_time:104745ms step_avg:151.80ms step:701/1480 train_time:104900ms step_avg:151.81ms step:702/1480 train_time:105057ms step_avg:151.82ms step:703/1480 train_time:105213ms step_avg:151.82ms step:704/1480 train_time:105369ms step_avg:151.83ms step:705/1480 train_time:105525ms step_avg:151.84ms step:706/1480 train_time:105683ms step_avg:151.84ms step:707/1480 train_time:105839ms step_avg:151.85ms step:708/1480 train_time:105996ms step_avg:151.86ms step:709/1480 train_time:106151ms step_avg:151.86ms step:710/1480 train_time:106306ms step_avg:151.87ms step:711/1480 train_time:106462ms step_avg:151.87ms step:712/1480 train_time:106621ms step_avg:151.88ms step:713/1480 train_time:106779ms step_avg:151.89ms step:714/1480 train_time:106937ms step_avg:151.90ms step:715/1480 train_time:107093ms step_avg:151.90ms step:716/1480 train_time:107248ms step_avg:151.91ms step:717/1480 train_time:107403ms step_avg:151.91ms step:718/1480 train_time:107560ms step_avg:151.92ms step:719/1480 train_time:107716ms step_avg:151.93ms step:720/1480 train_time:107873ms step_avg:151.93ms step:721/1480 train_time:108029ms step_avg:151.94ms step:722/1480 train_time:108185ms step_avg:151.95ms step:723/1480 train_time:108341ms step_avg:151.95ms step:724/1480 train_time:108499ms step_avg:151.96ms step:725/1480 train_time:108657ms 
step_avg:151.97ms step:726/1480 train_time:108812ms step_avg:151.97ms step:727/1480 train_time:108970ms step_avg:151.98ms step:728/1480 train_time:109127ms step_avg:151.99ms step:729/1480 train_time:109282ms step_avg:151.99ms step:730/1480 train_time:109440ms step_avg:152.00ms step:731/1480 train_time:109597ms step_avg:152.01ms step:732/1480 train_time:109753ms step_avg:152.01ms step:733/1480 train_time:109909ms step_avg:152.02ms step:734/1480 train_time:110067ms step_avg:152.03ms step:735/1480 train_time:110223ms step_avg:152.03ms step:736/1480 train_time:110378ms step_avg:152.04ms step:737/1480 train_time:110533ms step_avg:152.04ms step:738/1480 train_time:110688ms step_avg:152.04ms step:739/1480 train_time:110844ms step_avg:152.05ms step:740/1480 train_time:111003ms step_avg:152.06ms step:741/1480 train_time:111162ms step_avg:152.07ms step:742/1480 train_time:111318ms step_avg:152.07ms step:743/1480 train_time:111473ms step_avg:152.08ms step:744/1480 train_time:111630ms step_avg:152.08ms step:745/1480 train_time:111788ms step_avg:152.09ms step:746/1480 train_time:111943ms step_avg:152.10ms step:747/1480 train_time:112099ms step_avg:152.10ms step:748/1480 train_time:112258ms step_avg:152.11ms step:749/1480 train_time:112415ms step_avg:152.12ms step:750/1480 train_time:112570ms step_avg:152.12ms step:750/1480 val_loss:3.5522 train_time:112641ms step_avg:152.22ms step:751/1480 train_time:112732ms step_avg:152.13ms step:752/1480 train_time:112888ms step_avg:152.14ms step:753/1480 train_time:113045ms step_avg:152.15ms step:754/1480 train_time:113202ms step_avg:152.15ms step:755/1480 train_time:113358ms step_avg:152.16ms step:756/1480 train_time:113513ms step_avg:152.16ms step:757/1480 train_time:113671ms step_avg:152.17ms step:758/1480 train_time:113827ms step_avg:152.18ms step:759/1480 train_time:114011ms step_avg:152.22ms step:760/1480 train_time:114143ms step_avg:152.19ms step:761/1480 train_time:114298ms step_avg:152.19ms step:762/1480 train_time:114455ms 
step_avg:152.20ms step:763/1480 train_time:114611ms step_avg:152.21ms step:764/1480 train_time:114768ms step_avg:152.21ms step:765/1480 train_time:114925ms step_avg:152.22ms step:766/1480 train_time:115082ms step_avg:152.22ms step:767/1480 train_time:115239ms step_avg:152.23ms step:768/1480 train_time:115395ms step_avg:152.24ms step:769/1480 train_time:115552ms step_avg:152.24ms step:770/1480 train_time:115708ms step_avg:152.25ms step:771/1480 train_time:115866ms step_avg:152.26ms step:772/1480 train_time:116023ms step_avg:152.26ms step:773/1480 train_time:116179ms step_avg:152.27ms step:774/1480 train_time:116337ms step_avg:152.27ms step:775/1480 train_time:116495ms step_avg:152.28ms step:776/1480 train_time:116653ms step_avg:152.29ms step:777/1480 train_time:116813ms step_avg:152.30ms step:778/1480 train_time:116972ms step_avg:152.31ms step:779/1480 train_time:117129ms step_avg:152.31ms step:780/1480 train_time:117287ms step_avg:152.32ms step:781/1480 train_time:117445ms step_avg:152.33ms step:782/1480 train_time:117602ms step_avg:152.33ms step:783/1480 train_time:117758ms step_avg:152.34ms step:784/1480 train_time:117915ms step_avg:152.35ms step:785/1480 train_time:118074ms step_avg:152.35ms step:786/1480 train_time:118231ms step_avg:152.36ms step:787/1480 train_time:118388ms step_avg:152.37ms step:788/1480 train_time:118547ms step_avg:152.37ms step:789/1480 train_time:118704ms step_avg:152.38ms step:790/1480 train_time:118861ms step_avg:152.39ms step:791/1480 train_time:119021ms step_avg:152.40ms step:792/1480 train_time:119178ms step_avg:152.40ms step:793/1480 train_time:119336ms step_avg:152.41ms step:794/1480 train_time:119495ms step_avg:152.42ms step:795/1480 train_time:119657ms step_avg:152.43ms step:796/1480 train_time:119817ms step_avg:152.44ms step:797/1480 train_time:119977ms step_avg:152.45ms step:798/1480 train_time:120137ms step_avg:152.46ms step:799/1480 train_time:120299ms step_avg:152.47ms step:800/1480 train_time:120457ms step_avg:152.48ms 
step:801/1480 train_time:120614ms step_avg:152.48ms step:802/1480 train_time:120773ms step_avg:152.49ms step:803/1480 train_time:120930ms step_avg:152.50ms step:804/1480 train_time:121087ms step_avg:152.50ms step:805/1480 train_time:121246ms step_avg:152.51ms step:806/1480 train_time:121403ms step_avg:152.52ms step:807/1480 train_time:121558ms step_avg:152.52ms step:808/1480 train_time:121717ms step_avg:152.53ms step:809/1480 train_time:121875ms step_avg:152.53ms step:810/1480 train_time:122033ms step_avg:152.54ms step:811/1480 train_time:122190ms step_avg:152.55ms step:812/1480 train_time:122347ms step_avg:152.55ms step:813/1480 train_time:122504ms step_avg:152.56ms step:814/1480 train_time:122661ms step_avg:152.56ms step:815/1480 train_time:122817ms step_avg:152.57ms step:816/1480 train_time:122976ms step_avg:152.58ms step:817/1480 train_time:123133ms step_avg:152.58ms step:818/1480 train_time:123291ms step_avg:152.59ms step:819/1480 train_time:123449ms step_avg:152.59ms step:820/1480 train_time:123608ms step_avg:152.60ms step:821/1480 train_time:123765ms step_avg:152.61ms step:822/1480 train_time:123923ms step_avg:152.62ms step:823/1480 train_time:124081ms step_avg:152.62ms step:824/1480 train_time:124238ms step_avg:152.63ms step:825/1480 train_time:124398ms step_avg:152.64ms step:826/1480 train_time:124558ms step_avg:152.64ms step:827/1480 train_time:124717ms step_avg:152.65ms step:828/1480 train_time:124877ms step_avg:152.66ms step:829/1480 train_time:125037ms step_avg:152.67ms step:830/1480 train_time:125198ms step_avg:152.68ms step:831/1480 train_time:125355ms step_avg:152.69ms step:832/1480 train_time:125514ms step_avg:152.69ms step:833/1480 train_time:125671ms step_avg:152.70ms step:834/1480 train_time:125830ms step_avg:152.71ms step:835/1480 train_time:125988ms step_avg:152.71ms step:836/1480 train_time:126147ms step_avg:152.72ms step:837/1480 train_time:126305ms step_avg:152.73ms step:838/1480 train_time:126463ms step_avg:152.73ms step:839/1480 
train_time:126621ms step_avg:152.74ms step:840/1480 train_time:126777ms step_avg:152.74ms step:841/1480 train_time:126934ms step_avg:152.75ms step:842/1480 train_time:127092ms step_avg:152.75ms step:843/1480 train_time:127249ms step_avg:152.76ms step:844/1480 train_time:127406ms step_avg:152.76ms step:845/1480 train_time:127563ms step_avg:152.77ms step:846/1480 train_time:127722ms step_avg:152.78ms step:847/1480 train_time:127879ms step_avg:152.78ms step:848/1480 train_time:128037ms step_avg:152.79ms step:849/1480 train_time:128195ms step_avg:152.80ms step:850/1480 train_time:128353ms step_avg:152.80ms step:851/1480 train_time:128514ms step_avg:152.81ms step:852/1480 train_time:128673ms step_avg:152.82ms step:853/1480 train_time:128831ms step_avg:152.82ms step:854/1480 train_time:128988ms step_avg:152.83ms step:855/1480 train_time:129145ms step_avg:152.83ms step:856/1480 train_time:129301ms step_avg:152.84ms step:857/1480 train_time:129458ms step_avg:152.84ms step:858/1480 train_time:129618ms step_avg:152.85ms step:859/1480 train_time:129779ms step_avg:152.86ms step:860/1480 train_time:129937ms step_avg:152.87ms step:861/1480 train_time:130098ms step_avg:152.88ms step:862/1480 train_time:130258ms step_avg:152.88ms step:863/1480 train_time:130418ms step_avg:152.89ms step:864/1480 train_time:130576ms step_avg:152.90ms step:865/1480 train_time:130733ms step_avg:152.90ms step:866/1480 train_time:130893ms step_avg:152.91ms step:867/1480 train_time:131052ms step_avg:152.92ms step:868/1480 train_time:131208ms step_avg:152.92ms step:869/1480 train_time:131364ms step_avg:152.93ms step:870/1480 train_time:131522ms step_avg:152.93ms step:871/1480 train_time:131678ms step_avg:152.94ms step:872/1480 train_time:131837ms step_avg:152.94ms step:873/1480 train_time:131994ms step_avg:152.95ms step:874/1480 train_time:132155ms step_avg:152.96ms step:875/1480 train_time:132315ms step_avg:152.97ms step:875/1480 val_loss:3.5077 train_time:132388ms step_avg:153.05ms step:876/1480 
train_time:132481ms step_avg:152.98ms step:877/1480 train_time:132636ms step_avg:152.98ms step:878/1480 train_time:132794ms step_avg:152.99ms step:879/1480 train_time:132954ms step_avg:153.00ms step:880/1480 train_time:133112ms step_avg:153.00ms step:881/1480 train_time:133269ms step_avg:153.01ms step:882/1480 train_time:133428ms step_avg:153.01ms step:883/1480 train_time:133589ms step_avg:153.02ms step:884/1480 train_time:133749ms step_avg:153.03ms step:885/1480 train_time:133909ms step_avg:153.04ms step:886/1480 train_time:134069ms step_avg:153.05ms step:887/1480 train_time:134230ms step_avg:153.06ms step:888/1480 train_time:134393ms step_avg:153.07ms step:889/1480 train_time:134555ms step_avg:153.08ms step:890/1480 train_time:134712ms step_avg:153.08ms step:891/1480 train_time:134870ms step_avg:153.09ms step:892/1480 train_time:135031ms step_avg:153.10ms step:893/1480 train_time:135189ms step_avg:153.10ms step:894/1480 train_time:135349ms step_avg:153.11ms step:895/1480 train_time:135511ms step_avg:153.12ms step:896/1480 train_time:135671ms step_avg:153.13ms step:897/1480 train_time:135833ms step_avg:153.14ms step:898/1480 train_time:135994ms step_avg:153.15ms step:899/1480 train_time:136154ms step_avg:153.15ms step:900/1480 train_time:136313ms step_avg:153.16ms step:901/1480 train_time:136474ms step_avg:153.17ms step:902/1480 train_time:136633ms step_avg:153.18ms step:903/1480 train_time:136795ms step_avg:153.19ms step:904/1480 train_time:136956ms step_avg:153.19ms step:905/1480 train_time:137114ms step_avg:153.20ms step:906/1480 train_time:137275ms step_avg:153.21ms step:907/1480 train_time:137438ms step_avg:153.22ms step:908/1480 train_time:137595ms step_avg:153.22ms step:909/1480 train_time:137756ms step_avg:153.23ms step:910/1480 train_time:137920ms step_avg:153.24ms step:911/1480 train_time:138078ms step_avg:153.25ms step:912/1480 train_time:138237ms step_avg:153.26ms step:913/1480 train_time:138400ms step_avg:153.27ms step:914/1480 train_time:138560ms 
step_avg:153.27ms step:915/1480 train_time:138722ms step_avg:153.28ms step:916/1480 train_time:138881ms step_avg:153.29ms step:917/1480 train_time:139039ms step_avg:153.30ms step:918/1480 train_time:139200ms step_avg:153.30ms step:919/1480 train_time:139363ms step_avg:153.31ms step:920/1480 train_time:139522ms step_avg:153.32ms step:921/1480 train_time:139681ms step_avg:153.33ms step:922/1480 train_time:139841ms step_avg:153.33ms step:923/1480 train_time:139999ms step_avg:153.34ms step:924/1480 train_time:140158ms step_avg:153.35ms step:925/1480 train_time:140319ms step_avg:153.35ms step:926/1480 train_time:140477ms step_avg:153.36ms step:927/1480 train_time:140636ms step_avg:153.36ms step:928/1480 train_time:140795ms step_avg:153.37ms step:929/1480 train_time:140957ms step_avg:153.38ms step:930/1480 train_time:141117ms step_avg:153.39ms step:931/1480 train_time:141277ms step_avg:153.39ms step:932/1480 train_time:141437ms step_avg:153.40ms step:933/1480 train_time:141596ms step_avg:153.41ms step:934/1480 train_time:141756ms step_avg:153.42ms step:935/1480 train_time:141916ms step_avg:153.42ms step:936/1480 train_time:142076ms step_avg:153.43ms step:937/1480 train_time:142237ms step_avg:153.44ms step:938/1480 train_time:142395ms step_avg:153.44ms step:939/1480 train_time:142558ms step_avg:153.45ms step:940/1480 train_time:142720ms step_avg:153.46ms step:941/1480 train_time:142878ms step_avg:153.47ms step:942/1480 train_time:143037ms step_avg:153.47ms step:943/1480 train_time:143198ms step_avg:153.48ms step:944/1480 train_time:143361ms step_avg:153.49ms step:945/1480 train_time:143519ms step_avg:153.50ms step:946/1480 train_time:143682ms step_avg:153.51ms step:947/1480 train_time:143842ms step_avg:153.51ms step:948/1480 train_time:144002ms step_avg:153.52ms step:949/1480 train_time:144174ms step_avg:153.54ms step:950/1480 train_time:144320ms step_avg:153.53ms step:951/1480 train_time:144481ms step_avg:153.54ms step:952/1480 train_time:144639ms step_avg:153.54ms 
step:953/1480 train_time:144799ms step_avg:153.55ms step:954/1480 train_time:144962ms step_avg:153.56ms step:955/1480 train_time:145120ms step_avg:153.57ms step:956/1480 train_time:145278ms step_avg:153.57ms step:957/1480 train_time:145437ms step_avg:153.58ms step:958/1480 train_time:145602ms step_avg:153.59ms step:959/1480 train_time:145760ms step_avg:153.59ms step:960/1480 train_time:145921ms step_avg:153.60ms step:961/1480 train_time:146081ms step_avg:153.61ms step:962/1480 train_time:146239ms step_avg:153.61ms step:963/1480 train_time:146400ms step_avg:153.62ms step:964/1480 train_time:146562ms step_avg:153.63ms step:965/1480 train_time:146721ms step_avg:153.63ms step:966/1480 train_time:146880ms step_avg:153.64ms step:967/1480 train_time:147037ms step_avg:153.64ms step:968/1480 train_time:147198ms step_avg:153.65ms step:969/1480 train_time:147358ms step_avg:153.66ms step:970/1480 train_time:147517ms step_avg:153.66ms step:971/1480 train_time:147676ms step_avg:153.67ms step:972/1480 train_time:147835ms step_avg:153.68ms step:973/1480 train_time:147994ms step_avg:153.68ms step:974/1480 train_time:148155ms step_avg:153.69ms step:975/1480 train_time:148316ms step_avg:153.70ms step:976/1480 train_time:148477ms step_avg:153.70ms step:977/1480 train_time:148636ms step_avg:153.71ms step:978/1480 train_time:148796ms step_avg:153.72ms step:979/1480 train_time:148958ms step_avg:153.72ms step:980/1480 train_time:149118ms step_avg:153.73ms step:981/1480 train_time:149281ms step_avg:153.74ms step:982/1480 train_time:149438ms step_avg:153.74ms step:983/1480 train_time:149599ms step_avg:153.75ms step:984/1480 train_time:149758ms step_avg:153.76ms step:985/1480 train_time:149919ms step_avg:153.76ms step:986/1480 train_time:150080ms step_avg:153.77ms step:987/1480 train_time:150238ms step_avg:153.78ms step:988/1480 train_time:150398ms step_avg:153.78ms step:989/1480 train_time:150557ms step_avg:153.79ms step:990/1480 train_time:150720ms step_avg:153.80ms step:991/1480 
train_time:150881ms step_avg:153.80ms step:992/1480 train_time:151044ms step_avg:153.81ms step:993/1480 train_time:151209ms step_avg:153.82ms step:994/1480 train_time:151367ms step_avg:153.83ms step:995/1480 train_time:151525ms step_avg:153.83ms step:996/1480 train_time:151682ms step_avg:153.84ms step:997/1480 train_time:151841ms step_avg:153.84ms step:998/1480 train_time:152000ms step_avg:153.85ms step:999/1480 train_time:152159ms step_avg:153.85ms step:1000/1480 train_time:152321ms step_avg:153.86ms step:1000/1480 val_loss:3.4443 train_time:152394ms step_avg:153.93ms step:1001/1480 train_time:152484ms step_avg:153.87ms step:1002/1480 train_time:152643ms step_avg:153.87ms step:1003/1480 train_time:152806ms step_avg:153.88ms step:1004/1480 train_time:152966ms step_avg:153.89ms step:1005/1480 train_time:153126ms step_avg:153.90ms step:1006/1480 train_time:153285ms step_avg:153.90ms step:1007/1480 train_time:153446ms step_avg:153.91ms step:1008/1480 train_time:153606ms step_avg:153.91ms step:1009/1480 train_time:153769ms step_avg:153.92ms step:1010/1480 train_time:153927ms step_avg:153.93ms step:1011/1480 train_time:154086ms step_avg:153.93ms step:1012/1480 train_time:154244ms step_avg:153.94ms step:1013/1480 train_time:154405ms step_avg:153.94ms step:1014/1480 train_time:154565ms step_avg:153.95ms step:1015/1480 train_time:154729ms step_avg:153.96ms step:1016/1480 train_time:154888ms step_avg:153.96ms step:1017/1480 train_time:155049ms step_avg:153.97ms step:1018/1480 train_time:155210ms step_avg:153.98ms step:1019/1480 train_time:155369ms step_avg:153.98ms step:1020/1480 train_time:155531ms step_avg:153.99ms step:1021/1480 train_time:155689ms step_avg:154.00ms step:1022/1480 train_time:155850ms step_avg:154.00ms step:1023/1480 train_time:156009ms step_avg:154.01ms step:1024/1480 train_time:156167ms step_avg:154.01ms step:1025/1480 train_time:156328ms step_avg:154.02ms step:1026/1480 train_time:156486ms step_avg:154.02ms step:1027/1480 train_time:156646ms 
step_avg:154.03ms step:1028/1480 train_time:156808ms step_avg:154.04ms step:1029/1480 train_time:156972ms step_avg:154.05ms step:1030/1480 train_time:157131ms step_avg:154.05ms step:1031/1480 train_time:157289ms step_avg:154.05ms step:1032/1480 train_time:157453ms step_avg:154.06ms step:1033/1480 train_time:157612ms step_avg:154.07ms step:1034/1480 train_time:157774ms step_avg:154.08ms step:1035/1480 train_time:157935ms step_avg:154.08ms step:1036/1480 train_time:158097ms step_avg:154.09ms step:1037/1480 train_time:158259ms step_avg:154.10ms step:1038/1480 train_time:158419ms step_avg:154.10ms step:1039/1480 train_time:158582ms step_avg:154.11ms step:1040/1480 train_time:158743ms step_avg:154.12ms step:1041/1480 train_time:158903ms step_avg:154.12ms step:1042/1480 train_time:159062ms step_avg:154.13ms step:1043/1480 train_time:159221ms step_avg:154.14ms step:1044/1480 train_time:159381ms step_avg:154.14ms step:1045/1480 train_time:159543ms step_avg:154.15ms step:1046/1480 train_time:159703ms step_avg:154.15ms step:1047/1480 train_time:159862ms step_avg:154.16ms step:1048/1480 train_time:160023ms step_avg:154.16ms step:1049/1480 train_time:160183ms step_avg:154.17ms step:1050/1480 train_time:160347ms step_avg:154.18ms step:1051/1480 train_time:160507ms step_avg:154.19ms step:1052/1480 train_time:160668ms step_avg:154.19ms step:1053/1480 train_time:160827ms step_avg:154.20ms step:1054/1480 train_time:160987ms step_avg:154.20ms step:1055/1480 train_time:161147ms step_avg:154.21ms step:1056/1480 train_time:161307ms step_avg:154.21ms step:1057/1480 train_time:161466ms step_avg:154.22ms step:1058/1480 train_time:161627ms step_avg:154.22ms step:1059/1480 train_time:161788ms step_avg:154.23ms step:1060/1480 train_time:161950ms step_avg:154.24ms step:1061/1480 train_time:162108ms step_avg:154.24ms step:1062/1480 train_time:162266ms step_avg:154.25ms step:1063/1480 train_time:162425ms step_avg:154.25ms step:1064/1480 train_time:162582ms step_avg:154.25ms step:1065/1480 
train_time:162743ms step_avg:154.26ms step:1066/1480 train_time:162904ms step_avg:154.27ms step:1067/1480 train_time:163066ms step_avg:154.27ms step:1068/1480 train_time:163226ms step_avg:154.28ms step:1069/1480 train_time:163388ms step_avg:154.28ms step:1070/1480 train_time:163547ms step_avg:154.29ms step:1071/1480 train_time:163709ms step_avg:154.30ms step:1072/1480 train_time:163867ms step_avg:154.30ms step:1073/1480 train_time:164024ms step_avg:154.30ms step:1074/1480 train_time:164183ms step_avg:154.31ms step:1075/1480 train_time:164344ms step_avg:154.31ms step:1076/1480 train_time:164503ms step_avg:154.32ms step:1077/1480 train_time:164662ms step_avg:154.32ms step:1078/1480 train_time:164827ms step_avg:154.33ms step:1079/1480 train_time:164990ms step_avg:154.34ms step:1080/1480 train_time:165151ms step_avg:154.35ms step:1081/1480 train_time:165310ms step_avg:154.35ms step:1082/1480 train_time:165468ms step_avg:154.35ms step:1083/1480 train_time:165628ms step_avg:154.36ms step:1084/1480 train_time:165787ms step_avg:154.36ms step:1085/1480 train_time:165948ms step_avg:154.37ms step:1086/1480 train_time:166107ms step_avg:154.37ms step:1087/1480 train_time:166267ms step_avg:154.38ms step:1088/1480 train_time:166427ms step_avg:154.38ms step:1089/1480 train_time:166588ms step_avg:154.39ms step:1090/1480 train_time:166751ms step_avg:154.40ms step:1091/1480 train_time:166911ms step_avg:154.40ms step:1092/1480 train_time:167071ms step_avg:154.41ms step:1093/1480 train_time:167231ms step_avg:154.41ms step:1094/1480 train_time:167389ms step_avg:154.42ms step:1095/1480 train_time:167548ms step_avg:154.42ms step:1096/1480 train_time:167709ms step_avg:154.43ms step:1097/1480 train_time:167871ms step_avg:154.43ms step:1098/1480 train_time:168033ms step_avg:154.44ms step:1099/1480 train_time:168196ms step_avg:154.45ms step:1100/1480 train_time:168361ms step_avg:154.46ms step:1101/1480 train_time:168525ms step_avg:154.47ms step:1102/1480 train_time:168687ms step_avg:154.47ms 
step:1103/1480 train_time:168852ms step_avg:154.48ms step:1104/1480 train_time:169012ms step_avg:154.49ms step:1105/1480 train_time:169172ms step_avg:154.50ms step:1106/1480 train_time:169332ms step_avg:154.50ms step:1107/1480 train_time:169493ms step_avg:154.51ms step:1108/1480 train_time:169653ms step_avg:154.51ms step:1109/1480 train_time:169812ms step_avg:154.51ms step:1110/1480 train_time:169973ms step_avg:154.52ms step:1111/1480 train_time:170135ms step_avg:154.53ms step:1112/1480 train_time:170298ms step_avg:154.54ms step:1113/1480 train_time:170467ms step_avg:154.55ms step:1114/1480 train_time:170629ms step_avg:154.56ms step:1115/1480 train_time:170790ms step_avg:154.56ms step:1116/1480 train_time:170949ms step_avg:154.56ms step:1117/1480 train_time:171111ms step_avg:154.57ms step:1118/1480 train_time:171276ms step_avg:154.58ms step:1119/1480 train_time:171438ms step_avg:154.59ms step:1120/1480 train_time:171602ms step_avg:154.60ms step:1121/1480 train_time:171765ms step_avg:154.60ms step:1122/1480 train_time:171926ms step_avg:154.61ms step:1123/1480 train_time:172084ms step_avg:154.61ms step:1124/1480 train_time:172248ms step_avg:154.62ms step:1125/1480 train_time:172410ms step_avg:154.63ms step:1125/1480 val_loss:3.3877 train_time:172484ms step_avg:154.69ms step:1126/1480 train_time:172579ms step_avg:154.64ms step:1127/1480 train_time:172736ms step_avg:154.64ms step:1128/1480 train_time:172897ms step_avg:154.65ms step:1129/1480 train_time:173060ms step_avg:154.66ms step:1130/1480 train_time:173220ms step_avg:154.66ms step:1131/1480 train_time:173386ms step_avg:154.67ms step:1132/1480 train_time:173545ms step_avg:154.67ms step:1133/1480 train_time:173707ms step_avg:154.68ms step:1134/1480 train_time:173870ms step_avg:154.69ms step:1135/1480 train_time:174032ms step_avg:154.70ms step:1136/1480 train_time:174196ms step_avg:154.70ms step:1137/1480 train_time:174357ms step_avg:154.71ms step:1138/1480 train_time:174520ms step_avg:154.72ms step:1139/1480 
train_time:174707ms step_avg:154.74ms step:1140/1480 train_time:174840ms step_avg:154.73ms step:1141/1480 train_time:175004ms step_avg:154.73ms step:1142/1480 train_time:175164ms step_avg:154.74ms step:1143/1480 train_time:175330ms step_avg:154.75ms step:1144/1480 train_time:175492ms step_avg:154.76ms step:1145/1480 train_time:175652ms step_avg:154.76ms step:1146/1480 train_time:175815ms step_avg:154.77ms step:1147/1480 train_time:175977ms step_avg:154.77ms step:1148/1480 train_time:176138ms step_avg:154.78ms step:1149/1480 train_time:176299ms step_avg:154.78ms step:1150/1480 train_time:176459ms step_avg:154.79ms step:1151/1480 train_time:176622ms step_avg:154.80ms step:1152/1480 train_time:176785ms step_avg:154.80ms step:1153/1480 train_time:176950ms step_avg:154.81ms step:1154/1480 train_time:177112ms step_avg:154.82ms step:1155/1480 train_time:177275ms step_avg:154.83ms step:1156/1480 train_time:177440ms step_avg:154.83ms step:1157/1480 train_time:177602ms step_avg:154.84ms step:1158/1480 train_time:177762ms step_avg:154.84ms step:1159/1480 train_time:177922ms step_avg:154.85ms step:1160/1480 train_time:178083ms step_avg:154.85ms step:1161/1480 train_time:178245ms step_avg:154.86ms step:1162/1480 train_time:178409ms step_avg:154.87ms step:1163/1480 train_time:178574ms step_avg:154.88ms step:1164/1480 train_time:178737ms step_avg:154.88ms step:1165/1480 train_time:178896ms step_avg:154.89ms step:1166/1480 train_time:179058ms step_avg:154.89ms step:1167/1480 train_time:179217ms step_avg:154.90ms step:1168/1480 train_time:179378ms step_avg:154.90ms step:1169/1480 train_time:179540ms step_avg:154.91ms step:1170/1480 train_time:179700ms step_avg:154.91ms step:1171/1480 train_time:179861ms step_avg:154.92ms step:1172/1480 train_time:180019ms step_avg:154.92ms step:1173/1480 train_time:180181ms step_avg:154.93ms step:1174/1480 train_time:180350ms step_avg:154.94ms step:1175/1480 train_time:180513ms step_avg:154.95ms step:1176/1480 train_time:180677ms step_avg:154.95ms 
step:1177/1480 train_time:180841ms step_avg:154.96ms step:1178/1480 train_time:181001ms step_avg:154.97ms step:1179/1480 train_time:181160ms step_avg:154.97ms step:1180/1480 train_time:181328ms step_avg:154.98ms step:1181/1480 train_time:181491ms step_avg:154.99ms step:1182/1480 train_time:181653ms step_avg:154.99ms step:1183/1480 train_time:181815ms step_avg:155.00ms step:1184/1480 train_time:181976ms step_avg:155.01ms step:1185/1480 train_time:182141ms step_avg:155.01ms step:1186/1480 train_time:182302ms step_avg:155.02ms step:1187/1480 train_time:182476ms step_avg:155.04ms step:1188/1480 train_time:182636ms step_avg:155.04ms step:1189/1480 train_time:182798ms step_avg:155.04ms step:1190/1480 train_time:182958ms step_avg:155.05ms step:1191/1480 train_time:183120ms step_avg:155.06ms step:1192/1480 train_time:183280ms step_avg:155.06ms step:1193/1480 train_time:183440ms step_avg:155.06ms step:1194/1480 train_time:183602ms step_avg:155.07ms step:1195/1480 train_time:183764ms step_avg:155.08ms step:1196/1480 train_time:183935ms step_avg:155.09ms step:1197/1480 train_time:184097ms step_avg:155.09ms step:1198/1480 train_time:184266ms step_avg:155.11ms step:1199/1480 train_time:184429ms step_avg:155.11ms step:1200/1480 train_time:184591ms step_avg:155.12ms step:1201/1480 train_time:184753ms step_avg:155.12ms step:1202/1480 train_time:184920ms step_avg:155.13ms step:1203/1480 train_time:185085ms step_avg:155.14ms step:1204/1480 train_time:185251ms step_avg:155.15ms step:1205/1480 train_time:185414ms step_avg:155.16ms step:1206/1480 train_time:185575ms step_avg:155.16ms step:1207/1480 train_time:185736ms step_avg:155.17ms step:1208/1480 train_time:185897ms step_avg:155.17ms step:1209/1480 train_time:186061ms step_avg:155.18ms step:1210/1480 train_time:186227ms step_avg:155.19ms step:1211/1480 train_time:186390ms step_avg:155.20ms step:1212/1480 train_time:186554ms step_avg:155.20ms step:1213/1480 train_time:186718ms step_avg:155.21ms step:1214/1480 train_time:186883ms 
step_avg:155.22ms step:1215/1480 train_time:187048ms step_avg:155.23ms step:1216/1480 train_time:187209ms step_avg:155.23ms step:1217/1480 train_time:187373ms step_avg:155.24ms step:1218/1480 train_time:187537ms step_avg:155.25ms step:1219/1480 train_time:187703ms step_avg:155.26ms step:1220/1480 train_time:187867ms step_avg:155.26ms step:1221/1480 train_time:188028ms step_avg:155.27ms step:1222/1480 train_time:188190ms step_avg:155.27ms step:1223/1480 train_time:188353ms step_avg:155.28ms step:1224/1480 train_time:188519ms step_avg:155.29ms step:1225/1480 train_time:188682ms step_avg:155.29ms step:1226/1480 train_time:188847ms step_avg:155.30ms step:1227/1480 train_time:189013ms step_avg:155.31ms step:1228/1480 train_time:189176ms step_avg:155.32ms step:1229/1480 train_time:189338ms step_avg:155.32ms step:1230/1480 train_time:189507ms step_avg:155.33ms step:1231/1480 train_time:189673ms step_avg:155.34ms step:1232/1480 train_time:189838ms step_avg:155.35ms step:1233/1480 train_time:189999ms step_avg:155.35ms step:1234/1480 train_time:190160ms step_avg:155.36ms step:1235/1480 train_time:190326ms step_avg:155.37ms step:1236/1480 train_time:190488ms step_avg:155.37ms step:1237/1480 train_time:190651ms step_avg:155.38ms step:1238/1480 train_time:190825ms step_avg:155.39ms step:1239/1480 train_time:190987ms step_avg:155.40ms step:1240/1480 train_time:191152ms step_avg:155.41ms step:1241/1480 train_time:191317ms step_avg:155.42ms step:1242/1480 train_time:191478ms step_avg:155.42ms step:1243/1480 train_time:191641ms step_avg:155.43ms step:1244/1480 train_time:191800ms step_avg:155.43ms step:1245/1480 train_time:191961ms step_avg:155.43ms step:1246/1480 train_time:192121ms step_avg:155.44ms step:1247/1480 train_time:192283ms step_avg:155.44ms step:1248/1480 train_time:192446ms step_avg:155.45ms step:1249/1480 train_time:192606ms step_avg:155.45ms step:1250/1480 train_time:192771ms step_avg:155.46ms step:1250/1480 val_loss:3.3386 train_time:192847ms step_avg:155.52ms 
step:1251/1480 train_time:192940ms step_avg:155.47ms step:1252/1480 train_time:193103ms step_avg:155.48ms step:1253/1480 train_time:193264ms step_avg:155.48ms step:1254/1480 train_time:193425ms step_avg:155.49ms step:1255/1480 train_time:193596ms step_avg:155.50ms step:1256/1480 train_time:193760ms step_avg:155.51ms step:1257/1480 train_time:193921ms step_avg:155.51ms step:1258/1480 train_time:194086ms step_avg:155.52ms step:1259/1480 train_time:194251ms step_avg:155.52ms step:1260/1480 train_time:194412ms step_avg:155.53ms step:1261/1480 train_time:194575ms step_avg:155.54ms step:1262/1480 train_time:194739ms step_avg:155.54ms step:1263/1480 train_time:194906ms step_avg:155.55ms step:1264/1480 train_time:195066ms step_avg:155.55ms step:1265/1480 train_time:195226ms step_avg:155.56ms step:1266/1480 train_time:195390ms step_avg:155.57ms step:1267/1480 train_time:195552ms step_avg:155.57ms step:1268/1480 train_time:195715ms step_avg:155.58ms step:1269/1480 train_time:195879ms step_avg:155.58ms step:1270/1480 train_time:196041ms step_avg:155.59ms step:1271/1480 train_time:196204ms step_avg:155.59ms step:1272/1480 train_time:196366ms step_avg:155.60ms step:1273/1480 train_time:196528ms step_avg:155.60ms step:1274/1480 train_time:196693ms step_avg:155.61ms step:1275/1480 train_time:196854ms step_avg:155.62ms step:1276/1480 train_time:197014ms step_avg:155.62ms step:1277/1480 train_time:197177ms step_avg:155.63ms step:1278/1480 train_time:197336ms step_avg:155.63ms step:1279/1480 train_time:197498ms step_avg:155.63ms step:1280/1480 train_time:197664ms step_avg:155.64ms step:1281/1480 train_time:197827ms step_avg:155.65ms step:1282/1480 train_time:197987ms step_avg:155.65ms step:1283/1480 train_time:198151ms step_avg:155.66ms step:1284/1480 train_time:198314ms step_avg:155.66ms step:1285/1480 train_time:198475ms step_avg:155.67ms step:1286/1480 train_time:198635ms step_avg:155.67ms step:1287/1480 train_time:198798ms step_avg:155.68ms step:1288/1480 train_time:198959ms 
step_avg:155.68ms step:1289/1480 train_time:199130ms step_avg:155.69ms step:1290/1480 train_time:199298ms step_avg:155.70ms step:1291/1480 train_time:199461ms step_avg:155.71ms step:1292/1480 train_time:199626ms step_avg:155.71ms step:1293/1480 train_time:199795ms step_avg:155.72ms step:1294/1480 train_time:199957ms step_avg:155.73ms step:1295/1480 train_time:200119ms step_avg:155.73ms step:1296/1480 train_time:200282ms step_avg:155.74ms step:1297/1480 train_time:200446ms step_avg:155.75ms step:1298/1480 train_time:200611ms step_avg:155.75ms step:1299/1480 train_time:200774ms step_avg:155.76ms step:1300/1480 train_time:200935ms step_avg:155.76ms step:1301/1480 train_time:201095ms step_avg:155.77ms step:1302/1480 train_time:201259ms step_avg:155.77ms step:1303/1480 train_time:201425ms step_avg:155.78ms step:1304/1480 train_time:201592ms step_avg:155.79ms step:1305/1480 train_time:201754ms step_avg:155.79ms step:1306/1480 train_time:201918ms step_avg:155.80ms step:1307/1480 train_time:202078ms step_avg:155.80ms step:1308/1480 train_time:202240ms step_avg:155.81ms step:1309/1480 train_time:202406ms step_avg:155.82ms step:1310/1480 train_time:202568ms step_avg:155.82ms step:1311/1480 train_time:202731ms step_avg:155.83ms step:1312/1480 train_time:202897ms step_avg:155.83ms step:1313/1480 train_time:203058ms step_avg:155.84ms step:1314/1480 train_time:203222ms step_avg:155.84ms step:1315/1480 train_time:203386ms step_avg:155.85ms step:1316/1480 train_time:203546ms step_avg:155.85ms step:1317/1480 train_time:203709ms step_avg:155.86ms step:1318/1480 train_time:203877ms step_avg:155.87ms step:1319/1480 train_time:204042ms step_avg:155.88ms step:1320/1480 train_time:204211ms step_avg:155.89ms step:1321/1480 train_time:204374ms step_avg:155.89ms step:1322/1480 train_time:204544ms step_avg:155.90ms step:1323/1480 train_time:204710ms step_avg:155.91ms step:1324/1480 train_time:204873ms step_avg:155.92ms step:1325/1480 train_time:205042ms step_avg:155.93ms step:1326/1480 
train_time:205209ms step_avg:155.93ms step:1327/1480 train_time:205373ms step_avg:155.94ms step:1328/1480 train_time:205535ms step_avg:155.94ms step:1329/1480 train_time:205733ms step_avg:155.98ms step:1330/1480 train_time:205883ms step_avg:155.97ms step:1331/1480 train_time:206046ms step_avg:155.98ms step:1332/1480 train_time:206211ms step_avg:155.98ms step:1333/1480 train_time:206376ms step_avg:155.99ms step:1334/1480 train_time:206538ms step_avg:156.00ms step:1335/1480 train_time:206698ms step_avg:156.00ms step:1336/1480 train_time:206867ms step_avg:156.01ms step:1337/1480 train_time:207033ms step_avg:156.02ms step:1338/1480 train_time:207197ms step_avg:156.02ms step:1339/1480 train_time:207360ms step_avg:156.03ms step:1340/1480 train_time:207524ms step_avg:156.03ms step:1341/1480 train_time:207686ms step_avg:156.04ms step:1342/1480 train_time:207852ms step_avg:156.05ms step:1343/1480 train_time:208015ms step_avg:156.05ms step:1344/1480 train_time:208178ms step_avg:156.06ms step:1345/1480 train_time:208346ms step_avg:156.06ms step:1346/1480 train_time:208508ms step_avg:156.07ms step:1347/1480 train_time:208672ms step_avg:156.07ms step:1348/1480 train_time:208834ms step_avg:156.08ms step:1349/1480 train_time:208996ms step_avg:156.08ms step:1350/1480 train_time:209163ms step_avg:156.09ms step:1351/1480 train_time:209325ms step_avg:156.10ms step:1352/1480 train_time:209489ms step_avg:156.10ms step:1353/1480 train_time:209655ms step_avg:156.11ms step:1354/1480 train_time:209817ms step_avg:156.11ms step:1355/1480 train_time:209978ms step_avg:156.12ms step:1356/1480 train_time:210142ms step_avg:156.12ms step:1357/1480 train_time:210308ms step_avg:156.13ms step:1358/1480 train_time:210473ms step_avg:156.14ms step:1359/1480 train_time:210637ms step_avg:156.14ms step:1360/1480 train_time:210802ms step_avg:156.15ms step:1361/1480 train_time:210970ms step_avg:156.16ms step:1362/1480 train_time:211135ms step_avg:156.17ms step:1363/1480 train_time:211304ms step_avg:156.17ms 
step:1364/1480 train_time:211466ms step_avg:156.18ms step:1365/1480 train_time:211626ms step_avg:156.18ms step:1366/1480 train_time:211790ms step_avg:156.19ms step:1367/1480 train_time:211952ms step_avg:156.19ms step:1368/1480 train_time:212117ms step_avg:156.20ms step:1369/1480 train_time:212285ms step_avg:156.21ms step:1370/1480 train_time:212452ms step_avg:156.21ms step:1371/1480 train_time:212615ms step_avg:156.22ms step:1372/1480 train_time:212782ms step_avg:156.23ms step:1373/1480 train_time:212942ms step_avg:156.23ms step:1374/1480 train_time:213111ms step_avg:156.24ms step:1375/1480 train_time:213273ms step_avg:156.24ms step:1375/1480 val_loss:3.3000 train_time:213348ms step_avg:156.30ms step:1376/1480 train_time:213439ms step_avg:156.25ms step:1377/1480 train_time:213601ms step_avg:156.26ms step:1378/1480 train_time:213762ms step_avg:156.26ms step:1379/1480 train_time:213925ms step_avg:156.26ms step:1380/1480 train_time:214090ms step_avg:156.27ms step:1381/1480 train_time:214258ms step_avg:156.28ms step:1382/1480 train_time:214422ms step_avg:156.28ms step:1383/1480 train_time:214583ms step_avg:156.29ms step:1384/1480 train_time:214750ms step_avg:156.30ms step:1385/1480 train_time:214911ms step_avg:156.30ms step:1386/1480 train_time:215075ms step_avg:156.30ms step:1387/1480 train_time:215240ms step_avg:156.31ms step:1388/1480 train_time:215401ms step_avg:156.31ms step:1389/1480 train_time:215565ms step_avg:156.32ms step:1390/1480 train_time:215726ms step_avg:156.32ms step:1391/1480 train_time:215889ms step_avg:156.33ms step:1392/1480 train_time:216054ms step_avg:156.33ms step:1393/1480 train_time:216217ms step_avg:156.34ms step:1394/1480 train_time:216379ms step_avg:156.34ms step:1395/1480 train_time:216543ms step_avg:156.35ms step:1396/1480 train_time:216704ms step_avg:156.35ms step:1397/1480 train_time:216865ms step_avg:156.36ms step:1398/1480 train_time:217025ms step_avg:156.36ms step:1399/1480 train_time:217186ms step_avg:156.36ms step:1400/1480 
train_time:217358ms step_avg:156.37ms step:1401/1480 train_time:217518ms step_avg:156.38ms step:1402/1480 train_time:217679ms step_avg:156.38ms step:1403/1480 train_time:217844ms step_avg:156.38ms step:1404/1480 train_time:218007ms step_avg:156.39ms step:1405/1480 train_time:218173ms step_avg:156.40ms step:1406/1480 train_time:218339ms step_avg:156.40ms step:1407/1480 train_time:218500ms step_avg:156.41ms step:1408/1480 train_time:218660ms step_avg:156.41ms step:1409/1480 train_time:218833ms step_avg:156.42ms step:1410/1480 train_time:218996ms step_avg:156.43ms step:1411/1480 train_time:219156ms step_avg:156.43ms step:1412/1480 train_time:219318ms step_avg:156.43ms step:1413/1480 train_time:219480ms step_avg:156.44ms step:1414/1480 train_time:219644ms step_avg:156.44ms step:1415/1480 train_time:219810ms step_avg:156.45ms step:1416/1480 train_time:219985ms step_avg:156.46ms step:1417/1480 train_time:220149ms step_avg:156.47ms step:1418/1480 train_time:220315ms step_avg:156.47ms step:1419/1480 train_time:220478ms step_avg:156.48ms step:1420/1480 train_time:220642ms step_avg:156.48ms step:1421/1480 train_time:220805ms step_avg:156.49ms step:1422/1480 train_time:220971ms step_avg:156.50ms step:1423/1480 train_time:221134ms step_avg:156.50ms step:1424/1480 train_time:221301ms step_avg:156.51ms step:1425/1480 train_time:221470ms step_avg:156.52ms step:1426/1480 train_time:221634ms step_avg:156.52ms step:1427/1480 train_time:221801ms step_avg:156.53ms step:1428/1480 train_time:221963ms step_avg:156.53ms step:1429/1480 train_time:222123ms step_avg:156.53ms step:1430/1480 train_time:222287ms step_avg:156.54ms step:1431/1480 train_time:222452ms step_avg:156.55ms step:1432/1480 train_time:222620ms step_avg:156.55ms step:1433/1480 train_time:222787ms step_avg:156.56ms step:1434/1480 train_time:222957ms step_avg:156.57ms step:1435/1480 train_time:223122ms step_avg:156.58ms step:1436/1480 train_time:223287ms step_avg:156.58ms step:1437/1480 train_time:223449ms step_avg:156.59ms 
step:1438/1480 train_time:223611ms step_avg:156.59ms step:1439/1480 train_time:223776ms step_avg:156.60ms step:1440/1480 train_time:223939ms step_avg:156.60ms step:1441/1480 train_time:224102ms step_avg:156.61ms step:1442/1480 train_time:224268ms step_avg:156.61ms step:1443/1480 train_time:224442ms step_avg:156.62ms step:1444/1480 train_time:224604ms step_avg:156.63ms step:1445/1480 train_time:224765ms step_avg:156.63ms step:1446/1480 train_time:224933ms step_avg:156.64ms step:1447/1480 train_time:225101ms step_avg:156.65ms step:1448/1480 train_time:225263ms step_avg:156.65ms step:1449/1480 train_time:225427ms step_avg:156.66ms step:1450/1480 train_time:225592ms step_avg:156.66ms step:1451/1480 train_time:225756ms step_avg:156.67ms step:1452/1480 train_time:225920ms step_avg:156.67ms step:1453/1480 train_time:226082ms step_avg:156.68ms step:1454/1480 train_time:226244ms step_avg:156.68ms step:1455/1480 train_time:226415ms step_avg:156.69ms step:1456/1480 train_time:226578ms step_avg:156.69ms step:1457/1480 train_time:226739ms step_avg:156.70ms step:1458/1480 train_time:226901ms step_avg:156.70ms step:1459/1480 train_time:227066ms step_avg:156.71ms step:1460/1480 train_time:227229ms step_avg:156.71ms step:1461/1480 train_time:227394ms step_avg:156.72ms step:1462/1480 train_time:227559ms step_avg:156.72ms step:1463/1480 train_time:227724ms step_avg:156.73ms step:1464/1480 train_time:227890ms step_avg:156.73ms step:1465/1480 train_time:228054ms step_avg:156.74ms step:1466/1480 train_time:228217ms step_avg:156.74ms step:1467/1480 train_time:228382ms step_avg:156.75ms step:1468/1480 train_time:228544ms step_avg:156.75ms step:1469/1480 train_time:228707ms step_avg:156.76ms step:1470/1480 train_time:228875ms step_avg:156.76ms step:1471/1480 train_time:229044ms step_avg:156.77ms step:1472/1480 train_time:229216ms step_avg:156.78ms step:1473/1480 train_time:229379ms step_avg:156.79ms step:1474/1480 train_time:229545ms step_avg:156.79ms step:1475/1480 train_time:229715ms 
step_avg:156.80ms step:1476/1480 train_time:229879ms step_avg:156.81ms step:1477/1480 train_time:230046ms step_avg:156.81ms step:1478/1480 train_time:230218ms step_avg:156.82ms step:1479/1480 train_time:230383ms step_avg:156.83ms step:1480/1480 train_time:230545ms step_avg:156.83ms step:1480/1480 val_loss:3.2810 train_time:230621ms step_avg:156.89ms peak memory consumption: 34239 MiB