import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: 2D tensor to orthogonalize (asserted).
        steps: number of quintic iterations to run.
        eps: added to the norm before dividing, to avoid division by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # iterate on the wide orientation so that X @ X.T is the smaller Gram matrix
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose applied above
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        # the distributed layout is read from the environment (set by torchrun)
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct element count, so that every parameter in a group
        # fits the same flat per-rank all_gather buffers
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group, parameters are processed in chunks of `world_size`: each rank
        orthogonalizes the update for one parameter of the chunk, the flattened updates are
        exchanged with an async all_gather, and the gathered updates of the *previous* chunk
        are applied while the current gather is in flight.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # apply the updates gathered for the previous chunk (no-op before the first gather);
                # `handle` and `params_world` are read from the enclosing scope at call time
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # the buffers are about to be overwritten, so consume the previous chunk first
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            # apply the last chunk of this group
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; cos/sin tables are cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # the sequence axis is dim 1 and the rotated feature axis is dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # (re)build the tables only when the sequence length changes
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # learnable mix between this layer's values and the token value embedding `vi`
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learnable mix of `x` with the initial embedding `x0`, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # initialised to [1, 0], i.e. the block starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-embedding tables whose outputs are mirrored into a list of twelve value embeddings."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        # six tables; forward() mirrors them, giving one entry per layer of the 12-layer default
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        # append the same tensors in reverse order: [e0..e5, e5..e0]
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections; `forward` returns the training loss."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the cross-entropy loss of `inputs` against `targets`.

        Arguments:
            inputs: 1D tensor of token ids (asserted); its length must be a multiple of 128
                because it is viewed in blocks of BLOCK_SIZE.
            targets: token ids flattened and passed to the cross-entropy as class indices.
            sliding_window_num_blocks: tensor holding the attention window size,
                measured in blocks of BLOCK_SIZE tokens.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # number of attention blocks, derived from the input length
        # (512 for the 64*1024-token sequences configured in Hyperparameters)
        total_num_blocks = inputs.size(0) // BLOCK_SIZE
        # running count of token id 50256 gives every position a document index
        docs = (inputs == 50256).cumsum(0)
        # first / last document index inside each block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # token-level mask: attend only backwards and only within the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # per row: the count of set entries, and column indices sorted so set entries come first
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_num_blocks, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # blocks that are nonzero but not full are the partial ones that still need mask_mod
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Read only the header of a data shard, validate it, and return its claimed token count."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256-int32 header into a pinned CPU tensor."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Iterates over token shards, handing each process its own `seq_len`-token slice per batch."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Rewind to the start of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        # each rank starts at its own offset within the shard
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return `(inputs, targets)` on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. 
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 10:53:40 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 125W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 36C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29014ms step_avg:nanms step:2/1480 train_time:29119ms step_avg:nanms step:3/1480 train_time:29242ms step_avg:nanms step:4/1480 train_time:29382ms step_avg:nanms step:5/1480 train_time:29523ms step_avg:nanms step:6/1480 train_time:29665ms step_avg:nanms step:7/1480 train_time:29807ms step_avg:nanms step:8/1480 train_time:29949ms step_avg:nanms step:9/1480 train_time:30092ms step_avg:nanms step:10/1480 train_time:30239ms step_avg:nanms step:11/1480 train_time:146ms step_avg:nanms step:12/1480 train_time:280ms step_avg:nanms step:13/1480 train_time:423ms step_avg:140.93ms step:14/1480 train_time:564ms step_avg:140.96ms step:15/1480 train_time:707ms step_avg:141.34ms step:16/1480 train_time:849ms step_avg:141.45ms step:17/1480 train_time:992ms step_avg:141.66ms step:18/1480 train_time:1135ms step_avg:141.83ms step:19/1480 train_time:1278ms step_avg:141.97ms step:20/1480 train_time:1419ms step_avg:141.86ms step:21/1480 train_time:1561ms step_avg:141.95ms step:22/1480 train_time:1705ms step_avg:142.05ms step:23/1480 train_time:1847ms step_avg:142.09ms step:24/1480 train_time:1990ms step_avg:142.11ms step:25/1480 train_time:2132ms step_avg:142.13ms step:26/1480 train_time:2275ms step_avg:142.20ms step:27/1480 train_time:2418ms step_avg:142.21ms step:28/1480 train_time:2560ms step_avg:142.21ms 
step:29/1480 train_time:2702ms step_avg:142.20ms step:30/1480 train_time:2845ms step_avg:142.26ms step:31/1480 train_time:2987ms step_avg:142.24ms step:32/1480 train_time:3129ms step_avg:142.25ms step:33/1480 train_time:3272ms step_avg:142.28ms step:34/1480 train_time:3414ms step_avg:142.26ms step:35/1480 train_time:3558ms step_avg:142.34ms step:36/1480 train_time:3700ms step_avg:142.33ms step:37/1480 train_time:3842ms step_avg:142.29ms step:38/1480 train_time:3983ms step_avg:142.27ms step:39/1480 train_time:4126ms step_avg:142.26ms step:40/1480 train_time:4267ms step_avg:142.24ms step:41/1480 train_time:4408ms step_avg:142.21ms step:42/1480 train_time:4552ms step_avg:142.25ms step:43/1480 train_time:4694ms step_avg:142.25ms step:44/1480 train_time:4836ms step_avg:142.23ms step:45/1480 train_time:4978ms step_avg:142.22ms step:46/1480 train_time:5119ms step_avg:142.20ms step:47/1480 train_time:5263ms step_avg:142.24ms step:48/1480 train_time:5407ms step_avg:142.28ms step:49/1480 train_time:5549ms step_avg:142.28ms step:50/1480 train_time:5692ms step_avg:142.29ms step:51/1480 train_time:5834ms step_avg:142.28ms step:52/1480 train_time:5977ms step_avg:142.31ms step:53/1480 train_time:6119ms step_avg:142.30ms step:54/1480 train_time:6261ms step_avg:142.29ms step:55/1480 train_time:6403ms step_avg:142.29ms step:56/1480 train_time:6545ms step_avg:142.28ms step:57/1480 train_time:6688ms step_avg:142.30ms step:58/1480 train_time:6831ms step_avg:142.32ms step:59/1480 train_time:6974ms step_avg:142.34ms step:60/1480 train_time:7117ms step_avg:142.33ms step:61/1480 train_time:7259ms step_avg:142.33ms step:62/1480 train_time:7402ms step_avg:142.34ms step:63/1480 train_time:7546ms step_avg:142.39ms step:64/1480 train_time:7690ms step_avg:142.41ms step:65/1480 train_time:7832ms step_avg:142.40ms step:66/1480 train_time:7976ms step_avg:142.42ms step:67/1480 train_time:8119ms step_avg:142.44ms step:68/1480 train_time:8262ms step_avg:142.44ms step:69/1480 train_time:8403ms 
step_avg:142.43ms step:70/1480 train_time:8548ms step_avg:142.46ms step:71/1480 train_time:8690ms step_avg:142.45ms step:72/1480 train_time:8833ms step_avg:142.47ms step:73/1480 train_time:8976ms step_avg:142.48ms step:74/1480 train_time:9119ms step_avg:142.49ms step:75/1480 train_time:9261ms step_avg:142.48ms step:76/1480 train_time:9408ms step_avg:142.54ms step:77/1480 train_time:9552ms step_avg:142.57ms step:78/1480 train_time:9694ms step_avg:142.56ms step:79/1480 train_time:10217ms step_avg:148.07ms step:80/1480 train_time:10314ms step_avg:147.35ms step:81/1480 train_time:10457ms step_avg:147.29ms step:82/1480 train_time:10599ms step_avg:147.20ms step:83/1480 train_time:10742ms step_avg:147.15ms step:84/1480 train_time:10884ms step_avg:147.08ms step:85/1480 train_time:11026ms step_avg:147.01ms step:86/1480 train_time:11169ms step_avg:146.96ms step:87/1480 train_time:11316ms step_avg:146.97ms step:88/1480 train_time:11462ms step_avg:146.95ms step:89/1480 train_time:11604ms step_avg:146.89ms step:90/1480 train_time:11748ms step_avg:146.86ms step:91/1480 train_time:11894ms step_avg:146.83ms step:92/1480 train_time:12036ms step_avg:146.78ms step:93/1480 train_time:12179ms step_avg:146.74ms step:94/1480 train_time:12323ms step_avg:146.70ms step:95/1480 train_time:12465ms step_avg:146.64ms step:96/1480 train_time:12607ms step_avg:146.60ms step:97/1480 train_time:13133ms step_avg:150.96ms step:98/1480 train_time:13237ms step_avg:150.42ms step:99/1480 train_time:13380ms step_avg:150.34ms step:100/1480 train_time:13523ms step_avg:150.26ms step:101/1480 train_time:13668ms step_avg:150.19ms step:102/1480 train_time:13807ms step_avg:150.07ms step:103/1480 train_time:13950ms step_avg:150.00ms step:104/1480 train_time:14092ms step_avg:149.92ms step:105/1480 train_time:14235ms step_avg:149.85ms step:106/1480 train_time:14378ms step_avg:149.77ms step:107/1480 train_time:14520ms step_avg:149.69ms step:108/1480 train_time:14663ms step_avg:149.62ms step:109/1480 
train_time:14806ms step_avg:149.55ms step:110/1480 train_time:14949ms step_avg:149.49ms step:111/1480 train_time:15095ms step_avg:149.45ms step:112/1480 train_time:15240ms step_avg:149.41ms step:113/1480 train_time:15386ms step_avg:149.38ms step:114/1480 train_time:15532ms step_avg:149.34ms step:115/1480 train_time:15678ms step_avg:149.31ms step:116/1480 train_time:15823ms step_avg:149.27ms step:117/1480 train_time:15969ms step_avg:149.25ms step:118/1480 train_time:16116ms step_avg:149.22ms step:119/1480 train_time:16260ms step_avg:149.18ms step:120/1480 train_time:16406ms step_avg:149.14ms step:121/1480 train_time:16552ms step_avg:149.12ms step:122/1480 train_time:16698ms step_avg:149.09ms step:123/1480 train_time:16842ms step_avg:149.05ms step:124/1480 train_time:16990ms step_avg:149.04ms step:125/1480 train_time:17136ms step_avg:149.01ms step:125/1480 val_loss:4.4206 train_time:17201ms step_avg:149.57ms step:126/1480 train_time:17298ms step_avg:149.12ms step:127/1480 train_time:17440ms step_avg:149.06ms step:128/1480 train_time:17586ms step_avg:149.04ms step:129/1480 train_time:17731ms step_avg:149.00ms step:130/1480 train_time:17876ms step_avg:148.96ms step:131/1480 train_time:18022ms step_avg:148.94ms step:132/1480 train_time:18168ms step_avg:148.92ms step:133/1480 train_time:18314ms step_avg:148.89ms step:134/1480 train_time:18459ms step_avg:148.86ms step:135/1480 train_time:18605ms step_avg:148.84ms step:136/1480 train_time:18750ms step_avg:148.81ms step:137/1480 train_time:18895ms step_avg:148.78ms step:138/1480 train_time:19042ms step_avg:148.77ms step:139/1480 train_time:19188ms step_avg:148.74ms step:140/1480 train_time:19334ms step_avg:148.72ms step:141/1480 train_time:19480ms step_avg:148.70ms step:142/1480 train_time:19626ms step_avg:148.68ms step:143/1480 train_time:19770ms step_avg:148.65ms step:144/1480 train_time:19916ms step_avg:148.63ms step:145/1480 train_time:20062ms step_avg:148.61ms step:146/1480 train_time:20207ms step_avg:148.58ms 
step:147/1480 train_time:20352ms step_avg:148.56ms step:148/1480 train_time:20498ms step_avg:148.54ms step:149/1480 train_time:20646ms step_avg:148.53ms step:150/1480 train_time:20790ms step_avg:148.50ms step:151/1480 train_time:20937ms step_avg:148.49ms step:152/1480 train_time:21083ms step_avg:148.47ms step:153/1480 train_time:21229ms step_avg:148.46ms step:154/1480 train_time:21373ms step_avg:148.42ms step:155/1480 train_time:21519ms step_avg:148.41ms step:156/1480 train_time:21667ms step_avg:148.40ms step:157/1480 train_time:21811ms step_avg:148.38ms step:158/1480 train_time:21957ms step_avg:148.36ms step:159/1480 train_time:22104ms step_avg:148.35ms step:160/1480 train_time:22250ms step_avg:148.33ms step:161/1480 train_time:22395ms step_avg:148.31ms step:162/1480 train_time:22541ms step_avg:148.30ms step:163/1480 train_time:22687ms step_avg:148.28ms step:164/1480 train_time:22833ms step_avg:148.27ms step:165/1480 train_time:22979ms step_avg:148.25ms step:166/1480 train_time:23126ms step_avg:148.25ms step:167/1480 train_time:23271ms step_avg:148.22ms step:168/1480 train_time:23417ms step_avg:148.21ms step:169/1480 train_time:23565ms step_avg:148.21ms step:170/1480 train_time:23710ms step_avg:148.19ms step:171/1480 train_time:23855ms step_avg:148.17ms step:172/1480 train_time:24001ms step_avg:148.15ms step:173/1480 train_time:24147ms step_avg:148.14ms step:174/1480 train_time:24292ms step_avg:148.12ms step:175/1480 train_time:24438ms step_avg:148.11ms step:176/1480 train_time:24584ms step_avg:148.10ms step:177/1480 train_time:24730ms step_avg:148.08ms step:178/1480 train_time:24874ms step_avg:148.06ms step:179/1480 train_time:25021ms step_avg:148.05ms step:180/1480 train_time:25168ms step_avg:148.04ms step:181/1480 train_time:25313ms step_avg:148.03ms step:182/1480 train_time:25458ms step_avg:148.01ms step:183/1480 train_time:25605ms step_avg:148.00ms step:184/1480 train_time:25750ms step_avg:147.99ms step:185/1480 train_time:25894ms step_avg:147.97ms 
step:186/1480 train_time:26041ms step_avg:147.96ms step:187/1480 train_time:26186ms step_avg:147.95ms step:188/1480 train_time:26332ms step_avg:147.93ms step:189/1480 train_time:26498ms step_avg:148.03ms step:190/1480 train_time:26624ms step_avg:147.91ms step:191/1480 train_time:26769ms step_avg:147.90ms step:192/1480 train_time:26915ms step_avg:147.89ms step:193/1480 train_time:27063ms step_avg:147.88ms step:194/1480 train_time:27208ms step_avg:147.87ms step:195/1480 train_time:27353ms step_avg:147.85ms step:196/1480 train_time:27499ms step_avg:147.84ms step:197/1480 train_time:27647ms step_avg:147.85ms step:198/1480 train_time:27792ms step_avg:147.83ms step:199/1480 train_time:27937ms step_avg:147.82ms step:200/1480 train_time:28084ms step_avg:147.81ms step:201/1480 train_time:28230ms step_avg:147.80ms step:202/1480 train_time:28374ms step_avg:147.78ms step:203/1480 train_time:28521ms step_avg:147.77ms step:204/1480 train_time:28667ms step_avg:147.77ms step:205/1480 train_time:28812ms step_avg:147.75ms step:206/1480 train_time:28957ms step_avg:147.74ms step:207/1480 train_time:29104ms step_avg:147.74ms step:208/1480 train_time:29250ms step_avg:147.73ms step:209/1480 train_time:29394ms step_avg:147.71ms step:210/1480 train_time:29540ms step_avg:147.70ms step:211/1480 train_time:29686ms step_avg:147.69ms step:212/1480 train_time:29832ms step_avg:147.68ms step:213/1480 train_time:29977ms step_avg:147.67ms step:214/1480 train_time:30124ms step_avg:147.66ms step:215/1480 train_time:30270ms step_avg:147.66ms step:216/1480 train_time:30416ms step_avg:147.65ms step:217/1480 train_time:30561ms step_avg:147.64ms step:218/1480 train_time:30708ms step_avg:147.63ms step:219/1480 train_time:30852ms step_avg:147.62ms step:220/1480 train_time:30998ms step_avg:147.61ms step:221/1480 train_time:31553ms step_avg:149.54ms step:222/1480 train_time:31664ms step_avg:149.36ms step:223/1480 train_time:32208ms step_avg:151.21ms step:224/1480 train_time:32317ms step_avg:151.02ms 
step:225/1480 train_time:32466ms step_avg:151.00ms step:226/1480 train_time:32614ms step_avg:150.99ms step:227/1480 train_time:32762ms step_avg:150.98ms step:228/1480 train_time:32910ms step_avg:150.96ms step:229/1480 train_time:33057ms step_avg:150.95ms step:230/1480 train_time:33207ms step_avg:150.94ms step:231/1480 train_time:33355ms step_avg:150.93ms step:232/1480 train_time:33505ms step_avg:150.92ms step:233/1480 train_time:33652ms step_avg:150.91ms step:234/1480 train_time:33800ms step_avg:150.89ms step:235/1480 train_time:33949ms step_avg:150.88ms step:236/1480 train_time:34097ms step_avg:150.87ms step:237/1480 train_time:34247ms step_avg:150.87ms step:238/1480 train_time:34394ms step_avg:150.85ms step:239/1480 train_time:34543ms step_avg:150.84ms step:240/1480 train_time:34691ms step_avg:150.83ms step:241/1480 train_time:34839ms step_avg:150.82ms step:242/1480 train_time:34987ms step_avg:150.81ms step:243/1480 train_time:35136ms step_avg:150.80ms step:244/1480 train_time:35286ms step_avg:150.79ms step:245/1480 train_time:35435ms step_avg:150.79ms step:246/1480 train_time:35583ms step_avg:150.78ms step:247/1480 train_time:35731ms step_avg:150.77ms step:248/1480 train_time:35879ms step_avg:150.75ms step:249/1480 train_time:36028ms step_avg:150.75ms step:250/1480 train_time:36176ms step_avg:150.73ms step:250/1480 val_loss:3.9936 train_time:36243ms step_avg:151.01ms step:251/1480 train_time:36338ms step_avg:150.78ms step:252/1480 train_time:36482ms step_avg:150.75ms step:253/1480 train_time:36630ms step_avg:150.74ms step:254/1480 train_time:36779ms step_avg:150.73ms step:255/1480 train_time:36926ms step_avg:150.72ms step:256/1480 train_time:37074ms step_avg:150.71ms step:257/1480 train_time:37221ms step_avg:150.69ms step:258/1480 train_time:37370ms step_avg:150.68ms step:259/1480 train_time:37519ms step_avg:150.68ms step:260/1480 train_time:37667ms step_avg:150.67ms step:261/1480 train_time:37815ms step_avg:150.66ms step:262/1480 train_time:37964ms 
step_avg:150.65ms step:263/1480 train_time:38111ms step_avg:150.64ms step:264/1480 train_time:38260ms step_avg:150.63ms step:265/1480 train_time:38408ms step_avg:150.62ms step:266/1480 train_time:38557ms step_avg:150.61ms step:267/1480 train_time:38705ms step_avg:150.60ms step:268/1480 train_time:38853ms step_avg:150.59ms step:269/1480 train_time:39002ms step_avg:150.59ms step:270/1480 train_time:39150ms step_avg:150.58ms step:271/1480 train_time:39300ms step_avg:150.57ms step:272/1480 train_time:39447ms step_avg:150.56ms step:273/1480 train_time:39597ms step_avg:150.56ms step:274/1480 train_time:39745ms step_avg:150.55ms step:275/1480 train_time:39893ms step_avg:150.54ms step:276/1480 train_time:40043ms step_avg:150.54ms step:277/1480 train_time:40191ms step_avg:150.53ms step:278/1480 train_time:40340ms step_avg:150.52ms step:279/1480 train_time:40488ms step_avg:150.51ms step:280/1480 train_time:40637ms step_avg:150.51ms step:281/1480 train_time:40786ms step_avg:150.50ms step:282/1480 train_time:40934ms step_avg:150.49ms step:283/1480 train_time:41083ms step_avg:150.49ms step:284/1480 train_time:41230ms step_avg:150.47ms step:285/1480 train_time:41380ms step_avg:150.47ms step:286/1480 train_time:41527ms step_avg:150.46ms step:287/1480 train_time:41676ms step_avg:150.46ms step:288/1480 train_time:41824ms step_avg:150.45ms step:289/1480 train_time:41973ms step_avg:150.44ms step:290/1480 train_time:42121ms step_avg:150.43ms step:291/1480 train_time:42272ms step_avg:150.43ms step:292/1480 train_time:42420ms step_avg:150.43ms step:293/1480 train_time:42569ms step_avg:150.42ms step:294/1480 train_time:42718ms step_avg:150.42ms step:295/1480 train_time:42867ms step_avg:150.41ms step:296/1480 train_time:43016ms step_avg:150.40ms step:297/1480 train_time:43164ms step_avg:150.40ms step:298/1480 train_time:43312ms step_avg:150.39ms step:299/1480 train_time:43460ms step_avg:150.38ms step:300/1480 train_time:43608ms step_avg:150.37ms step:301/1480 train_time:43761ms 
step_avg:150.38ms step:302/1480 train_time:43906ms step_avg:150.36ms step:303/1480 train_time:44055ms step_avg:150.36ms step:304/1480 train_time:44204ms step_avg:150.35ms step:305/1480 train_time:44352ms step_avg:150.34ms step:306/1480 train_time:44502ms step_avg:150.35ms step:307/1480 train_time:44649ms step_avg:150.33ms step:308/1480 train_time:44799ms step_avg:150.33ms step:309/1480 train_time:44946ms step_avg:150.32ms step:310/1480 train_time:45095ms step_avg:150.32ms step:311/1480 train_time:45244ms step_avg:150.31ms step:312/1480 train_time:45391ms step_avg:150.30ms step:313/1480 train_time:45541ms step_avg:150.30ms step:314/1480 train_time:45688ms step_avg:150.29ms step:315/1480 train_time:45836ms step_avg:150.28ms step:316/1480 train_time:45985ms step_avg:150.28ms step:317/1480 train_time:46134ms step_avg:150.27ms step:318/1480 train_time:46283ms step_avg:150.27ms step:319/1480 train_time:46431ms step_avg:150.26ms step:320/1480 train_time:46580ms step_avg:150.26ms step:321/1480 train_time:46727ms step_avg:150.25ms step:322/1480 train_time:46876ms step_avg:150.24ms step:323/1480 train_time:47025ms step_avg:150.24ms step:324/1480 train_time:47173ms step_avg:150.23ms step:325/1480 train_time:47323ms step_avg:150.23ms step:326/1480 train_time:47471ms step_avg:150.22ms step:327/1480 train_time:47620ms step_avg:150.22ms step:328/1480 train_time:47767ms step_avg:150.21ms step:329/1480 train_time:47915ms step_avg:150.20ms step:330/1480 train_time:48065ms step_avg:150.20ms step:331/1480 train_time:48215ms step_avg:150.20ms step:332/1480 train_time:48366ms step_avg:150.20ms step:333/1480 train_time:48516ms step_avg:150.20ms step:334/1480 train_time:48666ms step_avg:150.20ms step:335/1480 train_time:48817ms step_avg:150.21ms step:336/1480 train_time:48967ms step_avg:150.21ms step:337/1480 train_time:49118ms step_avg:150.21ms step:338/1480 train_time:49268ms step_avg:150.21ms step:339/1480 train_time:49419ms step_avg:150.21ms step:340/1480 train_time:49569ms 
step_avg:150.21ms step:341/1480 train_time:49720ms step_avg:150.21ms step:342/1480 train_time:49870ms step_avg:150.21ms step:343/1480 train_time:50022ms step_avg:150.22ms step:344/1480 train_time:50172ms step_avg:150.21ms step:345/1480 train_time:50322ms step_avg:150.22ms step:346/1480 train_time:50473ms step_avg:150.22ms step:347/1480 train_time:50625ms step_avg:150.22ms step:348/1480 train_time:50776ms step_avg:150.22ms step:349/1480 train_time:50927ms step_avg:150.23ms step:350/1480 train_time:51079ms step_avg:150.23ms step:351/1480 train_time:51229ms step_avg:150.23ms step:352/1480 train_time:51380ms step_avg:150.23ms step:353/1480 train_time:51530ms step_avg:150.23ms step:354/1480 train_time:51681ms step_avg:150.24ms step:355/1480 train_time:51831ms step_avg:150.24ms step:356/1480 train_time:51983ms step_avg:150.24ms step:357/1480 train_time:52134ms step_avg:150.24ms step:358/1480 train_time:52285ms step_avg:150.25ms step:359/1480 train_time:52436ms step_avg:150.25ms step:360/1480 train_time:52587ms step_avg:150.25ms step:361/1480 train_time:52739ms step_avg:150.25ms step:362/1480 train_time:52889ms step_avg:150.25ms step:363/1480 train_time:53040ms step_avg:150.26ms step:364/1480 train_time:53190ms step_avg:150.25ms step:365/1480 train_time:53342ms step_avg:150.26ms step:366/1480 train_time:53492ms step_avg:150.26ms step:367/1480 train_time:53643ms step_avg:150.26ms step:368/1480 train_time:53793ms step_avg:150.26ms step:369/1480 train_time:53945ms step_avg:150.26ms step:370/1480 train_time:54094ms step_avg:150.26ms step:371/1480 train_time:54245ms step_avg:150.26ms step:372/1480 train_time:54395ms step_avg:150.26ms step:373/1480 train_time:54546ms step_avg:150.26ms step:374/1480 train_time:54696ms step_avg:150.26ms step:375/1480 train_time:54847ms step_avg:150.26ms step:375/1480 val_loss:3.8035 train_time:54915ms step_avg:150.45ms step:376/1480 train_time:55006ms step_avg:150.29ms step:377/1480 train_time:55155ms step_avg:150.29ms step:378/1480 
train_time:55306ms step_avg:150.29ms step:379/1480 train_time:55469ms step_avg:150.32ms step:380/1480 train_time:55606ms step_avg:150.29ms step:381/1480 train_time:55756ms step_avg:150.29ms step:382/1480 train_time:55907ms step_avg:150.29ms step:383/1480 train_time:56058ms step_avg:150.29ms step:384/1480 train_time:56209ms step_avg:150.29ms step:385/1480 train_time:56361ms step_avg:150.30ms step:386/1480 train_time:56510ms step_avg:150.29ms step:387/1480 train_time:56662ms step_avg:150.30ms step:388/1480 train_time:56813ms step_avg:150.30ms step:389/1480 train_time:56964ms step_avg:150.30ms step:390/1480 train_time:57114ms step_avg:150.30ms step:391/1480 train_time:57265ms step_avg:150.30ms step:392/1480 train_time:57416ms step_avg:150.30ms step:393/1480 train_time:57566ms step_avg:150.30ms step:394/1480 train_time:57717ms step_avg:150.31ms step:395/1480 train_time:57868ms step_avg:150.31ms step:396/1480 train_time:58020ms step_avg:150.31ms step:397/1480 train_time:58170ms step_avg:150.31ms step:398/1480 train_time:58322ms step_avg:150.31ms step:399/1480 train_time:58472ms step_avg:150.31ms step:400/1480 train_time:58624ms step_avg:150.32ms step:401/1480 train_time:58775ms step_avg:150.32ms step:402/1480 train_time:58926ms step_avg:150.32ms step:403/1480 train_time:59077ms step_avg:150.32ms step:404/1480 train_time:59228ms step_avg:150.33ms step:405/1480 train_time:59380ms step_avg:150.33ms step:406/1480 train_time:59530ms step_avg:150.33ms step:407/1480 train_time:59681ms step_avg:150.33ms step:408/1480 train_time:59831ms step_avg:150.33ms step:409/1480 train_time:59983ms step_avg:150.33ms step:410/1480 train_time:60132ms step_avg:150.33ms step:411/1480 train_time:60284ms step_avg:150.33ms step:412/1480 train_time:60434ms step_avg:150.33ms step:413/1480 train_time:60584ms step_avg:150.33ms step:414/1480 train_time:60735ms step_avg:150.33ms step:415/1480 train_time:60887ms step_avg:150.34ms step:416/1480 train_time:61038ms step_avg:150.34ms step:417/1480 
train_time:61189ms step_avg:150.34ms step:418/1480 train_time:61340ms step_avg:150.34ms step:419/1480 train_time:61490ms step_avg:150.34ms step:420/1480 train_time:61641ms step_avg:150.34ms step:421/1480 train_time:61793ms step_avg:150.35ms step:422/1480 train_time:61944ms step_avg:150.35ms step:423/1480 train_time:62095ms step_avg:150.35ms step:424/1480 train_time:62246ms step_avg:150.35ms step:425/1480 train_time:62398ms step_avg:150.36ms step:426/1480 train_time:62549ms step_avg:150.36ms step:427/1480 train_time:62701ms step_avg:150.36ms step:428/1480 train_time:62851ms step_avg:150.36ms step:429/1480 train_time:63002ms step_avg:150.36ms step:430/1480 train_time:63152ms step_avg:150.36ms step:431/1480 train_time:63304ms step_avg:150.37ms step:432/1480 train_time:63454ms step_avg:150.36ms step:433/1480 train_time:63606ms step_avg:150.37ms step:434/1480 train_time:63756ms step_avg:150.37ms step:435/1480 train_time:63907ms step_avg:150.37ms step:436/1480 train_time:64058ms step_avg:150.37ms step:437/1480 train_time:64208ms step_avg:150.37ms step:438/1480 train_time:64359ms step_avg:150.37ms step:439/1480 train_time:64509ms step_avg:150.37ms step:440/1480 train_time:64661ms step_avg:150.38ms step:441/1480 train_time:64814ms step_avg:150.38ms step:442/1480 train_time:64967ms step_avg:150.39ms step:443/1480 train_time:65119ms step_avg:150.39ms step:444/1480 train_time:65272ms step_avg:150.40ms step:445/1480 train_time:65424ms step_avg:150.40ms step:446/1480 train_time:65578ms step_avg:150.41ms step:447/1480 train_time:65731ms step_avg:150.41ms step:448/1480 train_time:65884ms step_avg:150.42ms step:449/1480 train_time:66036ms step_avg:150.42ms step:450/1480 train_time:66189ms step_avg:150.43ms step:451/1480 train_time:66342ms step_avg:150.43ms step:452/1480 train_time:66494ms step_avg:150.44ms step:453/1480 train_time:66647ms step_avg:150.44ms step:454/1480 train_time:66800ms step_avg:150.45ms step:455/1480 train_time:66954ms step_avg:150.46ms step:456/1480 
train_time:67106ms step_avg:150.46ms step:457/1480 train_time:67259ms step_avg:150.47ms step:458/1480 train_time:67411ms step_avg:150.47ms step:459/1480 train_time:67565ms step_avg:150.48ms step:460/1480 train_time:67717ms step_avg:150.48ms step:461/1480 train_time:67870ms step_avg:150.49ms step:462/1480 train_time:68023ms step_avg:150.49ms step:463/1480 train_time:68176ms step_avg:150.50ms step:464/1480 train_time:68328ms step_avg:150.50ms step:465/1480 train_time:68481ms step_avg:150.51ms step:466/1480 train_time:68633ms step_avg:150.51ms step:467/1480 train_time:68786ms step_avg:150.52ms step:468/1480 train_time:68939ms step_avg:150.52ms step:469/1480 train_time:69091ms step_avg:150.53ms step:470/1480 train_time:69244ms step_avg:150.53ms step:471/1480 train_time:69397ms step_avg:150.54ms step:472/1480 train_time:69550ms step_avg:150.54ms step:473/1480 train_time:69703ms step_avg:150.55ms step:474/1480 train_time:69856ms step_avg:150.55ms step:475/1480 train_time:70008ms step_avg:150.56ms step:476/1480 train_time:70161ms step_avg:150.56ms step:477/1480 train_time:70313ms step_avg:150.56ms step:478/1480 train_time:70467ms step_avg:150.57ms step:479/1480 train_time:70620ms step_avg:150.58ms step:480/1480 train_time:70772ms step_avg:150.58ms step:481/1480 train_time:70926ms step_avg:150.59ms step:482/1480 train_time:71079ms step_avg:150.59ms step:483/1480 train_time:71231ms step_avg:150.59ms step:484/1480 train_time:71385ms step_avg:150.60ms step:485/1480 train_time:71537ms step_avg:150.61ms step:486/1480 train_time:71691ms step_avg:150.61ms step:487/1480 train_time:71844ms step_avg:150.62ms step:488/1480 train_time:71998ms step_avg:150.62ms step:489/1480 train_time:72151ms step_avg:150.63ms step:490/1480 train_time:72304ms step_avg:150.63ms step:491/1480 train_time:72457ms step_avg:150.64ms step:492/1480 train_time:72609ms step_avg:150.64ms step:493/1480 train_time:72763ms step_avg:150.65ms step:494/1480 train_time:72916ms step_avg:150.65ms step:495/1480 
train_time:73068ms step_avg:150.66ms step:496/1480 train_time:73222ms step_avg:150.66ms step:497/1480 train_time:73374ms step_avg:150.67ms step:498/1480 train_time:73527ms step_avg:150.67ms step:499/1480 train_time:73681ms step_avg:150.68ms step:500/1480 train_time:73834ms step_avg:150.68ms step:500/1480 val_loss:3.6840 train_time:73903ms step_avg:150.82ms step:501/1480 train_time:73994ms step_avg:150.70ms step:502/1480 train_time:74146ms step_avg:150.70ms step:503/1480 train_time:74299ms step_avg:150.71ms step:504/1480 train_time:74451ms step_avg:150.71ms step:505/1480 train_time:74603ms step_avg:150.71ms step:506/1480 train_time:74755ms step_avg:150.72ms step:507/1480 train_time:74907ms step_avg:150.72ms step:508/1480 train_time:75061ms step_avg:150.72ms step:509/1480 train_time:75215ms step_avg:150.73ms step:510/1480 train_time:75368ms step_avg:150.74ms step:511/1480 train_time:75521ms step_avg:150.74ms step:512/1480 train_time:75676ms step_avg:150.75ms step:513/1480 train_time:75828ms step_avg:150.75ms step:514/1480 train_time:75981ms step_avg:150.76ms step:515/1480 train_time:76135ms step_avg:150.76ms step:516/1480 train_time:76289ms step_avg:150.77ms step:517/1480 train_time:76442ms step_avg:150.77ms step:518/1480 train_time:76595ms step_avg:150.78ms step:519/1480 train_time:76748ms step_avg:150.78ms step:520/1480 train_time:76901ms step_avg:150.79ms step:521/1480 train_time:77054ms step_avg:150.79ms step:522/1480 train_time:77208ms step_avg:150.80ms step:523/1480 train_time:77362ms step_avg:150.80ms step:524/1480 train_time:77515ms step_avg:150.81ms step:525/1480 train_time:77667ms step_avg:150.81ms step:526/1480 train_time:77820ms step_avg:150.81ms step:527/1480 train_time:77974ms step_avg:150.82ms step:528/1480 train_time:78127ms step_avg:150.83ms step:529/1480 train_time:78281ms step_avg:150.83ms step:530/1480 train_time:78435ms step_avg:150.84ms step:531/1480 train_time:78588ms step_avg:150.84ms step:532/1480 train_time:78741ms step_avg:150.85ms 
step:533/1480 train_time:78895ms step_avg:150.85ms step:534/1480 train_time:79047ms step_avg:150.85ms step:535/1480 train_time:79199ms step_avg:150.85ms step:536/1480 train_time:79351ms step_avg:150.86ms step:537/1480 train_time:79505ms step_avg:150.86ms step:538/1480 train_time:79658ms step_avg:150.87ms step:539/1480 train_time:79812ms step_avg:150.87ms step:540/1480 train_time:79965ms step_avg:150.88ms step:541/1480 train_time:80118ms step_avg:150.88ms step:542/1480 train_time:80271ms step_avg:150.88ms step:543/1480 train_time:80423ms step_avg:150.89ms step:544/1480 train_time:80576ms step_avg:150.89ms step:545/1480 train_time:80728ms step_avg:150.89ms step:546/1480 train_time:80882ms step_avg:150.90ms step:547/1480 train_time:81035ms step_avg:150.90ms step:548/1480 train_time:81189ms step_avg:150.91ms step:549/1480 train_time:81342ms step_avg:150.91ms step:550/1480 train_time:81497ms step_avg:150.92ms step:551/1480 train_time:81650ms step_avg:150.92ms step:552/1480 train_time:81806ms step_avg:150.93ms step:553/1480 train_time:81962ms step_avg:150.94ms step:554/1480 train_time:82117ms step_avg:150.95ms step:555/1480 train_time:82271ms step_avg:150.96ms step:556/1480 train_time:82425ms step_avg:150.96ms step:557/1480 train_time:82579ms step_avg:150.97ms step:558/1480 train_time:82734ms step_avg:150.98ms step:559/1480 train_time:82889ms step_avg:150.98ms step:560/1480 train_time:83044ms step_avg:150.99ms step:561/1480 train_time:83199ms step_avg:151.00ms step:562/1480 train_time:83353ms step_avg:151.00ms step:563/1480 train_time:83508ms step_avg:151.01ms step:564/1480 train_time:83664ms step_avg:151.02ms step:565/1480 train_time:83819ms step_avg:151.02ms step:566/1480 train_time:83973ms step_avg:151.03ms step:567/1480 train_time:84127ms step_avg:151.04ms step:568/1480 train_time:84282ms step_avg:151.04ms step:569/1480 train_time:84449ms step_avg:151.07ms step:570/1480 train_time:84592ms step_avg:151.06ms step:571/1480 train_time:84746ms step_avg:151.06ms 
step:572/1480 train_time:84901ms step_avg:151.07ms step:573/1480 train_time:85055ms step_avg:151.07ms step:574/1480 train_time:85212ms step_avg:151.09ms step:575/1480 train_time:85366ms step_avg:151.09ms step:576/1480 train_time:85520ms step_avg:151.10ms step:577/1480 train_time:85675ms step_avg:151.10ms step:578/1480 train_time:85829ms step_avg:151.11ms step:579/1480 train_time:85984ms step_avg:151.11ms step:580/1480 train_time:86138ms step_avg:151.12ms step:581/1480 train_time:86293ms step_avg:151.13ms step:582/1480 train_time:86447ms step_avg:151.13ms step:583/1480 train_time:86600ms step_avg:151.13ms step:584/1480 train_time:86755ms step_avg:151.14ms step:585/1480 train_time:86910ms step_avg:151.15ms step:586/1480 train_time:87065ms step_avg:151.15ms step:587/1480 train_time:87219ms step_avg:151.16ms step:588/1480 train_time:87373ms step_avg:151.16ms step:589/1480 train_time:87528ms step_avg:151.17ms step:590/1480 train_time:87683ms step_avg:151.18ms step:591/1480 train_time:87838ms step_avg:151.18ms step:592/1480 train_time:87993ms step_avg:151.19ms step:593/1480 train_time:88149ms step_avg:151.20ms step:594/1480 train_time:88303ms step_avg:151.20ms step:595/1480 train_time:88458ms step_avg:151.21ms step:596/1480 train_time:88614ms step_avg:151.22ms step:597/1480 train_time:88769ms step_avg:151.23ms step:598/1480 train_time:88923ms step_avg:151.23ms step:599/1480 train_time:89078ms step_avg:151.24ms step:600/1480 train_time:89235ms step_avg:151.25ms step:601/1480 train_time:89390ms step_avg:151.25ms step:602/1480 train_time:89545ms step_avg:151.26ms step:603/1480 train_time:89699ms step_avg:151.26ms step:604/1480 train_time:89853ms step_avg:151.27ms step:605/1480 train_time:90008ms step_avg:151.27ms step:606/1480 train_time:90164ms step_avg:151.28ms step:607/1480 train_time:90320ms step_avg:151.29ms step:608/1480 train_time:90475ms step_avg:151.30ms step:609/1480 train_time:90629ms step_avg:151.30ms step:610/1480 train_time:90783ms step_avg:151.31ms 
step:611/1480 train_time:90939ms step_avg:151.31ms step:612/1480 train_time:91094ms step_avg:151.32ms step:613/1480 train_time:91249ms step_avg:151.32ms step:614/1480 train_time:91404ms step_avg:151.33ms step:615/1480 train_time:91558ms step_avg:151.34ms step:616/1480 train_time:91713ms step_avg:151.34ms step:617/1480 train_time:91867ms step_avg:151.35ms step:618/1480 train_time:92021ms step_avg:151.35ms step:619/1480 train_time:92177ms step_avg:151.36ms step:620/1480 train_time:92333ms step_avg:151.37ms step:621/1480 train_time:92490ms step_avg:151.37ms step:622/1480 train_time:92645ms step_avg:151.38ms step:623/1480 train_time:92801ms step_avg:151.39ms step:624/1480 train_time:92956ms step_avg:151.39ms step:625/1480 train_time:93110ms step_avg:151.40ms step:625/1480 val_loss:3.6040 train_time:93181ms step_avg:151.51ms step:626/1480 train_time:93278ms step_avg:151.43ms step:627/1480 train_time:93424ms step_avg:151.42ms step:628/1480 train_time:93578ms step_avg:151.42ms step:629/1480 train_time:93732ms step_avg:151.43ms step:630/1480 train_time:93887ms step_avg:151.43ms step:631/1480 train_time:94040ms step_avg:151.43ms step:632/1480 train_time:94194ms step_avg:151.44ms step:633/1480 train_time:94349ms step_avg:151.44ms step:634/1480 train_time:94505ms step_avg:151.45ms step:635/1480 train_time:94659ms step_avg:151.46ms step:636/1480 train_time:94813ms step_avg:151.46ms step:637/1480 train_time:94968ms step_avg:151.46ms step:638/1480 train_time:95122ms step_avg:151.47ms step:639/1480 train_time:95276ms step_avg:151.47ms step:640/1480 train_time:95431ms step_avg:151.48ms step:641/1480 train_time:95587ms step_avg:151.48ms step:642/1480 train_time:95741ms step_avg:151.49ms step:643/1480 train_time:95895ms step_avg:151.49ms step:644/1480 train_time:96050ms step_avg:151.50ms step:645/1480 train_time:96205ms step_avg:151.50ms step:646/1480 train_time:96360ms step_avg:151.51ms step:647/1480 train_time:96514ms step_avg:151.51ms step:648/1480 train_time:96670ms 
step_avg:151.52ms step:649/1480 train_time:96825ms step_avg:151.53ms step:650/1480 train_time:96981ms step_avg:151.53ms step:651/1480 train_time:97136ms step_avg:151.54ms step:652/1480 train_time:97291ms step_avg:151.54ms step:653/1480 train_time:97445ms step_avg:151.55ms step:654/1480 train_time:97601ms step_avg:151.55ms step:655/1480 train_time:97755ms step_avg:151.56ms step:656/1480 train_time:97910ms step_avg:151.56ms step:657/1480 train_time:98065ms step_avg:151.57ms step:658/1480 train_time:98219ms step_avg:151.57ms step:659/1480 train_time:98374ms step_avg:151.58ms step:660/1480 train_time:98531ms step_avg:151.59ms step:661/1480 train_time:98689ms step_avg:151.60ms step:662/1480 train_time:98845ms step_avg:151.60ms step:663/1480 train_time:99001ms step_avg:151.61ms step:664/1480 train_time:99157ms step_avg:151.62ms step:665/1480 train_time:99313ms step_avg:151.62ms step:666/1480 train_time:99469ms step_avg:151.63ms step:667/1480 train_time:99625ms step_avg:151.64ms step:668/1480 train_time:99782ms step_avg:151.65ms step:669/1480 train_time:99940ms step_avg:151.65ms step:670/1480 train_time:100096ms step_avg:151.66ms step:671/1480 train_time:100251ms step_avg:151.67ms step:672/1480 train_time:100409ms step_avg:151.68ms step:673/1480 train_time:100565ms step_avg:151.68ms step:674/1480 train_time:100722ms step_avg:151.69ms step:675/1480 train_time:100879ms step_avg:151.70ms step:676/1480 train_time:101037ms step_avg:151.71ms step:677/1480 train_time:101193ms step_avg:151.71ms step:678/1480 train_time:101348ms step_avg:151.72ms step:679/1480 train_time:101506ms step_avg:151.73ms step:680/1480 train_time:101661ms step_avg:151.73ms step:681/1480 train_time:101816ms step_avg:151.74ms step:682/1480 train_time:101973ms step_avg:151.75ms step:683/1480 train_time:102130ms step_avg:151.75ms step:684/1480 train_time:102286ms step_avg:151.76ms step:685/1480 train_time:102443ms step_avg:151.77ms step:686/1480 train_time:102599ms step_avg:151.77ms step:687/1480 
train_time:102754ms step_avg:151.78ms step:688/1480 train_time:102912ms step_avg:151.79ms step:689/1480 train_time:103069ms step_avg:151.80ms step:690/1480 train_time:103228ms step_avg:151.81ms step:691/1480 train_time:103384ms step_avg:151.81ms step:692/1480 train_time:103541ms step_avg:151.82ms step:693/1480 train_time:103697ms step_avg:151.83ms step:694/1480 train_time:103853ms step_avg:151.83ms step:695/1480 train_time:104009ms step_avg:151.84ms step:696/1480 train_time:104165ms step_avg:151.84ms step:697/1480 train_time:104320ms step_avg:151.85ms step:698/1480 train_time:104476ms step_avg:151.85ms step:699/1480 train_time:104633ms step_avg:151.86ms step:700/1480 train_time:104790ms step_avg:151.87ms step:701/1480 train_time:104945ms step_avg:151.87ms step:702/1480 train_time:105103ms step_avg:151.88ms step:703/1480 train_time:105260ms step_avg:151.89ms step:704/1480 train_time:105415ms step_avg:151.90ms step:705/1480 train_time:105572ms step_avg:151.90ms step:706/1480 train_time:105732ms step_avg:151.91ms step:707/1480 train_time:105888ms step_avg:151.92ms step:708/1480 train_time:106044ms step_avg:151.93ms step:709/1480 train_time:106199ms step_avg:151.93ms step:710/1480 train_time:106355ms step_avg:151.94ms step:711/1480 train_time:106511ms step_avg:151.94ms step:712/1480 train_time:106670ms step_avg:151.95ms step:713/1480 train_time:106828ms step_avg:151.96ms step:714/1480 train_time:106985ms step_avg:151.97ms step:715/1480 train_time:107140ms step_avg:151.97ms step:716/1480 train_time:107295ms step_avg:151.98ms step:717/1480 train_time:107450ms step_avg:151.98ms step:718/1480 train_time:107607ms step_avg:151.99ms step:719/1480 train_time:107763ms step_avg:151.99ms step:720/1480 train_time:107920ms step_avg:152.00ms step:721/1480 train_time:108077ms step_avg:152.01ms step:722/1480 train_time:108233ms step_avg:152.01ms step:723/1480 train_time:108389ms step_avg:152.02ms step:724/1480 train_time:108544ms step_avg:152.02ms step:725/1480 train_time:108701ms 
step_avg:152.03ms step:726/1480 train_time:108856ms step_avg:152.03ms step:727/1480 train_time:109013ms step_avg:152.04ms step:728/1480 train_time:109170ms step_avg:152.05ms step:729/1480 train_time:109327ms step_avg:152.05ms step:730/1480 train_time:109485ms step_avg:152.06ms step:731/1480 train_time:109642ms step_avg:152.07ms step:732/1480 train_time:109798ms step_avg:152.07ms step:733/1480 train_time:109954ms step_avg:152.08ms step:734/1480 train_time:110110ms step_avg:152.09ms step:735/1480 train_time:110266ms step_avg:152.09ms step:736/1480 train_time:110422ms step_avg:152.10ms step:737/1480 train_time:110578ms step_avg:152.10ms step:738/1480 train_time:110734ms step_avg:152.11ms step:739/1480 train_time:110890ms step_avg:152.11ms step:740/1480 train_time:111048ms step_avg:152.12ms step:741/1480 train_time:111206ms step_avg:152.13ms step:742/1480 train_time:111361ms step_avg:152.13ms step:743/1480 train_time:111517ms step_avg:152.14ms step:744/1480 train_time:111673ms step_avg:152.14ms step:745/1480 train_time:111831ms step_avg:152.15ms step:746/1480 train_time:111988ms step_avg:152.16ms step:747/1480 train_time:112144ms step_avg:152.16ms step:748/1480 train_time:112305ms step_avg:152.17ms step:749/1480 train_time:112462ms step_avg:152.18ms step:750/1480 train_time:112617ms step_avg:152.19ms step:750/1480 val_loss:3.5469 train_time:112690ms step_avg:152.28ms step:751/1480 train_time:112785ms step_avg:152.21ms step:752/1480 train_time:112936ms step_avg:152.20ms step:753/1480 train_time:113092ms step_avg:152.21ms step:754/1480 train_time:113249ms step_avg:152.22ms step:755/1480 train_time:113404ms step_avg:152.22ms step:756/1480 train_time:113559ms step_avg:152.22ms step:757/1480 train_time:113717ms step_avg:152.23ms step:758/1480 train_time:113874ms step_avg:152.24ms step:759/1480 train_time:114041ms step_avg:152.26ms step:760/1480 train_time:114189ms step_avg:152.25ms step:761/1480 train_time:114346ms step_avg:152.26ms step:762/1480 train_time:114501ms 
step_avg:152.26ms step:763/1480 train_time:114658ms step_avg:152.27ms step:764/1480 train_time:114815ms step_avg:152.27ms step:765/1480 train_time:114973ms step_avg:152.28ms step:766/1480 train_time:115130ms step_avg:152.29ms step:767/1480 train_time:115287ms step_avg:152.30ms step:768/1480 train_time:115443ms step_avg:152.30ms step:769/1480 train_time:115600ms step_avg:152.31ms step:770/1480 train_time:115758ms step_avg:152.31ms step:771/1480 train_time:115916ms step_avg:152.32ms step:772/1480 train_time:116074ms step_avg:152.33ms step:773/1480 train_time:116232ms step_avg:152.34ms step:774/1480 train_time:116390ms step_avg:152.34ms step:775/1480 train_time:116549ms step_avg:152.35ms step:776/1480 train_time:116708ms step_avg:152.36ms step:777/1480 train_time:116869ms step_avg:152.37ms step:778/1480 train_time:117028ms step_avg:152.38ms step:779/1480 train_time:117184ms step_avg:152.39ms step:780/1480 train_time:117341ms step_avg:152.39ms step:781/1480 train_time:117498ms step_avg:152.40ms step:782/1480 train_time:117656ms step_avg:152.40ms step:783/1480 train_time:117813ms step_avg:152.41ms step:784/1480 train_time:117972ms step_avg:152.42ms step:785/1480 train_time:118130ms step_avg:152.43ms step:786/1480 train_time:118288ms step_avg:152.43ms step:787/1480 train_time:118445ms step_avg:152.44ms step:788/1480 train_time:118603ms step_avg:152.45ms step:789/1480 train_time:118759ms step_avg:152.45ms step:790/1480 train_time:118917ms step_avg:152.46ms step:791/1480 train_time:119078ms step_avg:152.47ms step:792/1480 train_time:119236ms step_avg:152.48ms step:793/1480 train_time:119393ms step_avg:152.48ms step:794/1480 train_time:119551ms step_avg:152.49ms step:795/1480 train_time:119710ms step_avg:152.50ms step:796/1480 train_time:119871ms step_avg:152.51ms step:797/1480 train_time:120031ms step_avg:152.52ms step:798/1480 train_time:120190ms step_avg:152.53ms step:799/1480 train_time:120351ms step_avg:152.54ms step:800/1480 train_time:120509ms step_avg:152.54ms 
step:801/1480 train_time:120666ms step_avg:152.55ms step:802/1480 train_time:120824ms step_avg:152.56ms step:803/1480 train_time:120981ms step_avg:152.56ms step:804/1480 train_time:121139ms step_avg:152.57ms step:805/1480 train_time:121298ms step_avg:152.58ms step:806/1480 train_time:121455ms step_avg:152.58ms step:807/1480 train_time:121612ms step_avg:152.59ms step:808/1480 train_time:121771ms step_avg:152.60ms step:809/1480 train_time:121929ms step_avg:152.60ms step:810/1480 train_time:122085ms step_avg:152.61ms step:811/1480 train_time:122243ms step_avg:152.61ms step:812/1480 train_time:122400ms step_avg:152.62ms step:813/1480 train_time:122557ms step_avg:152.62ms step:814/1480 train_time:122715ms step_avg:152.63ms step:815/1480 train_time:122872ms step_avg:152.64ms step:816/1480 train_time:123031ms step_avg:152.64ms step:817/1480 train_time:123188ms step_avg:152.65ms step:818/1480 train_time:123346ms step_avg:152.66ms step:819/1480 train_time:123503ms step_avg:152.66ms step:820/1480 train_time:123662ms step_avg:152.67ms step:821/1480 train_time:123819ms step_avg:152.67ms step:822/1480 train_time:123978ms step_avg:152.68ms step:823/1480 train_time:124134ms step_avg:152.69ms step:824/1480 train_time:124292ms step_avg:152.69ms step:825/1480 train_time:124452ms step_avg:152.70ms step:826/1480 train_time:124612ms step_avg:152.71ms step:827/1480 train_time:124772ms step_avg:152.72ms step:828/1480 train_time:124932ms step_avg:152.73ms step:829/1480 train_time:125091ms step_avg:152.74ms step:830/1480 train_time:125251ms step_avg:152.75ms step:831/1480 train_time:125409ms step_avg:152.75ms step:832/1480 train_time:125567ms step_avg:152.76ms step:833/1480 train_time:125724ms step_avg:152.76ms step:834/1480 train_time:125884ms step_avg:152.77ms step:835/1480 train_time:126041ms step_avg:152.78ms step:836/1480 train_time:126201ms step_avg:152.79ms step:837/1480 train_time:126359ms step_avg:152.79ms step:838/1480 train_time:126517ms step_avg:152.80ms step:839/1480 
train_time:126675ms step_avg:152.80ms step:840/1480 train_time:126832ms step_avg:152.81ms step:841/1480 train_time:126990ms step_avg:152.82ms step:842/1480 train_time:127149ms step_avg:152.82ms step:843/1480 train_time:127306ms step_avg:152.83ms step:844/1480 train_time:127462ms step_avg:152.83ms step:845/1480 train_time:127620ms step_avg:152.84ms step:846/1480 train_time:127780ms step_avg:152.85ms step:847/1480 train_time:127937ms step_avg:152.85ms step:848/1480 train_time:128096ms step_avg:152.86ms step:849/1480 train_time:128253ms step_avg:152.86ms step:850/1480 train_time:128413ms step_avg:152.87ms step:851/1480 train_time:128573ms step_avg:152.88ms step:852/1480 train_time:128732ms step_avg:152.89ms step:853/1480 train_time:128890ms step_avg:152.89ms step:854/1480 train_time:129051ms step_avg:152.90ms step:855/1480 train_time:129207ms step_avg:152.91ms step:856/1480 train_time:129365ms step_avg:152.91ms step:857/1480 train_time:129523ms step_avg:152.92ms step:858/1480 train_time:129682ms step_avg:152.93ms step:859/1480 train_time:129839ms step_avg:152.93ms step:860/1480 train_time:129997ms step_avg:152.94ms step:861/1480 train_time:130155ms step_avg:152.94ms step:862/1480 train_time:130317ms step_avg:152.95ms step:863/1480 train_time:130478ms step_avg:152.96ms step:864/1480 train_time:130635ms step_avg:152.97ms step:865/1480 train_time:130794ms step_avg:152.98ms step:866/1480 train_time:130954ms step_avg:152.98ms step:867/1480 train_time:131113ms step_avg:152.99ms step:868/1480 train_time:131271ms step_avg:153.00ms step:869/1480 train_time:131429ms step_avg:153.00ms step:870/1480 train_time:131586ms step_avg:153.01ms step:871/1480 train_time:131743ms step_avg:153.01ms step:872/1480 train_time:131900ms step_avg:153.02ms step:873/1480 train_time:132058ms step_avg:153.02ms step:874/1480 train_time:132216ms step_avg:153.03ms step:875/1480 train_time:132376ms step_avg:153.04ms step:875/1480 val_loss:3.5031 train_time:132447ms step_avg:153.12ms step:876/1480 
train_time:132539ms step_avg:153.05ms step:877/1480 train_time:132695ms step_avg:153.05ms step:878/1480 train_time:132853ms step_avg:153.06ms step:879/1480 train_time:133010ms step_avg:153.06ms step:880/1480 train_time:133169ms step_avg:153.07ms step:881/1480 train_time:133327ms step_avg:153.07ms step:882/1480 train_time:133487ms step_avg:153.08ms step:883/1480 train_time:133647ms step_avg:153.09ms step:884/1480 train_time:133809ms step_avg:153.10ms step:885/1480 train_time:133969ms step_avg:153.11ms step:886/1480 train_time:134130ms step_avg:153.12ms step:887/1480 train_time:134289ms step_avg:153.12ms step:888/1480 train_time:134453ms step_avg:153.14ms step:889/1480 train_time:134613ms step_avg:153.14ms step:890/1480 train_time:134771ms step_avg:153.15ms step:891/1480 train_time:134932ms step_avg:153.16ms step:892/1480 train_time:135092ms step_avg:153.17ms step:893/1480 train_time:135251ms step_avg:153.17ms step:894/1480 train_time:135411ms step_avg:153.18ms step:895/1480 train_time:135574ms step_avg:153.19ms step:896/1480 train_time:135732ms step_avg:153.20ms step:897/1480 train_time:135892ms step_avg:153.20ms step:898/1480 train_time:136052ms step_avg:153.21ms step:899/1480 train_time:136211ms step_avg:153.22ms step:900/1480 train_time:136369ms step_avg:153.22ms step:901/1480 train_time:136529ms step_avg:153.23ms step:902/1480 train_time:136687ms step_avg:153.24ms step:903/1480 train_time:136849ms step_avg:153.25ms step:904/1480 train_time:137008ms step_avg:153.25ms step:905/1480 train_time:137167ms step_avg:153.26ms step:906/1480 train_time:137325ms step_avg:153.26ms step:907/1480 train_time:137489ms step_avg:153.28ms step:908/1480 train_time:137648ms step_avg:153.28ms step:909/1480 train_time:137808ms step_avg:153.29ms step:910/1480 train_time:137973ms step_avg:153.30ms step:911/1480 train_time:138131ms step_avg:153.31ms step:912/1480 train_time:138291ms step_avg:153.32ms step:913/1480 train_time:138452ms step_avg:153.32ms step:914/1480 train_time:138612ms 
step_avg:153.33ms step:915/1480 train_time:138776ms step_avg:153.34ms step:916/1480 train_time:138936ms step_avg:153.35ms step:917/1480 train_time:139095ms step_avg:153.36ms step:918/1480 train_time:139257ms step_avg:153.37ms step:919/1480 train_time:139418ms step_avg:153.37ms step:920/1480 train_time:139576ms step_avg:153.38ms step:921/1480 train_time:139736ms step_avg:153.39ms step:922/1480 train_time:139899ms step_avg:153.40ms step:923/1480 train_time:140057ms step_avg:153.40ms step:924/1480 train_time:140215ms step_avg:153.41ms step:925/1480 train_time:140375ms step_avg:153.42ms step:926/1480 train_time:140534ms step_avg:153.42ms step:927/1480 train_time:140692ms step_avg:153.43ms step:928/1480 train_time:140851ms step_avg:153.43ms step:929/1480 train_time:141010ms step_avg:153.44ms step:930/1480 train_time:141171ms step_avg:153.45ms step:931/1480 train_time:141331ms step_avg:153.45ms step:932/1480 train_time:141489ms step_avg:153.46ms step:933/1480 train_time:141649ms step_avg:153.47ms step:934/1480 train_time:141808ms step_avg:153.47ms step:935/1480 train_time:141969ms step_avg:153.48ms step:936/1480 train_time:142128ms step_avg:153.49ms step:937/1480 train_time:142290ms step_avg:153.49ms step:938/1480 train_time:142449ms step_avg:153.50ms step:939/1480 train_time:142611ms step_avg:153.51ms step:940/1480 train_time:142773ms step_avg:153.52ms step:941/1480 train_time:142931ms step_avg:153.52ms step:942/1480 train_time:143089ms step_avg:153.53ms step:943/1480 train_time:143251ms step_avg:153.54ms step:944/1480 train_time:143412ms step_avg:153.55ms step:945/1480 train_time:143573ms step_avg:153.55ms step:946/1480 train_time:143735ms step_avg:153.56ms step:947/1480 train_time:143896ms step_avg:153.57ms step:948/1480 train_time:144054ms step_avg:153.58ms step:949/1480 train_time:144223ms step_avg:153.59ms step:950/1480 train_time:144374ms step_avg:153.59ms step:951/1480 train_time:144536ms step_avg:153.60ms step:952/1480 train_time:144695ms step_avg:153.60ms 
step:953/1480 train_time:144855ms step_avg:153.61ms step:954/1480 train_time:145016ms step_avg:153.62ms step:955/1480 train_time:145174ms step_avg:153.62ms step:956/1480 train_time:145331ms step_avg:153.63ms step:957/1480 train_time:145493ms step_avg:153.64ms step:958/1480 train_time:145657ms step_avg:153.65ms step:959/1480 train_time:145814ms step_avg:153.65ms step:960/1480 train_time:145975ms step_avg:153.66ms step:961/1480 train_time:146134ms step_avg:153.66ms step:962/1480 train_time:146292ms step_avg:153.67ms step:963/1480 train_time:146454ms step_avg:153.68ms step:964/1480 train_time:146615ms step_avg:153.68ms step:965/1480 train_time:146774ms step_avg:153.69ms step:966/1480 train_time:146933ms step_avg:153.70ms step:967/1480 train_time:147090ms step_avg:153.70ms step:968/1480 train_time:147251ms step_avg:153.71ms step:969/1480 train_time:147410ms step_avg:153.71ms step:970/1480 train_time:147569ms step_avg:153.72ms step:971/1480 train_time:147729ms step_avg:153.72ms step:972/1480 train_time:147888ms step_avg:153.73ms step:973/1480 train_time:148046ms step_avg:153.73ms step:974/1480 train_time:148205ms step_avg:153.74ms step:975/1480 train_time:148367ms step_avg:153.75ms step:976/1480 train_time:148528ms step_avg:153.76ms step:977/1480 train_time:148688ms step_avg:153.76ms step:978/1480 train_time:148849ms step_avg:153.77ms step:979/1480 train_time:149010ms step_avg:153.78ms step:980/1480 train_time:149170ms step_avg:153.78ms step:981/1480 train_time:149332ms step_avg:153.79ms step:982/1480 train_time:149491ms step_avg:153.80ms step:983/1480 train_time:149653ms step_avg:153.81ms step:984/1480 train_time:149811ms step_avg:153.81ms step:985/1480 train_time:149974ms step_avg:153.82ms step:986/1480 train_time:150134ms step_avg:153.83ms step:987/1480 train_time:150292ms step_avg:153.83ms step:988/1480 train_time:150451ms step_avg:153.84ms step:989/1480 train_time:150611ms step_avg:153.84ms step:990/1480 train_time:150772ms step_avg:153.85ms step:991/1480 
train_time:150933ms step_avg:153.86ms step:992/1480 train_time:151098ms step_avg:153.87ms step:993/1480 train_time:151266ms step_avg:153.88ms step:994/1480 train_time:151426ms step_avg:153.89ms step:995/1480 train_time:151584ms step_avg:153.89ms step:996/1480 train_time:151741ms step_avg:153.90ms step:997/1480 train_time:151901ms step_avg:153.90ms step:998/1480 train_time:152059ms step_avg:153.91ms step:999/1480 train_time:152218ms step_avg:153.91ms step:1000/1480 train_time:152378ms step_avg:153.92ms step:1000/1480 val_loss:3.4382 train_time:152452ms step_avg:153.99ms step:1001/1480 train_time:152549ms step_avg:153.93ms step:1002/1480 train_time:152702ms step_avg:153.93ms step:1003/1480 train_time:152867ms step_avg:153.94ms step:1004/1480 train_time:153029ms step_avg:153.95ms step:1005/1480 train_time:153188ms step_avg:153.96ms step:1006/1480 train_time:153348ms step_avg:153.96ms step:1007/1480 train_time:153509ms step_avg:153.97ms step:1008/1480 train_time:153669ms step_avg:153.98ms step:1009/1480 train_time:153835ms step_avg:153.99ms step:1010/1480 train_time:153996ms step_avg:154.00ms step:1011/1480 train_time:154155ms step_avg:154.00ms step:1012/1480 train_time:154316ms step_avg:154.01ms step:1013/1480 train_time:154477ms step_avg:154.01ms step:1014/1480 train_time:154640ms step_avg:154.02ms step:1015/1480 train_time:154802ms step_avg:154.03ms step:1016/1480 train_time:154962ms step_avg:154.04ms step:1017/1480 train_time:155124ms step_avg:154.05ms step:1018/1480 train_time:155284ms step_avg:154.05ms step:1019/1480 train_time:155446ms step_avg:154.06ms step:1020/1480 train_time:155606ms step_avg:154.07ms step:1021/1480 train_time:155765ms step_avg:154.07ms step:1022/1480 train_time:155924ms step_avg:154.07ms step:1023/1480 train_time:156086ms step_avg:154.08ms step:1024/1480 train_time:156247ms step_avg:154.09ms step:1025/1480 train_time:156410ms step_avg:154.10ms step:1026/1480 train_time:156570ms step_avg:154.10ms step:1027/1480 train_time:156730ms 
step_avg:154.11ms step:1028/1480 train_time:156892ms step_avg:154.12ms step:1029/1480 train_time:157056ms step_avg:154.13ms step:1030/1480 train_time:157216ms step_avg:154.13ms step:1031/1480 train_time:157374ms step_avg:154.14ms step:1032/1480 train_time:157540ms step_avg:154.15ms step:1033/1480 train_time:157700ms step_avg:154.15ms step:1034/1480 train_time:157860ms step_avg:154.16ms step:1035/1480 train_time:158021ms step_avg:154.17ms step:1036/1480 train_time:158180ms step_avg:154.17ms step:1037/1480 train_time:158341ms step_avg:154.18ms step:1038/1480 train_time:158501ms step_avg:154.18ms step:1039/1480 train_time:158663ms step_avg:154.19ms step:1040/1480 train_time:158822ms step_avg:154.20ms step:1041/1480 train_time:158981ms step_avg:154.20ms step:1042/1480 train_time:159139ms step_avg:154.20ms step:1043/1480 train_time:159299ms step_avg:154.21ms step:1044/1480 train_time:159459ms step_avg:154.22ms step:1045/1480 train_time:159620ms step_avg:154.22ms step:1046/1480 train_time:159781ms step_avg:154.23ms step:1047/1480 train_time:159941ms step_avg:154.23ms step:1048/1480 train_time:160101ms step_avg:154.24ms step:1049/1480 train_time:160261ms step_avg:154.25ms step:1050/1480 train_time:160423ms step_avg:154.25ms step:1051/1480 train_time:160585ms step_avg:154.26ms step:1052/1480 train_time:160744ms step_avg:154.26ms step:1053/1480 train_time:160903ms step_avg:154.27ms step:1054/1480 train_time:161064ms step_avg:154.28ms step:1055/1480 train_time:161223ms step_avg:154.28ms step:1056/1480 train_time:161381ms step_avg:154.28ms step:1057/1480 train_time:161541ms step_avg:154.29ms step:1058/1480 train_time:161703ms step_avg:154.30ms step:1059/1480 train_time:161865ms step_avg:154.30ms step:1060/1480 train_time:162027ms step_avg:154.31ms step:1061/1480 train_time:162183ms step_avg:154.31ms step:1062/1480 train_time:162343ms step_avg:154.32ms step:1063/1480 train_time:162502ms step_avg:154.32ms step:1064/1480 train_time:162660ms step_avg:154.33ms step:1065/1480 
train_time:162822ms step_avg:154.33ms step:1066/1480 train_time:162983ms step_avg:154.34ms step:1067/1480 train_time:163144ms step_avg:154.35ms step:1068/1480 train_time:163304ms step_avg:154.35ms step:1069/1480 train_time:163468ms step_avg:154.36ms step:1070/1480 train_time:163628ms step_avg:154.37ms step:1071/1480 train_time:163793ms step_avg:154.38ms step:1072/1480 train_time:163952ms step_avg:154.38ms step:1073/1480 train_time:164111ms step_avg:154.38ms step:1074/1480 train_time:164271ms step_avg:154.39ms step:1075/1480 train_time:164431ms step_avg:154.40ms step:1076/1480 train_time:164591ms step_avg:154.40ms step:1077/1480 train_time:164750ms step_avg:154.40ms step:1078/1480 train_time:164916ms step_avg:154.42ms step:1079/1480 train_time:165080ms step_avg:154.42ms step:1080/1480 train_time:165242ms step_avg:154.43ms step:1081/1480 train_time:165402ms step_avg:154.44ms step:1082/1480 train_time:165562ms step_avg:154.44ms step:1083/1480 train_time:165722ms step_avg:154.45ms step:1084/1480 train_time:165881ms step_avg:154.45ms step:1085/1480 train_time:166043ms step_avg:154.46ms step:1086/1480 train_time:166203ms step_avg:154.46ms step:1087/1480 train_time:166364ms step_avg:154.47ms step:1088/1480 train_time:166524ms step_avg:154.48ms step:1089/1480 train_time:166687ms step_avg:154.48ms step:1090/1480 train_time:166851ms step_avg:154.49ms step:1091/1480 train_time:167011ms step_avg:154.50ms step:1092/1480 train_time:167172ms step_avg:154.50ms step:1093/1480 train_time:167334ms step_avg:154.51ms step:1094/1480 train_time:167495ms step_avg:154.52ms step:1095/1480 train_time:167655ms step_avg:154.52ms step:1096/1480 train_time:167819ms step_avg:154.53ms step:1097/1480 train_time:167980ms step_avg:154.54ms step:1098/1480 train_time:168142ms step_avg:154.54ms step:1099/1480 train_time:168304ms step_avg:154.55ms step:1100/1480 train_time:168466ms step_avg:154.56ms step:1101/1480 train_time:168629ms step_avg:154.56ms step:1102/1480 train_time:168792ms step_avg:154.57ms 
step:1103/1480 train_time:168956ms step_avg:154.58ms step:1104/1480 train_time:169118ms step_avg:154.59ms step:1105/1480 train_time:169283ms step_avg:154.60ms step:1106/1480 train_time:169443ms step_avg:154.60ms step:1107/1480 train_time:169604ms step_avg:154.61ms step:1108/1480 train_time:169763ms step_avg:154.61ms step:1109/1480 train_time:169922ms step_avg:154.62ms step:1110/1480 train_time:170083ms step_avg:154.62ms step:1111/1480 train_time:170244ms step_avg:154.63ms step:1112/1480 train_time:170407ms step_avg:154.63ms step:1113/1480 train_time:170577ms step_avg:154.65ms step:1114/1480 train_time:170741ms step_avg:154.66ms step:1115/1480 train_time:170902ms step_avg:154.66ms step:1116/1480 train_time:171062ms step_avg:154.67ms step:1117/1480 train_time:171225ms step_avg:154.67ms step:1118/1480 train_time:171389ms step_avg:154.68ms step:1119/1480 train_time:171549ms step_avg:154.69ms step:1120/1480 train_time:171710ms step_avg:154.69ms step:1121/1480 train_time:171872ms step_avg:154.70ms step:1122/1480 train_time:172034ms step_avg:154.71ms step:1123/1480 train_time:172196ms step_avg:154.71ms step:1124/1480 train_time:172358ms step_avg:154.72ms step:1125/1480 train_time:172520ms step_avg:154.73ms step:1125/1480 val_loss:3.3827 train_time:172595ms step_avg:154.79ms step:1126/1480 train_time:172691ms step_avg:154.74ms step:1127/1480 train_time:172846ms step_avg:154.74ms step:1128/1480 train_time:173007ms step_avg:154.75ms step:1129/1480 train_time:173171ms step_avg:154.76ms step:1130/1480 train_time:173333ms step_avg:154.76ms step:1131/1480 train_time:173499ms step_avg:154.77ms step:1132/1480 train_time:173659ms step_avg:154.78ms step:1133/1480 train_time:173822ms step_avg:154.78ms step:1134/1480 train_time:173984ms step_avg:154.79ms step:1135/1480 train_time:174145ms step_avg:154.80ms step:1136/1480 train_time:174309ms step_avg:154.80ms step:1137/1480 train_time:174470ms step_avg:154.81ms step:1138/1480 train_time:174636ms step_avg:154.82ms step:1139/1480 
train_time:174805ms step_avg:154.83ms step:1140/1480 train_time:174960ms step_avg:154.83ms step:1141/1480 train_time:175123ms step_avg:154.84ms step:1142/1480 train_time:175284ms step_avg:154.84ms step:1143/1480 train_time:175448ms step_avg:154.85ms step:1144/1480 train_time:175609ms step_avg:154.86ms step:1145/1480 train_time:175767ms step_avg:154.86ms step:1146/1480 train_time:175930ms step_avg:154.87ms step:1147/1480 train_time:176091ms step_avg:154.87ms step:1148/1480 train_time:176254ms step_avg:154.88ms step:1149/1480 train_time:176416ms step_avg:154.89ms step:1150/1480 train_time:176577ms step_avg:154.89ms step:1151/1480 train_time:176740ms step_avg:154.90ms step:1152/1480 train_time:176903ms step_avg:154.91ms step:1153/1480 train_time:177067ms step_avg:154.91ms step:1154/1480 train_time:177228ms step_avg:154.92ms step:1155/1480 train_time:177390ms step_avg:154.93ms step:1156/1480 train_time:177557ms step_avg:154.94ms step:1157/1480 train_time:177720ms step_avg:154.94ms step:1158/1480 train_time:177881ms step_avg:154.95ms step:1159/1480 train_time:178041ms step_avg:154.95ms step:1160/1480 train_time:178201ms step_avg:154.96ms step:1161/1480 train_time:178364ms step_avg:154.96ms step:1162/1480 train_time:178526ms step_avg:154.97ms step:1163/1480 train_time:178689ms step_avg:154.98ms step:1164/1480 train_time:178853ms step_avg:154.99ms step:1165/1480 train_time:179013ms step_avg:154.99ms step:1166/1480 train_time:179175ms step_avg:155.00ms step:1167/1480 train_time:179335ms step_avg:155.00ms step:1168/1480 train_time:179497ms step_avg:155.01ms step:1169/1480 train_time:179659ms step_avg:155.01ms step:1170/1480 train_time:179820ms step_avg:155.02ms step:1171/1480 train_time:179982ms step_avg:155.02ms step:1172/1480 train_time:180141ms step_avg:155.03ms step:1173/1480 train_time:180306ms step_avg:155.04ms step:1174/1480 train_time:180478ms step_avg:155.05ms step:1175/1480 train_time:180640ms step_avg:155.06ms step:1176/1480 train_time:180803ms step_avg:155.06ms 
step:1177/1480 train_time:180971ms step_avg:155.07ms step:1178/1480 train_time:181133ms step_avg:155.08ms step:1179/1480 train_time:181292ms step_avg:155.08ms step:1180/1480 train_time:181461ms step_avg:155.10ms step:1181/1480 train_time:181623ms step_avg:155.10ms step:1182/1480 train_time:181784ms step_avg:155.11ms step:1183/1480 train_time:181945ms step_avg:155.11ms step:1184/1480 train_time:182106ms step_avg:155.12ms step:1185/1480 train_time:182269ms step_avg:155.12ms step:1186/1480 train_time:182434ms step_avg:155.13ms step:1187/1480 train_time:182603ms step_avg:155.14ms step:1188/1480 train_time:182762ms step_avg:155.15ms step:1189/1480 train_time:182923ms step_avg:155.15ms step:1190/1480 train_time:183085ms step_avg:155.16ms step:1191/1480 train_time:183249ms step_avg:155.16ms step:1192/1480 train_time:183411ms step_avg:155.17ms step:1193/1480 train_time:183571ms step_avg:155.17ms step:1194/1480 train_time:183732ms step_avg:155.18ms step:1195/1480 train_time:183896ms step_avg:155.19ms step:1196/1480 train_time:184069ms step_avg:155.20ms step:1197/1480 train_time:184230ms step_avg:155.21ms step:1198/1480 train_time:184400ms step_avg:155.22ms step:1199/1480 train_time:184563ms step_avg:155.23ms step:1200/1480 train_time:184724ms step_avg:155.23ms step:1201/1480 train_time:184884ms step_avg:155.23ms step:1202/1480 train_time:185053ms step_avg:155.25ms step:1203/1480 train_time:185220ms step_avg:155.26ms step:1204/1480 train_time:185385ms step_avg:155.26ms step:1205/1480 train_time:185546ms step_avg:155.27ms step:1206/1480 train_time:185707ms step_avg:155.27ms step:1207/1480 train_time:185867ms step_avg:155.28ms step:1208/1480 train_time:186029ms step_avg:155.28ms step:1209/1480 train_time:186193ms step_avg:155.29ms step:1210/1480 train_time:186359ms step_avg:155.30ms step:1211/1480 train_time:186522ms step_avg:155.31ms step:1212/1480 train_time:186685ms step_avg:155.31ms step:1213/1480 train_time:186849ms step_avg:155.32ms step:1214/1480 train_time:187017ms 
step_avg:155.33ms step:1215/1480 train_time:187180ms step_avg:155.34ms step:1216/1480 train_time:187341ms step_avg:155.34ms step:1217/1480 train_time:187505ms step_avg:155.35ms step:1218/1480 train_time:187666ms step_avg:155.35ms step:1219/1480 train_time:187835ms step_avg:155.36ms step:1220/1480 train_time:187998ms step_avg:155.37ms step:1221/1480 train_time:188160ms step_avg:155.38ms step:1222/1480 train_time:188321ms step_avg:155.38ms step:1223/1480 train_time:188484ms step_avg:155.39ms step:1224/1480 train_time:188650ms step_avg:155.40ms step:1225/1480 train_time:188814ms step_avg:155.40ms step:1226/1480 train_time:188980ms step_avg:155.41ms step:1227/1480 train_time:189143ms step_avg:155.42ms step:1228/1480 train_time:189304ms step_avg:155.42ms step:1229/1480 train_time:189468ms step_avg:155.43ms step:1230/1480 train_time:189638ms step_avg:155.44ms step:1231/1480 train_time:189803ms step_avg:155.45ms step:1232/1480 train_time:189969ms step_avg:155.46ms step:1233/1480 train_time:190130ms step_avg:155.46ms step:1234/1480 train_time:190292ms step_avg:155.47ms step:1235/1480 train_time:190456ms step_avg:155.47ms step:1236/1480 train_time:190618ms step_avg:155.48ms step:1237/1480 train_time:190779ms step_avg:155.48ms step:1238/1480 train_time:190951ms step_avg:155.50ms step:1239/1480 train_time:191113ms step_avg:155.50ms step:1240/1480 train_time:191278ms step_avg:155.51ms step:1241/1480 train_time:191442ms step_avg:155.52ms step:1242/1480 train_time:191603ms step_avg:155.52ms step:1243/1480 train_time:191767ms step_avg:155.53ms step:1244/1480 train_time:191930ms step_avg:155.53ms step:1245/1480 train_time:192093ms step_avg:155.54ms step:1246/1480 train_time:192256ms step_avg:155.55ms step:1247/1480 train_time:192419ms step_avg:155.55ms step:1248/1480 train_time:192581ms step_avg:155.56ms step:1249/1480 train_time:192742ms step_avg:155.56ms step:1250/1480 train_time:192902ms step_avg:155.57ms step:1250/1480 val_loss:3.3336 train_time:192977ms step_avg:155.63ms 
step:1251/1480 train_time:193070ms step_avg:155.58ms step:1252/1480 train_time:193234ms step_avg:155.58ms step:1253/1480 train_time:193395ms step_avg:155.59ms step:1254/1480 train_time:193557ms step_avg:155.59ms step:1255/1480 train_time:193727ms step_avg:155.60ms step:1256/1480 train_time:193891ms step_avg:155.61ms step:1257/1480 train_time:194053ms step_avg:155.62ms step:1258/1480 train_time:194217ms step_avg:155.62ms step:1259/1480 train_time:194380ms step_avg:155.63ms step:1260/1480 train_time:194540ms step_avg:155.63ms step:1261/1480 train_time:194702ms step_avg:155.64ms step:1262/1480 train_time:194868ms step_avg:155.65ms step:1263/1480 train_time:195035ms step_avg:155.65ms step:1264/1480 train_time:195193ms step_avg:155.66ms step:1265/1480 train_time:195354ms step_avg:155.66ms step:1266/1480 train_time:195517ms step_avg:155.67ms step:1267/1480 train_time:195678ms step_avg:155.67ms step:1268/1480 train_time:195842ms step_avg:155.68ms step:1269/1480 train_time:196009ms step_avg:155.69ms step:1270/1480 train_time:196172ms step_avg:155.69ms step:1271/1480 train_time:196335ms step_avg:155.70ms step:1272/1480 train_time:196495ms step_avg:155.70ms step:1273/1480 train_time:196657ms step_avg:155.71ms step:1274/1480 train_time:196822ms step_avg:155.71ms step:1275/1480 train_time:196984ms step_avg:155.72ms step:1276/1480 train_time:197145ms step_avg:155.72ms step:1277/1480 train_time:197307ms step_avg:155.73ms step:1278/1480 train_time:197468ms step_avg:155.73ms step:1279/1480 train_time:197631ms step_avg:155.74ms step:1280/1480 train_time:197797ms step_avg:155.75ms step:1281/1480 train_time:197959ms step_avg:155.75ms step:1282/1480 train_time:198118ms step_avg:155.75ms step:1283/1480 train_time:198281ms step_avg:155.76ms step:1284/1480 train_time:198444ms step_avg:155.76ms step:1285/1480 train_time:198607ms step_avg:155.77ms step:1286/1480 train_time:198769ms step_avg:155.77ms step:1287/1480 train_time:198931ms step_avg:155.78ms step:1288/1480 train_time:199093ms 
step_avg:155.78ms step:1289/1480 train_time:199263ms step_avg:155.80ms step:1290/1480 train_time:199433ms step_avg:155.81ms step:1291/1480 train_time:199596ms step_avg:155.81ms step:1292/1480 train_time:199758ms step_avg:155.82ms step:1293/1480 train_time:199927ms step_avg:155.83ms step:1294/1480 train_time:200091ms step_avg:155.83ms step:1295/1480 train_time:200254ms step_avg:155.84ms step:1296/1480 train_time:200416ms step_avg:155.84ms step:1297/1480 train_time:200580ms step_avg:155.85ms step:1298/1480 train_time:200742ms step_avg:155.86ms step:1299/1480 train_time:200904ms step_avg:155.86ms step:1300/1480 train_time:201065ms step_avg:155.86ms step:1301/1480 train_time:201229ms step_avg:155.87ms step:1302/1480 train_time:201394ms step_avg:155.88ms step:1303/1480 train_time:201560ms step_avg:155.89ms step:1304/1480 train_time:201726ms step_avg:155.89ms step:1305/1480 train_time:201888ms step_avg:155.90ms step:1306/1480 train_time:202053ms step_avg:155.91ms step:1307/1480 train_time:202213ms step_avg:155.91ms step:1308/1480 train_time:202377ms step_avg:155.91ms step:1309/1480 train_time:202543ms step_avg:155.92ms step:1310/1480 train_time:202706ms step_avg:155.93ms step:1311/1480 train_time:202868ms step_avg:155.93ms step:1312/1480 train_time:203033ms step_avg:155.94ms step:1313/1480 train_time:203193ms step_avg:155.94ms step:1314/1480 train_time:203357ms step_avg:155.95ms step:1315/1480 train_time:203519ms step_avg:155.95ms step:1316/1480 train_time:203677ms step_avg:155.95ms step:1317/1480 train_time:203840ms step_avg:155.96ms step:1318/1480 train_time:204008ms step_avg:155.97ms step:1319/1480 train_time:204175ms step_avg:155.98ms step:1320/1480 train_time:204342ms step_avg:155.99ms step:1321/1480 train_time:204506ms step_avg:155.99ms step:1322/1480 train_time:204676ms step_avg:156.00ms step:1323/1480 train_time:204840ms step_avg:156.01ms step:1324/1480 train_time:205003ms step_avg:156.01ms step:1325/1480 train_time:205172ms step_avg:156.02ms step:1326/1480 
train_time:205338ms step_avg:156.03ms step:1327/1480 train_time:205500ms step_avg:156.04ms step:1328/1480 train_time:205662ms step_avg:156.04ms step:1329/1480 train_time:205846ms step_avg:156.06ms step:1330/1480 train_time:206013ms step_avg:156.07ms step:1331/1480 train_time:206175ms step_avg:156.07ms step:1332/1480 train_time:206339ms step_avg:156.08ms step:1333/1480 train_time:206505ms step_avg:156.09ms step:1334/1480 train_time:206668ms step_avg:156.09ms step:1335/1480 train_time:206829ms step_avg:156.10ms step:1336/1480 train_time:206999ms step_avg:156.11ms step:1337/1480 train_time:207168ms step_avg:156.12ms step:1338/1480 train_time:207333ms step_avg:156.12ms step:1339/1480 train_time:207496ms step_avg:156.13ms step:1340/1480 train_time:207659ms step_avg:156.13ms step:1341/1480 train_time:207821ms step_avg:156.14ms step:1342/1480 train_time:207987ms step_avg:156.15ms step:1343/1480 train_time:208150ms step_avg:156.15ms step:1344/1480 train_time:208315ms step_avg:156.16ms step:1345/1480 train_time:208483ms step_avg:156.17ms step:1346/1480 train_time:208645ms step_avg:156.17ms step:1347/1480 train_time:208808ms step_avg:156.18ms step:1348/1480 train_time:208972ms step_avg:156.18ms step:1349/1480 train_time:209134ms step_avg:156.19ms step:1350/1480 train_time:209300ms step_avg:156.19ms step:1351/1480 train_time:209462ms step_avg:156.20ms step:1352/1480 train_time:209626ms step_avg:156.20ms step:1353/1480 train_time:209791ms step_avg:156.21ms step:1354/1480 train_time:209956ms step_avg:156.22ms step:1355/1480 train_time:210117ms step_avg:156.22ms step:1356/1480 train_time:210281ms step_avg:156.23ms step:1357/1480 train_time:210446ms step_avg:156.23ms step:1358/1480 train_time:210609ms step_avg:156.24ms step:1359/1480 train_time:210773ms step_avg:156.24ms step:1360/1480 train_time:210939ms step_avg:156.25ms step:1361/1480 train_time:211105ms step_avg:156.26ms step:1362/1480 train_time:211272ms step_avg:156.27ms step:1363/1480 train_time:211441ms step_avg:156.28ms 
step:1364/1480 train_time:211603ms step_avg:156.28ms step:1365/1480 train_time:211762ms step_avg:156.28ms step:1366/1480 train_time:211927ms step_avg:156.29ms step:1367/1480 train_time:212090ms step_avg:156.29ms step:1368/1480 train_time:212256ms step_avg:156.30ms step:1369/1480 train_time:212427ms step_avg:156.31ms step:1370/1480 train_time:212592ms step_avg:156.32ms step:1371/1480 train_time:212755ms step_avg:156.32ms step:1372/1480 train_time:212923ms step_avg:156.33ms step:1373/1480 train_time:213085ms step_avg:156.34ms step:1374/1480 train_time:213252ms step_avg:156.34ms step:1375/1480 train_time:213414ms step_avg:156.35ms step:1375/1480 val_loss:3.2949 train_time:213489ms step_avg:156.40ms step:1376/1480 train_time:213584ms step_avg:156.36ms step:1377/1480 train_time:213743ms step_avg:156.36ms step:1378/1480 train_time:213905ms step_avg:156.36ms step:1379/1480 train_time:214070ms step_avg:156.37ms step:1380/1480 train_time:214235ms step_avg:156.38ms step:1381/1480 train_time:214403ms step_avg:156.38ms step:1382/1480 train_time:214566ms step_avg:156.39ms step:1383/1480 train_time:214729ms step_avg:156.39ms step:1384/1480 train_time:214896ms step_avg:156.40ms step:1385/1480 train_time:215057ms step_avg:156.41ms step:1386/1480 train_time:215221ms step_avg:156.41ms step:1387/1480 train_time:215387ms step_avg:156.42ms step:1388/1480 train_time:215547ms step_avg:156.42ms step:1389/1480 train_time:215712ms step_avg:156.43ms step:1390/1480 train_time:215874ms step_avg:156.43ms step:1391/1480 train_time:216038ms step_avg:156.44ms step:1392/1480 train_time:216202ms step_avg:156.44ms step:1393/1480 train_time:216365ms step_avg:156.45ms step:1394/1480 train_time:216528ms step_avg:156.45ms step:1395/1480 train_time:216689ms step_avg:156.45ms step:1396/1480 train_time:216850ms step_avg:156.46ms step:1397/1480 train_time:217011ms step_avg:156.46ms step:1398/1480 train_time:217172ms step_avg:156.46ms step:1399/1480 train_time:217335ms step_avg:156.47ms step:1400/1480 
train_time:217502ms step_avg:156.48ms step:1401/1480 train_time:217663ms step_avg:156.48ms step:1402/1480 train_time:217825ms step_avg:156.48ms step:1403/1480 train_time:217993ms step_avg:156.49ms step:1404/1480 train_time:218158ms step_avg:156.50ms step:1405/1480 train_time:218322ms step_avg:156.50ms step:1406/1480 train_time:218487ms step_avg:156.51ms step:1407/1480 train_time:218648ms step_avg:156.51ms step:1408/1480 train_time:218808ms step_avg:156.52ms step:1409/1480 train_time:218982ms step_avg:156.53ms step:1410/1480 train_time:219144ms step_avg:156.53ms step:1411/1480 train_time:219305ms step_avg:156.53ms step:1412/1480 train_time:219467ms step_avg:156.54ms step:1413/1480 train_time:219630ms step_avg:156.54ms step:1414/1480 train_time:219795ms step_avg:156.55ms step:1415/1480 train_time:219960ms step_avg:156.55ms step:1416/1480 train_time:220135ms step_avg:156.57ms step:1417/1480 train_time:220303ms step_avg:156.58ms step:1418/1480 train_time:220467ms step_avg:156.58ms step:1419/1480 train_time:220631ms step_avg:156.59ms step:1420/1480 train_time:220796ms step_avg:156.59ms step:1421/1480 train_time:220960ms step_avg:156.60ms step:1422/1480 train_time:221124ms step_avg:156.60ms step:1423/1480 train_time:221286ms step_avg:156.61ms step:1424/1480 train_time:221453ms step_avg:156.61ms step:1425/1480 train_time:221624ms step_avg:156.62ms step:1426/1480 train_time:221787ms step_avg:156.63ms step:1427/1480 train_time:221954ms step_avg:156.64ms step:1428/1480 train_time:222117ms step_avg:156.64ms step:1429/1480 train_time:222278ms step_avg:156.64ms step:1430/1480 train_time:222443ms step_avg:156.65ms step:1431/1480 train_time:222610ms step_avg:156.66ms step:1432/1480 train_time:222778ms step_avg:156.67ms step:1433/1480 train_time:222947ms step_avg:156.67ms step:1434/1480 train_time:223117ms step_avg:156.68ms step:1435/1480 train_time:223283ms step_avg:156.69ms step:1436/1480 train_time:223448ms step_avg:156.70ms step:1437/1480 train_time:223610ms step_avg:156.70ms 
step:1438/1480 train_time:223772ms step_avg:156.70ms step:1439/1480 train_time:223938ms step_avg:156.71ms step:1440/1480 train_time:224101ms step_avg:156.71ms step:1441/1480 train_time:224265ms step_avg:156.72ms step:1442/1480 train_time:224431ms step_avg:156.73ms step:1443/1480 train_time:224605ms step_avg:156.74ms step:1444/1480 train_time:224768ms step_avg:156.74ms step:1445/1480 train_time:224930ms step_avg:156.75ms step:1446/1480 train_time:225095ms step_avg:156.75ms step:1447/1480 train_time:225263ms step_avg:156.76ms step:1448/1480 train_time:225425ms step_avg:156.76ms step:1449/1480 train_time:225588ms step_avg:156.77ms step:1450/1480 train_time:225754ms step_avg:156.77ms step:1451/1480 train_time:225917ms step_avg:156.78ms step:1452/1480 train_time:226083ms step_avg:156.78ms step:1453/1480 train_time:226246ms step_avg:156.79ms step:1454/1480 train_time:226408ms step_avg:156.79ms step:1455/1480 train_time:226577ms step_avg:156.80ms step:1456/1480 train_time:226740ms step_avg:156.81ms step:1457/1480 train_time:226904ms step_avg:156.81ms step:1458/1480 train_time:227067ms step_avg:156.81ms step:1459/1480 train_time:227234ms step_avg:156.82ms step:1460/1480 train_time:227397ms step_avg:156.83ms step:1461/1480 train_time:227561ms step_avg:156.83ms step:1462/1480 train_time:227724ms step_avg:156.83ms step:1463/1480 train_time:227891ms step_avg:156.84ms step:1464/1480 train_time:228057ms step_avg:156.85ms step:1465/1480 train_time:228221ms step_avg:156.85ms step:1466/1480 train_time:228383ms step_avg:156.86ms step:1467/1480 train_time:228547ms step_avg:156.86ms step:1468/1480 train_time:228711ms step_avg:156.87ms step:1469/1480 train_time:228875ms step_avg:156.87ms step:1470/1480 train_time:229041ms step_avg:156.88ms step:1471/1480 train_time:229214ms step_avg:156.89ms step:1472/1480 train_time:229384ms step_avg:156.90ms step:1473/1480 train_time:229547ms step_avg:156.90ms step:1474/1480 train_time:229713ms step_avg:156.91ms step:1475/1480 train_time:229884ms 
step_avg:156.92ms step:1476/1480 train_time:230046ms step_avg:156.92ms step:1477/1480 train_time:230215ms step_avg:156.93ms step:1478/1480 train_time:230385ms step_avg:156.94ms step:1479/1480 train_time:230548ms step_avg:156.94ms step:1480/1480 train_time:230711ms step_avg:156.95ms step:1480/1480 val_loss:3.2758 train_time:230787ms step_avg:157.00ms peak memory consumption: 34239 MiB