import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: 2D tensor to orthogonalize. It is never modified.
        steps: number of Newton-Schulz iterations to run.
        eps: added to the norm of G before dividing, to avoid division by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # Out-of-place on purpose: `G.bfloat16()` returns G itself when G is already bfloat16,
    # so an in-place `X /= ...` would overwrite the caller's gradient / momentum buffer.
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    if G.size(0) > G.size(1):
        X = X.T # iterate on the wide orientation so that X @ X.T is the smaller Gram matrix
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.

    The world size and rank are read from the WORLD_SIZE and RANK environment variables.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One param group per distinct element count, so that every parameter in a group
        # fits the same flat bfloat16 all_gather buffers (one buffer per rank).
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        Within each group the parameters are processed in chunks of `world_size`: this rank
        orthogonalizes the update of the parameter at offset `rank` inside the chunk, the
        flattened updates are exchanged with an asynchronous all_gather, and the gathered
        updates are applied to the chunk while the next chunk's update is being computed.
        Requires the number of parameters in each group to be a multiple of `world_size`.
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # Wait for the pending all_gather (if any) and apply it to the previous chunk.
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # step size is scaled by sqrt(max(1, rows / cols))
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # the buffers are reused, so the previous gather must be consumed before the next one starts
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable weight)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free nn.Linear whose weight is cast to the input's dtype on every forward call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """
    Rotary position embedding.

    The cos/sin tables are cached and recomputed only when the sequence length (dim 1 of the
    input) differs from the cached one.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq_len, heads, head_dim): seq_len is dim 1, the rotation splits dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention computed with FlexAttention.

    Queries and keys are RMS-normalized and rotated; values are mixed with an external value
    embedding `vi` using two learnable scalars (`lambdas`). The output projection starts at zero.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Two-layer feed-forward block (dim -> 4*dim -> dim) with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """
    Transformer block: attention and MLP, each applied to the RMS-normalized residual stream.

    Before the sub-layers, the input is blended with the initial embedding `x0` using two
    learnable scalars initialised to (1, 0).
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-embedding tables that supply per-layer value embeddings."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        """Return 12 tensors: the six lookups followed by the same six in reverse order."""
        ve = [emb(inputs) for emb in self.embed]
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """
    GPT model with a U-net layout: the first half of the blocks ("encoder") store their outputs,
    which are added back, scaled by learnable `skip_weights`, in front of the second half
    ("decoder") in reverse order. `forward` returns the cross-entropy loss directly.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """
        Compute the language-modelling loss for one flat token sequence.

        Arguments:
            inputs: 1D tensor of token ids; its length must be a multiple of BLOCK_SIZE (128).
            targets: token ids compared against the logits with cross-entropy.
            sliding_window_num_blocks: scalar tensor; a query block may attend to key blocks
                fewer than this many blocks behind it.

        Attention is causal, limited to the sliding window, and limited to positions with the
        same document id, where the document id is the running count of token id 50256.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # Number of attention blocks, derived from the input instead of being hard-coded
        # (the original constant 512 was only valid for 64 * 1024 tokens).
        total_num_blocks = inputs.size(0) // BLOCK_SIZE
        docs = (inputs == 50256).cumsum(0)
        # document id of the first / last token of every block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # per row: how many blocks are set, and the column indices with the set ones first
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_num_blocks, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # partial blocks (need mask_mod) are the non-empty blocks that are not fully unmasked
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count stored in it."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens that follow the 256 * 4 byte header into pinned memory."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (inputs, targets) pairs of `seq_len` tokens from a sorted set of .bin shards.

    Each process starts at offset `process_rank * seq_len` inside a shard and advances by
    `seq_len * num_processes` tokens per batch; shards are visited cyclically.
    """

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU; targets are the inputs shifted by one token."""
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging logfile = None if master_process: run_id = uuid.uuid4() logdir = Path("logs") / f"{run_id}" logdir.mkdir(exist_ok=True) logfile = Path("logs") / f"{run_id}.txt" print(logfile.stem) # create the log file with logfile.open("w") as f: # begin the log by printing this file (the Python code) print(code, file=f) print("=" * 100, file=f) def print0(s, logonly=False): if master_process: with logfile.open("a") as f: if not logonly: print(s) print(s, file=f) # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running python {sys.version}") print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # calculate the number of steps to take in the val loop. assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0 val_steps = args.val_tokens // (args.sequence_length * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files") print0('='*100, logonly=True) inputs_train, targets_train = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. 
suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()] optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.blocks.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda") sw_num_blocks_prev = 1 # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.perf_counter() # begin training for step in 
range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.perf_counter() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social frac_done = step / args.num_iterations # training progress sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128) if sw_num_blocks != sw_num_blocks_prev: sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True) sw_num_blocks_prev = sw_num_blocks # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): inputs_val, targets_val = val_loader.next_batch() val_loss += model(inputs_val, targets_val, sliding_window_num_blocks) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.perf_counter() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), 
optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.perf_counter() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps + 1): with contextlib.ExitStack() as stack: if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step stack.enter_context(model.no_sync()) if step >= 5: stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True)) model(inputs_train, targets_train, sliding_window_num_blocks).backward() inputs_train, targets_train = train_loader.next_batch() if train_accumulation_steps != 1: for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) for group in optimizer3.param_groups: group['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. 
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0] Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Wed Dec 11 10:14:26 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 125W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 37C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 119W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:29121ms step_avg:nanms step:2/1480 train_time:29226ms step_avg:nanms step:3/1480 train_time:29349ms step_avg:nanms step:4/1480 train_time:29489ms step_avg:nanms step:5/1480 train_time:29631ms step_avg:nanms step:6/1480 train_time:29772ms step_avg:nanms step:7/1480 train_time:29915ms step_avg:nanms step:8/1480 train_time:30058ms step_avg:nanms step:9/1480 train_time:30201ms step_avg:nanms step:10/1480 train_time:30346ms step_avg:nanms step:11/1480 train_time:144ms step_avg:nanms step:12/1480 train_time:282ms step_avg:nanms step:13/1480 train_time:424ms step_avg:141.40ms step:14/1480 train_time:566ms step_avg:141.40ms step:15/1480 train_time:707ms step_avg:141.35ms step:16/1480 train_time:849ms step_avg:141.55ms step:17/1480 train_time:991ms step_avg:141.52ms step:18/1480 train_time:1134ms step_avg:141.81ms step:19/1480 train_time:1278ms step_avg:142.01ms step:20/1480 train_time:1422ms step_avg:142.20ms step:21/1480 train_time:1565ms step_avg:142.25ms step:22/1480 train_time:1706ms step_avg:142.14ms step:23/1480 train_time:1849ms step_avg:142.19ms step:24/1480 train_time:1990ms step_avg:142.18ms step:25/1480 train_time:2134ms step_avg:142.26ms step:26/1480 train_time:2277ms step_avg:142.29ms step:27/1480 train_time:2420ms step_avg:142.38ms step:28/1480 train_time:2563ms step_avg:142.41ms 
step:29/1480 train_time:2706ms step_avg:142.42ms step:30/1480 train_time:2849ms step_avg:142.44ms step:31/1480 train_time:2991ms step_avg:142.43ms step:32/1480 train_time:3136ms step_avg:142.53ms step:33/1480 train_time:3280ms step_avg:142.61ms step:34/1480 train_time:3422ms step_avg:142.60ms step:35/1480 train_time:3565ms step_avg:142.61ms step:36/1480 train_time:3707ms step_avg:142.58ms step:37/1480 train_time:3849ms step_avg:142.55ms step:38/1480 train_time:3991ms step_avg:142.53ms step:39/1480 train_time:4135ms step_avg:142.57ms step:40/1480 train_time:4279ms step_avg:142.63ms step:41/1480 train_time:4422ms step_avg:142.65ms step:42/1480 train_time:4566ms step_avg:142.67ms step:43/1480 train_time:4708ms step_avg:142.67ms step:44/1480 train_time:4852ms step_avg:142.69ms step:45/1480 train_time:4993ms step_avg:142.67ms step:46/1480 train_time:5136ms step_avg:142.66ms step:47/1480 train_time:5278ms step_avg:142.65ms step:48/1480 train_time:5420ms step_avg:142.64ms step:49/1480 train_time:5563ms step_avg:142.63ms step:50/1480 train_time:5705ms step_avg:142.62ms step:51/1480 train_time:5848ms step_avg:142.63ms step:52/1480 train_time:5988ms step_avg:142.58ms step:53/1480 train_time:6132ms step_avg:142.60ms step:54/1480 train_time:6275ms step_avg:142.62ms step:55/1480 train_time:6420ms step_avg:142.66ms step:56/1480 train_time:6564ms step_avg:142.69ms step:57/1480 train_time:6705ms step_avg:142.66ms step:58/1480 train_time:6848ms step_avg:142.66ms step:59/1480 train_time:6989ms step_avg:142.63ms step:60/1480 train_time:7132ms step_avg:142.64ms step:61/1480 train_time:7277ms step_avg:142.69ms step:62/1480 train_time:7423ms step_avg:142.75ms step:63/1480 train_time:7566ms step_avg:142.75ms step:64/1480 train_time:7707ms step_avg:142.73ms step:65/1480 train_time:7851ms step_avg:142.74ms step:66/1480 train_time:7992ms step_avg:142.71ms step:67/1480 train_time:8135ms step_avg:142.71ms step:68/1480 train_time:8278ms step_avg:142.73ms step:69/1480 train_time:8421ms 
step_avg:142.73ms step:70/1480 train_time:8564ms step_avg:142.74ms step:71/1480 train_time:8706ms step_avg:142.72ms step:72/1480 train_time:8849ms step_avg:142.72ms step:73/1480 train_time:8991ms step_avg:142.71ms step:74/1480 train_time:9133ms step_avg:142.70ms step:75/1480 train_time:9275ms step_avg:142.70ms step:76/1480 train_time:9418ms step_avg:142.70ms step:77/1480 train_time:9561ms step_avg:142.70ms step:78/1480 train_time:9704ms step_avg:142.71ms step:79/1480 train_time:10228ms step_avg:148.23ms step:80/1480 train_time:10739ms step_avg:153.42ms step:81/1480 train_time:10839ms step_avg:152.66ms step:82/1480 train_time:10982ms step_avg:152.53ms step:83/1480 train_time:11125ms step_avg:152.40ms step:84/1480 train_time:11267ms step_avg:152.26ms step:85/1480 train_time:11408ms step_avg:152.11ms step:86/1480 train_time:11551ms step_avg:151.99ms step:87/1480 train_time:11698ms step_avg:151.92ms step:88/1480 train_time:11845ms step_avg:151.85ms step:89/1480 train_time:11989ms step_avg:151.76ms step:90/1480 train_time:12129ms step_avg:151.62ms step:91/1480 train_time:12272ms step_avg:151.50ms step:92/1480 train_time:12414ms step_avg:151.39ms step:93/1480 train_time:12558ms step_avg:151.30ms step:94/1480 train_time:12700ms step_avg:151.19ms step:95/1480 train_time:12845ms step_avg:151.11ms step:96/1480 train_time:12988ms step_avg:151.02ms step:97/1480 train_time:13130ms step_avg:150.92ms step:98/1480 train_time:13661ms step_avg:155.23ms step:99/1480 train_time:13764ms step_avg:154.65ms step:100/1480 train_time:13905ms step_avg:154.50ms step:101/1480 train_time:14056ms step_avg:154.46ms step:102/1480 train_time:14189ms step_avg:154.23ms step:103/1480 train_time:14332ms step_avg:154.11ms step:104/1480 train_time:14474ms step_avg:153.98ms step:105/1480 train_time:14619ms step_avg:153.88ms step:106/1480 train_time:14766ms step_avg:153.81ms step:107/1480 train_time:14908ms step_avg:153.69ms step:108/1480 train_time:15051ms step_avg:153.58ms step:109/1480 
train_time:15193ms step_avg:153.46ms step:110/1480 train_time:15336ms step_avg:153.36ms step:111/1480 train_time:15480ms step_avg:153.26ms step:112/1480 train_time:15626ms step_avg:153.19ms step:113/1480 train_time:15770ms step_avg:153.11ms step:114/1480 train_time:15916ms step_avg:153.04ms step:115/1480 train_time:16063ms step_avg:152.98ms step:116/1480 train_time:16208ms step_avg:152.90ms step:117/1480 train_time:16354ms step_avg:152.84ms step:118/1480 train_time:16500ms step_avg:152.78ms step:119/1480 train_time:16646ms step_avg:152.72ms step:120/1480 train_time:16791ms step_avg:152.64ms step:121/1480 train_time:16937ms step_avg:152.59ms step:122/1480 train_time:17085ms step_avg:152.55ms step:123/1480 train_time:17230ms step_avg:152.48ms step:124/1480 train_time:17376ms step_avg:152.42ms step:125/1480 train_time:17522ms step_avg:152.37ms step:125/1480 val_loss:4.4402 train_time:17587ms step_avg:152.93ms step:126/1480 train_time:17682ms step_avg:152.43ms step:127/1480 train_time:17826ms step_avg:152.36ms step:128/1480 train_time:17970ms step_avg:152.29ms step:129/1480 train_time:18118ms step_avg:152.25ms step:130/1480 train_time:18264ms step_avg:152.20ms step:131/1480 train_time:18408ms step_avg:152.13ms step:132/1480 train_time:18553ms step_avg:152.07ms step:133/1480 train_time:18700ms step_avg:152.03ms step:134/1480 train_time:18846ms step_avg:151.98ms step:135/1480 train_time:18991ms step_avg:151.93ms step:136/1480 train_time:19138ms step_avg:151.89ms step:137/1480 train_time:19284ms step_avg:151.85ms step:138/1480 train_time:19429ms step_avg:151.79ms step:139/1480 train_time:19575ms step_avg:151.74ms step:140/1480 train_time:19722ms step_avg:151.71ms step:141/1480 train_time:19867ms step_avg:151.66ms step:142/1480 train_time:20013ms step_avg:151.61ms step:143/1480 train_time:20159ms step_avg:151.57ms step:144/1480 train_time:20306ms step_avg:151.54ms step:145/1480 train_time:20450ms step_avg:151.48ms step:146/1480 train_time:20597ms step_avg:151.45ms 
step:147/1480 train_time:20742ms step_avg:151.40ms step:148/1480 train_time:20888ms step_avg:151.36ms step:149/1480 train_time:21034ms step_avg:151.32ms step:150/1480 train_time:21181ms step_avg:151.29ms step:151/1480 train_time:21328ms step_avg:151.26ms step:152/1480 train_time:21473ms step_avg:151.22ms step:153/1480 train_time:21620ms step_avg:151.19ms step:154/1480 train_time:21766ms step_avg:151.16ms step:155/1480 train_time:21910ms step_avg:151.11ms step:156/1480 train_time:22056ms step_avg:151.07ms step:157/1480 train_time:22203ms step_avg:151.04ms step:158/1480 train_time:22348ms step_avg:151.00ms step:159/1480 train_time:22495ms step_avg:150.97ms step:160/1480 train_time:22640ms step_avg:150.94ms step:161/1480 train_time:22787ms step_avg:150.91ms step:162/1480 train_time:22931ms step_avg:150.86ms step:163/1480 train_time:23077ms step_avg:150.83ms step:164/1480 train_time:23224ms step_avg:150.80ms step:165/1480 train_time:23368ms step_avg:150.76ms step:166/1480 train_time:23514ms step_avg:150.73ms step:167/1480 train_time:23660ms step_avg:150.70ms step:168/1480 train_time:23806ms step_avg:150.67ms step:169/1480 train_time:23951ms step_avg:150.63ms step:170/1480 train_time:24098ms step_avg:150.61ms step:171/1480 train_time:24244ms step_avg:150.58ms step:172/1480 train_time:24389ms step_avg:150.55ms step:173/1480 train_time:24535ms step_avg:150.52ms step:174/1480 train_time:24682ms step_avg:150.50ms step:175/1480 train_time:24827ms step_avg:150.46ms step:176/1480 train_time:24972ms step_avg:150.44ms step:177/1480 train_time:25118ms step_avg:150.41ms step:178/1480 train_time:25265ms step_avg:150.38ms step:179/1480 train_time:25409ms step_avg:150.35ms step:180/1480 train_time:25555ms step_avg:150.33ms step:181/1480 train_time:25703ms step_avg:150.31ms step:182/1480 train_time:25847ms step_avg:150.27ms step:183/1480 train_time:25992ms step_avg:150.25ms step:184/1480 train_time:26138ms step_avg:150.22ms step:185/1480 train_time:26285ms step_avg:150.20ms 
step:186/1480 train_time:26429ms step_avg:150.17ms step:187/1480 train_time:26575ms step_avg:150.14ms step:188/1480 train_time:26722ms step_avg:150.13ms step:189/1480 train_time:26896ms step_avg:150.26ms step:190/1480 train_time:27011ms step_avg:150.06ms step:191/1480 train_time:27158ms step_avg:150.05ms step:192/1480 train_time:27304ms step_avg:150.02ms step:193/1480 train_time:27450ms step_avg:150.00ms step:194/1480 train_time:27597ms step_avg:149.98ms step:195/1480 train_time:27742ms step_avg:149.96ms step:196/1480 train_time:27889ms step_avg:149.94ms step:197/1480 train_time:28034ms step_avg:149.91ms step:198/1480 train_time:28181ms step_avg:149.90ms step:199/1480 train_time:28327ms step_avg:149.88ms step:200/1480 train_time:28472ms step_avg:149.85ms step:201/1480 train_time:28620ms step_avg:149.84ms step:202/1480 train_time:28763ms step_avg:149.81ms step:203/1480 train_time:28908ms step_avg:149.78ms step:204/1480 train_time:29054ms step_avg:149.76ms step:205/1480 train_time:29202ms step_avg:149.75ms step:206/1480 train_time:29346ms step_avg:149.73ms step:207/1480 train_time:29492ms step_avg:149.70ms step:208/1480 train_time:29638ms step_avg:149.69ms step:209/1480 train_time:29784ms step_avg:149.67ms step:210/1480 train_time:29930ms step_avg:149.65ms step:211/1480 train_time:30077ms step_avg:149.64ms step:212/1480 train_time:30223ms step_avg:149.62ms step:213/1480 train_time:30368ms step_avg:149.60ms step:214/1480 train_time:30514ms step_avg:149.58ms step:215/1480 train_time:30660ms step_avg:149.56ms step:216/1480 train_time:30807ms step_avg:149.55ms step:217/1480 train_time:30950ms step_avg:149.52ms step:218/1480 train_time:31098ms step_avg:149.51ms step:219/1480 train_time:31243ms step_avg:149.49ms step:220/1480 train_time:31390ms step_avg:149.47ms step:221/1480 train_time:31944ms step_avg:151.39ms step:222/1480 train_time:32455ms step_avg:153.09ms step:223/1480 train_time:32564ms step_avg:152.88ms step:224/1480 train_time:32712ms step_avg:152.86ms 
step:225/1480 train_time:32861ms step_avg:152.84ms step:226/1480 train_time:33009ms step_avg:152.82ms step:227/1480 train_time:33158ms step_avg:152.80ms step:228/1480 train_time:33306ms step_avg:152.78ms step:229/1480 train_time:33453ms step_avg:152.75ms step:230/1480 train_time:33603ms step_avg:152.74ms step:231/1480 train_time:33750ms step_avg:152.72ms step:232/1480 train_time:33900ms step_avg:152.70ms step:233/1480 train_time:34049ms step_avg:152.69ms step:234/1480 train_time:34198ms step_avg:152.67ms step:235/1480 train_time:34348ms step_avg:152.66ms step:236/1480 train_time:34497ms step_avg:152.64ms step:237/1480 train_time:34646ms step_avg:152.63ms step:238/1480 train_time:34794ms step_avg:152.60ms step:239/1480 train_time:34942ms step_avg:152.59ms step:240/1480 train_time:35090ms step_avg:152.56ms step:241/1480 train_time:35239ms step_avg:152.55ms step:242/1480 train_time:35388ms step_avg:152.54ms step:243/1480 train_time:35537ms step_avg:152.52ms step:244/1480 train_time:35687ms step_avg:152.51ms step:245/1480 train_time:35835ms step_avg:152.49ms step:246/1480 train_time:35984ms step_avg:152.48ms step:247/1480 train_time:36132ms step_avg:152.46ms step:248/1480 train_time:36283ms step_avg:152.45ms step:249/1480 train_time:36431ms step_avg:152.43ms step:250/1480 train_time:36581ms step_avg:152.42ms step:250/1480 val_loss:4.0000 train_time:36647ms step_avg:152.70ms step:251/1480 train_time:36746ms step_avg:152.47ms step:252/1480 train_time:36887ms step_avg:152.43ms step:253/1480 train_time:37034ms step_avg:152.40ms step:254/1480 train_time:37183ms step_avg:152.39ms step:255/1480 train_time:37330ms step_avg:152.37ms step:256/1480 train_time:37479ms step_avg:152.35ms step:257/1480 train_time:37627ms step_avg:152.34ms step:258/1480 train_time:37776ms step_avg:152.32ms step:259/1480 train_time:37926ms step_avg:152.31ms step:260/1480 train_time:38074ms step_avg:152.29ms step:261/1480 train_time:38223ms step_avg:152.28ms step:262/1480 train_time:38370ms 
step_avg:152.26ms step:263/1480 train_time:38520ms step_avg:152.25ms step:264/1480 train_time:38668ms step_avg:152.24ms step:265/1480 train_time:38817ms step_avg:152.22ms step:266/1480 train_time:38965ms step_avg:152.21ms step:267/1480 train_time:39113ms step_avg:152.19ms step:268/1480 train_time:39262ms step_avg:152.18ms step:269/1480 train_time:39411ms step_avg:152.16ms step:270/1480 train_time:39559ms step_avg:152.15ms step:271/1480 train_time:39707ms step_avg:152.14ms step:272/1480 train_time:39855ms step_avg:152.12ms step:273/1480 train_time:40004ms step_avg:152.11ms step:274/1480 train_time:40152ms step_avg:152.09ms step:275/1480 train_time:40302ms step_avg:152.08ms step:276/1480 train_time:40449ms step_avg:152.07ms step:277/1480 train_time:40599ms step_avg:152.06ms step:278/1480 train_time:40748ms step_avg:152.04ms step:279/1480 train_time:40895ms step_avg:152.03ms step:280/1480 train_time:41045ms step_avg:152.02ms step:281/1480 train_time:41192ms step_avg:152.00ms step:282/1480 train_time:41342ms step_avg:151.99ms step:283/1480 train_time:41490ms step_avg:151.98ms step:284/1480 train_time:41640ms step_avg:151.97ms step:285/1480 train_time:41789ms step_avg:151.96ms step:286/1480 train_time:41937ms step_avg:151.95ms step:287/1480 train_time:42086ms step_avg:151.93ms step:288/1480 train_time:42233ms step_avg:151.92ms step:289/1480 train_time:42383ms step_avg:151.91ms step:290/1480 train_time:42530ms step_avg:151.89ms step:291/1480 train_time:42679ms step_avg:151.88ms step:292/1480 train_time:42827ms step_avg:151.87ms step:293/1480 train_time:42976ms step_avg:151.86ms step:294/1480 train_time:43126ms step_avg:151.85ms step:295/1480 train_time:43274ms step_avg:151.84ms step:296/1480 train_time:43424ms step_avg:151.83ms step:297/1480 train_time:43571ms step_avg:151.82ms step:298/1480 train_time:43720ms step_avg:151.81ms step:299/1480 train_time:43868ms step_avg:151.79ms step:300/1480 train_time:44017ms step_avg:151.78ms step:301/1480 train_time:44165ms 
step_avg:151.77ms step:302/1480 train_time:44314ms step_avg:151.76ms step:303/1480 train_time:44463ms step_avg:151.75ms step:304/1480 train_time:44611ms step_avg:151.74ms step:305/1480 train_time:44761ms step_avg:151.73ms step:306/1480 train_time:44909ms step_avg:151.72ms step:307/1480 train_time:45058ms step_avg:151.71ms step:308/1480 train_time:45207ms step_avg:151.70ms step:309/1480 train_time:45355ms step_avg:151.69ms step:310/1480 train_time:45503ms step_avg:151.68ms step:311/1480 train_time:45651ms step_avg:151.66ms step:312/1480 train_time:45801ms step_avg:151.66ms step:313/1480 train_time:45950ms step_avg:151.65ms step:314/1480 train_time:46099ms step_avg:151.64ms step:315/1480 train_time:46247ms step_avg:151.63ms step:316/1480 train_time:46396ms step_avg:151.62ms step:317/1480 train_time:46546ms step_avg:151.62ms step:318/1480 train_time:46694ms step_avg:151.60ms step:319/1480 train_time:46843ms step_avg:151.60ms step:320/1480 train_time:46992ms step_avg:151.59ms step:321/1480 train_time:47141ms step_avg:151.58ms step:322/1480 train_time:47288ms step_avg:151.56ms step:323/1480 train_time:47437ms step_avg:151.56ms step:324/1480 train_time:47587ms step_avg:151.55ms step:325/1480 train_time:47734ms step_avg:151.54ms step:326/1480 train_time:47884ms step_avg:151.53ms step:327/1480 train_time:48031ms step_avg:151.52ms step:328/1480 train_time:48181ms step_avg:151.51ms step:329/1480 train_time:48330ms step_avg:151.50ms step:330/1480 train_time:48480ms step_avg:151.50ms step:331/1480 train_time:48631ms step_avg:151.50ms step:332/1480 train_time:48782ms step_avg:151.50ms step:333/1480 train_time:48932ms step_avg:151.49ms step:334/1480 train_time:49083ms step_avg:151.49ms step:335/1480 train_time:49233ms step_avg:151.49ms step:336/1480 train_time:49384ms step_avg:151.49ms step:337/1480 train_time:49534ms step_avg:151.48ms step:338/1480 train_time:49686ms step_avg:151.48ms step:339/1480 train_time:49836ms step_avg:151.48ms step:340/1480 train_time:49987ms 
step_avg:151.48ms step:341/1480 train_time:50137ms step_avg:151.47ms step:342/1480 train_time:50289ms step_avg:151.47ms step:343/1480 train_time:50441ms step_avg:151.47ms step:344/1480 train_time:50591ms step_avg:151.47ms step:345/1480 train_time:50744ms step_avg:151.47ms step:346/1480 train_time:50893ms step_avg:151.47ms step:347/1480 train_time:51045ms step_avg:151.47ms step:348/1480 train_time:51196ms step_avg:151.47ms step:349/1480 train_time:51348ms step_avg:151.47ms step:350/1480 train_time:51498ms step_avg:151.46ms step:351/1480 train_time:51649ms step_avg:151.46ms step:352/1480 train_time:51801ms step_avg:151.47ms step:353/1480 train_time:51952ms step_avg:151.46ms step:354/1480 train_time:52103ms step_avg:151.46ms step:355/1480 train_time:52253ms step_avg:151.46ms step:356/1480 train_time:52405ms step_avg:151.46ms step:357/1480 train_time:52555ms step_avg:151.46ms step:358/1480 train_time:52708ms step_avg:151.46ms step:359/1480 train_time:52858ms step_avg:151.46ms step:360/1480 train_time:53009ms step_avg:151.45ms step:361/1480 train_time:53159ms step_avg:151.45ms step:362/1480 train_time:53310ms step_avg:151.45ms step:363/1480 train_time:53461ms step_avg:151.45ms step:364/1480 train_time:53612ms step_avg:151.45ms step:365/1480 train_time:53764ms step_avg:151.45ms step:366/1480 train_time:53916ms step_avg:151.45ms step:367/1480 train_time:54067ms step_avg:151.45ms step:368/1480 train_time:54218ms step_avg:151.45ms step:369/1480 train_time:54368ms step_avg:151.44ms step:370/1480 train_time:54519ms step_avg:151.44ms step:371/1480 train_time:54670ms step_avg:151.44ms step:372/1480 train_time:54822ms step_avg:151.44ms step:373/1480 train_time:54974ms step_avg:151.44ms step:374/1480 train_time:55125ms step_avg:151.44ms step:375/1480 train_time:55274ms step_avg:151.44ms step:375/1480 val_loss:3.8127 train_time:55343ms step_avg:151.62ms step:376/1480 train_time:55435ms step_avg:151.46ms step:377/1480 train_time:55586ms step_avg:151.46ms step:378/1480 
train_time:55738ms step_avg:151.46ms step:379/1480 train_time:55906ms step_avg:151.51ms step:380/1480 train_time:56038ms step_avg:151.45ms step:381/1480 train_time:56187ms step_avg:151.45ms step:382/1480 train_time:56339ms step_avg:151.45ms step:383/1480 train_time:56489ms step_avg:151.45ms step:384/1480 train_time:56641ms step_avg:151.45ms step:385/1480 train_time:56794ms step_avg:151.45ms step:386/1480 train_time:56944ms step_avg:151.45ms step:387/1480 train_time:57095ms step_avg:151.45ms step:388/1480 train_time:57244ms step_avg:151.44ms step:389/1480 train_time:57395ms step_avg:151.44ms step:390/1480 train_time:57545ms step_avg:151.43ms step:391/1480 train_time:57696ms step_avg:151.43ms step:392/1480 train_time:57847ms step_avg:151.43ms step:393/1480 train_time:57999ms step_avg:151.43ms step:394/1480 train_time:58149ms step_avg:151.43ms step:395/1480 train_time:58300ms step_avg:151.43ms step:396/1480 train_time:58450ms step_avg:151.43ms step:397/1480 train_time:58601ms step_avg:151.42ms step:398/1480 train_time:58752ms step_avg:151.42ms step:399/1480 train_time:58902ms step_avg:151.42ms step:400/1480 train_time:59055ms step_avg:151.42ms step:401/1480 train_time:59206ms step_avg:151.42ms step:402/1480 train_time:59357ms step_avg:151.42ms step:403/1480 train_time:59508ms step_avg:151.42ms step:404/1480 train_time:59659ms step_avg:151.42ms step:405/1480 train_time:59810ms step_avg:151.42ms step:406/1480 train_time:59961ms step_avg:151.42ms step:407/1480 train_time:60112ms step_avg:151.42ms step:408/1480 train_time:60262ms step_avg:151.41ms step:409/1480 train_time:60414ms step_avg:151.41ms step:410/1480 train_time:60564ms step_avg:151.41ms step:411/1480 train_time:60716ms step_avg:151.41ms step:412/1480 train_time:60865ms step_avg:151.41ms step:413/1480 train_time:61018ms step_avg:151.41ms step:414/1480 train_time:61169ms step_avg:151.41ms step:415/1480 train_time:61321ms step_avg:151.41ms step:416/1480 train_time:61472ms step_avg:151.41ms step:417/1480 
train_time:61623ms step_avg:151.41ms step:418/1480 train_time:61774ms step_avg:151.41ms step:419/1480 train_time:61924ms step_avg:151.40ms step:420/1480 train_time:62075ms step_avg:151.40ms step:421/1480 train_time:62226ms step_avg:151.40ms step:422/1480 train_time:62377ms step_avg:151.40ms step:423/1480 train_time:62529ms step_avg:151.40ms step:424/1480 train_time:62680ms step_avg:151.40ms step:425/1480 train_time:62833ms step_avg:151.40ms step:426/1480 train_time:62984ms step_avg:151.40ms step:427/1480 train_time:63136ms step_avg:151.41ms step:428/1480 train_time:63286ms step_avg:151.40ms step:429/1480 train_time:63438ms step_avg:151.40ms step:430/1480 train_time:63588ms step_avg:151.40ms step:431/1480 train_time:63740ms step_avg:151.40ms step:432/1480 train_time:63890ms step_avg:151.40ms step:433/1480 train_time:64040ms step_avg:151.40ms step:434/1480 train_time:64192ms step_avg:151.40ms step:435/1480 train_time:64343ms step_avg:151.39ms step:436/1480 train_time:64494ms step_avg:151.39ms step:437/1480 train_time:64645ms step_avg:151.39ms step:438/1480 train_time:64797ms step_avg:151.39ms step:439/1480 train_time:64946ms step_avg:151.39ms step:440/1480 train_time:65098ms step_avg:151.39ms step:441/1480 train_time:65250ms step_avg:151.39ms step:442/1480 train_time:65402ms step_avg:151.39ms step:443/1480 train_time:65556ms step_avg:151.40ms step:444/1480 train_time:65709ms step_avg:151.40ms step:445/1480 train_time:65861ms step_avg:151.41ms step:446/1480 train_time:66015ms step_avg:151.41ms step:447/1480 train_time:66168ms step_avg:151.41ms step:448/1480 train_time:66320ms step_avg:151.42ms step:449/1480 train_time:66473ms step_avg:151.42ms step:450/1480 train_time:66625ms step_avg:151.42ms step:451/1480 train_time:66779ms step_avg:151.43ms step:452/1480 train_time:66932ms step_avg:151.43ms step:453/1480 train_time:67085ms step_avg:151.43ms step:454/1480 train_time:67239ms step_avg:151.44ms step:455/1480 train_time:67392ms step_avg:151.44ms step:456/1480 
train_time:67544ms step_avg:151.44ms step:457/1480 train_time:67697ms step_avg:151.45ms step:458/1480 train_time:67849ms step_avg:151.45ms step:459/1480 train_time:68001ms step_avg:151.45ms step:460/1480 train_time:68156ms step_avg:151.46ms step:461/1480 train_time:68309ms step_avg:151.46ms step:462/1480 train_time:68463ms step_avg:151.47ms step:463/1480 train_time:68617ms step_avg:151.47ms step:464/1480 train_time:68769ms step_avg:151.47ms step:465/1480 train_time:68921ms step_avg:151.48ms step:466/1480 train_time:69073ms step_avg:151.48ms step:467/1480 train_time:69227ms step_avg:151.48ms step:468/1480 train_time:69380ms step_avg:151.48ms step:469/1480 train_time:69534ms step_avg:151.49ms step:470/1480 train_time:69687ms step_avg:151.49ms step:471/1480 train_time:69840ms step_avg:151.50ms step:472/1480 train_time:69994ms step_avg:151.50ms step:473/1480 train_time:70146ms step_avg:151.50ms step:474/1480 train_time:70298ms step_avg:151.51ms step:475/1480 train_time:70451ms step_avg:151.51ms step:476/1480 train_time:70603ms step_avg:151.51ms step:477/1480 train_time:70758ms step_avg:151.52ms step:478/1480 train_time:70911ms step_avg:151.52ms step:479/1480 train_time:71064ms step_avg:151.52ms step:480/1480 train_time:71218ms step_avg:151.53ms step:481/1480 train_time:71370ms step_avg:151.53ms step:482/1480 train_time:71522ms step_avg:151.53ms step:483/1480 train_time:71675ms step_avg:151.53ms step:484/1480 train_time:71831ms step_avg:151.54ms step:485/1480 train_time:71985ms step_avg:151.55ms step:486/1480 train_time:72139ms step_avg:151.55ms step:487/1480 train_time:72291ms step_avg:151.55ms step:488/1480 train_time:72444ms step_avg:151.56ms step:489/1480 train_time:72597ms step_avg:151.56ms step:490/1480 train_time:72749ms step_avg:151.56ms step:491/1480 train_time:72902ms step_avg:151.56ms step:492/1480 train_time:73057ms step_avg:151.57ms step:493/1480 train_time:73210ms step_avg:151.57ms step:494/1480 train_time:73362ms step_avg:151.57ms step:495/1480 
train_time:73517ms step_avg:151.58ms step:496/1480 train_time:73669ms step_avg:151.58ms step:497/1480 train_time:73823ms step_avg:151.59ms step:498/1480 train_time:73975ms step_avg:151.59ms step:499/1480 train_time:74128ms step_avg:151.59ms step:500/1480 train_time:74281ms step_avg:151.59ms step:500/1480 val_loss:3.6881 train_time:74351ms step_avg:151.74ms step:501/1480 train_time:74446ms step_avg:151.62ms step:502/1480 train_time:74594ms step_avg:151.61ms step:503/1480 train_time:74747ms step_avg:151.62ms step:504/1480 train_time:74898ms step_avg:151.62ms step:505/1480 train_time:75051ms step_avg:151.62ms step:506/1480 train_time:75204ms step_avg:151.62ms step:507/1480 train_time:75356ms step_avg:151.62ms step:508/1480 train_time:75511ms step_avg:151.63ms step:509/1480 train_time:75664ms step_avg:151.63ms step:510/1480 train_time:75816ms step_avg:151.63ms step:511/1480 train_time:75968ms step_avg:151.63ms step:512/1480 train_time:76124ms step_avg:151.64ms step:513/1480 train_time:76276ms step_avg:151.64ms step:514/1480 train_time:76430ms step_avg:151.65ms step:515/1480 train_time:76583ms step_avg:151.65ms step:516/1480 train_time:76736ms step_avg:151.65ms step:517/1480 train_time:76890ms step_avg:151.66ms step:518/1480 train_time:77041ms step_avg:151.66ms step:519/1480 train_time:77194ms step_avg:151.66ms step:520/1480 train_time:77349ms step_avg:151.66ms step:521/1480 train_time:77502ms step_avg:151.67ms step:522/1480 train_time:77656ms step_avg:151.67ms step:523/1480 train_time:77810ms step_avg:151.68ms step:524/1480 train_time:77962ms step_avg:151.68ms step:525/1480 train_time:78114ms step_avg:151.68ms step:526/1480 train_time:78267ms step_avg:151.68ms step:527/1480 train_time:78420ms step_avg:151.68ms step:528/1480 train_time:78573ms step_avg:151.69ms step:529/1480 train_time:78727ms step_avg:151.69ms step:530/1480 train_time:78880ms step_avg:151.69ms step:531/1480 train_time:79033ms step_avg:151.70ms step:532/1480 train_time:79187ms step_avg:151.70ms 
step:533/1480 train_time:79338ms step_avg:151.70ms step:534/1480 train_time:79491ms step_avg:151.70ms step:535/1480 train_time:79643ms step_avg:151.70ms step:536/1480 train_time:79796ms step_avg:151.70ms step:537/1480 train_time:79950ms step_avg:151.71ms step:538/1480 train_time:80102ms step_avg:151.71ms step:539/1480 train_time:80256ms step_avg:151.71ms step:540/1480 train_time:80410ms step_avg:151.72ms step:541/1480 train_time:80561ms step_avg:151.72ms step:542/1480 train_time:80713ms step_avg:151.72ms step:543/1480 train_time:80866ms step_avg:151.72ms step:544/1480 train_time:81019ms step_avg:151.72ms step:545/1480 train_time:81172ms step_avg:151.72ms step:546/1480 train_time:81326ms step_avg:151.73ms step:547/1480 train_time:81478ms step_avg:151.73ms step:548/1480 train_time:81631ms step_avg:151.73ms step:549/1480 train_time:81784ms step_avg:151.73ms step:550/1480 train_time:81937ms step_avg:151.74ms step:551/1480 train_time:82091ms step_avg:151.74ms step:552/1480 train_time:82247ms step_avg:151.75ms step:553/1480 train_time:82403ms step_avg:151.75ms step:554/1480 train_time:82558ms step_avg:151.76ms step:555/1480 train_time:82713ms step_avg:151.77ms step:556/1480 train_time:82867ms step_avg:151.77ms step:557/1480 train_time:83022ms step_avg:151.78ms step:558/1480 train_time:83178ms step_avg:151.78ms step:559/1480 train_time:83332ms step_avg:151.79ms step:560/1480 train_time:83487ms step_avg:151.79ms step:561/1480 train_time:83641ms step_avg:151.80ms step:562/1480 train_time:83795ms step_avg:151.80ms step:563/1480 train_time:83950ms step_avg:151.81ms step:564/1480 train_time:84106ms step_avg:151.82ms step:565/1480 train_time:84261ms step_avg:151.82ms step:566/1480 train_time:84418ms step_avg:151.83ms step:567/1480 train_time:84572ms step_avg:151.83ms step:568/1480 train_time:84727ms step_avg:151.84ms step:569/1480 train_time:84899ms step_avg:151.88ms step:570/1480 train_time:85036ms step_avg:151.85ms step:571/1480 train_time:85190ms step_avg:151.85ms 
step:572/1480 train_time:85346ms step_avg:151.86ms step:573/1480 train_time:85501ms step_avg:151.87ms step:574/1480 train_time:85656ms step_avg:151.87ms step:575/1480 train_time:85812ms step_avg:151.88ms step:576/1480 train_time:85966ms step_avg:151.88ms step:577/1480 train_time:86120ms step_avg:151.89ms step:578/1480 train_time:86273ms step_avg:151.89ms step:579/1480 train_time:86429ms step_avg:151.90ms step:580/1480 train_time:86584ms step_avg:151.90ms step:581/1480 train_time:86737ms step_avg:151.90ms step:582/1480 train_time:86891ms step_avg:151.91ms step:583/1480 train_time:87046ms step_avg:151.91ms step:584/1480 train_time:87201ms step_avg:151.92ms step:585/1480 train_time:87357ms step_avg:151.92ms step:586/1480 train_time:87512ms step_avg:151.93ms step:587/1480 train_time:87666ms step_avg:151.93ms step:588/1480 train_time:87821ms step_avg:151.94ms step:589/1480 train_time:87974ms step_avg:151.94ms step:590/1480 train_time:88130ms step_avg:151.95ms step:591/1480 train_time:88284ms step_avg:151.95ms step:592/1480 train_time:88438ms step_avg:151.96ms step:593/1480 train_time:88593ms step_avg:151.96ms step:594/1480 train_time:88750ms step_avg:151.97ms step:595/1480 train_time:88905ms step_avg:151.97ms step:596/1480 train_time:89061ms step_avg:151.98ms step:597/1480 train_time:89216ms step_avg:151.99ms step:598/1480 train_time:89370ms step_avg:151.99ms step:599/1480 train_time:89525ms step_avg:152.00ms step:600/1480 train_time:89680ms step_avg:152.00ms step:601/1480 train_time:89835ms step_avg:152.00ms step:602/1480 train_time:89991ms step_avg:152.01ms step:603/1480 train_time:90146ms step_avg:152.02ms step:604/1480 train_time:90301ms step_avg:152.02ms step:605/1480 train_time:90456ms step_avg:152.03ms step:606/1480 train_time:90611ms step_avg:152.03ms step:607/1480 train_time:90766ms step_avg:152.04ms step:608/1480 train_time:90921ms step_avg:152.04ms step:609/1480 train_time:91076ms step_avg:152.05ms step:610/1480 train_time:91230ms step_avg:152.05ms 
step:611/1480 train_time:91385ms step_avg:152.06ms step:612/1480 train_time:91539ms step_avg:152.06ms step:613/1480 train_time:91694ms step_avg:152.06ms step:614/1480 train_time:91850ms step_avg:152.07ms step:615/1480 train_time:92004ms step_avg:152.07ms step:616/1480 train_time:92159ms step_avg:152.08ms step:617/1480 train_time:92314ms step_avg:152.08ms step:618/1480 train_time:92468ms step_avg:152.09ms step:619/1480 train_time:92625ms step_avg:152.09ms step:620/1480 train_time:92779ms step_avg:152.10ms step:621/1480 train_time:92933ms step_avg:152.10ms step:622/1480 train_time:93089ms step_avg:152.11ms step:623/1480 train_time:93244ms step_avg:152.11ms step:624/1480 train_time:93399ms step_avg:152.12ms step:625/1480 train_time:93553ms step_avg:152.12ms step:625/1480 val_loss:3.6054 train_time:93624ms step_avg:152.23ms step:626/1480 train_time:93732ms step_avg:152.16ms step:627/1480 train_time:93871ms step_avg:152.14ms step:628/1480 train_time:94025ms step_avg:152.14ms step:629/1480 train_time:94180ms step_avg:152.15ms step:630/1480 train_time:94334ms step_avg:152.15ms step:631/1480 train_time:94488ms step_avg:152.15ms step:632/1480 train_time:94642ms step_avg:152.16ms step:633/1480 train_time:94797ms step_avg:152.16ms step:634/1480 train_time:94951ms step_avg:152.17ms step:635/1480 train_time:95105ms step_avg:152.17ms step:636/1480 train_time:95259ms step_avg:152.17ms step:637/1480 train_time:95416ms step_avg:152.18ms step:638/1480 train_time:95570ms step_avg:152.18ms step:639/1480 train_time:95724ms step_avg:152.18ms step:640/1480 train_time:95879ms step_avg:152.19ms step:641/1480 train_time:96035ms step_avg:152.19ms step:642/1480 train_time:96189ms step_avg:152.20ms step:643/1480 train_time:96343ms step_avg:152.20ms step:644/1480 train_time:96499ms step_avg:152.21ms step:645/1480 train_time:96653ms step_avg:152.21ms step:646/1480 train_time:96810ms step_avg:152.22ms step:647/1480 train_time:96965ms step_avg:152.22ms step:648/1480 train_time:97121ms 
step_avg:152.23ms step:649/1480 train_time:97276ms step_avg:152.23ms step:650/1480 train_time:97430ms step_avg:152.23ms step:651/1480 train_time:97585ms step_avg:152.24ms step:652/1480 train_time:97740ms step_avg:152.24ms step:653/1480 train_time:97895ms step_avg:152.25ms step:654/1480 train_time:98049ms step_avg:152.25ms step:655/1480 train_time:98203ms step_avg:152.25ms step:656/1480 train_time:98357ms step_avg:152.26ms step:657/1480 train_time:98513ms step_avg:152.26ms step:658/1480 train_time:98668ms step_avg:152.27ms step:659/1480 train_time:98823ms step_avg:152.27ms step:660/1480 train_time:98979ms step_avg:152.28ms step:661/1480 train_time:99136ms step_avg:152.28ms step:662/1480 train_time:99293ms step_avg:152.29ms step:663/1480 train_time:99448ms step_avg:152.29ms step:664/1480 train_time:99603ms step_avg:152.30ms step:665/1480 train_time:99759ms step_avg:152.30ms step:666/1480 train_time:99916ms step_avg:152.31ms step:667/1480 train_time:100073ms step_avg:152.32ms step:668/1480 train_time:100229ms step_avg:152.32ms step:669/1480 train_time:100387ms step_avg:152.33ms step:670/1480 train_time:100543ms step_avg:152.34ms step:671/1480 train_time:100700ms step_avg:152.34ms step:672/1480 train_time:100856ms step_avg:152.35ms step:673/1480 train_time:101013ms step_avg:152.36ms step:674/1480 train_time:101170ms step_avg:152.36ms step:675/1480 train_time:101326ms step_avg:152.37ms step:676/1480 train_time:101482ms step_avg:152.38ms step:677/1480 train_time:101639ms step_avg:152.38ms step:678/1480 train_time:101796ms step_avg:152.39ms step:679/1480 train_time:101952ms step_avg:152.39ms step:680/1480 train_time:102108ms step_avg:152.40ms step:681/1480 train_time:102262ms step_avg:152.40ms step:682/1480 train_time:102421ms step_avg:152.41ms step:683/1480 train_time:102578ms step_avg:152.42ms step:684/1480 train_time:102735ms step_avg:152.43ms step:685/1480 train_time:102893ms step_avg:152.43ms step:686/1480 train_time:103049ms step_avg:152.44ms step:687/1480 
train_time:103205ms step_avg:152.44ms step:688/1480 train_time:103362ms step_avg:152.45ms step:689/1480 train_time:103519ms step_avg:152.46ms step:690/1480 train_time:103678ms step_avg:152.47ms step:691/1480 train_time:103835ms step_avg:152.47ms step:692/1480 train_time:103991ms step_avg:152.48ms step:693/1480 train_time:104147ms step_avg:152.48ms step:694/1480 train_time:104304ms step_avg:152.49ms step:695/1480 train_time:104460ms step_avg:152.50ms step:696/1480 train_time:104617ms step_avg:152.50ms step:697/1480 train_time:104773ms step_avg:152.51ms step:698/1480 train_time:104929ms step_avg:152.51ms step:699/1480 train_time:105085ms step_avg:152.52ms step:700/1480 train_time:105241ms step_avg:152.52ms step:701/1480 train_time:105397ms step_avg:152.53ms step:702/1480 train_time:105554ms step_avg:152.53ms step:703/1480 train_time:105713ms step_avg:152.54ms step:704/1480 train_time:105869ms step_avg:152.55ms step:705/1480 train_time:106024ms step_avg:152.55ms step:706/1480 train_time:106183ms step_avg:152.56ms step:707/1480 train_time:106340ms step_avg:152.57ms step:708/1480 train_time:106497ms step_avg:152.57ms step:709/1480 train_time:106651ms step_avg:152.58ms step:710/1480 train_time:106807ms step_avg:152.58ms step:711/1480 train_time:106962ms step_avg:152.59ms step:712/1480 train_time:107119ms step_avg:152.59ms step:713/1480 train_time:107278ms step_avg:152.60ms step:714/1480 train_time:107435ms step_avg:152.61ms step:715/1480 train_time:107591ms step_avg:152.61ms step:716/1480 train_time:107746ms step_avg:152.61ms step:717/1480 train_time:107901ms step_avg:152.62ms step:718/1480 train_time:108056ms step_avg:152.62ms step:719/1480 train_time:108213ms step_avg:152.63ms step:720/1480 train_time:108370ms step_avg:152.63ms step:721/1480 train_time:108528ms step_avg:152.64ms step:722/1480 train_time:108684ms step_avg:152.65ms step:723/1480 train_time:108841ms step_avg:152.65ms step:724/1480 train_time:108998ms step_avg:152.66ms step:725/1480 train_time:109154ms 
step_avg:152.66ms step:726/1480 train_time:109311ms step_avg:152.67ms step:727/1480 train_time:109468ms step_avg:152.68ms step:728/1480 train_time:109624ms step_avg:152.68ms step:729/1480 train_time:109780ms step_avg:152.68ms step:730/1480 train_time:109938ms step_avg:152.69ms step:731/1480 train_time:110096ms step_avg:152.70ms step:732/1480 train_time:110251ms step_avg:152.70ms step:733/1480 train_time:110407ms step_avg:152.71ms step:734/1480 train_time:110564ms step_avg:152.71ms step:735/1480 train_time:110721ms step_avg:152.72ms step:736/1480 train_time:110878ms step_avg:152.72ms step:737/1480 train_time:111033ms step_avg:152.73ms step:738/1480 train_time:111188ms step_avg:152.73ms step:739/1480 train_time:111343ms step_avg:152.73ms step:740/1480 train_time:111501ms step_avg:152.74ms step:741/1480 train_time:111658ms step_avg:152.75ms step:742/1480 train_time:111815ms step_avg:152.75ms step:743/1480 train_time:111972ms step_avg:152.76ms step:744/1480 train_time:112128ms step_avg:152.76ms step:745/1480 train_time:112286ms step_avg:152.77ms step:746/1480 train_time:112442ms step_avg:152.77ms step:747/1480 train_time:112599ms step_avg:152.78ms step:748/1480 train_time:112757ms step_avg:152.79ms step:749/1480 train_time:112915ms step_avg:152.79ms step:750/1480 train_time:113072ms step_avg:152.80ms step:750/1480 val_loss:3.5511 train_time:113143ms step_avg:152.90ms step:751/1480 train_time:113244ms step_avg:152.83ms step:752/1480 train_time:113390ms step_avg:152.82ms step:753/1480 train_time:113546ms step_avg:152.82ms step:754/1480 train_time:113701ms step_avg:152.82ms step:755/1480 train_time:113857ms step_avg:152.83ms step:756/1480 train_time:114014ms step_avg:152.83ms step:757/1480 train_time:114172ms step_avg:152.84ms step:758/1480 train_time:114328ms step_avg:152.85ms step:759/1480 train_time:114501ms step_avg:152.87ms step:760/1480 train_time:114647ms step_avg:152.86ms step:761/1480 train_time:114803ms step_avg:152.87ms step:762/1480 train_time:114958ms 
step_avg:152.87ms step:763/1480 train_time:115115ms step_avg:152.88ms step:764/1480 train_time:115273ms step_avg:152.88ms step:765/1480 train_time:115429ms step_avg:152.89ms step:766/1480 train_time:115588ms step_avg:152.89ms step:767/1480 train_time:115746ms step_avg:152.90ms step:768/1480 train_time:115902ms step_avg:152.91ms step:769/1480 train_time:116059ms step_avg:152.91ms step:770/1480 train_time:116216ms step_avg:152.92ms step:771/1480 train_time:116375ms step_avg:152.92ms step:772/1480 train_time:116532ms step_avg:152.93ms step:773/1480 train_time:116691ms step_avg:152.94ms step:774/1480 train_time:116849ms step_avg:152.94ms step:775/1480 train_time:117006ms step_avg:152.95ms step:776/1480 train_time:117165ms step_avg:152.96ms step:777/1480 train_time:117327ms step_avg:152.97ms step:778/1480 train_time:117486ms step_avg:152.98ms step:779/1480 train_time:117644ms step_avg:152.98ms step:780/1480 train_time:117801ms step_avg:152.99ms step:781/1480 train_time:117958ms step_avg:152.99ms step:782/1480 train_time:118116ms step_avg:153.00ms step:783/1480 train_time:118273ms step_avg:153.01ms step:784/1480 train_time:118431ms step_avg:153.01ms step:785/1480 train_time:118590ms step_avg:153.02ms step:786/1480 train_time:118747ms step_avg:153.02ms step:787/1480 train_time:118905ms step_avg:153.03ms step:788/1480 train_time:119066ms step_avg:153.04ms step:789/1480 train_time:119224ms step_avg:153.05ms step:790/1480 train_time:119381ms step_avg:153.05ms step:791/1480 train_time:119540ms step_avg:153.06ms step:792/1480 train_time:119697ms step_avg:153.07ms step:793/1480 train_time:119855ms step_avg:153.07ms step:794/1480 train_time:120014ms step_avg:153.08ms step:795/1480 train_time:120173ms step_avg:153.09ms step:796/1480 train_time:120331ms step_avg:153.09ms step:797/1480 train_time:120490ms step_avg:153.10ms step:798/1480 train_time:120647ms step_avg:153.11ms step:799/1480 train_time:120808ms step_avg:153.11ms step:800/1480 train_time:120967ms step_avg:153.12ms 
step:801/1480 train_time:121124ms step_avg:153.13ms step:802/1480 train_time:121281ms step_avg:153.13ms step:803/1480 train_time:121439ms step_avg:153.14ms step:804/1480 train_time:121596ms step_avg:153.14ms step:805/1480 train_time:121757ms step_avg:153.15ms step:806/1480 train_time:121914ms step_avg:153.16ms step:807/1480 train_time:122073ms step_avg:153.17ms step:808/1480 train_time:122231ms step_avg:153.17ms step:809/1480 train_time:122389ms step_avg:153.18ms step:810/1480 train_time:122546ms step_avg:153.18ms step:811/1480 train_time:122703ms step_avg:153.19ms step:812/1480 train_time:122860ms step_avg:153.19ms step:813/1480 train_time:123018ms step_avg:153.20ms step:814/1480 train_time:123176ms step_avg:153.20ms step:815/1480 train_time:123333ms step_avg:153.21ms step:816/1480 train_time:123494ms step_avg:153.22ms step:817/1480 train_time:123652ms step_avg:153.22ms step:818/1480 train_time:123808ms step_avg:153.23ms step:819/1480 train_time:123966ms step_avg:153.23ms step:820/1480 train_time:124124ms step_avg:153.24ms step:821/1480 train_time:124282ms step_avg:153.24ms step:822/1480 train_time:124439ms step_avg:153.25ms step:823/1480 train_time:124596ms step_avg:153.25ms step:824/1480 train_time:124753ms step_avg:153.26ms step:825/1480 train_time:124911ms step_avg:153.27ms step:826/1480 train_time:125071ms step_avg:153.27ms step:827/1480 train_time:125229ms step_avg:153.28ms step:828/1480 train_time:125387ms step_avg:153.28ms step:829/1480 train_time:125547ms step_avg:153.29ms step:830/1480 train_time:125707ms step_avg:153.30ms step:831/1480 train_time:125865ms step_avg:153.31ms step:832/1480 train_time:126023ms step_avg:153.31ms step:833/1480 train_time:126181ms step_avg:153.32ms step:834/1480 train_time:126342ms step_avg:153.33ms step:835/1480 train_time:126500ms step_avg:153.33ms step:836/1480 train_time:126659ms step_avg:153.34ms step:837/1480 train_time:126816ms step_avg:153.35ms step:838/1480 train_time:126975ms step_avg:153.35ms step:839/1480 
train_time:127132ms step_avg:153.36ms step:840/1480 train_time:127290ms step_avg:153.36ms step:841/1480 train_time:127446ms step_avg:153.36ms step:842/1480 train_time:127605ms step_avg:153.37ms step:843/1480 train_time:127762ms step_avg:153.38ms step:844/1480 train_time:127919ms step_avg:153.38ms step:845/1480 train_time:128076ms step_avg:153.38ms step:846/1480 train_time:128235ms step_avg:153.39ms step:847/1480 train_time:128395ms step_avg:153.40ms step:848/1480 train_time:128553ms step_avg:153.40ms step:849/1480 train_time:128711ms step_avg:153.41ms step:850/1480 train_time:128868ms step_avg:153.41ms step:851/1480 train_time:129027ms step_avg:153.42ms step:852/1480 train_time:129185ms step_avg:153.43ms step:853/1480 train_time:129343ms step_avg:153.43ms step:854/1480 train_time:129500ms step_avg:153.44ms step:855/1480 train_time:129657ms step_avg:153.44ms step:856/1480 train_time:129815ms step_avg:153.45ms step:857/1480 train_time:129974ms step_avg:153.45ms step:858/1480 train_time:130134ms step_avg:153.46ms step:859/1480 train_time:130292ms step_avg:153.47ms step:860/1480 train_time:130449ms step_avg:153.47ms step:861/1480 train_time:130607ms step_avg:153.48ms step:862/1480 train_time:130771ms step_avg:153.49ms step:863/1480 train_time:130930ms step_avg:153.49ms step:864/1480 train_time:131089ms step_avg:153.50ms step:865/1480 train_time:131247ms step_avg:153.51ms step:866/1480 train_time:131405ms step_avg:153.51ms step:867/1480 train_time:131566ms step_avg:153.52ms step:868/1480 train_time:131724ms step_avg:153.52ms step:869/1480 train_time:131882ms step_avg:153.53ms step:870/1480 train_time:132039ms step_avg:153.53ms step:871/1480 train_time:132196ms step_avg:153.54ms step:872/1480 train_time:132355ms step_avg:153.54ms step:873/1480 train_time:132512ms step_avg:153.55ms step:874/1480 train_time:132672ms step_avg:153.56ms step:875/1480 train_time:132831ms step_avg:153.56ms step:875/1480 val_loss:3.5068 train_time:132903ms step_avg:153.64ms step:876/1480 
train_time:133002ms step_avg:153.58ms step:877/1480 train_time:133151ms step_avg:153.58ms step:878/1480 train_time:133309ms step_avg:153.58ms step:879/1480 train_time:133467ms step_avg:153.59ms step:880/1480 train_time:133624ms step_avg:153.59ms step:881/1480 train_time:133782ms step_avg:153.60ms step:882/1480 train_time:133943ms step_avg:153.60ms step:883/1480 train_time:134102ms step_avg:153.61ms step:884/1480 train_time:134263ms step_avg:153.62ms step:885/1480 train_time:134423ms step_avg:153.63ms step:886/1480 train_time:134587ms step_avg:153.64ms step:887/1480 train_time:134746ms step_avg:153.64ms step:888/1480 train_time:134909ms step_avg:153.65ms step:889/1480 train_time:135069ms step_avg:153.66ms step:890/1480 train_time:135226ms step_avg:153.67ms step:891/1480 train_time:135384ms step_avg:153.67ms step:892/1480 train_time:135545ms step_avg:153.68ms step:893/1480 train_time:135703ms step_avg:153.68ms step:894/1480 train_time:135864ms step_avg:153.69ms step:895/1480 train_time:136027ms step_avg:153.70ms step:896/1480 train_time:136185ms step_avg:153.71ms step:897/1480 train_time:136346ms step_avg:153.72ms step:898/1480 train_time:136505ms step_avg:153.72ms step:899/1480 train_time:136663ms step_avg:153.73ms step:900/1480 train_time:136822ms step_avg:153.73ms step:901/1480 train_time:136983ms step_avg:153.74ms step:902/1480 train_time:137141ms step_avg:153.75ms step:903/1480 train_time:137302ms step_avg:153.75ms step:904/1480 train_time:137463ms step_avg:153.76ms step:905/1480 train_time:137621ms step_avg:153.77ms step:906/1480 train_time:137781ms step_avg:153.77ms step:907/1480 train_time:137943ms step_avg:153.78ms step:908/1480 train_time:138101ms step_avg:153.79ms step:909/1480 train_time:138262ms step_avg:153.79ms step:910/1480 train_time:138427ms step_avg:153.81ms step:911/1480 train_time:138586ms step_avg:153.81ms step:912/1480 train_time:138746ms step_avg:153.82ms step:913/1480 train_time:138906ms step_avg:153.83ms step:914/1480 train_time:139066ms 
step_avg:153.83ms step:915/1480 train_time:139229ms step_avg:153.84ms step:916/1480 train_time:139389ms step_avg:153.85ms step:917/1480 train_time:139548ms step_avg:153.86ms step:918/1480 train_time:139708ms step_avg:153.86ms step:919/1480 train_time:139871ms step_avg:153.87ms step:920/1480 train_time:140031ms step_avg:153.88ms step:921/1480 train_time:140190ms step_avg:153.89ms step:922/1480 train_time:140352ms step_avg:153.89ms step:923/1480 train_time:140508ms step_avg:153.90ms step:924/1480 train_time:140666ms step_avg:153.90ms step:925/1480 train_time:140828ms step_avg:153.91ms step:926/1480 train_time:140986ms step_avg:153.92ms step:927/1480 train_time:141145ms step_avg:153.92ms step:928/1480 train_time:141304ms step_avg:153.93ms step:929/1480 train_time:141466ms step_avg:153.93ms step:930/1480 train_time:141626ms step_avg:153.94ms step:931/1480 train_time:141785ms step_avg:153.95ms step:932/1480 train_time:141944ms step_avg:153.95ms step:933/1480 train_time:142103ms step_avg:153.96ms step:934/1480 train_time:142263ms step_avg:153.96ms step:935/1480 train_time:142424ms step_avg:153.97ms step:936/1480 train_time:142584ms step_avg:153.98ms step:937/1480 train_time:142744ms step_avg:153.98ms step:938/1480 train_time:142902ms step_avg:153.99ms step:939/1480 train_time:143065ms step_avg:154.00ms step:940/1480 train_time:143225ms step_avg:154.01ms step:941/1480 train_time:143384ms step_avg:154.01ms step:942/1480 train_time:143543ms step_avg:154.02ms step:943/1480 train_time:143701ms step_avg:154.02ms step:944/1480 train_time:143863ms step_avg:154.03ms step:945/1480 train_time:144024ms step_avg:154.04ms step:946/1480 train_time:144188ms step_avg:154.05ms step:947/1480 train_time:144350ms step_avg:154.06ms step:948/1480 train_time:144510ms step_avg:154.06ms step:949/1480 train_time:144684ms step_avg:154.08ms step:950/1480 train_time:144827ms step_avg:154.07ms step:951/1480 train_time:144989ms step_avg:154.08ms step:952/1480 train_time:145149ms step_avg:154.09ms 
step:953/1480 train_time:145309ms step_avg:154.09ms step:954/1480 train_time:145469ms step_avg:154.10ms step:955/1480 train_time:145628ms step_avg:154.10ms step:956/1480 train_time:145788ms step_avg:154.11ms step:957/1480 train_time:145946ms step_avg:154.11ms step:958/1480 train_time:146109ms step_avg:154.12ms step:959/1480 train_time:146267ms step_avg:154.13ms step:960/1480 train_time:146428ms step_avg:154.13ms step:961/1480 train_time:146588ms step_avg:154.14ms step:962/1480 train_time:146745ms step_avg:154.14ms step:963/1480 train_time:146905ms step_avg:154.15ms step:964/1480 train_time:147066ms step_avg:154.16ms step:965/1480 train_time:147225ms step_avg:154.16ms step:966/1480 train_time:147386ms step_avg:154.17ms step:967/1480 train_time:147546ms step_avg:154.18ms step:968/1480 train_time:147705ms step_avg:154.18ms step:969/1480 train_time:147868ms step_avg:154.19ms step:970/1480 train_time:148025ms step_avg:154.19ms step:971/1480 train_time:148185ms step_avg:154.20ms step:972/1480 train_time:148343ms step_avg:154.20ms step:973/1480 train_time:148501ms step_avg:154.21ms step:974/1480 train_time:148661ms step_avg:154.21ms step:975/1480 train_time:148822ms step_avg:154.22ms step:976/1480 train_time:148981ms step_avg:154.22ms step:977/1480 train_time:149141ms step_avg:154.23ms step:978/1480 train_time:149300ms step_avg:154.24ms step:979/1480 train_time:149460ms step_avg:154.24ms step:980/1480 train_time:149619ms step_avg:154.25ms step:981/1480 train_time:149780ms step_avg:154.25ms step:982/1480 train_time:149938ms step_avg:154.26ms step:983/1480 train_time:150098ms step_avg:154.26ms step:984/1480 train_time:150257ms step_avg:154.27ms step:985/1480 train_time:150420ms step_avg:154.28ms step:986/1480 train_time:150579ms step_avg:154.28ms step:987/1480 train_time:150737ms step_avg:154.29ms step:988/1480 train_time:150897ms step_avg:154.29ms step:989/1480 train_time:151056ms step_avg:154.30ms step:990/1480 train_time:151218ms step_avg:154.30ms step:991/1480 
train_time:151379ms step_avg:154.31ms step:992/1480 train_time:151541ms step_avg:154.32ms step:993/1480 train_time:151708ms step_avg:154.33ms step:994/1480 train_time:151869ms step_avg:154.34ms step:995/1480 train_time:152029ms step_avg:154.34ms step:996/1480 train_time:152187ms step_avg:154.35ms step:997/1480 train_time:152346ms step_avg:154.35ms step:998/1480 train_time:152505ms step_avg:154.36ms step:999/1480 train_time:152667ms step_avg:154.36ms step:1000/1480 train_time:152828ms step_avg:154.37ms step:1000/1480 val_loss:3.4430 train_time:152902ms step_avg:154.45ms step:1001/1480 train_time:152995ms step_avg:154.38ms step:1002/1480 train_time:153151ms step_avg:154.39ms step:1003/1480 train_time:153314ms step_avg:154.40ms step:1004/1480 train_time:153476ms step_avg:154.40ms step:1005/1480 train_time:153636ms step_avg:154.41ms step:1006/1480 train_time:153796ms step_avg:154.41ms step:1007/1480 train_time:153956ms step_avg:154.42ms step:1008/1480 train_time:154117ms step_avg:154.43ms step:1009/1480 train_time:154283ms step_avg:154.44ms step:1010/1480 train_time:154443ms step_avg:154.44ms step:1011/1480 train_time:154604ms step_avg:154.45ms step:1012/1480 train_time:154762ms step_avg:154.45ms step:1013/1480 train_time:154923ms step_avg:154.46ms step:1014/1480 train_time:155084ms step_avg:154.47ms step:1015/1480 train_time:155246ms step_avg:154.47ms step:1016/1480 train_time:155406ms step_avg:154.48ms step:1017/1480 train_time:155567ms step_avg:154.49ms step:1018/1480 train_time:155728ms step_avg:154.49ms step:1019/1480 train_time:155890ms step_avg:154.50ms step:1020/1480 train_time:156050ms step_avg:154.50ms step:1021/1480 train_time:156210ms step_avg:154.51ms step:1022/1480 train_time:156371ms step_avg:154.52ms step:1023/1480 train_time:156532ms step_avg:154.52ms step:1024/1480 train_time:156694ms step_avg:154.53ms step:1025/1480 train_time:156856ms step_avg:154.54ms step:1026/1480 train_time:157016ms step_avg:154.54ms step:1027/1480 train_time:157175ms 
step_avg:154.55ms step:1028/1480 train_time:157338ms step_avg:154.56ms step:1029/1480 train_time:157500ms step_avg:154.56ms step:1030/1480 train_time:157663ms step_avg:154.57ms step:1031/1480 train_time:157821ms step_avg:154.58ms step:1032/1480 train_time:157985ms step_avg:154.58ms step:1033/1480 train_time:158146ms step_avg:154.59ms step:1034/1480 train_time:158307ms step_avg:154.60ms step:1035/1480 train_time:158467ms step_avg:154.60ms step:1036/1480 train_time:158627ms step_avg:154.61ms step:1037/1480 train_time:158788ms step_avg:154.61ms step:1038/1480 train_time:158947ms step_avg:154.62ms step:1039/1480 train_time:159108ms step_avg:154.62ms step:1040/1480 train_time:159268ms step_avg:154.63ms step:1041/1480 train_time:159428ms step_avg:154.63ms step:1042/1480 train_time:159587ms step_avg:154.64ms step:1043/1480 train_time:159746ms step_avg:154.64ms step:1044/1480 train_time:159906ms step_avg:154.65ms step:1045/1480 train_time:160066ms step_avg:154.65ms step:1046/1480 train_time:160227ms step_avg:154.66ms step:1047/1480 train_time:160387ms step_avg:154.66ms step:1048/1480 train_time:160548ms step_avg:154.67ms step:1049/1480 train_time:160708ms step_avg:154.68ms step:1050/1480 train_time:160870ms step_avg:154.68ms step:1051/1480 train_time:161031ms step_avg:154.69ms step:1052/1480 train_time:161194ms step_avg:154.70ms step:1053/1480 train_time:161354ms step_avg:154.70ms step:1054/1480 train_time:161516ms step_avg:154.71ms step:1055/1480 train_time:161675ms step_avg:154.71ms step:1056/1480 train_time:161835ms step_avg:154.72ms step:1057/1480 train_time:161994ms step_avg:154.72ms step:1058/1480 train_time:162154ms step_avg:154.73ms step:1059/1480 train_time:162318ms step_avg:154.74ms step:1060/1480 train_time:162479ms step_avg:154.74ms step:1061/1480 train_time:162638ms step_avg:154.75ms step:1062/1480 train_time:162798ms step_avg:154.75ms step:1063/1480 train_time:162957ms step_avg:154.76ms step:1064/1480 train_time:163116ms step_avg:154.76ms step:1065/1480 
train_time:163277ms step_avg:154.77ms step:1066/1480 train_time:163440ms step_avg:154.77ms step:1067/1480 train_time:163604ms step_avg:154.78ms step:1068/1480 train_time:163764ms step_avg:154.79ms step:1069/1480 train_time:163930ms step_avg:154.80ms step:1070/1480 train_time:164090ms step_avg:154.80ms step:1071/1480 train_time:164253ms step_avg:154.81ms step:1072/1480 train_time:164414ms step_avg:154.82ms step:1073/1480 train_time:164572ms step_avg:154.82ms step:1074/1480 train_time:164730ms step_avg:154.82ms step:1075/1480 train_time:164893ms step_avg:154.83ms step:1076/1480 train_time:165052ms step_avg:154.83ms step:1077/1480 train_time:165213ms step_avg:154.84ms step:1078/1480 train_time:165379ms step_avg:154.85ms step:1079/1480 train_time:165542ms step_avg:154.86ms step:1080/1480 train_time:165704ms step_avg:154.86ms step:1081/1480 train_time:165865ms step_avg:154.87ms step:1082/1480 train_time:166025ms step_avg:154.87ms step:1083/1480 train_time:166185ms step_avg:154.88ms step:1084/1480 train_time:166346ms step_avg:154.88ms step:1085/1480 train_time:166506ms step_avg:154.89ms step:1086/1480 train_time:166668ms step_avg:154.90ms step:1087/1480 train_time:166829ms step_avg:154.90ms step:1088/1480 train_time:166989ms step_avg:154.91ms step:1089/1480 train_time:167151ms step_avg:154.91ms step:1090/1480 train_time:167315ms step_avg:154.92ms step:1091/1480 train_time:167476ms step_avg:154.93ms step:1092/1480 train_time:167636ms step_avg:154.93ms step:1093/1480 train_time:167798ms step_avg:154.94ms step:1094/1480 train_time:167957ms step_avg:154.94ms step:1095/1480 train_time:168117ms step_avg:154.95ms step:1096/1480 train_time:168281ms step_avg:154.96ms step:1097/1480 train_time:168443ms step_avg:154.96ms step:1098/1480 train_time:168607ms step_avg:154.97ms step:1099/1480 train_time:168768ms step_avg:154.97ms step:1100/1480 train_time:168931ms step_avg:154.98ms step:1101/1480 train_time:169097ms step_avg:154.99ms step:1102/1480 train_time:169258ms step_avg:155.00ms 
step:1103/1480 train_time:169424ms step_avg:155.01ms step:1104/1480 train_time:169586ms step_avg:155.01ms step:1105/1480 train_time:169748ms step_avg:155.02ms step:1106/1480 train_time:169911ms step_avg:155.03ms step:1107/1480 train_time:170073ms step_avg:155.03ms step:1108/1480 train_time:170234ms step_avg:155.04ms step:1109/1480 train_time:170395ms step_avg:155.05ms step:1110/1480 train_time:170554ms step_avg:155.05ms step:1111/1480 train_time:170715ms step_avg:155.05ms step:1112/1480 train_time:170877ms step_avg:155.06ms step:1113/1480 train_time:171045ms step_avg:155.07ms step:1114/1480 train_time:171207ms step_avg:155.08ms step:1115/1480 train_time:171370ms step_avg:155.09ms step:1116/1480 train_time:171530ms step_avg:155.09ms step:1117/1480 train_time:171694ms step_avg:155.10ms step:1118/1480 train_time:171857ms step_avg:155.11ms step:1119/1480 train_time:172019ms step_avg:155.11ms step:1120/1480 train_time:172181ms step_avg:155.12ms step:1121/1480 train_time:172344ms step_avg:155.13ms step:1122/1480 train_time:172506ms step_avg:155.13ms step:1123/1480 train_time:172666ms step_avg:155.14ms step:1124/1480 train_time:172830ms step_avg:155.14ms step:1125/1480 train_time:172992ms step_avg:155.15ms step:1125/1480 val_loss:3.3869 train_time:173068ms step_avg:155.22ms step:1126/1480 train_time:173158ms step_avg:155.16ms step:1127/1480 train_time:173319ms step_avg:155.16ms step:1128/1480 train_time:173480ms step_avg:155.17ms step:1129/1480 train_time:173643ms step_avg:155.18ms step:1130/1480 train_time:173803ms step_avg:155.18ms step:1131/1480 train_time:173972ms step_avg:155.19ms step:1132/1480 train_time:174131ms step_avg:155.20ms step:1133/1480 train_time:174294ms step_avg:155.20ms step:1134/1480 train_time:174460ms step_avg:155.21ms step:1135/1480 train_time:174621ms step_avg:155.22ms step:1136/1480 train_time:174783ms step_avg:155.22ms step:1137/1480 train_time:174944ms step_avg:155.23ms step:1138/1480 train_time:175108ms step_avg:155.24ms step:1139/1480 
train_time:175285ms step_avg:155.26ms step:1140/1480 train_time:175432ms step_avg:155.25ms step:1141/1480 train_time:175597ms step_avg:155.26ms step:1142/1480 train_time:175757ms step_avg:155.26ms step:1143/1480 train_time:175919ms step_avg:155.27ms step:1144/1480 train_time:176080ms step_avg:155.27ms step:1145/1480 train_time:176240ms step_avg:155.28ms step:1146/1480 train_time:176402ms step_avg:155.28ms step:1147/1480 train_time:176563ms step_avg:155.29ms step:1148/1480 train_time:176724ms step_avg:155.29ms step:1149/1480 train_time:176888ms step_avg:155.30ms step:1150/1480 train_time:177048ms step_avg:155.30ms step:1151/1480 train_time:177214ms step_avg:155.31ms step:1152/1480 train_time:177379ms step_avg:155.32ms step:1153/1480 train_time:177544ms step_avg:155.33ms step:1154/1480 train_time:177704ms step_avg:155.34ms step:1155/1480 train_time:177865ms step_avg:155.34ms step:1156/1480 train_time:178033ms step_avg:155.35ms step:1157/1480 train_time:178196ms step_avg:155.36ms step:1158/1480 train_time:178358ms step_avg:155.36ms step:1159/1480 train_time:178519ms step_avg:155.37ms step:1160/1480 train_time:178679ms step_avg:155.37ms step:1161/1480 train_time:178841ms step_avg:155.38ms step:1162/1480 train_time:179003ms step_avg:155.38ms step:1163/1480 train_time:179165ms step_avg:155.39ms step:1164/1480 train_time:179327ms step_avg:155.40ms step:1165/1480 train_time:179489ms step_avg:155.40ms step:1166/1480 train_time:179651ms step_avg:155.41ms step:1167/1480 train_time:179812ms step_avg:155.41ms step:1168/1480 train_time:179976ms step_avg:155.42ms step:1169/1480 train_time:180139ms step_avg:155.43ms step:1170/1480 train_time:180299ms step_avg:155.43ms step:1171/1480 train_time:180460ms step_avg:155.43ms step:1172/1480 train_time:180620ms step_avg:155.44ms step:1173/1480 train_time:180781ms step_avg:155.44ms step:1174/1480 train_time:180950ms step_avg:155.46ms step:1175/1480 train_time:181113ms step_avg:155.46ms step:1176/1480 train_time:181277ms step_avg:155.47ms 
step:1177/1480 train_time:181442ms step_avg:155.48ms step:1178/1480 train_time:181601ms step_avg:155.48ms step:1179/1480 train_time:181762ms step_avg:155.49ms step:1180/1480 train_time:181932ms step_avg:155.50ms step:1181/1480 train_time:182096ms step_avg:155.50ms step:1182/1480 train_time:182257ms step_avg:155.51ms step:1183/1480 train_time:182418ms step_avg:155.51ms step:1184/1480 train_time:182579ms step_avg:155.52ms step:1185/1480 train_time:182743ms step_avg:155.53ms step:1186/1480 train_time:182905ms step_avg:155.53ms step:1187/1480 train_time:183078ms step_avg:155.55ms step:1188/1480 train_time:183238ms step_avg:155.55ms step:1189/1480 train_time:183400ms step_avg:155.56ms step:1190/1480 train_time:183562ms step_avg:155.56ms step:1191/1480 train_time:183725ms step_avg:155.57ms step:1192/1480 train_time:183885ms step_avg:155.57ms step:1193/1480 train_time:184047ms step_avg:155.58ms step:1194/1480 train_time:184208ms step_avg:155.58ms step:1195/1480 train_time:184370ms step_avg:155.59ms step:1196/1480 train_time:184541ms step_avg:155.60ms step:1197/1480 train_time:184704ms step_avg:155.61ms step:1198/1480 train_time:184873ms step_avg:155.62ms step:1199/1480 train_time:185036ms step_avg:155.62ms step:1200/1480 train_time:185196ms step_avg:155.63ms step:1201/1480 train_time:185357ms step_avg:155.63ms step:1202/1480 train_time:185526ms step_avg:155.64ms step:1203/1480 train_time:185692ms step_avg:155.65ms step:1204/1480 train_time:185857ms step_avg:155.66ms step:1205/1480 train_time:186019ms step_avg:155.66ms step:1206/1480 train_time:186180ms step_avg:155.67ms step:1207/1480 train_time:186341ms step_avg:155.67ms step:1208/1480 train_time:186501ms step_avg:155.68ms step:1209/1480 train_time:186665ms step_avg:155.68ms step:1210/1480 train_time:186832ms step_avg:155.69ms step:1211/1480 train_time:186996ms step_avg:155.70ms step:1212/1480 train_time:187159ms step_avg:155.71ms step:1213/1480 train_time:187323ms step_avg:155.71ms step:1214/1480 train_time:187489ms 
step_avg:155.72ms step:1215/1480 train_time:187652ms step_avg:155.73ms step:1216/1480 train_time:187813ms step_avg:155.73ms step:1217/1480 train_time:187976ms step_avg:155.74ms step:1218/1480 train_time:188139ms step_avg:155.74ms step:1219/1480 train_time:188306ms step_avg:155.75ms step:1220/1480 train_time:188469ms step_avg:155.76ms step:1221/1480 train_time:188630ms step_avg:155.76ms step:1222/1480 train_time:188790ms step_avg:155.77ms step:1223/1480 train_time:188953ms step_avg:155.77ms step:1224/1480 train_time:189121ms step_avg:155.78ms step:1225/1480 train_time:189284ms step_avg:155.79ms step:1226/1480 train_time:189449ms step_avg:155.80ms step:1227/1480 train_time:189615ms step_avg:155.81ms step:1228/1480 train_time:189778ms step_avg:155.81ms step:1229/1480 train_time:189941ms step_avg:155.82ms step:1230/1480 train_time:190108ms step_avg:155.83ms step:1231/1480 train_time:190273ms step_avg:155.83ms step:1232/1480 train_time:190439ms step_avg:155.84ms step:1233/1480 train_time:190600ms step_avg:155.85ms step:1234/1480 train_time:190761ms step_avg:155.85ms step:1235/1480 train_time:190928ms step_avg:155.86ms step:1236/1480 train_time:191091ms step_avg:155.87ms step:1237/1480 train_time:191253ms step_avg:155.87ms step:1238/1480 train_time:191423ms step_avg:155.88ms step:1239/1480 train_time:191586ms step_avg:155.89ms step:1240/1480 train_time:191751ms step_avg:155.90ms step:1241/1480 train_time:191917ms step_avg:155.90ms step:1242/1480 train_time:192079ms step_avg:155.91ms step:1243/1480 train_time:192243ms step_avg:155.91ms step:1244/1480 train_time:192402ms step_avg:155.92ms step:1245/1480 train_time:192564ms step_avg:155.92ms step:1246/1480 train_time:192728ms step_avg:155.93ms step:1247/1480 train_time:192893ms step_avg:155.94ms step:1248/1480 train_time:193054ms step_avg:155.94ms step:1249/1480 train_time:193216ms step_avg:155.95ms step:1250/1480 train_time:193378ms step_avg:155.95ms step:1250/1480 val_loss:3.3372 train_time:193455ms step_avg:156.01ms 
step:1251/1480 train_time:193547ms step_avg:155.96ms step:1252/1480 train_time:193709ms step_avg:155.97ms step:1253/1480 train_time:193871ms step_avg:155.97ms step:1254/1480 train_time:194031ms step_avg:155.97ms step:1255/1480 train_time:194202ms step_avg:155.99ms step:1256/1480 train_time:194367ms step_avg:155.99ms step:1257/1480 train_time:194527ms step_avg:156.00ms step:1258/1480 train_time:194692ms step_avg:156.00ms step:1259/1480 train_time:194856ms step_avg:156.01ms step:1260/1480 train_time:195016ms step_avg:156.01ms step:1261/1480 train_time:195179ms step_avg:156.02ms step:1262/1480 train_time:195343ms step_avg:156.02ms step:1263/1480 train_time:195510ms step_avg:156.03ms step:1264/1480 train_time:195669ms step_avg:156.04ms step:1265/1480 train_time:195830ms step_avg:156.04ms step:1266/1480 train_time:195995ms step_avg:156.05ms step:1267/1480 train_time:196157ms step_avg:156.05ms step:1268/1480 train_time:196321ms step_avg:156.06ms step:1269/1480 train_time:196486ms step_avg:156.07ms step:1270/1480 train_time:196648ms step_avg:156.07ms step:1271/1480 train_time:196813ms step_avg:156.08ms step:1272/1480 train_time:196973ms step_avg:156.08ms step:1273/1480 train_time:197136ms step_avg:156.09ms step:1274/1480 train_time:197302ms step_avg:156.09ms step:1275/1480 train_time:197462ms step_avg:156.10ms step:1276/1480 train_time:197622ms step_avg:156.10ms step:1277/1480 train_time:197785ms step_avg:156.11ms step:1278/1480 train_time:197945ms step_avg:156.11ms step:1279/1480 train_time:198107ms step_avg:156.11ms step:1280/1480 train_time:198274ms step_avg:156.12ms step:1281/1480 train_time:198436ms step_avg:156.13ms step:1282/1480 train_time:198597ms step_avg:156.13ms step:1283/1480 train_time:198760ms step_avg:156.14ms step:1284/1480 train_time:198923ms step_avg:156.14ms step:1285/1480 train_time:199085ms step_avg:156.15ms step:1286/1480 train_time:199246ms step_avg:156.15ms step:1287/1480 train_time:199409ms step_avg:156.15ms step:1288/1480 train_time:199573ms 
step_avg:156.16ms step:1289/1480 train_time:199742ms step_avg:156.17ms step:1290/1480 train_time:199910ms step_avg:156.18ms step:1291/1480 train_time:200075ms step_avg:156.19ms step:1292/1480 train_time:200240ms step_avg:156.19ms step:1293/1480 train_time:200407ms step_avg:156.20ms step:1294/1480 train_time:200570ms step_avg:156.21ms step:1295/1480 train_time:200734ms step_avg:156.21ms step:1296/1480 train_time:200898ms step_avg:156.22ms step:1297/1480 train_time:201061ms step_avg:156.22ms step:1298/1480 train_time:201223ms step_avg:156.23ms step:1299/1480 train_time:201385ms step_avg:156.23ms step:1300/1480 train_time:201546ms step_avg:156.24ms step:1301/1480 train_time:201706ms step_avg:156.24ms step:1302/1480 train_time:201873ms step_avg:156.25ms step:1303/1480 train_time:202043ms step_avg:156.26ms step:1304/1480 train_time:202207ms step_avg:156.27ms step:1305/1480 train_time:202368ms step_avg:156.27ms step:1306/1480 train_time:202534ms step_avg:156.28ms step:1307/1480 train_time:202695ms step_avg:156.28ms step:1308/1480 train_time:202858ms step_avg:156.28ms step:1309/1480 train_time:203022ms step_avg:156.29ms step:1310/1480 train_time:203185ms step_avg:156.30ms step:1311/1480 train_time:203345ms step_avg:156.30ms step:1312/1480 train_time:203509ms step_avg:156.31ms step:1313/1480 train_time:203674ms step_avg:156.31ms step:1314/1480 train_time:203838ms step_avg:156.32ms step:1315/1480 train_time:204003ms step_avg:156.32ms step:1316/1480 train_time:204162ms step_avg:156.33ms step:1317/1480 train_time:204324ms step_avg:156.33ms step:1318/1480 train_time:204492ms step_avg:156.34ms step:1319/1480 train_time:204659ms step_avg:156.35ms step:1320/1480 train_time:204825ms step_avg:156.36ms step:1321/1480 train_time:204989ms step_avg:156.36ms step:1322/1480 train_time:205159ms step_avg:156.37ms step:1323/1480 train_time:205323ms step_avg:156.38ms step:1324/1480 train_time:205488ms step_avg:156.38ms step:1325/1480 train_time:205659ms step_avg:156.39ms step:1326/1480 
train_time:205824ms step_avg:156.40ms step:1327/1480 train_time:205987ms step_avg:156.41ms step:1328/1480 train_time:206148ms step_avg:156.41ms step:1329/1480 train_time:206336ms step_avg:156.43ms step:1330/1480 train_time:206495ms step_avg:156.44ms step:1331/1480 train_time:206660ms step_avg:156.44ms step:1332/1480 train_time:206823ms step_avg:156.45ms step:1333/1480 train_time:206988ms step_avg:156.45ms step:1334/1480 train_time:207152ms step_avg:156.46ms step:1335/1480 train_time:207313ms step_avg:156.46ms step:1336/1480 train_time:207483ms step_avg:156.47ms step:1337/1480 train_time:207649ms step_avg:156.48ms step:1338/1480 train_time:207812ms step_avg:156.49ms step:1339/1480 train_time:207979ms step_avg:156.49ms step:1340/1480 train_time:208142ms step_avg:156.50ms step:1341/1480 train_time:208305ms step_avg:156.50ms step:1342/1480 train_time:208469ms step_avg:156.51ms step:1343/1480 train_time:208630ms step_avg:156.51ms step:1344/1480 train_time:208792ms step_avg:156.52ms step:1345/1480 train_time:208962ms step_avg:156.53ms step:1346/1480 train_time:209123ms step_avg:156.53ms step:1347/1480 train_time:209286ms step_avg:156.53ms step:1348/1480 train_time:209448ms step_avg:156.54ms step:1349/1480 train_time:209611ms step_avg:156.54ms step:1350/1480 train_time:209779ms step_avg:156.55ms step:1351/1480 train_time:209941ms step_avg:156.56ms step:1352/1480 train_time:210104ms step_avg:156.56ms step:1353/1480 train_time:210267ms step_avg:156.57ms step:1354/1480 train_time:210433ms step_avg:156.57ms step:1355/1480 train_time:210595ms step_avg:156.58ms step:1356/1480 train_time:210761ms step_avg:156.58ms step:1357/1480 train_time:210925ms step_avg:156.59ms step:1358/1480 train_time:211088ms step_avg:156.59ms step:1359/1480 train_time:211252ms step_avg:156.60ms step:1360/1480 train_time:211420ms step_avg:156.61ms step:1361/1480 train_time:211586ms step_avg:156.61ms step:1362/1480 train_time:211750ms step_avg:156.62ms step:1363/1480 train_time:211919ms step_avg:156.63ms 
step:1364/1480 train_time:212081ms step_avg:156.63ms step:1365/1480 train_time:212242ms step_avg:156.64ms step:1366/1480 train_time:212407ms step_avg:156.64ms step:1367/1480 train_time:212572ms step_avg:156.65ms step:1368/1480 train_time:212738ms step_avg:156.66ms step:1369/1480 train_time:212907ms step_avg:156.66ms step:1370/1480 train_time:213073ms step_avg:156.67ms step:1371/1480 train_time:213235ms step_avg:156.68ms step:1372/1480 train_time:213403ms step_avg:156.68ms step:1373/1480 train_time:213564ms step_avg:156.69ms step:1374/1480 train_time:213732ms step_avg:156.70ms step:1375/1480 train_time:213895ms step_avg:156.70ms step:1375/1480 val_loss:3.2988 train_time:213970ms step_avg:156.75ms step:1376/1480 train_time:214064ms step_avg:156.71ms step:1377/1480 train_time:214222ms step_avg:156.71ms step:1378/1480 train_time:214383ms step_avg:156.71ms step:1379/1480 train_time:214548ms step_avg:156.72ms step:1380/1480 train_time:214713ms step_avg:156.72ms step:1381/1480 train_time:214879ms step_avg:156.73ms step:1382/1480 train_time:215041ms step_avg:156.74ms step:1383/1480 train_time:215205ms step_avg:156.74ms step:1384/1480 train_time:215373ms step_avg:156.75ms step:1385/1480 train_time:215534ms step_avg:156.75ms step:1386/1480 train_time:215696ms step_avg:156.76ms step:1387/1480 train_time:215860ms step_avg:156.76ms step:1388/1480 train_time:216021ms step_avg:156.76ms step:1389/1480 train_time:216188ms step_avg:156.77ms step:1390/1480 train_time:216350ms step_avg:156.78ms step:1391/1480 train_time:216513ms step_avg:156.78ms step:1392/1480 train_time:216675ms step_avg:156.78ms step:1393/1480 train_time:216838ms step_avg:156.79ms step:1394/1480 train_time:217001ms step_avg:156.79ms step:1395/1480 train_time:217162ms step_avg:156.80ms step:1396/1480 train_time:217324ms step_avg:156.80ms step:1397/1480 train_time:217484ms step_avg:156.80ms step:1398/1480 train_time:217647ms step_avg:156.81ms step:1399/1480 train_time:217810ms step_avg:156.81ms step:1400/1480 
train_time:217977ms step_avg:156.82ms step:1401/1480 train_time:218139ms step_avg:156.82ms step:1402/1480 train_time:218301ms step_avg:156.83ms step:1403/1480 train_time:218466ms step_avg:156.83ms step:1404/1480 train_time:218629ms step_avg:156.84ms step:1405/1480 train_time:218793ms step_avg:156.84ms step:1406/1480 train_time:218959ms step_avg:156.85ms step:1407/1480 train_time:219120ms step_avg:156.85ms step:1408/1480 train_time:219281ms step_avg:156.85ms step:1409/1480 train_time:219456ms step_avg:156.87ms step:1410/1480 train_time:219619ms step_avg:156.87ms step:1411/1480 train_time:219778ms step_avg:156.87ms step:1412/1480 train_time:219941ms step_avg:156.88ms step:1413/1480 train_time:220105ms step_avg:156.88ms step:1414/1480 train_time:220268ms step_avg:156.89ms step:1415/1480 train_time:220435ms step_avg:156.89ms step:1416/1480 train_time:220608ms step_avg:156.90ms step:1417/1480 train_time:220773ms step_avg:156.91ms step:1418/1480 train_time:220937ms step_avg:156.92ms step:1419/1480 train_time:221102ms step_avg:156.92ms step:1420/1480 train_time:221266ms step_avg:156.93ms step:1421/1480 train_time:221435ms step_avg:156.94ms step:1422/1480 train_time:221599ms step_avg:156.94ms step:1423/1480 train_time:221760ms step_avg:156.94ms step:1424/1480 train_time:221927ms step_avg:156.95ms step:1425/1480 train_time:222098ms step_avg:156.96ms step:1426/1480 train_time:222261ms step_avg:156.96ms step:1427/1480 train_time:222428ms step_avg:156.97ms step:1428/1480 train_time:222590ms step_avg:156.97ms step:1429/1480 train_time:222751ms step_avg:156.98ms step:1430/1480 train_time:222917ms step_avg:156.98ms step:1431/1480 train_time:223083ms step_avg:156.99ms step:1432/1480 train_time:223252ms step_avg:157.00ms step:1433/1480 train_time:223420ms step_avg:157.01ms step:1434/1480 train_time:223589ms step_avg:157.01ms step:1435/1480 train_time:223756ms step_avg:157.02ms step:1436/1480 train_time:223920ms step_avg:157.03ms step:1437/1480 train_time:224080ms step_avg:157.03ms 
step:1438/1480 train_time:224243ms step_avg:157.03ms step:1439/1480 train_time:224410ms step_avg:157.04ms step:1440/1480 train_time:224573ms step_avg:157.04ms step:1441/1480 train_time:224737ms step_avg:157.05ms step:1442/1480 train_time:224902ms step_avg:157.05ms step:1443/1480 train_time:225074ms step_avg:157.07ms step:1444/1480 train_time:225238ms step_avg:157.07ms step:1445/1480 train_time:225401ms step_avg:157.07ms step:1446/1480 train_time:225569ms step_avg:157.08ms step:1447/1480 train_time:225737ms step_avg:157.09ms step:1448/1480 train_time:225901ms step_avg:157.09ms step:1449/1480 train_time:226063ms step_avg:157.10ms step:1450/1480 train_time:226229ms step_avg:157.10ms step:1451/1480 train_time:226393ms step_avg:157.11ms step:1452/1480 train_time:226558ms step_avg:157.11ms step:1453/1480 train_time:226720ms step_avg:157.12ms step:1454/1480 train_time:226881ms step_avg:157.12ms step:1455/1480 train_time:227053ms step_avg:157.13ms step:1456/1480 train_time:227218ms step_avg:157.14ms step:1457/1480 train_time:227379ms step_avg:157.14ms step:1458/1480 train_time:227543ms step_avg:157.14ms step:1459/1480 train_time:227709ms step_avg:157.15ms step:1460/1480 train_time:227873ms step_avg:157.15ms step:1461/1480 train_time:228037ms step_avg:157.16ms step:1462/1480 train_time:228198ms step_avg:157.16ms step:1463/1480 train_time:228363ms step_avg:157.17ms step:1464/1480 train_time:228527ms step_avg:157.17ms step:1465/1480 train_time:228692ms step_avg:157.18ms step:1466/1480 train_time:228855ms step_avg:157.18ms step:1467/1480 train_time:229020ms step_avg:157.19ms step:1468/1480 train_time:229182ms step_avg:157.19ms step:1469/1480 train_time:229346ms step_avg:157.19ms step:1470/1480 train_time:229517ms step_avg:157.20ms step:1471/1480 train_time:229688ms step_avg:157.21ms step:1472/1480 train_time:229859ms step_avg:157.22ms step:1473/1480 train_time:230021ms step_avg:157.23ms step:1474/1480 train_time:230189ms step_avg:157.23ms step:1475/1480 train_time:230358ms 
step_avg:157.24ms step:1476/1480 train_time:230520ms step_avg:157.24ms step:1477/1480 train_time:230687ms step_avg:157.25ms step:1478/1480 train_time:230859ms step_avg:157.26ms step:1479/1480 train_time:231022ms step_avg:157.27ms step:1480/1480 train_time:231185ms step_avg:157.27ms step:1480/1480 val_loss:3.2798 train_time:231263ms step_avg:157.32ms peak memory consumption: 34239 MiB