import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: A 2D tensor (asserted).
        steps: Number of quintic iterations to run.
        eps: Added to the norm before dividing, to avoid division by zero.

    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # iterate on the wide orientation so that X @ X.T is the smaller Gram matrix
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose so the result matches G's orientation
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Distribution notes:
    - WORLD_SIZE and RANK are read from the environment at construction time.
    - Parameters are grouped by element count; every group must hold a multiple of WORLD_SIZE
    parameters (asserted in `step`), because each rank orthogonalizes one parameter per round and
    the results are exchanged with an all-gather.

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # one param group per distinct element count, so that a single set of flat
        # bf16 buffers (one per rank) can receive the all-gathered updates of that group
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """Apply one Muon update to every parameter, sharing the work across ranks.

        Within each group the parameters are processed `world_size` at a time: this rank
        computes the orthogonalized momentum update for the parameter at its own offset,
        an asynchronous all-gather collects the updates of all ranks, and the gathered
        updates are applied one round later (overlapping communication with the next
        round's computation).
        """
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None

            def update_prev():
                # apply the updates gathered in the previous round (no-op on the first round)
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # scale the step by sqrt(rows / cols) for tall matrices
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )

            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                # note: the Nesterov branch overwrites p.grad in place
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # the previous gather must be consumed before its buffers are reused
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            # flush the last round
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize `x` over its last dimension (no learnable scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding applied along dim 1 of a 4D input.

    The cos/sin tables are cached and recomputed only when the sequence length changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention via FlexAttention, with QK-norm, rotary embeddings and a
    learned mix between the layer's own values and an external value embedding."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # mixing weights for (own values, value embedding)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v) # @KoszarskyB & @Grad62304977
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, enable_gqa=True)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of the residual stream with the initial
    embedding `x0`, followed by pre-norm attention and pre-norm MLP residual updates."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # mixing weights for (current stream, initial embedding); starts as identity
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    """Six token-embedding tables whose outputs are mirrored to yield twelve value
    embeddings (tables 0..5 followed by tables 5..0)."""

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        ve = [emb(inputs) for emb in self.embed]
        # append the same tensors in reverse order (U-net style sharing)
        ve += reversed(ve)
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT-2 style model with a U-net layout: the first half of the blocks ("encoder")
    store their outputs, which the second half ("decoder") add back through learned
    skip weights. `forward` returns the training loss directly."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """Compute the cross-entropy loss of `targets` given `inputs`.

        Arguments:
            inputs: 1D tensor of token ids whose length is a multiple of 128.
            targets: Token ids to predict; flattened before the loss.
            sliding_window_num_blocks: Scalar tensor; attention is restricted to this
                many 128-token blocks behind each query block.

        Returns:
            The mean cross-entropy loss as a float32 scalar tensor.
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # The block grid is derived from the input instead of being hard-coded to 512,
        # so it always agrees with the per-block document bounds computed below.
        assert inputs.size(0) % BLOCK_SIZE == 0, "sequence length must be a multiple of BLOCK_SIZE"
        total_blocks = inputs.size(0) // BLOCK_SIZE
        # running document id per token: increments at every token equal to 50256
        docs = (inputs == 50256).cumsum(0)
        # document id of the first / last token of every block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal and within the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # convert a dense boolean block mask into (count, indices-with-set-blocks-first)
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            kv_idx = block_idx = torch.arange(total_blocks, dtype=torch.int32, device=inputs.device)
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            # partial blocks (need mask_mod) are the non-empty blocks that are not full
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    """Validate the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Read `num_tokens` uint16 tokens from `path` into a pinned-memory tensor."""
    with path.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4) # skip the 256 x int32 header
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Iterates over token shards, handing each process its own `seq_len`-token slice
    of every global batch of `seq_len * num_processes` tokens."""

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard], self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return this process's next (inputs, targets) pair as CUDA tensors.

        `targets` is `inputs` shifted by one token; `inputs` is int32 and `targets` int64.
        """
        batch_size = self.seq_len * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.seq_len+1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    logdir = Path("logs") / f"{run_id}"
    # parents=True: also create "logs/" itself on a fresh checkout
    logdir.mkdir(parents=True, exist_ok=True)
    logfile = Path("logs") / f"{run_id}.txt"
    print(logfile.stem)
    # create the log file
    with logfile.open("w") as f:
        # begin the log by printing this file (the Python code)
        print(code, file=f)
        print("=" * 100, file=f)
def print0(s, logonly=False):
    """Append `s` to the log file on the master process; also echo it to stdout unless `logonly`."""
    if master_process:
        with logfile.open("a") as f:
            if not logonly:
                print(s)
            print(s, file=f)
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running python {sys.version}")
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# calculate the number of steps to take in the val loop.
assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0
val_steps = args.val_tokens // (args.sequence_length * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
inputs_train, targets_train = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768))
model = model.cuda().bfloat16()
# keep the CastedLinear weights in float32; they are cast to the activation dtype on use
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()]
optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.blocks.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the LR multiplier for iteration `it`: linear warmup, constant, then linear cooldown to 0."""
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    #    (also covers cooldown_iters == 0, which would otherwise divide by zero
    #    when the scheduler is stepped to it == num_iterations)
    elif args.cooldown_iters == 0 or it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# the window size lives in a device tensor so that changing it does not change the compiled graph's inputs
sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda")
sw_num_blocks_prev = 1
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social
    frac_done = step / args.num_iterations # training progress
    sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128)
    if sw_num_blocks != sw_num_blocks_prev:
        sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True)
        sw_num_blocks_prev = sw_num_blocks

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                inputs_val, targets_val = val_loader.next_batch()
                val_loss += model(inputs_val, targets_val, sliding_window_num_blocks)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for i in range(1, train_accumulation_steps + 1):
        with contextlib.ExitStack() as stack:
            if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step
                stack.enter_context(model.no_sync())
            if step >= 5:
                stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True))
            model(inputs_train, targets_train, sliding_window_num_blocks).backward()
            inputs_train, targets_train = train_loader.next_batch()
    # gradients were summed over the accumulation steps; turn them into a mean
    if train_accumulation_steps != 1:
        for p in model.parameters():
            p.grad /= train_accumulation_steps
    # momentum warmup for Muon
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
====================================================================================================
Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
nvidia-smi:
Wed Dec 11 09:40:50 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M.
| |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 126W / 700W | 7084MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 116W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 29C P0 112W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 37C P0 114W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 36C P0 120W / 700W | 3451MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 3211MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | 
Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1000000000 across 10 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1480 train_time:28829ms step_avg:nanms step:2/1480 train_time:28934ms step_avg:nanms step:3/1480 train_time:29057ms step_avg:nanms step:4/1480 train_time:29197ms step_avg:nanms step:5/1480 train_time:29339ms step_avg:nanms step:6/1480 train_time:29482ms step_avg:nanms step:7/1480 train_time:29623ms step_avg:nanms step:8/1480 train_time:29770ms step_avg:nanms step:9/1480 train_time:29908ms step_avg:nanms step:10/1480 train_time:30052ms step_avg:nanms step:11/1480 train_time:144ms step_avg:nanms step:12/1480 train_time:283ms step_avg:nanms step:13/1480 train_time:425ms step_avg:141.82ms step:14/1480 train_time:569ms step_avg:142.14ms step:15/1480 train_time:709ms step_avg:141.79ms step:16/1480 train_time:852ms step_avg:141.98ms step:17/1480 train_time:993ms step_avg:141.82ms step:18/1480 train_time:1136ms step_avg:141.97ms step:19/1480 train_time:1279ms step_avg:142.10ms step:20/1480 train_time:1423ms step_avg:142.29ms step:21/1480 train_time:1568ms step_avg:142.55ms step:22/1480 train_time:1709ms step_avg:142.45ms step:23/1480 train_time:1851ms step_avg:142.38ms step:24/1480 train_time:1994ms step_avg:142.40ms step:25/1480 train_time:2135ms step_avg:142.32ms step:26/1480 train_time:2277ms step_avg:142.29ms step:27/1480 train_time:2419ms step_avg:142.27ms step:28/1480 train_time:2563ms step_avg:142.41ms 
step:29/1480 train_time:2706ms step_avg:142.44ms step:30/1480 train_time:2851ms step_avg:142.53ms step:31/1480 train_time:2992ms step_avg:142.46ms step:32/1480 train_time:3134ms step_avg:142.45ms step:33/1480 train_time:3276ms step_avg:142.44ms step:34/1480 train_time:3419ms step_avg:142.47ms step:35/1480 train_time:3564ms step_avg:142.57ms step:36/1480 train_time:3708ms step_avg:142.60ms step:37/1480 train_time:3851ms step_avg:142.62ms step:38/1480 train_time:3992ms step_avg:142.58ms step:39/1480 train_time:4132ms step_avg:142.50ms step:40/1480 train_time:4274ms step_avg:142.47ms step:41/1480 train_time:4417ms step_avg:142.49ms step:42/1480 train_time:4561ms step_avg:142.52ms step:43/1480 train_time:4704ms step_avg:142.55ms step:44/1480 train_time:4846ms step_avg:142.54ms step:45/1480 train_time:4989ms step_avg:142.55ms step:46/1480 train_time:5132ms step_avg:142.55ms step:47/1480 train_time:5273ms step_avg:142.52ms step:48/1480 train_time:5415ms step_avg:142.49ms step:49/1480 train_time:5558ms step_avg:142.52ms step:50/1480 train_time:5701ms step_avg:142.54ms step:51/1480 train_time:5844ms step_avg:142.53ms step:52/1480 train_time:5987ms step_avg:142.55ms step:53/1480 train_time:6129ms step_avg:142.53ms step:54/1480 train_time:6270ms step_avg:142.50ms step:55/1480 train_time:6412ms step_avg:142.48ms step:56/1480 train_time:6555ms step_avg:142.50ms step:57/1480 train_time:6696ms step_avg:142.47ms step:58/1480 train_time:6840ms step_avg:142.49ms step:59/1480 train_time:6984ms step_avg:142.53ms step:60/1480 train_time:7127ms step_avg:142.54ms step:61/1480 train_time:7270ms step_avg:142.54ms step:62/1480 train_time:7412ms step_avg:142.53ms step:63/1480 train_time:7554ms step_avg:142.52ms step:64/1480 train_time:7695ms step_avg:142.50ms step:65/1480 train_time:7840ms step_avg:142.55ms step:66/1480 train_time:7986ms step_avg:142.60ms step:67/1480 train_time:8128ms step_avg:142.60ms step:68/1480 train_time:8649ms step_avg:149.11ms step:69/1480 train_time:8749ms 
step_avg:148.29ms step:70/1480 train_time:8891ms step_avg:148.19ms step:71/1480 train_time:9032ms step_avg:148.06ms step:72/1480 train_time:9173ms step_avg:147.96ms step:73/1480 train_time:9315ms step_avg:147.85ms step:74/1480 train_time:9457ms step_avg:147.76ms step:75/1480 train_time:9599ms step_avg:147.68ms step:76/1480 train_time:9745ms step_avg:147.66ms step:77/1480 train_time:9889ms step_avg:147.60ms step:78/1480 train_time:10030ms step_avg:147.50ms step:79/1480 train_time:10560ms step_avg:153.05ms step:80/1480 train_time:10661ms step_avg:152.30ms step:81/1480 train_time:10803ms step_avg:152.16ms step:82/1480 train_time:10946ms step_avg:152.02ms step:83/1480 train_time:11088ms step_avg:151.89ms step:84/1480 train_time:11229ms step_avg:151.74ms step:85/1480 train_time:11372ms step_avg:151.62ms step:86/1480 train_time:11514ms step_avg:151.50ms step:87/1480 train_time:11655ms step_avg:151.37ms step:88/1480 train_time:11800ms step_avg:151.29ms step:89/1480 train_time:11944ms step_avg:151.19ms step:90/1480 train_time:12086ms step_avg:151.08ms step:91/1480 train_time:12229ms step_avg:150.98ms step:92/1480 train_time:12372ms step_avg:150.88ms step:93/1480 train_time:12515ms step_avg:150.78ms step:94/1480 train_time:12659ms step_avg:150.70ms step:95/1480 train_time:12801ms step_avg:150.60ms step:96/1480 train_time:13327ms step_avg:154.96ms step:97/1480 train_time:13848ms step_avg:159.17ms step:98/1480 train_time:13948ms step_avg:158.50ms step:99/1480 train_time:14090ms step_avg:158.32ms step:100/1480 train_time:14231ms step_avg:158.12ms step:101/1480 train_time:14377ms step_avg:157.98ms step:102/1480 train_time:14515ms step_avg:157.77ms step:103/1480 train_time:14657ms step_avg:157.60ms step:104/1480 train_time:14799ms step_avg:157.44ms step:105/1480 train_time:14943ms step_avg:157.30ms step:106/1480 train_time:15088ms step_avg:157.16ms step:107/1480 train_time:15230ms step_avg:157.01ms step:108/1480 train_time:15373ms step_avg:156.86ms step:109/1480 
train_time:15515ms step_avg:156.71ms step:110/1480 train_time:15657ms step_avg:156.57ms step:111/1480 train_time:15801ms step_avg:156.44ms step:112/1480 train_time:15948ms step_avg:156.35ms step:113/1480 train_time:16093ms step_avg:156.24ms step:114/1480 train_time:16238ms step_avg:156.13ms step:115/1480 train_time:16383ms step_avg:156.03ms step:116/1480 train_time:16528ms step_avg:155.93ms step:117/1480 train_time:16673ms step_avg:155.83ms step:118/1480 train_time:16819ms step_avg:155.73ms step:119/1480 train_time:16966ms step_avg:155.66ms step:120/1480 train_time:17111ms step_avg:155.55ms step:121/1480 train_time:17256ms step_avg:155.46ms step:122/1480 train_time:17402ms step_avg:155.38ms step:123/1480 train_time:17548ms step_avg:155.29ms step:124/1480 train_time:17693ms step_avg:155.20ms step:125/1480 train_time:17838ms step_avg:155.11ms step:125/1480 val_loss:4.4128 train_time:17903ms step_avg:155.68ms step:126/1480 train_time:17995ms step_avg:155.13ms step:127/1480 train_time:18141ms step_avg:155.05ms step:128/1480 train_time:18287ms step_avg:154.97ms step:129/1480 train_time:18432ms step_avg:154.89ms step:130/1480 train_time:18576ms step_avg:154.80ms step:131/1480 train_time:18722ms step_avg:154.73ms step:132/1480 train_time:18867ms step_avg:154.64ms step:133/1480 train_time:19012ms step_avg:154.57ms step:134/1480 train_time:19158ms step_avg:154.50ms step:135/1480 train_time:19307ms step_avg:154.46ms step:136/1480 train_time:19451ms step_avg:154.38ms step:137/1480 train_time:19596ms step_avg:154.30ms step:138/1480 train_time:19742ms step_avg:154.23ms step:139/1480 train_time:19887ms step_avg:154.16ms step:140/1480 train_time:20031ms step_avg:154.08ms step:141/1480 train_time:20177ms step_avg:154.02ms step:142/1480 train_time:20325ms step_avg:153.97ms step:143/1480 train_time:20470ms step_avg:153.91ms step:144/1480 train_time:20615ms step_avg:153.84ms step:145/1480 train_time:20761ms step_avg:153.79ms step:146/1480 train_time:20908ms step_avg:153.73ms 
step:147/1480 train_time:21051ms step_avg:153.66ms step:148/1480 train_time:21197ms step_avg:153.60ms step:149/1480 train_time:21345ms step_avg:153.56ms step:150/1480 train_time:21490ms step_avg:153.50ms step:151/1480 train_time:21635ms step_avg:153.44ms step:152/1480 train_time:21782ms step_avg:153.40ms step:153/1480 train_time:21928ms step_avg:153.35ms step:154/1480 train_time:22072ms step_avg:153.28ms step:155/1480 train_time:22218ms step_avg:153.23ms step:156/1480 train_time:22364ms step_avg:153.18ms step:157/1480 train_time:22510ms step_avg:153.13ms step:158/1480 train_time:22654ms step_avg:153.07ms step:159/1480 train_time:22801ms step_avg:153.03ms step:160/1480 train_time:22947ms step_avg:152.98ms step:161/1480 train_time:23091ms step_avg:152.92ms step:162/1480 train_time:23236ms step_avg:152.87ms step:163/1480 train_time:23382ms step_avg:152.83ms step:164/1480 train_time:23529ms step_avg:152.79ms step:165/1480 train_time:23673ms step_avg:152.73ms step:166/1480 train_time:23820ms step_avg:152.69ms step:167/1480 train_time:23966ms step_avg:152.65ms step:168/1480 train_time:24111ms step_avg:152.60ms step:169/1480 train_time:24256ms step_avg:152.55ms step:170/1480 train_time:24402ms step_avg:152.51ms step:171/1480 train_time:24548ms step_avg:152.47ms step:172/1480 train_time:24693ms step_avg:152.43ms step:173/1480 train_time:24839ms step_avg:152.39ms step:174/1480 train_time:24985ms step_avg:152.35ms step:175/1480 train_time:25130ms step_avg:152.30ms step:176/1480 train_time:25276ms step_avg:152.26ms step:177/1480 train_time:25425ms step_avg:152.24ms step:178/1480 train_time:25569ms step_avg:152.20ms step:179/1480 train_time:25715ms step_avg:152.16ms step:180/1480 train_time:25860ms step_avg:152.12ms step:181/1480 train_time:26007ms step_avg:152.09ms step:182/1480 train_time:26151ms step_avg:152.04ms step:183/1480 train_time:26296ms step_avg:152.00ms step:184/1480 train_time:26442ms step_avg:151.97ms step:185/1480 train_time:26589ms step_avg:151.94ms 
step:186/1480 train_time:26733ms step_avg:151.89ms step:187/1480 train_time:26879ms step_avg:151.86ms step:188/1480 train_time:27026ms step_avg:151.83ms step:189/1480 train_time:27191ms step_avg:151.91ms step:190/1480 train_time:27316ms step_avg:151.75ms step:191/1480 train_time:27462ms step_avg:151.72ms step:192/1480 train_time:27608ms step_avg:151.69ms step:193/1480 train_time:27752ms step_avg:151.65ms step:194/1480 train_time:27898ms step_avg:151.62ms step:195/1480 train_time:28045ms step_avg:151.59ms step:196/1480 train_time:28190ms step_avg:151.56ms step:197/1480 train_time:28334ms step_avg:151.52ms step:198/1480 train_time:28482ms step_avg:151.50ms step:199/1480 train_time:28628ms step_avg:151.47ms step:200/1480 train_time:28773ms step_avg:151.44ms step:201/1480 train_time:28922ms step_avg:151.42ms step:202/1480 train_time:29065ms step_avg:151.38ms step:203/1480 train_time:29210ms step_avg:151.35ms step:204/1480 train_time:29355ms step_avg:151.31ms step:205/1480 train_time:29502ms step_avg:151.29ms step:206/1480 train_time:29649ms step_avg:151.27ms step:207/1480 train_time:29794ms step_avg:151.24ms step:208/1480 train_time:29940ms step_avg:151.21ms step:209/1480 train_time:30086ms step_avg:151.19ms step:210/1480 train_time:30231ms step_avg:151.15ms step:211/1480 train_time:30375ms step_avg:151.12ms step:212/1480 train_time:30522ms step_avg:151.10ms step:213/1480 train_time:30667ms step_avg:151.07ms step:214/1480 train_time:30813ms step_avg:151.04ms step:215/1480 train_time:30959ms step_avg:151.02ms step:216/1480 train_time:31106ms step_avg:151.00ms step:217/1480 train_time:31251ms step_avg:150.97ms step:218/1480 train_time:31396ms step_avg:150.94ms step:219/1480 train_time:31543ms step_avg:150.92ms step:220/1480 train_time:31688ms step_avg:150.90ms step:221/1480 train_time:32224ms step_avg:152.72ms step:222/1480 train_time:32741ms step_avg:154.44ms step:223/1480 train_time:33248ms step_avg:156.10ms step:224/1480 train_time:33358ms step_avg:155.88ms 
step:225/1480 train_time:33507ms step_avg:155.84ms step:226/1480 train_time:33654ms step_avg:155.80ms step:227/1480 train_time:33802ms step_avg:155.77ms step:228/1480 train_time:33950ms step_avg:155.73ms step:229/1480 train_time:34097ms step_avg:155.69ms step:230/1480 train_time:34246ms step_avg:155.66ms step:231/1480 train_time:34395ms step_avg:155.63ms step:232/1480 train_time:34545ms step_avg:155.61ms step:233/1480 train_time:34693ms step_avg:155.57ms step:234/1480 train_time:34841ms step_avg:155.54ms step:235/1480 train_time:34989ms step_avg:155.51ms step:236/1480 train_time:35136ms step_avg:155.47ms step:237/1480 train_time:35285ms step_avg:155.44ms step:238/1480 train_time:35432ms step_avg:155.40ms step:239/1480 train_time:35580ms step_avg:155.37ms step:240/1480 train_time:35729ms step_avg:155.34ms step:241/1480 train_time:35876ms step_avg:155.31ms step:242/1480 train_time:36026ms step_avg:155.28ms step:243/1480 train_time:36172ms step_avg:155.25ms step:244/1480 train_time:36320ms step_avg:155.22ms step:245/1480 train_time:36469ms step_avg:155.19ms step:246/1480 train_time:36618ms step_avg:155.16ms step:247/1480 train_time:36767ms step_avg:155.13ms step:248/1480 train_time:36914ms step_avg:155.10ms step:249/1480 train_time:37062ms step_avg:155.07ms step:250/1480 train_time:37211ms step_avg:155.05ms step:250/1480 val_loss:3.9884 train_time:37277ms step_avg:155.32ms step:251/1480 train_time:37373ms step_avg:155.07ms step:252/1480 train_time:37516ms step_avg:155.02ms step:253/1480 train_time:37664ms step_avg:155.00ms step:254/1480 train_time:37812ms step_avg:154.97ms step:255/1480 train_time:37959ms step_avg:154.93ms step:256/1480 train_time:38109ms step_avg:154.91ms step:257/1480 train_time:38255ms step_avg:154.88ms step:258/1480 train_time:38406ms step_avg:154.86ms step:259/1480 train_time:38554ms step_avg:154.83ms step:260/1480 train_time:38704ms step_avg:154.82ms step:261/1480 train_time:38851ms step_avg:154.79ms step:262/1480 train_time:38999ms 
step_avg:154.76ms step:263/1480 train_time:39148ms step_avg:154.74ms step:264/1480 train_time:39296ms step_avg:154.71ms step:265/1480 train_time:39444ms step_avg:154.68ms step:266/1480 train_time:39592ms step_avg:154.66ms step:267/1480 train_time:39742ms step_avg:154.64ms step:268/1480 train_time:39891ms step_avg:154.62ms step:269/1480 train_time:40040ms step_avg:154.59ms step:270/1480 train_time:40188ms step_avg:154.57ms step:271/1480 train_time:40337ms step_avg:154.55ms step:272/1480 train_time:40485ms step_avg:154.52ms step:273/1480 train_time:40633ms step_avg:154.50ms step:274/1480 train_time:40782ms step_avg:154.48ms step:275/1480 train_time:40931ms step_avg:154.46ms step:276/1480 train_time:41079ms step_avg:154.43ms step:277/1480 train_time:41229ms step_avg:154.41ms step:278/1480 train_time:41376ms step_avg:154.39ms step:279/1480 train_time:41526ms step_avg:154.37ms step:280/1480 train_time:41673ms step_avg:154.34ms step:281/1480 train_time:41821ms step_avg:154.32ms step:282/1480 train_time:41970ms step_avg:154.30ms step:283/1480 train_time:42118ms step_avg:154.28ms step:284/1480 train_time:42267ms step_avg:154.26ms step:285/1480 train_time:42414ms step_avg:154.23ms step:286/1480 train_time:42563ms step_avg:154.21ms step:287/1480 train_time:42711ms step_avg:154.19ms step:288/1480 train_time:42858ms step_avg:154.17ms step:289/1480 train_time:43008ms step_avg:154.15ms step:290/1480 train_time:43155ms step_avg:154.12ms step:291/1480 train_time:43305ms step_avg:154.11ms step:292/1480 train_time:43452ms step_avg:154.09ms step:293/1480 train_time:43602ms step_avg:154.07ms step:294/1480 train_time:43751ms step_avg:154.05ms step:295/1480 train_time:43900ms step_avg:154.04ms step:296/1480 train_time:44049ms step_avg:154.02ms step:297/1480 train_time:44196ms step_avg:153.99ms step:298/1480 train_time:44345ms step_avg:153.97ms step:299/1480 train_time:44492ms step_avg:153.95ms step:300/1480 train_time:44642ms step_avg:153.94ms step:301/1480 train_time:44791ms 
step_avg:153.92ms step:302/1480 train_time:44939ms step_avg:153.90ms step:303/1480 train_time:45087ms step_avg:153.88ms step:304/1480 train_time:45236ms step_avg:153.86ms step:305/1480 train_time:45385ms step_avg:153.85ms step:306/1480 train_time:45533ms step_avg:153.83ms step:307/1480 train_time:45681ms step_avg:153.81ms step:308/1480 train_time:45830ms step_avg:153.79ms step:309/1480 train_time:45979ms step_avg:153.77ms step:310/1480 train_time:46128ms step_avg:153.76ms step:311/1480 train_time:46276ms step_avg:153.74ms step:312/1480 train_time:46425ms step_avg:153.73ms step:313/1480 train_time:46573ms step_avg:153.71ms step:314/1480 train_time:46722ms step_avg:153.69ms step:315/1480 train_time:46871ms step_avg:153.67ms step:316/1480 train_time:47019ms step_avg:153.66ms step:317/1480 train_time:47168ms step_avg:153.64ms step:318/1480 train_time:47316ms step_avg:153.62ms step:319/1480 train_time:47464ms step_avg:153.61ms step:320/1480 train_time:47612ms step_avg:153.59ms step:321/1480 train_time:47760ms step_avg:153.57ms step:322/1480 train_time:47908ms step_avg:153.55ms step:323/1480 train_time:48056ms step_avg:153.53ms step:324/1480 train_time:48206ms step_avg:153.52ms step:325/1480 train_time:48353ms step_avg:153.50ms step:326/1480 train_time:48501ms step_avg:153.49ms step:327/1480 train_time:48650ms step_avg:153.47ms step:328/1480 train_time:48798ms step_avg:153.45ms step:329/1480 train_time:48947ms step_avg:153.44ms step:330/1480 train_time:49096ms step_avg:153.42ms step:331/1480 train_time:49249ms step_avg:153.42ms step:332/1480 train_time:49398ms step_avg:153.41ms step:333/1480 train_time:49551ms step_avg:153.41ms step:334/1480 train_time:49701ms step_avg:153.40ms step:335/1480 train_time:49852ms step_avg:153.39ms step:336/1480 train_time:50002ms step_avg:153.38ms step:337/1480 train_time:50153ms step_avg:153.37ms step:338/1480 train_time:50304ms step_avg:153.36ms step:339/1480 train_time:50454ms step_avg:153.35ms step:340/1480 train_time:50607ms 
step_avg:153.35ms step:341/1480 train_time:50756ms step_avg:153.34ms step:342/1480 train_time:50907ms step_avg:153.33ms step:343/1480 train_time:51057ms step_avg:153.32ms step:344/1480 train_time:51209ms step_avg:153.32ms step:345/1480 train_time:51358ms step_avg:153.31ms step:346/1480 train_time:51510ms step_avg:153.30ms step:347/1480 train_time:51661ms step_avg:153.30ms step:348/1480 train_time:51812ms step_avg:153.29ms step:349/1480 train_time:51962ms step_avg:153.28ms step:350/1480 train_time:52113ms step_avg:153.27ms step:351/1480 train_time:52264ms step_avg:153.27ms step:352/1480 train_time:52414ms step_avg:153.26ms step:353/1480 train_time:52567ms step_avg:153.26ms step:354/1480 train_time:52717ms step_avg:153.25ms step:355/1480 train_time:52870ms step_avg:153.25ms step:356/1480 train_time:53020ms step_avg:153.24ms step:357/1480 train_time:53170ms step_avg:153.23ms step:358/1480 train_time:53321ms step_avg:153.22ms step:359/1480 train_time:53472ms step_avg:153.22ms step:360/1480 train_time:53625ms step_avg:153.21ms step:361/1480 train_time:53776ms step_avg:153.21ms step:362/1480 train_time:53928ms step_avg:153.20ms step:363/1480 train_time:54078ms step_avg:153.19ms step:364/1480 train_time:54229ms step_avg:153.19ms step:365/1480 train_time:54379ms step_avg:153.18ms step:366/1480 train_time:54530ms step_avg:153.17ms step:367/1480 train_time:54680ms step_avg:153.16ms step:368/1480 train_time:54831ms step_avg:153.16ms step:369/1480 train_time:54982ms step_avg:153.15ms step:370/1480 train_time:55132ms step_avg:153.15ms step:371/1480 train_time:55283ms step_avg:153.14ms step:372/1480 train_time:55435ms step_avg:153.13ms step:373/1480 train_time:55585ms step_avg:153.13ms step:374/1480 train_time:55735ms step_avg:153.12ms step:375/1480 train_time:55886ms step_avg:153.11ms step:375/1480 val_loss:3.8045 train_time:55956ms step_avg:153.30ms step:376/1480 train_time:56051ms step_avg:153.14ms step:377/1480 train_time:56197ms step_avg:153.13ms step:378/1480 
train_time:56349ms step_avg:153.12ms step:379/1480 train_time:56518ms step_avg:153.16ms step:380/1480 train_time:56650ms step_avg:153.11ms step:381/1480 train_time:56801ms step_avg:153.10ms step:382/1480 train_time:56952ms step_avg:153.10ms step:383/1480 train_time:57104ms step_avg:153.09ms step:384/1480 train_time:57256ms step_avg:153.09ms step:385/1480 train_time:57406ms step_avg:153.08ms step:386/1480 train_time:57557ms step_avg:153.08ms step:387/1480 train_time:57706ms step_avg:153.07ms step:388/1480 train_time:57857ms step_avg:153.06ms step:389/1480 train_time:58008ms step_avg:153.06ms step:390/1480 train_time:58160ms step_avg:153.05ms step:391/1480 train_time:58311ms step_avg:153.05ms step:392/1480 train_time:58461ms step_avg:153.04ms step:393/1480 train_time:58613ms step_avg:153.04ms step:394/1480 train_time:58763ms step_avg:153.03ms step:395/1480 train_time:58914ms step_avg:153.02ms step:396/1480 train_time:59063ms step_avg:153.01ms step:397/1480 train_time:59215ms step_avg:153.01ms step:398/1480 train_time:59365ms step_avg:153.00ms step:399/1480 train_time:59516ms step_avg:153.00ms step:400/1480 train_time:59667ms step_avg:152.99ms step:401/1480 train_time:59818ms step_avg:152.99ms step:402/1480 train_time:59970ms step_avg:152.98ms step:403/1480 train_time:60119ms step_avg:152.98ms step:404/1480 train_time:60270ms step_avg:152.97ms step:405/1480 train_time:60420ms step_avg:152.96ms step:406/1480 train_time:60572ms step_avg:152.96ms step:407/1480 train_time:60723ms step_avg:152.95ms step:408/1480 train_time:60875ms step_avg:152.95ms step:409/1480 train_time:61024ms step_avg:152.94ms step:410/1480 train_time:61176ms step_avg:152.94ms step:411/1480 train_time:61326ms step_avg:152.93ms step:412/1480 train_time:61478ms step_avg:152.93ms step:413/1480 train_time:61630ms step_avg:152.93ms step:414/1480 train_time:61780ms step_avg:152.92ms step:415/1480 train_time:61932ms step_avg:152.92ms step:416/1480 train_time:62081ms step_avg:152.91ms step:417/1480 
train_time:62233ms step_avg:152.91ms step:418/1480 train_time:62383ms step_avg:152.90ms step:419/1480 train_time:62534ms step_avg:152.89ms step:420/1480 train_time:62684ms step_avg:152.89ms step:421/1480 train_time:62835ms step_avg:152.88ms step:422/1480 train_time:62985ms step_avg:152.88ms step:423/1480 train_time:63136ms step_avg:152.87ms step:424/1480 train_time:63286ms step_avg:152.87ms step:425/1480 train_time:63437ms step_avg:152.86ms step:426/1480 train_time:63587ms step_avg:152.85ms step:427/1480 train_time:63738ms step_avg:152.85ms step:428/1480 train_time:63889ms step_avg:152.85ms step:429/1480 train_time:64040ms step_avg:152.84ms step:430/1480 train_time:64191ms step_avg:152.84ms step:431/1480 train_time:64341ms step_avg:152.83ms step:432/1480 train_time:64492ms step_avg:152.82ms step:433/1480 train_time:64643ms step_avg:152.82ms step:434/1480 train_time:64794ms step_avg:152.82ms step:435/1480 train_time:64946ms step_avg:152.81ms step:436/1480 train_time:65096ms step_avg:152.81ms step:437/1480 train_time:65247ms step_avg:152.80ms step:438/1480 train_time:65398ms step_avg:152.80ms step:439/1480 train_time:65549ms step_avg:152.80ms step:440/1480 train_time:65701ms step_avg:152.79ms step:441/1480 train_time:65855ms step_avg:152.79ms step:442/1480 train_time:66007ms step_avg:152.79ms step:443/1480 train_time:66160ms step_avg:152.79ms step:444/1480 train_time:66313ms step_avg:152.79ms step:445/1480 train_time:66464ms step_avg:152.79ms step:446/1480 train_time:66618ms step_avg:152.79ms step:447/1480 train_time:66770ms step_avg:152.79ms step:448/1480 train_time:66922ms step_avg:152.79ms step:449/1480 train_time:67075ms step_avg:152.79ms step:450/1480 train_time:67228ms step_avg:152.79ms step:451/1480 train_time:67382ms step_avg:152.79ms step:452/1480 train_time:67535ms step_avg:152.79ms step:453/1480 train_time:67687ms step_avg:152.79ms step:454/1480 train_time:67840ms step_avg:152.79ms step:455/1480 train_time:67992ms step_avg:152.79ms step:456/1480 
train_time:68145ms step_avg:152.79ms step:457/1480 train_time:68297ms step_avg:152.79ms step:458/1480 train_time:68451ms step_avg:152.79ms step:459/1480 train_time:68605ms step_avg:152.79ms step:460/1480 train_time:68757ms step_avg:152.79ms step:461/1480 train_time:68910ms step_avg:152.79ms step:462/1480 train_time:69062ms step_avg:152.79ms step:463/1480 train_time:69216ms step_avg:152.79ms step:464/1480 train_time:69368ms step_avg:152.79ms step:465/1480 train_time:69521ms step_avg:152.79ms step:466/1480 train_time:69675ms step_avg:152.80ms step:467/1480 train_time:69829ms step_avg:152.80ms step:468/1480 train_time:69982ms step_avg:152.80ms step:469/1480 train_time:70135ms step_avg:152.80ms step:470/1480 train_time:70287ms step_avg:152.80ms step:471/1480 train_time:70439ms step_avg:152.80ms step:472/1480 train_time:70591ms step_avg:152.79ms step:473/1480 train_time:70743ms step_avg:152.79ms step:474/1480 train_time:70896ms step_avg:152.79ms step:475/1480 train_time:71050ms step_avg:152.79ms step:476/1480 train_time:71203ms step_avg:152.80ms step:477/1480 train_time:71356ms step_avg:152.80ms step:478/1480 train_time:71508ms step_avg:152.80ms step:479/1480 train_time:71660ms step_avg:152.79ms step:480/1480 train_time:71814ms step_avg:152.80ms step:481/1480 train_time:71966ms step_avg:152.79ms step:482/1480 train_time:72119ms step_avg:152.79ms step:483/1480 train_time:72272ms step_avg:152.79ms step:484/1480 train_time:72425ms step_avg:152.80ms step:485/1480 train_time:72578ms step_avg:152.80ms step:486/1480 train_time:72732ms step_avg:152.80ms step:487/1480 train_time:72885ms step_avg:152.80ms step:488/1480 train_time:73038ms step_avg:152.80ms step:489/1480 train_time:73190ms step_avg:152.80ms step:490/1480 train_time:73343ms step_avg:152.80ms step:491/1480 train_time:73495ms step_avg:152.80ms step:492/1480 train_time:73648ms step_avg:152.80ms step:493/1480 train_time:73801ms step_avg:152.80ms step:494/1480 train_time:73955ms step_avg:152.80ms step:495/1480 
train_time:74108ms step_avg:152.80ms step:496/1480 train_time:74261ms step_avg:152.80ms step:497/1480 train_time:74414ms step_avg:152.80ms step:498/1480 train_time:74565ms step_avg:152.80ms step:499/1480 train_time:74718ms step_avg:152.80ms step:500/1480 train_time:74872ms step_avg:152.80ms step:500/1480 val_loss:3.6827 train_time:74941ms step_avg:152.94ms step:501/1480 train_time:75036ms step_avg:152.82ms step:502/1480 train_time:75182ms step_avg:152.81ms step:503/1480 train_time:75335ms step_avg:152.81ms step:504/1480 train_time:75487ms step_avg:152.81ms step:505/1480 train_time:75640ms step_avg:152.81ms step:506/1480 train_time:75792ms step_avg:152.81ms step:507/1480 train_time:75944ms step_avg:152.81ms step:508/1480 train_time:76098ms step_avg:152.81ms step:509/1480 train_time:76252ms step_avg:152.81ms step:510/1480 train_time:76405ms step_avg:152.81ms step:511/1480 train_time:76558ms step_avg:152.81ms step:512/1480 train_time:76712ms step_avg:152.81ms step:513/1480 train_time:76864ms step_avg:152.81ms step:514/1480 train_time:77017ms step_avg:152.81ms step:515/1480 train_time:77170ms step_avg:152.81ms step:516/1480 train_time:77323ms step_avg:152.81ms step:517/1480 train_time:77476ms step_avg:152.81ms step:518/1480 train_time:77629ms step_avg:152.81ms step:519/1480 train_time:77782ms step_avg:152.81ms step:520/1480 train_time:77934ms step_avg:152.81ms step:521/1480 train_time:78087ms step_avg:152.81ms step:522/1480 train_time:78242ms step_avg:152.82ms step:523/1480 train_time:78394ms step_avg:152.82ms step:524/1480 train_time:78549ms step_avg:152.82ms step:525/1480 train_time:78700ms step_avg:152.82ms step:526/1480 train_time:78853ms step_avg:152.82ms step:527/1480 train_time:79005ms step_avg:152.81ms step:528/1480 train_time:79157ms step_avg:152.81ms step:529/1480 train_time:79310ms step_avg:152.81ms step:530/1480 train_time:79463ms step_avg:152.81ms step:531/1480 train_time:79616ms step_avg:152.81ms step:532/1480 train_time:79770ms step_avg:152.82ms 
step:533/1480 train_time:79922ms step_avg:152.82ms step:534/1480 train_time:80075ms step_avg:152.82ms step:535/1480 train_time:80229ms step_avg:152.82ms step:536/1480 train_time:80381ms step_avg:152.82ms step:537/1480 train_time:80534ms step_avg:152.82ms step:538/1480 train_time:80686ms step_avg:152.81ms step:539/1480 train_time:80841ms step_avg:152.82ms step:540/1480 train_time:80995ms step_avg:152.82ms step:541/1480 train_time:81148ms step_avg:152.82ms step:542/1480 train_time:81301ms step_avg:152.82ms step:543/1480 train_time:81453ms step_avg:152.82ms step:544/1480 train_time:81606ms step_avg:152.82ms step:545/1480 train_time:81758ms step_avg:152.82ms step:546/1480 train_time:81911ms step_avg:152.82ms step:547/1480 train_time:82064ms step_avg:152.82ms step:548/1480 train_time:82216ms step_avg:152.82ms step:549/1480 train_time:82370ms step_avg:152.82ms step:550/1480 train_time:82524ms step_avg:152.82ms step:551/1480 train_time:82679ms step_avg:152.83ms step:552/1480 train_time:82834ms step_avg:152.83ms step:553/1480 train_time:82989ms step_avg:152.83ms step:554/1480 train_time:83143ms step_avg:152.84ms step:555/1480 train_time:83298ms step_avg:152.84ms step:556/1480 train_time:83452ms step_avg:152.84ms step:557/1480 train_time:83607ms step_avg:152.85ms step:558/1480 train_time:83761ms step_avg:152.85ms step:559/1480 train_time:83917ms step_avg:152.85ms step:560/1480 train_time:84072ms step_avg:152.86ms step:561/1480 train_time:84227ms step_avg:152.86ms step:562/1480 train_time:84381ms step_avg:152.86ms step:563/1480 train_time:84535ms step_avg:152.87ms step:564/1480 train_time:84689ms step_avg:152.87ms step:565/1480 train_time:84845ms step_avg:152.87ms step:566/1480 train_time:85001ms step_avg:152.88ms step:567/1480 train_time:85156ms step_avg:152.88ms step:568/1480 train_time:85310ms step_avg:152.89ms step:569/1480 train_time:85479ms step_avg:152.91ms step:570/1480 train_time:85619ms step_avg:152.89ms step:571/1480 train_time:85774ms step_avg:152.90ms 
step:572/1480 train_time:85930ms step_avg:152.90ms step:573/1480 train_time:86084ms step_avg:152.90ms step:574/1480 train_time:86240ms step_avg:152.91ms step:575/1480 train_time:86395ms step_avg:152.91ms step:576/1480 train_time:86549ms step_avg:152.91ms step:577/1480 train_time:86704ms step_avg:152.92ms step:578/1480 train_time:86857ms step_avg:152.92ms step:579/1480 train_time:87011ms step_avg:152.92ms step:580/1480 train_time:87166ms step_avg:152.92ms step:581/1480 train_time:87321ms step_avg:152.93ms step:582/1480 train_time:87475ms step_avg:152.93ms step:583/1480 train_time:87630ms step_avg:152.93ms step:584/1480 train_time:87784ms step_avg:152.93ms step:585/1480 train_time:87939ms step_avg:152.94ms step:586/1480 train_time:88093ms step_avg:152.94ms step:587/1480 train_time:88248ms step_avg:152.94ms step:588/1480 train_time:88403ms step_avg:152.95ms step:589/1480 train_time:88557ms step_avg:152.95ms step:590/1480 train_time:88711ms step_avg:152.95ms step:591/1480 train_time:88865ms step_avg:152.95ms step:592/1480 train_time:89020ms step_avg:152.95ms step:593/1480 train_time:89175ms step_avg:152.96ms step:594/1480 train_time:89331ms step_avg:152.96ms step:595/1480 train_time:89485ms step_avg:152.97ms step:596/1480 train_time:89641ms step_avg:152.97ms step:597/1480 train_time:89797ms step_avg:152.98ms step:598/1480 train_time:89951ms step_avg:152.98ms step:599/1480 train_time:90105ms step_avg:152.98ms step:600/1480 train_time:90259ms step_avg:152.98ms step:601/1480 train_time:90414ms step_avg:152.98ms step:602/1480 train_time:90569ms step_avg:152.99ms step:603/1480 train_time:90724ms step_avg:152.99ms step:604/1480 train_time:90879ms step_avg:152.99ms step:605/1480 train_time:91034ms step_avg:153.00ms step:606/1480 train_time:91190ms step_avg:153.00ms step:607/1480 train_time:91346ms step_avg:153.01ms step:608/1480 train_time:91501ms step_avg:153.01ms step:609/1480 train_time:91656ms step_avg:153.01ms step:610/1480 train_time:91810ms step_avg:153.02ms 
step:611/1480 train_time:91965ms step_avg:153.02ms step:612/1480 train_time:92120ms step_avg:153.02ms step:613/1480 train_time:92275ms step_avg:153.03ms step:614/1480 train_time:92430ms step_avg:153.03ms step:615/1480 train_time:92584ms step_avg:153.03ms step:616/1480 train_time:92739ms step_avg:153.03ms step:617/1480 train_time:92893ms step_avg:153.04ms step:618/1480 train_time:93048ms step_avg:153.04ms step:619/1480 train_time:93203ms step_avg:153.04ms step:620/1480 train_time:93357ms step_avg:153.04ms step:621/1480 train_time:93512ms step_avg:153.05ms step:622/1480 train_time:93667ms step_avg:153.05ms step:623/1480 train_time:93823ms step_avg:153.06ms step:624/1480 train_time:93978ms step_avg:153.06ms step:625/1480 train_time:94132ms step_avg:153.06ms step:625/1480 val_loss:3.6050 train_time:94202ms step_avg:153.17ms step:626/1480 train_time:94293ms step_avg:153.07ms step:627/1480 train_time:94449ms step_avg:153.08ms step:628/1480 train_time:94603ms step_avg:153.08ms step:629/1480 train_time:94758ms step_avg:153.08ms step:630/1480 train_time:94912ms step_avg:153.08ms step:631/1480 train_time:95066ms step_avg:153.08ms step:632/1480 train_time:95219ms step_avg:153.09ms step:633/1480 train_time:95374ms step_avg:153.09ms step:634/1480 train_time:95530ms step_avg:153.09ms step:635/1480 train_time:95684ms step_avg:153.09ms step:636/1480 train_time:95838ms step_avg:153.10ms step:637/1480 train_time:95994ms step_avg:153.10ms step:638/1480 train_time:96148ms step_avg:153.10ms step:639/1480 train_time:96303ms step_avg:153.10ms step:640/1480 train_time:96458ms step_avg:153.11ms step:641/1480 train_time:96612ms step_avg:153.11ms step:642/1480 train_time:96766ms step_avg:153.11ms step:643/1480 train_time:96921ms step_avg:153.11ms step:644/1480 train_time:97075ms step_avg:153.12ms step:645/1480 train_time:97230ms step_avg:153.12ms step:646/1480 train_time:97385ms step_avg:153.12ms step:647/1480 train_time:97539ms step_avg:153.12ms step:648/1480 train_time:97695ms 
step_avg:153.13ms step:649/1480 train_time:97850ms step_avg:153.13ms step:650/1480 train_time:98005ms step_avg:153.13ms step:651/1480 train_time:98161ms step_avg:153.14ms step:652/1480 train_time:98316ms step_avg:153.14ms step:653/1480 train_time:98469ms step_avg:153.14ms step:654/1480 train_time:98624ms step_avg:153.14ms step:655/1480 train_time:98779ms step_avg:153.15ms step:656/1480 train_time:98934ms step_avg:153.15ms step:657/1480 train_time:99088ms step_avg:153.15ms step:658/1480 train_time:99243ms step_avg:153.15ms step:659/1480 train_time:99398ms step_avg:153.16ms step:660/1480 train_time:99556ms step_avg:153.16ms step:661/1480 train_time:99712ms step_avg:153.17ms step:662/1480 train_time:99868ms step_avg:153.17ms step:663/1480 train_time:100025ms step_avg:153.18ms step:664/1480 train_time:100181ms step_avg:153.18ms step:665/1480 train_time:100337ms step_avg:153.19ms step:666/1480 train_time:100493ms step_avg:153.19ms step:667/1480 train_time:100650ms step_avg:153.20ms step:668/1480 train_time:100806ms step_avg:153.20ms step:669/1480 train_time:100965ms step_avg:153.21ms step:670/1480 train_time:101121ms step_avg:153.21ms step:671/1480 train_time:101277ms step_avg:153.22ms step:672/1480 train_time:101434ms step_avg:153.22ms step:673/1480 train_time:101590ms step_avg:153.23ms step:674/1480 train_time:101745ms step_avg:153.23ms step:675/1480 train_time:101902ms step_avg:153.24ms step:676/1480 train_time:102058ms step_avg:153.24ms step:677/1480 train_time:102215ms step_avg:153.25ms step:678/1480 train_time:102370ms step_avg:153.25ms step:679/1480 train_time:102528ms step_avg:153.26ms step:680/1480 train_time:102685ms step_avg:153.26ms step:681/1480 train_time:102841ms step_avg:153.26ms step:682/1480 train_time:102997ms step_avg:153.27ms step:683/1480 train_time:103154ms step_avg:153.28ms step:684/1480 train_time:103311ms step_avg:153.28ms step:685/1480 train_time:103467ms step_avg:153.28ms step:686/1480 train_time:103623ms step_avg:153.29ms step:687/1480 
train_time:103779ms step_avg:153.29ms step:688/1480 train_time:103937ms step_avg:153.30ms step:689/1480 train_time:104094ms step_avg:153.30ms step:690/1480 train_time:104250ms step_avg:153.31ms step:691/1480 train_time:104405ms step_avg:153.31ms step:692/1480 train_time:104562ms step_avg:153.32ms step:693/1480 train_time:104718ms step_avg:153.32ms step:694/1480 train_time:104875ms step_avg:153.33ms step:695/1480 train_time:105031ms step_avg:153.33ms step:696/1480 train_time:105187ms step_avg:153.33ms step:697/1480 train_time:105343ms step_avg:153.34ms step:698/1480 train_time:105498ms step_avg:153.34ms step:699/1480 train_time:105655ms step_avg:153.34ms step:700/1480 train_time:105811ms step_avg:153.35ms step:701/1480 train_time:105967ms step_avg:153.35ms step:702/1480 train_time:106126ms step_avg:153.36ms step:703/1480 train_time:106283ms step_avg:153.37ms step:704/1480 train_time:106438ms step_avg:153.37ms step:705/1480 train_time:106596ms step_avg:153.37ms step:706/1480 train_time:106753ms step_avg:153.38ms step:707/1480 train_time:106910ms step_avg:153.39ms step:708/1480 train_time:107065ms step_avg:153.39ms step:709/1480 train_time:107222ms step_avg:153.39ms step:710/1480 train_time:107377ms step_avg:153.40ms step:711/1480 train_time:107534ms step_avg:153.40ms step:712/1480 train_time:107692ms step_avg:153.41ms step:713/1480 train_time:107850ms step_avg:153.41ms step:714/1480 train_time:108007ms step_avg:153.42ms step:715/1480 train_time:108163ms step_avg:153.42ms step:716/1480 train_time:108319ms step_avg:153.43ms step:717/1480 train_time:108474ms step_avg:153.43ms step:718/1480 train_time:108631ms step_avg:153.43ms step:719/1480 train_time:108787ms step_avg:153.44ms step:720/1480 train_time:108945ms step_avg:153.44ms step:721/1480 train_time:109102ms step_avg:153.45ms step:722/1480 train_time:109257ms step_avg:153.45ms step:723/1480 train_time:109413ms step_avg:153.45ms step:724/1480 train_time:109569ms step_avg:153.46ms step:725/1480 train_time:109727ms 
step_avg:153.46ms step:726/1480 train_time:109884ms step_avg:153.47ms step:727/1480 train_time:110040ms step_avg:153.47ms step:728/1480 train_time:110196ms step_avg:153.48ms step:729/1480 train_time:110353ms step_avg:153.48ms step:730/1480 train_time:110512ms step_avg:153.49ms step:731/1480 train_time:110668ms step_avg:153.49ms step:732/1480 train_time:110824ms step_avg:153.50ms step:733/1480 train_time:110980ms step_avg:153.50ms step:734/1480 train_time:111137ms step_avg:153.50ms step:735/1480 train_time:111294ms step_avg:153.51ms step:736/1480 train_time:111449ms step_avg:153.51ms step:737/1480 train_time:111604ms step_avg:153.51ms step:738/1480 train_time:111761ms step_avg:153.52ms step:739/1480 train_time:111916ms step_avg:153.52ms step:740/1480 train_time:112073ms step_avg:153.52ms step:741/1480 train_time:112231ms step_avg:153.53ms step:742/1480 train_time:112386ms step_avg:153.53ms step:743/1480 train_time:112542ms step_avg:153.54ms step:744/1480 train_time:112697ms step_avg:153.54ms step:745/1480 train_time:112855ms step_avg:153.54ms step:746/1480 train_time:113011ms step_avg:153.55ms step:747/1480 train_time:113166ms step_avg:153.55ms step:748/1480 train_time:113327ms step_avg:153.56ms step:749/1480 train_time:113484ms step_avg:153.56ms step:750/1480 train_time:113640ms step_avg:153.57ms step:750/1480 val_loss:3.5465 train_time:113712ms step_avg:153.66ms step:751/1480 train_time:113807ms step_avg:153.59ms step:752/1480 train_time:113960ms step_avg:153.58ms step:753/1480 train_time:114116ms step_avg:153.59ms step:754/1480 train_time:114271ms step_avg:153.59ms step:755/1480 train_time:114427ms step_avg:153.59ms step:756/1480 train_time:114584ms step_avg:153.60ms step:757/1480 train_time:114743ms step_avg:153.60ms step:758/1480 train_time:114899ms step_avg:153.61ms step:759/1480 train_time:115070ms step_avg:153.63ms step:760/1480 train_time:115212ms step_avg:153.62ms step:761/1480 train_time:115369ms step_avg:153.62ms step:762/1480 train_time:115524ms 
step_avg:153.62ms step:763/1480 train_time:115681ms step_avg:153.63ms step:764/1480 train_time:115838ms step_avg:153.63ms step:765/1480 train_time:115995ms step_avg:153.64ms step:766/1480 train_time:116152ms step_avg:153.64ms step:767/1480 train_time:116310ms step_avg:153.65ms step:768/1480 train_time:116467ms step_avg:153.65ms step:769/1480 train_time:116624ms step_avg:153.65ms step:770/1480 train_time:116781ms step_avg:153.66ms step:771/1480 train_time:116939ms step_avg:153.66ms step:772/1480 train_time:117097ms step_avg:153.67ms step:773/1480 train_time:117254ms step_avg:153.67ms step:774/1480 train_time:117412ms step_avg:153.68ms step:775/1480 train_time:117568ms step_avg:153.68ms step:776/1480 train_time:117727ms step_avg:153.69ms step:777/1480 train_time:117886ms step_avg:153.70ms step:778/1480 train_time:118044ms step_avg:153.70ms step:779/1480 train_time:118201ms step_avg:153.71ms step:780/1480 train_time:118360ms step_avg:153.71ms step:781/1480 train_time:118518ms step_avg:153.72ms step:782/1480 train_time:118676ms step_avg:153.73ms step:783/1480 train_time:118834ms step_avg:153.73ms step:784/1480 train_time:118992ms step_avg:153.74ms step:785/1480 train_time:119148ms step_avg:153.74ms step:786/1480 train_time:119306ms step_avg:153.75ms step:787/1480 train_time:119463ms step_avg:153.75ms step:788/1480 train_time:119623ms step_avg:153.76ms step:789/1480 train_time:119780ms step_avg:153.76ms step:790/1480 train_time:119939ms step_avg:153.77ms step:791/1480 train_time:120099ms step_avg:153.78ms step:792/1480 train_time:120257ms step_avg:153.78ms step:793/1480 train_time:120414ms step_avg:153.79ms step:794/1480 train_time:120574ms step_avg:153.79ms step:795/1480 train_time:120734ms step_avg:153.80ms step:796/1480 train_time:120893ms step_avg:153.81ms step:797/1480 train_time:121051ms step_avg:153.81ms step:798/1480 train_time:121211ms step_avg:153.82ms step:799/1480 train_time:121370ms step_avg:153.83ms step:800/1480 train_time:121529ms step_avg:153.83ms 
step:801/1480 train_time:121685ms step_avg:153.84ms step:802/1480 train_time:121844ms step_avg:153.84ms step:803/1480 train_time:122002ms step_avg:153.85ms step:804/1480 train_time:122161ms step_avg:153.85ms step:805/1480 train_time:122321ms step_avg:153.86ms step:806/1480 train_time:122478ms step_avg:153.87ms step:807/1480 train_time:122635ms step_avg:153.87ms step:808/1480 train_time:122794ms step_avg:153.88ms step:809/1480 train_time:122950ms step_avg:153.88ms step:810/1480 train_time:123108ms step_avg:153.88ms step:811/1480 train_time:123264ms step_avg:153.89ms step:812/1480 train_time:123421ms step_avg:153.89ms step:813/1480 train_time:123578ms step_avg:153.89ms step:814/1480 train_time:123735ms step_avg:153.90ms step:815/1480 train_time:123891ms step_avg:153.90ms step:816/1480 train_time:124048ms step_avg:153.91ms step:817/1480 train_time:124206ms step_avg:153.91ms step:818/1480 train_time:124363ms step_avg:153.91ms step:819/1480 train_time:124520ms step_avg:153.92ms step:820/1480 train_time:124679ms step_avg:153.92ms step:821/1480 train_time:124837ms step_avg:153.93ms step:822/1480 train_time:124995ms step_avg:153.93ms step:823/1480 train_time:125152ms step_avg:153.94ms step:824/1480 train_time:125310ms step_avg:153.94ms step:825/1480 train_time:125468ms step_avg:153.95ms step:826/1480 train_time:125626ms step_avg:153.95ms step:827/1480 train_time:125785ms step_avg:153.96ms step:828/1480 train_time:125942ms step_avg:153.96ms step:829/1480 train_time:126101ms step_avg:153.97ms step:830/1480 train_time:126262ms step_avg:153.98ms step:831/1480 train_time:126420ms step_avg:153.98ms step:832/1480 train_time:126580ms step_avg:153.99ms step:833/1480 train_time:126738ms step_avg:154.00ms step:834/1480 train_time:126898ms step_avg:154.00ms step:835/1480 train_time:127055ms step_avg:154.01ms step:836/1480 train_time:127215ms step_avg:154.01ms step:837/1480 train_time:127371ms step_avg:154.02ms step:838/1480 train_time:127528ms step_avg:154.02ms step:839/1480 
train_time:127686ms step_avg:154.02ms step:840/1480 train_time:127843ms step_avg:154.03ms step:841/1480 train_time:128001ms step_avg:154.03ms step:842/1480 train_time:128160ms step_avg:154.04ms step:843/1480 train_time:128316ms step_avg:154.04ms step:844/1480 train_time:128474ms step_avg:154.05ms step:845/1480 train_time:128631ms step_avg:154.05ms step:846/1480 train_time:128791ms step_avg:154.06ms step:847/1480 train_time:128949ms step_avg:154.06ms step:848/1480 train_time:129107ms step_avg:154.07ms step:849/1480 train_time:129265ms step_avg:154.07ms step:850/1480 train_time:129422ms step_avg:154.07ms step:851/1480 train_time:129581ms step_avg:154.08ms step:852/1480 train_time:129740ms step_avg:154.09ms step:853/1480 train_time:129898ms step_avg:154.09ms step:854/1480 train_time:130056ms step_avg:154.09ms step:855/1480 train_time:130212ms step_avg:154.10ms step:856/1480 train_time:130370ms step_avg:154.10ms step:857/1480 train_time:130528ms step_avg:154.11ms step:858/1480 train_time:130687ms step_avg:154.11ms step:859/1480 train_time:130845ms step_avg:154.12ms step:860/1480 train_time:131002ms step_avg:154.12ms step:861/1480 train_time:131161ms step_avg:154.13ms step:862/1480 train_time:131321ms step_avg:154.13ms step:863/1480 train_time:131482ms step_avg:154.14ms step:864/1480 train_time:131642ms step_avg:154.15ms step:865/1480 train_time:131800ms step_avg:154.15ms step:866/1480 train_time:131959ms step_avg:154.16ms step:867/1480 train_time:132118ms step_avg:154.16ms step:868/1480 train_time:132274ms step_avg:154.17ms step:869/1480 train_time:132432ms step_avg:154.17ms step:870/1480 train_time:132591ms step_avg:154.18ms step:871/1480 train_time:132746ms step_avg:154.18ms step:872/1480 train_time:132906ms step_avg:154.18ms step:873/1480 train_time:133063ms step_avg:154.19ms step:874/1480 train_time:133222ms step_avg:154.19ms step:875/1480 train_time:133383ms step_avg:154.20ms step:875/1480 val_loss:3.5015 train_time:133455ms step_avg:154.28ms step:876/1480 
train_time:133546ms step_avg:154.21ms step:877/1480 train_time:133704ms step_avg:154.21ms step:878/1480 train_time:133862ms step_avg:154.22ms step:879/1480 train_time:134021ms step_avg:154.22ms step:880/1480 train_time:134178ms step_avg:154.23ms step:881/1480 train_time:134336ms step_avg:154.23ms step:882/1480 train_time:134495ms step_avg:154.24ms step:883/1480 train_time:134655ms step_avg:154.24ms step:884/1480 train_time:134814ms step_avg:154.25ms step:885/1480 train_time:134973ms step_avg:154.26ms step:886/1480 train_time:135133ms step_avg:154.26ms step:887/1480 train_time:135292ms step_avg:154.27ms step:888/1480 train_time:135455ms step_avg:154.28ms step:889/1480 train_time:135615ms step_avg:154.28ms step:890/1480 train_time:135772ms step_avg:154.29ms step:891/1480 train_time:135931ms step_avg:154.29ms step:892/1480 train_time:136089ms step_avg:154.30ms step:893/1480 train_time:136247ms step_avg:154.30ms step:894/1480 train_time:136407ms step_avg:154.31ms step:895/1480 train_time:136569ms step_avg:154.32ms step:896/1480 train_time:136727ms step_avg:154.32ms step:897/1480 train_time:136888ms step_avg:154.33ms step:898/1480 train_time:137049ms step_avg:154.33ms step:899/1480 train_time:137208ms step_avg:154.34ms step:900/1480 train_time:137368ms step_avg:154.35ms step:901/1480 train_time:137528ms step_avg:154.35ms step:902/1480 train_time:137686ms step_avg:154.36ms step:903/1480 train_time:137849ms step_avg:154.37ms step:904/1480 train_time:138008ms step_avg:154.37ms step:905/1480 train_time:138166ms step_avg:154.38ms step:906/1480 train_time:138326ms step_avg:154.38ms step:907/1480 train_time:138487ms step_avg:154.39ms step:908/1480 train_time:138646ms step_avg:154.39ms step:909/1480 train_time:138807ms step_avg:154.40ms step:910/1480 train_time:138972ms step_avg:154.41ms step:911/1480 train_time:139131ms step_avg:154.42ms step:912/1480 train_time:139290ms step_avg:154.42ms step:913/1480 train_time:139450ms step_avg:154.43ms step:914/1480 train_time:139609ms 
step_avg:154.44ms step:915/1480 train_time:139771ms step_avg:154.44ms step:916/1480 train_time:139929ms step_avg:154.45ms step:917/1480 train_time:140087ms step_avg:154.45ms step:918/1480 train_time:140249ms step_avg:154.46ms step:919/1480 train_time:140411ms step_avg:154.47ms step:920/1480 train_time:140571ms step_avg:154.47ms step:921/1480 train_time:140730ms step_avg:154.48ms step:922/1480 train_time:140890ms step_avg:154.48ms step:923/1480 train_time:141047ms step_avg:154.49ms step:924/1480 train_time:141205ms step_avg:154.49ms step:925/1480 train_time:141365ms step_avg:154.50ms step:926/1480 train_time:141525ms step_avg:154.50ms step:927/1480 train_time:141683ms step_avg:154.51ms step:928/1480 train_time:141843ms step_avg:154.51ms step:929/1480 train_time:142003ms step_avg:154.52ms step:930/1480 train_time:142163ms step_avg:154.53ms step:931/1480 train_time:142322ms step_avg:154.53ms step:932/1480 train_time:142482ms step_avg:154.54ms step:933/1480 train_time:142642ms step_avg:154.54ms step:934/1480 train_time:142801ms step_avg:154.55ms step:935/1480 train_time:142963ms step_avg:154.56ms step:936/1480 train_time:143124ms step_avg:154.56ms step:937/1480 train_time:143284ms step_avg:154.57ms step:938/1480 train_time:143443ms step_avg:154.57ms step:939/1480 train_time:143605ms step_avg:154.58ms step:940/1480 train_time:143768ms step_avg:154.59ms step:941/1480 train_time:143925ms step_avg:154.59ms step:942/1480 train_time:144083ms step_avg:154.60ms step:943/1480 train_time:144245ms step_avg:154.60ms step:944/1480 train_time:144407ms step_avg:154.61ms step:945/1480 train_time:144566ms step_avg:154.62ms step:946/1480 train_time:144730ms step_avg:154.63ms step:947/1480 train_time:144891ms step_avg:154.63ms step:948/1480 train_time:145050ms step_avg:154.64ms step:949/1480 train_time:145223ms step_avg:154.66ms step:950/1480 train_time:145368ms step_avg:154.65ms step:951/1480 train_time:145528ms step_avg:154.65ms step:952/1480 train_time:145686ms step_avg:154.66ms 
step:953/1480 train_time:145848ms step_avg:154.66ms step:954/1480 train_time:146007ms step_avg:154.67ms step:955/1480 train_time:146166ms step_avg:154.67ms step:956/1480 train_time:146325ms step_avg:154.68ms step:957/1480 train_time:146483ms step_avg:154.68ms step:958/1480 train_time:146651ms step_avg:154.70ms step:959/1480 train_time:146809ms step_avg:154.70ms step:960/1480 train_time:146970ms step_avg:154.71ms step:961/1480 train_time:147130ms step_avg:154.71ms step:962/1480 train_time:147288ms step_avg:154.71ms step:963/1480 train_time:147448ms step_avg:154.72ms step:964/1480 train_time:147609ms step_avg:154.73ms step:965/1480 train_time:147768ms step_avg:154.73ms step:966/1480 train_time:147926ms step_avg:154.73ms step:967/1480 train_time:148085ms step_avg:154.74ms step:968/1480 train_time:148245ms step_avg:154.74ms step:969/1480 train_time:148406ms step_avg:154.75ms step:970/1480 train_time:148564ms step_avg:154.75ms step:971/1480 train_time:148724ms step_avg:154.76ms step:972/1480 train_time:148882ms step_avg:154.76ms step:973/1480 train_time:149041ms step_avg:154.77ms step:974/1480 train_time:149201ms step_avg:154.77ms step:975/1480 train_time:149362ms step_avg:154.78ms step:976/1480 train_time:149524ms step_avg:154.79ms step:977/1480 train_time:149683ms step_avg:154.79ms step:978/1480 train_time:149843ms step_avg:154.80ms step:979/1480 train_time:150005ms step_avg:154.80ms step:980/1480 train_time:150166ms step_avg:154.81ms step:981/1480 train_time:150328ms step_avg:154.82ms step:982/1480 train_time:150484ms step_avg:154.82ms step:983/1480 train_time:150645ms step_avg:154.83ms step:984/1480 train_time:150805ms step_avg:154.83ms step:985/1480 train_time:150966ms step_avg:154.84ms step:986/1480 train_time:151126ms step_avg:154.84ms step:987/1480 train_time:151284ms step_avg:154.85ms step:988/1480 train_time:151446ms step_avg:154.85ms step:989/1480 train_time:151605ms step_avg:154.86ms step:990/1480 train_time:151768ms step_avg:154.87ms step:991/1480 
train_time:151930ms step_avg:154.87ms step:992/1480 train_time:152093ms step_avg:154.88ms step:993/1480 train_time:152260ms step_avg:154.89ms step:994/1480 train_time:152420ms step_avg:154.90ms step:995/1480 train_time:152579ms step_avg:154.90ms step:996/1480 train_time:152736ms step_avg:154.90ms step:997/1480 train_time:152896ms step_avg:154.91ms step:998/1480 train_time:153057ms step_avg:154.92ms step:999/1480 train_time:153216ms step_avg:154.92ms step:1000/1480 train_time:153379ms step_avg:154.93ms step:1000/1480 val_loss:3.4389 train_time:153453ms step_avg:155.00ms step:1001/1480 train_time:153544ms step_avg:154.94ms step:1002/1480 train_time:153701ms step_avg:154.94ms step:1003/1480 train_time:153864ms step_avg:154.95ms step:1004/1480 train_time:154025ms step_avg:154.95ms step:1005/1480 train_time:154184ms step_avg:154.96ms step:1006/1480 train_time:154345ms step_avg:154.97ms step:1007/1480 train_time:154505ms step_avg:154.97ms step:1008/1480 train_time:154663ms step_avg:154.97ms step:1009/1480 train_time:154828ms step_avg:154.98ms step:1010/1480 train_time:154986ms step_avg:154.99ms step:1011/1480 train_time:155146ms step_avg:154.99ms step:1012/1480 train_time:155303ms step_avg:154.99ms step:1013/1480 train_time:155464ms step_avg:155.00ms step:1014/1480 train_time:155623ms step_avg:155.00ms step:1015/1480 train_time:155786ms step_avg:155.01ms step:1016/1480 train_time:155945ms step_avg:155.01ms step:1017/1480 train_time:156105ms step_avg:155.02ms step:1018/1480 train_time:156265ms step_avg:155.02ms step:1019/1480 train_time:156426ms step_avg:155.03ms step:1020/1480 train_time:156586ms step_avg:155.04ms step:1021/1480 train_time:156745ms step_avg:155.04ms step:1022/1480 train_time:156905ms step_avg:155.04ms step:1023/1480 train_time:157065ms step_avg:155.05ms step:1024/1480 train_time:157225ms step_avg:155.05ms step:1025/1480 train_time:157387ms step_avg:155.06ms step:1026/1480 train_time:157546ms step_avg:155.06ms step:1027/1480 train_time:157704ms 
step_avg:155.07ms step:1028/1480 train_time:157866ms step_avg:155.07ms step:1029/1480 train_time:158028ms step_avg:155.08ms step:1030/1480 train_time:158190ms step_avg:155.09ms step:1031/1480 train_time:158351ms step_avg:155.09ms step:1032/1480 train_time:158515ms step_avg:155.10ms step:1033/1480 train_time:158676ms step_avg:155.11ms step:1034/1480 train_time:158836ms step_avg:155.11ms step:1035/1480 train_time:158996ms step_avg:155.12ms step:1036/1480 train_time:159157ms step_avg:155.12ms step:1037/1480 train_time:159317ms step_avg:155.13ms step:1038/1480 train_time:159478ms step_avg:155.13ms step:1039/1480 train_time:159639ms step_avg:155.14ms step:1040/1480 train_time:159801ms step_avg:155.15ms step:1041/1480 train_time:159960ms step_avg:155.15ms step:1042/1480 train_time:160118ms step_avg:155.15ms step:1043/1480 train_time:160278ms step_avg:155.16ms step:1044/1480 train_time:160436ms step_avg:155.16ms step:1045/1480 train_time:160598ms step_avg:155.17ms step:1046/1480 train_time:160758ms step_avg:155.17ms step:1047/1480 train_time:160920ms step_avg:155.18ms step:1048/1480 train_time:161080ms step_avg:155.18ms step:1049/1480 train_time:161239ms step_avg:155.19ms step:1050/1480 train_time:161400ms step_avg:155.19ms step:1051/1480 train_time:161560ms step_avg:155.20ms step:1052/1480 train_time:161722ms step_avg:155.20ms step:1053/1480 train_time:161883ms step_avg:155.21ms step:1054/1480 train_time:162043ms step_avg:155.21ms step:1055/1480 train_time:162203ms step_avg:155.22ms step:1056/1480 train_time:162361ms step_avg:155.22ms step:1057/1480 train_time:162521ms step_avg:155.23ms step:1058/1480 train_time:162682ms step_avg:155.23ms step:1059/1480 train_time:162844ms step_avg:155.24ms step:1060/1480 train_time:163004ms step_avg:155.24ms step:1061/1480 train_time:163161ms step_avg:155.24ms step:1062/1480 train_time:163321ms step_avg:155.25ms step:1063/1480 train_time:163480ms step_avg:155.25ms step:1064/1480 train_time:163638ms step_avg:155.25ms step:1065/1480 
train_time:163800ms step_avg:155.26ms step:1066/1480 train_time:163961ms step_avg:155.27ms step:1067/1480 train_time:164123ms step_avg:155.27ms step:1068/1480 train_time:164283ms step_avg:155.28ms step:1069/1480 train_time:164445ms step_avg:155.28ms step:1070/1480 train_time:164603ms step_avg:155.29ms step:1071/1480 train_time:164767ms step_avg:155.29ms step:1072/1480 train_time:164925ms step_avg:155.30ms step:1073/1480 train_time:165083ms step_avg:155.30ms step:1074/1480 train_time:165243ms step_avg:155.30ms step:1075/1480 train_time:165404ms step_avg:155.31ms step:1076/1480 train_time:165561ms step_avg:155.31ms step:1077/1480 train_time:165721ms step_avg:155.32ms step:1078/1480 train_time:165885ms step_avg:155.32ms step:1079/1480 train_time:166047ms step_avg:155.33ms step:1080/1480 train_time:166210ms step_avg:155.34ms step:1081/1480 train_time:166371ms step_avg:155.34ms step:1082/1480 train_time:166533ms step_avg:155.35ms step:1083/1480 train_time:166695ms step_avg:155.35ms step:1084/1480 train_time:166856ms step_avg:155.36ms step:1085/1480 train_time:167017ms step_avg:155.36ms step:1086/1480 train_time:167178ms step_avg:155.37ms step:1087/1480 train_time:167339ms step_avg:155.37ms step:1088/1480 train_time:167499ms step_avg:155.38ms step:1089/1480 train_time:167661ms step_avg:155.39ms step:1090/1480 train_time:167824ms step_avg:155.39ms step:1091/1480 train_time:167983ms step_avg:155.40ms step:1092/1480 train_time:168144ms step_avg:155.40ms step:1093/1480 train_time:168304ms step_avg:155.41ms step:1094/1480 train_time:168463ms step_avg:155.41ms step:1095/1480 train_time:168623ms step_avg:155.41ms step:1096/1480 train_time:168784ms step_avg:155.42ms step:1097/1480 train_time:168945ms step_avg:155.42ms step:1098/1480 train_time:169109ms step_avg:155.43ms step:1099/1480 train_time:169271ms step_avg:155.44ms step:1100/1480 train_time:169436ms step_avg:155.45ms step:1101/1480 train_time:169600ms step_avg:155.45ms step:1102/1480 train_time:169761ms step_avg:155.46ms 
step:1103/1480 train_time:169926ms step_avg:155.47ms step:1104/1480 train_time:170087ms step_avg:155.47ms step:1105/1480 train_time:170250ms step_avg:155.48ms step:1106/1480 train_time:170412ms step_avg:155.49ms step:1107/1480 train_time:170575ms step_avg:155.49ms step:1108/1480 train_time:170736ms step_avg:155.50ms step:1109/1480 train_time:170896ms step_avg:155.50ms step:1110/1480 train_time:171056ms step_avg:155.51ms step:1111/1480 train_time:171218ms step_avg:155.51ms step:1112/1480 train_time:171381ms step_avg:155.52ms step:1113/1480 train_time:171549ms step_avg:155.53ms step:1114/1480 train_time:171711ms step_avg:155.54ms step:1115/1480 train_time:171872ms step_avg:155.54ms step:1116/1480 train_time:172033ms step_avg:155.54ms step:1117/1480 train_time:172197ms step_avg:155.55ms step:1118/1480 train_time:172361ms step_avg:155.56ms step:1119/1480 train_time:172522ms step_avg:155.57ms step:1120/1480 train_time:172682ms step_avg:155.57ms step:1121/1480 train_time:172844ms step_avg:155.57ms step:1122/1480 train_time:173003ms step_avg:155.58ms step:1123/1480 train_time:173163ms step_avg:155.58ms step:1124/1480 train_time:173324ms step_avg:155.59ms step:1125/1480 train_time:173485ms step_avg:155.59ms step:1125/1480 val_loss:3.3830 train_time:173559ms step_avg:155.66ms step:1126/1480 train_time:173655ms step_avg:155.60ms step:1127/1480 train_time:173809ms step_avg:155.60ms step:1128/1480 train_time:173968ms step_avg:155.61ms step:1129/1480 train_time:174132ms step_avg:155.61ms step:1130/1480 train_time:174292ms step_avg:155.62ms step:1131/1480 train_time:174461ms step_avg:155.63ms step:1132/1480 train_time:174621ms step_avg:155.63ms step:1133/1480 train_time:174785ms step_avg:155.64ms step:1134/1480 train_time:174947ms step_avg:155.65ms step:1135/1480 train_time:175107ms step_avg:155.65ms step:1136/1480 train_time:175270ms step_avg:155.66ms step:1137/1480 train_time:175431ms step_avg:155.66ms step:1138/1480 train_time:175599ms step_avg:155.67ms step:1139/1480 
train_time:175772ms step_avg:155.69ms step:1140/1480 train_time:175923ms step_avg:155.68ms step:1141/1480 train_time:176086ms step_avg:155.69ms step:1142/1480 train_time:176247ms step_avg:155.69ms step:1143/1480 train_time:176410ms step_avg:155.70ms step:1144/1480 train_time:176572ms step_avg:155.71ms step:1145/1480 train_time:176732ms step_avg:155.71ms step:1146/1480 train_time:176897ms step_avg:155.72ms step:1147/1480 train_time:177059ms step_avg:155.72ms step:1148/1480 train_time:177220ms step_avg:155.73ms step:1149/1480 train_time:177382ms step_avg:155.74ms step:1150/1480 train_time:177543ms step_avg:155.74ms step:1151/1480 train_time:177705ms step_avg:155.74ms step:1152/1480 train_time:177867ms step_avg:155.75ms step:1153/1480 train_time:178031ms step_avg:155.76ms step:1154/1480 train_time:178191ms step_avg:155.76ms step:1155/1480 train_time:178353ms step_avg:155.77ms step:1156/1480 train_time:178522ms step_avg:155.78ms step:1157/1480 train_time:178684ms step_avg:155.78ms step:1158/1480 train_time:178844ms step_avg:155.79ms step:1159/1480 train_time:179004ms step_avg:155.79ms step:1160/1480 train_time:179164ms step_avg:155.80ms step:1161/1480 train_time:179325ms step_avg:155.80ms step:1162/1480 train_time:179486ms step_avg:155.80ms step:1163/1480 train_time:179648ms step_avg:155.81ms step:1164/1480 train_time:179808ms step_avg:155.81ms step:1165/1480 train_time:179967ms step_avg:155.82ms step:1166/1480 train_time:180128ms step_avg:155.82ms step:1167/1480 train_time:180288ms step_avg:155.82ms step:1168/1480 train_time:180452ms step_avg:155.83ms step:1169/1480 train_time:180615ms step_avg:155.84ms step:1170/1480 train_time:180776ms step_avg:155.84ms step:1171/1480 train_time:180939ms step_avg:155.85ms step:1172/1480 train_time:181100ms step_avg:155.85ms step:1173/1480 train_time:181263ms step_avg:155.86ms step:1174/1480 train_time:181430ms step_avg:155.87ms step:1175/1480 train_time:181592ms step_avg:155.87ms step:1176/1480 train_time:181756ms step_avg:155.88ms 
step:1177/1480 train_time:181924ms step_avg:155.89ms step:1178/1480 train_time:182085ms step_avg:155.89ms step:1179/1480 train_time:182244ms step_avg:155.90ms step:1180/1480 train_time:182410ms step_avg:155.91ms step:1181/1480 train_time:182571ms step_avg:155.91ms step:1182/1480 train_time:182733ms step_avg:155.92ms step:1183/1480 train_time:182895ms step_avg:155.92ms step:1184/1480 train_time:183058ms step_avg:155.93ms step:1185/1480 train_time:183223ms step_avg:155.93ms step:1186/1480 train_time:183386ms step_avg:155.94ms step:1187/1480 train_time:183558ms step_avg:155.95ms step:1188/1480 train_time:183718ms step_avg:155.96ms step:1189/1480 train_time:183880ms step_avg:155.96ms step:1190/1480 train_time:184042ms step_avg:155.97ms step:1191/1480 train_time:184204ms step_avg:155.97ms step:1192/1480 train_time:184364ms step_avg:155.98ms step:1193/1480 train_time:184524ms step_avg:155.98ms step:1194/1480 train_time:184685ms step_avg:155.98ms step:1195/1480 train_time:184847ms step_avg:155.99ms step:1196/1480 train_time:185015ms step_avg:156.00ms step:1197/1480 train_time:185179ms step_avg:156.01ms step:1198/1480 train_time:185347ms step_avg:156.02ms step:1199/1480 train_time:185509ms step_avg:156.02ms step:1200/1480 train_time:185670ms step_avg:156.03ms step:1201/1480 train_time:185830ms step_avg:156.03ms step:1202/1480 train_time:186000ms step_avg:156.04ms step:1203/1480 train_time:186167ms step_avg:156.05ms step:1204/1480 train_time:186330ms step_avg:156.06ms step:1205/1480 train_time:186490ms step_avg:156.06ms step:1206/1480 train_time:186653ms step_avg:156.06ms step:1207/1480 train_time:186815ms step_avg:156.07ms step:1208/1480 train_time:186978ms step_avg:156.08ms step:1209/1480 train_time:187143ms step_avg:156.08ms step:1210/1480 train_time:187308ms step_avg:156.09ms step:1211/1480 train_time:187470ms step_avg:156.09ms step:1212/1480 train_time:187634ms step_avg:156.10ms step:1213/1480 train_time:187800ms step_avg:156.11ms step:1214/1480 train_time:187966ms 
step_avg:156.12ms step:1215/1480 train_time:188129ms step_avg:156.12ms step:1216/1480 train_time:188290ms step_avg:156.13ms step:1217/1480 train_time:188454ms step_avg:156.13ms step:1218/1480 train_time:188615ms step_avg:156.14ms step:1219/1480 train_time:188784ms step_avg:156.15ms step:1220/1480 train_time:188946ms step_avg:156.15ms step:1221/1480 train_time:189105ms step_avg:156.16ms step:1222/1480 train_time:189265ms step_avg:156.16ms step:1223/1480 train_time:189428ms step_avg:156.16ms step:1224/1480 train_time:189592ms step_avg:156.17ms step:1225/1480 train_time:189757ms step_avg:156.18ms step:1226/1480 train_time:189922ms step_avg:156.19ms step:1227/1480 train_time:190086ms step_avg:156.19ms step:1228/1480 train_time:190246ms step_avg:156.20ms step:1229/1480 train_time:190410ms step_avg:156.20ms step:1230/1480 train_time:190579ms step_avg:156.21ms step:1231/1480 train_time:190745ms step_avg:156.22ms step:1232/1480 train_time:190909ms step_avg:156.23ms step:1233/1480 train_time:191069ms step_avg:156.23ms step:1234/1480 train_time:191234ms step_avg:156.24ms step:1235/1480 train_time:191401ms step_avg:156.25ms step:1236/1480 train_time:191563ms step_avg:156.25ms step:1237/1480 train_time:191724ms step_avg:156.25ms step:1238/1480 train_time:191897ms step_avg:156.27ms step:1239/1480 train_time:192060ms step_avg:156.27ms step:1240/1480 train_time:192223ms step_avg:156.28ms step:1241/1480 train_time:192388ms step_avg:156.29ms step:1242/1480 train_time:192548ms step_avg:156.29ms step:1243/1480 train_time:192711ms step_avg:156.29ms step:1244/1480 train_time:192873ms step_avg:156.30ms step:1245/1480 train_time:193037ms step_avg:156.31ms step:1246/1480 train_time:193200ms step_avg:156.31ms step:1247/1480 train_time:193362ms step_avg:156.32ms step:1248/1480 train_time:193524ms step_avg:156.32ms step:1249/1480 train_time:193684ms step_avg:156.32ms step:1250/1480 train_time:193845ms step_avg:156.33ms step:1250/1480 val_loss:3.3336 train_time:193920ms step_avg:156.39ms 
step:1251/1480 train_time:194014ms step_avg:156.34ms step:1252/1480 train_time:194178ms step_avg:156.34ms step:1253/1480 train_time:194339ms step_avg:156.35ms step:1254/1480 train_time:194502ms step_avg:156.35ms step:1255/1480 train_time:194671ms step_avg:156.36ms step:1256/1480 train_time:194837ms step_avg:156.37ms step:1257/1480 train_time:195000ms step_avg:156.37ms step:1258/1480 train_time:195164ms step_avg:156.38ms step:1259/1480 train_time:195326ms step_avg:156.39ms step:1260/1480 train_time:195487ms step_avg:156.39ms step:1261/1480 train_time:195649ms step_avg:156.39ms step:1262/1480 train_time:195813ms step_avg:156.40ms step:1263/1480 train_time:195979ms step_avg:156.41ms step:1264/1480 train_time:196138ms step_avg:156.41ms step:1265/1480 train_time:196299ms step_avg:156.41ms step:1266/1480 train_time:196462ms step_avg:156.42ms step:1267/1480 train_time:196624ms step_avg:156.42ms step:1268/1480 train_time:196786ms step_avg:156.43ms step:1269/1480 train_time:196952ms step_avg:156.44ms step:1270/1480 train_time:197114ms step_avg:156.44ms step:1271/1480 train_time:197279ms step_avg:156.45ms step:1272/1480 train_time:197440ms step_avg:156.45ms step:1273/1480 train_time:197605ms step_avg:156.46ms step:1274/1480 train_time:197769ms step_avg:156.46ms step:1275/1480 train_time:197931ms step_avg:156.47ms step:1276/1480 train_time:198092ms step_avg:156.47ms step:1277/1480 train_time:198255ms step_avg:156.48ms step:1278/1480 train_time:198415ms step_avg:156.48ms step:1279/1480 train_time:198578ms step_avg:156.48ms step:1280/1480 train_time:198744ms step_avg:156.49ms step:1281/1480 train_time:198905ms step_avg:156.49ms step:1282/1480 train_time:199064ms step_avg:156.50ms step:1283/1480 train_time:199225ms step_avg:156.50ms step:1284/1480 train_time:199387ms step_avg:156.50ms step:1285/1480 train_time:199549ms step_avg:156.51ms step:1286/1480 train_time:199710ms step_avg:156.51ms step:1287/1480 train_time:199874ms step_avg:156.52ms step:1288/1480 train_time:200038ms 
step_avg:156.52ms step:1289/1480 train_time:200207ms step_avg:156.53ms step:1290/1480 train_time:200377ms step_avg:156.54ms step:1291/1480 train_time:200541ms step_avg:156.55ms step:1292/1480 train_time:200704ms step_avg:156.56ms step:1293/1480 train_time:200871ms step_avg:156.56ms step:1294/1480 train_time:201036ms step_avg:156.57ms step:1295/1480 train_time:201200ms step_avg:156.58ms step:1296/1480 train_time:201362ms step_avg:156.58ms step:1297/1480 train_time:201525ms step_avg:156.58ms step:1298/1480 train_time:201687ms step_avg:156.59ms step:1299/1480 train_time:201848ms step_avg:156.59ms step:1300/1480 train_time:202008ms step_avg:156.60ms step:1301/1480 train_time:202172ms step_avg:156.60ms step:1302/1480 train_time:202338ms step_avg:156.61ms step:1303/1480 train_time:202505ms step_avg:156.62ms step:1304/1480 train_time:202669ms step_avg:156.62ms step:1305/1480 train_time:202831ms step_avg:156.63ms step:1306/1480 train_time:202998ms step_avg:156.63ms step:1307/1480 train_time:203161ms step_avg:156.64ms step:1308/1480 train_time:203321ms step_avg:156.64ms step:1309/1480 train_time:203485ms step_avg:156.65ms step:1310/1480 train_time:203646ms step_avg:156.65ms step:1311/1480 train_time:203806ms step_avg:156.65ms step:1312/1480 train_time:203972ms step_avg:156.66ms step:1313/1480 train_time:204136ms step_avg:156.67ms step:1314/1480 train_time:204301ms step_avg:156.67ms step:1315/1480 train_time:204465ms step_avg:156.68ms step:1316/1480 train_time:204625ms step_avg:156.68ms step:1317/1480 train_time:204786ms step_avg:156.68ms step:1318/1480 train_time:204952ms step_avg:156.69ms step:1319/1480 train_time:205119ms step_avg:156.70ms step:1320/1480 train_time:205287ms step_avg:156.71ms step:1321/1480 train_time:205449ms step_avg:156.71ms step:1322/1480 train_time:205622ms step_avg:156.72ms step:1323/1480 train_time:205785ms step_avg:156.73ms step:1324/1480 train_time:205947ms step_avg:156.73ms step:1325/1480 train_time:206118ms step_avg:156.74ms step:1326/1480 
train_time:206284ms step_avg:156.75ms step:1327/1480 train_time:206445ms step_avg:156.75ms step:1328/1480 train_time:206607ms step_avg:156.76ms step:1329/1480 train_time:206794ms step_avg:156.78ms step:1330/1480 train_time:206955ms step_avg:156.78ms step:1331/1480 train_time:207118ms step_avg:156.79ms step:1332/1480 train_time:207282ms step_avg:156.79ms step:1333/1480 train_time:207445ms step_avg:156.80ms step:1334/1480 train_time:207609ms step_avg:156.80ms step:1335/1480 train_time:207770ms step_avg:156.81ms step:1336/1480 train_time:207938ms step_avg:156.82ms step:1337/1480 train_time:208105ms step_avg:156.82ms step:1338/1480 train_time:208267ms step_avg:156.83ms step:1339/1480 train_time:208431ms step_avg:156.83ms step:1340/1480 train_time:208596ms step_avg:156.84ms step:1341/1480 train_time:208759ms step_avg:156.84ms step:1342/1480 train_time:208924ms step_avg:156.85ms step:1343/1480 train_time:209085ms step_avg:156.85ms step:1344/1480 train_time:209247ms step_avg:156.86ms step:1345/1480 train_time:209417ms step_avg:156.87ms step:1346/1480 train_time:209580ms step_avg:156.87ms step:1347/1480 train_time:209742ms step_avg:156.88ms step:1348/1480 train_time:209905ms step_avg:156.88ms step:1349/1480 train_time:210066ms step_avg:156.88ms step:1350/1480 train_time:210232ms step_avg:156.89ms step:1351/1480 train_time:210394ms step_avg:156.89ms step:1352/1480 train_time:210558ms step_avg:156.90ms step:1353/1480 train_time:210724ms step_avg:156.91ms step:1354/1480 train_time:210886ms step_avg:156.91ms step:1355/1480 train_time:211047ms step_avg:156.91ms step:1356/1480 train_time:211211ms step_avg:156.92ms step:1357/1480 train_time:211376ms step_avg:156.92ms step:1358/1480 train_time:211541ms step_avg:156.93ms step:1359/1480 train_time:211706ms step_avg:156.94ms step:1360/1480 train_time:211872ms step_avg:156.94ms step:1361/1480 train_time:212041ms step_avg:156.95ms step:1362/1480 train_time:212205ms step_avg:156.96ms step:1363/1480 train_time:212373ms step_avg:156.96ms 
step:1364/1480 train_time:212537ms step_avg:156.97ms step:1365/1480 train_time:212697ms step_avg:156.97ms step:1366/1480 train_time:212863ms step_avg:156.98ms step:1367/1480 train_time:213025ms step_avg:156.98ms step:1368/1480 train_time:213189ms step_avg:156.99ms step:1369/1480 train_time:213359ms step_avg:157.00ms step:1370/1480 train_time:213525ms step_avg:157.00ms step:1371/1480 train_time:213687ms step_avg:157.01ms step:1372/1480 train_time:213856ms step_avg:157.02ms step:1373/1480 train_time:214017ms step_avg:157.02ms step:1374/1480 train_time:214183ms step_avg:157.03ms step:1375/1480 train_time:214344ms step_avg:157.03ms step:1375/1480 val_loss:3.2953 train_time:214419ms step_avg:157.08ms step:1376/1480 train_time:214515ms step_avg:157.04ms step:1377/1480 train_time:214678ms step_avg:157.04ms step:1378/1480 train_time:214841ms step_avg:157.05ms step:1379/1480 train_time:215005ms step_avg:157.05ms step:1380/1480 train_time:215168ms step_avg:157.06ms step:1381/1480 train_time:215337ms step_avg:157.07ms step:1382/1480 train_time:215501ms step_avg:157.07ms step:1383/1480 train_time:215663ms step_avg:157.07ms step:1384/1480 train_time:215830ms step_avg:157.08ms step:1385/1480 train_time:215990ms step_avg:157.08ms step:1386/1480 train_time:216154ms step_avg:157.09ms step:1387/1480 train_time:216319ms step_avg:157.09ms step:1388/1480 train_time:216479ms step_avg:157.10ms step:1389/1480 train_time:216643ms step_avg:157.10ms step:1390/1480 train_time:216804ms step_avg:157.10ms step:1391/1480 train_time:216965ms step_avg:157.11ms step:1392/1480 train_time:217130ms step_avg:157.11ms step:1393/1480 train_time:217295ms step_avg:157.12ms step:1394/1480 train_time:217459ms step_avg:157.12ms step:1395/1480 train_time:217622ms step_avg:157.13ms step:1396/1480 train_time:217783ms step_avg:157.13ms step:1397/1480 train_time:217943ms step_avg:157.13ms step:1398/1480 train_time:218103ms step_avg:157.13ms step:1399/1480 train_time:218263ms step_avg:157.14ms step:1400/1480 
train_time:218432ms step_avg:157.15ms step:1401/1480 train_time:218593ms step_avg:157.15ms step:1402/1480 train_time:218755ms step_avg:157.15ms step:1403/1480 train_time:218923ms step_avg:157.16ms step:1404/1480 train_time:219085ms step_avg:157.16ms step:1405/1480 train_time:219251ms step_avg:157.17ms step:1406/1480 train_time:219418ms step_avg:157.18ms step:1407/1480 train_time:219577ms step_avg:157.18ms step:1408/1480 train_time:219739ms step_avg:157.18ms step:1409/1480 train_time:219911ms step_avg:157.19ms step:1410/1480 train_time:220075ms step_avg:157.20ms step:1411/1480 train_time:220237ms step_avg:157.20ms step:1412/1480 train_time:220399ms step_avg:157.20ms step:1413/1480 train_time:220561ms step_avg:157.21ms step:1414/1480 train_time:220725ms step_avg:157.21ms step:1415/1480 train_time:220888ms step_avg:157.22ms step:1416/1480 train_time:221064ms step_avg:157.23ms step:1417/1480 train_time:221229ms step_avg:157.23ms step:1418/1480 train_time:221396ms step_avg:157.24ms step:1419/1480 train_time:221563ms step_avg:157.25ms step:1420/1480 train_time:221726ms step_avg:157.25ms step:1421/1480 train_time:221893ms step_avg:157.26ms step:1422/1480 train_time:222057ms step_avg:157.26ms step:1423/1480 train_time:222219ms step_avg:157.27ms step:1424/1480 train_time:222383ms step_avg:157.27ms step:1425/1480 train_time:222554ms step_avg:157.28ms step:1426/1480 train_time:222719ms step_avg:157.29ms step:1427/1480 train_time:222883ms step_avg:157.29ms step:1428/1480 train_time:223045ms step_avg:157.30ms step:1429/1480 train_time:223205ms step_avg:157.30ms step:1430/1480 train_time:223372ms step_avg:157.30ms step:1431/1480 train_time:223538ms step_avg:157.31ms step:1432/1480 train_time:223708ms step_avg:157.32ms step:1433/1480 train_time:223878ms step_avg:157.33ms step:1434/1480 train_time:224048ms step_avg:157.34ms step:1435/1480 train_time:224215ms step_avg:157.34ms step:1436/1480 train_time:224379ms step_avg:157.35ms step:1437/1480 train_time:224541ms step_avg:157.35ms 
step:1438/1480 train_time:224702ms step_avg:157.35ms step:1439/1480 train_time:224866ms step_avg:157.36ms step:1440/1480 train_time:225030ms step_avg:157.36ms step:1441/1480 train_time:225195ms step_avg:157.37ms step:1442/1480 train_time:225361ms step_avg:157.38ms step:1443/1480 train_time:225535ms step_avg:157.39ms step:1444/1480 train_time:225699ms step_avg:157.39ms step:1445/1480 train_time:225861ms step_avg:157.39ms step:1446/1480 train_time:226025ms step_avg:157.40ms step:1447/1480 train_time:226195ms step_avg:157.41ms step:1448/1480 train_time:226357ms step_avg:157.41ms step:1449/1480 train_time:226521ms step_avg:157.42ms step:1450/1480 train_time:226684ms step_avg:157.42ms step:1451/1480 train_time:226847ms step_avg:157.42ms step:1452/1480 train_time:227011ms step_avg:157.43ms step:1453/1480 train_time:227176ms step_avg:157.43ms step:1454/1480 train_time:227339ms step_avg:157.44ms step:1455/1480 train_time:227506ms step_avg:157.44ms step:1456/1480 train_time:227669ms step_avg:157.45ms step:1457/1480 train_time:227832ms step_avg:157.45ms step:1458/1480 train_time:227997ms step_avg:157.46ms step:1459/1480 train_time:228162ms step_avg:157.46ms step:1460/1480 train_time:228325ms step_avg:157.47ms step:1461/1480 train_time:228488ms step_avg:157.47ms step:1462/1480 train_time:228656ms step_avg:157.48ms step:1463/1480 train_time:228822ms step_avg:157.48ms step:1464/1480 train_time:228985ms step_avg:157.49ms step:1465/1480 train_time:229149ms step_avg:157.49ms step:1466/1480 train_time:229313ms step_avg:157.50ms step:1467/1480 train_time:229478ms step_avg:157.50ms step:1468/1480 train_time:229641ms step_avg:157.50ms step:1469/1480 train_time:229804ms step_avg:157.51ms step:1470/1480 train_time:229972ms step_avg:157.52ms step:1471/1480 train_time:230144ms step_avg:157.53ms step:1472/1480 train_time:230316ms step_avg:157.53ms step:1473/1480 train_time:230480ms step_avg:157.54ms step:1474/1480 train_time:230645ms step_avg:157.54ms step:1475/1480 train_time:230815ms 
step_avg:157.55ms step:1476/1480 train_time:230979ms step_avg:157.56ms step:1477/1480 train_time:231146ms step_avg:157.56ms step:1478/1480 train_time:231318ms step_avg:157.57ms step:1479/1480 train_time:231482ms step_avg:157.58ms step:1480/1480 train_time:231645ms step_avg:157.58ms step:1480/1480 val_loss:3.2764 train_time:231721ms step_avg:157.63ms peak memory consumption: 34237 MiB