import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import glob import time import contextlib from dataclasses import dataclass import numpy as np import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import flex_attention, create_block_mask flex_attention = torch.compile(flex_attention, dynamic=False) create_block_mask = torch.compile(create_block_mask, dynamic=False) # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. """ assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. """ def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) super().__init__(params, defaults) def step(self): for group in self.param_groups: lr = group['lr'] momentum = group['momentum'] zeropower_backend = zeropower_backends[group['backend']] # generate weight updates in distributed fashion total_params = sum(p.numel() for p in group['params']) updates_flat = torch.zeros(total_params, device='cuda', dtype=torch.bfloat16) curr_idx = 0 for i, p in enumerate(group['params']): # luckily this will perfectly distribute a transformer with multiple of 4 layers to 8 GPUs if i % int(os.environ['WORLD_SIZE']) == int(os.environ['RANK']): g = p.grad assert g is not None state = self.state[p] if 'momentum_buffer' not in state: state['momentum_buffer'] = torch.zeros_like(g) buf = state['momentum_buffer'] buf.mul_(momentum).add_(g) g = g.add(buf, alpha=momentum) if group['nesterov'] else buf g = zeropower_backend(g, steps=group['backend_steps']) g *= max(1, g.size(0)/g.size(1))**0.5 updates_flat[curr_idx:curr_idx+p.numel()] = g.flatten() curr_idx += p.numel() # sync updates across devices. we are not memory-constrained so can do this simple deserialization dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) # deserialize and apply updates curr_idx = 0 for p in group['params']: g = updates_flat[curr_idx:curr_idx+p.numel()].view_as(p.data).type_as(p.data) p.data.add_(g, alpha=-lr) curr_idx += p.numel() # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, n_head): super().__init__() assert dim % n_head == 0 self.n_head = n_head self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) # value residual lambda self.lamb = nn.Parameter(torch.tensor(0.5)) # @Grad62304977 # rotary embeddings self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim # output projection self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x, vi, block_mask): B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q = self.c_q(x).view(B, T, self.n_head, -1) k = self.c_k(x).view(B, T, self.n_head, -1) v = self.c_v(x).view(B, T, self.n_head, -1) v = (1 - self.lamb) * v + self.lamb * vi.view_as(v) # @Grad62304977 q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x): x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.n_embd, config.n_head) self.mlp = MLP(config.n_embd) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x, vi, x0, block_mask): x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 n_layer : int = 12 n_head : int = 6 # head dim 128 suggested by @Grad62304977 n_embd : int = 768 class GPT(nn.Module): def __init__(self, config): super().__init__() # U-net design by @brendanh0gan self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection weights for decoder layers self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers)) self.transformer = nn.ModuleDict(dict( wte = nn.Embedding(config.vocab_size, config.n_embd), # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning vte = nn.Embedding(config.vocab_size, config.n_embd*12), h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), )) self.lm_head = CastedLinear(config.n_embd, config.vocab_size) self.lm_head.weight.data.zero_() # @Grad62304977 def forward(self, idx, target, attn_blocksize): docs = (idx == 50256).cumsum(0) def document_causal_mask(b, h, q_idx, kv_idx): causal_mask = q_idx >= kv_idx document_mask = docs[q_idx] == docs[kv_idx] window_mask = q_idx - kv_idx < attn_blocksize return causal_mask & document_mask & window_mask S = len(idx) block_mask = create_block_mask(document_causal_mask, None, None, S, S, device="cuda", _compile=True) # forward the GPT model itself x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd) x = norm(x) # @Grad62304977 x0 = x vi = self.transformer.vte(idx[None]).chunk(12, dim=-1) # Store outputs for U-Net skip connections skip_connections = [] # Encoder pass - process only the first half of the blocks for i in range(self.num_encoder_layers): x = self.transformer.h[i](x, vi[i], x0, block_mask) skip_connections.append(x) # Decoder pass - process the remaining blocks with weighted skip connections for i in range(self.num_decoder_layers): x = x + self.skip_weights[i] * skip_connections.pop() x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers+i], x0, block_mask) x = norm(x) logits = self.lm_head(x) logits = 30 * torch.tanh(logits / 30) # @Grad62304977 logits = logits.float() loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1)) return loss # ----------------------------------------------------------------------------- # Our own simple Distributed Data Loader def _peek_data_shard(filename): # only reads the header, returns header data with open(filename, "rb") as f: # first read the header, which is 256 int32 integers (4 bytes each) header = np.frombuffer(f.read(256*4), dtype=np.int32) if header[0] != 20240520: print("ERROR: magic number mismatch in the data .bin file!") print("---> HINT: Are you passing in a correct file with --input_bin?") print("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README") print("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try") exit(1) assert header[1] == 1, "unsupported version" ntok = header[2] # number of tokens (claimed) return ntok # for now just return the number of tokens def _load_data_shard(filename): with open(filename, "rb") as f: # first read the header, which is 256 int32 integers (4 bytes each) header = np.frombuffer(f.read(256*4), dtype=np.int32) assert header[0] == 20240520, "magic number mismatch in the data .bin file" assert header[1] == 1, "unsupported version" ntok = header[2] # number of tokens (claimed) # the rest of it are tokens, stored as uint16 tokens = np.frombuffer(f.read(), dtype=np.uint16) assert len(tokens) == ntok, "number of tokens read does not match header?" return tokens class DistributedDataLoader: def __init__(self, filename_pattern, T, process_rank, num_processes): self.process_rank = process_rank self.num_processes = num_processes self.T = T # glob files that match the pattern self.files = sorted(glob.glob(filename_pattern)) assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" # load and validate all data shards, count number of tokens in total ntok_total = 0 for fname in self.files: shard_ntok = _peek_data_shard(fname) assert shard_ntok >= num_processes * T + 1 ntok_total += int(shard_ntok) self.ntok_total = ntok_total self.reset() def reset(self): self.current_shard = -1 self.advance() def advance(self): # advance to next data shard self.current_shard = (self.current_shard + 1) % len(self.files) self.current_position = self.process_rank * self.T self.tokens = _load_data_shard(self.files[self.current_shard]) def next_batch(self): batch_size = self.T * self.num_processes buf = self.tokens[self.current_position:self.current_position+self.T+1] buf = torch.tensor(buf.astype(np.int32), dtype=torch.long) x = buf[:-1] # inputs y = buf[1:] # targets # advance current position and load next shard if necessary self.current_position += batch_size if self.current_position + batch_size >= len(self.tokens): self.advance() return x.cuda(), y.cuda() # ----------------------------------------------------------------------------- # int main @dataclass class Hyperparameters: # data hyperparams input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on # optimization hyperparams batch_size : int = 8 # batch size, in sequences, across all devices sequence_length : int = 64*1024 # sequence length, in tokens num_iterations : int = 1530 # number of iterations to run warmup_iters : int = 0 cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule weight_decay : float = 0 # evaluation and logging hyperparams val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end args = Hyperparameters() # set up DDP (distributed data parallel). torchrun sets this env variable assert torch.cuda.is_available() dist.init_process_group(backend='nccl') ddp_rank = int(os.environ['RANK']) ddp_local_rank = int(os.environ['LOCAL_RANK']) ddp_world_size = int(os.environ['WORLD_SIZE']) device = f'cuda:{ddp_local_rank}' torch.cuda.set_device(device) print(f"using device: {device}") master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc. # begin logging logfile = None if master_process: run_id = str(uuid.uuid4()) logdir = 'logs/%s/' % run_id os.makedirs(logdir, exist_ok=True) logfile = 'logs/%s.txt' % run_id # create the log file with open(logfile, "w") as f: # begin the log by printing this file (the Python code) f.write(code) f.write('='*100 + '\n') def print0(s, logonly=False): if master_process: with open(logfile, "a") as f: if not logonly: print(s) f.write(s+'\n') # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # convenience variables T = args.sequence_length # calculate the number of steps to take in the val loop. assert args.val_tokens % (T * ddp_world_size) == 0 val_steps = args.val_tokens // (T * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files") print0('='*100, logonly=True) x, y = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) # note that this learning rate is neither sensitive nor tuned optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.time() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.time() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the attention blocksize for the current step, in chunks of 64. By @fernbear.bsky.social attn_blocksize = torch.tensor(64*((step/args.num_iterations * (1792 - 64) + 64)//64), dtype=torch.int, device='cuda') # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.time() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, attn_blocksize=attn_blocksize) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.time() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.time() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.time() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps+1): ctx = model.no_sync() if i < train_accumulation_steps else contextlib.nullcontext() with ctx: # there's no need to sync gradients every accumulation step # forward pass loss = model(x, y, attn_blocksize=attn_blocksize) # advance the dataset for the next batch x, y = train_loader.next_batch() # backward pass loss.backward() train_loss = loss.detach() for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) optimizer3.param_groups[0]['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. #dist.all_reduce(train_loss, op=dist.ReduceOp.AVG) # all-reducing the training loss would be more correct in terms of logging, but slower approx_time = training_time_ms + 1000 * (time.time() - t0) print0(f"step:{step+1}/{args.num_iterations} train_loss:{train_loss.item():.4f} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Thu Dec 5 00:57:42 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 39C P0 76W / 700W | 3MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 31C P0 115W / 700W | 115MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 31C P0 118W / 700W | 529MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 38C P0 102W / 700W | 22MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 39C P0 123W / 700W | 35MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 110W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 39C P0 128W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 119W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1100000000 across 11 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1530 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1530 train_loss:10.8258 train_time:31730ms step_avg:nanms step:2/1530 train_loss:10.0700 train_time:31841ms step_avg:nanms step:3/1530 train_loss:8.3816 train_time:32001ms step_avg:nanms step:4/1530 train_loss:7.5482 train_time:32162ms step_avg:nanms step:5/1530 train_loss:7.4648 train_time:32323ms step_avg:nanms step:6/1530 train_loss:6.9682 train_time:32484ms step_avg:nanms step:7/1530 train_loss:7.2112 train_time:32645ms step_avg:nanms step:8/1530 train_loss:6.7424 train_time:32806ms step_avg:nanms step:9/1530 train_loss:6.6153 train_time:32967ms step_avg:nanms step:10/1530 train_loss:6.5404 train_time:33128ms step_avg:nanms step:11/1530 train_loss:6.4753 train_time:114ms step_avg:nanms step:12/1530 train_loss:6.3450 train_time:275ms step_avg:nanms step:13/1530 train_loss:6.2061 train_time:437ms step_avg:145.50ms step:14/1530 train_loss:6.1681 train_time:596ms step_avg:149.09ms step:15/1530 train_loss:6.1387 train_time:757ms step_avg:151.35ms step:16/1530 train_loss:6.1526 train_time:916ms step_avg:152.73ms step:17/1530 train_loss:6.1882 train_time:1078ms step_avg:153.94ms step:18/1530 train_loss:5.9534 train_time:1238ms step_avg:154.72ms step:19/1530 train_loss:5.9608 train_time:1397ms step_avg:155.23ms step:20/1530 train_loss:5.6929 train_time:1558ms step_avg:155.81ms step:21/1530 train_loss:5.9611 train_time:1718ms step_avg:156.16ms step:22/1530 train_loss:6.1674 train_time:1878ms step_avg:156.53ms step:23/1530 train_loss:5.8411 train_time:2039ms step_avg:156.82ms step:24/1530 train_loss:6.0184 train_time:2198ms step_avg:157.02ms step:25/1530 train_loss:5.6681 train_time:2358ms step_avg:157.22ms step:26/1530 train_loss:5.5914 train_time:2518ms step_avg:157.37ms step:27/1530 train_loss:5.7461 train_time:2678ms step_avg:157.55ms step:28/1530 train_loss:5.4035 train_time:2839ms step_avg:157.70ms step:29/1530 train_loss:5.6537 train_time:2998ms step_avg:157.81ms step:30/1530 train_loss:5.4548 train_time:3159ms step_avg:157.97ms step:31/1530 train_loss:5.4230 train_time:3320ms step_avg:158.07ms step:32/1530 train_loss:5.2838 train_time:3479ms step_avg:158.16ms step:33/1530 train_loss:5.5861 train_time:3639ms step_avg:158.23ms step:34/1530 train_loss:5.4880 train_time:3799ms step_avg:158.30ms step:35/1530 train_loss:5.5814 train_time:3960ms step_avg:158.38ms step:36/1530 train_loss:5.5462 train_time:4119ms step_avg:158.42ms step:37/1530 train_loss:5.4549 train_time:4279ms step_avg:158.49ms step:38/1530 train_loss:5.3058 train_time:4440ms step_avg:158.58ms step:39/1530 train_loss:5.3092 train_time:4600ms step_avg:158.61ms step:40/1530 train_loss:5.2446 train_time:4760ms step_avg:158.67ms step:41/1530 train_loss:5.2151 train_time:4920ms step_avg:158.72ms step:42/1530 train_loss:5.1580 train_time:5080ms step_avg:158.74ms step:43/1530 train_loss:5.2549 train_time:5240ms step_avg:158.77ms step:44/1530 train_loss:5.2297 train_time:5400ms step_avg:158.81ms step:45/1530 train_loss:5.3787 train_time:5560ms step_avg:158.86ms step:46/1530 train_loss:5.1802 train_time:5720ms step_avg:158.89ms step:47/1530 train_loss:5.0725 train_time:5880ms step_avg:158.91ms step:48/1530 train_loss:5.2113 train_time:6040ms step_avg:158.95ms step:49/1530 train_loss:5.1258 train_time:6200ms step_avg:158.97ms step:50/1530 train_loss:5.2324 train_time:6360ms step_avg:159.00ms step:51/1530 train_loss:5.1244 train_time:6520ms step_avg:159.03ms step:52/1530 train_loss:5.0167 train_time:6680ms step_avg:159.04ms step:53/1530 train_loss:5.1628 train_time:6840ms step_avg:159.07ms step:54/1530 train_loss:5.0155 train_time:6998ms step_avg:159.05ms step:55/1530 train_loss:5.4061 train_time:7160ms step_avg:159.12ms step:56/1530 train_loss:5.0274 train_time:7320ms step_avg:159.12ms step:57/1530 train_loss:4.8790 train_time:7480ms step_avg:159.14ms step:58/1530 train_loss:5.0283 train_time:7640ms step_avg:159.16ms step:59/1530 train_loss:5.0223 train_time:7800ms step_avg:159.19ms step:60/1530 train_loss:5.1331 train_time:7960ms step_avg:159.20ms step:61/1530 train_loss:4.8495 train_time:8119ms step_avg:159.20ms step:62/1530 train_loss:4.9731 train_time:8279ms step_avg:159.21ms step:63/1530 train_loss:4.9743 train_time:8440ms step_avg:159.24ms step:64/1530 train_loss:4.8622 train_time:8599ms step_avg:159.24ms step:65/1530 train_loss:4.7961 train_time:8759ms step_avg:159.26ms step:66/1530 train_loss:4.9459 train_time:8919ms step_avg:159.27ms step:67/1530 train_loss:4.8071 train_time:9079ms step_avg:159.29ms step:68/1530 train_loss:5.0739 train_time:9240ms step_avg:159.31ms step:69/1530 train_loss:4.7344 train_time:9400ms step_avg:159.32ms step:70/1530 train_loss:4.8536 train_time:9560ms step_avg:159.33ms step:71/1530 train_loss:4.9832 train_time:9719ms step_avg:159.33ms step:72/1530 train_loss:4.8978 train_time:9879ms step_avg:159.34ms step:73/1530 train_loss:4.7931 train_time:10040ms step_avg:159.36ms step:74/1530 train_loss:4.9390 train_time:10199ms step_avg:159.36ms step:75/1530 train_loss:4.8778 train_time:10360ms step_avg:159.38ms step:76/1530 train_loss:4.8058 train_time:10520ms step_avg:159.40ms step:77/1530 train_loss:4.9294 train_time:10681ms step_avg:159.42ms step:78/1530 train_loss:5.1149 train_time:10840ms step_avg:159.42ms step:79/1530 train_loss:4.8412 train_time:11000ms step_avg:159.42ms step:80/1530 train_loss:4.8828 train_time:11160ms step_avg:159.43ms step:81/1530 train_loss:4.6585 train_time:11320ms step_avg:159.44ms step:82/1530 train_loss:4.8320 train_time:11480ms step_avg:159.44ms step:83/1530 train_loss:4.7885 train_time:11640ms step_avg:159.45ms step:84/1530 train_loss:4.7637 train_time:11800ms step_avg:159.46ms step:85/1530 train_loss:4.6134 train_time:11960ms step_avg:159.47ms step:86/1530 train_loss:4.8353 train_time:12121ms step_avg:159.48ms step:87/1530 train_loss:4.7611 train_time:12281ms step_avg:159.49ms step:88/1530 train_loss:4.7605 train_time:12441ms step_avg:159.50ms step:89/1530 train_loss:4.7113 train_time:12601ms step_avg:159.51ms step:90/1530 train_loss:4.6613 train_time:12762ms step_avg:159.52ms step:91/1530 train_loss:4.6663 train_time:12921ms step_avg:159.52ms step:92/1530 train_loss:4.8300 train_time:13080ms step_avg:159.51ms step:93/1530 train_loss:4.6291 train_time:13242ms step_avg:159.54ms step:94/1530 train_loss:4.6521 train_time:13402ms step_avg:159.55ms step:95/1530 train_loss:4.7284 train_time:13563ms step_avg:159.56ms step:96/1530 train_loss:4.6127 train_time:13723ms step_avg:159.56ms step:97/1530 train_loss:4.6689 train_time:13882ms step_avg:159.56ms step:98/1530 train_loss:4.5989 train_time:14043ms step_avg:159.58ms step:99/1530 train_loss:4.6936 train_time:14203ms step_avg:159.58ms step:100/1530 train_loss:4.7051 train_time:14366ms step_avg:159.63ms step:101/1530 train_loss:4.5569 train_time:14527ms step_avg:159.64ms step:102/1530 train_loss:4.7301 train_time:14688ms step_avg:159.65ms step:103/1530 train_loss:4.6299 train_time:14849ms step_avg:159.66ms step:104/1530 train_loss:4.5531 train_time:15009ms step_avg:159.68ms step:105/1530 train_loss:4.5737 train_time:15171ms step_avg:159.70ms step:106/1530 train_loss:4.6828 train_time:15332ms step_avg:159.71ms step:107/1530 train_loss:4.5273 train_time:15493ms step_avg:159.72ms step:108/1530 train_loss:4.3788 train_time:15654ms step_avg:159.73ms step:109/1530 train_loss:4.4985 train_time:15813ms step_avg:159.73ms step:110/1530 train_loss:4.5086 train_time:15974ms step_avg:159.74ms step:111/1530 train_loss:4.4504 train_time:16135ms step_avg:159.75ms step:112/1530 train_loss:4.6072 train_time:16295ms step_avg:159.76ms step:113/1530 train_loss:4.5099 train_time:16455ms step_avg:159.76ms step:114/1530 train_loss:4.3931 train_time:16614ms step_avg:159.75ms step:115/1530 train_loss:4.5338 train_time:16777ms step_avg:159.78ms step:116/1530 train_loss:4.4884 train_time:16940ms step_avg:159.82ms step:117/1530 train_loss:4.4055 train_time:17104ms step_avg:159.85ms step:118/1530 train_loss:4.5991 train_time:17268ms step_avg:159.89ms step:119/1530 train_loss:4.4716 train_time:17433ms step_avg:159.93ms step:120/1530 train_loss:4.3611 train_time:17597ms step_avg:159.97ms step:121/1530 train_loss:4.3124 train_time:17760ms step_avg:160.00ms step:122/1530 train_loss:4.4697 train_time:17924ms step_avg:160.03ms step:123/1530 train_loss:4.3007 train_time:18087ms step_avg:160.06ms step:124/1530 train_loss:4.6134 train_time:18252ms step_avg:160.11ms step:125/1530 train_loss:4.4930 train_time:18416ms step_avg:160.14ms step:125/1530 val_loss:4.4321 train_time:18463ms step_avg:160.55ms step:126/1530 train_loss:4.4369 train_time:18583ms step_avg:160.20ms step:127/1530 train_loss:4.4628 train_time:18750ms step_avg:160.26ms step:128/1530 train_loss:4.3902 train_time:18914ms step_avg:160.29ms step:129/1530 train_loss:4.6995 train_time:19078ms step_avg:160.32ms step:130/1530 train_loss:4.3831 train_time:19242ms step_avg:160.35ms step:131/1530 train_loss:4.4073 train_time:19405ms step_avg:160.37ms step:132/1530 train_loss:4.3594 train_time:19569ms step_avg:160.40ms step:133/1530 train_loss:4.4638 train_time:19735ms step_avg:160.45ms step:134/1530 train_loss:4.2853 train_time:19899ms step_avg:160.47ms step:135/1530 train_loss:4.4650 train_time:20062ms step_avg:160.50ms step:136/1530 train_loss:4.2231 train_time:20226ms step_avg:160.53ms step:137/1530 train_loss:4.3929 train_time:20391ms step_avg:160.56ms step:138/1530 train_loss:4.3002 train_time:20555ms step_avg:160.59ms step:139/1530 train_loss:4.3870 train_time:20719ms step_avg:160.61ms step:140/1530 train_loss:4.4755 train_time:20882ms step_avg:160.63ms step:141/1530 train_loss:4.3292 train_time:21047ms step_avg:160.66ms step:142/1530 train_loss:4.3161 train_time:21212ms step_avg:160.69ms step:143/1530 train_loss:4.2604 train_time:21375ms step_avg:160.72ms step:144/1530 train_loss:4.3627 train_time:21540ms step_avg:160.74ms step:145/1530 train_loss:4.3238 train_time:21703ms step_avg:160.76ms step:146/1530 train_loss:4.1847 train_time:21866ms step_avg:160.78ms step:147/1530 train_loss:4.3426 train_time:22030ms step_avg:160.80ms step:148/1530 train_loss:4.3697 train_time:22194ms step_avg:160.83ms step:149/1530 train_loss:4.3072 train_time:22359ms step_avg:160.85ms step:150/1530 train_loss:4.4459 train_time:22522ms step_avg:160.87ms step:151/1530 train_loss:4.2720 train_time:22685ms step_avg:160.88ms step:152/1530 train_loss:4.2869 train_time:22848ms step_avg:160.90ms step:153/1530 train_loss:4.3869 train_time:23013ms step_avg:160.93ms step:154/1530 train_loss:4.3993 train_time:23176ms step_avg:160.95ms step:155/1530 train_loss:4.2833 train_time:23340ms step_avg:160.96ms step:156/1530 train_loss:4.3611 train_time:23504ms step_avg:160.99ms step:157/1530 train_loss:4.4111 train_time:23668ms step_avg:161.01ms step:158/1530 train_loss:4.2629 train_time:23834ms step_avg:161.04ms step:159/1530 train_loss:4.3313 train_time:23997ms step_avg:161.06ms step:160/1530 train_loss:4.1632 train_time:24161ms step_avg:161.08ms step:161/1530 train_loss:4.3662 train_time:24323ms step_avg:161.08ms step:162/1530 train_loss:4.3708 train_time:24488ms step_avg:161.10ms step:163/1530 train_loss:4.3496 train_time:24653ms step_avg:161.13ms step:164/1530 train_loss:4.1979 train_time:24816ms step_avg:161.15ms step:165/1530 train_loss:4.2878 train_time:24980ms step_avg:161.16ms step:166/1530 train_loss:4.3572 train_time:25144ms step_avg:161.18ms step:167/1530 train_loss:4.2141 train_time:25308ms step_avg:161.20ms step:168/1530 train_loss:4.2950 train_time:25473ms step_avg:161.22ms step:169/1530 train_loss:4.1636 train_time:25637ms step_avg:161.24ms step:170/1530 train_loss:4.0322 train_time:25800ms step_avg:161.25ms step:171/1530 train_loss:4.2141 train_time:25962ms step_avg:161.26ms step:172/1530 train_loss:4.2256 train_time:26125ms step_avg:161.27ms step:173/1530 train_loss:4.2681 train_time:26287ms step_avg:161.27ms step:174/1530 train_loss:4.4253 train_time:26451ms step_avg:161.29ms step:175/1530 train_loss:4.2580 train_time:26615ms step_avg:161.30ms step:176/1530 train_loss:4.1022 train_time:26778ms step_avg:161.31ms step:177/1530 train_loss:4.0713 train_time:26941ms step_avg:161.32ms step:178/1530 train_loss:4.1887 train_time:27103ms step_avg:161.33ms step:179/1530 train_loss:4.1348 train_time:27265ms step_avg:161.33ms step:180/1530 train_loss:4.1262 train_time:27429ms step_avg:161.35ms step:181/1530 train_loss:4.2984 train_time:27593ms step_avg:161.36ms step:182/1530 train_loss:4.1532 train_time:27755ms step_avg:161.37ms step:183/1530 train_loss:4.1311 train_time:27919ms step_avg:161.38ms step:184/1530 train_loss:4.1186 train_time:28081ms step_avg:161.39ms step:185/1530 train_loss:4.2106 train_time:28243ms step_avg:161.39ms step:186/1530 train_loss:4.1683 train_time:28406ms step_avg:161.40ms step:187/1530 train_loss:4.2451 train_time:28569ms step_avg:161.41ms step:188/1530 train_loss:4.1712 train_time:28877ms step_avg:162.23ms step:189/1530 train_loss:4.1106 train_time:29208ms step_avg:163.17ms step:190/1530 train_loss:4.2108 train_time:29369ms step_avg:163.16ms step:191/1530 train_loss:4.0773 train_time:29533ms step_avg:163.16ms step:192/1530 train_loss:4.0287 train_time:29695ms step_avg:163.16ms step:193/1530 train_loss:4.2558 train_time:29858ms step_avg:163.16ms step:194/1530 train_loss:4.1775 train_time:30022ms step_avg:163.16ms step:195/1530 train_loss:4.3493 train_time:30184ms step_avg:163.16ms step:196/1530 train_loss:4.1737 train_time:30345ms step_avg:163.15ms step:197/1530 train_loss:4.0472 train_time:30509ms step_avg:163.15ms step:198/1530 train_loss:4.1797 train_time:30673ms step_avg:163.15ms step:199/1530 train_loss:4.0323 train_time:30835ms step_avg:163.15ms step:200/1530 train_loss:4.1136 train_time:30997ms step_avg:163.14ms step:201/1530 train_loss:4.0197 train_time:31160ms step_avg:163.14ms step:202/1530 train_loss:4.2638 train_time:31322ms step_avg:163.14ms step:203/1530 train_loss:4.0654 train_time:31484ms step_avg:163.13ms step:204/1530 train_loss:4.1826 train_time:31647ms step_avg:163.13ms step:205/1530 train_loss:4.2478 train_time:31810ms step_avg:163.13ms step:206/1530 train_loss:3.9431 train_time:31973ms step_avg:163.13ms step:207/1530 train_loss:4.0812 train_time:32136ms step_avg:163.13ms step:208/1530 train_loss:4.1052 train_time:32298ms step_avg:163.12ms step:209/1530 train_loss:4.2452 train_time:32460ms step_avg:163.12ms step:210/1530 train_loss:4.1880 train_time:32623ms step_avg:163.11ms step:211/1530 train_loss:4.0680 train_time:32785ms step_avg:163.11ms step:212/1530 train_loss:4.1336 train_time:32947ms step_avg:163.10ms step:213/1530 train_loss:4.0482 train_time:33112ms step_avg:163.12ms step:214/1530 train_loss:4.1181 train_time:33275ms step_avg:163.11ms step:215/1530 train_loss:3.9503 train_time:33438ms step_avg:163.11ms step:216/1530 train_loss:4.0033 train_time:33601ms step_avg:163.11ms step:217/1530 train_loss:4.0046 train_time:33762ms step_avg:163.10ms step:218/1530 train_loss:4.0870 train_time:33926ms step_avg:163.11ms step:219/1530 train_loss:4.0718 train_time:34089ms step_avg:163.10ms step:220/1530 train_loss:4.0810 train_time:34253ms step_avg:163.11ms step:221/1530 train_loss:4.0915 train_time:34416ms step_avg:163.11ms step:222/1530 train_loss:3.9939 train_time:34579ms step_avg:163.11ms step:223/1530 train_loss:3.9854 train_time:34741ms step_avg:163.11ms step:224/1530 train_loss:4.2927 train_time:34904ms step_avg:163.10ms step:225/1530 train_loss:3.9299 train_time:35066ms step_avg:163.10ms step:226/1530 train_loss:3.9906 train_time:35230ms step_avg:163.10ms step:227/1530 train_loss:3.9810 train_time:35392ms step_avg:163.10ms step:228/1530 train_loss:4.1474 train_time:35558ms step_avg:163.11ms step:229/1530 train_loss:3.9218 train_time:35723ms step_avg:163.12ms step:230/1530 train_loss:4.0431 train_time:35889ms step_avg:163.13ms step:231/1530 train_loss:3.9053 train_time:36055ms step_avg:163.15ms step:232/1530 train_loss:3.9693 train_time:36220ms step_avg:163.15ms step:233/1530 train_loss:4.0864 train_time:36386ms step_avg:163.17ms step:234/1530 train_loss:4.0276 train_time:36553ms step_avg:163.18ms step:235/1530 train_loss:3.9094 train_time:36720ms step_avg:163.20ms step:236/1530 train_loss:4.0751 train_time:36885ms step_avg:163.21ms step:237/1530 train_loss:4.0679 train_time:37053ms step_avg:163.23ms step:238/1530 train_loss:3.9353 train_time:37218ms step_avg:163.24ms step:239/1530 train_loss:4.0823 train_time:37384ms step_avg:163.25ms step:240/1530 train_loss:4.1184 train_time:37550ms step_avg:163.26ms step:241/1530 train_loss:3.9697 train_time:37717ms step_avg:163.28ms step:242/1530 train_loss:4.1393 train_time:37883ms step_avg:163.29ms step:243/1530 train_loss:4.0132 train_time:38048ms step_avg:163.30ms step:244/1530 train_loss:4.0817 train_time:38216ms step_avg:163.32ms step:245/1530 train_loss:4.1432 train_time:38382ms step_avg:163.33ms step:246/1530 train_loss:4.0569 train_time:38550ms step_avg:163.35ms step:247/1530 train_loss:3.9995 train_time:38718ms step_avg:163.37ms step:248/1530 train_loss:4.1021 train_time:38883ms step_avg:163.37ms step:249/1530 train_loss:3.9172 train_time:39050ms step_avg:163.39ms step:250/1530 train_loss:3.9703 train_time:39216ms step_avg:163.40ms step:250/1530 val_loss:3.9967 train_time:39263ms step_avg:163.60ms step:251/1530 train_loss:4.0684 train_time:39385ms step_avg:163.42ms step:252/1530 train_loss:4.1607 train_time:39551ms step_avg:163.43ms step:253/1530 train_loss:3.9275 train_time:39717ms step_avg:163.44ms step:254/1530 train_loss:3.8815 train_time:39883ms step_avg:163.45ms step:255/1530 train_loss:4.0739 train_time:40048ms step_avg:163.46ms step:256/1530 train_loss:3.9797 train_time:40214ms step_avg:163.47ms step:257/1530 train_loss:3.9865 train_time:40380ms step_avg:163.48ms step:258/1530 train_loss:3.9834 train_time:40546ms step_avg:163.49ms step:259/1530 train_loss:4.0277 train_time:40712ms step_avg:163.50ms step:260/1530 train_loss:4.0557 train_time:40879ms step_avg:163.52ms step:261/1530 train_loss:4.0266 train_time:41046ms step_avg:163.53ms step:262/1530 train_loss:3.9926 train_time:41212ms step_avg:163.54ms step:263/1530 train_loss:3.8863 train_time:41378ms step_avg:163.55ms step:264/1530 train_loss:3.9803 train_time:41545ms step_avg:163.56ms step:265/1530 train_loss:3.8623 train_time:41711ms step_avg:163.57ms step:266/1530 train_loss:3.9188 train_time:41876ms step_avg:163.58ms step:267/1530 train_loss:3.9284 train_time:42043ms step_avg:163.59ms step:268/1530 train_loss:3.9523 train_time:42209ms step_avg:163.60ms step:269/1530 train_loss:3.8465 train_time:42375ms step_avg:163.61ms step:270/1530 train_loss:4.0967 train_time:42541ms step_avg:163.62ms step:271/1530 train_loss:3.9769 train_time:42708ms step_avg:163.63ms step:272/1530 train_loss:3.9316 train_time:42873ms step_avg:163.64ms step:273/1530 train_loss:3.9433 train_time:43039ms step_avg:163.64ms step:274/1530 train_loss:4.0408 train_time:43206ms step_avg:163.66ms step:275/1530 train_loss:4.0581 train_time:43371ms step_avg:163.66ms step:276/1530 train_loss:4.2256 train_time:43536ms step_avg:163.67ms step:277/1530 train_loss:4.0340 train_time:43704ms step_avg:163.69ms step:278/1530 train_loss:4.0982 train_time:43870ms step_avg:163.69ms step:279/1530 train_loss:3.9995 train_time:44035ms step_avg:163.70ms step:280/1530 train_loss:4.2053 train_time:44204ms step_avg:163.72ms step:281/1530 train_loss:3.9681 train_time:44369ms step_avg:163.72ms step:282/1530 train_loss:3.9385 train_time:44537ms step_avg:163.74ms step:283/1530 train_loss:3.9119 train_time:44703ms step_avg:163.75ms step:284/1530 train_loss:4.0432 train_time:44868ms step_avg:163.75ms step:285/1530 train_loss:4.0550 train_time:45033ms step_avg:163.76ms step:286/1530 train_loss:4.0881 train_time:45199ms step_avg:163.77ms step:287/1530 train_loss:3.8968 train_time:45364ms step_avg:163.77ms step:288/1530 train_loss:4.0059 train_time:45529ms step_avg:163.77ms step:289/1530 train_loss:3.8795 train_time:45695ms step_avg:163.78ms step:290/1530 train_loss:3.8523 train_time:45860ms step_avg:163.79ms step:291/1530 train_loss:3.9060 train_time:46025ms step_avg:163.79ms step:292/1530 train_loss:3.8615 train_time:46189ms step_avg:163.79ms step:293/1530 train_loss:3.9012 train_time:46353ms step_avg:163.79ms step:294/1530 train_loss:3.9328 train_time:46518ms step_avg:163.80ms step:295/1530 train_loss:3.8442 train_time:46683ms step_avg:163.80ms step:296/1530 train_loss:3.8645 train_time:46849ms step_avg:163.81ms step:297/1530 train_loss:3.8682 train_time:47014ms step_avg:163.81ms step:298/1530 train_loss:3.9741 train_time:47179ms step_avg:163.81ms step:299/1530 train_loss:3.8278 train_time:47345ms step_avg:163.82ms step:300/1530 train_loss:3.9637 train_time:47510ms step_avg:163.83ms step:301/1530 train_loss:3.9599 train_time:47675ms step_avg:163.83ms step:302/1530 train_loss:3.9331 train_time:47841ms step_avg:163.84ms step:303/1530 train_loss:3.9777 train_time:48006ms step_avg:163.84ms step:304/1530 train_loss:3.9621 train_time:48170ms step_avg:163.84ms step:305/1530 train_loss:4.4529 train_time:48335ms step_avg:163.85ms step:306/1530 train_loss:3.9416 train_time:48501ms step_avg:163.85ms step:307/1530 train_loss:3.8331 train_time:48665ms step_avg:163.86ms step:308/1530 train_loss:3.9751 train_time:48830ms step_avg:163.86ms step:309/1530 train_loss:3.8692 train_time:48995ms step_avg:163.86ms step:310/1530 train_loss:4.0798 train_time:49162ms step_avg:163.87ms step:311/1530 train_loss:3.9307 train_time:49328ms step_avg:163.88ms step:312/1530 train_loss:3.8613 train_time:49493ms step_avg:163.88ms step:313/1530 train_loss:3.9266 train_time:49658ms step_avg:163.89ms step:314/1530 train_loss:4.0555 train_time:49824ms step_avg:163.90ms step:315/1530 train_loss:3.9298 train_time:49989ms step_avg:163.90ms step:316/1530 train_loss:3.7920 train_time:50154ms step_avg:163.90ms step:317/1530 train_loss:3.8751 train_time:50321ms step_avg:163.91ms step:318/1530 train_loss:3.9192 train_time:50486ms step_avg:163.92ms step:319/1530 train_loss:3.8922 train_time:50651ms step_avg:163.92ms step:320/1530 train_loss:4.0150 train_time:50815ms step_avg:163.92ms step:321/1530 train_loss:3.9563 train_time:50982ms step_avg:163.93ms step:322/1530 train_loss:3.9305 train_time:51148ms step_avg:163.94ms step:323/1530 train_loss:4.0063 train_time:51314ms step_avg:163.94ms step:324/1530 train_loss:3.9472 train_time:51479ms step_avg:163.95ms step:325/1530 train_loss:4.0088 train_time:51644ms step_avg:163.95ms step:326/1530 train_loss:3.8956 train_time:51810ms step_avg:163.95ms step:327/1530 train_loss:4.3917 train_time:51974ms step_avg:163.96ms step:328/1530 train_loss:4.0673 train_time:52141ms step_avg:163.96ms step:329/1530 train_loss:3.7927 train_time:52306ms step_avg:163.97ms step:330/1530 train_loss:3.7547 train_time:52471ms step_avg:163.97ms step:331/1530 train_loss:3.9755 train_time:52635ms step_avg:163.97ms step:332/1530 train_loss:3.8993 train_time:52803ms step_avg:163.98ms step:333/1530 train_loss:3.8783 train_time:52968ms step_avg:163.99ms step:334/1530 train_loss:3.8399 train_time:53132ms step_avg:163.99ms step:335/1530 train_loss:4.0135 train_time:53297ms step_avg:163.99ms step:336/1530 train_loss:3.9591 train_time:53462ms step_avg:164.00ms step:337/1530 train_loss:4.4192 train_time:53629ms step_avg:164.00ms step:338/1530 train_loss:3.9326 train_time:53793ms step_avg:164.00ms step:339/1530 train_loss:3.8620 train_time:53959ms step_avg:164.01ms step:340/1530 train_loss:3.9316 train_time:54125ms step_avg:164.01ms step:341/1530 train_loss:3.8549 train_time:54290ms step_avg:164.02ms step:342/1530 train_loss:3.8102 train_time:54456ms step_avg:164.02ms step:343/1530 train_loss:3.8276 train_time:54627ms step_avg:164.05ms step:344/1530 train_loss:3.9927 train_time:54794ms step_avg:164.05ms step:345/1530 train_loss:3.8199 train_time:54963ms step_avg:164.07ms step:346/1530 train_loss:3.7604 train_time:55131ms step_avg:164.08ms step:347/1530 train_loss:3.7805 train_time:55300ms step_avg:164.09ms step:348/1530 train_loss:3.8565 train_time:55467ms step_avg:164.10ms step:349/1530 train_loss:3.8239 train_time:55635ms step_avg:164.11ms step:350/1530 train_loss:3.5632 train_time:55804ms step_avg:164.13ms step:351/1530 train_loss:3.8188 train_time:55971ms step_avg:164.14ms step:352/1530 train_loss:4.1934 train_time:56138ms step_avg:164.15ms step:353/1530 train_loss:3.6710 train_time:56308ms step_avg:164.16ms step:354/1530 train_loss:3.9275 train_time:56474ms step_avg:164.17ms step:355/1530 train_loss:3.7879 train_time:56643ms step_avg:164.18ms step:356/1530 train_loss:3.8834 train_time:56812ms step_avg:164.20ms step:357/1530 train_loss:3.7608 train_time:56981ms step_avg:164.21ms step:358/1530 train_loss:3.8617 train_time:57149ms step_avg:164.22ms step:359/1530 train_loss:3.7860 train_time:57321ms step_avg:164.24ms step:360/1530 train_loss:3.4253 train_time:57490ms step_avg:164.26ms step:361/1530 train_loss:4.0210 train_time:57659ms step_avg:164.27ms step:362/1530 train_loss:3.9107 train_time:57827ms step_avg:164.28ms step:363/1530 train_loss:3.8313 train_time:57994ms step_avg:164.29ms step:364/1530 train_loss:3.7455 train_time:58162ms step_avg:164.30ms step:365/1530 train_loss:3.9097 train_time:58329ms step_avg:164.31ms step:366/1530 train_loss:3.8605 train_time:58497ms step_avg:164.32ms step:367/1530 train_loss:3.8544 train_time:58666ms step_avg:164.33ms step:368/1530 train_loss:3.8423 train_time:58833ms step_avg:164.34ms step:369/1530 train_loss:3.7435 train_time:59001ms step_avg:164.35ms step:370/1530 train_loss:3.8742 train_time:59169ms step_avg:164.36ms step:371/1530 train_loss:3.7326 train_time:59336ms step_avg:164.37ms step:372/1530 train_loss:3.6926 train_time:59507ms step_avg:164.38ms step:373/1530 train_loss:3.9109 train_time:59673ms step_avg:164.39ms step:374/1530 train_loss:3.8290 train_time:59841ms step_avg:164.40ms step:375/1530 train_loss:3.7995 train_time:60010ms step_avg:164.41ms step:375/1530 val_loss:3.8224 train_time:60058ms step_avg:164.54ms step:376/1530 train_loss:3.8653 train_time:60179ms step_avg:164.42ms step:377/1530 train_loss:3.7933 train_time:60489ms step_avg:164.82ms step:378/1530 train_loss:3.8420 train_time:60667ms step_avg:164.86ms step:379/1530 train_loss:3.8603 train_time:60988ms step_avg:165.28ms step:380/1530 train_loss:3.9495 train_time:61154ms step_avg:165.28ms step:381/1530 train_loss:3.8418 train_time:61321ms step_avg:165.29ms step:382/1530 train_loss:3.8002 train_time:61491ms step_avg:165.30ms step:383/1530 train_loss:3.8000 train_time:61661ms step_avg:165.31ms step:384/1530 train_loss:3.8803 train_time:61828ms step_avg:165.32ms step:385/1530 train_loss:3.7877 train_time:61997ms step_avg:165.33ms step:386/1530 train_loss:3.8947 train_time:62165ms step_avg:165.33ms step:387/1530 train_loss:4.0547 train_time:62333ms step_avg:165.34ms step:388/1530 train_loss:3.7993 train_time:62500ms step_avg:165.34ms step:389/1530 train_loss:3.8070 train_time:62668ms step_avg:165.35ms step:390/1530 train_loss:3.8926 train_time:62838ms step_avg:165.36ms step:391/1530 train_loss:3.8133 train_time:63004ms step_avg:165.37ms step:392/1530 train_loss:3.9269 train_time:63171ms step_avg:165.37ms step:393/1530 train_loss:3.7663 train_time:63340ms step_avg:165.38ms step:394/1530 train_loss:3.8861 train_time:63507ms step_avg:165.38ms step:395/1530 train_loss:3.6316 train_time:63675ms step_avg:165.39ms step:396/1530 train_loss:3.8349 train_time:63844ms step_avg:165.40ms step:397/1530 train_loss:3.8546 train_time:64010ms step_avg:165.40ms step:398/1530 train_loss:3.8795 train_time:64179ms step_avg:165.41ms step:399/1530 train_loss:3.7638 train_time:64346ms step_avg:165.41ms step:400/1530 train_loss:3.8311 train_time:64513ms step_avg:165.42ms step:401/1530 train_loss:3.9138 train_time:64681ms step_avg:165.42ms step:402/1530 train_loss:3.8441 train_time:64848ms step_avg:165.43ms step:403/1530 train_loss:3.9579 train_time:65014ms step_avg:165.43ms step:404/1530 train_loss:3.6765 train_time:65182ms step_avg:165.44ms step:405/1530 train_loss:3.7837 train_time:65349ms step_avg:165.44ms step:406/1530 train_loss:4.0868 train_time:65516ms step_avg:165.44ms step:407/1530 train_loss:3.7756 train_time:65684ms step_avg:165.45ms step:408/1530 train_loss:3.8226 train_time:65852ms step_avg:165.46ms step:409/1530 train_loss:3.8508 train_time:66019ms step_avg:165.46ms step:410/1530 train_loss:3.7559 train_time:66185ms step_avg:165.46ms step:411/1530 train_loss:3.7629 train_time:66352ms step_avg:165.47ms step:412/1530 train_loss:4.1796 train_time:66521ms step_avg:165.47ms step:413/1530 train_loss:3.6234 train_time:66688ms step_avg:165.48ms step:414/1530 train_loss:4.0126 train_time:66855ms step_avg:165.48ms step:415/1530 train_loss:3.7520 train_time:67022ms step_avg:165.49ms step:416/1530 train_loss:3.7608 train_time:67188ms step_avg:165.49ms step:417/1530 train_loss:3.9493 train_time:67356ms step_avg:165.49ms step:418/1530 train_loss:3.6858 train_time:67524ms step_avg:165.50ms step:419/1530 train_loss:3.7995 train_time:67690ms step_avg:165.50ms step:420/1530 train_loss:3.6957 train_time:67858ms step_avg:165.51ms step:421/1530 train_loss:3.6482 train_time:68024ms step_avg:165.51ms step:422/1530 train_loss:3.7766 train_time:68191ms step_avg:165.51ms step:423/1530 train_loss:3.8693 train_time:68359ms step_avg:165.52ms step:424/1530 train_loss:3.6068 train_time:68526ms step_avg:165.52ms step:425/1530 train_loss:3.7868 train_time:68693ms step_avg:165.52ms step:426/1530 train_loss:3.6452 train_time:68861ms step_avg:165.53ms step:427/1530 train_loss:3.8914 train_time:69027ms step_avg:165.53ms step:428/1530 train_loss:3.8058 train_time:69195ms step_avg:165.54ms step:429/1530 train_loss:3.7551 train_time:69363ms step_avg:165.54ms step:430/1530 train_loss:3.7028 train_time:69530ms step_avg:165.55ms step:431/1530 train_loss:3.6244 train_time:69698ms step_avg:165.55ms step:432/1530 train_loss:3.7661 train_time:69864ms step_avg:165.56ms step:433/1530 train_loss:3.8187 train_time:70031ms step_avg:165.56ms step:434/1530 train_loss:3.7728 train_time:70199ms step_avg:165.56ms step:435/1530 train_loss:3.8032 train_time:70365ms step_avg:165.56ms step:436/1530 train_loss:3.8317 train_time:70533ms step_avg:165.57ms step:437/1530 train_loss:3.7201 train_time:70699ms step_avg:165.57ms step:438/1530 train_loss:3.6999 train_time:70866ms step_avg:165.58ms step:439/1530 train_loss:3.7114 train_time:71034ms step_avg:165.58ms step:440/1530 train_loss:3.8862 train_time:71202ms step_avg:165.59ms step:441/1530 train_loss:3.7541 train_time:71369ms step_avg:165.59ms step:442/1530 train_loss:3.7362 train_time:71536ms step_avg:165.59ms step:443/1530 train_loss:3.6237 train_time:71703ms step_avg:165.60ms step:444/1530 train_loss:3.9195 train_time:71870ms step_avg:165.60ms step:445/1530 train_loss:3.8428 train_time:72037ms step_avg:165.60ms step:446/1530 train_loss:3.8309 train_time:72203ms step_avg:165.60ms step:447/1530 train_loss:3.7540 train_time:72370ms step_avg:165.61ms step:448/1530 train_loss:3.8522 train_time:72538ms step_avg:165.61ms step:449/1530 train_loss:3.6879 train_time:72705ms step_avg:165.61ms step:450/1530 train_loss:3.7127 train_time:72871ms step_avg:165.62ms step:451/1530 train_loss:3.5858 train_time:73040ms step_avg:165.62ms step:452/1530 train_loss:3.7062 train_time:73206ms step_avg:165.62ms step:453/1530 train_loss:3.6695 train_time:73373ms step_avg:165.63ms step:454/1530 train_loss:3.6358 train_time:73541ms step_avg:165.63ms step:455/1530 train_loss:3.8394 train_time:73709ms step_avg:165.64ms step:456/1530 train_loss:3.7251 train_time:73879ms step_avg:165.65ms step:457/1530 train_loss:3.7757 train_time:74049ms step_avg:165.66ms step:458/1530 train_loss:3.8212 train_time:74218ms step_avg:165.66ms step:459/1530 train_loss:3.6292 train_time:74387ms step_avg:165.67ms step:460/1530 train_loss:3.7902 train_time:74556ms step_avg:165.68ms step:461/1530 train_loss:3.6871 train_time:74728ms step_avg:165.69ms step:462/1530 train_loss:3.7282 train_time:74898ms step_avg:165.70ms step:463/1530 train_loss:3.7704 train_time:75068ms step_avg:165.71ms step:464/1530 train_loss:3.7148 train_time:75237ms step_avg:165.72ms step:465/1530 train_loss:3.7138 train_time:75405ms step_avg:165.73ms step:466/1530 train_loss:3.7974 train_time:75575ms step_avg:165.73ms step:467/1530 train_loss:3.8214 train_time:75747ms step_avg:165.75ms step:468/1530 train_loss:3.7857 train_time:75916ms step_avg:165.75ms step:469/1530 train_loss:3.6822 train_time:76085ms step_avg:165.76ms step:470/1530 train_loss:3.7560 train_time:76256ms step_avg:165.77ms step:471/1530 train_loss:3.8102 train_time:76427ms step_avg:165.79ms step:472/1530 train_loss:3.7753 train_time:76599ms step_avg:165.80ms step:473/1530 train_loss:3.7144 train_time:76768ms step_avg:165.81ms step:474/1530 train_loss:3.5844 train_time:76937ms step_avg:165.81ms step:475/1530 train_loss:4.0149 train_time:77106ms step_avg:165.82ms step:476/1530 train_loss:3.7515 train_time:77275ms step_avg:165.83ms step:477/1530 train_loss:3.5898 train_time:77446ms step_avg:165.84ms step:478/1530 train_loss:3.8177 train_time:77615ms step_avg:165.84ms step:479/1530 train_loss:3.7661 train_time:77785ms step_avg:165.85ms step:480/1530 train_loss:3.9147 train_time:77956ms step_avg:165.86ms step:481/1530 train_loss:3.7225 train_time:78125ms step_avg:165.87ms step:482/1530 train_loss:3.5238 train_time:78294ms step_avg:165.88ms step:483/1530 train_loss:3.8044 train_time:78463ms step_avg:165.88ms step:484/1530 train_loss:3.6573 train_time:78634ms step_avg:165.89ms step:485/1530 train_loss:3.6570 train_time:78803ms step_avg:165.90ms step:486/1530 train_loss:3.5639 train_time:78973ms step_avg:165.91ms step:487/1530 train_loss:3.6838 train_time:79144ms step_avg:165.92ms step:488/1530 train_loss:3.8753 train_time:79313ms step_avg:165.93ms step:489/1530 train_loss:3.7096 train_time:79483ms step_avg:165.94ms step:490/1530 train_loss:3.5931 train_time:79651ms step_avg:165.94ms step:491/1530 train_loss:3.6085 train_time:79821ms step_avg:165.95ms step:492/1530 train_loss:3.7302 train_time:79991ms step_avg:165.96ms step:493/1530 train_loss:3.5741 train_time:80162ms step_avg:165.97ms step:494/1530 train_loss:3.7000 train_time:80331ms step_avg:165.97ms step:495/1530 train_loss:3.6552 train_time:80502ms step_avg:165.98ms step:496/1530 train_loss:3.4999 train_time:80672ms step_avg:165.99ms step:497/1530 train_loss:3.7335 train_time:80840ms step_avg:166.00ms step:498/1530 train_loss:3.7831 train_time:81008ms step_avg:166.00ms step:499/1530 train_loss:3.8154 train_time:81179ms step_avg:166.01ms step:500/1530 train_loss:3.7274 train_time:81351ms step_avg:166.02ms step:500/1530 val_loss:3.7023 train_time:81399ms step_avg:166.12ms step:501/1530 train_loss:3.8019 train_time:81522ms step_avg:166.03ms step:502/1530 train_loss:3.7506 train_time:81693ms step_avg:166.04ms step:503/1530 train_loss:3.7726 train_time:81865ms step_avg:166.05ms step:504/1530 train_loss:3.7162 train_time:82032ms step_avg:166.06ms step:505/1530 train_loss:3.8033 train_time:82202ms step_avg:166.06ms step:506/1530 train_loss:3.6397 train_time:82371ms step_avg:166.07ms step:507/1530 train_loss:3.7635 train_time:82540ms step_avg:166.08ms step:508/1530 train_loss:3.8210 train_time:82711ms step_avg:166.09ms step:509/1530 train_loss:3.7758 train_time:82881ms step_avg:166.09ms step:510/1530 train_loss:3.5789 train_time:83050ms step_avg:166.10ms step:511/1530 train_loss:3.7720 train_time:83219ms step_avg:166.11ms step:512/1530 train_loss:3.7124 train_time:83390ms step_avg:166.11ms step:513/1530 train_loss:3.6665 train_time:83559ms step_avg:166.12ms step:514/1530 train_loss:3.8167 train_time:83727ms step_avg:166.13ms step:515/1530 train_loss:3.7319 train_time:83898ms step_avg:166.13ms step:516/1530 train_loss:4.0694 train_time:84067ms step_avg:166.14ms step:517/1530 train_loss:3.6952 train_time:84236ms step_avg:166.15ms step:518/1530 train_loss:3.7641 train_time:84404ms step_avg:166.15ms step:519/1530 train_loss:3.6493 train_time:84573ms step_avg:166.15ms step:520/1530 train_loss:3.6797 train_time:84741ms step_avg:166.16ms step:521/1530 train_loss:3.6642 train_time:84910ms step_avg:166.16ms step:522/1530 train_loss:3.6575 train_time:85080ms step_avg:166.17ms step:523/1530 train_loss:4.2946 train_time:85248ms step_avg:166.18ms step:524/1530 train_loss:3.7302 train_time:85417ms step_avg:166.18ms step:525/1530 train_loss:3.6771 train_time:85586ms step_avg:166.19ms step:526/1530 train_loss:3.6911 train_time:85755ms step_avg:166.19ms step:527/1530 train_loss:3.6538 train_time:85924ms step_avg:166.20ms step:528/1530 train_loss:3.6233 train_time:86093ms step_avg:166.20ms step:529/1530 train_loss:3.8498 train_time:86263ms step_avg:166.21ms step:530/1530 train_loss:3.6440 train_time:86432ms step_avg:166.21ms step:531/1530 train_loss:3.9183 train_time:86602ms step_avg:166.22ms step:532/1530 train_loss:3.7251 train_time:86770ms step_avg:166.23ms step:533/1530 train_loss:3.6550 train_time:86938ms step_avg:166.23ms step:534/1530 train_loss:3.6691 train_time:87107ms step_avg:166.23ms step:535/1530 train_loss:3.6015 train_time:87276ms step_avg:166.24ms step:536/1530 train_loss:3.7438 train_time:87446ms step_avg:166.25ms step:537/1530 train_loss:3.7179 train_time:87615ms step_avg:166.25ms step:538/1530 train_loss:3.6243 train_time:87785ms step_avg:166.26ms step:539/1530 train_loss:4.1086 train_time:87956ms step_avg:166.27ms step:540/1530 train_loss:3.6706 train_time:88124ms step_avg:166.27ms step:541/1530 train_loss:3.7816 train_time:88293ms step_avg:166.28ms step:542/1530 train_loss:3.5859 train_time:88462ms step_avg:166.28ms step:543/1530 train_loss:3.5819 train_time:88630ms step_avg:166.29ms step:544/1530 train_loss:3.6339 train_time:88800ms step_avg:166.29ms step:545/1530 train_loss:3.5867 train_time:88969ms step_avg:166.30ms step:546/1530 train_loss:3.6183 train_time:89139ms step_avg:166.30ms step:547/1530 train_loss:3.6357 train_time:89306ms step_avg:166.31ms step:548/1530 train_loss:3.6092 train_time:89475ms step_avg:166.31ms step:549/1530 train_loss:3.7210 train_time:89644ms step_avg:166.32ms step:550/1530 train_loss:3.6126 train_time:89813ms step_avg:166.32ms step:551/1530 train_loss:3.6302 train_time:89983ms step_avg:166.33ms step:552/1530 train_loss:3.9304 train_time:90150ms step_avg:166.33ms step:553/1530 train_loss:3.7560 train_time:90320ms step_avg:166.33ms step:554/1530 train_loss:3.7076 train_time:90488ms step_avg:166.34ms step:555/1530 train_loss:3.6222 train_time:90656ms step_avg:166.34ms step:556/1530 train_loss:3.6930 train_time:90824ms step_avg:166.34ms step:557/1530 train_loss:3.3039 train_time:90994ms step_avg:166.35ms step:558/1530 train_loss:3.6086 train_time:91162ms step_avg:166.35ms step:559/1530 train_loss:3.6414 train_time:91331ms step_avg:166.36ms step:560/1530 train_loss:3.6844 train_time:91501ms step_avg:166.36ms step:561/1530 train_loss:3.6099 train_time:91669ms step_avg:166.37ms step:562/1530 train_loss:3.5511 train_time:91838ms step_avg:166.37ms step:563/1530 train_loss:3.7497 train_time:92006ms step_avg:166.38ms step:564/1530 train_loss:3.5694 train_time:92179ms step_avg:166.39ms step:565/1530 train_loss:3.6785 train_time:92348ms step_avg:166.39ms step:566/1530 train_loss:3.6200 train_time:92652ms step_avg:166.64ms step:567/1530 train_loss:3.5928 train_time:92832ms step_avg:166.66ms step:568/1530 train_loss:3.6823 train_time:93002ms step_avg:166.67ms step:569/1530 train_loss:3.6421 train_time:93362ms step_avg:167.02ms step:570/1530 train_loss:3.6875 train_time:93530ms step_avg:167.02ms step:571/1530 train_loss:3.7596 train_time:93701ms step_avg:167.03ms step:572/1530 train_loss:3.7226 train_time:93872ms step_avg:167.03ms step:573/1530 train_loss:3.7368 train_time:94044ms step_avg:167.04ms step:574/1530 train_loss:3.7796 train_time:94217ms step_avg:167.05ms step:575/1530 train_loss:3.7246 train_time:94389ms step_avg:167.06ms step:576/1530 train_loss:3.7563 train_time:94560ms step_avg:167.07ms step:577/1530 train_loss:3.6680 train_time:94732ms step_avg:167.08ms step:578/1530 train_loss:3.6660 train_time:94904ms step_avg:167.09ms step:579/1530 train_loss:3.6648 train_time:95075ms step_avg:167.09ms step:580/1530 train_loss:3.5866 train_time:95246ms step_avg:167.10ms step:581/1530 train_loss:3.6314 train_time:95416ms step_avg:167.10ms step:582/1530 train_loss:3.8385 train_time:95587ms step_avg:167.11ms step:583/1530 train_loss:3.6286 train_time:95760ms step_avg:167.12ms step:584/1530 train_loss:3.5900 train_time:95932ms step_avg:167.13ms step:585/1530 train_loss:3.7883 train_time:96102ms step_avg:167.13ms step:586/1530 train_loss:3.5117 train_time:96273ms step_avg:167.14ms step:587/1530 train_loss:3.6622 train_time:96444ms step_avg:167.15ms step:588/1530 train_loss:3.6379 train_time:96614ms step_avg:167.15ms step:589/1530 train_loss:3.9891 train_time:96786ms step_avg:167.16ms step:590/1530 train_loss:3.7717 train_time:96958ms step_avg:167.17ms step:591/1530 train_loss:3.5024 train_time:97129ms step_avg:167.18ms step:592/1530 train_loss:3.5351 train_time:97302ms step_avg:167.19ms step:593/1530 train_loss:3.4958 train_time:97477ms step_avg:167.20ms step:594/1530 train_loss:3.5509 train_time:97647ms step_avg:167.20ms step:595/1530 train_loss:3.9104 train_time:97821ms step_avg:167.22ms step:596/1530 train_loss:3.6425 train_time:97994ms step_avg:167.22ms step:597/1530 train_loss:3.5823 train_time:98164ms step_avg:167.23ms step:598/1530 train_loss:3.6541 train_time:98334ms step_avg:167.23ms step:599/1530 train_loss:3.4749 train_time:98505ms step_avg:167.24ms step:600/1530 train_loss:3.5989 train_time:98675ms step_avg:167.25ms step:601/1530 train_loss:3.6414 train_time:98847ms step_avg:167.25ms step:602/1530 train_loss:3.6638 train_time:99020ms step_avg:167.26ms step:603/1530 train_loss:3.7856 train_time:99191ms step_avg:167.27ms step:604/1530 train_loss:3.6069 train_time:99362ms step_avg:167.28ms step:605/1530 train_loss:3.6092 train_time:99533ms step_avg:167.28ms step:606/1530 train_loss:3.5739 train_time:99706ms step_avg:167.29ms step:607/1530 train_loss:3.8336 train_time:99877ms step_avg:167.30ms step:608/1530 train_loss:3.6323 train_time:100048ms step_avg:167.30ms step:609/1530 train_loss:3.6149 train_time:100219ms step_avg:167.31ms step:610/1530 train_loss:3.6982 train_time:100389ms step_avg:167.31ms step:611/1530 train_loss:3.5906 train_time:100560ms step_avg:167.32ms step:612/1530 train_loss:3.5659 train_time:100731ms step_avg:167.33ms step:613/1530 train_loss:3.7596 train_time:100903ms step_avg:167.33ms step:614/1530 train_loss:3.7042 train_time:101076ms step_avg:167.34ms step:615/1530 train_loss:3.6863 train_time:101245ms step_avg:167.35ms step:616/1530 train_loss:3.6238 train_time:101418ms step_avg:167.36ms step:617/1530 train_loss:3.5542 train_time:101591ms step_avg:167.37ms step:618/1530 train_loss:3.6878 train_time:101762ms step_avg:167.37ms step:619/1530 train_loss:3.5494 train_time:101933ms step_avg:167.38ms step:620/1530 train_loss:3.5849 train_time:102104ms step_avg:167.38ms step:621/1530 train_loss:3.9227 train_time:102276ms step_avg:167.39ms step:622/1530 train_loss:3.5682 train_time:102447ms step_avg:167.40ms step:623/1530 train_loss:3.5984 train_time:102620ms step_avg:167.41ms step:624/1530 train_loss:3.6910 train_time:102792ms step_avg:167.41ms step:625/1530 train_loss:3.7006 train_time:102961ms step_avg:167.42ms step:625/1530 val_loss:3.6191 train_time:103010ms step_avg:167.50ms step:626/1530 train_loss:3.7360 train_time:103133ms step_avg:167.42ms step:627/1530 train_loss:3.7104 train_time:103306ms step_avg:167.43ms step:628/1530 train_loss:3.7557 train_time:103475ms step_avg:167.43ms step:629/1530 train_loss:3.5835 train_time:103646ms step_avg:167.44ms step:630/1530 train_loss:3.7232 train_time:103815ms step_avg:167.44ms step:631/1530 train_loss:3.7387 train_time:103986ms step_avg:167.45ms step:632/1530 train_loss:3.6453 train_time:104156ms step_avg:167.45ms step:633/1530 train_loss:3.6011 train_time:104329ms step_avg:167.46ms step:634/1530 train_loss:3.6909 train_time:104499ms step_avg:167.47ms step:635/1530 train_loss:3.9462 train_time:104670ms step_avg:167.47ms step:636/1530 train_loss:3.5424 train_time:104842ms step_avg:167.48ms step:637/1530 train_loss:3.3476 train_time:105012ms step_avg:167.48ms step:638/1530 train_loss:3.5919 train_time:105181ms step_avg:167.49ms step:639/1530 train_loss:3.6269 train_time:105351ms step_avg:167.49ms step:640/1530 train_loss:3.5634 train_time:105522ms step_avg:167.50ms step:641/1530 train_loss:3.5867 train_time:105691ms step_avg:167.50ms step:642/1530 train_loss:3.6323 train_time:105861ms step_avg:167.50ms step:643/1530 train_loss:3.5914 train_time:106032ms step_avg:167.51ms step:644/1530 train_loss:3.5683 train_time:106202ms step_avg:167.51ms step:645/1530 train_loss:3.7674 train_time:106373ms step_avg:167.52ms step:646/1530 train_loss:3.6651 train_time:106545ms step_avg:167.52ms step:647/1530 train_loss:3.6593 train_time:106715ms step_avg:167.53ms step:648/1530 train_loss:3.7086 train_time:106887ms step_avg:167.53ms step:649/1530 train_loss:3.7613 train_time:107057ms step_avg:167.54ms step:650/1530 train_loss:3.6210 train_time:107228ms step_avg:167.54ms step:651/1530 train_loss:3.7582 train_time:107399ms step_avg:167.55ms step:652/1530 train_loss:3.5805 train_time:107569ms step_avg:167.55ms step:653/1530 train_loss:3.6577 train_time:107740ms step_avg:167.56ms step:654/1530 train_loss:3.4232 train_time:107911ms step_avg:167.56ms step:655/1530 train_loss:3.5814 train_time:108080ms step_avg:167.57ms step:656/1530 train_loss:3.5709 train_time:108250ms step_avg:167.57ms step:657/1530 train_loss:3.4983 train_time:108421ms step_avg:167.57ms step:658/1530 train_loss:3.6823 train_time:108591ms step_avg:167.58ms step:659/1530 train_loss:3.5862 train_time:108762ms step_avg:167.58ms step:660/1530 train_loss:3.6811 train_time:108932ms step_avg:167.59ms step:661/1530 train_loss:3.7485 train_time:109105ms step_avg:167.60ms step:662/1530 train_loss:3.6674 train_time:109274ms step_avg:167.60ms step:663/1530 train_loss:3.5504 train_time:109445ms step_avg:167.60ms step:664/1530 train_loss:3.6008 train_time:109614ms step_avg:167.60ms step:665/1530 train_loss:3.4862 train_time:109785ms step_avg:167.61ms step:666/1530 train_loss:3.7758 train_time:109954ms step_avg:167.61ms step:667/1530 train_loss:3.5968 train_time:110126ms step_avg:167.62ms step:668/1530 train_loss:3.6435 train_time:110297ms step_avg:167.62ms step:669/1530 train_loss:3.4815 train_time:110468ms step_avg:167.63ms step:670/1530 train_loss:3.6005 train_time:110638ms step_avg:167.63ms step:671/1530 train_loss:3.5566 train_time:110809ms step_avg:167.64ms step:672/1530 train_loss:3.5677 train_time:110980ms step_avg:167.64ms step:673/1530 train_loss:3.8464 train_time:111151ms step_avg:167.65ms step:674/1530 train_loss:3.6240 train_time:111322ms step_avg:167.65ms step:675/1530 train_loss:3.7060 train_time:111492ms step_avg:167.66ms step:676/1530 train_loss:3.4868 train_time:111663ms step_avg:167.66ms step:677/1530 train_loss:3.5964 train_time:111834ms step_avg:167.67ms step:678/1530 train_loss:3.5539 train_time:112005ms step_avg:167.67ms step:679/1530 train_loss:3.6717 train_time:112176ms step_avg:167.68ms step:680/1530 train_loss:3.5834 train_time:112347ms step_avg:167.68ms step:681/1530 train_loss:3.6111 train_time:112519ms step_avg:167.69ms step:682/1530 train_loss:3.6593 train_time:112692ms step_avg:167.70ms step:683/1530 train_loss:3.7326 train_time:112865ms step_avg:167.70ms step:684/1530 train_loss:3.6404 train_time:113037ms step_avg:167.71ms step:685/1530 train_loss:3.6830 train_time:113211ms step_avg:167.72ms step:686/1530 train_loss:3.6315 train_time:113384ms step_avg:167.73ms step:687/1530 train_loss:3.6659 train_time:113555ms step_avg:167.73ms step:688/1530 train_loss:3.2053 train_time:113731ms step_avg:167.75ms step:689/1530 train_loss:3.4000 train_time:113906ms step_avg:167.76ms step:690/1530 train_loss:3.5391 train_time:114082ms step_avg:167.77ms step:691/1530 train_loss:3.4084 train_time:114253ms step_avg:167.77ms step:692/1530 train_loss:3.6226 train_time:114425ms step_avg:167.78ms step:693/1530 train_loss:3.6474 train_time:114598ms step_avg:167.79ms step:694/1530 train_loss:3.5474 train_time:114770ms step_avg:167.79ms step:695/1530 train_loss:3.5311 train_time:114941ms step_avg:167.80ms step:696/1530 train_loss:3.8558 train_time:115114ms step_avg:167.80ms step:697/1530 train_loss:3.5825 train_time:115287ms step_avg:167.81ms step:698/1530 train_loss:3.6410 train_time:115458ms step_avg:167.82ms step:699/1530 train_loss:3.7588 train_time:115633ms step_avg:167.83ms step:700/1530 train_loss:3.5672 train_time:115806ms step_avg:167.83ms step:701/1530 train_loss:3.5360 train_time:115976ms step_avg:167.84ms step:702/1530 train_loss:3.5051 train_time:116149ms step_avg:167.85ms step:703/1530 train_loss:3.4927 train_time:116322ms step_avg:167.85ms step:704/1530 train_loss:3.5725 train_time:116495ms step_avg:167.86ms step:705/1530 train_loss:3.5567 train_time:116671ms step_avg:167.87ms step:706/1530 train_loss:3.5763 train_time:116847ms step_avg:167.88ms step:707/1530 train_loss:3.6448 train_time:117022ms step_avg:167.89ms step:708/1530 train_loss:3.5997 train_time:117194ms step_avg:167.90ms step:709/1530 train_loss:3.5813 train_time:117367ms step_avg:167.91ms step:710/1530 train_loss:3.5403 train_time:117538ms step_avg:167.91ms step:711/1530 train_loss:3.5838 train_time:117712ms step_avg:167.92ms step:712/1530 train_loss:3.6383 train_time:117887ms step_avg:167.93ms step:713/1530 train_loss:3.6474 train_time:118062ms step_avg:167.94ms step:714/1530 train_loss:3.5581 train_time:118234ms step_avg:167.95ms step:715/1530 train_loss:3.5680 train_time:118407ms step_avg:167.95ms step:716/1530 train_loss:3.5793 train_time:118579ms step_avg:167.96ms step:717/1530 train_loss:3.7010 train_time:118753ms step_avg:167.97ms step:718/1530 train_loss:3.5900 train_time:118924ms step_avg:167.97ms step:719/1530 train_loss:3.6766 train_time:119096ms step_avg:167.98ms step:720/1530 train_loss:3.8447 train_time:119271ms step_avg:167.99ms step:721/1530 train_loss:3.4603 train_time:119444ms step_avg:167.99ms step:722/1530 train_loss:3.7324 train_time:119616ms step_avg:168.00ms step:723/1530 train_loss:3.7632 train_time:119788ms step_avg:168.00ms step:724/1530 train_loss:3.5623 train_time:119960ms step_avg:168.01ms step:725/1530 train_loss:3.6507 train_time:120133ms step_avg:168.02ms step:726/1530 train_loss:3.5289 train_time:120305ms step_avg:168.02ms step:727/1530 train_loss:3.5752 train_time:120481ms step_avg:168.04ms step:728/1530 train_loss:3.7265 train_time:120654ms step_avg:168.04ms step:729/1530 train_loss:3.6634 train_time:120827ms step_avg:168.05ms step:730/1530 train_loss:3.6587 train_time:121002ms step_avg:168.06ms step:731/1530 train_loss:3.5485 train_time:121173ms step_avg:168.06ms step:732/1530 train_loss:3.5918 train_time:121345ms step_avg:168.07ms step:733/1530 train_loss:3.8291 train_time:121517ms step_avg:168.07ms step:734/1530 train_loss:3.5537 train_time:121691ms step_avg:168.08ms step:735/1530 train_loss:3.6120 train_time:121862ms step_avg:168.09ms step:736/1530 train_loss:3.7323 train_time:122036ms step_avg:168.09ms step:737/1530 train_loss:3.6734 train_time:122207ms step_avg:168.10ms step:738/1530 train_loss:3.5973 train_time:122379ms step_avg:168.10ms step:739/1530 train_loss:3.4922 train_time:122551ms step_avg:168.11ms step:740/1530 train_loss:4.1175 train_time:122730ms step_avg:168.12ms step:741/1530 train_loss:3.4909 train_time:122903ms step_avg:168.13ms step:742/1530 train_loss:3.5488 train_time:123076ms step_avg:168.14ms step:743/1530 train_loss:3.5746 train_time:123248ms step_avg:168.14ms step:744/1530 train_loss:3.6436 train_time:123420ms step_avg:168.15ms step:745/1530 train_loss:3.5888 train_time:123592ms step_avg:168.15ms step:746/1530 train_loss:3.5923 train_time:123763ms step_avg:168.16ms step:747/1530 train_loss:3.6410 train_time:123939ms step_avg:168.17ms step:748/1530 train_loss:3.5643 train_time:124115ms step_avg:168.18ms step:749/1530 train_loss:3.5607 train_time:124287ms step_avg:168.18ms step:750/1530 train_loss:3.5945 train_time:124456ms step_avg:168.18ms step:750/1530 val_loss:3.5631 train_time:124507ms step_avg:168.25ms step:751/1530 train_loss:3.5675 train_time:124632ms step_avg:168.19ms step:752/1530 train_loss:3.6147 train_time:124800ms step_avg:168.19ms step:753/1530 train_loss:3.6136 train_time:124975ms step_avg:168.20ms step:754/1530 train_loss:3.5946 train_time:125148ms step_avg:168.21ms step:755/1530 train_loss:3.6777 train_time:125462ms step_avg:168.41ms step:756/1530 train_loss:3.4521 train_time:125647ms step_avg:168.43ms step:757/1530 train_loss:3.7213 train_time:125821ms step_avg:168.43ms step:758/1530 train_loss:3.6490 train_time:125992ms step_avg:168.44ms step:759/1530 train_loss:3.5866 train_time:126319ms step_avg:168.65ms step:760/1530 train_loss:3.6991 train_time:126491ms step_avg:168.65ms step:761/1530 train_loss:3.3949 train_time:126664ms step_avg:168.66ms step:762/1530 train_loss:3.5450 train_time:126836ms step_avg:168.66ms step:763/1530 train_loss:3.6612 train_time:127009ms step_avg:168.67ms step:764/1530 train_loss:3.3205 train_time:127181ms step_avg:168.67ms step:765/1530 train_loss:3.7313 train_time:127354ms step_avg:168.68ms step:766/1530 train_loss:3.5658 train_time:127525ms step_avg:168.68ms step:767/1530 train_loss:3.5592 train_time:127698ms step_avg:168.69ms step:768/1530 train_loss:3.5644 train_time:127871ms step_avg:168.70ms step:769/1530 train_loss:3.5824 train_time:128044ms step_avg:168.70ms step:770/1530 train_loss:3.6403 train_time:128214ms step_avg:168.70ms step:771/1530 train_loss:3.8827 train_time:128388ms step_avg:168.71ms step:772/1530 train_loss:3.4459 train_time:128560ms step_avg:168.71ms step:773/1530 train_loss:3.6223 train_time:128731ms step_avg:168.72ms step:774/1530 train_loss:3.6347 train_time:128902ms step_avg:168.72ms step:775/1530 train_loss:3.6051 train_time:129074ms step_avg:168.72ms step:776/1530 train_loss:3.4089 train_time:129248ms step_avg:168.73ms step:777/1530 train_loss:3.3885 train_time:129421ms step_avg:168.74ms step:778/1530 train_loss:3.4887 train_time:129593ms step_avg:168.74ms step:779/1530 train_loss:3.5801 train_time:129764ms step_avg:168.74ms step:780/1530 train_loss:3.5840 train_time:129937ms step_avg:168.75ms step:781/1530 train_loss:3.6734 train_time:130110ms step_avg:168.76ms step:782/1530 train_loss:3.5898 train_time:130281ms step_avg:168.76ms step:783/1530 train_loss:3.5674 train_time:130453ms step_avg:168.76ms step:784/1530 train_loss:3.5974 train_time:130625ms step_avg:168.77ms step:785/1530 train_loss:3.5567 train_time:130797ms step_avg:168.77ms step:786/1530 train_loss:3.4421 train_time:130972ms step_avg:168.78ms step:787/1530 train_loss:3.7061 train_time:131145ms step_avg:168.78ms step:788/1530 train_loss:3.4968 train_time:131317ms step_avg:168.79ms step:789/1530 train_loss:3.5451 train_time:131489ms step_avg:168.79ms step:790/1530 train_loss:3.6301 train_time:131663ms step_avg:168.80ms step:791/1530 train_loss:3.7722 train_time:131838ms step_avg:168.81ms step:792/1530 train_loss:3.7561 train_time:132011ms step_avg:168.81ms step:793/1530 train_loss:3.4462 train_time:132181ms step_avg:168.81ms step:794/1530 train_loss:3.5934 train_time:132355ms step_avg:168.82ms step:795/1530 train_loss:3.6710 train_time:132529ms step_avg:168.83ms step:796/1530 train_loss:3.7516 train_time:132704ms step_avg:168.83ms step:797/1530 train_loss:3.5229 train_time:132877ms step_avg:168.84ms step:798/1530 train_loss:3.6417 train_time:133053ms step_avg:168.85ms step:799/1530 train_loss:3.5314 train_time:133228ms step_avg:168.86ms step:800/1530 train_loss:3.5250 train_time:133401ms step_avg:168.86ms step:801/1530 train_loss:3.6205 train_time:133577ms step_avg:168.87ms step:802/1530 train_loss:3.4942 train_time:133753ms step_avg:168.88ms step:803/1530 train_loss:3.4863 train_time:133926ms step_avg:168.89ms step:804/1530 train_loss:3.6243 train_time:134100ms step_avg:168.89ms step:805/1530 train_loss:3.5165 train_time:134276ms step_avg:168.90ms step:806/1530 train_loss:3.5580 train_time:134449ms step_avg:168.91ms step:807/1530 train_loss:3.6409 train_time:134622ms step_avg:168.91ms step:808/1530 train_loss:3.5371 train_time:134797ms step_avg:168.92ms step:809/1530 train_loss:3.4907 train_time:134973ms step_avg:168.93ms step:810/1530 train_loss:3.5603 train_time:135145ms step_avg:168.93ms step:811/1530 train_loss:3.5739 train_time:135318ms step_avg:168.94ms step:812/1530 train_loss:3.6003 train_time:135492ms step_avg:168.94ms step:813/1530 train_loss:3.6213 train_time:135664ms step_avg:168.95ms step:814/1530 train_loss:3.5712 train_time:135838ms step_avg:168.95ms step:815/1530 train_loss:3.5631 train_time:136012ms step_avg:168.96ms step:816/1530 train_loss:3.6853 train_time:136187ms step_avg:168.97ms step:817/1530 train_loss:3.7654 train_time:136360ms step_avg:168.97ms step:818/1530 train_loss:3.5203 train_time:136533ms step_avg:168.98ms step:819/1530 train_loss:3.7133 train_time:136708ms step_avg:168.98ms step:820/1530 train_loss:3.4858 train_time:136883ms step_avg:168.99ms step:821/1530 train_loss:3.5566 train_time:137056ms step_avg:169.00ms step:822/1530 train_loss:3.6952 train_time:137232ms step_avg:169.00ms step:823/1530 train_loss:3.5739 train_time:137405ms step_avg:169.01ms step:824/1530 train_loss:3.5127 train_time:137578ms step_avg:169.01ms step:825/1530 train_loss:3.6075 train_time:137755ms step_avg:169.02ms step:826/1530 train_loss:3.4754 train_time:137930ms step_avg:169.03ms step:827/1530 train_loss:3.7288 train_time:138103ms step_avg:169.04ms step:828/1530 train_loss:3.6184 train_time:138278ms step_avg:169.04ms step:829/1530 train_loss:3.6261 train_time:138456ms step_avg:169.05ms step:830/1530 train_loss:3.5263 train_time:138631ms step_avg:169.06ms step:831/1530 train_loss:3.5966 train_time:138804ms step_avg:169.07ms step:832/1530 train_loss:3.5086 train_time:138978ms step_avg:169.07ms step:833/1530 train_loss:3.6500 train_time:139155ms step_avg:169.08ms step:834/1530 train_loss:3.4739 train_time:139329ms step_avg:169.09ms step:835/1530 train_loss:3.4570 train_time:139502ms step_avg:169.09ms step:836/1530 train_loss:3.7147 train_time:139678ms step_avg:169.10ms step:837/1530 train_loss:3.3957 train_time:139853ms step_avg:169.11ms step:838/1530 train_loss:3.5860 train_time:140026ms step_avg:169.11ms step:839/1530 train_loss:3.4159 train_time:140201ms step_avg:169.12ms step:840/1530 train_loss:3.4660 train_time:140374ms step_avg:169.13ms step:841/1530 train_loss:3.5667 train_time:140545ms step_avg:169.13ms step:842/1530 train_loss:3.5821 train_time:140721ms step_avg:169.14ms step:843/1530 train_loss:3.5581 train_time:140895ms step_avg:169.14ms step:844/1530 train_loss:3.4258 train_time:141067ms step_avg:169.15ms step:845/1530 train_loss:3.6571 train_time:141241ms step_avg:169.15ms step:846/1530 train_loss:3.5151 train_time:141415ms step_avg:169.16ms step:847/1530 train_loss:3.4913 train_time:141589ms step_avg:169.16ms step:848/1530 train_loss:3.6353 train_time:141762ms step_avg:169.17ms step:849/1530 train_loss:3.4901 train_time:141936ms step_avg:169.17ms step:850/1530 train_loss:3.4403 train_time:142111ms step_avg:169.18ms step:851/1530 train_loss:3.7236 train_time:142284ms step_avg:169.18ms step:852/1530 train_loss:3.4350 train_time:142458ms step_avg:169.19ms step:853/1530 train_loss:3.5630 train_time:142631ms step_avg:169.19ms step:854/1530 train_loss:3.6414 train_time:142805ms step_avg:169.20ms step:855/1530 train_loss:3.5107 train_time:142978ms step_avg:169.20ms step:856/1530 train_loss:3.5417 train_time:143151ms step_avg:169.21ms step:857/1530 train_loss:3.6009 train_time:143325ms step_avg:169.21ms step:858/1530 train_loss:3.4618 train_time:143500ms step_avg:169.22ms step:859/1530 train_loss:3.5550 train_time:143674ms step_avg:169.23ms step:860/1530 train_loss:3.5834 train_time:143846ms step_avg:169.23ms step:861/1530 train_loss:3.6276 train_time:144022ms step_avg:169.24ms step:862/1530 train_loss:3.5971 train_time:144199ms step_avg:169.25ms step:863/1530 train_loss:3.5668 train_time:144375ms step_avg:169.26ms step:864/1530 train_loss:3.3785 train_time:144548ms step_avg:169.26ms step:865/1530 train_loss:3.5939 train_time:144720ms step_avg:169.26ms step:866/1530 train_loss:3.8891 train_time:144897ms step_avg:169.27ms step:867/1530 train_loss:3.4543 train_time:145071ms step_avg:169.28ms step:868/1530 train_loss:3.6384 train_time:145241ms step_avg:169.28ms step:869/1530 train_loss:3.6112 train_time:145415ms step_avg:169.28ms step:870/1530 train_loss:3.4439 train_time:145589ms step_avg:169.29ms step:871/1530 train_loss:3.4041 train_time:145764ms step_avg:169.30ms step:872/1530 train_loss:3.6457 train_time:145938ms step_avg:169.30ms step:873/1530 train_loss:3.4589 train_time:146112ms step_avg:169.31ms step:874/1530 train_loss:3.2237 train_time:146288ms step_avg:169.31ms step:875/1530 train_loss:3.6274 train_time:146461ms step_avg:169.32ms step:875/1530 val_loss:3.5184 train_time:146511ms step_avg:169.38ms step:876/1530 train_loss:3.4398 train_time:146637ms step_avg:169.33ms step:877/1530 train_loss:3.6163 train_time:146813ms step_avg:169.33ms step:878/1530 train_loss:3.4630 train_time:146988ms step_avg:169.34ms step:879/1530 train_loss:3.6422 train_time:147161ms step_avg:169.34ms step:880/1530 train_loss:3.3099 train_time:147330ms step_avg:169.35ms step:881/1530 train_loss:3.4713 train_time:147503ms step_avg:169.35ms step:882/1530 train_loss:3.6918 train_time:147673ms step_avg:169.35ms step:883/1530 train_loss:3.8367 train_time:147846ms step_avg:169.35ms step:884/1530 train_loss:3.5617 train_time:148021ms step_avg:169.36ms step:885/1530 train_loss:3.4891 train_time:148195ms step_avg:169.37ms step:886/1530 train_loss:3.5670 train_time:148368ms step_avg:169.37ms step:887/1530 train_loss:4.0786 train_time:148542ms step_avg:169.38ms step:888/1530 train_loss:3.8330 train_time:148721ms step_avg:169.39ms step:889/1530 train_loss:3.5133 train_time:148895ms step_avg:169.39ms step:890/1530 train_loss:3.5298 train_time:149066ms step_avg:169.39ms step:891/1530 train_loss:3.3577 train_time:149241ms step_avg:169.40ms step:892/1530 train_loss:3.7172 train_time:149413ms step_avg:169.40ms step:893/1530 train_loss:3.4192 train_time:149585ms step_avg:169.41ms step:894/1530 train_loss:3.6244 train_time:149760ms step_avg:169.41ms step:895/1530 train_loss:3.6824 train_time:149934ms step_avg:169.42ms step:896/1530 train_loss:3.4944 train_time:150107ms step_avg:169.42ms step:897/1530 train_loss:3.5368 train_time:150283ms step_avg:169.43ms step:898/1530 train_loss:3.5837 train_time:150459ms step_avg:169.44ms step:899/1530 train_loss:3.4698 train_time:150631ms step_avg:169.44ms step:900/1530 train_loss:3.4218 train_time:150803ms step_avg:169.44ms step:901/1530 train_loss:3.6098 train_time:150976ms step_avg:169.45ms step:902/1530 train_loss:3.6288 train_time:151148ms step_avg:169.45ms step:903/1530 train_loss:3.5340 train_time:151324ms step_avg:169.46ms step:904/1530 train_loss:3.4885 train_time:151500ms step_avg:169.46ms step:905/1530 train_loss:3.5003 train_time:151670ms step_avg:169.46ms step:906/1530 train_loss:3.6989 train_time:151845ms step_avg:169.47ms step:907/1530 train_loss:3.5151 train_time:152019ms step_avg:169.47ms step:908/1530 train_loss:3.5605 train_time:152191ms step_avg:169.48ms step:909/1530 train_loss:3.4501 train_time:152366ms step_avg:169.48ms step:910/1530 train_loss:3.5216 train_time:152545ms step_avg:169.49ms step:911/1530 train_loss:3.6422 train_time:152721ms step_avg:169.50ms step:912/1530 train_loss:3.6151 train_time:152903ms step_avg:169.52ms step:913/1530 train_loss:3.4584 train_time:153082ms step_avg:169.53ms step:914/1530 train_loss:3.7472 train_time:153260ms step_avg:169.54ms step:915/1530 train_loss:3.5315 train_time:153439ms step_avg:169.55ms step:916/1530 train_loss:3.6164 train_time:153615ms step_avg:169.55ms step:917/1530 train_loss:3.5949 train_time:153790ms step_avg:169.56ms step:918/1530 train_loss:4.8195 train_time:153969ms step_avg:169.57ms step:919/1530 train_loss:3.4928 train_time:154147ms step_avg:169.58ms step:920/1530 train_loss:3.5827 train_time:154322ms step_avg:169.58ms step:921/1530 train_loss:3.5467 train_time:154501ms step_avg:169.59ms step:922/1530 train_loss:3.5766 train_time:154677ms step_avg:169.60ms step:923/1530 train_loss:3.6062 train_time:154854ms step_avg:169.61ms step:924/1530 train_loss:3.6736 train_time:155029ms step_avg:169.62ms step:925/1530 train_loss:3.6439 train_time:155205ms step_avg:169.62ms step:926/1530 train_loss:3.5548 train_time:155377ms step_avg:169.63ms step:927/1530 train_loss:3.5534 train_time:155553ms step_avg:169.63ms step:928/1530 train_loss:3.7825 train_time:155729ms step_avg:169.64ms step:929/1530 train_loss:3.6066 train_time:155904ms step_avg:169.65ms step:930/1530 train_loss:3.3973 train_time:156080ms step_avg:169.65ms step:931/1530 train_loss:3.4939 train_time:156254ms step_avg:169.66ms step:932/1530 train_loss:3.6392 train_time:156431ms step_avg:169.66ms step:933/1530 train_loss:3.3562 train_time:156608ms step_avg:169.67ms step:934/1530 train_loss:3.5795 train_time:156785ms step_avg:169.68ms step:935/1530 train_loss:3.4352 train_time:156963ms step_avg:169.69ms step:936/1530 train_loss:3.5232 train_time:157141ms step_avg:169.70ms step:937/1530 train_loss:3.6198 train_time:157320ms step_avg:169.71ms step:938/1530 train_loss:3.5337 train_time:157494ms step_avg:169.71ms step:939/1530 train_loss:3.6708 train_time:157674ms step_avg:169.72ms step:940/1530 train_loss:3.4758 train_time:157848ms step_avg:169.73ms step:941/1530 train_loss:3.5425 train_time:158024ms step_avg:169.74ms step:942/1530 train_loss:3.3595 train_time:158202ms step_avg:169.74ms step:943/1530 train_loss:3.7098 train_time:158383ms step_avg:169.76ms step:944/1530 train_loss:3.4021 train_time:158700ms step_avg:169.91ms step:945/1530 train_loss:3.4263 train_time:158883ms step_avg:169.93ms step:946/1530 train_loss:5.0665 train_time:159064ms step_avg:169.94ms step:947/1530 train_loss:3.5975 train_time:159240ms step_avg:169.95ms step:948/1530 train_loss:3.4825 train_time:159414ms step_avg:169.95ms step:949/1530 train_loss:3.3693 train_time:159735ms step_avg:170.11ms step:950/1530 train_loss:3.4394 train_time:159908ms step_avg:170.12ms step:951/1530 train_loss:3.4013 train_time:160087ms step_avg:170.12ms step:952/1530 train_loss:3.4753 train_time:160263ms step_avg:170.13ms step:953/1530 train_loss:3.5630 train_time:160440ms step_avg:170.14ms step:954/1530 train_loss:3.4467 train_time:160618ms step_avg:170.15ms step:955/1530 train_loss:3.4713 train_time:160794ms step_avg:170.15ms step:956/1530 train_loss:3.4349 train_time:160968ms step_avg:170.16ms step:957/1530 train_loss:3.4861 train_time:161147ms step_avg:170.17ms step:958/1530 train_loss:3.5009 train_time:161326ms step_avg:170.18ms step:959/1530 train_loss:3.5060 train_time:161504ms step_avg:170.18ms step:960/1530 train_loss:3.4041 train_time:161680ms step_avg:170.19ms step:961/1530 train_loss:3.6424 train_time:161856ms step_avg:170.20ms step:962/1530 train_loss:3.5918 train_time:162030ms step_avg:170.20ms step:963/1530 train_loss:3.6996 train_time:162207ms step_avg:170.21ms step:964/1530 train_loss:3.4194 train_time:162386ms step_avg:170.22ms step:965/1530 train_loss:3.4743 train_time:162560ms step_avg:170.22ms step:966/1530 train_loss:3.7038 train_time:162732ms step_avg:170.22ms step:967/1530 train_loss:3.5160 train_time:162908ms step_avg:170.23ms step:968/1530 train_loss:3.5100 train_time:163084ms step_avg:170.23ms step:969/1530 train_loss:3.5784 train_time:163259ms step_avg:170.24ms step:970/1530 train_loss:3.3761 train_time:163430ms step_avg:170.24ms step:971/1530 train_loss:3.5314 train_time:163606ms step_avg:170.25ms step:972/1530 train_loss:3.4803 train_time:163778ms step_avg:170.25ms step:973/1530 train_loss:3.5424 train_time:163953ms step_avg:170.25ms step:974/1530 train_loss:3.5878 train_time:164128ms step_avg:170.26ms step:975/1530 train_loss:3.4632 train_time:164306ms step_avg:170.26ms step:976/1530 train_loss:3.6673 train_time:164479ms step_avg:170.27ms step:977/1530 train_loss:3.5714 train_time:164652ms step_avg:170.27ms step:978/1530 train_loss:3.3587 train_time:164826ms step_avg:170.28ms step:979/1530 train_loss:3.6232 train_time:165004ms step_avg:170.28ms step:980/1530 train_loss:3.4157 train_time:165179ms step_avg:170.29ms step:981/1530 train_loss:3.5726 train_time:165357ms step_avg:170.30ms step:982/1530 train_loss:3.5411 train_time:165530ms step_avg:170.30ms step:983/1530 train_loss:3.5111 train_time:165707ms step_avg:170.31ms step:984/1530 train_loss:3.5011 train_time:165882ms step_avg:170.31ms step:985/1530 train_loss:3.5697 train_time:166060ms step_avg:170.32ms step:986/1530 train_loss:3.4130 train_time:166233ms step_avg:170.32ms step:987/1530 train_loss:3.4815 train_time:166407ms step_avg:170.32ms step:988/1530 train_loss:3.4765 train_time:166582ms step_avg:170.33ms step:989/1530 train_loss:3.4136 train_time:166754ms step_avg:170.33ms step:990/1530 train_loss:3.6535 train_time:166929ms step_avg:170.34ms step:991/1530 train_loss:3.4680 train_time:167104ms step_avg:170.34ms step:992/1530 train_loss:3.4431 train_time:167284ms step_avg:170.35ms step:993/1530 train_loss:3.4979 train_time:167462ms step_avg:170.36ms step:994/1530 train_loss:3.5935 train_time:167636ms step_avg:170.36ms step:995/1530 train_loss:3.5217 train_time:167808ms step_avg:170.36ms step:996/1530 train_loss:3.4526 train_time:167982ms step_avg:170.37ms step:997/1530 train_loss:3.7511 train_time:168156ms step_avg:170.37ms step:998/1530 train_loss:3.4379 train_time:168328ms step_avg:170.37ms step:999/1530 train_loss:3.5814 train_time:168503ms step_avg:170.38ms step:1000/1530 train_loss:3.4352 train_time:168680ms step_avg:170.38ms step:1000/1530 val_loss:3.4649 train_time:168731ms step_avg:170.44ms step:1001/1530 train_loss:3.4994 train_time:168856ms step_avg:170.39ms step:1002/1530 train_loss:3.3728 train_time:169029ms step_avg:170.39ms step:1003/1530 train_loss:3.5546 train_time:169206ms step_avg:170.40ms step:1004/1530 train_loss:3.6023 train_time:169381ms step_avg:170.40ms step:1005/1530 train_loss:3.3909 train_time:169555ms step_avg:170.41ms step:1006/1530 train_loss:3.4594 train_time:169730ms step_avg:170.41ms step:1007/1530 train_loss:3.4376 train_time:169906ms step_avg:170.42ms step:1008/1530 train_loss:3.5643 train_time:170081ms step_avg:170.42ms step:1009/1530 train_loss:3.6596 train_time:170258ms step_avg:170.43ms step:1010/1530 train_loss:3.5624 train_time:170431ms step_avg:170.43ms step:1011/1530 train_loss:3.5317 train_time:170605ms step_avg:170.43ms step:1012/1530 train_loss:3.3832 train_time:170779ms step_avg:170.44ms step:1013/1530 train_loss:3.5300 train_time:170955ms step_avg:170.44ms step:1014/1530 train_loss:3.6180 train_time:171132ms step_avg:170.45ms step:1015/1530 train_loss:3.3253 train_time:171309ms step_avg:170.46ms step:1016/1530 train_loss:3.4052 train_time:171484ms step_avg:170.46ms step:1017/1530 train_loss:3.3929 train_time:171661ms step_avg:170.47ms step:1018/1530 train_loss:3.3911 train_time:171836ms step_avg:170.47ms step:1019/1530 train_loss:3.5184 train_time:172012ms step_avg:170.48ms step:1020/1530 train_loss:3.3799 train_time:172189ms step_avg:170.48ms step:1021/1530 train_loss:3.3542 train_time:172363ms step_avg:170.49ms step:1022/1530 train_loss:3.4752 train_time:172540ms step_avg:170.49ms step:1023/1530 train_loss:3.5041 train_time:172714ms step_avg:170.50ms step:1024/1530 train_loss:3.4708 train_time:172892ms step_avg:170.50ms step:1025/1530 train_loss:3.4782 train_time:173068ms step_avg:170.51ms step:1026/1530 train_loss:3.6174 train_time:173243ms step_avg:170.51ms step:1027/1530 train_loss:3.3146 train_time:173417ms step_avg:170.52ms step:1028/1530 train_loss:3.3961 train_time:173599ms step_avg:170.53ms step:1029/1530 train_loss:3.3080 train_time:173779ms step_avg:170.54ms step:1030/1530 train_loss:3.5368 train_time:173954ms step_avg:170.54ms step:1031/1530 train_loss:3.5057 train_time:174130ms step_avg:170.55ms step:1032/1530 train_loss:3.6925 train_time:174313ms step_avg:170.56ms step:1033/1530 train_loss:3.4858 train_time:174489ms step_avg:170.57ms step:1034/1530 train_loss:3.4077 train_time:174666ms step_avg:170.57ms step:1035/1530 train_loss:3.4404 train_time:174846ms step_avg:170.58ms step:1036/1530 train_loss:3.4766 train_time:175022ms step_avg:170.59ms step:1037/1530 train_loss:3.7849 train_time:175200ms step_avg:170.59ms step:1038/1530 train_loss:3.6148 train_time:175378ms step_avg:170.60ms step:1039/1530 train_loss:3.5129 train_time:175561ms step_avg:170.61ms step:1040/1530 train_loss:3.4078 train_time:175736ms step_avg:170.62ms step:1041/1530 train_loss:3.4827 train_time:175915ms step_avg:170.63ms step:1042/1530 train_loss:3.5173 train_time:176088ms step_avg:170.63ms step:1043/1530 train_loss:3.4448 train_time:176263ms step_avg:170.63ms step:1044/1530 train_loss:3.4508 train_time:176439ms step_avg:170.64ms step:1045/1530 train_loss:3.5127 train_time:176617ms step_avg:170.64ms step:1046/1530 train_loss:3.4246 train_time:176792ms step_avg:170.65ms step:1047/1530 train_loss:3.6307 train_time:176968ms step_avg:170.65ms step:1048/1530 train_loss:3.4935 train_time:177145ms step_avg:170.66ms step:1049/1530 train_loss:3.4015 train_time:177321ms step_avg:170.66ms step:1050/1530 train_loss:3.3844 train_time:177498ms step_avg:170.67ms step:1051/1530 train_loss:3.4872 train_time:177675ms step_avg:170.68ms step:1052/1530 train_loss:3.3563 train_time:177852ms step_avg:170.68ms step:1053/1530 train_loss:3.6889 train_time:178030ms step_avg:170.69ms step:1054/1530 train_loss:3.5366 train_time:178211ms step_avg:170.70ms step:1055/1530 train_loss:3.3853 train_time:178386ms step_avg:170.70ms step:1056/1530 train_loss:3.4940 train_time:178561ms step_avg:170.71ms step:1057/1530 train_loss:3.5759 train_time:178738ms step_avg:170.71ms step:1058/1530 train_loss:3.3022 train_time:178918ms step_avg:170.72ms step:1059/1530 train_loss:3.3638 train_time:179099ms step_avg:170.73ms step:1060/1530 train_loss:3.4377 train_time:179274ms step_avg:170.74ms step:1061/1530 train_loss:3.4109 train_time:179448ms step_avg:170.74ms step:1062/1530 train_loss:3.3846 train_time:179625ms step_avg:170.75ms step:1063/1530 train_loss:3.4553 train_time:179800ms step_avg:170.75ms step:1064/1530 train_loss:3.3827 train_time:179972ms step_avg:170.75ms step:1065/1530 train_loss:3.3582 train_time:180151ms step_avg:170.76ms step:1066/1530 train_loss:3.4144 train_time:180329ms step_avg:170.77ms step:1067/1530 train_loss:3.2810 train_time:180508ms step_avg:170.77ms step:1068/1530 train_loss:3.4336 train_time:180684ms step_avg:170.78ms step:1069/1530 train_loss:3.2960 train_time:180865ms step_avg:170.79ms step:1070/1530 train_loss:3.5666 train_time:181039ms step_avg:170.79ms step:1071/1530 train_loss:3.5110 train_time:181219ms step_avg:170.80ms step:1072/1530 train_loss:3.4402 train_time:181393ms step_avg:170.80ms step:1073/1530 train_loss:3.5172 train_time:181566ms step_avg:170.81ms step:1074/1530 train_loss:3.4281 train_time:181744ms step_avg:170.81ms step:1075/1530 train_loss:3.3995 train_time:181922ms step_avg:170.82ms step:1076/1530 train_loss:3.7931 train_time:182096ms step_avg:170.82ms step:1077/1530 train_loss:3.4264 train_time:182271ms step_avg:170.83ms step:1078/1530 train_loss:3.1018 train_time:182456ms step_avg:170.84ms step:1079/1530 train_loss:3.5299 train_time:182632ms step_avg:170.84ms step:1080/1530 train_loss:3.4231 train_time:182810ms step_avg:170.85ms step:1081/1530 train_loss:3.4979 train_time:182984ms step_avg:170.85ms step:1082/1530 train_loss:3.5886 train_time:183157ms step_avg:170.86ms step:1083/1530 train_loss:3.4922 train_time:183333ms step_avg:170.86ms step:1084/1530 train_loss:3.4649 train_time:183508ms step_avg:170.86ms step:1085/1530 train_loss:3.4334 train_time:183684ms step_avg:170.87ms step:1086/1530 train_loss:3.6284 train_time:183858ms step_avg:170.87ms step:1087/1530 train_loss:3.5013 train_time:184035ms step_avg:170.88ms step:1088/1530 train_loss:3.3665 train_time:184213ms step_avg:170.88ms step:1089/1530 train_loss:3.3709 train_time:184392ms step_avg:170.89ms step:1090/1530 train_loss:3.4765 train_time:184570ms step_avg:170.90ms step:1091/1530 train_loss:3.2806 train_time:184746ms step_avg:170.90ms step:1092/1530 train_loss:3.4809 train_time:184923ms step_avg:170.91ms step:1093/1530 train_loss:3.6004 train_time:185098ms step_avg:170.91ms step:1094/1530 train_loss:3.4404 train_time:185272ms step_avg:170.92ms step:1095/1530 train_loss:3.4184 train_time:185447ms step_avg:170.92ms step:1096/1530 train_loss:3.4235 train_time:185624ms step_avg:170.92ms step:1097/1530 train_loss:3.4891 train_time:185802ms step_avg:170.93ms step:1098/1530 train_loss:3.5615 train_time:185980ms step_avg:170.94ms step:1099/1530 train_loss:3.5241 train_time:186156ms step_avg:170.94ms step:1100/1530 train_loss:3.4249 train_time:186335ms step_avg:170.95ms step:1101/1530 train_loss:3.2832 train_time:186511ms step_avg:170.95ms step:1102/1530 train_loss:3.3101 train_time:186690ms step_avg:170.96ms step:1103/1530 train_loss:3.4397 train_time:186870ms step_avg:170.97ms step:1104/1530 train_loss:3.3203 train_time:187046ms step_avg:170.97ms step:1105/1530 train_loss:4.0607 train_time:187224ms step_avg:170.98ms step:1106/1530 train_loss:3.2214 train_time:187399ms step_avg:170.98ms step:1107/1530 train_loss:3.5631 train_time:187573ms step_avg:170.99ms step:1108/1530 train_loss:3.3435 train_time:187748ms step_avg:170.99ms step:1109/1530 train_loss:3.4983 train_time:187924ms step_avg:171.00ms step:1110/1530 train_loss:3.4233 train_time:188097ms step_avg:171.00ms step:1111/1530 train_loss:3.4832 train_time:188273ms step_avg:171.00ms step:1112/1530 train_loss:3.5558 train_time:188451ms step_avg:171.01ms step:1113/1530 train_loss:3.4269 train_time:188634ms step_avg:171.02ms step:1114/1530 train_loss:3.3632 train_time:188813ms step_avg:171.03ms step:1115/1530 train_loss:3.2400 train_time:188991ms step_avg:171.03ms step:1116/1530 train_loss:3.4229 train_time:189165ms step_avg:171.04ms step:1117/1530 train_loss:3.5845 train_time:189344ms step_avg:171.04ms step:1118/1530 train_loss:3.6194 train_time:189523ms step_avg:171.05ms step:1119/1530 train_loss:3.4743 train_time:189697ms step_avg:171.05ms step:1120/1530 train_loss:3.4863 train_time:189873ms step_avg:171.06ms step:1121/1530 train_loss:3.3904 train_time:190050ms step_avg:171.06ms step:1122/1530 train_loss:3.4557 train_time:190227ms step_avg:171.07ms step:1123/1530 train_loss:3.5767 train_time:190403ms step_avg:171.07ms step:1124/1530 train_loss:3.3364 train_time:190578ms step_avg:171.08ms step:1125/1530 train_loss:3.2241 train_time:190754ms step_avg:171.08ms step:1125/1530 val_loss:3.4073 train_time:190804ms step_avg:171.12ms step:1126/1530 train_loss:3.4717 train_time:190932ms step_avg:171.09ms step:1127/1530 train_loss:3.6694 train_time:191112ms step_avg:171.09ms step:1128/1530 train_loss:3.2267 train_time:191288ms step_avg:171.10ms step:1129/1530 train_loss:3.5555 train_time:191466ms step_avg:171.10ms step:1130/1530 train_loss:3.3776 train_time:191644ms step_avg:171.11ms step:1131/1530 train_loss:3.3936 train_time:191825ms step_avg:171.12ms step:1132/1530 train_loss:3.3648 train_time:191998ms step_avg:171.12ms step:1133/1530 train_loss:3.4873 train_time:192309ms step_avg:171.25ms step:1134/1530 train_loss:3.4482 train_time:192494ms step_avg:171.26ms step:1135/1530 train_loss:3.5161 train_time:192670ms step_avg:171.26ms step:1136/1530 train_loss:3.5635 train_time:192848ms step_avg:171.27ms step:1137/1530 train_loss:3.4532 train_time:193027ms step_avg:171.27ms step:1138/1530 train_loss:3.3514 train_time:193205ms step_avg:171.28ms step:1139/1530 train_loss:3.6489 train_time:193533ms step_avg:171.42ms step:1140/1530 train_loss:3.4543 train_time:193708ms step_avg:171.42ms step:1141/1530 train_loss:3.5944 train_time:193889ms step_avg:171.43ms step:1142/1530 train_loss:3.4434 train_time:194066ms step_avg:171.44ms step:1143/1530 train_loss:3.3596 train_time:194244ms step_avg:171.44ms step:1144/1530 train_loss:3.4426 train_time:194422ms step_avg:171.45ms step:1145/1530 train_loss:3.5854 train_time:194596ms step_avg:171.45ms step:1146/1530 train_loss:3.5501 train_time:194778ms step_avg:171.46ms step:1147/1530 train_loss:3.4953 train_time:194957ms step_avg:171.47ms step:1148/1530 train_loss:3.4921 train_time:195136ms step_avg:171.47ms step:1149/1530 train_loss:3.3215 train_time:195317ms step_avg:171.48ms step:1150/1530 train_loss:3.3698 train_time:195492ms step_avg:171.48ms step:1151/1530 train_loss:3.3137 train_time:195671ms step_avg:171.49ms step:1152/1530 train_loss:3.3905 train_time:195852ms step_avg:171.50ms step:1153/1530 train_loss:3.4276 train_time:196034ms step_avg:171.51ms step:1154/1530 train_loss:3.5161 train_time:196210ms step_avg:171.51ms step:1155/1530 train_loss:3.3201 train_time:196394ms step_avg:171.52ms step:1156/1530 train_loss:3.5331 train_time:196577ms step_avg:171.53ms step:1157/1530 train_loss:3.4904 train_time:196756ms step_avg:171.54ms step:1158/1530 train_loss:3.2472 train_time:196933ms step_avg:171.54ms step:1159/1530 train_loss:3.3452 train_time:197108ms step_avg:171.55ms step:1160/1530 train_loss:3.3361 train_time:197282ms step_avg:171.55ms step:1161/1530 train_loss:3.0786 train_time:197462ms step_avg:171.56ms step:1162/1530 train_loss:3.4176 train_time:197639ms step_avg:171.56ms step:1163/1530 train_loss:3.3889 train_time:197818ms step_avg:171.57ms step:1164/1530 train_loss:3.2911 train_time:197996ms step_avg:171.57ms step:1165/1530 train_loss:3.2459 train_time:198171ms step_avg:171.58ms step:1166/1530 train_loss:3.3860 train_time:198352ms step_avg:171.58ms step:1167/1530 train_loss:3.4070 train_time:198528ms step_avg:171.59ms step:1168/1530 train_loss:3.7163 train_time:198702ms step_avg:171.59ms step:1169/1530 train_loss:3.3706 train_time:198879ms step_avg:171.60ms step:1170/1530 train_loss:3.3829 train_time:199056ms step_avg:171.60ms step:1171/1530 train_loss:3.2954 train_time:199233ms step_avg:171.60ms step:1172/1530 train_loss:3.4227 train_time:199408ms step_avg:171.61ms step:1173/1530 train_loss:3.5389 train_time:199588ms step_avg:171.61ms step:1174/1530 train_loss:3.3746 train_time:199772ms step_avg:171.63ms step:1175/1530 train_loss:3.3617 train_time:199952ms step_avg:171.63ms step:1176/1530 train_loss:3.4211 train_time:200134ms step_avg:171.64ms step:1177/1530 train_loss:3.4437 train_time:200318ms step_avg:171.65ms step:1178/1530 train_loss:3.4925 train_time:200495ms step_avg:171.66ms step:1179/1530 train_loss:3.3970 train_time:200670ms step_avg:171.66ms step:1180/1530 train_loss:3.3518 train_time:200858ms step_avg:171.67ms step:1181/1530 train_loss:3.3366 train_time:201036ms step_avg:171.68ms step:1182/1530 train_loss:3.3661 train_time:201214ms step_avg:171.68ms step:1183/1530 train_loss:3.3297 train_time:201391ms step_avg:171.69ms step:1184/1530 train_loss:3.5086 train_time:201568ms step_avg:171.69ms step:1185/1530 train_loss:3.5393 train_time:201749ms step_avg:171.70ms step:1186/1530 train_loss:3.3608 train_time:201928ms step_avg:171.71ms step:1187/1530 train_loss:3.4132 train_time:202114ms step_avg:171.72ms step:1188/1530 train_loss:3.4415 train_time:202290ms step_avg:171.72ms step:1189/1530 train_loss:3.2759 train_time:202471ms step_avg:171.73ms step:1190/1530 train_loss:3.4420 train_time:202649ms step_avg:171.74ms step:1191/1530 train_loss:3.5748 train_time:202832ms step_avg:171.75ms step:1192/1530 train_loss:3.3886 train_time:203008ms step_avg:171.75ms step:1193/1530 train_loss:3.2706 train_time:203182ms step_avg:171.75ms step:1194/1530 train_loss:3.5543 train_time:203358ms step_avg:171.76ms step:1195/1530 train_loss:3.3679 train_time:203540ms step_avg:171.76ms step:1196/1530 train_loss:3.3836 train_time:203726ms step_avg:171.78ms step:1197/1530 train_loss:3.2888 train_time:203905ms step_avg:171.78ms step:1198/1530 train_loss:3.3007 train_time:204090ms step_avg:171.79ms step:1199/1530 train_loss:3.3399 train_time:204268ms step_avg:171.80ms step:1200/1530 train_loss:3.4443 train_time:204445ms step_avg:171.80ms step:1201/1530 train_loss:3.4786 train_time:204623ms step_avg:171.81ms step:1202/1530 train_loss:3.5953 train_time:204813ms step_avg:171.82ms step:1203/1530 train_loss:3.3999 train_time:204993ms step_avg:171.83ms step:1204/1530 train_loss:3.3029 train_time:205176ms step_avg:171.84ms step:1205/1530 train_loss:3.4371 train_time:205352ms step_avg:171.84ms step:1206/1530 train_loss:3.4720 train_time:205528ms step_avg:171.85ms step:1207/1530 train_loss:3.5110 train_time:205705ms step_avg:171.85ms step:1208/1530 train_loss:3.3922 train_time:205880ms step_avg:171.85ms step:1209/1530 train_loss:3.2429 train_time:206060ms step_avg:171.86ms step:1210/1530 train_loss:3.3021 train_time:206239ms step_avg:171.87ms step:1211/1530 train_loss:3.3890 train_time:206415ms step_avg:171.87ms step:1212/1530 train_loss:3.3914 train_time:206593ms step_avg:171.87ms step:1213/1530 train_loss:3.4063 train_time:206772ms step_avg:171.88ms step:1214/1530 train_loss:3.2479 train_time:206953ms step_avg:171.89ms step:1215/1530 train_loss:3.3923 train_time:207130ms step_avg:171.89ms step:1216/1530 train_loss:3.3278 train_time:207306ms step_avg:171.90ms step:1217/1530 train_loss:3.3232 train_time:207483ms step_avg:171.90ms step:1218/1530 train_loss:3.4024 train_time:207661ms step_avg:171.91ms step:1219/1530 train_loss:3.2547 train_time:207845ms step_avg:171.91ms step:1220/1530 train_loss:3.4691 train_time:208022ms step_avg:171.92ms step:1221/1530 train_loss:3.5001 train_time:208198ms step_avg:171.92ms step:1222/1530 train_loss:3.4280 train_time:208374ms step_avg:171.93ms step:1223/1530 train_loss:3.2914 train_time:208551ms step_avg:171.93ms step:1224/1530 train_loss:3.2513 train_time:208734ms step_avg:171.94ms step:1225/1530 train_loss:3.3649 train_time:208913ms step_avg:171.95ms step:1226/1530 train_loss:3.3333 train_time:209094ms step_avg:171.95ms step:1227/1530 train_loss:3.2731 train_time:209273ms step_avg:171.96ms step:1228/1530 train_loss:3.4416 train_time:209448ms step_avg:171.96ms step:1229/1530 train_loss:3.3668 train_time:209628ms step_avg:171.97ms step:1230/1530 train_loss:3.3994 train_time:209810ms step_avg:171.98ms step:1231/1530 train_loss:3.5731 train_time:209990ms step_avg:171.98ms step:1232/1530 train_loss:3.4970 train_time:210168ms step_avg:171.99ms step:1233/1530 train_loss:3.4257 train_time:210343ms step_avg:171.99ms step:1234/1530 train_loss:3.5813 train_time:210522ms step_avg:171.99ms step:1235/1530 train_loss:3.3208 train_time:210701ms step_avg:172.00ms step:1236/1530 train_loss:3.2872 train_time:210879ms step_avg:172.01ms step:1237/1530 train_loss:3.2684 train_time:211057ms step_avg:172.01ms step:1238/1530 train_loss:3.2776 train_time:211240ms step_avg:172.02ms step:1239/1530 train_loss:3.3307 train_time:211419ms step_avg:172.03ms step:1240/1530 train_loss:3.3825 train_time:211596ms step_avg:172.03ms step:1241/1530 train_loss:3.4241 train_time:211774ms step_avg:172.03ms step:1242/1530 train_loss:3.2940 train_time:211951ms step_avg:172.04ms step:1243/1530 train_loss:3.4012 train_time:212130ms step_avg:172.04ms step:1244/1530 train_loss:3.4061 train_time:212303ms step_avg:172.04ms step:1245/1530 train_loss:3.4086 train_time:212479ms step_avg:172.05ms step:1246/1530 train_loss:3.2419 train_time:212657ms step_avg:172.05ms step:1247/1530 train_loss:3.3683 train_time:212834ms step_avg:172.06ms step:1248/1530 train_loss:3.4263 train_time:213009ms step_avg:172.06ms step:1249/1530 train_loss:3.4256 train_time:213189ms step_avg:172.07ms step:1250/1530 train_loss:3.3020 train_time:213366ms step_avg:172.07ms step:1250/1530 val_loss:3.3537 train_time:213421ms step_avg:172.11ms step:1251/1530 train_loss:3.4922 train_time:213551ms step_avg:172.08ms step:1252/1530 train_loss:3.3560 train_time:213728ms step_avg:172.08ms step:1253/1530 train_loss:3.3077 train_time:213906ms step_avg:172.09ms step:1254/1530 train_loss:3.4095 train_time:214085ms step_avg:172.09ms step:1255/1530 train_loss:3.5185 train_time:214277ms step_avg:172.11ms step:1256/1530 train_loss:3.3048 train_time:214457ms step_avg:172.12ms step:1257/1530 train_loss:3.3755 train_time:214636ms step_avg:172.12ms step:1258/1530 train_loss:3.3648 train_time:214820ms step_avg:172.13ms step:1259/1530 train_loss:3.3250 train_time:214998ms step_avg:172.14ms step:1260/1530 train_loss:3.2063 train_time:215174ms step_avg:172.14ms step:1261/1530 train_loss:3.3015 train_time:215355ms step_avg:172.15ms step:1262/1530 train_loss:3.3281 train_time:215538ms step_avg:172.16ms step:1263/1530 train_loss:3.2356 train_time:215720ms step_avg:172.16ms step:1264/1530 train_loss:3.4412 train_time:215894ms step_avg:172.16ms step:1265/1530 train_loss:3.4211 train_time:216070ms step_avg:172.17ms step:1266/1530 train_loss:3.4344 train_time:216250ms step_avg:172.17ms step:1267/1530 train_loss:3.3698 train_time:216430ms step_avg:172.18ms step:1268/1530 train_loss:3.4082 train_time:216610ms step_avg:172.19ms step:1269/1530 train_loss:3.2508 train_time:216794ms step_avg:172.20ms step:1270/1530 train_loss:3.1022 train_time:216971ms step_avg:172.20ms step:1271/1530 train_loss:3.4035 train_time:217149ms step_avg:172.20ms step:1272/1530 train_loss:3.3472 train_time:217325ms step_avg:172.21ms step:1273/1530 train_loss:3.3750 train_time:217504ms step_avg:172.21ms step:1274/1530 train_loss:3.3571 train_time:217684ms step_avg:172.22ms step:1275/1530 train_loss:3.4318 train_time:217859ms step_avg:172.22ms step:1276/1530 train_loss:3.4680 train_time:218033ms step_avg:172.22ms step:1277/1530 train_loss:3.4090 train_time:218212ms step_avg:172.23ms step:1278/1530 train_loss:3.4046 train_time:218387ms step_avg:172.23ms step:1279/1530 train_loss:3.2651 train_time:218568ms step_avg:172.24ms step:1280/1530 train_loss:3.3589 train_time:218755ms step_avg:172.25ms step:1281/1530 train_loss:3.4200 train_time:218932ms step_avg:172.25ms step:1282/1530 train_loss:3.4660 train_time:219108ms step_avg:172.25ms step:1283/1530 train_loss:3.3323 train_time:219287ms step_avg:172.26ms step:1284/1530 train_loss:3.3654 train_time:219464ms step_avg:172.26ms step:1285/1530 train_loss:3.3613 train_time:219642ms step_avg:172.27ms step:1286/1530 train_loss:3.3325 train_time:219816ms step_avg:172.27ms step:1287/1530 train_loss:3.4901 train_time:219994ms step_avg:172.27ms step:1288/1530 train_loss:3.2951 train_time:220173ms step_avg:172.28ms step:1289/1530 train_loss:3.3776 train_time:220359ms step_avg:172.29ms step:1290/1530 train_loss:3.4548 train_time:220542ms step_avg:172.30ms step:1291/1530 train_loss:3.3808 train_time:220721ms step_avg:172.30ms step:1292/1530 train_loss:3.4768 train_time:220903ms step_avg:172.31ms step:1293/1530 train_loss:3.5152 train_time:221083ms step_avg:172.32ms step:1294/1530 train_loss:3.4529 train_time:221263ms step_avg:172.32ms step:1295/1530 train_loss:3.2813 train_time:221442ms step_avg:172.33ms step:1296/1530 train_loss:3.3731 train_time:221624ms step_avg:172.34ms step:1297/1530 train_loss:3.2703 train_time:221803ms step_avg:172.34ms step:1298/1530 train_loss:3.2697 train_time:221984ms step_avg:172.35ms step:1299/1530 train_loss:3.3978 train_time:222162ms step_avg:172.35ms step:1300/1530 train_loss:3.4017 train_time:222337ms step_avg:172.35ms step:1301/1530 train_loss:3.4041 train_time:222514ms step_avg:172.36ms step:1302/1530 train_loss:3.5737 train_time:222696ms step_avg:172.37ms step:1303/1530 train_loss:3.3070 train_time:222877ms step_avg:172.37ms step:1304/1530 train_loss:3.5137 train_time:223057ms step_avg:172.38ms step:1305/1530 train_loss:3.2551 train_time:223233ms step_avg:172.38ms step:1306/1530 train_loss:3.4502 train_time:223415ms step_avg:172.39ms step:1307/1530 train_loss:3.4510 train_time:223589ms step_avg:172.39ms step:1308/1530 train_loss:3.2861 train_time:223767ms step_avg:172.39ms step:1309/1530 train_loss:3.3108 train_time:223946ms step_avg:172.40ms step:1310/1530 train_loss:3.2847 train_time:224125ms step_avg:172.40ms step:1311/1530 train_loss:3.2952 train_time:224301ms step_avg:172.41ms step:1312/1530 train_loss:3.3712 train_time:224481ms step_avg:172.41ms step:1313/1530 train_loss:3.3412 train_time:224656ms step_avg:172.41ms step:1314/1530 train_loss:3.0456 train_time:224839ms step_avg:172.42ms step:1315/1530 train_loss:3.2728 train_time:225016ms step_avg:172.43ms step:1316/1530 train_loss:3.3943 train_time:225191ms step_avg:172.43ms step:1317/1530 train_loss:3.4232 train_time:225370ms step_avg:172.43ms step:1318/1530 train_loss:3.2982 train_time:225556ms step_avg:172.44ms step:1319/1530 train_loss:3.4246 train_time:225736ms step_avg:172.45ms step:1320/1530 train_loss:3.4585 train_time:225918ms step_avg:172.46ms step:1321/1530 train_loss:3.3608 train_time:226096ms step_avg:172.46ms step:1322/1530 train_loss:3.3195 train_time:226413ms step_avg:172.57ms step:1323/1530 train_loss:3.3222 train_time:226605ms step_avg:172.59ms step:1324/1530 train_loss:3.4361 train_time:226784ms step_avg:172.59ms step:1325/1530 train_loss:3.4895 train_time:226968ms step_avg:172.60ms step:1326/1530 train_loss:3.2109 train_time:227150ms step_avg:172.61ms step:1327/1530 train_loss:3.1604 train_time:227328ms step_avg:172.61ms step:1328/1530 train_loss:3.4910 train_time:227507ms step_avg:172.62ms step:1329/1530 train_loss:3.2946 train_time:227851ms step_avg:172.75ms step:1330/1530 train_loss:3.4233 train_time:228031ms step_avg:172.75ms step:1331/1530 train_loss:3.3248 train_time:228207ms step_avg:172.75ms step:1332/1530 train_loss:3.7367 train_time:228389ms step_avg:172.76ms step:1333/1530 train_loss:3.4725 train_time:228570ms step_avg:172.77ms step:1334/1530 train_loss:3.3714 train_time:228749ms step_avg:172.77ms step:1335/1530 train_loss:3.2863 train_time:228929ms step_avg:172.78ms step:1336/1530 train_loss:3.2926 train_time:229114ms step_avg:172.79ms step:1337/1530 train_loss:3.5456 train_time:229294ms step_avg:172.79ms step:1338/1530 train_loss:3.5236 train_time:229473ms step_avg:172.80ms step:1339/1530 train_loss:3.3374 train_time:229653ms step_avg:172.80ms step:1340/1530 train_loss:3.2849 train_time:229832ms step_avg:172.81ms step:1341/1530 train_loss:3.5958 train_time:230010ms step_avg:172.81ms step:1342/1530 train_loss:3.3557 train_time:230191ms step_avg:172.82ms step:1343/1530 train_loss:3.3679 train_time:230370ms step_avg:172.82ms step:1344/1530 train_loss:3.4189 train_time:230552ms step_avg:172.83ms step:1345/1530 train_loss:3.3854 train_time:230734ms step_avg:172.83ms step:1346/1530 train_loss:3.2934 train_time:230911ms step_avg:172.84ms step:1347/1530 train_loss:3.2786 train_time:231089ms step_avg:172.84ms step:1348/1530 train_loss:3.3453 train_time:231267ms step_avg:172.84ms step:1349/1530 train_loss:3.2733 train_time:231443ms step_avg:172.85ms step:1350/1530 train_loss:3.3914 train_time:231624ms step_avg:172.85ms step:1351/1530 train_loss:3.2440 train_time:231801ms step_avg:172.86ms step:1352/1530 train_loss:3.3055 train_time:231977ms step_avg:172.86ms step:1353/1530 train_loss:3.3963 train_time:232157ms step_avg:172.86ms step:1354/1530 train_loss:3.2571 train_time:232335ms step_avg:172.87ms step:1355/1530 train_loss:3.1896 train_time:232513ms step_avg:172.87ms step:1356/1530 train_loss:3.5120 train_time:232694ms step_avg:172.88ms step:1357/1530 train_loss:3.4242 train_time:232874ms step_avg:172.88ms step:1358/1530 train_loss:3.1857 train_time:233053ms step_avg:172.89ms step:1359/1530 train_loss:3.4368 train_time:233233ms step_avg:172.89ms step:1360/1530 train_loss:3.3466 train_time:233412ms step_avg:172.90ms step:1361/1530 train_loss:3.1245 train_time:233598ms step_avg:172.91ms step:1362/1530 train_loss:3.3845 train_time:233779ms step_avg:172.91ms step:1363/1530 train_loss:3.2783 train_time:233967ms step_avg:172.92ms step:1364/1530 train_loss:3.2977 train_time:234145ms step_avg:172.93ms step:1365/1530 train_loss:3.3138 train_time:234323ms step_avg:172.93ms step:1366/1530 train_loss:3.4207 train_time:234505ms step_avg:172.94ms step:1367/1530 train_loss:3.3961 train_time:234685ms step_avg:172.94ms step:1368/1530 train_loss:3.3448 train_time:234866ms step_avg:172.95ms step:1369/1530 train_loss:3.2783 train_time:235055ms step_avg:172.96ms step:1370/1530 train_loss:3.6024 train_time:235236ms step_avg:172.97ms step:1371/1530 train_loss:3.3120 train_time:235418ms step_avg:172.97ms step:1372/1530 train_loss:3.3703 train_time:235600ms step_avg:172.98ms step:1373/1530 train_loss:3.3677 train_time:235779ms step_avg:172.99ms step:1374/1530 train_loss:3.1489 train_time:235961ms step_avg:172.99ms step:1375/1530 train_loss:3.5314 train_time:236139ms step_avg:173.00ms step:1375/1530 val_loss:3.3107 train_time:236189ms step_avg:173.03ms step:1376/1530 train_loss:3.3490 train_time:236318ms step_avg:173.00ms step:1377/1530 train_loss:3.4844 train_time:236496ms step_avg:173.00ms step:1378/1530 train_loss:3.4719 train_time:236672ms step_avg:173.01ms step:1379/1530 train_loss:3.1205 train_time:236854ms step_avg:173.01ms step:1380/1530 train_loss:3.3134 train_time:237033ms step_avg:173.02ms step:1381/1530 train_loss:3.6976 train_time:237217ms step_avg:173.02ms step:1382/1530 train_loss:3.2098 train_time:237395ms step_avg:173.03ms step:1383/1530 train_loss:3.3930 train_time:237577ms step_avg:173.03ms step:1384/1530 train_loss:3.4782 train_time:237761ms step_avg:173.04ms step:1385/1530 train_loss:3.4068 train_time:237935ms step_avg:173.04ms step:1386/1530 train_loss:3.3432 train_time:238114ms step_avg:173.05ms step:1387/1530 train_loss:3.1979 train_time:238294ms step_avg:173.05ms step:1388/1530 train_loss:3.3479 train_time:238470ms step_avg:173.06ms step:1389/1530 train_loss:3.3128 train_time:238653ms step_avg:173.06ms step:1390/1530 train_loss:3.5672 train_time:238830ms step_avg:173.06ms step:1391/1530 train_loss:3.2879 train_time:239008ms step_avg:173.07ms step:1392/1530 train_loss:3.2890 train_time:239187ms step_avg:173.07ms step:1393/1530 train_loss:3.2400 train_time:239367ms step_avg:173.08ms step:1394/1530 train_loss:3.4933 train_time:239544ms step_avg:173.08ms step:1395/1530 train_loss:3.3912 train_time:239723ms step_avg:173.09ms step:1396/1530 train_loss:3.4056 train_time:239901ms step_avg:173.09ms step:1397/1530 train_loss:3.3078 train_time:240077ms step_avg:173.09ms step:1398/1530 train_loss:3.2563 train_time:240253ms step_avg:173.09ms step:1399/1530 train_loss:3.3159 train_time:240432ms step_avg:173.10ms step:1400/1530 train_loss:3.3187 train_time:240615ms step_avg:173.10ms step:1401/1530 train_loss:3.3529 train_time:240791ms step_avg:173.11ms step:1402/1530 train_loss:3.2971 train_time:240969ms step_avg:173.11ms step:1403/1530 train_loss:3.4922 train_time:241154ms step_avg:173.12ms step:1404/1530 train_loss:3.2821 train_time:241331ms step_avg:173.12ms step:1405/1530 train_loss:3.3161 train_time:241511ms step_avg:173.13ms step:1406/1530 train_loss:3.3107 train_time:241690ms step_avg:173.13ms step:1407/1530 train_loss:3.1736 train_time:241866ms step_avg:173.13ms step:1408/1530 train_loss:3.3096 train_time:242045ms step_avg:173.14ms step:1409/1530 train_loss:3.2957 train_time:242232ms step_avg:173.15ms step:1410/1530 train_loss:3.2855 train_time:242409ms step_avg:173.15ms step:1411/1530 train_loss:3.3644 train_time:242585ms step_avg:173.15ms step:1412/1530 train_loss:3.3318 train_time:242763ms step_avg:173.15ms step:1413/1530 train_loss:3.3588 train_time:242942ms step_avg:173.16ms step:1414/1530 train_loss:3.3261 train_time:243123ms step_avg:173.16ms step:1415/1530 train_loss:3.4066 train_time:243307ms step_avg:173.17ms step:1416/1530 train_loss:3.2301 train_time:243494ms step_avg:173.18ms step:1417/1530 train_loss:3.2816 train_time:243676ms step_avg:173.19ms step:1418/1530 train_loss:3.3893 train_time:243857ms step_avg:173.19ms step:1419/1530 train_loss:3.3406 train_time:244039ms step_avg:173.20ms step:1420/1530 train_loss:3.3674 train_time:244220ms step_avg:173.21ms step:1421/1530 train_loss:3.3664 train_time:244400ms step_avg:173.21ms step:1422/1530 train_loss:3.3323 train_time:244577ms step_avg:173.21ms step:1423/1530 train_loss:3.3179 train_time:244755ms step_avg:173.22ms step:1424/1530 train_loss:3.3339 train_time:244937ms step_avg:173.22ms step:1425/1530 train_loss:3.1864 train_time:245124ms step_avg:173.23ms step:1426/1530 train_loss:3.3199 train_time:245303ms step_avg:173.24ms step:1427/1530 train_loss:3.2856 train_time:245486ms step_avg:173.24ms step:1428/1530 train_loss:3.3711 train_time:245665ms step_avg:173.25ms step:1429/1530 train_loss:3.3550 train_time:245842ms step_avg:173.25ms step:1430/1530 train_loss:3.2571 train_time:246025ms step_avg:173.26ms step:1431/1530 train_loss:3.3205 train_time:246206ms step_avg:173.26ms step:1432/1530 train_loss:3.3313 train_time:246387ms step_avg:173.27ms step:1433/1530 train_loss:3.1297 train_time:246569ms step_avg:173.27ms step:1434/1530 train_loss:3.2849 train_time:246754ms step_avg:173.28ms step:1435/1530 train_loss:3.1139 train_time:246933ms step_avg:173.29ms step:1436/1530 train_loss:3.2324 train_time:247113ms step_avg:173.29ms step:1437/1530 train_loss:3.4068 train_time:247290ms step_avg:173.29ms step:1438/1530 train_loss:3.3799 train_time:247468ms step_avg:173.30ms step:1439/1530 train_loss:3.3142 train_time:247649ms step_avg:173.30ms step:1440/1530 train_loss:3.1909 train_time:247825ms step_avg:173.30ms step:1441/1530 train_loss:3.3343 train_time:248003ms step_avg:173.31ms step:1442/1530 train_loss:3.3841 train_time:248188ms step_avg:173.32ms step:1443/1530 train_loss:3.4862 train_time:248374ms step_avg:173.32ms step:1444/1530 train_loss:3.4472 train_time:248551ms step_avg:173.33ms step:1445/1530 train_loss:3.3336 train_time:248730ms step_avg:173.33ms step:1446/1530 train_loss:3.1978 train_time:248910ms step_avg:173.34ms step:1447/1530 train_loss:3.2954 train_time:249090ms step_avg:173.34ms step:1448/1530 train_loss:3.2962 train_time:249268ms step_avg:173.34ms step:1449/1530 train_loss:3.3937 train_time:249447ms step_avg:173.35ms step:1450/1530 train_loss:3.3845 train_time:249627ms step_avg:173.35ms step:1451/1530 train_loss:3.2078 train_time:249806ms step_avg:173.36ms step:1452/1530 train_loss:3.3252 train_time:249986ms step_avg:173.36ms step:1453/1530 train_loss:3.2586 train_time:250163ms step_avg:173.36ms step:1454/1530 train_loss:3.2890 train_time:250341ms step_avg:173.37ms step:1455/1530 train_loss:3.3286 train_time:250525ms step_avg:173.37ms step:1456/1530 train_loss:3.2809 train_time:250702ms step_avg:173.38ms step:1457/1530 train_loss:3.1526 train_time:250878ms step_avg:173.38ms step:1458/1530 train_loss:3.4218 train_time:251054ms step_avg:173.38ms step:1459/1530 train_loss:3.2653 train_time:251235ms step_avg:173.39ms step:1460/1530 train_loss:3.3138 train_time:251415ms step_avg:173.39ms step:1461/1530 train_loss:3.4328 train_time:251595ms step_avg:173.39ms step:1462/1530 train_loss:3.2628 train_time:251770ms step_avg:173.40ms step:1463/1530 train_loss:3.4642 train_time:251953ms step_avg:173.40ms step:1464/1530 train_loss:3.3581 train_time:252132ms step_avg:173.41ms step:1465/1530 train_loss:3.3557 train_time:252313ms step_avg:173.41ms step:1466/1530 train_loss:3.2850 train_time:252489ms step_avg:173.41ms step:1467/1530 train_loss:3.3903 train_time:252669ms step_avg:173.42ms step:1468/1530 train_loss:3.2848 train_time:252846ms step_avg:173.42ms step:1469/1530 train_loss:3.2729 train_time:253027ms step_avg:173.42ms step:1470/1530 train_loss:3.3296 train_time:253210ms step_avg:173.43ms step:1471/1530 train_loss:3.2552 train_time:253393ms step_avg:173.44ms step:1472/1530 train_loss:3.2450 train_time:253574ms step_avg:173.44ms step:1473/1530 train_loss:3.4406 train_time:253751ms step_avg:173.45ms step:1474/1530 train_loss:3.3145 train_time:253933ms step_avg:173.45ms step:1475/1530 train_loss:3.1480 train_time:254120ms step_avg:173.46ms step:1476/1530 train_loss:3.2645 train_time:254298ms step_avg:173.46ms step:1477/1530 train_loss:3.2370 train_time:254486ms step_avg:173.47ms step:1478/1530 train_loss:3.3058 train_time:254671ms step_avg:173.48ms step:1479/1530 train_loss:3.3948 train_time:254852ms step_avg:173.49ms step:1480/1530 train_loss:3.2697 train_time:255030ms step_avg:173.49ms step:1481/1530 train_loss:3.4546 train_time:255211ms step_avg:173.50ms step:1482/1530 train_loss:3.3646 train_time:255398ms step_avg:173.50ms step:1483/1530 train_loss:3.2781 train_time:255589ms step_avg:173.52ms step:1484/1530 train_loss:3.2641 train_time:255776ms step_avg:173.52ms step:1485/1530 train_loss:3.2750 train_time:255955ms step_avg:173.53ms step:1486/1530 train_loss:3.2261 train_time:256139ms step_avg:173.54ms step:1487/1530 train_loss:3.3420 train_time:256321ms step_avg:173.54ms step:1488/1530 train_loss:3.2474 train_time:256507ms step_avg:173.55ms step:1489/1530 train_loss:3.3148 train_time:256688ms step_avg:173.55ms step:1490/1530 train_loss:3.2507 train_time:256866ms step_avg:173.56ms step:1491/1530 train_loss:3.1594 train_time:257046ms step_avg:173.56ms step:1492/1530 train_loss:3.2675 train_time:257227ms step_avg:173.57ms step:1493/1530 train_loss:3.4346 train_time:257407ms step_avg:173.57ms step:1494/1530 train_loss:3.2987 train_time:257586ms step_avg:173.58ms step:1495/1530 train_loss:3.0325 train_time:257769ms step_avg:173.58ms step:1496/1530 train_loss:3.3584 train_time:257951ms step_avg:173.59ms step:1497/1530 train_loss:3.3122 train_time:258134ms step_avg:173.59ms step:1498/1530 train_loss:3.3448 train_time:258318ms step_avg:173.60ms step:1499/1530 train_loss:3.3077 train_time:258508ms step_avg:173.61ms step:1500/1530 train_loss:3.2955 train_time:258697ms step_avg:173.62ms step:1500/1530 val_loss:3.2788 train_time:258753ms step_avg:173.66ms step:1501/1530 train_loss:3.0852 train_time:258889ms step_avg:173.63ms step:1502/1530 train_loss:3.3604 train_time:259079ms step_avg:173.65ms step:1503/1530 train_loss:3.2437 train_time:259259ms step_avg:173.65ms step:1504/1530 train_loss:3.2468 train_time:259440ms step_avg:173.65ms step:1505/1530 train_loss:3.2166 train_time:259619ms step_avg:173.66ms step:1506/1530 train_loss:3.2810 train_time:259803ms step_avg:173.66ms step:1507/1530 train_loss:3.1788 train_time:259998ms step_avg:173.68ms step:1508/1530 train_loss:3.4797 train_time:260181ms step_avg:173.69ms step:1509/1530 train_loss:3.2812 train_time:260359ms step_avg:173.69ms step:1510/1530 train_loss:3.2699 train_time:260538ms step_avg:173.69ms step:1511/1530 train_loss:3.4126 train_time:260855ms step_avg:173.79ms step:1512/1530 train_loss:3.4171 train_time:261042ms step_avg:173.80ms step:1513/1530 train_loss:3.2680 train_time:261226ms step_avg:173.80ms step:1514/1530 train_loss:3.0835 train_time:261408ms step_avg:173.81ms step:1515/1530 train_loss:3.2405 train_time:261588ms step_avg:173.81ms step:1516/1530 train_loss:3.2539 train_time:261774ms step_avg:173.82ms step:1517/1530 train_loss:3.2983 train_time:261955ms step_avg:173.83ms step:1518/1530 train_loss:3.2067 train_time:262139ms step_avg:173.83ms step:1519/1530 train_loss:3.5014 train_time:262469ms step_avg:173.94ms step:1520/1530 train_loss:3.1233 train_time:262648ms step_avg:173.94ms step:1521/1530 train_loss:3.2058 train_time:262827ms step_avg:173.94ms step:1522/1530 train_loss:3.3582 train_time:263013ms step_avg:173.95ms step:1523/1530 train_loss:3.2327 train_time:263191ms step_avg:173.95ms step:1524/1530 train_loss:3.3469 train_time:263370ms step_avg:173.96ms step:1525/1530 train_loss:3.3321 train_time:263557ms step_avg:173.96ms step:1526/1530 train_loss:3.2743 train_time:263748ms step_avg:173.98ms step:1527/1530 train_loss:3.2887 train_time:263929ms step_avg:173.98ms step:1528/1530 train_loss:3.4076 train_time:264108ms step_avg:173.98ms step:1529/1530 train_loss:3.4076 train_time:264287ms step_avg:173.99ms step:1530/1530 train_loss:3.2374 train_time:264463ms step_avg:173.99ms step:1530/1530 val_loss:3.2763 train_time:264518ms step_avg:174.02ms