import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import glob
import time
import contextlib
import subprocess
from dataclasses import dataclass

import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import flex_attention, create_block_mask
flex_attention = torch.compile(flex_attention, dynamic=False)
create_block_mask = torch.compile(create_block_mask, dynamic=False)

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """Orthogonalize G exactly via SVD, returning U @ V.T.

    `steps` is accepted only so the signature matches the iterative backend;
    it is ignored.
    """
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    # iterate on the wide orientation so that X @ X.T is the smaller Gram matrix
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        super().__init__(params, defaults)

    def step(self):
        """Compute this rank's share of the updates, all-reduce them, and apply them.

        Parameter i of each group is handled by the rank with i % WORLD_SIZE == RANK
        (both read from the environment); every other rank contributes zeros for
        that slot, so the SUM all-reduce yields the full set of updates everywhere.
        """
        # parse the launcher-provided environment once per step rather than once per parameter
        world_size = int(os.environ['WORLD_SIZE'])
        rank = int(os.environ['RANK'])

        for group in self.param_groups:

            lr = group['lr']
            momentum = group['momentum']
            zeropower_backend = zeropower_backends[group['backend']]

            # generate weight updates in distributed fashion
            total_params = sum(p.numel() for p in group['params'])
            updates_flat = torch.zeros(total_params, device='cuda', dtype=torch.bfloat16)
            curr_idx = 0
            for i, p in enumerate(group['params']):
                # luckily this will perfectly distribute a transformer with multiple of 4 layers to 8 GPUs
                if i % world_size == rank:
                    g = p.grad
                    assert g is not None
                    state = self.state[p]
                    if 'momentum_buffer' not in state:
                        state['momentum_buffer'] = torch.zeros_like(g)
                    buf = state['momentum_buffer']
                    buf.mul_(momentum).add_(g)
                    g = g.add(buf, alpha=momentum) if group['nesterov'] else buf
                    g = zeropower_backend(g, steps=group['backend_steps'])
                    g *= max(1, g.size(0)/g.size(1))**0.5
                    updates_flat[curr_idx:curr_idx+p.numel()] = g.flatten()
                # advance on every rank so all ranks agree on each parameter's offset
                curr_idx += p.numel()

            # sync updates across devices. we are not memory-constrained so can do this simple deserialization
            dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM)

            # deserialize and apply updates
            curr_idx = 0
            for p in group['params']:
                g = updates_flat[curr_idx:curr_idx+p.numel()].view_as(p.data).type_as(p.data)
                p.data.add_(g, alpha=-lr)
                curr_idx += p.numel()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype at call time."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): seq is dim 1, the rotated halves are dim 3
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # rebuild the tables only when the sequence length changes
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention with QK-norm, rotary embeddings and a value residual,
    evaluated with FlexAttention under the supplied block mask."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lamb = nn.Parameter(torch.tensor(0.5)) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.n_head, -1)
        k = self.c_k(x).view(B, T, self.n_head, -1)
        v = self.c_v(x).view(B, T, self.n_head, -1)
        # blend the computed values with this layer's token value embedding
        v = (1 - self.lamb) * v + self.lamb * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc   = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """One transformer layer: learned mix of x with the initial embedding x0,
    then pre-norm attention and pre-norm MLP, each added residually."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # initialised to [1, 0], i.e. the block starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections and per-layer token value embeddings.

    forward(idx, target, attn_blocksize) takes 1-D token tensors and returns the
    cross-entropy loss.
    """

    def __init__(self, config):
        super().__init__()

        # U-net design by @brendanh0gan
        self.n_layer = config.n_layer
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # one n_embd-wide slice per layer (was hard-coded to 12 layers)
            vte = nn.Embedding(config.vocab_size, config.n_embd*config.n_layer),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx, target, attn_blocksize):

        # each occurrence of token id 50256 starts a new document id
        docs = (idx == 50256).cumsum(0)
        def document_causal_mask(b, h, q_idx, kv_idx):
            # attend only backwards, within the same document, and within the window
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < attn_blocksize
            return causal_mask & document_mask & window_mask

        S = len(idx)
        block_mask = create_block_mask(document_causal_mask, None, None, S, S, device="cuda", _compile=True)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # split the value embedding into one chunk per layer
        vi = self.transformer.vte(idx[None]).chunk(self.n_layer, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers+i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(filename):
    """Read only the header of a shard and return the token count it claims."""
    # only reads the header, returns header data
    with open(filename, "rb") as f:
        # first read the header, which is 256 int32 integers (4 bytes each)
        header = np.frombuffer(f.read(256*4), dtype=np.int32)
    if header[0] != 20240520:
        print("ERROR: magic number mismatch in the data .bin file!")
        print("---> HINT: Are you passing in a correct file with --input_bin?")
        print("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README")
        print("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try")
        # sys.exit rather than the site-provided `exit`, which is not guaranteed to exist
        sys.exit(1)
    assert header[1] == 1, "unsupported version"
    ntok = header[2] # number of tokens (claimed)
    return ntok # for now just return the number of tokens

def _load_data_shard(filename):
    """Read a whole shard and return its tokens as a uint16 numpy array."""
    with open(filename, "rb") as f:
        # first read the header, which is 256 int32 integers (4 bytes each)
        header = np.frombuffer(f.read(256*4), dtype=np.int32)
        assert header[0] == 20240520, "magic number mismatch in the data .bin file"
        assert header[1] == 1, "unsupported version"
        ntok = header[2] # number of tokens (claimed)
        # the rest of it are tokens, stored as uint16
        tokens = np.frombuffer(f.read(), dtype=np.uint16)
    assert len(tokens) == ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Serves (input, target) sequences of length T from sharded token files.

    Each process reads its own T-token slice of every global batch of
    T * num_processes tokens, moving to the next shard (cyclically) when the
    current one cannot supply another full global batch.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(glob.glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        ntok_total = 0
        for fname in self.files:
            shard_ntok = _peek_data_shard(fname)
            assert shard_ntok >= num_processes * T + 1
            ntok_total += int(shard_ntok)
        self.ntok_total = ntok_total

        self.reset()

    def reset(self):
        # -1 so that advance() lands on shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard])

    def next_batch(self):
        batch_size = self.T * self.num_processes
        # T+1 tokens so that inputs and targets can be offset by one
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        buf = torch.tensor(buf.astype(np.int32), dtype=torch.long)
        x = buf[:-1] # inputs
        y = buf[1:] # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size >= len(self.tokens):
            self.advance()
        return x.cuda(), y.cuda()

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1530 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """On the master process, append s to the log file and, unless logonly, print it."""
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0 val_steps = args.val_tokens // (T * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files") print0('='*100, logonly=True) x, y = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = 
torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) # note that this learning rate is neither sensitive nor tuned optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.time() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.time() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the attention blocksize for the current step, in chunks of 64. 
By @fernbear.bsky.social attn_blocksize = torch.tensor(64*((step/args.num_iterations * (1792 - 64) + 64)//64), dtype=torch.int, device='cuda') # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.time() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, attn_blocksize=attn_blocksize) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.time() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.time() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.time() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. 
if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps+1): ctx = model.no_sync() if i < train_accumulation_steps else contextlib.nullcontext() with ctx: # there's no need to sync gradients every accumulation step # forward pass loss = model(x, y, attn_blocksize=attn_blocksize) # advance the dataset for the next batch x, y = train_loader.next_batch() # backward pass loss.backward() train_loss = loss.detach() for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) optimizer3.param_groups[0]['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. #dist.all_reduce(train_loss, op=dist.ReduceOp.AVG) # all-reducing the training loss would be more correct in terms of logging, but slower approx_time = training_time_ms + 1000 * (time.time() - t0) print0(f"step:{step+1}/{args.num_iterations} train_loss:{train_loss.item():.4f} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Thu Dec 5 01:48:07 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | 
|-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 39C P0 76W / 700W | 3MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 31C P0 115W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 31C P0 118W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 38C P0 119W / 700W | 529MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 39C P0 123W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 110W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 39C P0 128W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 119W / 700W | 529MiB / 
81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1100000000 across 11 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1530 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1530 train_loss:10.8258 train_time:31578ms step_avg:nanms step:2/1530 train_loss:10.0783 train_time:31690ms step_avg:nanms step:3/1530 train_loss:8.4190 train_time:31847ms step_avg:nanms step:4/1530 train_loss:7.5594 train_time:32008ms step_avg:nanms step:5/1530 train_loss:7.4027 train_time:32167ms step_avg:nanms step:6/1530 train_loss:6.9492 train_time:32327ms step_avg:nanms step:7/1530 train_loss:7.1731 train_time:32486ms step_avg:nanms step:8/1530 train_loss:6.7285 train_time:32647ms step_avg:nanms step:9/1530 train_loss:6.6165 train_time:32808ms step_avg:nanms step:10/1530 train_loss:6.4975 train_time:32967ms step_avg:nanms step:11/1530 train_loss:6.4643 train_time:114ms step_avg:nanms step:12/1530 train_loss:6.3779 train_time:274ms step_avg:nanms step:13/1530 train_loss:6.2153 train_time:433ms step_avg:144.50ms step:14/1530 train_loss:6.2083 train_time:594ms step_avg:148.53ms step:15/1530 train_loss:6.1718 train_time:754ms step_avg:150.74ms step:16/1530 train_loss:6.1098 train_time:914ms step_avg:152.33ms step:17/1530 train_loss:6.1530 train_time:1073ms step_avg:153.31ms step:18/1530 
train_loss:5.9847 train_time:1234ms step_avg:154.23ms step:19/1530 train_loss:5.9773 train_time:1394ms step_avg:154.94ms step:20/1530 train_loss:5.6740 train_time:1554ms step_avg:155.37ms step:21/1530 train_loss:5.9336 train_time:1714ms step_avg:155.83ms step:22/1530 train_loss:6.1639 train_time:1875ms step_avg:156.22ms step:23/1530 train_loss:5.8417 train_time:2035ms step_avg:156.54ms step:24/1530 train_loss:5.9978 train_time:2195ms step_avg:156.76ms step:25/1530 train_loss:5.6720 train_time:2355ms step_avg:156.97ms step:26/1530 train_loss:5.5848 train_time:2516ms step_avg:157.24ms step:27/1530 train_loss:5.7606 train_time:2675ms step_avg:157.38ms step:28/1530 train_loss:5.4180 train_time:2838ms step_avg:157.65ms step:29/1530 train_loss:5.6797 train_time:2998ms step_avg:157.80ms step:30/1530 train_loss:5.4683 train_time:3159ms step_avg:157.93ms step:31/1530 train_loss:5.4235 train_time:3318ms step_avg:158.00ms step:32/1530 train_loss:5.2956 train_time:3479ms step_avg:158.12ms step:33/1530 train_loss:5.5658 train_time:3640ms step_avg:158.25ms step:34/1530 train_loss:5.5028 train_time:3801ms step_avg:158.37ms step:35/1530 train_loss:5.6094 train_time:3962ms step_avg:158.48ms step:36/1530 train_loss:5.5309 train_time:4123ms step_avg:158.58ms step:37/1530 train_loss:5.4436 train_time:4283ms step_avg:158.63ms step:38/1530 train_loss:5.3034 train_time:4444ms step_avg:158.72ms step:39/1530 train_loss:5.3103 train_time:4605ms step_avg:158.79ms step:40/1530 train_loss:5.2652 train_time:4766ms step_avg:158.85ms step:41/1530 train_loss:5.2451 train_time:4926ms step_avg:158.90ms step:42/1530 train_loss:5.1806 train_time:5086ms step_avg:158.94ms step:43/1530 train_loss:5.2625 train_time:5247ms step_avg:159.00ms step:44/1530 train_loss:5.2575 train_time:5407ms step_avg:159.04ms step:45/1530 train_loss:5.4022 train_time:5567ms step_avg:159.06ms step:46/1530 train_loss:5.1702 train_time:5728ms step_avg:159.12ms step:47/1530 train_loss:5.0462 train_time:5888ms step_avg:159.14ms 
step:48/1530 train_loss:5.2122 train_time:6048ms step_avg:159.17ms step:49/1530 train_loss:5.1527 train_time:6209ms step_avg:159.20ms step:50/1530 train_loss:5.2473 train_time:6369ms step_avg:159.21ms step:51/1530 train_loss:5.1330 train_time:6530ms step_avg:159.26ms step:52/1530 train_loss:5.0226 train_time:6690ms step_avg:159.28ms step:53/1530 train_loss:5.1635 train_time:6850ms step_avg:159.30ms step:54/1530 train_loss:5.0090 train_time:7010ms step_avg:159.31ms step:55/1530 train_loss:5.4073 train_time:7169ms step_avg:159.30ms step:56/1530 train_loss:5.0195 train_time:7330ms step_avg:159.35ms step:57/1530 train_loss:4.8762 train_time:7490ms step_avg:159.36ms step:58/1530 train_loss:5.0395 train_time:7650ms step_avg:159.37ms step:59/1530 train_loss:5.0325 train_time:7810ms step_avg:159.38ms step:60/1530 train_loss:5.1755 train_time:7969ms step_avg:159.39ms step:61/1530 train_loss:4.8914 train_time:8130ms step_avg:159.41ms step:62/1530 train_loss:4.9883 train_time:8290ms step_avg:159.42ms step:63/1530 train_loss:4.9841 train_time:8450ms step_avg:159.42ms step:64/1530 train_loss:4.9932 train_time:8611ms step_avg:159.46ms step:65/1530 train_loss:4.8109 train_time:8770ms step_avg:159.45ms step:66/1530 train_loss:4.9080 train_time:8931ms step_avg:159.48ms step:67/1530 train_loss:4.8111 train_time:9091ms step_avg:159.49ms step:68/1530 train_loss:5.0817 train_time:9251ms step_avg:159.49ms step:69/1530 train_loss:4.7293 train_time:9410ms step_avg:159.49ms step:70/1530 train_loss:4.8319 train_time:9570ms step_avg:159.51ms step:71/1530 train_loss:4.9771 train_time:9732ms step_avg:159.54ms step:72/1530 train_loss:4.8779 train_time:9891ms step_avg:159.53ms step:73/1530 train_loss:4.7706 train_time:10052ms step_avg:159.55ms step:74/1530 train_loss:4.9098 train_time:10212ms step_avg:159.56ms step:75/1530 train_loss:4.8701 train_time:10372ms step_avg:159.57ms step:76/1530 train_loss:4.7926 train_time:10533ms step_avg:159.58ms step:77/1530 train_loss:4.9152 train_time:10692ms 
step_avg:159.59ms step:78/1530 train_loss:5.1425 train_time:10852ms step_avg:159.59ms step:79/1530 train_loss:4.8377 train_time:11011ms step_avg:159.58ms step:80/1530 train_loss:4.8597 train_time:11171ms step_avg:159.59ms step:81/1530 train_loss:4.6441 train_time:11331ms step_avg:159.59ms step:82/1530 train_loss:4.8157 train_time:11491ms step_avg:159.60ms step:83/1530 train_loss:4.7682 train_time:11651ms step_avg:159.60ms step:84/1530 train_loss:4.7713 train_time:11811ms step_avg:159.61ms step:85/1530 train_loss:4.6279 train_time:11971ms step_avg:159.62ms step:86/1530 train_loss:4.8265 train_time:12131ms step_avg:159.61ms step:87/1530 train_loss:4.7573 train_time:12292ms step_avg:159.64ms step:88/1530 train_loss:4.7456 train_time:12451ms step_avg:159.63ms step:89/1530 train_loss:4.6972 train_time:12611ms step_avg:159.63ms step:90/1530 train_loss:4.6345 train_time:12771ms step_avg:159.64ms step:91/1530 train_loss:4.6343 train_time:12931ms step_avg:159.64ms step:92/1530 train_loss:4.7935 train_time:13091ms step_avg:159.64ms step:93/1530 train_loss:4.6174 train_time:13250ms step_avg:159.64ms step:94/1530 train_loss:4.6389 train_time:13411ms step_avg:159.66ms step:95/1530 train_loss:4.6693 train_time:13572ms step_avg:159.67ms step:96/1530 train_loss:4.5806 train_time:13732ms step_avg:159.68ms step:97/1530 train_loss:4.6538 train_time:13891ms step_avg:159.67ms step:98/1530 train_loss:4.5914 train_time:14051ms step_avg:159.67ms step:99/1530 train_loss:4.6767 train_time:14212ms step_avg:159.68ms step:100/1530 train_loss:4.6817 train_time:14371ms step_avg:159.68ms step:101/1530 train_loss:4.5536 train_time:14533ms step_avg:159.70ms step:102/1530 train_loss:4.7099 train_time:14694ms step_avg:159.71ms step:103/1530 train_loss:4.5926 train_time:14854ms step_avg:159.72ms step:104/1530 train_loss:4.5431 train_time:15015ms step_avg:159.73ms step:105/1530 train_loss:4.5597 train_time:15175ms step_avg:159.74ms step:106/1530 train_loss:4.6296 train_time:15336ms step_avg:159.75ms 
step:107/1530 train_loss:4.5122 train_time:15495ms step_avg:159.75ms step:108/1530 train_loss:4.3675 train_time:15656ms step_avg:159.75ms step:109/1530 train_loss:4.4882 train_time:15816ms step_avg:159.76ms step:110/1530 train_loss:4.4874 train_time:15976ms step_avg:159.76ms step:111/1530 train_loss:4.4247 train_time:16138ms step_avg:159.78ms step:112/1530 train_loss:4.6019 train_time:16299ms step_avg:159.79ms step:113/1530 train_loss:4.4997 train_time:16460ms step_avg:159.80ms step:114/1530 train_loss:4.3623 train_time:16619ms step_avg:159.80ms step:115/1530 train_loss:4.5054 train_time:16782ms step_avg:159.83ms step:116/1530 train_loss:4.4670 train_time:16946ms step_avg:159.87ms step:117/1530 train_loss:4.3681 train_time:17110ms step_avg:159.91ms step:118/1530 train_loss:4.5927 train_time:17274ms step_avg:159.94ms step:119/1530 train_loss:4.4727 train_time:17438ms step_avg:159.98ms step:120/1530 train_loss:4.3481 train_time:17601ms step_avg:160.01ms step:121/1530 train_loss:4.3076 train_time:17765ms step_avg:160.04ms step:122/1530 train_loss:4.4433 train_time:17928ms step_avg:160.07ms step:123/1530 train_loss:4.2888 train_time:18091ms step_avg:160.10ms step:124/1530 train_loss:4.5913 train_time:18255ms step_avg:160.13ms step:125/1530 train_loss:4.4587 train_time:18419ms step_avg:160.17ms step:125/1530 val_loss:4.4025 train_time:18466ms step_avg:160.58ms step:126/1530 train_loss:4.4221 train_time:18584ms step_avg:160.21ms step:127/1530 train_loss:4.4330 train_time:18749ms step_avg:160.25ms step:128/1530 train_loss:4.3813 train_time:18914ms step_avg:160.29ms step:129/1530 train_loss:4.6868 train_time:19078ms step_avg:160.32ms step:130/1530 train_loss:4.3698 train_time:19242ms step_avg:160.35ms step:131/1530 train_loss:4.3908 train_time:19406ms step_avg:160.38ms step:132/1530 train_loss:4.3487 train_time:19571ms step_avg:160.41ms step:133/1530 train_loss:4.4440 train_time:19734ms step_avg:160.44ms step:134/1530 train_loss:4.2663 train_time:19899ms step_avg:160.47ms 
step:135/1530 train_loss:4.4449 train_time:20062ms step_avg:160.49ms step:136/1530 train_loss:4.2231 train_time:20226ms step_avg:160.52ms step:137/1530 train_loss:4.3785 train_time:20390ms step_avg:160.55ms step:138/1530 train_loss:4.2934 train_time:20555ms step_avg:160.58ms step:139/1530 train_loss:4.3943 train_time:20719ms step_avg:160.62ms step:140/1530 train_loss:4.4720 train_time:20883ms step_avg:160.64ms step:141/1530 train_loss:4.3208 train_time:21047ms step_avg:160.67ms step:142/1530 train_loss:4.3001 train_time:21212ms step_avg:160.70ms step:143/1530 train_loss:4.2648 train_time:21377ms step_avg:160.73ms step:144/1530 train_loss:4.3641 train_time:21541ms step_avg:160.75ms step:145/1530 train_loss:4.3132 train_time:21705ms step_avg:160.78ms step:146/1530 train_loss:4.1730 train_time:21868ms step_avg:160.80ms step:147/1530 train_loss:4.3322 train_time:22031ms step_avg:160.81ms step:148/1530 train_loss:4.3743 train_time:22197ms step_avg:160.85ms step:149/1530 train_loss:4.3191 train_time:22360ms step_avg:160.86ms step:150/1530 train_loss:4.4503 train_time:22524ms step_avg:160.89ms step:151/1530 train_loss:4.2766 train_time:22688ms step_avg:160.91ms step:152/1530 train_loss:4.2829 train_time:22852ms step_avg:160.93ms step:153/1530 train_loss:4.3729 train_time:23016ms step_avg:160.95ms step:154/1530 train_loss:4.3766 train_time:23180ms step_avg:160.97ms step:155/1530 train_loss:4.2718 train_time:23344ms step_avg:160.99ms step:156/1530 train_loss:4.3512 train_time:23507ms step_avg:161.01ms step:157/1530 train_loss:4.4125 train_time:23672ms step_avg:161.04ms step:158/1530 train_loss:4.2441 train_time:23838ms step_avg:161.06ms step:159/1530 train_loss:4.3115 train_time:24001ms step_avg:161.08ms step:160/1530 train_loss:4.1523 train_time:24164ms step_avg:161.10ms step:161/1530 train_loss:4.3589 train_time:24328ms step_avg:161.11ms step:162/1530 train_loss:4.3567 train_time:24491ms step_avg:161.13ms step:163/1530 train_loss:4.3479 train_time:24654ms 
step_avg:161.14ms step:164/1530 train_loss:4.2011 train_time:24818ms step_avg:161.16ms step:165/1530 train_loss:4.2976 train_time:24982ms step_avg:161.17ms step:166/1530 train_loss:4.3525 train_time:25145ms step_avg:161.19ms step:167/1530 train_loss:4.2195 train_time:25309ms step_avg:161.20ms step:168/1530 train_loss:4.2943 train_time:25472ms step_avg:161.21ms step:169/1530 train_loss:4.1631 train_time:25636ms step_avg:161.23ms step:170/1530 train_loss:4.0347 train_time:25800ms step_avg:161.25ms step:171/1530 train_loss:4.2150 train_time:25963ms step_avg:161.26ms step:172/1530 train_loss:4.2236 train_time:26126ms step_avg:161.27ms step:173/1530 train_loss:4.2722 train_time:26289ms step_avg:161.28ms step:174/1530 train_loss:4.4346 train_time:26450ms step_avg:161.28ms step:175/1530 train_loss:4.2508 train_time:26614ms step_avg:161.30ms step:176/1530 train_loss:4.1034 train_time:26777ms step_avg:161.31ms step:177/1530 train_loss:4.0682 train_time:26939ms step_avg:161.31ms step:178/1530 train_loss:4.1890 train_time:27102ms step_avg:161.32ms step:179/1530 train_loss:4.1306 train_time:27265ms step_avg:161.33ms step:180/1530 train_loss:4.1253 train_time:27426ms step_avg:161.33ms step:181/1530 train_loss:4.3076 train_time:27589ms step_avg:161.34ms step:182/1530 train_loss:4.1659 train_time:27752ms step_avg:161.35ms step:183/1530 train_loss:4.1345 train_time:27915ms step_avg:161.36ms step:184/1530 train_loss:4.1271 train_time:28078ms step_avg:161.37ms step:185/1530 train_loss:4.2210 train_time:28240ms step_avg:161.37ms step:186/1530 train_loss:4.1789 train_time:28403ms step_avg:161.38ms step:187/1530 train_loss:4.2468 train_time:28566ms step_avg:161.39ms step:188/1530 train_loss:4.1729 train_time:28861ms step_avg:162.14ms step:189/1530 train_loss:4.1244 train_time:29192ms step_avg:163.08ms step:190/1530 train_loss:4.2172 train_time:29354ms step_avg:163.08ms step:191/1530 train_loss:4.0823 train_time:29518ms step_avg:163.08ms step:192/1530 train_loss:4.0384 
train_time:29681ms step_avg:163.08ms step:193/1530 train_loss:4.2633 train_time:29843ms step_avg:163.08ms step:194/1530 train_loss:4.1826 train_time:30006ms step_avg:163.08ms step:195/1530 train_loss:4.3667 train_time:30170ms step_avg:163.08ms step:196/1530 train_loss:4.1882 train_time:30332ms step_avg:163.07ms step:197/1530 train_loss:4.0451 train_time:30496ms step_avg:163.08ms step:198/1530 train_loss:4.1800 train_time:30658ms step_avg:163.07ms step:199/1530 train_loss:4.0437 train_time:30822ms step_avg:163.08ms step:200/1530 train_loss:4.1212 train_time:30985ms step_avg:163.08ms step:201/1530 train_loss:4.0264 train_time:31145ms step_avg:163.07ms step:202/1530 train_loss:4.2698 train_time:31310ms step_avg:163.07ms step:203/1530 train_loss:4.0742 train_time:31474ms step_avg:163.08ms step:204/1530 train_loss:4.2060 train_time:31638ms step_avg:163.08ms step:205/1530 train_loss:4.2568 train_time:31801ms step_avg:163.08ms step:206/1530 train_loss:3.9508 train_time:31964ms step_avg:163.08ms step:207/1530 train_loss:4.0889 train_time:32126ms step_avg:163.08ms step:208/1530 train_loss:4.1185 train_time:32290ms step_avg:163.08ms step:209/1530 train_loss:4.2559 train_time:32453ms step_avg:163.08ms step:210/1530 train_loss:4.1899 train_time:32616ms step_avg:163.08ms step:211/1530 train_loss:4.0648 train_time:32779ms step_avg:163.08ms step:212/1530 train_loss:4.1296 train_time:32943ms step_avg:163.08ms step:213/1530 train_loss:4.0528 train_time:33106ms step_avg:163.09ms step:214/1530 train_loss:4.1294 train_time:33270ms step_avg:163.09ms step:215/1530 train_loss:3.9653 train_time:33433ms step_avg:163.09ms step:216/1530 train_loss:4.0192 train_time:33595ms step_avg:163.08ms step:217/1530 train_loss:4.0264 train_time:33758ms step_avg:163.08ms step:218/1530 train_loss:4.0926 train_time:33922ms step_avg:163.09ms step:219/1530 train_loss:4.0825 train_time:34085ms step_avg:163.09ms step:220/1530 train_loss:4.0996 train_time:34248ms step_avg:163.09ms step:221/1530 
train_loss:4.1067 train_time:34411ms step_avg:163.09ms step:222/1530 train_loss:4.0170 train_time:34574ms step_avg:163.08ms step:223/1530 train_loss:4.0064 train_time:34737ms step_avg:163.08ms step:224/1530 train_loss:4.3109 train_time:34899ms step_avg:163.08ms step:225/1530 train_loss:3.9422 train_time:35062ms step_avg:163.08ms step:226/1530 train_loss:4.0074 train_time:35225ms step_avg:163.08ms step:227/1530 train_loss:3.9808 train_time:35388ms step_avg:163.08ms step:228/1530 train_loss:4.1573 train_time:35552ms step_avg:163.08ms step:229/1530 train_loss:3.9393 train_time:35719ms step_avg:163.10ms step:230/1530 train_loss:4.0476 train_time:35885ms step_avg:163.11ms step:231/1530 train_loss:3.9179 train_time:36050ms step_avg:163.12ms step:232/1530 train_loss:3.9801 train_time:36217ms step_avg:163.14ms step:233/1530 train_loss:4.0953 train_time:36383ms step_avg:163.15ms step:234/1530 train_loss:4.0359 train_time:36549ms step_avg:163.17ms step:235/1530 train_loss:3.9129 train_time:36717ms step_avg:163.19ms step:236/1530 train_loss:4.0954 train_time:36883ms step_avg:163.20ms step:237/1530 train_loss:4.0975 train_time:37049ms step_avg:163.21ms step:238/1530 train_loss:3.9601 train_time:37217ms step_avg:163.23ms step:239/1530 train_loss:4.0960 train_time:37382ms step_avg:163.24ms step:240/1530 train_loss:4.1244 train_time:37549ms step_avg:163.26ms step:241/1530 train_loss:3.9720 train_time:37716ms step_avg:163.27ms step:242/1530 train_loss:4.1631 train_time:37882ms step_avg:163.28ms step:243/1530 train_loss:4.0292 train_time:38047ms step_avg:163.29ms step:244/1530 train_loss:4.0910 train_time:38213ms step_avg:163.30ms step:245/1530 train_loss:4.1513 train_time:38379ms step_avg:163.31ms step:246/1530 train_loss:4.0655 train_time:38545ms step_avg:163.32ms step:247/1530 train_loss:4.0124 train_time:38713ms step_avg:163.34ms step:248/1530 train_loss:4.1150 train_time:38879ms step_avg:163.36ms step:249/1530 train_loss:3.9328 train_time:39044ms step_avg:163.36ms 
step:250/1530 train_loss:3.9881 train_time:39211ms step_avg:163.38ms step:250/1530 val_loss:4.0198 train_time:39258ms step_avg:163.58ms step:251/1530 train_loss:4.0830 train_time:39380ms step_avg:163.40ms step:252/1530 train_loss:4.1738 train_time:39545ms step_avg:163.41ms step:253/1530 train_loss:3.9360 train_time:39713ms step_avg:163.43ms step:254/1530 train_loss:3.8863 train_time:39879ms step_avg:163.44ms step:255/1530 train_loss:4.0897 train_time:40044ms step_avg:163.44ms step:256/1530 train_loss:4.0069 train_time:40210ms step_avg:163.45ms step:257/1530 train_loss:4.0073 train_time:40377ms step_avg:163.47ms step:258/1530 train_loss:4.0029 train_time:40543ms step_avg:163.48ms step:259/1530 train_loss:4.0399 train_time:40709ms step_avg:163.49ms step:260/1530 train_loss:4.0707 train_time:40877ms step_avg:163.51ms step:261/1530 train_loss:4.0360 train_time:41043ms step_avg:163.52ms step:262/1530 train_loss:4.0128 train_time:41208ms step_avg:163.52ms step:263/1530 train_loss:3.9059 train_time:41375ms step_avg:163.54ms step:264/1530 train_loss:3.9975 train_time:41541ms step_avg:163.55ms step:265/1530 train_loss:3.8903 train_time:41707ms step_avg:163.56ms step:266/1530 train_loss:3.9403 train_time:41872ms step_avg:163.56ms step:267/1530 train_loss:3.9478 train_time:42039ms step_avg:163.57ms step:268/1530 train_loss:3.9710 train_time:42204ms step_avg:163.58ms step:269/1530 train_loss:3.8628 train_time:42370ms step_avg:163.59ms step:270/1530 train_loss:4.1183 train_time:42536ms step_avg:163.60ms step:271/1530 train_loss:3.9837 train_time:42702ms step_avg:163.61ms step:272/1530 train_loss:3.9376 train_time:42868ms step_avg:163.62ms step:273/1530 train_loss:3.9562 train_time:43033ms step_avg:163.63ms step:274/1530 train_loss:4.0564 train_time:43199ms step_avg:163.63ms step:275/1530 train_loss:4.0749 train_time:43365ms step_avg:163.64ms step:276/1530 train_loss:4.2445 train_time:43531ms step_avg:163.65ms step:277/1530 train_loss:4.0538 train_time:43698ms step_avg:163.66ms 
step:278/1530 train_loss:4.1030 train_time:43863ms step_avg:163.67ms step:279/1530 train_loss:4.0123 train_time:44029ms step_avg:163.68ms step:280/1530 train_loss:4.1840 train_time:44197ms step_avg:163.69ms step:281/1530 train_loss:3.9843 train_time:44363ms step_avg:163.70ms step:282/1530 train_loss:3.9596 train_time:44530ms step_avg:163.71ms step:283/1530 train_loss:3.9279 train_time:44695ms step_avg:163.72ms step:284/1530 train_loss:4.0629 train_time:44862ms step_avg:163.73ms step:285/1530 train_loss:4.0720 train_time:45026ms step_avg:163.73ms step:286/1530 train_loss:4.1017 train_time:45192ms step_avg:163.74ms step:287/1530 train_loss:3.9267 train_time:45357ms step_avg:163.74ms step:288/1530 train_loss:4.0256 train_time:45521ms step_avg:163.75ms step:289/1530 train_loss:3.8826 train_time:45687ms step_avg:163.75ms step:290/1530 train_loss:3.8705 train_time:45853ms step_avg:163.76ms step:291/1530 train_loss:3.9308 train_time:46019ms step_avg:163.77ms step:292/1530 train_loss:3.8779 train_time:46183ms step_avg:163.77ms step:293/1530 train_loss:3.9154 train_time:46348ms step_avg:163.77ms step:294/1530 train_loss:3.9439 train_time:46513ms step_avg:163.78ms step:295/1530 train_loss:3.8619 train_time:46679ms step_avg:163.79ms step:296/1530 train_loss:3.8804 train_time:46844ms step_avg:163.79ms step:297/1530 train_loss:3.8773 train_time:47008ms step_avg:163.79ms step:298/1530 train_loss:3.9844 train_time:47174ms step_avg:163.80ms step:299/1530 train_loss:3.8344 train_time:47339ms step_avg:163.80ms step:300/1530 train_loss:3.9781 train_time:47504ms step_avg:163.81ms step:301/1530 train_loss:3.9691 train_time:47670ms step_avg:163.81ms step:302/1530 train_loss:3.9453 train_time:47835ms step_avg:163.82ms step:303/1530 train_loss:3.9941 train_time:48000ms step_avg:163.82ms step:304/1530 train_loss:3.9805 train_time:48165ms step_avg:163.82ms step:305/1530 train_loss:4.4614 train_time:48330ms step_avg:163.83ms step:306/1530 train_loss:3.9543 train_time:48495ms 
step_avg:163.83ms step:307/1530 train_loss:3.8541 train_time:48660ms step_avg:163.84ms step:308/1530 train_loss:3.9953 train_time:48824ms step_avg:163.84ms step:309/1530 train_loss:3.8800 train_time:48988ms step_avg:163.84ms step:310/1530 train_loss:4.0922 train_time:49154ms step_avg:163.85ms step:311/1530 train_loss:3.9373 train_time:49319ms step_avg:163.85ms step:312/1530 train_loss:3.8761 train_time:49484ms step_avg:163.85ms step:313/1530 train_loss:3.9565 train_time:49650ms step_avg:163.86ms step:314/1530 train_loss:4.0754 train_time:49816ms step_avg:163.87ms step:315/1530 train_loss:3.9538 train_time:49981ms step_avg:163.87ms step:316/1530 train_loss:3.8025 train_time:50145ms step_avg:163.87ms step:317/1530 train_loss:3.8953 train_time:50311ms step_avg:163.88ms step:318/1530 train_loss:3.9366 train_time:50477ms step_avg:163.89ms step:319/1530 train_loss:3.9056 train_time:50642ms step_avg:163.89ms step:320/1530 train_loss:4.0258 train_time:50807ms step_avg:163.89ms step:321/1530 train_loss:3.9677 train_time:50974ms step_avg:163.90ms step:322/1530 train_loss:3.9463 train_time:51139ms step_avg:163.91ms step:323/1530 train_loss:4.0192 train_time:51304ms step_avg:163.91ms step:324/1530 train_loss:3.9597 train_time:51470ms step_avg:163.92ms step:325/1530 train_loss:4.0400 train_time:51636ms step_avg:163.92ms step:326/1530 train_loss:3.9079 train_time:51801ms step_avg:163.93ms step:327/1530 train_loss:4.4192 train_time:51967ms step_avg:163.93ms step:328/1530 train_loss:4.0888 train_time:52133ms step_avg:163.94ms step:329/1530 train_loss:3.8131 train_time:52298ms step_avg:163.94ms step:330/1530 train_loss:3.7665 train_time:52463ms step_avg:163.95ms step:331/1530 train_loss:3.9925 train_time:52629ms step_avg:163.95ms step:332/1530 train_loss:3.9255 train_time:52793ms step_avg:163.95ms step:333/1530 train_loss:3.9079 train_time:52959ms step_avg:163.96ms step:334/1530 train_loss:3.8552 train_time:53123ms step_avg:163.96ms step:335/1530 train_loss:4.0222 
train_time:53287ms step_avg:163.96ms step:336/1530 train_loss:3.9784 train_time:53454ms step_avg:163.97ms step:337/1530 train_loss:4.4408 train_time:53620ms step_avg:163.97ms step:338/1530 train_loss:3.9531 train_time:53786ms step_avg:163.98ms step:339/1530 train_loss:3.8816 train_time:53951ms step_avg:163.98ms step:340/1530 train_loss:3.9497 train_time:54116ms step_avg:163.99ms step:341/1530 train_loss:3.8730 train_time:54283ms step_avg:164.00ms step:342/1530 train_loss:3.8199 train_time:54451ms step_avg:164.01ms step:343/1530 train_loss:3.8561 train_time:54619ms step_avg:164.02ms step:344/1530 train_loss:4.0054 train_time:54787ms step_avg:164.03ms step:345/1530 train_loss:3.8360 train_time:54956ms step_avg:164.05ms step:346/1530 train_loss:3.7822 train_time:55124ms step_avg:164.06ms step:347/1530 train_loss:3.8171 train_time:55292ms step_avg:164.07ms step:348/1530 train_loss:3.8785 train_time:55460ms step_avg:164.08ms step:349/1530 train_loss:3.8479 train_time:55627ms step_avg:164.09ms step:350/1530 train_loss:3.5810 train_time:55796ms step_avg:164.11ms step:351/1530 train_loss:3.8437 train_time:55964ms step_avg:164.12ms step:352/1530 train_loss:4.1902 train_time:56132ms step_avg:164.13ms step:353/1530 train_loss:3.6753 train_time:56300ms step_avg:164.14ms step:354/1530 train_loss:3.9366 train_time:56467ms step_avg:164.15ms step:355/1530 train_loss:3.8023 train_time:56636ms step_avg:164.16ms step:356/1530 train_loss:3.8965 train_time:56803ms step_avg:164.17ms step:357/1530 train_loss:3.7813 train_time:56973ms step_avg:164.19ms step:358/1530 train_loss:3.8745 train_time:57141ms step_avg:164.20ms step:359/1530 train_loss:3.8068 train_time:57309ms step_avg:164.21ms step:360/1530 train_loss:3.4480 train_time:57481ms step_avg:164.23ms step:361/1530 train_loss:4.0344 train_time:57649ms step_avg:164.24ms step:362/1530 train_loss:3.9351 train_time:57818ms step_avg:164.26ms step:363/1530 train_loss:3.8580 train_time:57985ms step_avg:164.26ms step:364/1530 
train_loss:3.7661 train_time:58154ms step_avg:164.28ms step:365/1530 train_loss:3.9320 train_time:58322ms step_avg:164.29ms step:366/1530 train_loss:3.8780 train_time:58490ms step_avg:164.30ms step:367/1530 train_loss:3.8674 train_time:58660ms step_avg:164.31ms step:368/1530 train_loss:3.8628 train_time:58826ms step_avg:164.32ms step:369/1530 train_loss:3.7632 train_time:58994ms step_avg:164.33ms step:370/1530 train_loss:3.8934 train_time:59161ms step_avg:164.34ms step:371/1530 train_loss:3.7495 train_time:59329ms step_avg:164.35ms step:372/1530 train_loss:3.7018 train_time:59499ms step_avg:164.36ms step:373/1530 train_loss:3.9215 train_time:59666ms step_avg:164.37ms step:374/1530 train_loss:3.8433 train_time:59834ms step_avg:164.38ms step:375/1530 train_loss:3.8196 train_time:60001ms step_avg:164.39ms step:375/1530 val_loss:3.8467 train_time:60048ms step_avg:164.52ms step:376/1530 train_loss:3.8822 train_time:60171ms step_avg:164.40ms step:377/1530 train_loss:3.8047 train_time:60472ms step_avg:164.77ms step:378/1530 train_loss:3.8630 train_time:60650ms step_avg:164.81ms step:379/1530 train_loss:3.8932 train_time:60968ms step_avg:165.22ms step:380/1530 train_loss:3.9757 train_time:61134ms step_avg:165.23ms step:381/1530 train_loss:3.8537 train_time:61302ms step_avg:165.23ms step:382/1530 train_loss:3.8137 train_time:61470ms step_avg:165.24ms step:383/1530 train_loss:3.8109 train_time:61637ms step_avg:165.25ms step:384/1530 train_loss:3.8843 train_time:61806ms step_avg:165.26ms step:385/1530 train_loss:3.8106 train_time:61973ms step_avg:165.26ms step:386/1530 train_loss:3.9046 train_time:62139ms step_avg:165.26ms step:387/1530 train_loss:4.0722 train_time:62308ms step_avg:165.27ms step:388/1530 train_loss:3.8062 train_time:62475ms step_avg:165.28ms step:389/1530 train_loss:3.8100 train_time:62643ms step_avg:165.29ms step:390/1530 train_loss:3.9139 train_time:62813ms step_avg:165.30ms step:391/1530 train_loss:3.8301 train_time:62979ms step_avg:165.30ms step:392/1530 
train_loss:3.9347 train_time:63147ms step_avg:165.31ms step:393/1530 train_loss:3.7773 train_time:63314ms step_avg:165.31ms step:394/1530 train_loss:3.8972 train_time:63481ms step_avg:165.31ms step:395/1530 train_loss:3.6500 train_time:63648ms step_avg:165.32ms step:396/1530 train_loss:3.8455 train_time:63816ms step_avg:165.33ms step:397/1530 train_loss:3.8727 train_time:63983ms step_avg:165.33ms step:398/1530 train_loss:3.8963 train_time:64151ms step_avg:165.34ms step:399/1530 train_loss:3.7846 train_time:64317ms step_avg:165.34ms step:400/1530 train_loss:3.8441 train_time:64484ms step_avg:165.34ms step:401/1530 train_loss:3.9235 train_time:64652ms step_avg:165.35ms step:402/1530 train_loss:3.8625 train_time:64820ms step_avg:165.36ms step:403/1530 train_loss:3.9743 train_time:64989ms step_avg:165.37ms step:404/1530 train_loss:3.6928 train_time:65155ms step_avg:165.37ms step:405/1530 train_loss:3.7985 train_time:65324ms step_avg:165.38ms step:406/1530 train_loss:4.1106 train_time:65491ms step_avg:165.38ms step:407/1530 train_loss:3.7958 train_time:65658ms step_avg:165.39ms step:408/1530 train_loss:3.8363 train_time:65824ms step_avg:165.39ms step:409/1530 train_loss:3.8677 train_time:65992ms step_avg:165.39ms step:410/1530 train_loss:3.7640 train_time:66158ms step_avg:165.39ms step:411/1530 train_loss:3.7706 train_time:66327ms step_avg:165.40ms step:412/1530 train_loss:4.1979 train_time:66494ms step_avg:165.41ms step:413/1530 train_loss:3.6486 train_time:66660ms step_avg:165.41ms step:414/1530 train_loss:4.0235 train_time:66828ms step_avg:165.42ms step:415/1530 train_loss:3.7698 train_time:66995ms step_avg:165.42ms step:416/1530 train_loss:3.7729 train_time:67162ms step_avg:165.42ms step:417/1530 train_loss:3.9661 train_time:67330ms step_avg:165.43ms step:418/1530 train_loss:3.7026 train_time:67496ms step_avg:165.43ms step:419/1530 train_loss:3.8230 train_time:67663ms step_avg:165.43ms step:420/1530 train_loss:3.7352 train_time:67831ms step_avg:165.44ms 
step:421/1530 train_loss:3.6677 train_time:67997ms step_avg:165.44ms step:422/1530 train_loss:3.7930 train_time:68164ms step_avg:165.45ms step:423/1530 train_loss:3.8871 train_time:68331ms step_avg:165.45ms step:424/1530 train_loss:3.6321 train_time:68497ms step_avg:165.45ms step:425/1530 train_loss:3.8144 train_time:68666ms step_avg:165.46ms step:426/1530 train_loss:3.6644 train_time:68833ms step_avg:165.46ms step:427/1530 train_loss:3.9100 train_time:69001ms step_avg:165.47ms step:428/1530 train_loss:3.8288 train_time:69168ms step_avg:165.47ms step:429/1530 train_loss:3.7726 train_time:69335ms step_avg:165.48ms step:430/1530 train_loss:3.7279 train_time:69504ms step_avg:165.49ms step:431/1530 train_loss:3.6473 train_time:69671ms step_avg:165.49ms step:432/1530 train_loss:3.7805 train_time:69839ms step_avg:165.49ms step:433/1530 train_loss:3.8301 train_time:70006ms step_avg:165.50ms step:434/1530 train_loss:3.7871 train_time:70173ms step_avg:165.50ms step:435/1530 train_loss:3.8201 train_time:70340ms step_avg:165.51ms step:436/1530 train_loss:3.8445 train_time:70508ms step_avg:165.51ms step:437/1530 train_loss:3.7337 train_time:70675ms step_avg:165.51ms step:438/1530 train_loss:3.7150 train_time:70843ms step_avg:165.52ms step:439/1530 train_loss:3.7281 train_time:71010ms step_avg:165.52ms step:440/1530 train_loss:3.8981 train_time:71177ms step_avg:165.53ms step:441/1530 train_loss:3.7794 train_time:71344ms step_avg:165.53ms step:442/1530 train_loss:3.7519 train_time:71513ms step_avg:165.54ms step:443/1530 train_loss:3.6344 train_time:71679ms step_avg:165.54ms step:444/1530 train_loss:3.9356 train_time:71846ms step_avg:165.54ms step:445/1530 train_loss:3.8601 train_time:72013ms step_avg:165.55ms step:446/1530 train_loss:3.8525 train_time:72179ms step_avg:165.55ms step:447/1530 train_loss:3.7639 train_time:72346ms step_avg:165.55ms step:448/1530 train_loss:3.8596 train_time:72513ms step_avg:165.55ms step:449/1530 train_loss:3.7062 train_time:72678ms 
step_avg:165.55ms step:450/1530 train_loss:3.7444 train_time:72845ms step_avg:165.56ms step:451/1530 train_loss:3.6037 train_time:73013ms step_avg:165.56ms step:452/1530 train_loss:3.7232 train_time:73181ms step_avg:165.57ms step:453/1530 train_loss:3.6825 train_time:73348ms step_avg:165.57ms step:454/1530 train_loss:3.6530 train_time:73515ms step_avg:165.57ms step:455/1530 train_loss:3.8541 train_time:73684ms step_avg:165.58ms step:456/1530 train_loss:3.7345 train_time:73852ms step_avg:165.59ms step:457/1530 train_loss:3.8020 train_time:74023ms step_avg:165.60ms step:458/1530 train_loss:3.8454 train_time:74192ms step_avg:165.61ms step:459/1530 train_loss:3.6486 train_time:74361ms step_avg:165.62ms step:460/1530 train_loss:3.8102 train_time:74531ms step_avg:165.62ms step:461/1530 train_loss:3.7159 train_time:74701ms step_avg:165.63ms step:462/1530 train_loss:3.7506 train_time:74870ms step_avg:165.64ms step:463/1530 train_loss:3.7969 train_time:75039ms step_avg:165.65ms step:464/1530 train_loss:3.7351 train_time:75210ms step_avg:165.66ms step:465/1530 train_loss:3.7330 train_time:75378ms step_avg:165.67ms step:466/1530 train_loss:3.8176 train_time:75547ms step_avg:165.67ms step:467/1530 train_loss:3.8413 train_time:75718ms step_avg:165.69ms step:468/1530 train_loss:3.8091 train_time:75888ms step_avg:165.69ms step:469/1530 train_loss:3.7031 train_time:76056ms step_avg:165.70ms step:470/1530 train_loss:3.7804 train_time:76228ms step_avg:165.71ms step:471/1530 train_loss:3.8268 train_time:76398ms step_avg:165.72ms step:472/1530 train_loss:3.7956 train_time:76570ms step_avg:165.73ms step:473/1530 train_loss:3.7301 train_time:76738ms step_avg:165.74ms step:474/1530 train_loss:3.6067 train_time:76909ms step_avg:165.75ms step:475/1530 train_loss:4.0499 train_time:77078ms step_avg:165.76ms step:476/1530 train_loss:3.7680 train_time:77248ms step_avg:165.77ms step:477/1530 train_loss:3.6054 train_time:77419ms step_avg:165.78ms step:478/1530 train_loss:3.8340 
train_time:77589ms step_avg:165.79ms step:479/1530 train_loss:3.7851 train_time:77758ms step_avg:165.80ms step:480/1530 train_loss:3.9398 train_time:77928ms step_avg:165.80ms step:481/1530 train_loss:3.7428 train_time:78097ms step_avg:165.81ms step:482/1530 train_loss:3.5377 train_time:78267ms step_avg:165.82ms step:483/1530 train_loss:3.8155 train_time:78435ms step_avg:165.82ms step:484/1530 train_loss:3.6766 train_time:78607ms step_avg:165.84ms step:485/1530 train_loss:3.6718 train_time:78776ms step_avg:165.84ms step:486/1530 train_loss:3.5874 train_time:78947ms step_avg:165.85ms step:487/1530 train_loss:3.6969 train_time:79116ms step_avg:165.86ms step:488/1530 train_loss:3.8922 train_time:79285ms step_avg:165.87ms step:489/1530 train_loss:3.7274 train_time:79454ms step_avg:165.88ms step:490/1530 train_loss:3.6171 train_time:79625ms step_avg:165.89ms step:491/1530 train_loss:3.6298 train_time:79793ms step_avg:165.89ms step:492/1530 train_loss:3.7468 train_time:79963ms step_avg:165.90ms step:493/1530 train_loss:3.5883 train_time:80133ms step_avg:165.91ms step:494/1530 train_loss:3.7130 train_time:80302ms step_avg:165.91ms step:495/1530 train_loss:3.6811 train_time:80472ms step_avg:165.92ms step:496/1530 train_loss:3.5368 train_time:80641ms step_avg:165.93ms step:497/1530 train_loss:3.7510 train_time:80810ms step_avg:165.93ms step:498/1530 train_loss:3.8026 train_time:80979ms step_avg:165.94ms step:499/1530 train_loss:3.8350 train_time:81149ms step_avg:165.95ms step:500/1530 train_loss:3.7424 train_time:81319ms step_avg:165.96ms step:500/1530 val_loss:3.7234 train_time:81368ms step_avg:166.06ms step:501/1530 train_loss:3.8197 train_time:81489ms step_avg:165.97ms step:502/1530 train_loss:3.7703 train_time:81662ms step_avg:165.98ms step:503/1530 train_loss:3.7904 train_time:81831ms step_avg:165.99ms step:504/1530 train_loss:3.7368 train_time:81998ms step_avg:165.99ms step:505/1530 train_loss:3.8213 train_time:82170ms step_avg:166.00ms step:506/1530 train_loss:3.6573 
train_time:82338ms step_avg:166.00ms step:507/1530 train_loss:3.7811 train_time:82507ms step_avg:166.01ms step:508/1530 train_loss:3.8417 train_time:82677ms step_avg:166.02ms step:509/1530 train_loss:3.7893 train_time:82846ms step_avg:166.02ms step:510/1530 train_loss:3.5960 train_time:83015ms step_avg:166.03ms step:511/1530 train_loss:3.7893 train_time:83184ms step_avg:166.04ms step:512/1530 train_loss:3.7297 train_time:83355ms step_avg:166.05ms step:513/1530 train_loss:3.6805 train_time:83523ms step_avg:166.05ms step:514/1530 train_loss:3.8204 train_time:83693ms step_avg:166.06ms step:515/1530 train_loss:3.7555 train_time:83862ms step_avg:166.06ms step:516/1530 train_loss:4.1018 train_time:84032ms step_avg:166.07ms step:517/1530 train_loss:3.7118 train_time:84201ms step_avg:166.08ms step:518/1530 train_loss:3.7824 train_time:84370ms step_avg:166.08ms step:519/1530 train_loss:3.6759 train_time:84538ms step_avg:166.09ms step:520/1530 train_loss:3.7023 train_time:84707ms step_avg:166.09ms step:521/1530 train_loss:3.6781 train_time:84876ms step_avg:166.10ms step:522/1530 train_loss:3.6740 train_time:85047ms step_avg:166.11ms step:523/1530 train_loss:4.3116 train_time:85216ms step_avg:166.11ms step:524/1530 train_loss:3.7501 train_time:85384ms step_avg:166.12ms step:525/1530 train_loss:3.6911 train_time:85552ms step_avg:166.12ms step:526/1530 train_loss:3.7130 train_time:85720ms step_avg:166.12ms step:527/1530 train_loss:3.6662 train_time:85889ms step_avg:166.13ms step:528/1530 train_loss:3.6401 train_time:86058ms step_avg:166.14ms step:529/1530 train_loss:3.8615 train_time:86228ms step_avg:166.14ms step:530/1530 train_loss:3.6718 train_time:86398ms step_avg:166.15ms step:531/1530 train_loss:3.9307 train_time:86570ms step_avg:166.16ms step:532/1530 train_loss:3.7509 train_time:86739ms step_avg:166.17ms step:533/1530 train_loss:3.6650 train_time:86908ms step_avg:166.17ms step:534/1530 train_loss:3.6865 train_time:87076ms step_avg:166.18ms step:535/1530 
train_loss:3.6247 train_time:87246ms step_avg:166.18ms step:536/1530 train_loss:3.7600 train_time:87415ms step_avg:166.19ms step:537/1530 train_loss:3.7395 train_time:87585ms step_avg:166.20ms step:538/1530 train_loss:3.6426 train_time:87755ms step_avg:166.20ms step:539/1530 train_loss:4.1304 train_time:87928ms step_avg:166.22ms step:540/1530 train_loss:3.6896 train_time:88097ms step_avg:166.22ms step:541/1530 train_loss:3.7940 train_time:88266ms step_avg:166.23ms step:542/1530 train_loss:3.6009 train_time:88434ms step_avg:166.23ms step:543/1530 train_loss:3.5929 train_time:88603ms step_avg:166.23ms step:544/1530 train_loss:3.6498 train_time:88772ms step_avg:166.24ms step:545/1530 train_loss:3.6090 train_time:88940ms step_avg:166.24ms step:546/1530 train_loss:3.6408 train_time:89109ms step_avg:166.25ms step:547/1530 train_loss:3.6585 train_time:89277ms step_avg:166.25ms step:548/1530 train_loss:3.6271 train_time:89448ms step_avg:166.26ms step:549/1530 train_loss:3.7419 train_time:89615ms step_avg:166.26ms step:550/1530 train_loss:3.6327 train_time:89785ms step_avg:166.27ms step:551/1530 train_loss:3.6458 train_time:89953ms step_avg:166.27ms step:552/1530 train_loss:3.9483 train_time:90123ms step_avg:166.28ms step:553/1530 train_loss:3.7714 train_time:90292ms step_avg:166.28ms step:554/1530 train_loss:3.7204 train_time:90460ms step_avg:166.29ms step:555/1530 train_loss:3.6446 train_time:90630ms step_avg:166.29ms step:556/1530 train_loss:3.7117 train_time:90798ms step_avg:166.30ms step:557/1530 train_loss:3.3312 train_time:90968ms step_avg:166.30ms step:558/1530 train_loss:3.6337 train_time:91137ms step_avg:166.31ms step:559/1530 train_loss:3.6642 train_time:91306ms step_avg:166.31ms step:560/1530 train_loss:3.7035 train_time:91474ms step_avg:166.32ms step:561/1530 train_loss:3.6282 train_time:91642ms step_avg:166.32ms step:562/1530 train_loss:3.5678 train_time:91809ms step_avg:166.32ms step:563/1530 train_loss:3.7761 train_time:91978ms step_avg:166.33ms 
step:564/1530 train_loss:3.5906 train_time:92151ms step_avg:166.34ms step:565/1530 train_loss:3.7001 train_time:92320ms step_avg:166.34ms step:566/1530 train_loss:3.6303 train_time:92621ms step_avg:166.58ms step:567/1530 train_loss:3.6164 train_time:92800ms step_avg:166.61ms step:568/1530 train_loss:3.6994 train_time:92970ms step_avg:166.61ms step:569/1530 train_loss:3.6593 train_time:93296ms step_avg:166.90ms step:570/1530 train_loss:3.7064 train_time:93468ms step_avg:166.91ms step:571/1530 train_loss:3.7735 train_time:93637ms step_avg:166.91ms step:572/1530 train_loss:3.7368 train_time:93809ms step_avg:166.92ms step:573/1530 train_loss:3.7509 train_time:93981ms step_avg:166.93ms step:574/1530 train_loss:3.7923 train_time:94154ms step_avg:166.94ms step:575/1530 train_loss:3.7428 train_time:94323ms step_avg:166.94ms step:576/1530 train_loss:3.7786 train_time:94494ms step_avg:166.95ms step:577/1530 train_loss:3.6906 train_time:94665ms step_avg:166.96ms step:578/1530 train_loss:3.6975 train_time:94837ms step_avg:166.97ms step:579/1530 train_loss:3.6828 train_time:95008ms step_avg:166.97ms step:580/1530 train_loss:3.6034 train_time:95178ms step_avg:166.98ms step:581/1530 train_loss:3.6543 train_time:95350ms step_avg:166.99ms step:582/1530 train_loss:3.8645 train_time:95520ms step_avg:166.99ms step:583/1530 train_loss:3.6464 train_time:95691ms step_avg:167.00ms step:584/1530 train_loss:3.6186 train_time:95864ms step_avg:167.01ms step:585/1530 train_loss:3.8019 train_time:96034ms step_avg:167.02ms step:586/1530 train_loss:3.5343 train_time:96206ms step_avg:167.02ms step:587/1530 train_loss:3.6789 train_time:96376ms step_avg:167.03ms step:588/1530 train_loss:3.6588 train_time:96547ms step_avg:167.04ms step:589/1530 train_loss:4.0093 train_time:96718ms step_avg:167.04ms step:590/1530 train_loss:3.7934 train_time:96890ms step_avg:167.05ms step:591/1530 train_loss:3.5207 train_time:97060ms step_avg:167.06ms step:592/1530 train_loss:3.5447 train_time:97233ms 
step_avg:167.07ms step:593/1530 train_loss:3.5148 train_time:97405ms step_avg:167.08ms step:594/1530 train_loss:3.5706 train_time:97576ms step_avg:167.08ms step:595/1530 train_loss:3.9352 train_time:97751ms step_avg:167.09ms step:596/1530 train_loss:3.6688 train_time:97923ms step_avg:167.10ms step:597/1530 train_loss:3.6039 train_time:98094ms step_avg:167.11ms step:598/1530 train_loss:3.6686 train_time:98264ms step_avg:167.12ms step:599/1530 train_loss:3.4921 train_time:98434ms step_avg:167.12ms step:600/1530 train_loss:3.6136 train_time:98606ms step_avg:167.13ms step:601/1530 train_loss:3.6584 train_time:98778ms step_avg:167.14ms step:602/1530 train_loss:3.6848 train_time:98952ms step_avg:167.15ms step:603/1530 train_loss:3.7994 train_time:99122ms step_avg:167.15ms step:604/1530 train_loss:3.6323 train_time:99293ms step_avg:167.16ms step:605/1530 train_loss:3.6328 train_time:99467ms step_avg:167.17ms step:606/1530 train_loss:3.5911 train_time:99640ms step_avg:167.18ms step:607/1530 train_loss:3.8534 train_time:99810ms step_avg:167.19ms step:608/1530 train_loss:3.6479 train_time:99980ms step_avg:167.19ms step:609/1530 train_loss:3.6322 train_time:100152ms step_avg:167.20ms step:610/1530 train_loss:3.7157 train_time:100323ms step_avg:167.20ms step:611/1530 train_loss:3.6152 train_time:100493ms step_avg:167.21ms step:612/1530 train_loss:3.5871 train_time:100665ms step_avg:167.22ms step:613/1530 train_loss:3.7780 train_time:100836ms step_avg:167.22ms step:614/1530 train_loss:3.7244 train_time:101009ms step_avg:167.23ms step:615/1530 train_loss:3.7092 train_time:101179ms step_avg:167.24ms step:616/1530 train_loss:3.6409 train_time:101350ms step_avg:167.24ms step:617/1530 train_loss:3.5706 train_time:101520ms step_avg:167.25ms step:618/1530 train_loss:3.7005 train_time:101691ms step_avg:167.26ms step:619/1530 train_loss:3.5668 train_time:101863ms step_avg:167.26ms step:620/1530 train_loss:3.6071 train_time:102034ms step_avg:167.27ms step:621/1530 train_loss:3.9365 
train_time:102208ms step_avg:167.28ms step:622/1530 train_loss:3.5882 train_time:102379ms step_avg:167.29ms step:623/1530 train_loss:3.6264 train_time:102552ms step_avg:167.30ms step:624/1530 train_loss:3.7093 train_time:102723ms step_avg:167.30ms step:625/1530 train_loss:3.7117 train_time:102893ms step_avg:167.31ms step:625/1530 val_loss:3.6380 train_time:102943ms step_avg:167.39ms step:626/1530 train_loss:3.7492 train_time:103068ms step_avg:167.32ms step:627/1530 train_loss:3.7320 train_time:103240ms step_avg:167.33ms step:628/1530 train_loss:3.7783 train_time:103409ms step_avg:167.33ms step:629/1530 train_loss:3.6070 train_time:103582ms step_avg:167.34ms step:630/1530 train_loss:3.7462 train_time:103751ms step_avg:167.34ms step:631/1530 train_loss:3.7595 train_time:103922ms step_avg:167.35ms step:632/1530 train_loss:3.6670 train_time:104093ms step_avg:167.35ms step:633/1530 train_loss:3.6238 train_time:104264ms step_avg:167.36ms step:634/1530 train_loss:3.7059 train_time:104435ms step_avg:167.36ms step:635/1530 train_loss:3.9616 train_time:104605ms step_avg:167.37ms step:636/1530 train_loss:3.5587 train_time:104775ms step_avg:167.37ms step:637/1530 train_loss:3.3622 train_time:104946ms step_avg:167.38ms step:638/1530 train_loss:3.6029 train_time:105115ms step_avg:167.38ms step:639/1530 train_loss:3.6490 train_time:105284ms step_avg:167.38ms step:640/1530 train_loss:3.5852 train_time:105454ms step_avg:167.39ms step:641/1530 train_loss:3.6045 train_time:105624ms step_avg:167.39ms step:642/1530 train_loss:3.6403 train_time:105794ms step_avg:167.40ms step:643/1530 train_loss:3.6120 train_time:105964ms step_avg:167.40ms step:644/1530 train_loss:3.5769 train_time:106132ms step_avg:167.40ms step:645/1530 train_loss:3.7916 train_time:106304ms step_avg:167.41ms step:646/1530 train_loss:3.6862 train_time:106476ms step_avg:167.42ms step:647/1530 train_loss:3.6771 train_time:106645ms step_avg:167.42ms step:648/1530 train_loss:3.7276 train_time:106816ms step_avg:167.42ms 
step:649/1530 train_loss:3.7821 train_time:106985ms step_avg:167.43ms step:650/1530 train_loss:3.6394 train_time:107156ms step_avg:167.43ms step:651/1530 train_loss:3.7818 train_time:107327ms step_avg:167.44ms step:652/1530 train_loss:3.5983 train_time:107497ms step_avg:167.44ms step:653/1530 train_loss:3.6789 train_time:107667ms step_avg:167.44ms step:654/1530 train_loss:3.4379 train_time:107840ms step_avg:167.45ms step:655/1530 train_loss:3.5939 train_time:108010ms step_avg:167.46ms step:656/1530 train_loss:3.5944 train_time:108181ms step_avg:167.46ms step:657/1530 train_loss:3.5122 train_time:108351ms step_avg:167.47ms step:658/1530 train_loss:3.6991 train_time:108523ms step_avg:167.47ms step:659/1530 train_loss:3.5961 train_time:108694ms step_avg:167.48ms step:660/1530 train_loss:3.7002 train_time:108865ms step_avg:167.48ms step:661/1530 train_loss:3.7674 train_time:109037ms step_avg:167.49ms step:662/1530 train_loss:3.6850 train_time:109207ms step_avg:167.50ms step:663/1530 train_loss:3.5622 train_time:109377ms step_avg:167.50ms step:664/1530 train_loss:3.6232 train_time:109547ms step_avg:167.50ms step:665/1530 train_loss:3.5083 train_time:109718ms step_avg:167.51ms step:666/1530 train_loss:3.8008 train_time:109888ms step_avg:167.51ms step:667/1530 train_loss:3.6200 train_time:110060ms step_avg:167.52ms step:668/1530 train_loss:3.6597 train_time:110231ms step_avg:167.52ms step:669/1530 train_loss:3.5070 train_time:110404ms step_avg:167.53ms step:670/1530 train_loss:3.6148 train_time:110574ms step_avg:167.54ms step:671/1530 train_loss:3.5730 train_time:110743ms step_avg:167.54ms step:672/1530 train_loss:3.5814 train_time:110913ms step_avg:167.54ms step:673/1530 train_loss:3.8620 train_time:111085ms step_avg:167.55ms step:674/1530 train_loss:3.6438 train_time:111255ms step_avg:167.55ms step:675/1530 train_loss:3.7319 train_time:111427ms step_avg:167.56ms step:676/1530 train_loss:3.5063 train_time:111597ms step_avg:167.56ms step:677/1530 train_loss:3.6120 
train_time:111768ms step_avg:167.57ms step:678/1530 train_loss:3.5684 train_time:111940ms step_avg:167.57ms step:679/1530 train_loss:3.6990 train_time:112110ms step_avg:167.58ms step:680/1530 train_loss:3.5982 train_time:112281ms step_avg:167.58ms step:681/1530 train_loss:3.6279 train_time:112451ms step_avg:167.59ms step:682/1530 train_loss:3.6755 train_time:112626ms step_avg:167.60ms step:683/1530 train_loss:3.7583 train_time:112799ms step_avg:167.61ms step:684/1530 train_loss:3.6621 train_time:112972ms step_avg:167.61ms step:685/1530 train_loss:3.6990 train_time:113144ms step_avg:167.62ms step:686/1530 train_loss:3.6467 train_time:113318ms step_avg:167.63ms step:687/1530 train_loss:3.6809 train_time:113489ms step_avg:167.64ms step:688/1530 train_loss:3.2392 train_time:113667ms step_avg:167.65ms step:689/1530 train_loss:3.4184 train_time:113840ms step_avg:167.66ms step:690/1530 train_loss:3.5582 train_time:114015ms step_avg:167.67ms step:691/1530 train_loss:3.4315 train_time:114187ms step_avg:167.68ms step:692/1530 train_loss:3.6450 train_time:114358ms step_avg:167.68ms step:693/1530 train_loss:3.6651 train_time:114530ms step_avg:167.69ms step:694/1530 train_loss:3.5662 train_time:114704ms step_avg:167.70ms step:695/1530 train_loss:3.5443 train_time:114874ms step_avg:167.70ms step:696/1530 train_loss:3.8696 train_time:115047ms step_avg:167.71ms step:697/1530 train_loss:3.6025 train_time:115220ms step_avg:167.72ms step:698/1530 train_loss:3.6572 train_time:115392ms step_avg:167.72ms step:699/1530 train_loss:3.7876 train_time:115566ms step_avg:167.73ms step:700/1530 train_loss:3.5837 train_time:115738ms step_avg:167.74ms step:701/1530 train_loss:3.5575 train_time:115909ms step_avg:167.74ms step:702/1530 train_loss:3.5252 train_time:116083ms step_avg:167.75ms step:703/1530 train_loss:3.5160 train_time:116255ms step_avg:167.76ms step:704/1530 train_loss:3.5863 train_time:116428ms step_avg:167.76ms step:705/1530 train_loss:3.5806 train_time:116606ms step_avg:167.78ms 
step:706/1530 train_loss:3.5945 train_time:116781ms step_avg:167.79ms step:707/1530 train_loss:3.6606 train_time:116956ms step_avg:167.80ms step:708/1530 train_loss:3.6157 train_time:117128ms step_avg:167.80ms step:709/1530 train_loss:3.5891 train_time:117303ms step_avg:167.82ms step:710/1530 train_loss:3.5523 train_time:117473ms step_avg:167.82ms step:711/1530 train_loss:3.6055 train_time:117647ms step_avg:167.83ms step:712/1530 train_loss:3.6619 train_time:117822ms step_avg:167.84ms step:713/1530 train_loss:3.6673 train_time:117997ms step_avg:167.85ms step:714/1530 train_loss:3.5765 train_time:118169ms step_avg:167.85ms step:715/1530 train_loss:3.5826 train_time:118342ms step_avg:167.86ms step:716/1530 train_loss:3.6028 train_time:118512ms step_avg:167.86ms step:717/1530 train_loss:3.7204 train_time:118687ms step_avg:167.87ms step:718/1530 train_loss:3.6102 train_time:118857ms step_avg:167.88ms step:719/1530 train_loss:3.6898 train_time:119030ms step_avg:167.88ms step:720/1530 train_loss:3.8536 train_time:119206ms step_avg:167.90ms step:721/1530 train_loss:3.4824 train_time:119379ms step_avg:167.90ms step:722/1530 train_loss:3.7494 train_time:119551ms step_avg:167.91ms step:723/1530 train_loss:3.7806 train_time:119722ms step_avg:167.91ms step:724/1530 train_loss:3.5782 train_time:119895ms step_avg:167.92ms step:725/1530 train_loss:3.6666 train_time:120067ms step_avg:167.93ms step:726/1530 train_loss:3.5482 train_time:120242ms step_avg:167.94ms step:727/1530 train_loss:3.5947 train_time:120416ms step_avg:167.94ms step:728/1530 train_loss:3.7478 train_time:120589ms step_avg:167.95ms step:729/1530 train_loss:3.6875 train_time:120764ms step_avg:167.96ms step:730/1530 train_loss:3.6821 train_time:120937ms step_avg:167.97ms step:731/1530 train_loss:3.5704 train_time:121110ms step_avg:167.97ms step:732/1530 train_loss:3.6129 train_time:121282ms step_avg:167.98ms step:733/1530 train_loss:3.8458 train_time:121455ms step_avg:167.99ms step:734/1530 train_loss:3.5777 
train_time:121628ms step_avg:168.00ms step:735/1530 train_loss:3.6339 train_time:121802ms step_avg:168.00ms step:736/1530 train_loss:3.7480 train_time:121975ms step_avg:168.01ms step:737/1530 train_loss:3.6916 train_time:122147ms step_avg:168.02ms step:738/1530 train_loss:3.6162 train_time:122318ms step_avg:168.02ms step:739/1530 train_loss:3.5175 train_time:122490ms step_avg:168.02ms step:740/1530 train_loss:4.1260 train_time:122669ms step_avg:168.04ms step:741/1530 train_loss:3.5072 train_time:122842ms step_avg:168.05ms step:742/1530 train_loss:3.5780 train_time:123014ms step_avg:168.05ms step:743/1530 train_loss:3.5988 train_time:123186ms step_avg:168.06ms step:744/1530 train_loss:3.6616 train_time:123359ms step_avg:168.06ms step:745/1530 train_loss:3.6051 train_time:123533ms step_avg:168.07ms step:746/1530 train_loss:3.6135 train_time:123704ms step_avg:168.08ms step:747/1530 train_loss:3.6564 train_time:123880ms step_avg:168.09ms step:748/1530 train_loss:3.5783 train_time:124055ms step_avg:168.10ms step:749/1530 train_loss:3.5720 train_time:124229ms step_avg:168.10ms step:750/1530 train_loss:3.6099 train_time:124400ms step_avg:168.11ms step:750/1530 val_loss:3.5839 train_time:124449ms step_avg:168.17ms step:751/1530 train_loss:3.5910 train_time:124573ms step_avg:168.12ms step:752/1530 train_loss:3.6389 train_time:124744ms step_avg:168.12ms step:753/1530 train_loss:3.6307 train_time:124916ms step_avg:168.12ms step:754/1530 train_loss:3.6037 train_time:125091ms step_avg:168.13ms step:755/1530 train_loss:3.6968 train_time:125397ms step_avg:168.32ms step:756/1530 train_loss:3.4760 train_time:125581ms step_avg:168.34ms step:757/1530 train_loss:3.7389 train_time:125754ms step_avg:168.34ms step:758/1530 train_loss:3.6660 train_time:125924ms step_avg:168.35ms step:759/1530 train_loss:3.6060 train_time:126251ms step_avg:168.56ms step:760/1530 train_loss:3.7215 train_time:126423ms step_avg:168.56ms step:761/1530 train_loss:3.4146 train_time:126595ms step_avg:168.57ms 
step:762/1530 train_loss:3.5636 train_time:126766ms step_avg:168.57ms step:763/1530 train_loss:3.6820 train_time:126938ms step_avg:168.58ms step:764/1530 train_loss:3.3338 train_time:127112ms step_avg:168.58ms step:765/1530 train_loss:3.7465 train_time:127283ms step_avg:168.59ms step:766/1530 train_loss:3.5908 train_time:127456ms step_avg:168.59ms step:767/1530 train_loss:3.5803 train_time:127628ms step_avg:168.60ms step:768/1530 train_loss:3.5815 train_time:127801ms step_avg:168.60ms step:769/1530 train_loss:3.5962 train_time:127973ms step_avg:168.61ms step:770/1530 train_loss:3.6572 train_time:128144ms step_avg:168.61ms step:771/1530 train_loss:3.9060 train_time:128317ms step_avg:168.62ms step:772/1530 train_loss:3.4685 train_time:128489ms step_avg:168.62ms step:773/1530 train_loss:3.6445 train_time:128658ms step_avg:168.62ms step:774/1530 train_loss:3.6499 train_time:128832ms step_avg:168.63ms step:775/1530 train_loss:3.6240 train_time:129003ms step_avg:168.63ms step:776/1530 train_loss:3.4284 train_time:129175ms step_avg:168.64ms step:777/1530 train_loss:3.4045 train_time:129349ms step_avg:168.64ms step:778/1530 train_loss:3.5039 train_time:129520ms step_avg:168.65ms step:779/1530 train_loss:3.5886 train_time:129693ms step_avg:168.65ms step:780/1530 train_loss:3.5953 train_time:129866ms step_avg:168.66ms step:781/1530 train_loss:3.6854 train_time:130037ms step_avg:168.66ms step:782/1530 train_loss:3.6017 train_time:130211ms step_avg:168.67ms step:783/1530 train_loss:3.5854 train_time:130381ms step_avg:168.67ms step:784/1530 train_loss:3.6181 train_time:130554ms step_avg:168.67ms step:785/1530 train_loss:3.5745 train_time:130725ms step_avg:168.68ms step:786/1530 train_loss:3.4532 train_time:130896ms step_avg:168.68ms step:787/1530 train_loss:3.7381 train_time:131070ms step_avg:168.69ms step:788/1530 train_loss:3.5177 train_time:131241ms step_avg:168.69ms step:789/1530 train_loss:3.5616 train_time:131412ms step_avg:168.69ms step:790/1530 train_loss:3.6399 
train_time:131587ms step_avg:168.70ms step:791/1530 train_loss:3.7842 train_time:131761ms step_avg:168.71ms step:792/1530 train_loss:3.7747 train_time:131934ms step_avg:168.71ms step:793/1530 train_loss:3.4690 train_time:132105ms step_avg:168.72ms step:794/1530 train_loss:3.6117 train_time:132276ms step_avg:168.72ms step:795/1530 train_loss:3.6938 train_time:132450ms step_avg:168.73ms step:796/1530 train_loss:3.7728 train_time:132629ms step_avg:168.74ms step:797/1530 train_loss:3.5345 train_time:132801ms step_avg:168.74ms step:798/1530 train_loss:3.6574 train_time:132976ms step_avg:168.75ms step:799/1530 train_loss:3.5525 train_time:133153ms step_avg:168.76ms step:800/1530 train_loss:3.5414 train_time:133327ms step_avg:168.77ms step:801/1530 train_loss:3.6417 train_time:133499ms step_avg:168.77ms step:802/1530 train_loss:3.5083 train_time:133675ms step_avg:168.78ms step:803/1530 train_loss:3.5151 train_time:133847ms step_avg:168.79ms step:804/1530 train_loss:3.6331 train_time:134021ms step_avg:168.79ms step:805/1530 train_loss:3.5342 train_time:134197ms step_avg:168.80ms step:806/1530 train_loss:3.5723 train_time:134371ms step_avg:168.81ms step:807/1530 train_loss:3.6539 train_time:134544ms step_avg:168.81ms step:808/1530 train_loss:3.5599 train_time:134720ms step_avg:168.82ms step:809/1530 train_loss:3.5058 train_time:134895ms step_avg:168.83ms step:810/1530 train_loss:3.5711 train_time:135068ms step_avg:168.83ms step:811/1530 train_loss:3.5920 train_time:135239ms step_avg:168.84ms step:812/1530 train_loss:3.6161 train_time:135413ms step_avg:168.84ms step:813/1530 train_loss:3.6402 train_time:135585ms step_avg:168.85ms step:814/1530 train_loss:3.5748 train_time:135758ms step_avg:168.85ms step:815/1530 train_loss:3.5747 train_time:135934ms step_avg:168.86ms step:816/1530 train_loss:3.6952 train_time:136111ms step_avg:168.87ms step:817/1530 train_loss:3.7856 train_time:136284ms step_avg:168.88ms step:818/1530 train_loss:3.5350 train_time:136457ms step_avg:168.88ms 
step:819/1530 train_loss:3.7329 train_time:136632ms step_avg:168.89ms step:820/1530 train_loss:3.5068 train_time:136808ms step_avg:168.90ms step:821/1530 train_loss:3.5746 train_time:136980ms step_avg:168.90ms step:822/1530 train_loss:3.7085 train_time:137157ms step_avg:168.91ms step:823/1530 train_loss:3.5867 train_time:137333ms step_avg:168.92ms step:824/1530 train_loss:3.5294 train_time:137505ms step_avg:168.93ms step:825/1530 train_loss:3.6325 train_time:137679ms step_avg:168.93ms step:826/1530 train_loss:3.4990 train_time:137856ms step_avg:168.94ms step:827/1530 train_loss:3.7440 train_time:138032ms step_avg:168.95ms step:828/1530 train_loss:3.6351 train_time:138203ms step_avg:168.95ms step:829/1530 train_loss:3.6417 train_time:138378ms step_avg:168.96ms step:830/1530 train_loss:3.5502 train_time:138553ms step_avg:168.97ms step:831/1530 train_loss:3.6124 train_time:138726ms step_avg:168.97ms step:832/1530 train_loss:3.5287 train_time:138900ms step_avg:168.98ms step:833/1530 train_loss:3.6689 train_time:139075ms step_avg:168.99ms step:834/1530 train_loss:3.4874 train_time:139249ms step_avg:168.99ms step:835/1530 train_loss:3.4704 train_time:139424ms step_avg:169.00ms step:836/1530 train_loss:3.7297 train_time:139598ms step_avg:169.01ms step:837/1530 train_loss:3.4184 train_time:139773ms step_avg:169.01ms step:838/1530 train_loss:3.6093 train_time:139947ms step_avg:169.02ms step:839/1530 train_loss:3.4380 train_time:140121ms step_avg:169.02ms step:840/1530 train_loss:3.4823 train_time:140293ms step_avg:169.03ms step:841/1530 train_loss:3.5823 train_time:140466ms step_avg:169.03ms step:842/1530 train_loss:3.5900 train_time:140640ms step_avg:169.04ms step:843/1530 train_loss:3.5732 train_time:140816ms step_avg:169.05ms step:844/1530 train_loss:3.4412 train_time:140987ms step_avg:169.05ms step:845/1530 train_loss:3.6759 train_time:141162ms step_avg:169.06ms step:846/1530 train_loss:3.5319 train_time:141336ms step_avg:169.06ms step:847/1530 train_loss:3.5097 
train_time:141512ms step_avg:169.07ms step:848/1530 train_loss:3.6559 train_time:141683ms step_avg:169.07ms step:849/1530 train_loss:3.5085 train_time:141858ms step_avg:169.08ms step:850/1530 train_loss:3.4594 train_time:142034ms step_avg:169.09ms step:851/1530 train_loss:3.7444 train_time:142208ms step_avg:169.09ms step:852/1530 train_loss:3.4539 train_time:142379ms step_avg:169.10ms step:853/1530 train_loss:3.5809 train_time:142553ms step_avg:169.10ms step:854/1530 train_loss:3.6668 train_time:142729ms step_avg:169.11ms step:855/1530 train_loss:3.5285 train_time:142902ms step_avg:169.11ms step:856/1530 train_loss:3.5591 train_time:143075ms step_avg:169.12ms step:857/1530 train_loss:3.6161 train_time:143250ms step_avg:169.13ms step:858/1530 train_loss:3.4856 train_time:143426ms step_avg:169.13ms step:859/1530 train_loss:3.5735 train_time:143599ms step_avg:169.14ms step:860/1530 train_loss:3.5978 train_time:143771ms step_avg:169.14ms step:861/1530 train_loss:3.6493 train_time:143947ms step_avg:169.15ms step:862/1530 train_loss:3.6184 train_time:144126ms step_avg:169.16ms step:863/1530 train_loss:3.5835 train_time:144301ms step_avg:169.17ms step:864/1530 train_loss:3.3991 train_time:144474ms step_avg:169.17ms step:865/1530 train_loss:3.6164 train_time:144645ms step_avg:169.18ms step:866/1530 train_loss:3.9215 train_time:144823ms step_avg:169.19ms step:867/1530 train_loss:3.4725 train_time:144995ms step_avg:169.19ms step:868/1530 train_loss:3.6585 train_time:145169ms step_avg:169.19ms step:869/1530 train_loss:3.6332 train_time:145340ms step_avg:169.20ms step:870/1530 train_loss:3.4609 train_time:145515ms step_avg:169.20ms step:871/1530 train_loss:3.4269 train_time:145689ms step_avg:169.21ms step:872/1530 train_loss:3.6661 train_time:145863ms step_avg:169.21ms step:873/1530 train_loss:3.4721 train_time:146036ms step_avg:169.22ms step:874/1530 train_loss:3.2319 train_time:146216ms step_avg:169.23ms step:875/1530 train_loss:3.6470 train_time:146390ms step_avg:169.24ms 
step:875/1530 val_loss:3.5320 train_time:146439ms step_avg:169.29ms step:876/1530 train_loss:3.4524 train_time:146561ms step_avg:169.24ms step:877/1530 train_loss:3.6287 train_time:146737ms step_avg:169.25ms step:878/1530 train_loss:3.4845 train_time:146912ms step_avg:169.25ms step:879/1530 train_loss:3.6688 train_time:147085ms step_avg:169.26ms step:880/1530 train_loss:3.3246 train_time:147257ms step_avg:169.26ms step:881/1530 train_loss:3.4908 train_time:147430ms step_avg:169.27ms step:882/1530 train_loss:3.7101 train_time:147602ms step_avg:169.27ms step:883/1530 train_loss:3.8549 train_time:147775ms step_avg:169.27ms step:884/1530 train_loss:3.5794 train_time:147952ms step_avg:169.28ms step:885/1530 train_loss:3.5045 train_time:148125ms step_avg:169.29ms step:886/1530 train_loss:3.5828 train_time:148297ms step_avg:169.29ms step:887/1530 train_loss:4.1173 train_time:148471ms step_avg:169.29ms step:888/1530 train_loss:3.8550 train_time:148651ms step_avg:169.31ms step:889/1530 train_loss:3.5319 train_time:148825ms step_avg:169.31ms step:890/1530 train_loss:3.5455 train_time:148996ms step_avg:169.31ms step:891/1530 train_loss:3.3720 train_time:149170ms step_avg:169.32ms step:892/1530 train_loss:3.7315 train_time:149343ms step_avg:169.32ms step:893/1530 train_loss:3.4379 train_time:149515ms step_avg:169.33ms step:894/1530 train_loss:3.6713 train_time:149692ms step_avg:169.33ms step:895/1530 train_loss:3.6898 train_time:149867ms step_avg:169.34ms step:896/1530 train_loss:3.5219 train_time:150040ms step_avg:169.35ms step:897/1530 train_loss:3.5598 train_time:150215ms step_avg:169.35ms step:898/1530 train_loss:3.6059 train_time:150390ms step_avg:169.36ms step:899/1530 train_loss:3.4896 train_time:150561ms step_avg:169.36ms step:900/1530 train_loss:3.4415 train_time:150733ms step_avg:169.36ms step:901/1530 train_loss:3.6360 train_time:150907ms step_avg:169.37ms step:902/1530 train_loss:3.6457 train_time:151080ms step_avg:169.37ms step:903/1530 train_loss:3.5549 
train_time:151256ms step_avg:169.38ms step:904/1530 train_loss:3.5129 train_time:151430ms step_avg:169.39ms step:905/1530 train_loss:3.5095 train_time:151600ms step_avg:169.39ms step:906/1530 train_loss:3.7226 train_time:151775ms step_avg:169.39ms step:907/1530 train_loss:3.5295 train_time:151950ms step_avg:169.40ms step:908/1530 train_loss:3.5741 train_time:152122ms step_avg:169.40ms step:909/1530 train_loss:3.4705 train_time:152298ms step_avg:169.41ms step:910/1530 train_loss:3.5440 train_time:152478ms step_avg:169.42ms step:911/1530 train_loss:3.6539 train_time:152654ms step_avg:169.43ms step:912/1530 train_loss:3.6075 train_time:152834ms step_avg:169.44ms step:913/1530 train_loss:3.4840 train_time:153013ms step_avg:169.45ms step:914/1530 train_loss:3.7572 train_time:153191ms step_avg:169.46ms step:915/1530 train_loss:3.5443 train_time:153370ms step_avg:169.47ms step:916/1530 train_loss:3.6355 train_time:153547ms step_avg:169.48ms step:917/1530 train_loss:3.6117 train_time:153720ms step_avg:169.48ms step:918/1530 train_loss:4.8332 train_time:153898ms step_avg:169.49ms step:919/1530 train_loss:3.5124 train_time:154076ms step_avg:169.50ms step:920/1530 train_loss:3.6039 train_time:154251ms step_avg:169.51ms step:921/1530 train_loss:3.5639 train_time:154430ms step_avg:169.52ms step:922/1530 train_loss:3.5924 train_time:154607ms step_avg:169.53ms step:923/1530 train_loss:3.6311 train_time:154781ms step_avg:169.53ms step:924/1530 train_loss:3.6912 train_time:154956ms step_avg:169.54ms step:925/1530 train_loss:3.6666 train_time:155130ms step_avg:169.54ms step:926/1530 train_loss:3.5670 train_time:155303ms step_avg:169.54ms step:927/1530 train_loss:3.5687 train_time:155479ms step_avg:169.55ms step:928/1530 train_loss:3.7881 train_time:155655ms step_avg:169.56ms step:929/1530 train_loss:3.6226 train_time:155830ms step_avg:169.56ms step:930/1530 train_loss:3.4153 train_time:156007ms step_avg:169.57ms step:931/1530 train_loss:3.5117 train_time:156181ms step_avg:169.58ms 
step:932/1530 train_loss:3.6636 train_time:156358ms step_avg:169.59ms step:933/1530 train_loss:3.3807 train_time:156535ms step_avg:169.59ms step:934/1530 train_loss:3.6018 train_time:156712ms step_avg:169.60ms step:935/1530 train_loss:3.4532 train_time:156889ms step_avg:169.61ms step:936/1530 train_loss:3.5344 train_time:157065ms step_avg:169.62ms step:937/1530 train_loss:3.6316 train_time:157245ms step_avg:169.63ms step:938/1530 train_loss:3.5611 train_time:157418ms step_avg:169.63ms step:939/1530 train_loss:3.6934 train_time:157597ms step_avg:169.64ms step:940/1530 train_loss:3.4938 train_time:157772ms step_avg:169.65ms step:941/1530 train_loss:3.5670 train_time:157948ms step_avg:169.65ms step:942/1530 train_loss:3.3659 train_time:158123ms step_avg:169.66ms step:943/1530 train_loss:3.7260 train_time:158302ms step_avg:169.67ms step:944/1530 train_loss:3.4152 train_time:158615ms step_avg:169.82ms step:945/1530 train_loss:3.4381 train_time:158798ms step_avg:169.84ms step:946/1530 train_loss:5.0924 train_time:158978ms step_avg:169.85ms step:947/1530 train_loss:3.6147 train_time:159154ms step_avg:169.85ms step:948/1530 train_loss:3.4963 train_time:159330ms step_avg:169.86ms step:949/1530 train_loss:3.3912 train_time:159656ms step_avg:170.03ms step:950/1530 train_loss:3.4551 train_time:159832ms step_avg:170.03ms step:951/1530 train_loss:3.4228 train_time:160012ms step_avg:170.04ms step:952/1530 train_loss:3.4950 train_time:160188ms step_avg:170.05ms step:953/1530 train_loss:3.5817 train_time:160366ms step_avg:170.06ms step:954/1530 train_loss:3.4566 train_time:160546ms step_avg:170.07ms step:955/1530 train_loss:3.4919 train_time:160719ms step_avg:170.07ms step:956/1530 train_loss:3.4532 train_time:160893ms step_avg:170.08ms step:957/1530 train_loss:3.5095 train_time:161073ms step_avg:170.09ms step:958/1530 train_loss:3.5143 train_time:161253ms step_avg:170.10ms step:959/1530 train_loss:3.5220 train_time:161431ms step_avg:170.11ms step:960/1530 train_loss:3.4172 
train_time:161610ms step_avg:170.12ms step:961/1530 train_loss:3.6584 train_time:161785ms step_avg:170.12ms step:962/1530 train_loss:3.6069 train_time:161958ms step_avg:170.12ms step:963/1530 train_loss:3.8107 train_time:162137ms step_avg:170.13ms step:964/1530 train_loss:3.4492 train_time:162313ms step_avg:170.14ms step:965/1530 train_loss:3.4964 train_time:162486ms step_avg:170.14ms step:966/1530 train_loss:3.7230 train_time:162659ms step_avg:170.14ms step:967/1530 train_loss:3.5332 train_time:162835ms step_avg:170.15ms step:968/1530 train_loss:3.5330 train_time:163011ms step_avg:170.16ms step:969/1530 train_loss:3.5950 train_time:163186ms step_avg:170.16ms step:970/1530 train_loss:3.3936 train_time:163358ms step_avg:170.16ms step:971/1530 train_loss:3.5464 train_time:163533ms step_avg:170.17ms step:972/1530 train_loss:3.4974 train_time:163708ms step_avg:170.17ms step:973/1530 train_loss:3.5527 train_time:163882ms step_avg:170.18ms step:974/1530 train_loss:3.6072 train_time:164057ms step_avg:170.18ms step:975/1530 train_loss:3.4768 train_time:164234ms step_avg:170.19ms step:976/1530 train_loss:3.6830 train_time:164409ms step_avg:170.20ms step:977/1530 train_loss:3.5860 train_time:164584ms step_avg:170.20ms step:978/1530 train_loss:3.3766 train_time:164757ms step_avg:170.20ms step:979/1530 train_loss:3.6416 train_time:164933ms step_avg:170.21ms step:980/1530 train_loss:3.4275 train_time:165109ms step_avg:170.22ms step:981/1530 train_loss:3.5896 train_time:165285ms step_avg:170.22ms step:982/1530 train_loss:3.5565 train_time:165458ms step_avg:170.22ms step:983/1530 train_loss:3.5339 train_time:165636ms step_avg:170.23ms step:984/1530 train_loss:3.5101 train_time:165812ms step_avg:170.24ms step:985/1530 train_loss:3.5848 train_time:165989ms step_avg:170.25ms step:986/1530 train_loss:3.4249 train_time:166164ms step_avg:170.25ms step:987/1530 train_loss:3.4989 train_time:166336ms step_avg:170.25ms step:988/1530 train_loss:3.4858 train_time:166510ms step_avg:170.26ms 
step:989/1530 train_loss:3.4324 train_time:166683ms step_avg:170.26ms step:990/1530 train_loss:3.6714 train_time:166858ms step_avg:170.26ms step:991/1530 train_loss:3.4830 train_time:167033ms step_avg:170.27ms step:992/1530 train_loss:3.4562 train_time:167212ms step_avg:170.28ms step:993/1530 train_loss:3.5126 train_time:167391ms step_avg:170.29ms step:994/1530 train_loss:3.6119 train_time:167565ms step_avg:170.29ms step:995/1530 train_loss:3.5506 train_time:167737ms step_avg:170.29ms step:996/1530 train_loss:3.4651 train_time:167910ms step_avg:170.29ms step:997/1530 train_loss:3.7704 train_time:168085ms step_avg:170.30ms step:998/1530 train_loss:3.4499 train_time:168257ms step_avg:170.30ms step:999/1530 train_loss:3.5977 train_time:168433ms step_avg:170.31ms step:1000/1530 train_loss:3.4490 train_time:168611ms step_avg:170.31ms step:1000/1530 val_loss:3.4801 train_time:168663ms step_avg:170.37ms step:1001/1530 train_loss:3.5097 train_time:168788ms step_avg:170.32ms step:1002/1530 train_loss:3.3852 train_time:168964ms step_avg:170.33ms step:1003/1530 train_loss:3.5726 train_time:169140ms step_avg:170.33ms step:1004/1530 train_loss:3.6162 train_time:169315ms step_avg:170.34ms step:1005/1530 train_loss:3.3995 train_time:169491ms step_avg:170.34ms step:1006/1530 train_loss:3.4780 train_time:169668ms step_avg:170.35ms step:1007/1530 train_loss:3.4490 train_time:169844ms step_avg:170.36ms step:1008/1530 train_loss:3.5734 train_time:170021ms step_avg:170.36ms step:1009/1530 train_loss:3.6747 train_time:170197ms step_avg:170.37ms step:1010/1530 train_loss:3.5714 train_time:170370ms step_avg:170.37ms step:1011/1530 train_loss:3.5474 train_time:170543ms step_avg:170.37ms step:1012/1530 train_loss:3.4001 train_time:170718ms step_avg:170.38ms step:1013/1530 train_loss:3.5449 train_time:170892ms step_avg:170.38ms step:1014/1530 train_loss:3.6346 train_time:171071ms step_avg:170.39ms step:1015/1530 train_loss:3.3383 train_time:171248ms step_avg:170.40ms step:1016/1530 
train_loss:3.4220 train_time:171423ms step_avg:170.40ms step:1017/1530 train_loss:3.4118 train_time:171598ms step_avg:170.41ms step:1018/1530 train_loss:3.4044 train_time:171774ms step_avg:170.41ms step:1019/1530 train_loss:3.5285 train_time:171949ms step_avg:170.42ms step:1020/1530 train_loss:3.3965 train_time:172127ms step_avg:170.42ms step:1021/1530 train_loss:3.3621 train_time:172300ms step_avg:170.43ms step:1022/1530 train_loss:3.4902 train_time:172477ms step_avg:170.43ms step:1023/1530 train_loss:3.5198 train_time:172653ms step_avg:170.44ms step:1024/1530 train_loss:3.4905 train_time:172831ms step_avg:170.45ms step:1025/1530 train_loss:3.4933 train_time:173009ms step_avg:170.45ms step:1026/1530 train_loss:3.6269 train_time:173185ms step_avg:170.46ms step:1027/1530 train_loss:3.3299 train_time:173363ms step_avg:170.46ms step:1028/1530 train_loss:3.4120 train_time:173543ms step_avg:170.47ms step:1029/1530 train_loss:3.3244 train_time:173724ms step_avg:170.48ms step:1030/1530 train_loss:3.5490 train_time:173898ms step_avg:170.49ms step:1031/1530 train_loss:3.5231 train_time:174074ms step_avg:170.49ms step:1032/1530 train_loss:3.7064 train_time:174254ms step_avg:170.50ms step:1033/1530 train_loss:3.4991 train_time:174430ms step_avg:170.51ms step:1034/1530 train_loss:3.4179 train_time:174609ms step_avg:170.52ms step:1035/1530 train_loss:3.4562 train_time:174787ms step_avg:170.52ms step:1036/1530 train_loss:3.4989 train_time:174965ms step_avg:170.53ms step:1037/1530 train_loss:3.8018 train_time:175141ms step_avg:170.54ms step:1038/1530 train_loss:3.6281 train_time:175319ms step_avg:170.54ms step:1039/1530 train_loss:3.5224 train_time:175498ms step_avg:170.55ms step:1040/1530 train_loss:3.4292 train_time:175673ms step_avg:170.56ms step:1041/1530 train_loss:3.5018 train_time:175852ms step_avg:170.56ms step:1042/1530 train_loss:3.5355 train_time:176026ms step_avg:170.57ms step:1043/1530 train_loss:3.4589 train_time:176201ms step_avg:170.57ms step:1044/1530 
train_loss:3.4706 train_time:176375ms step_avg:170.58ms step:1045/1530 train_loss:3.5235 train_time:176553ms step_avg:170.58ms step:1046/1530 train_loss:3.4379 train_time:176727ms step_avg:170.59ms step:1047/1530 train_loss:3.6432 train_time:176904ms step_avg:170.59ms step:1048/1530 train_loss:3.5096 train_time:177079ms step_avg:170.60ms step:1049/1530 train_loss:3.4195 train_time:177255ms step_avg:170.60ms step:1050/1530 train_loss:3.4065 train_time:177433ms step_avg:170.61ms step:1051/1530 train_loss:3.5090 train_time:177610ms step_avg:170.61ms step:1052/1530 train_loss:3.3757 train_time:177789ms step_avg:170.62ms step:1053/1530 train_loss:3.6980 train_time:177967ms step_avg:170.63ms step:1054/1530 train_loss:3.5504 train_time:178146ms step_avg:170.64ms step:1055/1530 train_loss:3.3900 train_time:178322ms step_avg:170.64ms step:1056/1530 train_loss:3.5060 train_time:178497ms step_avg:170.65ms step:1057/1530 train_loss:3.5904 train_time:178675ms step_avg:170.65ms step:1058/1530 train_loss:3.3155 train_time:178853ms step_avg:170.66ms step:1059/1530 train_loss:3.3922 train_time:179034ms step_avg:170.67ms step:1060/1530 train_loss:3.4539 train_time:179209ms step_avg:170.68ms step:1061/1530 train_loss:3.4303 train_time:179384ms step_avg:170.68ms step:1062/1530 train_loss:3.3912 train_time:179560ms step_avg:170.68ms step:1063/1530 train_loss:3.4689 train_time:179735ms step_avg:170.69ms step:1064/1530 train_loss:3.3931 train_time:179908ms step_avg:170.69ms step:1065/1530 train_loss:3.3690 train_time:180086ms step_avg:170.70ms step:1066/1530 train_loss:3.4233 train_time:180263ms step_avg:170.70ms step:1067/1530 train_loss:3.2942 train_time:180441ms step_avg:170.71ms step:1068/1530 train_loss:3.4483 train_time:180617ms step_avg:170.72ms step:1069/1530 train_loss:3.3062 train_time:180799ms step_avg:170.73ms step:1070/1530 train_loss:3.5809 train_time:180973ms step_avg:170.73ms step:1071/1530 train_loss:3.5237 train_time:181152ms step_avg:170.74ms step:1072/1530 
train_loss:3.4564 train_time:181327ms step_avg:170.74ms step:1073/1530 train_loss:3.5351 train_time:181500ms step_avg:170.74ms step:1074/1530 train_loss:3.4376 train_time:181675ms step_avg:170.75ms step:1075/1530 train_loss:3.4108 train_time:181853ms step_avg:170.75ms step:1076/1530 train_loss:3.8092 train_time:182029ms step_avg:170.76ms step:1077/1530 train_loss:3.4531 train_time:182205ms step_avg:170.76ms step:1078/1530 train_loss:3.1033 train_time:182389ms step_avg:170.78ms step:1079/1530 train_loss:3.5441 train_time:182566ms step_avg:170.78ms step:1080/1530 train_loss:3.4385 train_time:182746ms step_avg:170.79ms step:1081/1530 train_loss:3.5139 train_time:182921ms step_avg:170.79ms step:1082/1530 train_loss:3.5991 train_time:183095ms step_avg:170.80ms step:1083/1530 train_loss:3.5054 train_time:183270ms step_avg:170.80ms step:1084/1530 train_loss:3.4748 train_time:183446ms step_avg:170.81ms step:1085/1530 train_loss:3.4421 train_time:183621ms step_avg:170.81ms step:1086/1530 train_loss:3.6389 train_time:183799ms step_avg:170.82ms step:1087/1530 train_loss:3.5132 train_time:183973ms step_avg:170.82ms step:1088/1530 train_loss:3.3800 train_time:184150ms step_avg:170.83ms step:1089/1530 train_loss:3.3829 train_time:184330ms step_avg:170.83ms step:1090/1530 train_loss:3.4909 train_time:184508ms step_avg:170.84ms step:1091/1530 train_loss:3.3015 train_time:184685ms step_avg:170.85ms step:1092/1530 train_loss:3.4982 train_time:184862ms step_avg:170.85ms step:1093/1530 train_loss:3.6148 train_time:185037ms step_avg:170.86ms step:1094/1530 train_loss:3.4565 train_time:185212ms step_avg:170.86ms step:1095/1530 train_loss:3.4300 train_time:185386ms step_avg:170.86ms step:1096/1530 train_loss:3.4387 train_time:185563ms step_avg:170.87ms step:1097/1530 train_loss:3.4950 train_time:185740ms step_avg:170.87ms step:1098/1530 train_loss:3.5783 train_time:185918ms step_avg:170.88ms step:1099/1530 train_loss:3.5388 train_time:186095ms step_avg:170.89ms step:1100/1530 
train_loss:3.4417 train_time:186275ms step_avg:170.89ms step:1101/1530 train_loss:3.3012 train_time:186453ms step_avg:170.90ms step:1102/1530 train_loss:3.3219 train_time:186631ms step_avg:170.91ms step:1103/1530 train_loss:3.4538 train_time:186813ms step_avg:170.92ms step:1104/1530 train_loss:3.3277 train_time:186989ms step_avg:170.92ms step:1105/1530 train_loss:4.0654 train_time:187169ms step_avg:170.93ms step:1106/1530 train_loss:3.2405 train_time:187344ms step_avg:170.93ms step:1107/1530 train_loss:3.5793 train_time:187520ms step_avg:170.94ms step:1108/1530 train_loss:3.3590 train_time:187693ms step_avg:170.94ms step:1109/1530 train_loss:3.5124 train_time:187868ms step_avg:170.94ms step:1110/1530 train_loss:3.4383 train_time:188042ms step_avg:170.95ms step:1111/1530 train_loss:3.4961 train_time:188216ms step_avg:170.95ms step:1112/1530 train_loss:3.5724 train_time:188395ms step_avg:170.96ms step:1113/1530 train_loss:3.4405 train_time:188579ms step_avg:170.97ms step:1114/1530 train_loss:3.3875 train_time:188758ms step_avg:170.98ms step:1115/1530 train_loss:3.2477 train_time:188935ms step_avg:170.98ms step:1116/1530 train_loss:3.4383 train_time:189107ms step_avg:170.98ms step:1117/1530 train_loss:3.6009 train_time:189286ms step_avg:170.99ms step:1118/1530 train_loss:3.6409 train_time:189464ms step_avg:171.00ms step:1119/1530 train_loss:3.4886 train_time:189638ms step_avg:171.00ms step:1120/1530 train_loss:3.5023 train_time:189814ms step_avg:171.00ms step:1121/1530 train_loss:3.4005 train_time:189990ms step_avg:171.01ms step:1122/1530 train_loss:3.4711 train_time:190167ms step_avg:171.01ms step:1123/1530 train_loss:3.5919 train_time:190345ms step_avg:171.02ms step:1124/1530 train_loss:3.3508 train_time:190519ms step_avg:171.02ms step:1125/1530 train_loss:3.2429 train_time:190694ms step_avg:171.03ms step:1125/1530 val_loss:3.4198 train_time:190745ms step_avg:171.07ms step:1126/1530 train_loss:3.4863 train_time:190871ms step_avg:171.03ms step:1127/1530 
train_loss:3.6792 train_time:191049ms step_avg:171.04ms step:1128/1530 train_loss:3.2412 train_time:191225ms step_avg:171.04ms step:1129/1530 train_loss:3.5675 train_time:191405ms step_avg:171.05ms step:1130/1530 train_loss:3.3915 train_time:191583ms step_avg:171.06ms step:1131/1530 train_loss:3.4116 train_time:191765ms step_avg:171.07ms step:1132/1530 train_loss:3.3792 train_time:191939ms step_avg:171.07ms step:1133/1530 train_loss:3.4991 train_time:192251ms step_avg:171.19ms step:1134/1530 train_loss:3.4596 train_time:192434ms step_avg:171.20ms step:1135/1530 train_loss:3.5321 train_time:192610ms step_avg:171.21ms step:1136/1530 train_loss:3.5776 train_time:192787ms step_avg:171.21ms step:1137/1530 train_loss:3.4714 train_time:192965ms step_avg:171.22ms step:1138/1530 train_loss:3.3685 train_time:193144ms step_avg:171.23ms step:1139/1530 train_loss:3.6652 train_time:193472ms step_avg:171.37ms step:1140/1530 train_loss:3.4687 train_time:193650ms step_avg:171.37ms step:1141/1530 train_loss:3.5995 train_time:193831ms step_avg:171.38ms step:1142/1530 train_loss:3.4606 train_time:194007ms step_avg:171.38ms step:1143/1530 train_loss:3.3763 train_time:194187ms step_avg:171.39ms step:1144/1530 train_loss:3.4552 train_time:194365ms step_avg:171.40ms step:1145/1530 train_loss:3.6038 train_time:194538ms step_avg:171.40ms step:1146/1530 train_loss:3.5689 train_time:194719ms step_avg:171.41ms step:1147/1530 train_loss:3.5053 train_time:194898ms step_avg:171.41ms step:1148/1530 train_loss:3.5073 train_time:195076ms step_avg:171.42ms step:1149/1530 train_loss:3.3392 train_time:195257ms step_avg:171.43ms step:1150/1530 train_loss:3.3863 train_time:195432ms step_avg:171.43ms step:1151/1530 train_loss:3.3360 train_time:195612ms step_avg:171.44ms step:1152/1530 train_loss:3.4202 train_time:195793ms step_avg:171.45ms step:1153/1530 train_loss:3.4441 train_time:195975ms step_avg:171.46ms step:1154/1530 train_loss:3.5313 train_time:196151ms step_avg:171.46ms step:1155/1530 
train_loss:3.3331 train_time:196332ms step_avg:171.47ms step:1156/1530 train_loss:3.5485 train_time:196515ms step_avg:171.48ms step:1157/1530 train_loss:3.5059 train_time:196692ms step_avg:171.48ms step:1158/1530 train_loss:3.2620 train_time:196869ms step_avg:171.49ms step:1159/1530 train_loss:3.3614 train_time:197044ms step_avg:171.49ms step:1160/1530 train_loss:3.3477 train_time:197218ms step_avg:171.49ms step:1161/1530 train_loss:3.1042 train_time:197398ms step_avg:171.50ms step:1162/1530 train_loss:3.4335 train_time:197577ms step_avg:171.51ms step:1163/1530 train_loss:3.4041 train_time:197756ms step_avg:171.51ms step:1164/1530 train_loss:3.3023 train_time:197934ms step_avg:171.52ms step:1165/1530 train_loss:3.2597 train_time:198109ms step_avg:171.52ms step:1166/1530 train_loss:3.4015 train_time:198287ms step_avg:171.53ms step:1167/1530 train_loss:3.4215 train_time:198463ms step_avg:171.53ms step:1168/1530 train_loss:3.7330 train_time:198638ms step_avg:171.54ms step:1169/1530 train_loss:3.3860 train_time:198815ms step_avg:171.54ms step:1170/1530 train_loss:3.4010 train_time:198993ms step_avg:171.55ms step:1171/1530 train_loss:3.3039 train_time:199168ms step_avg:171.55ms step:1172/1530 train_loss:3.4354 train_time:199343ms step_avg:171.55ms step:1173/1530 train_loss:3.5550 train_time:199524ms step_avg:171.56ms step:1174/1530 train_loss:3.3880 train_time:199709ms step_avg:171.57ms step:1175/1530 train_loss:3.3799 train_time:199889ms step_avg:171.58ms step:1176/1530 train_loss:3.4363 train_time:200071ms step_avg:171.59ms step:1177/1530 train_loss:3.4572 train_time:200255ms step_avg:171.60ms step:1178/1530 train_loss:3.5072 train_time:200431ms step_avg:171.60ms step:1179/1530 train_loss:3.4117 train_time:200606ms step_avg:171.60ms step:1180/1530 train_loss:3.3708 train_time:200792ms step_avg:171.62ms step:1181/1530 train_loss:3.3490 train_time:200971ms step_avg:171.62ms step:1182/1530 train_loss:3.3883 train_time:201147ms step_avg:171.63ms step:1183/1530 
train_loss:3.3494 train_time:201324ms step_avg:171.63ms step:1184/1530 train_loss:3.5211 train_time:201503ms step_avg:171.64ms step:1185/1530 train_loss:3.5523 train_time:201684ms step_avg:171.65ms step:1186/1530 train_loss:3.3799 train_time:201864ms step_avg:171.65ms step:1187/1530 train_loss:3.4302 train_time:202051ms step_avg:171.67ms step:1188/1530 train_loss:3.4510 train_time:202227ms step_avg:171.67ms step:1189/1530 train_loss:3.2862 train_time:202407ms step_avg:171.68ms step:1190/1530 train_loss:3.4561 train_time:202585ms step_avg:171.68ms step:1191/1530 train_loss:3.5957 train_time:202766ms step_avg:171.69ms step:1192/1530 train_loss:3.4037 train_time:202940ms step_avg:171.69ms step:1193/1530 train_loss:3.2835 train_time:203115ms step_avg:171.69ms step:1194/1530 train_loss:3.5645 train_time:203293ms step_avg:171.70ms step:1195/1530 train_loss:3.3855 train_time:203474ms step_avg:171.71ms step:1196/1530 train_loss:3.3959 train_time:203659ms step_avg:171.72ms step:1197/1530 train_loss:3.3023 train_time:203838ms step_avg:171.73ms step:1198/1530 train_loss:3.3120 train_time:204023ms step_avg:171.74ms step:1199/1530 train_loss:3.3549 train_time:204203ms step_avg:171.74ms step:1200/1530 train_loss:3.4564 train_time:204380ms step_avg:171.75ms step:1201/1530 train_loss:3.4937 train_time:204559ms step_avg:171.75ms step:1202/1530 train_loss:3.6187 train_time:204746ms step_avg:171.77ms step:1203/1530 train_loss:3.4158 train_time:204926ms step_avg:171.77ms step:1204/1530 train_loss:3.3193 train_time:205106ms step_avg:171.78ms step:1205/1530 train_loss:3.4494 train_time:205282ms step_avg:171.78ms step:1206/1530 train_loss:3.4835 train_time:205459ms step_avg:171.79ms step:1207/1530 train_loss:3.5303 train_time:205637ms step_avg:171.79ms step:1208/1530 train_loss:3.4068 train_time:205813ms step_avg:171.80ms step:1209/1530 train_loss:3.2565 train_time:205993ms step_avg:171.80ms step:1210/1530 train_loss:3.3159 train_time:206172ms step_avg:171.81ms step:1211/1530 
train_loss:3.4098 train_time:206349ms step_avg:171.81ms step:1212/1530 train_loss:3.4104 train_time:206525ms step_avg:171.82ms step:1213/1530 train_loss:3.4203 train_time:206703ms step_avg:171.82ms step:1214/1530 train_loss:3.2685 train_time:206885ms step_avg:171.83ms step:1215/1530 train_loss:3.4083 train_time:207061ms step_avg:171.83ms step:1216/1530 train_loss:3.3422 train_time:207238ms step_avg:171.84ms step:1217/1530 train_loss:3.3342 train_time:207414ms step_avg:171.84ms step:1218/1530 train_loss:3.4187 train_time:207594ms step_avg:171.85ms step:1219/1530 train_loss:3.2627 train_time:207777ms step_avg:171.86ms step:1220/1530 train_loss:3.4861 train_time:207953ms step_avg:171.86ms step:1221/1530 train_loss:3.5185 train_time:208129ms step_avg:171.87ms step:1222/1530 train_loss:3.4417 train_time:208303ms step_avg:171.87ms step:1223/1530 train_loss:3.3041 train_time:208480ms step_avg:171.87ms step:1224/1530 train_loss:3.2686 train_time:208663ms step_avg:171.88ms step:1225/1530 train_loss:3.3786 train_time:208840ms step_avg:171.88ms step:1226/1530 train_loss:3.3415 train_time:209020ms step_avg:171.89ms step:1227/1530 train_loss:3.2910 train_time:209199ms step_avg:171.90ms step:1228/1530 train_loss:3.4547 train_time:209375ms step_avg:171.90ms step:1229/1530 train_loss:3.3816 train_time:209555ms step_avg:171.91ms step:1230/1530 train_loss:3.4099 train_time:209736ms step_avg:171.92ms step:1231/1530 train_loss:3.5912 train_time:209916ms step_avg:171.92ms step:1232/1530 train_loss:3.5069 train_time:210097ms step_avg:171.93ms step:1233/1530 train_loss:3.4391 train_time:210274ms step_avg:171.93ms step:1234/1530 train_loss:3.5947 train_time:210452ms step_avg:171.94ms step:1235/1530 train_loss:3.3363 train_time:210632ms step_avg:171.94ms step:1236/1530 train_loss:3.3013 train_time:210808ms step_avg:171.95ms step:1237/1530 train_loss:3.2772 train_time:210985ms step_avg:171.95ms step:1238/1530 train_loss:3.2930 train_time:211171ms step_avg:171.96ms step:1239/1530 
train_loss:3.3436 train_time:211348ms step_avg:171.97ms step:1240/1530 train_loss:3.3960 train_time:211525ms step_avg:171.97ms step:1241/1530 train_loss:3.4381 train_time:211704ms step_avg:171.98ms step:1242/1530 train_loss:3.3089 train_time:211880ms step_avg:171.98ms step:1243/1530 train_loss:3.4162 train_time:212059ms step_avg:171.99ms step:1244/1530 train_loss:3.4193 train_time:212233ms step_avg:171.99ms step:1245/1530 train_loss:3.4207 train_time:212409ms step_avg:171.99ms step:1246/1530 train_loss:3.2558 train_time:212588ms step_avg:172.00ms step:1247/1530 train_loss:3.3810 train_time:212764ms step_avg:172.00ms step:1248/1530 train_loss:3.4389 train_time:212941ms step_avg:172.00ms step:1249/1530 train_loss:3.4374 train_time:213120ms step_avg:172.01ms step:1250/1530 train_loss:3.3197 train_time:213300ms step_avg:172.02ms step:1250/1530 val_loss:3.3668 train_time:213354ms step_avg:172.06ms step:1251/1530 train_loss:3.5083 train_time:213484ms step_avg:172.03ms step:1252/1530 train_loss:3.3709 train_time:213660ms step_avg:172.03ms step:1253/1530 train_loss:3.3194 train_time:213838ms step_avg:172.03ms step:1254/1530 train_loss:3.4305 train_time:214019ms step_avg:172.04ms step:1255/1530 train_loss:3.5313 train_time:214209ms step_avg:172.06ms step:1256/1530 train_loss:3.3185 train_time:214390ms step_avg:172.06ms step:1257/1530 train_loss:3.3833 train_time:214567ms step_avg:172.07ms step:1258/1530 train_loss:3.3750 train_time:214750ms step_avg:172.08ms step:1259/1530 train_loss:3.3394 train_time:214929ms step_avg:172.08ms step:1260/1530 train_loss:3.2192 train_time:215104ms step_avg:172.08ms step:1261/1530 train_loss:3.3199 train_time:215283ms step_avg:172.09ms step:1262/1530 train_loss:3.3388 train_time:215465ms step_avg:172.10ms step:1263/1530 train_loss:3.2474 train_time:215646ms step_avg:172.10ms step:1264/1530 train_loss:3.4510 train_time:215821ms step_avg:172.11ms step:1265/1530 train_loss:3.4359 train_time:215995ms step_avg:172.11ms step:1266/1530 
train_loss:3.4484 train_time:216174ms step_avg:172.11ms step:1267/1530 train_loss:3.3822 train_time:216353ms step_avg:172.12ms step:1268/1530 train_loss:3.4214 train_time:216533ms step_avg:172.12ms step:1269/1530 train_loss:3.2663 train_time:216716ms step_avg:172.13ms step:1270/1530 train_loss:3.1129 train_time:216893ms step_avg:172.14ms step:1271/1530 train_loss:3.4180 train_time:217073ms step_avg:172.14ms step:1272/1530 train_loss:3.3647 train_time:217249ms step_avg:172.15ms step:1273/1530 train_loss:3.3907 train_time:217428ms step_avg:172.15ms step:1274/1530 train_loss:3.3717 train_time:217606ms step_avg:172.16ms step:1275/1530 train_loss:3.4450 train_time:217781ms step_avg:172.16ms step:1276/1530 train_loss:3.4784 train_time:217955ms step_avg:172.16ms step:1277/1530 train_loss:3.4249 train_time:218135ms step_avg:172.17ms step:1278/1530 train_loss:3.4203 train_time:218310ms step_avg:172.17ms step:1279/1530 train_loss:3.2737 train_time:218490ms step_avg:172.18ms step:1280/1530 train_loss:3.3725 train_time:218677ms step_avg:172.19ms step:1281/1530 train_loss:3.4368 train_time:218856ms step_avg:172.19ms step:1282/1530 train_loss:3.4796 train_time:219032ms step_avg:172.19ms step:1283/1530 train_loss:3.3451 train_time:219212ms step_avg:172.20ms step:1284/1530 train_loss:3.3801 train_time:219390ms step_avg:172.21ms step:1285/1530 train_loss:3.3739 train_time:219568ms step_avg:172.21ms step:1286/1530 train_loss:3.3457 train_time:219745ms step_avg:172.21ms step:1287/1530 train_loss:3.4969 train_time:219922ms step_avg:172.22ms step:1288/1530 train_loss:3.3058 train_time:220102ms step_avg:172.22ms step:1289/1530 train_loss:3.3929 train_time:220289ms step_avg:172.24ms step:1290/1530 train_loss:3.4753 train_time:220475ms step_avg:172.25ms step:1291/1530 train_loss:3.3903 train_time:220656ms step_avg:172.25ms step:1292/1530 train_loss:3.4944 train_time:220837ms step_avg:172.26ms step:1293/1530 train_loss:3.5232 train_time:221016ms step_avg:172.27ms step:1294/1530 
train_loss:3.4687 train_time:221197ms step_avg:172.27ms step:1295/1530 train_loss:3.2959 train_time:221377ms step_avg:172.28ms step:1296/1530 train_loss:3.3840 train_time:221558ms step_avg:172.28ms step:1297/1530 train_loss:3.2896 train_time:221738ms step_avg:172.29ms step:1298/1530 train_loss:3.2883 train_time:221919ms step_avg:172.30ms step:1299/1530 train_loss:3.4100 train_time:222096ms step_avg:172.30ms step:1300/1530 train_loss:3.4146 train_time:222274ms step_avg:172.31ms step:1301/1530 train_loss:3.4135 train_time:222452ms step_avg:172.31ms step:1302/1530 train_loss:3.5886 train_time:222634ms step_avg:172.32ms step:1303/1530 train_loss:3.3204 train_time:222815ms step_avg:172.32ms step:1304/1530 train_loss:3.5295 train_time:222997ms step_avg:172.33ms step:1305/1530 train_loss:3.2681 train_time:223174ms step_avg:172.33ms step:1306/1530 train_loss:3.4634 train_time:223356ms step_avg:172.34ms step:1307/1530 train_loss:3.4658 train_time:223531ms step_avg:172.34ms step:1308/1530 train_loss:3.2999 train_time:223707ms step_avg:172.35ms step:1309/1530 train_loss:3.3256 train_time:223885ms step_avg:172.35ms step:1310/1530 train_loss:3.2954 train_time:224065ms step_avg:172.36ms step:1311/1530 train_loss:3.3051 train_time:224243ms step_avg:172.36ms step:1312/1530 train_loss:3.3845 train_time:224423ms step_avg:172.37ms step:1313/1530 train_loss:3.3508 train_time:224599ms step_avg:172.37ms step:1314/1530 train_loss:3.0539 train_time:224782ms step_avg:172.38ms step:1315/1530 train_loss:3.2882 train_time:224958ms step_avg:172.38ms step:1316/1530 train_loss:3.4091 train_time:225134ms step_avg:172.38ms step:1317/1530 train_loss:3.4328 train_time:225312ms step_avg:172.39ms step:1318/1530 train_loss:3.3103 train_time:225498ms step_avg:172.40ms step:1319/1530 train_loss:3.4346 train_time:225677ms step_avg:172.40ms step:1320/1530 train_loss:3.4728 train_time:225860ms step_avg:172.41ms step:1321/1530 train_loss:3.3782 train_time:226038ms step_avg:172.42ms step:1322/1530 
train_loss:3.3333 train_time:226350ms step_avg:172.52ms step:1323/1530 train_loss:3.3333 train_time:226538ms step_avg:172.53ms step:1324/1530 train_loss:3.4487 train_time:226718ms step_avg:172.54ms step:1325/1530 train_loss:3.5034 train_time:226904ms step_avg:172.55ms step:1326/1530 train_loss:3.2226 train_time:227083ms step_avg:172.56ms step:1327/1530 train_loss:3.1794 train_time:227260ms step_avg:172.56ms step:1328/1530 train_loss:3.5037 train_time:227439ms step_avg:172.56ms step:1329/1530 train_loss:3.3186 train_time:227782ms step_avg:172.69ms step:1330/1530 train_loss:3.4378 train_time:227963ms step_avg:172.70ms step:1331/1530 train_loss:3.3472 train_time:228138ms step_avg:172.70ms step:1332/1530 train_loss:3.7557 train_time:228321ms step_avg:172.71ms step:1333/1530 train_loss:3.4906 train_time:228501ms step_avg:172.71ms step:1334/1530 train_loss:3.3807 train_time:228680ms step_avg:172.72ms step:1335/1530 train_loss:3.3010 train_time:228859ms step_avg:172.72ms step:1336/1530 train_loss:3.3071 train_time:229043ms step_avg:172.73ms step:1337/1530 train_loss:3.5605 train_time:229224ms step_avg:172.74ms step:1338/1530 train_loss:3.5362 train_time:229402ms step_avg:172.74ms step:1339/1530 train_loss:3.3490 train_time:229581ms step_avg:172.75ms step:1340/1530 train_loss:3.2940 train_time:229760ms step_avg:172.75ms step:1341/1530 train_loss:3.6059 train_time:229938ms step_avg:172.76ms step:1342/1530 train_loss:3.3673 train_time:230118ms step_avg:172.76ms step:1343/1530 train_loss:3.3758 train_time:230297ms step_avg:172.77ms step:1344/1530 train_loss:3.4250 train_time:230478ms step_avg:172.77ms step:1345/1530 train_loss:3.3943 train_time:230660ms step_avg:172.78ms step:1346/1530 train_loss:3.3111 train_time:230837ms step_avg:172.78ms step:1347/1530 train_loss:3.2862 train_time:231014ms step_avg:172.79ms step:1348/1530 train_loss:3.3595 train_time:231191ms step_avg:172.79ms step:1349/1530 train_loss:3.2856 train_time:231367ms step_avg:172.79ms step:1350/1530 
train_loss:3.4015 train_time:231549ms step_avg:172.80ms step:1351/1530 train_loss:3.2540 train_time:231723ms step_avg:172.80ms step:1352/1530 train_loss:3.3171 train_time:231901ms step_avg:172.80ms step:1353/1530 train_loss:3.4139 train_time:232079ms step_avg:172.81ms step:1354/1530 train_loss:3.2719 train_time:232258ms step_avg:172.81ms step:1355/1530 train_loss:3.1949 train_time:232435ms step_avg:172.81ms step:1356/1530 train_loss:3.5186 train_time:232615ms step_avg:172.82ms step:1357/1530 train_loss:3.4357 train_time:232796ms step_avg:172.83ms step:1358/1530 train_loss:3.1979 train_time:232974ms step_avg:172.83ms step:1359/1530 train_loss:3.4550 train_time:233155ms step_avg:172.84ms step:1360/1530 train_loss:3.3607 train_time:233336ms step_avg:172.84ms step:1361/1530 train_loss:3.1446 train_time:233520ms step_avg:172.85ms step:1362/1530 train_loss:3.4020 train_time:233701ms step_avg:172.86ms step:1363/1530 train_loss:3.2942 train_time:233889ms step_avg:172.87ms step:1364/1530 train_loss:3.3138 train_time:234068ms step_avg:172.87ms step:1365/1530 train_loss:3.3236 train_time:234244ms step_avg:172.87ms step:1366/1530 train_loss:3.4358 train_time:234424ms step_avg:172.88ms step:1367/1530 train_loss:3.4113 train_time:234604ms step_avg:172.88ms step:1368/1530 train_loss:3.3582 train_time:234784ms step_avg:172.89ms step:1369/1530 train_loss:3.2854 train_time:234972ms step_avg:172.90ms step:1370/1530 train_loss:3.6160 train_time:235153ms step_avg:172.91ms step:1371/1530 train_loss:3.3264 train_time:235335ms step_avg:172.91ms step:1372/1530 train_loss:3.3839 train_time:235517ms step_avg:172.92ms step:1373/1530 train_loss:3.3842 train_time:235697ms step_avg:172.93ms step:1374/1530 train_loss:3.1611 train_time:235878ms step_avg:172.93ms step:1375/1530 train_loss:3.5511 train_time:236059ms step_avg:172.94ms step:1375/1530 val_loss:3.3248 train_time:236110ms step_avg:172.97ms step:1376/1530 train_loss:3.3593 train_time:236238ms step_avg:172.94ms step:1377/1530 
train_loss:3.4918 train_time:236419ms step_avg:172.95ms step:1378/1530 train_loss:3.4799 train_time:236596ms step_avg:172.95ms step:1379/1530 train_loss:3.1328 train_time:236778ms step_avg:172.96ms step:1380/1530 train_loss:3.3284 train_time:236958ms step_avg:172.96ms step:1381/1530 train_loss:3.7195 train_time:237142ms step_avg:172.97ms step:1382/1530 train_loss:3.2201 train_time:237321ms step_avg:172.97ms step:1383/1530 train_loss:3.4068 train_time:237500ms step_avg:172.98ms step:1384/1530 train_loss:3.4853 train_time:237685ms step_avg:172.99ms step:1385/1530 train_loss:3.4144 train_time:237861ms step_avg:172.99ms step:1386/1530 train_loss:3.3587 train_time:238039ms step_avg:172.99ms step:1387/1530 train_loss:3.2121 train_time:238218ms step_avg:173.00ms step:1388/1530 train_loss:3.3599 train_time:238396ms step_avg:173.00ms step:1389/1530 train_loss:3.3265 train_time:238578ms step_avg:173.01ms step:1390/1530 train_loss:3.5806 train_time:238755ms step_avg:173.01ms step:1391/1530 train_loss:3.3019 train_time:238934ms step_avg:173.02ms step:1392/1530 train_loss:3.2968 train_time:239114ms step_avg:173.02ms step:1393/1530 train_loss:3.2507 train_time:239294ms step_avg:173.03ms step:1394/1530 train_loss:3.5120 train_time:239471ms step_avg:173.03ms step:1395/1530 train_loss:3.4095 train_time:239650ms step_avg:173.03ms step:1396/1530 train_loss:3.4195 train_time:239828ms step_avg:173.04ms step:1397/1530 train_loss:3.3218 train_time:240004ms step_avg:173.04ms step:1398/1530 train_loss:3.2667 train_time:240180ms step_avg:173.04ms step:1399/1530 train_loss:3.3293 train_time:240359ms step_avg:173.04ms step:1400/1530 train_loss:3.3322 train_time:240544ms step_avg:173.05ms step:1401/1530 train_loss:3.3617 train_time:240722ms step_avg:173.06ms step:1402/1530 train_loss:3.3117 train_time:240900ms step_avg:173.06ms step:1403/1530 train_loss:3.5070 train_time:241085ms step_avg:173.07ms step:1404/1530 train_loss:3.2912 train_time:241263ms step_avg:173.07ms step:1405/1530 
train_loss:3.3288 train_time:241444ms step_avg:173.08ms step:1406/1530 train_loss:3.3282 train_time:241624ms step_avg:173.08ms step:1407/1530 train_loss:3.1865 train_time:241800ms step_avg:173.09ms step:1408/1530 train_loss:3.3240 train_time:241980ms step_avg:173.09ms step:1409/1530 train_loss:3.3140 train_time:242170ms step_avg:173.10ms step:1410/1530 train_loss:3.3022 train_time:242348ms step_avg:173.11ms step:1411/1530 train_loss:3.3787 train_time:242524ms step_avg:173.11ms step:1412/1530 train_loss:3.3455 train_time:242703ms step_avg:173.11ms step:1413/1530 train_loss:3.3740 train_time:242881ms step_avg:173.12ms step:1414/1530 train_loss:3.3442 train_time:243062ms step_avg:173.12ms step:1415/1530 train_loss:3.4171 train_time:243245ms step_avg:173.13ms step:1416/1530 train_loss:3.2414 train_time:243434ms step_avg:173.14ms step:1417/1530 train_loss:3.2903 train_time:243616ms step_avg:173.15ms step:1418/1530 train_loss:3.4006 train_time:243796ms step_avg:173.15ms step:1419/1530 train_loss:3.3584 train_time:243979ms step_avg:173.16ms step:1420/1530 train_loss:3.3806 train_time:244160ms step_avg:173.16ms step:1421/1530 train_loss:3.3877 train_time:244339ms step_avg:173.17ms step:1422/1530 train_loss:3.3457 train_time:244516ms step_avg:173.17ms step:1423/1530 train_loss:3.3300 train_time:244696ms step_avg:173.17ms step:1424/1530 train_loss:3.3464 train_time:244882ms step_avg:173.18ms step:1425/1530 train_loss:3.2063 train_time:245071ms step_avg:173.19ms step:1426/1530 train_loss:3.3364 train_time:245250ms step_avg:173.20ms step:1427/1530 train_loss:3.2948 train_time:245432ms step_avg:173.21ms step:1428/1530 train_loss:3.3924 train_time:245610ms step_avg:173.21ms step:1429/1530 train_loss:3.3659 train_time:245788ms step_avg:173.21ms step:1430/1530 train_loss:3.2734 train_time:245970ms step_avg:173.22ms step:1431/1530 train_loss:3.3327 train_time:246151ms step_avg:173.22ms step:1432/1530 train_loss:3.3503 train_time:246333ms step_avg:173.23ms step:1433/1530 
train_loss:3.1532 train_time:246515ms step_avg:173.24ms step:1434/1530 train_loss:3.3001 train_time:246700ms step_avg:173.24ms step:1435/1530 train_loss:3.1251 train_time:246880ms step_avg:173.25ms step:1436/1530 train_loss:3.2393 train_time:247061ms step_avg:173.25ms step:1437/1530 train_loss:3.4220 train_time:247237ms step_avg:173.26ms step:1438/1530 train_loss:3.3917 train_time:247413ms step_avg:173.26ms step:1439/1530 train_loss:3.3284 train_time:247593ms step_avg:173.26ms step:1440/1530 train_loss:3.2009 train_time:247768ms step_avg:173.26ms step:1441/1530 train_loss:3.3504 train_time:247948ms step_avg:173.27ms step:1442/1530 train_loss:3.4040 train_time:248131ms step_avg:173.28ms step:1443/1530 train_loss:3.5032 train_time:248317ms step_avg:173.28ms step:1444/1530 train_loss:3.4614 train_time:248493ms step_avg:173.29ms step:1445/1530 train_loss:3.3506 train_time:248672ms step_avg:173.29ms step:1446/1530 train_loss:3.2107 train_time:248854ms step_avg:173.30ms step:1447/1530 train_loss:3.3115 train_time:249035ms step_avg:173.30ms step:1448/1530 train_loss:3.3088 train_time:249212ms step_avg:173.30ms step:1449/1530 train_loss:3.4090 train_time:249391ms step_avg:173.31ms step:1450/1530 train_loss:3.4016 train_time:249571ms step_avg:173.31ms step:1451/1530 train_loss:3.2150 train_time:249751ms step_avg:173.32ms step:1452/1530 train_loss:3.3438 train_time:249932ms step_avg:173.32ms step:1453/1530 train_loss:3.2686 train_time:250107ms step_avg:173.32ms step:1454/1530 train_loss:3.3006 train_time:250286ms step_avg:173.33ms step:1455/1530 train_loss:3.3397 train_time:250469ms step_avg:173.34ms step:1456/1530 train_loss:3.2964 train_time:250646ms step_avg:173.34ms step:1457/1530 train_loss:3.1648 train_time:250824ms step_avg:173.34ms step:1458/1530 train_loss:3.4340 train_time:251001ms step_avg:173.34ms step:1459/1530 train_loss:3.2817 train_time:251183ms step_avg:173.35ms step:1460/1530 train_loss:3.3286 train_time:251360ms step_avg:173.35ms step:1461/1530 
train_loss:3.4430 train_time:251540ms step_avg:173.36ms step:1462/1530 train_loss:3.2715 train_time:251716ms step_avg:173.36ms step:1463/1530 train_loss:3.4786 train_time:251899ms step_avg:173.36ms step:1464/1530 train_loss:3.3739 train_time:252078ms step_avg:173.37ms step:1465/1530 train_loss:3.3702 train_time:252258ms step_avg:173.37ms step:1466/1530 train_loss:3.3009 train_time:252435ms step_avg:173.38ms step:1467/1530 train_loss:3.4079 train_time:252614ms step_avg:173.38ms step:1468/1530 train_loss:3.3004 train_time:252791ms step_avg:173.38ms step:1469/1530 train_loss:3.2900 train_time:252972ms step_avg:173.39ms step:1470/1530 train_loss:3.3414 train_time:253155ms step_avg:173.39ms step:1471/1530 train_loss:3.2681 train_time:253340ms step_avg:173.40ms step:1472/1530 train_loss:3.2597 train_time:253523ms step_avg:173.41ms step:1473/1530 train_loss:3.4546 train_time:253701ms step_avg:173.41ms step:1474/1530 train_loss:3.3240 train_time:253883ms step_avg:173.42ms step:1475/1530 train_loss:3.1574 train_time:254070ms step_avg:173.43ms step:1476/1530 train_loss:3.2761 train_time:254250ms step_avg:173.43ms step:1477/1530 train_loss:3.2512 train_time:254435ms step_avg:173.44ms step:1478/1530 train_loss:3.3209 train_time:254619ms step_avg:173.45ms step:1479/1530 train_loss:3.4096 train_time:254802ms step_avg:173.45ms step:1480/1530 train_loss:3.2799 train_time:254980ms step_avg:173.46ms step:1481/1530 train_loss:3.4603 train_time:255161ms step_avg:173.46ms step:1482/1530 train_loss:3.3789 train_time:255347ms step_avg:173.47ms step:1483/1530 train_loss:3.2910 train_time:255539ms step_avg:173.48ms step:1484/1530 train_loss:3.2772 train_time:255726ms step_avg:173.49ms step:1485/1530 train_loss:3.2944 train_time:255905ms step_avg:173.49ms step:1486/1530 train_loss:3.2370 train_time:256091ms step_avg:173.50ms step:1487/1530 train_loss:3.3477 train_time:256272ms step_avg:173.51ms step:1488/1530 train_loss:3.2590 train_time:256456ms step_avg:173.52ms step:1489/1530 
train_loss:3.3284 train_time:256636ms step_avg:173.52ms step:1490/1530 train_loss:3.2660 train_time:256816ms step_avg:173.52ms step:1491/1530 train_loss:3.1713 train_time:256997ms step_avg:173.53ms step:1492/1530 train_loss:3.2816 train_time:257177ms step_avg:173.53ms step:1493/1530 train_loss:3.4520 train_time:257357ms step_avg:173.54ms step:1494/1530 train_loss:3.3120 train_time:257535ms step_avg:173.54ms step:1495/1530 train_loss:3.0441 train_time:257719ms step_avg:173.55ms step:1496/1530 train_loss:3.3759 train_time:257902ms step_avg:173.55ms step:1497/1530 train_loss:3.3245 train_time:258085ms step_avg:173.56ms step:1498/1530 train_loss:3.3600 train_time:258270ms step_avg:173.57ms step:1499/1530 train_loss:3.3192 train_time:258459ms step_avg:173.58ms step:1500/1530 train_loss:3.3089 train_time:258652ms step_avg:173.59ms step:1500/1530 val_loss:3.2923 train_time:258707ms step_avg:173.63ms step:1501/1530 train_loss:3.1033 train_time:258845ms step_avg:173.61ms step:1502/1530 train_loss:3.3680 train_time:259034ms step_avg:173.62ms step:1503/1530 train_loss:3.2519 train_time:259210ms step_avg:173.62ms step:1504/1530 train_loss:3.2603 train_time:259391ms step_avg:173.62ms step:1505/1530 train_loss:3.2235 train_time:259572ms step_avg:173.63ms step:1506/1530 train_loss:3.2925 train_time:259756ms step_avg:173.63ms step:1507/1530 train_loss:3.1900 train_time:259949ms step_avg:173.65ms step:1508/1530 train_loss:3.4996 train_time:260132ms step_avg:173.65ms step:1509/1530 train_loss:3.2900 train_time:260309ms step_avg:173.66ms step:1510/1530 train_loss:3.2834 train_time:260488ms step_avg:173.66ms step:1511/1530 train_loss:3.4256 train_time:260800ms step_avg:173.75ms step:1512/1530 train_loss:3.4298 train_time:260989ms step_avg:173.76ms step:1513/1530 train_loss:3.2834 train_time:261172ms step_avg:173.77ms step:1514/1530 train_loss:3.0968 train_time:261354ms step_avg:173.77ms step:1515/1530 train_loss:3.2591 train_time:261534ms step_avg:173.78ms step:1516/1530 
train_loss:3.2714 train_time:261720ms step_avg:173.78ms step:1517/1530 train_loss:3.3199 train_time:261902ms step_avg:173.79ms step:1518/1530 train_loss:3.2207 train_time:262085ms step_avg:173.80ms step:1519/1530 train_loss:3.5181 train_time:262421ms step_avg:173.90ms step:1520/1530 train_loss:3.1423 train_time:262605ms step_avg:173.91ms step:1521/1530 train_loss:3.2190 train_time:262781ms step_avg:173.91ms step:1522/1530 train_loss:3.3681 train_time:262966ms step_avg:173.92ms step:1523/1530 train_loss:3.2447 train_time:263144ms step_avg:173.92ms step:1524/1530 train_loss:3.3577 train_time:263325ms step_avg:173.93ms step:1525/1530 train_loss:3.3517 train_time:263511ms step_avg:173.93ms step:1526/1530 train_loss:3.2899 train_time:263702ms step_avg:173.95ms step:1527/1530 train_loss:3.3008 train_time:263883ms step_avg:173.95ms step:1528/1530 train_loss:3.4212 train_time:264063ms step_avg:173.95ms step:1529/1530 train_loss:3.4202 train_time:264241ms step_avg:173.96ms step:1530/1530 train_loss:3.2517 train_time:264419ms step_avg:173.96ms step:1530/1530 val_loss:3.2898 train_time:264472ms step_avg:173.99ms