import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import glob import time import contextlib from dataclasses import dataclass import numpy as np import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import flex_attention, create_block_mask flex_attention = torch.compile(flex_attention, dynamic=False) create_block_mask = torch.compile(create_block_mask, dynamic=False) # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. """ assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. """ def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) super().__init__(params, defaults) def step(self): for group in self.param_groups: lr = group['lr'] momentum = group['momentum'] zeropower_backend = zeropower_backends[group['backend']] # generate weight updates in distributed fashion total_params = sum(p.numel() for p in group['params']) updates_flat = torch.zeros(total_params, device='cuda', dtype=torch.bfloat16) curr_idx = 0 for i, p in enumerate(group['params']): # luckily this will perfectly distribute a transformer with multiple of 4 layers to 8 GPUs if i % int(os.environ['WORLD_SIZE']) == int(os.environ['RANK']): g = p.grad assert g is not None state = self.state[p] if 'momentum_buffer' not in state: state['momentum_buffer'] = torch.zeros_like(g) buf = state['momentum_buffer'] buf.mul_(momentum).add_(g) g = g.add(buf, alpha=momentum) if group['nesterov'] else buf g = zeropower_backend(g, steps=group['backend_steps']) g *= max(1, g.size(0)/g.size(1))**0.5 updates_flat[curr_idx:curr_idx+p.numel()] = g.flatten() curr_idx += p.numel() # sync updates across devices. we are not memory-constrained so can do this simple deserialization dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) # deserialize and apply updates curr_idx = 0 for p in group['params']: g = updates_flat[curr_idx:curr_idx+p.numel()].view_as(p.data).type_as(p.data) p.data.add_(g, alpha=-lr) curr_idx += p.numel() # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, n_head): super().__init__() assert dim % n_head == 0 self.n_head = n_head self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) # value residual lambda self.lamb = nn.Parameter(torch.tensor(0.5)) # @Grad62304977 # rotary embeddings self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim # output projection self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x, vi, block_mask): B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q = self.c_q(x).view(B, T, self.n_head, -1) k = self.c_k(x).view(B, T, self.n_head, -1) v = self.c_v(x).view(B, T, self.n_head, -1) v = (1 - self.lamb) * v + self.lamb * vi.view_as(v) # @Grad62304977 q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x): x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.n_embd, config.n_head) self.mlp = MLP(config.n_embd) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x, vi, x0, block_mask): x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 n_layer : int = 12 n_head : int = 6 # head dim 128 suggested by @Grad62304977 n_embd : int = 768 class GPT(nn.Module): def __init__(self, config): super().__init__() # U-net design by @brendanh0gan self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection weights for decoder layers self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers)) self.transformer = nn.ModuleDict(dict( wte = nn.Embedding(config.vocab_size, config.n_embd), # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning vte = nn.Embedding(config.vocab_size, config.n_embd*12), h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), )) self.lm_head = CastedLinear(config.n_embd, config.vocab_size) self.lm_head.weight.data.zero_() # @Grad62304977 def forward(self, idx, target, attn_blocksize): docs = (idx == 50256).cumsum(0) def document_causal_mask(b, h, q_idx, kv_idx): causal_mask = q_idx >= kv_idx document_mask = docs[q_idx] == docs[kv_idx] window_mask = q_idx - kv_idx < attn_blocksize return causal_mask & document_mask & window_mask S = len(idx) block_mask = create_block_mask(document_causal_mask, None, None, S, S, device="cuda", _compile=True) # forward the GPT model itself x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd) x = norm(x) # @Grad62304977 x0 = x vi = self.transformer.vte(idx[None]).chunk(12, dim=-1) # Store outputs for U-Net skip connections skip_connections = [] # Encoder pass - process only the first half of the blocks for i in range(self.num_encoder_layers): x = self.transformer.h[i](x, vi[i], x0, block_mask) skip_connections.append(x) # Decoder pass - process the remaining blocks with weighted skip connections for i in range(self.num_decoder_layers): x = x + self.skip_weights[i] * skip_connections.pop() x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers+i], x0, block_mask) x = norm(x) logits = self.lm_head(x) logits = 30 * torch.tanh(logits / 30) # @Grad62304977 logits = logits.float() loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1)) return loss # ----------------------------------------------------------------------------- # Our own simple Distributed Data Loader def _peek_data_shard(filename): # only reads the header, returns header data with open(filename, "rb") as f: # first read the header, which is 256 int32 integers (4 bytes each) header = np.frombuffer(f.read(256*4), dtype=np.int32) if header[0] != 20240520: print("ERROR: magic number mismatch in the data .bin file!") print("---> HINT: Are you passing in a correct file with --input_bin?") print("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README") print("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try") exit(1) assert header[1] == 1, "unsupported version" ntok = header[2] # number of tokens (claimed) return ntok # for now just return the number of tokens def _load_data_shard(filename): with open(filename, "rb") as f: # first read the header, which is 256 int32 integers (4 bytes each) header = np.frombuffer(f.read(256*4), dtype=np.int32) assert header[0] == 20240520, "magic number mismatch in the data .bin file" assert header[1] == 1, "unsupported version" ntok = header[2] # number of tokens (claimed) # the rest of it are tokens, stored as uint16 tokens = np.frombuffer(f.read(), dtype=np.uint16) assert len(tokens) == ntok, "number of tokens read does not match header?" return tokens class DistributedDataLoader: def __init__(self, filename_pattern, T, process_rank, num_processes): self.process_rank = process_rank self.num_processes = num_processes self.T = T # glob files that match the pattern self.files = sorted(glob.glob(filename_pattern)) assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" # load and validate all data shards, count number of tokens in total ntok_total = 0 for fname in self.files: shard_ntok = _peek_data_shard(fname) assert shard_ntok >= num_processes * T + 1 ntok_total += int(shard_ntok) self.ntok_total = ntok_total self.reset() def reset(self): self.current_shard = -1 self.advance() def advance(self): # advance to next data shard self.current_shard = (self.current_shard + 1) % len(self.files) self.current_position = self.process_rank * self.T self.tokens = _load_data_shard(self.files[self.current_shard]) def next_batch(self): batch_size = self.T * self.num_processes buf = self.tokens[self.current_position:self.current_position+self.T+1] buf = torch.tensor(buf.astype(np.int32), dtype=torch.long) x = buf[:-1] # inputs y = buf[1:] # targets # advance current position and load next shard if necessary self.current_position += batch_size if self.current_position + batch_size >= len(self.tokens): self.advance() return x.cuda(), y.cuda() # ----------------------------------------------------------------------------- # int main @dataclass class Hyperparameters: # data hyperparams input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on # optimization hyperparams batch_size : int = 8 # batch size, in sequences, across all devices sequence_length : int = 64*1024 # sequence length, in tokens num_iterations : int = 1530 # number of iterations to run warmup_iters : int = 0 cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule weight_decay : float = 0 # evaluation and logging hyperparams val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end args = Hyperparameters() # set up DDP (distributed data parallel). torchrun sets this env variable assert torch.cuda.is_available() dist.init_process_group(backend='nccl') ddp_rank = int(os.environ['RANK']) ddp_local_rank = int(os.environ['LOCAL_RANK']) ddp_world_size = int(os.environ['WORLD_SIZE']) device = f'cuda:{ddp_local_rank}' torch.cuda.set_device(device) print(f"using device: {device}") master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc. # begin logging logfile = None if master_process: run_id = str(uuid.uuid4()) logdir = 'logs/%s/' % run_id os.makedirs(logdir, exist_ok=True) logfile = 'logs/%s.txt' % run_id # create the log file with open(logfile, "w") as f: # begin the log by printing this file (the Python code) f.write(code) f.write('='*100 + '\n') def print0(s, logonly=False): if master_process: with open(logfile, "a") as f: if not logonly: print(s) f.write(s+'\n') # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # convenience variables T = args.sequence_length # calculate the number of steps to take in the val loop. assert args.val_tokens % (T * ddp_world_size) == 0 val_steps = args.val_tokens // (T * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files") print0('='*100, logonly=True) x, y = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) # note that this learning rate is neither sensitive nor tuned optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.time() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.time() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the attention blocksize for the current step, in chunks of 64. By @fernbear.bsky.social attn_blocksize = torch.tensor(64*((step/args.num_iterations * (1792 - 64) + 64)//64), dtype=torch.int, device='cuda') # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.time() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, attn_blocksize=attn_blocksize) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.time() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.time() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.time() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps+1): ctx = model.no_sync() if i < train_accumulation_steps else contextlib.nullcontext() with ctx: # there's no need to sync gradients every accumulation step # forward pass loss = model(x, y, attn_blocksize=attn_blocksize) # advance the dataset for the next batch x, y = train_loader.next_batch() # backward pass loss.backward() train_loss = loss.detach() for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) optimizer3.param_groups[0]['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. #dist.all_reduce(train_loss, op=dist.ReduceOp.AVG) # all-reducing the training loss would be more correct in terms of logging, but slower approx_time = training_time_ms + 1000 * (time.time() - t0) print0(f"step:{step+1}/{args.num_iterations} train_loss:{train_loss.item():.4f} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Thu Dec 5 02:13:18 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 75W / 700W | 3MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 115W / 700W | 529MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 30C P0 98W / 700W | 22MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 38C P0 118W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 103W / 700W | 22MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 29C P0 110W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 38C P0 127W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1100000000 across 11 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1530 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1530 train_loss:10.8258 train_time:31694ms step_avg:nanms step:2/1530 train_loss:10.0689 train_time:31806ms step_avg:nanms step:3/1530 train_loss:8.3863 train_time:31967ms step_avg:nanms step:4/1530 train_loss:7.6209 train_time:32128ms step_avg:nanms step:5/1530 train_loss:7.4663 train_time:32288ms step_avg:nanms step:6/1530 train_loss:6.9726 train_time:32449ms step_avg:nanms step:7/1530 train_loss:7.2150 train_time:32610ms step_avg:nanms step:8/1530 train_loss:6.7276 train_time:32770ms step_avg:nanms step:9/1530 train_loss:6.6214 train_time:32931ms step_avg:nanms step:10/1530 train_loss:6.5201 train_time:33091ms step_avg:nanms step:11/1530 train_loss:6.4799 train_time:114ms step_avg:nanms step:12/1530 train_loss:6.3867 train_time:275ms step_avg:nanms step:13/1530 train_loss:6.2460 train_time:435ms step_avg:144.93ms step:14/1530 train_loss:6.2092 train_time:595ms step_avg:148.81ms step:15/1530 train_loss:6.1474 train_time:755ms step_avg:151.00ms step:16/1530 train_loss:6.1078 train_time:916ms step_avg:152.62ms step:17/1530 train_loss:6.1490 train_time:1076ms step_avg:153.68ms step:18/1530 train_loss:5.9833 train_time:1236ms step_avg:154.53ms step:19/1530 train_loss:5.9844 train_time:1397ms step_avg:155.22ms step:20/1530 train_loss:5.7024 train_time:1556ms step_avg:155.61ms step:21/1530 train_loss:5.9450 train_time:1716ms step_avg:156.00ms step:22/1530 train_loss:6.1606 train_time:1877ms step_avg:156.40ms step:23/1530 train_loss:5.8473 train_time:2037ms step_avg:156.66ms step:24/1530 train_loss:6.0053 train_time:2198ms step_avg:156.99ms step:25/1530 train_loss:5.6752 train_time:2357ms step_avg:157.14ms step:26/1530 train_loss:5.6047 train_time:2518ms step_avg:157.37ms step:27/1530 train_loss:5.7619 train_time:2679ms step_avg:157.58ms step:28/1530 train_loss:5.4196 train_time:2840ms step_avg:157.79ms step:29/1530 train_loss:5.6834 train_time:3001ms step_avg:157.97ms step:30/1530 train_loss:5.4770 train_time:3162ms step_avg:158.09ms step:31/1530 train_loss:5.4232 train_time:3322ms step_avg:158.19ms step:32/1530 train_loss:5.3007 train_time:3481ms step_avg:158.23ms step:33/1530 train_loss:5.5847 train_time:3642ms step_avg:158.33ms step:34/1530 train_loss:5.4940 train_time:3803ms step_avg:158.46ms step:35/1530 train_loss:5.6040 train_time:3963ms step_avg:158.52ms step:36/1530 train_loss:5.5411 train_time:4123ms step_avg:158.59ms step:37/1530 train_loss:5.4571 train_time:4284ms step_avg:158.67ms step:38/1530 train_loss:5.3089 train_time:4445ms step_avg:158.75ms step:39/1530 train_loss:5.3297 train_time:4606ms step_avg:158.82ms step:40/1530 train_loss:5.2535 train_time:4767ms step_avg:158.91ms step:41/1530 train_loss:5.2287 train_time:4927ms step_avg:158.95ms step:42/1530 train_loss:5.1829 train_time:5088ms step_avg:158.99ms step:43/1530 train_loss:5.2835 train_time:5248ms step_avg:159.04ms step:44/1530 train_loss:5.2625 train_time:5409ms step_avg:159.09ms step:45/1530 train_loss:5.3933 train_time:5569ms step_avg:159.12ms step:46/1530 train_loss:5.1755 train_time:5730ms step_avg:159.17ms step:47/1530 train_loss:5.0724 train_time:5891ms step_avg:159.22ms step:48/1530 train_loss:5.2272 train_time:6051ms step_avg:159.24ms step:49/1530 train_loss:5.1417 train_time:6212ms step_avg:159.28ms step:50/1530 train_loss:5.2437 train_time:6372ms step_avg:159.31ms step:51/1530 train_loss:5.1455 train_time:6533ms step_avg:159.33ms step:52/1530 train_loss:5.0266 train_time:6694ms step_avg:159.37ms step:53/1530 train_loss:5.1839 train_time:6853ms step_avg:159.37ms step:54/1530 train_loss:5.0270 train_time:7014ms step_avg:159.41ms step:55/1530 train_loss:5.4191 train_time:7175ms step_avg:159.45ms step:56/1530 train_loss:5.0291 train_time:7335ms step_avg:159.45ms step:57/1530 train_loss:4.8784 train_time:7496ms step_avg:159.49ms step:58/1530 train_loss:5.0389 train_time:7655ms step_avg:159.49ms step:59/1530 train_loss:5.0112 train_time:7816ms step_avg:159.51ms step:60/1530 train_loss:5.1373 train_time:7976ms step_avg:159.52ms step:61/1530 train_loss:4.8537 train_time:8136ms step_avg:159.53ms step:62/1530 train_loss:4.9744 train_time:8297ms step_avg:159.55ms step:63/1530 train_loss:4.9634 train_time:8457ms step_avg:159.56ms step:64/1530 train_loss:4.8828 train_time:8617ms step_avg:159.57ms step:65/1530 train_loss:4.7967 train_time:8777ms step_avg:159.58ms step:66/1530 train_loss:4.9405 train_time:8937ms step_avg:159.59ms step:67/1530 train_loss:4.8334 train_time:9098ms step_avg:159.61ms step:68/1530 train_loss:5.1233 train_time:9258ms step_avg:159.62ms step:69/1530 train_loss:4.7270 train_time:9418ms step_avg:159.63ms step:70/1530 train_loss:4.8715 train_time:9580ms step_avg:159.66ms step:71/1530 train_loss:4.9816 train_time:9740ms step_avg:159.68ms step:72/1530 train_loss:4.8807 train_time:9901ms step_avg:159.69ms step:73/1530 train_loss:4.7694 train_time:10061ms step_avg:159.70ms step:74/1530 train_loss:4.9163 train_time:10222ms step_avg:159.71ms step:75/1530 train_loss:4.8642 train_time:10383ms step_avg:159.74ms step:76/1530 train_loss:4.8048 train_time:10544ms step_avg:159.75ms step:77/1530 train_loss:4.9148 train_time:10705ms step_avg:159.78ms step:78/1530 train_loss:5.1187 train_time:10866ms step_avg:159.79ms step:79/1530 train_loss:4.8136 train_time:11026ms step_avg:159.79ms step:80/1530 train_loss:4.8701 train_time:11186ms step_avg:159.80ms step:81/1530 train_loss:4.6660 train_time:11347ms step_avg:159.82ms step:82/1530 train_loss:4.8240 train_time:11508ms step_avg:159.84ms step:83/1530 train_loss:4.7812 train_time:11668ms step_avg:159.84ms step:84/1530 train_loss:4.7778 train_time:11829ms step_avg:159.86ms step:85/1530 train_loss:4.6313 train_time:11990ms step_avg:159.87ms step:86/1530 train_loss:4.8468 train_time:12150ms step_avg:159.87ms step:87/1530 train_loss:4.7828 train_time:12310ms step_avg:159.88ms step:88/1530 train_loss:4.7921 train_time:12471ms step_avg:159.88ms step:89/1530 train_loss:4.7253 train_time:12631ms step_avg:159.89ms step:90/1530 train_loss:4.6479 train_time:12791ms step_avg:159.89ms step:91/1530 train_loss:4.6562 train_time:12951ms step_avg:159.88ms step:92/1530 train_loss:4.8289 train_time:13112ms step_avg:159.90ms step:93/1530 train_loss:4.6329 train_time:13273ms step_avg:159.92ms step:94/1530 train_loss:4.6342 train_time:13434ms step_avg:159.93ms step:95/1530 train_loss:4.6884 train_time:13595ms step_avg:159.94ms step:96/1530 train_loss:4.6012 train_time:13755ms step_avg:159.94ms step:97/1530 train_loss:4.6482 train_time:13915ms step_avg:159.95ms step:98/1530 train_loss:4.5847 train_time:14075ms step_avg:159.95ms step:99/1530 train_loss:4.6797 train_time:14236ms step_avg:159.95ms step:100/1530 train_loss:4.6869 train_time:14396ms step_avg:159.95ms step:101/1530 train_loss:4.5621 train_time:14556ms step_avg:159.95ms step:102/1530 train_loss:4.7222 train_time:14716ms step_avg:159.96ms step:103/1530 train_loss:4.5885 train_time:14877ms step_avg:159.97ms step:104/1530 train_loss:4.5606 train_time:15036ms step_avg:159.95ms step:105/1530 train_loss:4.5662 train_time:15198ms step_avg:159.97ms step:106/1530 train_loss:4.6334 train_time:15357ms step_avg:159.97ms step:107/1530 train_loss:4.5195 train_time:15517ms step_avg:159.97ms step:108/1530 train_loss:4.4066 train_time:15679ms step_avg:159.99ms step:109/1530 train_loss:4.5073 train_time:15839ms step_avg:159.99ms step:110/1530 train_loss:4.5124 train_time:16001ms step_avg:160.01ms step:111/1530 train_loss:4.4329 train_time:16161ms step_avg:160.01ms step:112/1530 train_loss:4.5981 train_time:16321ms step_avg:160.01ms step:113/1530 train_loss:4.4951 train_time:16482ms step_avg:160.02ms step:114/1530 train_loss:4.3689 train_time:16641ms step_avg:160.01ms step:115/1530 train_loss:4.5201 train_time:16807ms step_avg:160.06ms step:116/1530 train_loss:4.4782 train_time:16972ms step_avg:160.11ms step:117/1530 train_loss:4.3690 train_time:17135ms step_avg:160.14ms step:118/1530 train_loss:4.5987 train_time:17300ms step_avg:160.18ms step:119/1530 train_loss:4.4648 train_time:17463ms step_avg:160.21ms step:120/1530 train_loss:4.3375 train_time:17627ms step_avg:160.24ms step:121/1530 train_loss:4.3117 train_time:17791ms step_avg:160.28ms step:122/1530 train_loss:4.4562 train_time:17955ms step_avg:160.31ms step:123/1530 train_loss:4.2936 train_time:18118ms step_avg:160.34ms step:124/1530 train_loss:4.5874 train_time:18284ms step_avg:160.38ms step:125/1530 train_loss:4.4662 train_time:18449ms step_avg:160.42ms step:125/1530 val_loss:4.4152 train_time:18496ms step_avg:160.83ms step:126/1530 train_loss:4.4204 train_time:18615ms step_avg:160.47ms step:127/1530 train_loss:4.4429 train_time:18780ms step_avg:160.51ms step:128/1530 train_loss:4.3930 train_time:18945ms step_avg:160.55ms step:129/1530 train_loss:4.6834 train_time:19109ms step_avg:160.58ms step:130/1530 train_loss:4.3900 train_time:19273ms step_avg:160.61ms step:131/1530 train_loss:4.4127 train_time:19436ms step_avg:160.63ms step:132/1530 train_loss:4.3517 train_time:19600ms step_avg:160.66ms step:133/1530 train_loss:4.4507 train_time:19763ms step_avg:160.68ms step:134/1530 train_loss:4.2681 train_time:19927ms step_avg:160.70ms step:135/1530 train_loss:4.4463 train_time:20092ms step_avg:160.74ms step:136/1530 train_loss:4.2169 train_time:20257ms step_avg:160.77ms step:137/1530 train_loss:4.3867 train_time:20420ms step_avg:160.78ms step:138/1530 train_loss:4.2929 train_time:20584ms step_avg:160.81ms step:139/1530 train_loss:4.3729 train_time:20748ms step_avg:160.84ms step:140/1530 train_loss:4.4634 train_time:20913ms step_avg:160.87ms step:141/1530 train_loss:4.3134 train_time:21077ms step_avg:160.89ms step:142/1530 train_loss:4.2990 train_time:21240ms step_avg:160.91ms step:143/1530 train_loss:4.2704 train_time:21403ms step_avg:160.93ms step:144/1530 train_loss:4.3599 train_time:21569ms step_avg:160.96ms step:145/1530 train_loss:4.3158 train_time:21732ms step_avg:160.98ms step:146/1530 train_loss:4.1873 train_time:21896ms step_avg:161.00ms step:147/1530 train_loss:4.3400 train_time:22060ms step_avg:161.03ms step:148/1530 train_loss:4.3653 train_time:22224ms step_avg:161.05ms step:149/1530 train_loss:4.3093 train_time:22390ms step_avg:161.08ms step:150/1530 train_loss:4.4450 train_time:22554ms step_avg:161.10ms step:151/1530 train_loss:4.2669 train_time:22718ms step_avg:161.12ms step:152/1530 train_loss:4.2745 train_time:22883ms step_avg:161.15ms step:153/1530 train_loss:4.3681 train_time:23048ms step_avg:161.18ms step:154/1530 train_loss:4.3702 train_time:23213ms step_avg:161.20ms step:155/1530 train_loss:4.2587 train_time:23376ms step_avg:161.22ms step:156/1530 train_loss:4.3423 train_time:23539ms step_avg:161.23ms step:157/1530 train_loss:4.4020 train_time:23703ms step_avg:161.25ms step:158/1530 train_loss:4.2390 train_time:23868ms step_avg:161.27ms step:159/1530 train_loss:4.3089 train_time:24031ms step_avg:161.28ms step:160/1530 train_loss:4.1335 train_time:24195ms step_avg:161.30ms step:161/1530 train_loss:4.3515 train_time:24358ms step_avg:161.31ms step:162/1530 train_loss:4.3592 train_time:24522ms step_avg:161.33ms step:163/1530 train_loss:4.3307 train_time:24688ms step_avg:161.36ms step:164/1530 train_loss:4.1804 train_time:24852ms step_avg:161.38ms step:165/1530 train_loss:4.2832 train_time:25016ms step_avg:161.39ms step:166/1530 train_loss:4.3541 train_time:25180ms step_avg:161.41ms step:167/1530 train_loss:4.2057 train_time:25343ms step_avg:161.42ms step:168/1530 train_loss:4.2781 train_time:25506ms step_avg:161.43ms step:169/1530 train_loss:4.1548 train_time:25672ms step_avg:161.46ms step:170/1530 train_loss:4.0267 train_time:25836ms step_avg:161.48ms step:171/1530 train_loss:4.2068 train_time:25999ms step_avg:161.48ms step:172/1530 train_loss:4.2160 train_time:26163ms step_avg:161.50ms step:173/1530 train_loss:4.2588 train_time:26326ms step_avg:161.51ms step:174/1530 train_loss:4.4218 train_time:26490ms step_avg:161.53ms step:175/1530 train_loss:4.2439 train_time:26653ms step_avg:161.53ms step:176/1530 train_loss:4.1013 train_time:26816ms step_avg:161.54ms step:177/1530 train_loss:4.0801 train_time:26980ms step_avg:161.56ms step:178/1530 train_loss:4.1893 train_time:27143ms step_avg:161.57ms step:179/1530 train_loss:4.1261 train_time:27306ms step_avg:161.58ms step:180/1530 train_loss:4.1168 train_time:27469ms step_avg:161.58ms step:181/1530 train_loss:4.2944 train_time:27631ms step_avg:161.58ms step:182/1530 train_loss:4.1558 train_time:27794ms step_avg:161.59ms step:183/1530 train_loss:4.1296 train_time:27956ms step_avg:161.60ms step:184/1530 train_loss:4.1275 train_time:28119ms step_avg:161.60ms step:185/1530 train_loss:4.2070 train_time:28282ms step_avg:161.61ms step:186/1530 train_loss:4.1723 train_time:28444ms step_avg:161.62ms step:187/1530 train_loss:4.2374 train_time:28608ms step_avg:161.63ms step:188/1530 train_loss:4.1690 train_time:28904ms step_avg:162.38ms step:189/1530 train_loss:4.1164 train_time:29234ms step_avg:163.32ms step:190/1530 train_loss:4.2157 train_time:29396ms step_avg:163.31ms step:191/1530 train_loss:4.0784 train_time:29559ms step_avg:163.31ms step:192/1530 train_loss:4.0301 train_time:29723ms step_avg:163.31ms step:193/1530 train_loss:4.2497 train_time:29887ms step_avg:163.32ms step:194/1530 train_loss:4.1808 train_time:30051ms step_avg:163.32ms step:195/1530 train_loss:4.3523 train_time:30214ms step_avg:163.32ms step:196/1530 train_loss:4.1690 train_time:30377ms step_avg:163.31ms step:197/1530 train_loss:4.0331 train_time:30538ms step_avg:163.31ms step:198/1530 train_loss:4.1696 train_time:30701ms step_avg:163.30ms step:199/1530 train_loss:4.0271 train_time:30865ms step_avg:163.31ms step:200/1530 train_loss:4.1169 train_time:31029ms step_avg:163.31ms step:201/1530 train_loss:4.0005 train_time:31193ms step_avg:163.31ms step:202/1530 train_loss:4.2597 train_time:31356ms step_avg:163.31ms step:203/1530 train_loss:4.0667 train_time:31518ms step_avg:163.30ms step:204/1530 train_loss:4.1968 train_time:31681ms step_avg:163.31ms step:205/1530 train_loss:4.2492 train_time:31844ms step_avg:163.30ms step:206/1530 train_loss:3.9453 train_time:32007ms step_avg:163.30ms step:207/1530 train_loss:4.0945 train_time:32170ms step_avg:163.30ms step:208/1530 train_loss:4.1119 train_time:32332ms step_avg:163.29ms step:209/1530 train_loss:4.2376 train_time:32496ms step_avg:163.29ms step:210/1530 train_loss:4.1727 train_time:32659ms step_avg:163.29ms step:211/1530 train_loss:4.0582 train_time:32822ms step_avg:163.29ms step:212/1530 train_loss:4.1218 train_time:32987ms step_avg:163.30ms step:213/1530 train_loss:4.0480 train_time:33152ms step_avg:163.31ms step:214/1530 train_loss:4.1039 train_time:33314ms step_avg:163.31ms step:215/1530 train_loss:3.9524 train_time:33477ms step_avg:163.30ms step:216/1530 train_loss:3.9979 train_time:33639ms step_avg:163.30ms step:217/1530 train_loss:4.0073 train_time:33802ms step_avg:163.29ms step:218/1530 train_loss:4.0736 train_time:33965ms step_avg:163.29ms step:219/1530 train_loss:4.0812 train_time:34129ms step_avg:163.30ms step:220/1530 train_loss:4.0878 train_time:34292ms step_avg:163.30ms step:221/1530 train_loss:4.0948 train_time:34455ms step_avg:163.29ms step:222/1530 train_loss:4.0068 train_time:34618ms step_avg:163.29ms step:223/1530 train_loss:3.9923 train_time:34782ms step_avg:163.30ms step:224/1530 train_loss:4.2994 train_time:34946ms step_avg:163.30ms step:225/1530 train_loss:3.9238 train_time:35109ms step_avg:163.30ms step:226/1530 train_loss:3.9909 train_time:35272ms step_avg:163.30ms step:227/1530 train_loss:3.9722 train_time:35434ms step_avg:163.29ms step:228/1530 train_loss:4.1389 train_time:35600ms step_avg:163.30ms step:229/1530 train_loss:3.9287 train_time:35767ms step_avg:163.32ms step:230/1530 train_loss:4.0392 train_time:35933ms step_avg:163.33ms step:231/1530 train_loss:3.9049 train_time:36099ms step_avg:163.34ms step:232/1530 train_loss:3.9640 train_time:36265ms step_avg:163.36ms step:233/1530 train_loss:4.0823 train_time:36431ms step_avg:163.37ms step:234/1530 train_loss:4.0242 train_time:36596ms step_avg:163.38ms step:235/1530 train_loss:3.8972 train_time:36764ms step_avg:163.39ms step:236/1530 train_loss:4.0819 train_time:36931ms step_avg:163.41ms step:237/1530 train_loss:4.0788 train_time:37097ms step_avg:163.42ms step:238/1530 train_loss:3.9461 train_time:37262ms step_avg:163.43ms step:239/1530 train_loss:4.0775 train_time:37429ms step_avg:163.44ms step:240/1530 train_loss:4.1118 train_time:37595ms step_avg:163.45ms step:241/1530 train_loss:3.9625 train_time:37760ms step_avg:163.46ms step:242/1530 train_loss:4.1354 train_time:37927ms step_avg:163.48ms step:243/1530 train_loss:4.0055 train_time:38094ms step_avg:163.49ms step:244/1530 train_loss:4.0685 train_time:38259ms step_avg:163.50ms step:245/1530 train_loss:4.1414 train_time:38424ms step_avg:163.51ms step:246/1530 train_loss:4.0595 train_time:38590ms step_avg:163.52ms step:247/1530 train_loss:4.0030 train_time:38757ms step_avg:163.53ms step:248/1530 train_loss:4.1080 train_time:38923ms step_avg:163.54ms step:249/1530 train_loss:3.9221 train_time:39090ms step_avg:163.56ms step:250/1530 train_loss:3.9723 train_time:39255ms step_avg:163.56ms step:250/1530 val_loss:4.0058 train_time:39303ms step_avg:163.76ms step:251/1530 train_loss:4.0746 train_time:39424ms step_avg:163.59ms step:252/1530 train_loss:4.1660 train_time:39591ms step_avg:163.60ms step:253/1530 train_loss:3.9278 train_time:39759ms step_avg:163.62ms step:254/1530 train_loss:3.8675 train_time:39925ms step_avg:163.63ms step:255/1530 train_loss:4.0743 train_time:40091ms step_avg:163.64ms step:256/1530 train_loss:3.9819 train_time:40257ms step_avg:163.65ms step:257/1530 train_loss:3.9941 train_time:40424ms step_avg:163.66ms step:258/1530 train_loss:3.9862 train_time:40590ms step_avg:163.67ms step:259/1530 train_loss:4.0330 train_time:40756ms step_avg:163.68ms step:260/1530 train_loss:4.0601 train_time:40924ms step_avg:163.69ms step:261/1530 train_loss:4.0143 train_time:41090ms step_avg:163.70ms step:262/1530 train_loss:3.9864 train_time:41256ms step_avg:163.71ms step:263/1530 train_loss:3.8858 train_time:41423ms step_avg:163.73ms step:264/1530 train_loss:3.9815 train_time:41588ms step_avg:163.73ms step:265/1530 train_loss:3.8693 train_time:41755ms step_avg:163.74ms step:266/1530 train_loss:3.9137 train_time:41921ms step_avg:163.75ms step:267/1530 train_loss:3.9250 train_time:42087ms step_avg:163.76ms step:268/1530 train_loss:3.9566 train_time:42252ms step_avg:163.77ms step:269/1530 train_loss:3.8543 train_time:42418ms step_avg:163.78ms step:270/1530 train_loss:4.0978 train_time:42584ms step_avg:163.79ms step:271/1530 train_loss:3.9674 train_time:42750ms step_avg:163.79ms step:272/1530 train_loss:3.9294 train_time:42916ms step_avg:163.80ms step:273/1530 train_loss:3.9390 train_time:43083ms step_avg:163.81ms step:274/1530 train_loss:4.0359 train_time:43249ms step_avg:163.82ms step:275/1530 train_loss:4.0533 train_time:43414ms step_avg:163.83ms step:276/1530 train_loss:4.2257 train_time:43581ms step_avg:163.84ms step:277/1530 train_loss:4.0350 train_time:43747ms step_avg:163.85ms step:278/1530 train_loss:4.0837 train_time:43912ms step_avg:163.85ms step:279/1530 train_loss:3.9928 train_time:44078ms step_avg:163.86ms step:280/1530 train_loss:4.1777 train_time:44246ms step_avg:163.88ms step:281/1530 train_loss:3.9766 train_time:44412ms step_avg:163.88ms step:282/1530 train_loss:3.9461 train_time:44578ms step_avg:163.89ms step:283/1530 train_loss:3.9091 train_time:44745ms step_avg:163.90ms step:284/1530 train_loss:4.0329 train_time:44911ms step_avg:163.91ms step:285/1530 train_loss:4.0617 train_time:45076ms step_avg:163.91ms step:286/1530 train_loss:4.0907 train_time:45242ms step_avg:163.92ms step:287/1530 train_loss:3.9038 train_time:45407ms step_avg:163.92ms step:288/1530 train_loss:4.0065 train_time:45572ms step_avg:163.93ms step:289/1530 train_loss:3.8783 train_time:45737ms step_avg:163.93ms step:290/1530 train_loss:3.8519 train_time:45902ms step_avg:163.94ms step:291/1530 train_loss:3.9040 train_time:46067ms step_avg:163.94ms step:292/1530 train_loss:3.8552 train_time:46232ms step_avg:163.94ms step:293/1530 train_loss:3.8965 train_time:46397ms step_avg:163.95ms step:294/1530 train_loss:3.9299 train_time:46563ms step_avg:163.96ms step:295/1530 train_loss:3.8436 train_time:46728ms step_avg:163.96ms step:296/1530 train_loss:3.8595 train_time:46893ms step_avg:163.96ms step:297/1530 train_loss:3.8629 train_time:47059ms step_avg:163.97ms step:298/1530 train_loss:3.9669 train_time:47224ms step_avg:163.97ms step:299/1530 train_loss:3.8177 train_time:47389ms step_avg:163.98ms step:300/1530 train_loss:3.9597 train_time:47554ms step_avg:163.98ms step:301/1530 train_loss:3.9586 train_time:47719ms step_avg:163.98ms step:302/1530 train_loss:3.9331 train_time:47884ms step_avg:163.99ms step:303/1530 train_loss:3.9857 train_time:48049ms step_avg:163.99ms step:304/1530 train_loss:3.9698 train_time:48212ms step_avg:163.99ms step:305/1530 train_loss:4.4565 train_time:48378ms step_avg:163.99ms step:306/1530 train_loss:3.9388 train_time:48544ms step_avg:164.00ms step:307/1530 train_loss:3.8373 train_time:48708ms step_avg:164.00ms step:308/1530 train_loss:3.9767 train_time:48874ms step_avg:164.01ms step:309/1530 train_loss:3.8762 train_time:49041ms step_avg:164.02ms step:310/1530 train_loss:4.0818 train_time:49205ms step_avg:164.02ms step:311/1530 train_loss:3.9253 train_time:49370ms step_avg:164.02ms step:312/1530 train_loss:3.8561 train_time:49536ms step_avg:164.02ms step:313/1530 train_loss:3.9361 train_time:49702ms step_avg:164.03ms step:314/1530 train_loss:4.0604 train_time:49868ms step_avg:164.04ms step:315/1530 train_loss:3.9390 train_time:50032ms step_avg:164.04ms step:316/1530 train_loss:3.7869 train_time:50199ms step_avg:164.05ms step:317/1530 train_loss:3.8684 train_time:50365ms step_avg:164.06ms step:318/1530 train_loss:3.9190 train_time:50530ms step_avg:164.06ms step:319/1530 train_loss:3.8875 train_time:50695ms step_avg:164.06ms step:320/1530 train_loss:4.0047 train_time:50863ms step_avg:164.07ms step:321/1530 train_loss:3.9604 train_time:51027ms step_avg:164.08ms step:322/1530 train_loss:3.9297 train_time:51193ms step_avg:164.08ms step:323/1530 train_loss:4.0026 train_time:51358ms step_avg:164.08ms step:324/1530 train_loss:3.9335 train_time:51524ms step_avg:164.09ms step:325/1530 train_loss:4.0104 train_time:51689ms step_avg:164.09ms step:326/1530 train_loss:3.8867 train_time:51855ms step_avg:164.10ms step:327/1530 train_loss:4.3842 train_time:52021ms step_avg:164.11ms step:328/1530 train_loss:4.0690 train_time:52186ms step_avg:164.11ms step:329/1530 train_loss:3.7906 train_time:52351ms step_avg:164.11ms step:330/1530 train_loss:3.7456 train_time:52517ms step_avg:164.12ms step:331/1530 train_loss:3.9774 train_time:52682ms step_avg:164.12ms step:332/1530 train_loss:3.9121 train_time:52847ms step_avg:164.12ms step:333/1530 train_loss:3.8851 train_time:53011ms step_avg:164.12ms step:334/1530 train_loss:3.8362 train_time:53176ms step_avg:164.12ms step:335/1530 train_loss:4.0148 train_time:53343ms step_avg:164.13ms step:336/1530 train_loss:3.9648 train_time:53507ms step_avg:164.13ms step:337/1530 train_loss:4.4133 train_time:53672ms step_avg:164.13ms step:338/1530 train_loss:3.9278 train_time:53838ms step_avg:164.14ms step:339/1530 train_loss:3.8606 train_time:54003ms step_avg:164.14ms step:340/1530 train_loss:3.9341 train_time:54169ms step_avg:164.15ms step:341/1530 train_loss:3.8531 train_time:54337ms step_avg:164.16ms step:342/1530 train_loss:3.8108 train_time:54505ms step_avg:164.17ms step:343/1530 train_loss:3.8375 train_time:54672ms step_avg:164.18ms step:344/1530 train_loss:3.9926 train_time:54839ms step_avg:164.19ms step:345/1530 train_loss:3.8115 train_time:55008ms step_avg:164.20ms step:346/1530 train_loss:3.7603 train_time:55176ms step_avg:164.22ms step:347/1530 train_loss:3.7935 train_time:55346ms step_avg:164.23ms step:348/1530 train_loss:3.8517 train_time:55514ms step_avg:164.24ms step:349/1530 train_loss:3.8266 train_time:55681ms step_avg:164.25ms step:350/1530 train_loss:3.5643 train_time:55850ms step_avg:164.26ms step:351/1530 train_loss:3.8195 train_time:56017ms step_avg:164.27ms step:352/1530 train_loss:4.1811 train_time:56185ms step_avg:164.28ms step:353/1530 train_loss:3.6727 train_time:56353ms step_avg:164.29ms step:354/1530 train_loss:3.9171 train_time:56520ms step_avg:164.30ms step:355/1530 train_loss:3.7765 train_time:56688ms step_avg:164.31ms step:356/1530 train_loss:3.8774 train_time:56856ms step_avg:164.32ms step:357/1530 train_loss:3.7577 train_time:57026ms step_avg:164.34ms step:358/1530 train_loss:3.8625 train_time:57193ms step_avg:164.35ms step:359/1530 train_loss:3.7916 train_time:57364ms step_avg:164.37ms step:360/1530 train_loss:3.4250 train_time:57533ms step_avg:164.38ms step:361/1530 train_loss:4.0228 train_time:57703ms step_avg:164.40ms step:362/1530 train_loss:3.9142 train_time:57871ms step_avg:164.41ms step:363/1530 train_loss:3.8349 train_time:58039ms step_avg:164.42ms step:364/1530 train_loss:3.7362 train_time:58208ms step_avg:164.43ms step:365/1530 train_loss:3.9107 train_time:58376ms step_avg:164.44ms step:366/1530 train_loss:3.8534 train_time:58545ms step_avg:164.45ms step:367/1530 train_loss:3.8540 train_time:58712ms step_avg:164.46ms step:368/1530 train_loss:3.8435 train_time:58879ms step_avg:164.47ms step:369/1530 train_loss:3.7450 train_time:59048ms step_avg:164.48ms step:370/1530 train_loss:3.8730 train_time:59216ms step_avg:164.49ms step:371/1530 train_loss:3.7293 train_time:59384ms step_avg:164.50ms step:372/1530 train_loss:3.6916 train_time:59552ms step_avg:164.51ms step:373/1530 train_loss:3.9114 train_time:59720ms step_avg:164.52ms step:374/1530 train_loss:3.8191 train_time:59887ms step_avg:164.53ms step:375/1530 train_loss:3.7913 train_time:60055ms step_avg:164.54ms step:375/1530 val_loss:3.8220 train_time:60104ms step_avg:164.67ms step:376/1530 train_loss:3.8643 train_time:60227ms step_avg:164.55ms step:377/1530 train_loss:3.7912 train_time:60536ms step_avg:164.95ms step:378/1530 train_loss:3.8650 train_time:60714ms step_avg:164.98ms step:379/1530 train_loss:3.8684 train_time:61028ms step_avg:165.39ms step:380/1530 train_loss:3.9492 train_time:61196ms step_avg:165.39ms step:381/1530 train_loss:3.8351 train_time:61363ms step_avg:165.40ms step:382/1530 train_loss:3.7921 train_time:61531ms step_avg:165.41ms step:383/1530 train_loss:3.7938 train_time:61700ms step_avg:165.41ms step:384/1530 train_loss:3.8679 train_time:61865ms step_avg:165.41ms step:385/1530 train_loss:3.7897 train_time:62034ms step_avg:165.42ms step:386/1530 train_loss:3.8850 train_time:62201ms step_avg:165.43ms step:387/1530 train_loss:4.0552 train_time:62369ms step_avg:165.43ms step:388/1530 train_loss:3.7918 train_time:62538ms step_avg:165.45ms step:389/1530 train_loss:3.7941 train_time:62708ms step_avg:165.46ms step:390/1530 train_loss:3.8990 train_time:62878ms step_avg:165.47ms step:391/1530 train_loss:3.8103 train_time:63045ms step_avg:165.47ms step:392/1530 train_loss:3.9205 train_time:63213ms step_avg:165.48ms step:393/1530 train_loss:3.7609 train_time:63380ms step_avg:165.48ms step:394/1530 train_loss:3.8826 train_time:63548ms step_avg:165.49ms step:395/1530 train_loss:3.6211 train_time:63717ms step_avg:165.50ms step:396/1530 train_loss:3.8355 train_time:63885ms step_avg:165.51ms step:397/1530 train_loss:3.8549 train_time:64053ms step_avg:165.51ms step:398/1530 train_loss:3.8851 train_time:64220ms step_avg:165.52ms step:399/1530 train_loss:3.7641 train_time:64386ms step_avg:165.52ms step:400/1530 train_loss:3.8291 train_time:64555ms step_avg:165.53ms step:401/1530 train_loss:3.9100 train_time:64722ms step_avg:165.53ms step:402/1530 train_loss:3.8387 train_time:64889ms step_avg:165.53ms step:403/1530 train_loss:3.9553 train_time:65057ms step_avg:165.54ms step:404/1530 train_loss:3.6788 train_time:65223ms step_avg:165.54ms step:405/1530 train_loss:3.7835 train_time:65392ms step_avg:165.55ms step:406/1530 train_loss:4.0937 train_time:65558ms step_avg:165.55ms step:407/1530 train_loss:3.7787 train_time:65724ms step_avg:165.55ms step:408/1530 train_loss:3.8210 train_time:65892ms step_avg:165.56ms step:409/1530 train_loss:3.8488 train_time:66059ms step_avg:165.56ms step:410/1530 train_loss:3.7596 train_time:66226ms step_avg:165.57ms step:411/1530 train_loss:3.7538 train_time:66395ms step_avg:165.57ms step:412/1530 train_loss:4.1747 train_time:66561ms step_avg:165.58ms step:413/1530 train_loss:3.6626 train_time:66728ms step_avg:165.58ms step:414/1530 train_loss:4.0106 train_time:66896ms step_avg:165.58ms step:415/1530 train_loss:3.7502 train_time:67062ms step_avg:165.58ms step:416/1530 train_loss:3.7556 train_time:67229ms step_avg:165.59ms step:417/1530 train_loss:3.9507 train_time:67399ms step_avg:165.60ms step:418/1530 train_loss:3.6880 train_time:67566ms step_avg:165.60ms step:419/1530 train_loss:3.8024 train_time:67733ms step_avg:165.61ms step:420/1530 train_loss:3.7010 train_time:67900ms step_avg:165.61ms step:421/1530 train_loss:3.6457 train_time:68066ms step_avg:165.61ms step:422/1530 train_loss:3.7809 train_time:68235ms step_avg:165.62ms step:423/1530 train_loss:3.8652 train_time:68402ms step_avg:165.62ms step:424/1530 train_loss:3.6120 train_time:68569ms step_avg:165.63ms step:425/1530 train_loss:3.7879 train_time:68737ms step_avg:165.63ms step:426/1530 train_loss:3.6533 train_time:68904ms step_avg:165.63ms step:427/1530 train_loss:3.8883 train_time:69072ms step_avg:165.64ms step:428/1530 train_loss:3.8085 train_time:69238ms step_avg:165.64ms step:429/1530 train_loss:3.7566 train_time:69405ms step_avg:165.65ms step:430/1530 train_loss:3.7047 train_time:69573ms step_avg:165.65ms step:431/1530 train_loss:3.6279 train_time:69740ms step_avg:165.65ms step:432/1530 train_loss:3.7655 train_time:69909ms step_avg:165.66ms step:433/1530 train_loss:3.8127 train_time:70077ms step_avg:165.67ms step:434/1530 train_loss:3.7692 train_time:70243ms step_avg:165.67ms step:435/1530 train_loss:3.7975 train_time:70412ms step_avg:165.68ms step:436/1530 train_loss:3.8255 train_time:70579ms step_avg:165.68ms step:437/1530 train_loss:3.7166 train_time:70747ms step_avg:165.68ms step:438/1530 train_loss:3.6958 train_time:70915ms step_avg:165.69ms step:439/1530 train_loss:3.7063 train_time:71082ms step_avg:165.69ms step:440/1530 train_loss:3.8856 train_time:71250ms step_avg:165.70ms step:441/1530 train_loss:3.7530 train_time:71418ms step_avg:165.70ms step:442/1530 train_loss:3.7346 train_time:71584ms step_avg:165.70ms step:443/1530 train_loss:3.6195 train_time:71750ms step_avg:165.71ms step:444/1530 train_loss:3.9208 train_time:71919ms step_avg:165.71ms step:445/1530 train_loss:3.8421 train_time:72085ms step_avg:165.71ms step:446/1530 train_loss:3.8352 train_time:72253ms step_avg:165.72ms step:447/1530 train_loss:3.7493 train_time:72420ms step_avg:165.72ms step:448/1530 train_loss:3.8464 train_time:72586ms step_avg:165.72ms step:449/1530 train_loss:3.6854 train_time:72754ms step_avg:165.73ms step:450/1530 train_loss:3.7134 train_time:72921ms step_avg:165.73ms step:451/1530 train_loss:3.5786 train_time:73089ms step_avg:165.73ms step:452/1530 train_loss:3.7081 train_time:73256ms step_avg:165.74ms step:453/1530 train_loss:3.6681 train_time:73424ms step_avg:165.74ms step:454/1530 train_loss:3.6337 train_time:73594ms step_avg:165.75ms step:455/1530 train_loss:3.8379 train_time:73761ms step_avg:165.75ms step:456/1530 train_loss:3.7197 train_time:73931ms step_avg:165.76ms step:457/1530 train_loss:3.7713 train_time:74102ms step_avg:165.78ms step:458/1530 train_loss:3.8204 train_time:74271ms step_avg:165.78ms step:459/1530 train_loss:3.6235 train_time:74441ms step_avg:165.79ms step:460/1530 train_loss:3.7874 train_time:74612ms step_avg:165.80ms step:461/1530 train_loss:3.6816 train_time:74781ms step_avg:165.81ms step:462/1530 train_loss:3.7349 train_time:74952ms step_avg:165.82ms step:463/1530 train_loss:3.7712 train_time:75122ms step_avg:165.83ms step:464/1530 train_loss:3.7111 train_time:75292ms step_avg:165.84ms step:465/1530 train_loss:3.7096 train_time:75460ms step_avg:165.85ms step:466/1530 train_loss:3.7877 train_time:75630ms step_avg:165.85ms step:467/1530 train_loss:3.8206 train_time:75804ms step_avg:165.87ms step:468/1530 train_loss:3.7886 train_time:75972ms step_avg:165.88ms step:469/1530 train_loss:3.6841 train_time:76141ms step_avg:165.89ms step:470/1530 train_loss:3.7660 train_time:76313ms step_avg:165.90ms step:471/1530 train_loss:3.8036 train_time:76483ms step_avg:165.91ms step:472/1530 train_loss:3.7791 train_time:76655ms step_avg:165.92ms step:473/1530 train_loss:3.7130 train_time:76824ms step_avg:165.93ms step:474/1530 train_loss:3.5942 train_time:76995ms step_avg:165.94ms step:475/1530 train_loss:4.0152 train_time:77164ms step_avg:165.94ms step:476/1530 train_loss:3.7513 train_time:77334ms step_avg:165.95ms step:477/1530 train_loss:3.5972 train_time:77504ms step_avg:165.96ms step:478/1530 train_loss:3.8194 train_time:77673ms step_avg:165.97ms step:479/1530 train_loss:3.7723 train_time:77842ms step_avg:165.98ms step:480/1530 train_loss:3.9125 train_time:78013ms step_avg:165.99ms step:481/1530 train_loss:3.7198 train_time:78182ms step_avg:165.99ms step:482/1530 train_loss:3.5251 train_time:78352ms step_avg:166.00ms step:483/1530 train_loss:3.7996 train_time:78521ms step_avg:166.01ms step:484/1530 train_loss:3.6477 train_time:78692ms step_avg:166.02ms step:485/1530 train_loss:3.6524 train_time:78861ms step_avg:166.02ms step:486/1530 train_loss:3.5706 train_time:79031ms step_avg:166.03ms step:487/1530 train_loss:3.6814 train_time:79201ms step_avg:166.04ms step:488/1530 train_loss:3.8790 train_time:79370ms step_avg:166.05ms step:489/1530 train_loss:3.7050 train_time:79541ms step_avg:166.06ms step:490/1530 train_loss:3.5883 train_time:79710ms step_avg:166.06ms step:491/1530 train_loss:3.6112 train_time:79879ms step_avg:166.07ms step:492/1530 train_loss:3.7309 train_time:80049ms step_avg:166.08ms step:493/1530 train_loss:3.5722 train_time:80220ms step_avg:166.09ms step:494/1530 train_loss:3.6947 train_time:80389ms step_avg:166.09ms step:495/1530 train_loss:3.6572 train_time:80559ms step_avg:166.10ms step:496/1530 train_loss:3.5087 train_time:80729ms step_avg:166.11ms step:497/1530 train_loss:3.7300 train_time:80898ms step_avg:166.12ms step:498/1530 train_loss:3.7818 train_time:81067ms step_avg:166.12ms step:499/1530 train_loss:3.8117 train_time:81237ms step_avg:166.13ms step:500/1530 train_loss:3.7322 train_time:81407ms step_avg:166.14ms step:500/1530 val_loss:3.7028 train_time:81457ms step_avg:166.24ms step:501/1530 train_loss:3.8035 train_time:81579ms step_avg:166.15ms step:502/1530 train_loss:3.7467 train_time:81750ms step_avg:166.16ms step:503/1530 train_loss:3.7722 train_time:81920ms step_avg:166.17ms step:504/1530 train_loss:3.7166 train_time:82089ms step_avg:166.17ms step:505/1530 train_loss:3.8020 train_time:82259ms step_avg:166.18ms step:506/1530 train_loss:3.6458 train_time:82429ms step_avg:166.19ms step:507/1530 train_loss:3.7585 train_time:82598ms step_avg:166.19ms step:508/1530 train_loss:3.8215 train_time:82768ms step_avg:166.20ms step:509/1530 train_loss:3.7705 train_time:82938ms step_avg:166.21ms step:510/1530 train_loss:3.5768 train_time:83107ms step_avg:166.21ms step:511/1530 train_loss:3.7736 train_time:83277ms step_avg:166.22ms step:512/1530 train_loss:3.7157 train_time:83450ms step_avg:166.24ms step:513/1530 train_loss:3.6592 train_time:83619ms step_avg:166.24ms step:514/1530 train_loss:3.8216 train_time:83788ms step_avg:166.25ms step:515/1530 train_loss:3.7310 train_time:83958ms step_avg:166.25ms step:516/1530 train_loss:4.0674 train_time:84128ms step_avg:166.26ms step:517/1530 train_loss:3.6935 train_time:84297ms step_avg:166.27ms step:518/1530 train_loss:3.7671 train_time:84464ms step_avg:166.27ms step:519/1530 train_loss:3.6581 train_time:84634ms step_avg:166.27ms step:520/1530 train_loss:3.6768 train_time:84802ms step_avg:166.28ms step:521/1530 train_loss:3.6605 train_time:84971ms step_avg:166.28ms step:522/1530 train_loss:3.6572 train_time:85142ms step_avg:166.29ms step:523/1530 train_loss:4.2965 train_time:85312ms step_avg:166.30ms step:524/1530 train_loss:3.7409 train_time:85481ms step_avg:166.31ms step:525/1530 train_loss:3.6769 train_time:85651ms step_avg:166.31ms step:526/1530 train_loss:3.6922 train_time:85820ms step_avg:166.32ms step:527/1530 train_loss:3.6510 train_time:85990ms step_avg:166.33ms step:528/1530 train_loss:3.6222 train_time:86160ms step_avg:166.33ms step:529/1530 train_loss:3.8441 train_time:86329ms step_avg:166.34ms step:530/1530 train_loss:3.6398 train_time:86498ms step_avg:166.34ms step:531/1530 train_loss:3.9174 train_time:86668ms step_avg:166.35ms step:532/1530 train_loss:3.7319 train_time:86837ms step_avg:166.36ms step:533/1530 train_loss:3.6496 train_time:87005ms step_avg:166.36ms step:534/1530 train_loss:3.6663 train_time:87175ms step_avg:166.36ms step:535/1530 train_loss:3.6026 train_time:87344ms step_avg:166.37ms step:536/1530 train_loss:3.7503 train_time:87515ms step_avg:166.38ms step:537/1530 train_loss:3.7251 train_time:87685ms step_avg:166.38ms step:538/1530 train_loss:3.6240 train_time:87856ms step_avg:166.39ms step:539/1530 train_loss:4.1127 train_time:88026ms step_avg:166.40ms step:540/1530 train_loss:3.6741 train_time:88195ms step_avg:166.41ms step:541/1530 train_loss:3.7840 train_time:88363ms step_avg:166.41ms step:542/1530 train_loss:3.5854 train_time:88532ms step_avg:166.41ms step:543/1530 train_loss:3.5837 train_time:88700ms step_avg:166.42ms step:544/1530 train_loss:3.6364 train_time:88869ms step_avg:166.42ms step:545/1530 train_loss:3.5867 train_time:89039ms step_avg:166.43ms step:546/1530 train_loss:3.6219 train_time:89207ms step_avg:166.43ms step:547/1530 train_loss:3.6400 train_time:89376ms step_avg:166.44ms step:548/1530 train_loss:3.6080 train_time:89544ms step_avg:166.44ms step:549/1530 train_loss:3.7170 train_time:89713ms step_avg:166.44ms step:550/1530 train_loss:3.6086 train_time:89881ms step_avg:166.45ms step:551/1530 train_loss:3.6228 train_time:90050ms step_avg:166.45ms step:552/1530 train_loss:3.9354 train_time:90220ms step_avg:166.46ms step:553/1530 train_loss:3.7497 train_time:90389ms step_avg:166.46ms step:554/1530 train_loss:3.7060 train_time:90558ms step_avg:166.47ms step:555/1530 train_loss:3.6167 train_time:90727ms step_avg:166.47ms step:556/1530 train_loss:3.6958 train_time:90895ms step_avg:166.48ms step:557/1530 train_loss:3.3073 train_time:91064ms step_avg:166.48ms step:558/1530 train_loss:3.6098 train_time:91234ms step_avg:166.49ms step:559/1530 train_loss:3.6472 train_time:91403ms step_avg:166.49ms step:560/1530 train_loss:3.6811 train_time:91573ms step_avg:166.50ms step:561/1530 train_loss:3.6027 train_time:91741ms step_avg:166.50ms step:562/1530 train_loss:3.5494 train_time:91910ms step_avg:166.50ms step:563/1530 train_loss:3.7543 train_time:92081ms step_avg:166.51ms step:564/1530 train_loss:3.5684 train_time:92251ms step_avg:166.52ms step:565/1530 train_loss:3.6692 train_time:92419ms step_avg:166.52ms step:566/1530 train_loss:3.6191 train_time:92720ms step_avg:166.76ms step:567/1530 train_loss:3.5974 train_time:92898ms step_avg:166.78ms step:568/1530 train_loss:3.6857 train_time:93068ms step_avg:166.79ms step:569/1530 train_loss:3.6454 train_time:93395ms step_avg:167.08ms step:570/1530 train_loss:3.6853 train_time:93574ms step_avg:167.10ms step:571/1530 train_loss:3.7553 train_time:93744ms step_avg:167.10ms step:572/1530 train_loss:3.7259 train_time:93917ms step_avg:167.11ms step:573/1530 train_loss:3.7332 train_time:94089ms step_avg:167.12ms step:574/1530 train_loss:3.7742 train_time:94264ms step_avg:167.13ms step:575/1530 train_loss:3.7265 train_time:94435ms step_avg:167.14ms step:576/1530 train_loss:3.7573 train_time:94605ms step_avg:167.15ms step:577/1530 train_loss:3.6599 train_time:94777ms step_avg:167.16ms step:578/1530 train_loss:3.6699 train_time:94949ms step_avg:167.16ms step:579/1530 train_loss:3.6647 train_time:95120ms step_avg:167.17ms step:580/1530 train_loss:3.5887 train_time:95291ms step_avg:167.18ms step:581/1530 train_loss:3.6283 train_time:95463ms step_avg:167.18ms step:582/1530 train_loss:3.8406 train_time:95634ms step_avg:167.19ms step:583/1530 train_loss:3.6164 train_time:95804ms step_avg:167.20ms step:584/1530 train_loss:3.5898 train_time:95976ms step_avg:167.20ms step:585/1530 train_loss:3.7801 train_time:96146ms step_avg:167.21ms step:586/1530 train_loss:3.5146 train_time:96318ms step_avg:167.22ms step:587/1530 train_loss:3.6687 train_time:96490ms step_avg:167.23ms step:588/1530 train_loss:3.6341 train_time:96661ms step_avg:167.23ms step:589/1530 train_loss:3.9877 train_time:96833ms step_avg:167.24ms step:590/1530 train_loss:3.7783 train_time:97004ms step_avg:167.25ms step:591/1530 train_loss:3.4995 train_time:97176ms step_avg:167.26ms step:592/1530 train_loss:3.5345 train_time:97349ms step_avg:167.27ms step:593/1530 train_loss:3.4994 train_time:97522ms step_avg:167.28ms step:594/1530 train_loss:3.5479 train_time:97694ms step_avg:167.28ms step:595/1530 train_loss:3.9163 train_time:97867ms step_avg:167.29ms step:596/1530 train_loss:3.6410 train_time:98040ms step_avg:167.30ms step:597/1530 train_loss:3.5763 train_time:98210ms step_avg:167.31ms step:598/1530 train_loss:3.6530 train_time:98381ms step_avg:167.32ms step:599/1530 train_loss:3.4768 train_time:98555ms step_avg:167.33ms step:600/1530 train_loss:3.5878 train_time:98725ms step_avg:167.33ms step:601/1530 train_loss:3.6420 train_time:98898ms step_avg:167.34ms step:602/1530 train_loss:3.6660 train_time:99069ms step_avg:167.35ms step:603/1530 train_loss:3.7771 train_time:99242ms step_avg:167.36ms step:604/1530 train_loss:3.6083 train_time:99413ms step_avg:167.36ms step:605/1530 train_loss:3.6093 train_time:99585ms step_avg:167.37ms step:606/1530 train_loss:3.5677 train_time:99759ms step_avg:167.38ms step:607/1530 train_loss:3.8329 train_time:99929ms step_avg:167.39ms step:608/1530 train_loss:3.6253 train_time:100100ms step_avg:167.39ms step:609/1530 train_loss:3.6167 train_time:100271ms step_avg:167.40ms step:610/1530 train_loss:3.7018 train_time:100441ms step_avg:167.40ms step:611/1530 train_loss:3.5955 train_time:100612ms step_avg:167.41ms step:612/1530 train_loss:3.5656 train_time:100782ms step_avg:167.41ms step:613/1530 train_loss:3.7580 train_time:100955ms step_avg:167.42ms step:614/1530 train_loss:3.6921 train_time:101126ms step_avg:167.43ms step:615/1530 train_loss:3.6867 train_time:101297ms step_avg:167.43ms step:616/1530 train_loss:3.6270 train_time:101466ms step_avg:167.44ms step:617/1530 train_loss:3.5612 train_time:101640ms step_avg:167.45ms step:618/1530 train_loss:3.6827 train_time:101811ms step_avg:167.45ms step:619/1530 train_loss:3.5428 train_time:101981ms step_avg:167.46ms step:620/1530 train_loss:3.5871 train_time:102153ms step_avg:167.46ms step:621/1530 train_loss:3.9240 train_time:102325ms step_avg:167.47ms step:622/1530 train_loss:3.5683 train_time:102497ms step_avg:167.48ms step:623/1530 train_loss:3.5918 train_time:102670ms step_avg:167.49ms step:624/1530 train_loss:3.6870 train_time:102842ms step_avg:167.49ms step:625/1530 train_loss:3.7024 train_time:103012ms step_avg:167.50ms step:625/1530 val_loss:3.6177 train_time:103061ms step_avg:167.58ms step:626/1530 train_loss:3.7300 train_time:103185ms step_avg:167.51ms step:627/1530 train_loss:3.7067 train_time:103358ms step_avg:167.52ms step:628/1530 train_loss:3.7592 train_time:103528ms step_avg:167.52ms step:629/1530 train_loss:3.5882 train_time:103701ms step_avg:167.53ms step:630/1530 train_loss:3.7234 train_time:103872ms step_avg:167.54ms step:631/1530 train_loss:3.7408 train_time:104042ms step_avg:167.54ms step:632/1530 train_loss:3.6466 train_time:104213ms step_avg:167.55ms step:633/1530 train_loss:3.5943 train_time:104385ms step_avg:167.55ms step:634/1530 train_loss:3.6917 train_time:104556ms step_avg:167.56ms step:635/1530 train_loss:3.9507 train_time:104726ms step_avg:167.56ms step:636/1530 train_loss:3.5474 train_time:104899ms step_avg:167.57ms step:637/1530 train_loss:3.3534 train_time:105069ms step_avg:167.57ms step:638/1530 train_loss:3.5934 train_time:105240ms step_avg:167.58ms step:639/1530 train_loss:3.6285 train_time:105409ms step_avg:167.58ms step:640/1530 train_loss:3.5648 train_time:105581ms step_avg:167.59ms step:641/1530 train_loss:3.5828 train_time:105750ms step_avg:167.59ms step:642/1530 train_loss:3.6292 train_time:105920ms step_avg:167.60ms step:643/1530 train_loss:3.5916 train_time:106091ms step_avg:167.60ms step:644/1530 train_loss:3.5591 train_time:106261ms step_avg:167.60ms step:645/1530 train_loss:3.7692 train_time:106431ms step_avg:167.61ms step:646/1530 train_loss:3.6665 train_time:106604ms step_avg:167.62ms step:647/1530 train_loss:3.6635 train_time:106775ms step_avg:167.62ms step:648/1530 train_loss:3.7065 train_time:106945ms step_avg:167.62ms step:649/1530 train_loss:3.7628 train_time:107115ms step_avg:167.63ms step:650/1530 train_loss:3.6190 train_time:107287ms step_avg:167.64ms step:651/1530 train_loss:3.7641 train_time:107459ms step_avg:167.64ms step:652/1530 train_loss:3.5810 train_time:107629ms step_avg:167.65ms step:653/1530 train_loss:3.6599 train_time:107800ms step_avg:167.65ms step:654/1530 train_loss:3.4242 train_time:107970ms step_avg:167.66ms step:655/1530 train_loss:3.5831 train_time:108139ms step_avg:167.66ms step:656/1530 train_loss:3.5702 train_time:108308ms step_avg:167.66ms step:657/1530 train_loss:3.4975 train_time:108480ms step_avg:167.67ms step:658/1530 train_loss:3.6831 train_time:108650ms step_avg:167.67ms step:659/1530 train_loss:3.5814 train_time:108821ms step_avg:167.68ms step:660/1530 train_loss:3.6820 train_time:108990ms step_avg:167.68ms step:661/1530 train_loss:3.7473 train_time:109163ms step_avg:167.69ms step:662/1530 train_loss:3.6669 train_time:109335ms step_avg:167.69ms step:663/1530 train_loss:3.5530 train_time:109505ms step_avg:167.70ms step:664/1530 train_loss:3.6052 train_time:109677ms step_avg:167.70ms step:665/1530 train_loss:3.4891 train_time:109847ms step_avg:167.71ms step:666/1530 train_loss:3.7769 train_time:110018ms step_avg:167.71ms step:667/1530 train_loss:3.6004 train_time:110188ms step_avg:167.71ms step:668/1530 train_loss:3.6420 train_time:110360ms step_avg:167.72ms step:669/1530 train_loss:3.4854 train_time:110530ms step_avg:167.72ms step:670/1530 train_loss:3.6024 train_time:110703ms step_avg:167.73ms step:671/1530 train_loss:3.5531 train_time:110875ms step_avg:167.74ms step:672/1530 train_loss:3.5623 train_time:111045ms step_avg:167.74ms step:673/1530 train_loss:3.8432 train_time:111216ms step_avg:167.75ms step:674/1530 train_loss:3.6228 train_time:111386ms step_avg:167.75ms step:675/1530 train_loss:3.7072 train_time:111558ms step_avg:167.76ms step:676/1530 train_loss:3.4883 train_time:111728ms step_avg:167.76ms step:677/1530 train_loss:3.5940 train_time:111902ms step_avg:167.77ms step:678/1530 train_loss:3.5533 train_time:112070ms step_avg:167.77ms step:679/1530 train_loss:3.6725 train_time:112242ms step_avg:167.78ms step:680/1530 train_loss:3.5827 train_time:112411ms step_avg:167.78ms step:681/1530 train_loss:3.6092 train_time:112585ms step_avg:167.79ms step:682/1530 train_loss:3.6588 train_time:112762ms step_avg:167.80ms step:683/1530 train_loss:3.7298 train_time:112936ms step_avg:167.81ms step:684/1530 train_loss:3.6456 train_time:113108ms step_avg:167.82ms step:685/1530 train_loss:3.6802 train_time:113282ms step_avg:167.83ms step:686/1530 train_loss:3.6315 train_time:113454ms step_avg:167.83ms step:687/1530 train_loss:3.6601 train_time:113627ms step_avg:167.84ms step:688/1530 train_loss:3.1947 train_time:113804ms step_avg:167.85ms step:689/1530 train_loss:3.4070 train_time:113978ms step_avg:167.86ms step:690/1530 train_loss:3.5392 train_time:114152ms step_avg:167.87ms step:691/1530 train_loss:3.4073 train_time:114324ms step_avg:167.88ms step:692/1530 train_loss:3.6238 train_time:114496ms step_avg:167.88ms step:693/1530 train_loss:3.6452 train_time:114668ms step_avg:167.89ms step:694/1530 train_loss:3.5480 train_time:114841ms step_avg:167.90ms step:695/1530 train_loss:3.5262 train_time:115011ms step_avg:167.90ms step:696/1530 train_loss:3.8505 train_time:115185ms step_avg:167.91ms step:697/1530 train_loss:3.5839 train_time:115359ms step_avg:167.92ms step:698/1530 train_loss:3.6414 train_time:115531ms step_avg:167.92ms step:699/1530 train_loss:3.7573 train_time:115705ms step_avg:167.93ms step:700/1530 train_loss:3.5639 train_time:115877ms step_avg:167.94ms step:701/1530 train_loss:3.5391 train_time:116048ms step_avg:167.94ms step:702/1530 train_loss:3.5087 train_time:116221ms step_avg:167.95ms step:703/1530 train_loss:3.4942 train_time:116393ms step_avg:167.96ms step:704/1530 train_loss:3.5729 train_time:116565ms step_avg:167.96ms step:705/1530 train_loss:3.5545 train_time:116742ms step_avg:167.97ms step:706/1530 train_loss:3.5751 train_time:116919ms step_avg:167.99ms step:707/1530 train_loss:3.6420 train_time:117093ms step_avg:168.00ms step:708/1530 train_loss:3.5974 train_time:117266ms step_avg:168.00ms step:709/1530 train_loss:3.5750 train_time:117441ms step_avg:168.01ms step:710/1530 train_loss:3.5345 train_time:117611ms step_avg:168.02ms step:711/1530 train_loss:3.5902 train_time:117785ms step_avg:168.02ms step:712/1530 train_loss:3.6449 train_time:117960ms step_avg:168.03ms step:713/1530 train_loss:3.6560 train_time:118134ms step_avg:168.04ms step:714/1530 train_loss:3.5620 train_time:118306ms step_avg:168.05ms step:715/1530 train_loss:3.5647 train_time:118479ms step_avg:168.06ms step:716/1530 train_loss:3.5873 train_time:118649ms step_avg:168.06ms step:717/1530 train_loss:3.7014 train_time:118824ms step_avg:168.07ms step:718/1530 train_loss:3.5920 train_time:118996ms step_avg:168.07ms step:719/1530 train_loss:3.6722 train_time:119169ms step_avg:168.08ms step:720/1530 train_loss:3.8477 train_time:119343ms step_avg:168.09ms step:721/1530 train_loss:3.4610 train_time:119517ms step_avg:168.10ms step:722/1530 train_loss:3.7325 train_time:119690ms step_avg:168.10ms step:723/1530 train_loss:3.7666 train_time:119862ms step_avg:168.11ms step:724/1530 train_loss:3.5679 train_time:120035ms step_avg:168.12ms step:725/1530 train_loss:3.6541 train_time:120207ms step_avg:168.12ms step:726/1530 train_loss:3.5286 train_time:120381ms step_avg:168.13ms step:727/1530 train_loss:3.5783 train_time:120556ms step_avg:168.14ms step:728/1530 train_loss:3.7314 train_time:120729ms step_avg:168.15ms step:729/1530 train_loss:3.6679 train_time:120903ms step_avg:168.15ms step:730/1530 train_loss:3.6604 train_time:121076ms step_avg:168.16ms step:731/1530 train_loss:3.5486 train_time:121248ms step_avg:168.17ms step:732/1530 train_loss:3.5956 train_time:121419ms step_avg:168.17ms step:733/1530 train_loss:3.8283 train_time:121592ms step_avg:168.18ms step:734/1530 train_loss:3.5554 train_time:121768ms step_avg:168.19ms step:735/1530 train_loss:3.6109 train_time:121942ms step_avg:168.20ms step:736/1530 train_loss:3.7336 train_time:122114ms step_avg:168.20ms step:737/1530 train_loss:3.6722 train_time:122285ms step_avg:168.21ms step:738/1530 train_loss:3.5990 train_time:122457ms step_avg:168.21ms step:739/1530 train_loss:3.4938 train_time:122628ms step_avg:168.21ms step:740/1530 train_loss:4.1083 train_time:122806ms step_avg:168.23ms step:741/1530 train_loss:3.4852 train_time:122979ms step_avg:168.23ms step:742/1530 train_loss:3.5586 train_time:123150ms step_avg:168.24ms step:743/1530 train_loss:3.5761 train_time:123323ms step_avg:168.24ms step:744/1530 train_loss:3.6416 train_time:123495ms step_avg:168.25ms step:745/1530 train_loss:3.5736 train_time:123668ms step_avg:168.26ms step:746/1530 train_loss:3.5902 train_time:123840ms step_avg:168.26ms step:747/1530 train_loss:3.6382 train_time:124013ms step_avg:168.27ms step:748/1530 train_loss:3.5577 train_time:124190ms step_avg:168.28ms step:749/1530 train_loss:3.5539 train_time:124363ms step_avg:168.29ms step:750/1530 train_loss:3.5917 train_time:124533ms step_avg:168.29ms step:750/1530 val_loss:3.5623 train_time:124583ms step_avg:168.36ms step:751/1530 train_loss:3.5711 train_time:124710ms step_avg:168.30ms step:752/1530 train_loss:3.6123 train_time:124880ms step_avg:168.30ms step:753/1530 train_loss:3.6062 train_time:125053ms step_avg:168.31ms step:754/1530 train_loss:3.5945 train_time:125225ms step_avg:168.31ms step:755/1530 train_loss:3.6798 train_time:125530ms step_avg:168.50ms step:756/1530 train_loss:3.4530 train_time:125716ms step_avg:168.52ms step:757/1530 train_loss:3.7165 train_time:125887ms step_avg:168.52ms step:758/1530 train_loss:3.6477 train_time:126059ms step_avg:168.53ms step:759/1530 train_loss:3.5874 train_time:126382ms step_avg:168.73ms step:760/1530 train_loss:3.7028 train_time:126552ms step_avg:168.74ms step:761/1530 train_loss:3.3965 train_time:126725ms step_avg:168.74ms step:762/1530 train_loss:3.5559 train_time:126897ms step_avg:168.75ms step:763/1530 train_loss:3.6592 train_time:127071ms step_avg:168.75ms step:764/1530 train_loss:3.3145 train_time:127242ms step_avg:168.76ms step:765/1530 train_loss:3.7273 train_time:127416ms step_avg:168.76ms step:766/1530 train_loss:3.5668 train_time:127588ms step_avg:168.77ms step:767/1530 train_loss:3.5630 train_time:127760ms step_avg:168.77ms step:768/1530 train_loss:3.5716 train_time:127934ms step_avg:168.78ms step:769/1530 train_loss:3.5818 train_time:128105ms step_avg:168.78ms step:770/1530 train_loss:3.6342 train_time:128277ms step_avg:168.79ms step:771/1530 train_loss:3.8809 train_time:128451ms step_avg:168.79ms step:772/1530 train_loss:3.4480 train_time:128622ms step_avg:168.80ms step:773/1530 train_loss:3.6249 train_time:128794ms step_avg:168.80ms step:774/1530 train_loss:3.6367 train_time:128965ms step_avg:168.80ms step:775/1530 train_loss:3.6071 train_time:129137ms step_avg:168.81ms step:776/1530 train_loss:3.4020 train_time:129311ms step_avg:168.81ms step:777/1530 train_loss:3.3767 train_time:129484ms step_avg:168.82ms step:778/1530 train_loss:3.4872 train_time:129656ms step_avg:168.82ms step:779/1530 train_loss:3.5815 train_time:129828ms step_avg:168.83ms step:780/1530 train_loss:3.5847 train_time:129999ms step_avg:168.83ms step:781/1530 train_loss:3.6680 train_time:130172ms step_avg:168.84ms step:782/1530 train_loss:3.5877 train_time:130343ms step_avg:168.84ms step:783/1530 train_loss:3.5625 train_time:130515ms step_avg:168.84ms step:784/1530 train_loss:3.5904 train_time:130686ms step_avg:168.85ms step:785/1530 train_loss:3.5566 train_time:130857ms step_avg:168.85ms step:786/1530 train_loss:3.4366 train_time:131034ms step_avg:168.86ms step:787/1530 train_loss:3.7174 train_time:131205ms step_avg:168.86ms step:788/1530 train_loss:3.4967 train_time:131377ms step_avg:168.87ms step:789/1530 train_loss:3.5469 train_time:131549ms step_avg:168.87ms step:790/1530 train_loss:3.6208 train_time:131723ms step_avg:168.88ms step:791/1530 train_loss:3.7669 train_time:131899ms step_avg:168.89ms step:792/1530 train_loss:3.7547 train_time:132073ms step_avg:168.89ms step:793/1530 train_loss:3.4424 train_time:132244ms step_avg:168.89ms step:794/1530 train_loss:3.5928 train_time:132418ms step_avg:168.90ms step:795/1530 train_loss:3.6662 train_time:132593ms step_avg:168.91ms step:796/1530 train_loss:3.7413 train_time:132771ms step_avg:168.92ms step:797/1530 train_loss:3.5237 train_time:132944ms step_avg:168.93ms step:798/1530 train_loss:3.6457 train_time:133121ms step_avg:168.93ms step:799/1530 train_loss:3.5291 train_time:133297ms step_avg:168.94ms step:800/1530 train_loss:3.5411 train_time:133471ms step_avg:168.95ms step:801/1530 train_loss:3.6201 train_time:133643ms step_avg:168.96ms step:802/1530 train_loss:3.4885 train_time:133821ms step_avg:168.97ms step:803/1530 train_loss:3.4829 train_time:133995ms step_avg:168.97ms step:804/1530 train_loss:3.6168 train_time:134168ms step_avg:168.98ms step:805/1530 train_loss:3.5139 train_time:134344ms step_avg:168.99ms step:806/1530 train_loss:3.5597 train_time:134517ms step_avg:168.99ms step:807/1530 train_loss:3.6404 train_time:134690ms step_avg:169.00ms step:808/1530 train_loss:3.5435 train_time:134865ms step_avg:169.00ms step:809/1530 train_loss:3.4862 train_time:135039ms step_avg:169.01ms step:810/1530 train_loss:3.5515 train_time:135212ms step_avg:169.01ms step:811/1530 train_loss:3.5753 train_time:135383ms step_avg:169.02ms step:812/1530 train_loss:3.6052 train_time:135556ms step_avg:169.02ms step:813/1530 train_loss:3.6189 train_time:135728ms step_avg:169.03ms step:814/1530 train_loss:3.5577 train_time:135903ms step_avg:169.03ms step:815/1530 train_loss:3.5627 train_time:136076ms step_avg:169.04ms step:816/1530 train_loss:3.6841 train_time:136252ms step_avg:169.05ms step:817/1530 train_loss:3.7623 train_time:136424ms step_avg:169.05ms step:818/1530 train_loss:3.5251 train_time:136596ms step_avg:169.05ms step:819/1530 train_loss:3.7126 train_time:136771ms step_avg:169.06ms step:820/1530 train_loss:3.4914 train_time:136948ms step_avg:169.07ms step:821/1530 train_loss:3.5571 train_time:137121ms step_avg:169.08ms step:822/1530 train_loss:3.6925 train_time:137297ms step_avg:169.08ms step:823/1530 train_loss:3.5691 train_time:137472ms step_avg:169.09ms step:824/1530 train_loss:3.5110 train_time:137645ms step_avg:169.10ms step:825/1530 train_loss:3.6079 train_time:137820ms step_avg:169.10ms step:826/1530 train_loss:3.4708 train_time:137996ms step_avg:169.11ms step:827/1530 train_loss:3.7278 train_time:138170ms step_avg:169.12ms step:828/1530 train_loss:3.6160 train_time:138342ms step_avg:169.12ms step:829/1530 train_loss:3.6291 train_time:138518ms step_avg:169.13ms step:830/1530 train_loss:3.5326 train_time:138694ms step_avg:169.14ms step:831/1530 train_loss:3.5966 train_time:138866ms step_avg:169.14ms step:832/1530 train_loss:3.5133 train_time:139041ms step_avg:169.15ms step:833/1530 train_loss:3.6432 train_time:139218ms step_avg:169.16ms step:834/1530 train_loss:3.4826 train_time:139391ms step_avg:169.16ms step:835/1530 train_loss:3.4560 train_time:139564ms step_avg:169.17ms step:836/1530 train_loss:3.7083 train_time:139739ms step_avg:169.18ms step:837/1530 train_loss:3.3912 train_time:139915ms step_avg:169.18ms step:838/1530 train_loss:3.5885 train_time:140088ms step_avg:169.19ms step:839/1530 train_loss:3.4183 train_time:140262ms step_avg:169.19ms step:840/1530 train_loss:3.4614 train_time:140434ms step_avg:169.20ms step:841/1530 train_loss:3.5659 train_time:140607ms step_avg:169.20ms step:842/1530 train_loss:3.5849 train_time:140781ms step_avg:169.21ms step:843/1530 train_loss:3.5564 train_time:140953ms step_avg:169.21ms step:844/1530 train_loss:3.4248 train_time:141126ms step_avg:169.22ms step:845/1530 train_loss:3.6643 train_time:141300ms step_avg:169.22ms step:846/1530 train_loss:3.5163 train_time:141475ms step_avg:169.23ms step:847/1530 train_loss:3.4905 train_time:141651ms step_avg:169.24ms step:848/1530 train_loss:3.6349 train_time:141823ms step_avg:169.24ms step:849/1530 train_loss:3.4845 train_time:141998ms step_avg:169.25ms step:850/1530 train_loss:3.4388 train_time:142172ms step_avg:169.25ms step:851/1530 train_loss:3.7299 train_time:142345ms step_avg:169.26ms step:852/1530 train_loss:3.4349 train_time:142520ms step_avg:169.26ms step:853/1530 train_loss:3.5647 train_time:142693ms step_avg:169.27ms step:854/1530 train_loss:3.6462 train_time:142868ms step_avg:169.27ms step:855/1530 train_loss:3.5095 train_time:143042ms step_avg:169.28ms step:856/1530 train_loss:3.5429 train_time:143217ms step_avg:169.29ms step:857/1530 train_loss:3.6023 train_time:143391ms step_avg:169.29ms step:858/1530 train_loss:3.4644 train_time:143565ms step_avg:169.30ms step:859/1530 train_loss:3.5616 train_time:143739ms step_avg:169.30ms step:860/1530 train_loss:3.5794 train_time:143912ms step_avg:169.31ms step:861/1530 train_loss:3.6296 train_time:144088ms step_avg:169.32ms step:862/1530 train_loss:3.5984 train_time:144264ms step_avg:169.32ms step:863/1530 train_loss:3.5692 train_time:144440ms step_avg:169.33ms step:864/1530 train_loss:3.3804 train_time:144615ms step_avg:169.34ms step:865/1530 train_loss:3.5955 train_time:144787ms step_avg:169.34ms step:866/1530 train_loss:3.9058 train_time:144963ms step_avg:169.35ms step:867/1530 train_loss:3.4574 train_time:145136ms step_avg:169.35ms step:868/1530 train_loss:3.6421 train_time:145307ms step_avg:169.36ms step:869/1530 train_loss:3.6112 train_time:145479ms step_avg:169.36ms step:870/1530 train_loss:3.4490 train_time:145654ms step_avg:169.36ms step:871/1530 train_loss:3.3929 train_time:145827ms step_avg:169.37ms step:872/1530 train_loss:3.6441 train_time:146001ms step_avg:169.38ms step:873/1530 train_loss:3.4550 train_time:146174ms step_avg:169.38ms step:874/1530 train_loss:3.2201 train_time:146351ms step_avg:169.39ms step:875/1530 train_loss:3.6305 train_time:146525ms step_avg:169.39ms step:875/1530 val_loss:3.5174 train_time:146575ms step_avg:169.45ms step:876/1530 train_loss:3.4365 train_time:146698ms step_avg:169.40ms step:877/1530 train_loss:3.6213 train_time:146875ms step_avg:169.41ms step:878/1530 train_loss:3.4742 train_time:147048ms step_avg:169.41ms step:879/1530 train_loss:3.6453 train_time:147221ms step_avg:169.41ms step:880/1530 train_loss:3.3098 train_time:147393ms step_avg:169.42ms step:881/1530 train_loss:3.4767 train_time:147565ms step_avg:169.42ms step:882/1530 train_loss:3.6975 train_time:147739ms step_avg:169.43ms step:883/1530 train_loss:3.8328 train_time:147911ms step_avg:169.43ms step:884/1530 train_loss:3.5587 train_time:148085ms step_avg:169.43ms step:885/1530 train_loss:3.4871 train_time:148258ms step_avg:169.44ms step:886/1530 train_loss:3.5667 train_time:148431ms step_avg:169.44ms step:887/1530 train_loss:4.0826 train_time:148605ms step_avg:169.45ms step:888/1530 train_loss:3.8316 train_time:148784ms step_avg:169.46ms step:889/1530 train_loss:3.5173 train_time:148957ms step_avg:169.46ms step:890/1530 train_loss:3.5286 train_time:149130ms step_avg:169.47ms step:891/1530 train_loss:3.3550 train_time:149302ms step_avg:169.47ms step:892/1530 train_loss:3.7149 train_time:149476ms step_avg:169.47ms step:893/1530 train_loss:3.4209 train_time:149647ms step_avg:169.48ms step:894/1530 train_loss:3.6352 train_time:149824ms step_avg:169.48ms step:895/1530 train_loss:3.6728 train_time:149999ms step_avg:169.49ms step:896/1530 train_loss:3.4939 train_time:150173ms step_avg:169.50ms step:897/1530 train_loss:3.5404 train_time:150347ms step_avg:169.50ms step:898/1530 train_loss:3.5859 train_time:150522ms step_avg:169.51ms step:899/1530 train_loss:3.4728 train_time:150694ms step_avg:169.51ms step:900/1530 train_loss:3.4263 train_time:150866ms step_avg:169.51ms step:901/1530 train_loss:3.6182 train_time:151040ms step_avg:169.52ms step:902/1530 train_loss:3.6316 train_time:151214ms step_avg:169.52ms step:903/1530 train_loss:3.5353 train_time:151390ms step_avg:169.53ms step:904/1530 train_loss:3.4875 train_time:151562ms step_avg:169.53ms step:905/1530 train_loss:3.4973 train_time:151734ms step_avg:169.53ms step:906/1530 train_loss:3.7057 train_time:151907ms step_avg:169.54ms step:907/1530 train_loss:3.5108 train_time:152081ms step_avg:169.54ms step:908/1530 train_loss:3.5613 train_time:152255ms step_avg:169.55ms step:909/1530 train_loss:3.4486 train_time:152430ms step_avg:169.56ms step:910/1530 train_loss:3.5229 train_time:152610ms step_avg:169.57ms step:911/1530 train_loss:3.6391 train_time:152785ms step_avg:169.57ms step:912/1530 train_loss:3.5873 train_time:152965ms step_avg:169.58ms step:913/1530 train_loss:3.4580 train_time:153144ms step_avg:169.59ms step:914/1530 train_loss:3.7449 train_time:153322ms step_avg:169.60ms step:915/1530 train_loss:3.5317 train_time:153501ms step_avg:169.61ms step:916/1530 train_loss:3.6116 train_time:153677ms step_avg:169.62ms step:917/1530 train_loss:3.5953 train_time:153853ms step_avg:169.63ms step:918/1530 train_loss:4.8277 train_time:154032ms step_avg:169.64ms step:919/1530 train_loss:3.4951 train_time:154212ms step_avg:169.65ms step:920/1530 train_loss:3.5856 train_time:154386ms step_avg:169.66ms step:921/1530 train_loss:3.5518 train_time:154563ms step_avg:169.66ms step:922/1530 train_loss:3.5790 train_time:154740ms step_avg:169.67ms step:923/1530 train_loss:3.6070 train_time:154914ms step_avg:169.68ms step:924/1530 train_loss:3.6779 train_time:155090ms step_avg:169.68ms step:925/1530 train_loss:3.6414 train_time:155264ms step_avg:169.69ms step:926/1530 train_loss:3.5486 train_time:155438ms step_avg:169.69ms step:927/1530 train_loss:3.5517 train_time:155613ms step_avg:169.70ms step:928/1530 train_loss:3.7757 train_time:155789ms step_avg:169.70ms step:929/1530 train_loss:3.6095 train_time:155963ms step_avg:169.71ms step:930/1530 train_loss:3.4016 train_time:156141ms step_avg:169.72ms step:931/1530 train_loss:3.4956 train_time:156315ms step_avg:169.72ms step:932/1530 train_loss:3.6471 train_time:156492ms step_avg:169.73ms step:933/1530 train_loss:3.3586 train_time:156669ms step_avg:169.74ms step:934/1530 train_loss:3.5764 train_time:156846ms step_avg:169.75ms step:935/1530 train_loss:3.4356 train_time:157026ms step_avg:169.76ms step:936/1530 train_loss:3.5080 train_time:157203ms step_avg:169.77ms step:937/1530 train_loss:3.6152 train_time:157382ms step_avg:169.78ms step:938/1530 train_loss:3.5361 train_time:157556ms step_avg:169.78ms step:939/1530 train_loss:3.6672 train_time:157737ms step_avg:169.79ms step:940/1530 train_loss:3.4731 train_time:157912ms step_avg:169.80ms step:941/1530 train_loss:3.5398 train_time:158087ms step_avg:169.80ms step:942/1530 train_loss:3.3524 train_time:158264ms step_avg:169.81ms step:943/1530 train_loss:3.7114 train_time:158444ms step_avg:169.82ms step:944/1530 train_loss:3.3991 train_time:158755ms step_avg:169.97ms step:945/1530 train_loss:3.4217 train_time:158937ms step_avg:169.99ms step:946/1530 train_loss:5.0655 train_time:159118ms step_avg:170.00ms step:947/1530 train_loss:3.5975 train_time:159295ms step_avg:170.00ms step:948/1530 train_loss:3.4795 train_time:159470ms step_avg:170.01ms step:949/1530 train_loss:3.3708 train_time:159793ms step_avg:170.17ms step:950/1530 train_loss:3.4398 train_time:159967ms step_avg:170.18ms step:951/1530 train_loss:3.4047 train_time:160147ms step_avg:170.19ms step:952/1530 train_loss:3.4754 train_time:160322ms step_avg:170.19ms step:953/1530 train_loss:3.5644 train_time:160498ms step_avg:170.20ms step:954/1530 train_loss:3.4444 train_time:160677ms step_avg:170.21ms step:955/1530 train_loss:3.4730 train_time:160852ms step_avg:170.21ms step:956/1530 train_loss:3.4387 train_time:161028ms step_avg:170.22ms step:957/1530 train_loss:3.4878 train_time:161206ms step_avg:170.23ms step:958/1530 train_loss:3.5026 train_time:161385ms step_avg:170.24ms step:959/1530 train_loss:3.5036 train_time:161561ms step_avg:170.24ms step:960/1530 train_loss:3.3998 train_time:161739ms step_avg:170.25ms step:961/1530 train_loss:3.6396 train_time:161915ms step_avg:170.26ms step:962/1530 train_loss:3.5911 train_time:162089ms step_avg:170.26ms step:963/1530 train_loss:3.6948 train_time:162265ms step_avg:170.27ms step:964/1530 train_loss:3.4270 train_time:162443ms step_avg:170.28ms step:965/1530 train_loss:3.4765 train_time:162616ms step_avg:170.28ms step:966/1530 train_loss:3.7053 train_time:162792ms step_avg:170.28ms step:967/1530 train_loss:3.5161 train_time:162967ms step_avg:170.29ms step:968/1530 train_loss:3.5145 train_time:163143ms step_avg:170.30ms step:969/1530 train_loss:3.5795 train_time:163318ms step_avg:170.30ms step:970/1530 train_loss:3.3739 train_time:163491ms step_avg:170.30ms step:971/1530 train_loss:3.5274 train_time:163664ms step_avg:170.31ms step:972/1530 train_loss:3.4739 train_time:163838ms step_avg:170.31ms step:973/1530 train_loss:3.5345 train_time:164012ms step_avg:170.31ms step:974/1530 train_loss:3.5833 train_time:164189ms step_avg:170.32ms step:975/1530 train_loss:3.4639 train_time:164364ms step_avg:170.33ms step:976/1530 train_loss:3.6712 train_time:164538ms step_avg:170.33ms step:977/1530 train_loss:3.5671 train_time:164713ms step_avg:170.33ms step:978/1530 train_loss:3.3538 train_time:164887ms step_avg:170.34ms step:979/1530 train_loss:3.6211 train_time:165062ms step_avg:170.34ms step:980/1530 train_loss:3.4147 train_time:165241ms step_avg:170.35ms step:981/1530 train_loss:3.5728 train_time:165419ms step_avg:170.36ms step:982/1530 train_loss:3.5419 train_time:165594ms step_avg:170.36ms step:983/1530 train_loss:3.5156 train_time:165770ms step_avg:170.37ms step:984/1530 train_loss:3.4915 train_time:165945ms step_avg:170.37ms step:985/1530 train_loss:3.5720 train_time:166122ms step_avg:170.38ms step:986/1530 train_loss:3.4113 train_time:166298ms step_avg:170.39ms step:987/1530 train_loss:3.4803 train_time:166472ms step_avg:170.39ms step:988/1530 train_loss:3.4803 train_time:166645ms step_avg:170.39ms step:989/1530 train_loss:3.4088 train_time:166819ms step_avg:170.40ms step:990/1530 train_loss:3.6591 train_time:166996ms step_avg:170.40ms step:991/1530 train_loss:3.4655 train_time:167170ms step_avg:170.41ms step:992/1530 train_loss:3.4387 train_time:167349ms step_avg:170.42ms step:993/1530 train_loss:3.4952 train_time:167531ms step_avg:170.43ms step:994/1530 train_loss:3.5958 train_time:167703ms step_avg:170.43ms step:995/1530 train_loss:3.5273 train_time:167876ms step_avg:170.43ms step:996/1530 train_loss:3.4537 train_time:168048ms step_avg:170.43ms step:997/1530 train_loss:3.7535 train_time:168224ms step_avg:170.44ms step:998/1530 train_loss:3.4346 train_time:168397ms step_avg:170.44ms step:999/1530 train_loss:3.5861 train_time:168572ms step_avg:170.45ms step:1000/1530 train_loss:3.4336 train_time:168749ms step_avg:170.45ms step:1000/1530 val_loss:3.4634 train_time:168801ms step_avg:170.51ms step:1001/1530 train_loss:3.4948 train_time:168927ms step_avg:170.46ms step:1002/1530 train_loss:3.3709 train_time:169100ms step_avg:170.46ms step:1003/1530 train_loss:3.5511 train_time:169276ms step_avg:170.47ms step:1004/1530 train_loss:3.6020 train_time:169453ms step_avg:170.48ms step:1005/1530 train_loss:3.3831 train_time:169628ms step_avg:170.48ms step:1006/1530 train_loss:3.4625 train_time:169804ms step_avg:170.49ms step:1007/1530 train_loss:3.4353 train_time:169978ms step_avg:170.49ms step:1008/1530 train_loss:3.5558 train_time:170154ms step_avg:170.50ms step:1009/1530 train_loss:3.6605 train_time:170331ms step_avg:170.50ms step:1010/1530 train_loss:3.5576 train_time:170505ms step_avg:170.51ms step:1011/1530 train_loss:3.5293 train_time:170677ms step_avg:170.51ms step:1012/1530 train_loss:3.3897 train_time:170853ms step_avg:170.51ms step:1013/1530 train_loss:3.5311 train_time:171028ms step_avg:170.52ms step:1014/1530 train_loss:3.6157 train_time:171203ms step_avg:170.52ms step:1015/1530 train_loss:3.3229 train_time:171379ms step_avg:170.53ms step:1016/1530 train_loss:3.4041 train_time:171554ms step_avg:170.53ms step:1017/1530 train_loss:3.3906 train_time:171731ms step_avg:170.54ms step:1018/1530 train_loss:3.3903 train_time:171907ms step_avg:170.54ms step:1019/1530 train_loss:3.5159 train_time:172081ms step_avg:170.55ms step:1020/1530 train_loss:3.3771 train_time:172259ms step_avg:170.55ms step:1021/1530 train_loss:3.3504 train_time:172433ms step_avg:170.56ms step:1022/1530 train_loss:3.4717 train_time:172609ms step_avg:170.56ms step:1023/1530 train_loss:3.4995 train_time:172784ms step_avg:170.57ms step:1024/1530 train_loss:3.4691 train_time:172961ms step_avg:170.57ms step:1025/1530 train_loss:3.4762 train_time:173139ms step_avg:170.58ms step:1026/1530 train_loss:3.6121 train_time:173315ms step_avg:170.59ms step:1027/1530 train_loss:3.3171 train_time:173492ms step_avg:170.59ms step:1028/1530 train_loss:3.3931 train_time:173672ms step_avg:170.60ms step:1029/1530 train_loss:3.3069 train_time:173852ms step_avg:170.61ms step:1030/1530 train_loss:3.5368 train_time:174029ms step_avg:170.62ms step:1031/1530 train_loss:3.5029 train_time:174204ms step_avg:170.62ms step:1032/1530 train_loss:3.6916 train_time:174383ms step_avg:170.63ms step:1033/1530 train_loss:3.4908 train_time:174560ms step_avg:170.64ms step:1034/1530 train_loss:3.3949 train_time:174736ms step_avg:170.64ms step:1035/1530 train_loss:3.4357 train_time:174914ms step_avg:170.65ms step:1036/1530 train_loss:3.4760 train_time:175090ms step_avg:170.65ms step:1037/1530 train_loss:3.7862 train_time:175269ms step_avg:170.66ms step:1038/1530 train_loss:3.6134 train_time:175444ms step_avg:170.67ms step:1039/1530 train_loss:3.5077 train_time:175627ms step_avg:170.68ms step:1040/1530 train_loss:3.4122 train_time:175802ms step_avg:170.68ms step:1041/1530 train_loss:3.4830 train_time:175979ms step_avg:170.69ms step:1042/1530 train_loss:3.5175 train_time:176152ms step_avg:170.69ms step:1043/1530 train_loss:3.4415 train_time:176327ms step_avg:170.69ms step:1044/1530 train_loss:3.4548 train_time:176503ms step_avg:170.70ms step:1045/1530 train_loss:3.5092 train_time:176680ms step_avg:170.71ms step:1046/1530 train_loss:3.4241 train_time:176856ms step_avg:170.71ms step:1047/1530 train_loss:3.6337 train_time:177034ms step_avg:170.72ms step:1048/1530 train_loss:3.4896 train_time:177210ms step_avg:170.72ms step:1049/1530 train_loss:3.4016 train_time:177384ms step_avg:170.73ms step:1050/1530 train_loss:3.3909 train_time:177561ms step_avg:170.73ms step:1051/1530 train_loss:3.4936 train_time:177738ms step_avg:170.74ms step:1052/1530 train_loss:3.3587 train_time:177915ms step_avg:170.74ms step:1053/1530 train_loss:3.6814 train_time:178093ms step_avg:170.75ms step:1054/1530 train_loss:3.5275 train_time:178273ms step_avg:170.76ms step:1055/1530 train_loss:3.3806 train_time:178448ms step_avg:170.76ms step:1056/1530 train_loss:3.4969 train_time:178622ms step_avg:170.77ms step:1057/1530 train_loss:3.5727 train_time:178799ms step_avg:170.77ms step:1058/1530 train_loss:3.3013 train_time:178977ms step_avg:170.78ms step:1059/1530 train_loss:3.3677 train_time:179159ms step_avg:170.79ms step:1060/1530 train_loss:3.4316 train_time:179335ms step_avg:170.80ms step:1061/1530 train_loss:3.4137 train_time:179510ms step_avg:170.80ms step:1062/1530 train_loss:3.3792 train_time:179687ms step_avg:170.81ms step:1063/1530 train_loss:3.4569 train_time:179862ms step_avg:170.81ms step:1064/1530 train_loss:3.3776 train_time:180035ms step_avg:170.81ms step:1065/1530 train_loss:3.3556 train_time:180212ms step_avg:170.82ms step:1066/1530 train_loss:3.4087 train_time:180387ms step_avg:170.82ms step:1067/1530 train_loss:3.2797 train_time:180566ms step_avg:170.83ms step:1068/1530 train_loss:3.4323 train_time:180742ms step_avg:170.83ms step:1069/1530 train_loss:3.2890 train_time:180922ms step_avg:170.84ms step:1070/1530 train_loss:3.5623 train_time:181097ms step_avg:170.85ms step:1071/1530 train_loss:3.5060 train_time:181276ms step_avg:170.85ms step:1072/1530 train_loss:3.4382 train_time:181451ms step_avg:170.86ms step:1073/1530 train_loss:3.5174 train_time:181625ms step_avg:170.86ms step:1074/1530 train_loss:3.4274 train_time:181801ms step_avg:170.87ms step:1075/1530 train_loss:3.3963 train_time:181976ms step_avg:170.87ms step:1076/1530 train_loss:3.7938 train_time:182152ms step_avg:170.87ms step:1077/1530 train_loss:3.4301 train_time:182328ms step_avg:170.88ms step:1078/1530 train_loss:3.0978 train_time:182515ms step_avg:170.89ms step:1079/1530 train_loss:3.5271 train_time:182692ms step_avg:170.90ms step:1080/1530 train_loss:3.4221 train_time:182870ms step_avg:170.91ms step:1081/1530 train_loss:3.4933 train_time:183045ms step_avg:170.91ms step:1082/1530 train_loss:3.5878 train_time:183219ms step_avg:170.91ms step:1083/1530 train_loss:3.4899 train_time:183395ms step_avg:170.92ms step:1084/1530 train_loss:3.4637 train_time:183571ms step_avg:170.92ms step:1085/1530 train_loss:3.4267 train_time:183747ms step_avg:170.93ms step:1086/1530 train_loss:3.6222 train_time:183920ms step_avg:170.93ms step:1087/1530 train_loss:3.4977 train_time:184095ms step_avg:170.93ms step:1088/1530 train_loss:3.3679 train_time:184273ms step_avg:170.94ms step:1089/1530 train_loss:3.3728 train_time:184453ms step_avg:170.95ms step:1090/1530 train_loss:3.4731 train_time:184633ms step_avg:170.96ms step:1091/1530 train_loss:3.2792 train_time:184808ms step_avg:170.96ms step:1092/1530 train_loss:3.4781 train_time:184983ms step_avg:170.96ms step:1093/1530 train_loss:3.5992 train_time:185159ms step_avg:170.97ms step:1094/1530 train_loss:3.4446 train_time:185334ms step_avg:170.97ms step:1095/1530 train_loss:3.4166 train_time:185509ms step_avg:170.98ms step:1096/1530 train_loss:3.4193 train_time:185686ms step_avg:170.98ms step:1097/1530 train_loss:3.4873 train_time:185863ms step_avg:170.99ms step:1098/1530 train_loss:3.5628 train_time:186041ms step_avg:170.99ms step:1099/1530 train_loss:3.5259 train_time:186216ms step_avg:171.00ms step:1100/1530 train_loss:3.4206 train_time:186397ms step_avg:171.01ms step:1101/1530 train_loss:3.2868 train_time:186574ms step_avg:171.01ms step:1102/1530 train_loss:3.3129 train_time:186753ms step_avg:171.02ms step:1103/1530 train_loss:3.4429 train_time:186934ms step_avg:171.03ms step:1104/1530 train_loss:3.3135 train_time:187109ms step_avg:171.03ms step:1105/1530 train_loss:4.0541 train_time:187287ms step_avg:171.04ms step:1106/1530 train_loss:3.2262 train_time:187462ms step_avg:171.04ms step:1107/1530 train_loss:3.5649 train_time:187636ms step_avg:171.04ms step:1108/1530 train_loss:3.3432 train_time:187809ms step_avg:171.05ms step:1109/1530 train_loss:3.5008 train_time:187985ms step_avg:171.05ms step:1110/1530 train_loss:3.4216 train_time:188158ms step_avg:171.05ms step:1111/1530 train_loss:3.4817 train_time:188333ms step_avg:171.06ms step:1112/1530 train_loss:3.5540 train_time:188512ms step_avg:171.06ms step:1113/1530 train_loss:3.4253 train_time:188695ms step_avg:171.07ms step:1114/1530 train_loss:3.3743 train_time:188875ms step_avg:171.08ms step:1115/1530 train_loss:3.2403 train_time:189054ms step_avg:171.09ms step:1116/1530 train_loss:3.4267 train_time:189227ms step_avg:171.09ms step:1117/1530 train_loss:3.5879 train_time:189405ms step_avg:171.10ms step:1118/1530 train_loss:3.6177 train_time:189583ms step_avg:171.10ms step:1119/1530 train_loss:3.4731 train_time:189757ms step_avg:171.11ms step:1120/1530 train_loss:3.4814 train_time:189934ms step_avg:171.11ms step:1121/1530 train_loss:3.3851 train_time:190110ms step_avg:171.12ms step:1122/1530 train_loss:3.4518 train_time:190284ms step_avg:171.12ms step:1123/1530 train_loss:3.5763 train_time:190461ms step_avg:171.12ms step:1124/1530 train_loss:3.3317 train_time:190636ms step_avg:171.13ms step:1125/1530 train_loss:3.2329 train_time:190812ms step_avg:171.13ms step:1125/1530 val_loss:3.4051 train_time:190862ms step_avg:171.18ms step:1126/1530 train_loss:3.4723 train_time:190988ms step_avg:171.14ms step:1127/1530 train_loss:3.6654 train_time:191167ms step_avg:171.14ms step:1128/1530 train_loss:3.2239 train_time:191345ms step_avg:171.15ms step:1129/1530 train_loss:3.5521 train_time:191525ms step_avg:171.16ms step:1130/1530 train_loss:3.3754 train_time:191703ms step_avg:171.16ms step:1131/1530 train_loss:3.3953 train_time:191885ms step_avg:171.17ms step:1132/1530 train_loss:3.3632 train_time:192059ms step_avg:171.18ms step:1133/1530 train_loss:3.4826 train_time:192367ms step_avg:171.30ms step:1134/1530 train_loss:3.4364 train_time:192554ms step_avg:171.31ms step:1135/1530 train_loss:3.5167 train_time:192730ms step_avg:171.32ms step:1136/1530 train_loss:3.5625 train_time:192908ms step_avg:171.32ms step:1137/1530 train_loss:3.4526 train_time:193085ms step_avg:171.33ms step:1138/1530 train_loss:3.3518 train_time:193264ms step_avg:171.33ms step:1139/1530 train_loss:3.6511 train_time:193590ms step_avg:171.47ms step:1140/1530 train_loss:3.4533 train_time:193766ms step_avg:171.47ms step:1141/1530 train_loss:3.5888 train_time:193946ms step_avg:171.48ms step:1142/1530 train_loss:3.4493 train_time:194123ms step_avg:171.49ms step:1143/1530 train_loss:3.3579 train_time:194302ms step_avg:171.49ms step:1144/1530 train_loss:3.4445 train_time:194480ms step_avg:171.50ms step:1145/1530 train_loss:3.5863 train_time:194654ms step_avg:171.50ms step:1146/1530 train_loss:3.5484 train_time:194834ms step_avg:171.51ms step:1147/1530 train_loss:3.4861 train_time:195013ms step_avg:171.52ms step:1148/1530 train_loss:3.4948 train_time:195191ms step_avg:171.52ms step:1149/1530 train_loss:3.3243 train_time:195373ms step_avg:171.53ms step:1150/1530 train_loss:3.3700 train_time:195549ms step_avg:171.53ms step:1151/1530 train_loss:3.3256 train_time:195729ms step_avg:171.54ms step:1152/1530 train_loss:3.3916 train_time:195911ms step_avg:171.55ms step:1153/1530 train_loss:3.4316 train_time:196091ms step_avg:171.56ms step:1154/1530 train_loss:3.5103 train_time:196267ms step_avg:171.56ms step:1155/1530 train_loss:3.3135 train_time:196450ms step_avg:171.57ms step:1156/1530 train_loss:3.5302 train_time:196633ms step_avg:171.58ms step:1157/1530 train_loss:3.4920 train_time:196810ms step_avg:171.59ms step:1158/1530 train_loss:3.2467 train_time:196987ms step_avg:171.59ms step:1159/1530 train_loss:3.3463 train_time:197164ms step_avg:171.60ms step:1160/1530 train_loss:3.3317 train_time:197339ms step_avg:171.60ms step:1161/1530 train_loss:3.0785 train_time:197519ms step_avg:171.61ms step:1162/1530 train_loss:3.4172 train_time:197696ms step_avg:171.61ms step:1163/1530 train_loss:3.3861 train_time:197877ms step_avg:171.62ms step:1164/1530 train_loss:3.2917 train_time:198057ms step_avg:171.63ms step:1165/1530 train_loss:3.2400 train_time:198232ms step_avg:171.63ms step:1166/1530 train_loss:3.3798 train_time:198413ms step_avg:171.64ms step:1167/1530 train_loss:3.4096 train_time:198589ms step_avg:171.64ms step:1168/1530 train_loss:3.7171 train_time:198765ms step_avg:171.65ms step:1169/1530 train_loss:3.3729 train_time:198942ms step_avg:171.65ms step:1170/1530 train_loss:3.3864 train_time:199119ms step_avg:171.65ms step:1171/1530 train_loss:3.3133 train_time:199294ms step_avg:171.66ms step:1172/1530 train_loss:3.4180 train_time:199469ms step_avg:171.66ms step:1173/1530 train_loss:3.5335 train_time:199650ms step_avg:171.67ms step:1174/1530 train_loss:3.3770 train_time:199836ms step_avg:171.68ms step:1175/1530 train_loss:3.3616 train_time:200016ms step_avg:171.69ms step:1176/1530 train_loss:3.4265 train_time:200197ms step_avg:171.70ms step:1177/1530 train_loss:3.4482 train_time:200379ms step_avg:171.70ms step:1178/1530 train_loss:3.4961 train_time:200557ms step_avg:171.71ms step:1179/1530 train_loss:3.3918 train_time:200732ms step_avg:171.71ms step:1180/1530 train_loss:3.3516 train_time:200920ms step_avg:171.73ms step:1181/1530 train_loss:3.3311 train_time:201098ms step_avg:171.73ms step:1182/1530 train_loss:3.3760 train_time:201276ms step_avg:171.74ms step:1183/1530 train_loss:3.3349 train_time:201452ms step_avg:171.74ms step:1184/1530 train_loss:3.5057 train_time:201628ms step_avg:171.74ms step:1185/1530 train_loss:3.5392 train_time:201809ms step_avg:171.75ms step:1186/1530 train_loss:3.3593 train_time:201989ms step_avg:171.76ms step:1187/1530 train_loss:3.4102 train_time:202177ms step_avg:171.77ms step:1188/1530 train_loss:3.4385 train_time:202352ms step_avg:171.78ms step:1189/1530 train_loss:3.2732 train_time:202532ms step_avg:171.78ms step:1190/1530 train_loss:3.4389 train_time:202708ms step_avg:171.79ms step:1191/1530 train_loss:3.5747 train_time:202889ms step_avg:171.79ms step:1192/1530 train_loss:3.3875 train_time:203064ms step_avg:171.80ms step:1193/1530 train_loss:3.2736 train_time:203240ms step_avg:171.80ms step:1194/1530 train_loss:3.5533 train_time:203417ms step_avg:171.80ms step:1195/1530 train_loss:3.3649 train_time:203598ms step_avg:171.81ms step:1196/1530 train_loss:3.3798 train_time:203784ms step_avg:171.82ms step:1197/1530 train_loss:3.2913 train_time:203964ms step_avg:171.83ms step:1198/1530 train_loss:3.3002 train_time:204147ms step_avg:171.84ms step:1199/1530 train_loss:3.3394 train_time:204327ms step_avg:171.85ms step:1200/1530 train_loss:3.4448 train_time:204504ms step_avg:171.85ms step:1201/1530 train_loss:3.4774 train_time:204682ms step_avg:171.86ms step:1202/1530 train_loss:3.5985 train_time:204870ms step_avg:171.87ms step:1203/1530 train_loss:3.3998 train_time:205051ms step_avg:171.88ms step:1204/1530 train_loss:3.3002 train_time:205230ms step_avg:171.88ms step:1205/1530 train_loss:3.4269 train_time:205407ms step_avg:171.89ms step:1206/1530 train_loss:3.4723 train_time:205582ms step_avg:171.89ms step:1207/1530 train_loss:3.5121 train_time:205761ms step_avg:171.90ms step:1208/1530 train_loss:3.3906 train_time:205937ms step_avg:171.90ms step:1209/1530 train_loss:3.2445 train_time:206118ms step_avg:171.91ms step:1210/1530 train_loss:3.2981 train_time:206296ms step_avg:171.91ms step:1211/1530 train_loss:3.3915 train_time:206475ms step_avg:171.92ms step:1212/1530 train_loss:3.3913 train_time:206652ms step_avg:171.92ms step:1213/1530 train_loss:3.4030 train_time:206831ms step_avg:171.93ms step:1214/1530 train_loss:3.2482 train_time:207013ms step_avg:171.94ms step:1215/1530 train_loss:3.3868 train_time:207188ms step_avg:171.94ms step:1216/1530 train_loss:3.3268 train_time:207367ms step_avg:171.95ms step:1217/1530 train_loss:3.3165 train_time:207543ms step_avg:171.95ms step:1218/1530 train_loss:3.4034 train_time:207721ms step_avg:171.95ms step:1219/1530 train_loss:3.2476 train_time:207905ms step_avg:171.96ms step:1220/1530 train_loss:3.4742 train_time:208081ms step_avg:171.97ms step:1221/1530 train_loss:3.5041 train_time:208257ms step_avg:171.97ms step:1222/1530 train_loss:3.4334 train_time:208434ms step_avg:171.97ms step:1223/1530 train_loss:3.2946 train_time:208611ms step_avg:171.98ms step:1224/1530 train_loss:3.2496 train_time:208792ms step_avg:171.99ms step:1225/1530 train_loss:3.3641 train_time:208971ms step_avg:171.99ms step:1226/1530 train_loss:3.3281 train_time:209149ms step_avg:172.00ms step:1227/1530 train_loss:3.2738 train_time:209328ms step_avg:172.00ms step:1228/1530 train_loss:3.4383 train_time:209503ms step_avg:172.01ms step:1229/1530 train_loss:3.3609 train_time:209684ms step_avg:172.01ms step:1230/1530 train_loss:3.3949 train_time:209867ms step_avg:172.02ms step:1231/1530 train_loss:3.5756 train_time:210047ms step_avg:172.03ms step:1232/1530 train_loss:3.4923 train_time:210227ms step_avg:172.04ms step:1233/1530 train_loss:3.4254 train_time:210404ms step_avg:172.04ms step:1234/1530 train_loss:3.5828 train_time:210582ms step_avg:172.04ms step:1235/1530 train_loss:3.3213 train_time:210762ms step_avg:172.05ms step:1236/1530 train_loss:3.2833 train_time:210940ms step_avg:172.06ms step:1237/1530 train_loss:3.2710 train_time:211117ms step_avg:172.06ms step:1238/1530 train_loss:3.2714 train_time:211300ms step_avg:172.07ms step:1239/1530 train_loss:3.3274 train_time:211480ms step_avg:172.07ms step:1240/1530 train_loss:3.3777 train_time:211656ms step_avg:172.08ms step:1241/1530 train_loss:3.4257 train_time:211834ms step_avg:172.08ms step:1242/1530 train_loss:3.2948 train_time:212011ms step_avg:172.09ms step:1243/1530 train_loss:3.3978 train_time:212190ms step_avg:172.09ms step:1244/1530 train_loss:3.4023 train_time:212363ms step_avg:172.09ms step:1245/1530 train_loss:3.4122 train_time:212540ms step_avg:172.10ms step:1246/1530 train_loss:3.2411 train_time:212719ms step_avg:172.10ms step:1247/1530 train_loss:3.3671 train_time:212895ms step_avg:172.11ms step:1248/1530 train_loss:3.4259 train_time:213071ms step_avg:172.11ms step:1249/1530 train_loss:3.4206 train_time:213251ms step_avg:172.12ms step:1250/1530 train_loss:3.3071 train_time:213430ms step_avg:172.12ms step:1250/1530 val_loss:3.3521 train_time:213485ms step_avg:172.17ms step:1251/1530 train_loss:3.4878 train_time:213618ms step_avg:172.13ms step:1252/1530 train_loss:3.3546 train_time:213793ms step_avg:172.14ms step:1253/1530 train_loss:3.3078 train_time:213972ms step_avg:172.14ms step:1254/1530 train_loss:3.4098 train_time:214151ms step_avg:172.15ms step:1255/1530 train_loss:3.5128 train_time:214343ms step_avg:172.16ms step:1256/1530 train_loss:3.3006 train_time:214526ms step_avg:172.17ms step:1257/1530 train_loss:3.3735 train_time:214703ms step_avg:172.18ms step:1258/1530 train_loss:3.3604 train_time:214887ms step_avg:172.18ms step:1259/1530 train_loss:3.3225 train_time:215065ms step_avg:172.19ms step:1260/1530 train_loss:3.2038 train_time:215241ms step_avg:172.19ms step:1261/1530 train_loss:3.3052 train_time:215420ms step_avg:172.20ms step:1262/1530 train_loss:3.3255 train_time:215602ms step_avg:172.21ms step:1263/1530 train_loss:3.2328 train_time:215783ms step_avg:172.21ms step:1264/1530 train_loss:3.4404 train_time:215959ms step_avg:172.22ms step:1265/1530 train_loss:3.4251 train_time:216135ms step_avg:172.22ms step:1266/1530 train_loss:3.4360 train_time:216314ms step_avg:172.22ms step:1267/1530 train_loss:3.3681 train_time:216496ms step_avg:172.23ms step:1268/1530 train_loss:3.4075 train_time:216676ms step_avg:172.24ms step:1269/1530 train_loss:3.2507 train_time:216860ms step_avg:172.25ms step:1270/1530 train_loss:3.1067 train_time:217037ms step_avg:172.25ms step:1271/1530 train_loss:3.4020 train_time:217216ms step_avg:172.26ms step:1272/1530 train_loss:3.3465 train_time:217391ms step_avg:172.26ms step:1273/1530 train_loss:3.3746 train_time:217574ms step_avg:172.27ms step:1274/1530 train_loss:3.3575 train_time:217754ms step_avg:172.27ms step:1275/1530 train_loss:3.4267 train_time:217931ms step_avg:172.28ms step:1276/1530 train_loss:3.4686 train_time:218105ms step_avg:172.28ms step:1277/1530 train_loss:3.4060 train_time:218285ms step_avg:172.29ms step:1278/1530 train_loss:3.4117 train_time:218462ms step_avg:172.29ms step:1279/1530 train_loss:3.2639 train_time:218643ms step_avg:172.30ms step:1280/1530 train_loss:3.3661 train_time:218829ms step_avg:172.31ms step:1281/1530 train_loss:3.4208 train_time:219007ms step_avg:172.31ms step:1282/1530 train_loss:3.4651 train_time:219181ms step_avg:172.31ms step:1283/1530 train_loss:3.3338 train_time:219360ms step_avg:172.32ms step:1284/1530 train_loss:3.3694 train_time:219538ms step_avg:172.32ms step:1285/1530 train_loss:3.3544 train_time:219717ms step_avg:172.33ms step:1286/1530 train_loss:3.3325 train_time:219895ms step_avg:172.33ms step:1287/1530 train_loss:3.4864 train_time:220074ms step_avg:172.34ms step:1288/1530 train_loss:3.2922 train_time:220256ms step_avg:172.34ms step:1289/1530 train_loss:3.3821 train_time:220442ms step_avg:172.35ms step:1290/1530 train_loss:3.4561 train_time:220627ms step_avg:172.36ms step:1291/1530 train_loss:3.3820 train_time:220804ms step_avg:172.37ms step:1292/1530 train_loss:3.4765 train_time:220988ms step_avg:172.38ms step:1293/1530 train_loss:3.5141 train_time:221169ms step_avg:172.38ms step:1294/1530 train_loss:3.4517 train_time:221350ms step_avg:172.39ms step:1295/1530 train_loss:3.2799 train_time:221529ms step_avg:172.40ms step:1296/1530 train_loss:3.3698 train_time:221707ms step_avg:172.40ms step:1297/1530 train_loss:3.2710 train_time:221888ms step_avg:172.41ms step:1298/1530 train_loss:3.2713 train_time:222072ms step_avg:172.42ms step:1299/1530 train_loss:3.3942 train_time:222251ms step_avg:172.42ms step:1300/1530 train_loss:3.3988 train_time:222427ms step_avg:172.42ms step:1301/1530 train_loss:3.4020 train_time:222605ms step_avg:172.43ms step:1302/1530 train_loss:3.5745 train_time:222789ms step_avg:172.44ms step:1303/1530 train_loss:3.3015 train_time:222971ms step_avg:172.45ms step:1304/1530 train_loss:3.5111 train_time:223153ms step_avg:172.45ms step:1305/1530 train_loss:3.2577 train_time:223330ms step_avg:172.46ms step:1306/1530 train_loss:3.4497 train_time:223511ms step_avg:172.46ms step:1307/1530 train_loss:3.4516 train_time:223687ms step_avg:172.46ms step:1308/1530 train_loss:3.2805 train_time:223865ms step_avg:172.47ms step:1309/1530 train_loss:3.3036 train_time:224044ms step_avg:172.47ms step:1310/1530 train_loss:3.2838 train_time:224221ms step_avg:172.48ms step:1311/1530 train_loss:3.2926 train_time:224400ms step_avg:172.48ms step:1312/1530 train_loss:3.3746 train_time:224580ms step_avg:172.49ms step:1313/1530 train_loss:3.3395 train_time:224756ms step_avg:172.49ms step:1314/1530 train_loss:3.0431 train_time:224940ms step_avg:172.50ms step:1315/1530 train_loss:3.2719 train_time:225116ms step_avg:172.50ms step:1316/1530 train_loss:3.3948 train_time:225291ms step_avg:172.50ms step:1317/1530 train_loss:3.4189 train_time:225469ms step_avg:172.51ms step:1318/1530 train_loss:3.2978 train_time:225656ms step_avg:172.52ms step:1319/1530 train_loss:3.4268 train_time:225836ms step_avg:172.53ms step:1320/1530 train_loss:3.4562 train_time:226018ms step_avg:172.53ms step:1321/1530 train_loss:3.3611 train_time:226196ms step_avg:172.54ms step:1322/1530 train_loss:3.3236 train_time:226507ms step_avg:172.64ms step:1323/1530 train_loss:3.3206 train_time:226695ms step_avg:172.65ms step:1324/1530 train_loss:3.4269 train_time:226875ms step_avg:172.66ms step:1325/1530 train_loss:3.4906 train_time:227059ms step_avg:172.67ms step:1326/1530 train_loss:3.2123 train_time:227240ms step_avg:172.67ms step:1327/1530 train_loss:3.1629 train_time:227416ms step_avg:172.68ms step:1328/1530 train_loss:3.4880 train_time:227594ms step_avg:172.68ms step:1329/1530 train_loss:3.2912 train_time:227932ms step_avg:172.81ms step:1330/1530 train_loss:3.4268 train_time:228113ms step_avg:172.81ms step:1331/1530 train_loss:3.3274 train_time:228288ms step_avg:172.81ms step:1332/1530 train_loss:3.7395 train_time:228471ms step_avg:172.82ms step:1333/1530 train_loss:3.4764 train_time:228651ms step_avg:172.83ms step:1334/1530 train_loss:3.3688 train_time:228829ms step_avg:172.83ms step:1335/1530 train_loss:3.2902 train_time:229006ms step_avg:172.83ms step:1336/1530 train_loss:3.2936 train_time:229190ms step_avg:172.84ms step:1337/1530 train_loss:3.5495 train_time:229370ms step_avg:172.85ms step:1338/1530 train_loss:3.5232 train_time:229546ms step_avg:172.85ms step:1339/1530 train_loss:3.3325 train_time:229727ms step_avg:172.86ms step:1340/1530 train_loss:3.2843 train_time:229904ms step_avg:172.86ms step:1341/1530 train_loss:3.5888 train_time:230080ms step_avg:172.86ms step:1342/1530 train_loss:3.3575 train_time:230261ms step_avg:172.87ms step:1343/1530 train_loss:3.3628 train_time:230439ms step_avg:172.87ms step:1344/1530 train_loss:3.4120 train_time:230619ms step_avg:172.88ms step:1345/1530 train_loss:3.3814 train_time:230801ms step_avg:172.88ms step:1346/1530 train_loss:3.2912 train_time:230979ms step_avg:172.89ms step:1347/1530 train_loss:3.2742 train_time:231157ms step_avg:172.89ms step:1348/1530 train_loss:3.3469 train_time:231335ms step_avg:172.90ms step:1349/1530 train_loss:3.2699 train_time:231511ms step_avg:172.90ms step:1350/1530 train_loss:3.3898 train_time:231691ms step_avg:172.90ms step:1351/1530 train_loss:3.2365 train_time:231867ms step_avg:172.91ms step:1352/1530 train_loss:3.3059 train_time:232046ms step_avg:172.91ms step:1353/1530 train_loss:3.3991 train_time:232225ms step_avg:172.92ms step:1354/1530 train_loss:3.2605 train_time:232404ms step_avg:172.92ms step:1355/1530 train_loss:3.1879 train_time:232581ms step_avg:172.92ms step:1356/1530 train_loss:3.5076 train_time:232761ms step_avg:172.93ms step:1357/1530 train_loss:3.4240 train_time:232942ms step_avg:172.93ms step:1358/1530 train_loss:3.1811 train_time:233119ms step_avg:172.94ms step:1359/1530 train_loss:3.4395 train_time:233299ms step_avg:172.94ms step:1360/1530 train_loss:3.3507 train_time:233479ms step_avg:172.95ms step:1361/1530 train_loss:3.1226 train_time:233664ms step_avg:172.96ms step:1362/1530 train_loss:3.3901 train_time:233845ms step_avg:172.96ms step:1363/1530 train_loss:3.2823 train_time:234031ms step_avg:172.97ms step:1364/1530 train_loss:3.3012 train_time:234209ms step_avg:172.98ms step:1365/1530 train_loss:3.3131 train_time:234386ms step_avg:172.98ms step:1366/1530 train_loss:3.4207 train_time:234568ms step_avg:172.99ms step:1367/1530 train_loss:3.3946 train_time:234746ms step_avg:172.99ms step:1368/1530 train_loss:3.3415 train_time:234927ms step_avg:172.99ms step:1369/1530 train_loss:3.2725 train_time:235114ms step_avg:173.01ms step:1370/1530 train_loss:3.6049 train_time:235295ms step_avg:173.01ms step:1371/1530 train_loss:3.3128 train_time:235477ms step_avg:173.02ms step:1372/1530 train_loss:3.3639 train_time:235661ms step_avg:173.03ms step:1373/1530 train_loss:3.3660 train_time:235840ms step_avg:173.03ms step:1374/1530 train_loss:3.1476 train_time:236022ms step_avg:173.04ms step:1375/1530 train_loss:3.5307 train_time:236200ms step_avg:173.04ms step:1375/1530 val_loss:3.3094 train_time:236252ms step_avg:173.08ms step:1376/1530 train_loss:3.3425 train_time:236381ms step_avg:173.05ms step:1377/1530 train_loss:3.4810 train_time:236562ms step_avg:173.05ms step:1378/1530 train_loss:3.4662 train_time:236738ms step_avg:173.05ms step:1379/1530 train_loss:3.1127 train_time:236918ms step_avg:173.06ms step:1380/1530 train_loss:3.3131 train_time:237098ms step_avg:173.06ms step:1381/1530 train_loss:3.6864 train_time:237283ms step_avg:173.07ms step:1382/1530 train_loss:3.2098 train_time:237463ms step_avg:173.08ms step:1383/1530 train_loss:3.3915 train_time:237644ms step_avg:173.08ms step:1384/1530 train_loss:3.4756 train_time:237827ms step_avg:173.09ms step:1385/1530 train_loss:3.4062 train_time:238002ms step_avg:173.09ms step:1386/1530 train_loss:3.3357 train_time:238179ms step_avg:173.10ms step:1387/1530 train_loss:3.1965 train_time:238359ms step_avg:173.10ms step:1388/1530 train_loss:3.3411 train_time:238535ms step_avg:173.10ms step:1389/1530 train_loss:3.3153 train_time:238717ms step_avg:173.11ms step:1390/1530 train_loss:3.5650 train_time:238894ms step_avg:173.11ms step:1391/1530 train_loss:3.2883 train_time:239073ms step_avg:173.12ms step:1392/1530 train_loss:3.2810 train_time:239252ms step_avg:173.12ms step:1393/1530 train_loss:3.2368 train_time:239433ms step_avg:173.13ms step:1394/1530 train_loss:3.4950 train_time:239609ms step_avg:173.13ms step:1395/1530 train_loss:3.3894 train_time:239788ms step_avg:173.13ms step:1396/1530 train_loss:3.4062 train_time:239967ms step_avg:173.14ms step:1397/1530 train_loss:3.3049 train_time:240144ms step_avg:173.14ms step:1398/1530 train_loss:3.2540 train_time:240320ms step_avg:173.14ms step:1399/1530 train_loss:3.3139 train_time:240498ms step_avg:173.14ms step:1400/1530 train_loss:3.3178 train_time:240682ms step_avg:173.15ms step:1401/1530 train_loss:3.3483 train_time:240857ms step_avg:173.15ms step:1402/1530 train_loss:3.2960 train_time:241035ms step_avg:173.16ms step:1403/1530 train_loss:3.4929 train_time:241220ms step_avg:173.17ms step:1404/1530 train_loss:3.2794 train_time:241397ms step_avg:173.17ms step:1405/1530 train_loss:3.3109 train_time:241580ms step_avg:173.18ms step:1406/1530 train_loss:3.3113 train_time:241763ms step_avg:173.18ms step:1407/1530 train_loss:3.1724 train_time:241939ms step_avg:173.18ms step:1408/1530 train_loss:3.3118 train_time:242119ms step_avg:173.19ms step:1409/1530 train_loss:3.3010 train_time:242306ms step_avg:173.20ms step:1410/1530 train_loss:3.2863 train_time:242482ms step_avg:173.20ms step:1411/1530 train_loss:3.3613 train_time:242657ms step_avg:173.20ms step:1412/1530 train_loss:3.3350 train_time:242835ms step_avg:173.21ms step:1413/1530 train_loss:3.3584 train_time:243013ms step_avg:173.21ms step:1414/1530 train_loss:3.3245 train_time:243194ms step_avg:173.21ms step:1415/1530 train_loss:3.4040 train_time:243379ms step_avg:173.22ms step:1416/1530 train_loss:3.2259 train_time:243568ms step_avg:173.23ms step:1417/1530 train_loss:3.2751 train_time:243751ms step_avg:173.24ms step:1418/1530 train_loss:3.3882 train_time:243931ms step_avg:173.25ms step:1419/1530 train_loss:3.3444 train_time:244114ms step_avg:173.25ms step:1420/1530 train_loss:3.3662 train_time:244294ms step_avg:173.26ms step:1421/1530 train_loss:3.3674 train_time:244473ms step_avg:173.26ms step:1422/1530 train_loss:3.3302 train_time:244652ms step_avg:173.27ms step:1423/1530 train_loss:3.3117 train_time:244830ms step_avg:173.27ms step:1424/1530 train_loss:3.3358 train_time:245015ms step_avg:173.28ms step:1425/1530 train_loss:3.1830 train_time:245203ms step_avg:173.29ms step:1426/1530 train_loss:3.3234 train_time:245381ms step_avg:173.29ms step:1427/1530 train_loss:3.2825 train_time:245565ms step_avg:173.30ms step:1428/1530 train_loss:3.3727 train_time:245741ms step_avg:173.30ms step:1429/1530 train_loss:3.3519 train_time:245917ms step_avg:173.30ms step:1430/1530 train_loss:3.2568 train_time:246097ms step_avg:173.31ms step:1431/1530 train_loss:3.3193 train_time:246279ms step_avg:173.31ms step:1432/1530 train_loss:3.3367 train_time:246461ms step_avg:173.32ms step:1433/1530 train_loss:3.1320 train_time:246644ms step_avg:173.33ms step:1434/1530 train_loss:3.2870 train_time:246829ms step_avg:173.33ms step:1435/1530 train_loss:3.1138 train_time:247009ms step_avg:173.34ms step:1436/1530 train_loss:3.2251 train_time:247188ms step_avg:173.34ms step:1437/1530 train_loss:3.4058 train_time:247366ms step_avg:173.35ms step:1438/1530 train_loss:3.3773 train_time:247542ms step_avg:173.35ms step:1439/1530 train_loss:3.3115 train_time:247721ms step_avg:173.35ms step:1440/1530 train_loss:3.1907 train_time:247896ms step_avg:173.35ms step:1441/1530 train_loss:3.3377 train_time:248076ms step_avg:173.36ms step:1442/1530 train_loss:3.3819 train_time:248261ms step_avg:173.37ms step:1443/1530 train_loss:3.4889 train_time:248450ms step_avg:173.38ms step:1444/1530 train_loss:3.4477 train_time:248627ms step_avg:173.38ms step:1445/1530 train_loss:3.3368 train_time:248804ms step_avg:173.38ms step:1446/1530 train_loss:3.1969 train_time:248984ms step_avg:173.39ms step:1447/1530 train_loss:3.2939 train_time:249165ms step_avg:173.39ms step:1448/1530 train_loss:3.2972 train_time:249344ms step_avg:173.40ms step:1449/1530 train_loss:3.3912 train_time:249522ms step_avg:173.40ms step:1450/1530 train_loss:3.3839 train_time:249702ms step_avg:173.40ms step:1451/1530 train_loss:3.2025 train_time:249880ms step_avg:173.41ms step:1452/1530 train_loss:3.3260 train_time:250061ms step_avg:173.41ms step:1453/1530 train_loss:3.2575 train_time:250236ms step_avg:173.41ms step:1454/1530 train_loss:3.2870 train_time:250414ms step_avg:173.42ms step:1455/1530 train_loss:3.3270 train_time:250596ms step_avg:173.42ms step:1456/1530 train_loss:3.2811 train_time:250773ms step_avg:173.43ms step:1457/1530 train_loss:3.1555 train_time:250952ms step_avg:173.43ms step:1458/1530 train_loss:3.4204 train_time:251132ms step_avg:173.43ms step:1459/1530 train_loss:3.2711 train_time:251314ms step_avg:173.44ms step:1460/1530 train_loss:3.3121 train_time:251494ms step_avg:173.44ms step:1461/1530 train_loss:3.4263 train_time:251675ms step_avg:173.45ms step:1462/1530 train_loss:3.2548 train_time:251852ms step_avg:173.45ms step:1463/1530 train_loss:3.4665 train_time:252034ms step_avg:173.46ms step:1464/1530 train_loss:3.3570 train_time:252213ms step_avg:173.46ms step:1465/1530 train_loss:3.3562 train_time:252394ms step_avg:173.47ms step:1466/1530 train_loss:3.2810 train_time:252572ms step_avg:173.47ms step:1467/1530 train_loss:3.3899 train_time:252752ms step_avg:173.47ms step:1468/1530 train_loss:3.2833 train_time:252930ms step_avg:173.48ms step:1469/1530 train_loss:3.2733 train_time:253111ms step_avg:173.48ms step:1470/1530 train_loss:3.3326 train_time:253293ms step_avg:173.49ms step:1471/1530 train_loss:3.2578 train_time:253480ms step_avg:173.50ms step:1472/1530 train_loss:3.2451 train_time:253665ms step_avg:173.51ms step:1473/1530 train_loss:3.4346 train_time:253843ms step_avg:173.51ms step:1474/1530 train_loss:3.3099 train_time:254026ms step_avg:173.52ms step:1475/1530 train_loss:3.1480 train_time:254210ms step_avg:173.52ms step:1476/1530 train_loss:3.2654 train_time:254389ms step_avg:173.53ms step:1477/1530 train_loss:3.2349 train_time:254575ms step_avg:173.53ms step:1478/1530 train_loss:3.3071 train_time:254761ms step_avg:173.54ms step:1479/1530 train_loss:3.3933 train_time:254945ms step_avg:173.55ms step:1480/1530 train_loss:3.2679 train_time:255121ms step_avg:173.55ms step:1481/1530 train_loss:3.4497 train_time:255305ms step_avg:173.56ms step:1482/1530 train_loss:3.3622 train_time:255493ms step_avg:173.57ms step:1483/1530 train_loss:3.2774 train_time:255683ms step_avg:173.58ms step:1484/1530 train_loss:3.2589 train_time:255870ms step_avg:173.59ms step:1485/1530 train_loss:3.2744 train_time:256050ms step_avg:173.59ms step:1486/1530 train_loss:3.2269 train_time:256235ms step_avg:173.60ms step:1487/1530 train_loss:3.3423 train_time:256416ms step_avg:173.61ms step:1488/1530 train_loss:3.2431 train_time:256601ms step_avg:173.61ms step:1489/1530 train_loss:3.3144 train_time:256782ms step_avg:173.62ms step:1490/1530 train_loss:3.2569 train_time:256963ms step_avg:173.62ms step:1491/1530 train_loss:3.1584 train_time:257143ms step_avg:173.63ms step:1492/1530 train_loss:3.2657 train_time:257323ms step_avg:173.63ms step:1493/1530 train_loss:3.4325 train_time:257501ms step_avg:173.64ms step:1494/1530 train_loss:3.2960 train_time:257681ms step_avg:173.64ms step:1495/1530 train_loss:3.0294 train_time:257866ms step_avg:173.65ms step:1496/1530 train_loss:3.3625 train_time:258049ms step_avg:173.65ms step:1497/1530 train_loss:3.3140 train_time:258233ms step_avg:173.66ms step:1498/1530 train_loss:3.3430 train_time:258417ms step_avg:173.67ms step:1499/1530 train_loss:3.3144 train_time:258606ms step_avg:173.68ms step:1500/1530 train_loss:3.2931 train_time:258797ms step_avg:173.69ms step:1500/1530 val_loss:3.2781 train_time:258852ms step_avg:173.73ms step:1501/1530 train_loss:3.0889 train_time:258990ms step_avg:173.70ms step:1502/1530 train_loss:3.3608 train_time:259179ms step_avg:173.71ms step:1503/1530 train_loss:3.2422 train_time:259359ms step_avg:173.72ms step:1504/1530 train_loss:3.2427 train_time:259540ms step_avg:173.72ms step:1505/1530 train_loss:3.2111 train_time:259720ms step_avg:173.73ms step:1506/1530 train_loss:3.2794 train_time:259901ms step_avg:173.73ms step:1507/1530 train_loss:3.1791 train_time:260096ms step_avg:173.74ms step:1508/1530 train_loss:3.4773 train_time:260278ms step_avg:173.75ms step:1509/1530 train_loss:3.2795 train_time:260454ms step_avg:173.75ms step:1510/1530 train_loss:3.2729 train_time:260635ms step_avg:173.76ms step:1511/1530 train_loss:3.4130 train_time:260946ms step_avg:173.85ms step:1512/1530 train_loss:3.4185 train_time:261134ms step_avg:173.86ms step:1513/1530 train_loss:3.2661 train_time:261319ms step_avg:173.86ms step:1514/1530 train_loss:3.0797 train_time:261500ms step_avg:173.87ms step:1515/1530 train_loss:3.2411 train_time:261679ms step_avg:173.87ms step:1516/1530 train_loss:3.2525 train_time:261866ms step_avg:173.88ms step:1517/1530 train_loss:3.2983 train_time:262048ms step_avg:173.89ms step:1518/1530 train_loss:3.2053 train_time:262232ms step_avg:173.89ms step:1519/1530 train_loss:3.4956 train_time:262564ms step_avg:174.00ms step:1520/1530 train_loss:3.1261 train_time:262748ms step_avg:174.01ms step:1521/1530 train_loss:3.2057 train_time:262924ms step_avg:174.01ms step:1522/1530 train_loss:3.3513 train_time:263110ms step_avg:174.01ms step:1523/1530 train_loss:3.2288 train_time:263288ms step_avg:174.02ms step:1524/1530 train_loss:3.3465 train_time:263469ms step_avg:174.02ms step:1525/1530 train_loss:3.3361 train_time:263656ms step_avg:174.03ms step:1526/1530 train_loss:3.2737 train_time:263847ms step_avg:174.04ms step:1527/1530 train_loss:3.2886 train_time:264029ms step_avg:174.05ms step:1528/1530 train_loss:3.4048 train_time:264209ms step_avg:174.05ms step:1529/1530 train_loss:3.4059 train_time:264387ms step_avg:174.05ms step:1530/1530 train_loss:3.2362 train_time:264564ms step_avg:174.06ms step:1530/1530 val_loss:3.2756 train_time:264617ms step_avg:174.09ms