import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import glob
import time
import contextlib
from dataclasses import dataclass

import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import flex_attention, create_block_mask
flex_attention = torch.compile(flex_attention, dynamic=False)
create_block_mask = torch.compile(create_block_mask, dynamic=False)

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """Orthogonalize G via SVD: with G = U S V^T, return U @ V^T.

    `steps` is unused; it exists so both backends share one call signature.
    """
    U, S, V = G.svd()
    return U @ V.T

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: 2D tensor to orthogonalize. It is never modified.
        steps: number of Newton-Schulz iterations to run.
        eps: added to the norm to avoid dividing by zero.
    Returns a new bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # Out-of-place on purpose: `.bfloat16()` returns G itself when G is already bfloat16,
    # so an in-place `X /= ...` would silently rescale the caller's tensor.
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    transposed = G.size(0) > G.size(1)
    if transposed:
        # iterate on the wide orientation so that X @ X.T is the smaller Gram matrix
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if transposed:
        X = X.T
    return X

zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        super().__init__(params, defaults)

    def step(self):
        """Apply one update to every parameter, splitting the orthogonalization work across ranks.

        Each rank computes updates only for the parameters whose index satisfies
        `i % WORLD_SIZE == RANK`, writes them into a flat zero-initialized buffer, and a
        SUM all-reduce then gives every rank the full set of updates.
        """
        # the process topology is fixed for the whole run, so parse it once per step
        world_size = int(os.environ['WORLD_SIZE'])
        rank = int(os.environ['RANK'])

        for group in self.param_groups:

            lr = group['lr']
            momentum = group['momentum']
            zeropower_backend = zeropower_backends[group['backend']]

            # generate weight updates in distributed fashion
            total_params = sum(p.numel() for p in group['params'])
            updates_flat = torch.zeros(total_params, device='cuda', dtype=torch.bfloat16)
            curr_idx = 0
            for i, p in enumerate(group['params']):
                # luckily this will perfectly distribute a transformer with multiple of 4 layers to 8 GPUs
                if i % world_size == rank:
                    g = p.grad
                    assert g is not None
                    state = self.state[p]
                    if 'momentum_buffer' not in state:
                        state['momentum_buffer'] = torch.zeros_like(g)
                    buf = state['momentum_buffer']
                    buf.mul_(momentum).add_(g)
                    g = g.add(buf, alpha=momentum) if group['nesterov'] else buf
                    g = zeropower_backend(g, steps=group['backend_steps'])
                    g *= max(1, g.size(0)/g.size(1))**0.5
                    updates_flat[curr_idx:curr_idx+p.numel()] = g.flatten()
                # advance for every parameter (owned or not) so offsets agree on all ranks
                curr_idx += p.numel()

            # sync updates across devices. we are not memory-constrained so can do this simple deserialization
            dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM)

            # deserialize and apply updates
            curr_idx = 0
            for p in group['params']:
                g = updates_flat[curr_idx:curr_idx+p.numel()].view_as(p.data).type_as(p.data)
                p.data.add_(g, alpha=-lr)
                curr_idx += p.numel()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    """RMS-normalize x over its last dimension (no learnable scale)."""
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        return F.linear(x, self.weight.to(x.dtype))

class Rotary(torch.nn.Module):
    """Rotary position embedding; cos/sin tables are cached for the last sequence length seen."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x is indexed as (batch, seq, head, head_dim): dim 1 is the sequence, dim 3 is rotated
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.outer(t, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head attention (FlexAttention) with QK-norm, rotary embeddings and a value residual."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lamb = nn.Parameter(torch.tensor(0.5)) # @Grad62304977
        # rotary embeddings
        self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        """Attend over x; `vi` is mixed into the values with weight `lamb`, `block_mask` limits attention."""
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q = self.c_q(x).view(B, T, self.n_head, -1)
        k = self.c_k(x).view(B, T, self.n_head, -1)
        v = self.c_v(x).view(B, T, self.n_head, -1)
        v = (1 - self.lamb) * v + self.lamb * vi.view_as(v) # @Grad62304977
        q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""

    def __init__(self, dim):
        super().__init__()
        self.c_fc   = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    """Transformer block: learned mix of the stream with x0, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # lambdas[0] scales the incoming stream, lambdas[1] scales x0; initialized to (1, 0)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        x = x + self.attn(norm(x), vi, block_mask)
        x = x + self.mlp(norm(x))
        return x

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    vocab_size : int = 50304
    n_layer : int = 12
    n_head : int = 6 # head dim 128 suggested by @Grad62304977
    n_embd : int = 768

class GPT(nn.Module):
    """GPT model with U-net style skip connections and per-layer token value embeddings."""

    def __init__(self, config):
        super().__init__()

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # one n_embd-wide slice per layer (this was a hard-coded 12, which only matched n_layer == 12)
            vte = nn.Embedding(config.vocab_size, config.n_embd*config.n_layer),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx, target, attn_blocksize):
        """Return the cross-entropy loss of predicting `target` from the 1D token sequence `idx`.

        Attention is causal, restricted to keys in the same document as the query, and
        limited to a window of `attn_blocksize` positions.
        """
        # running count of token id 50256 gives each position a document index
        # (presumably 50256 is the GPT-2 end-of-text token -- confirm against the tokenizer)
        docs = (idx == 50256).cumsum(0)
        def document_causal_mask(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < attn_blocksize
            return causal_mask & document_mask & window_mask

        S = len(idx)
        block_mask = create_block_mask(document_causal_mask, None, None, S, S, device="cuda", _compile=True)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # one value-embedding slice per block
        n_layer = self.num_encoder_layers + self.num_decoder_layers
        vi = self.transformer.vte(idx[None]).chunk(n_layer, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers+i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(filename):
    """Read only the header of a .bin shard and return the token count it claims."""
    # only reads the header, returns header data
    with open(filename, "rb") as f:
        # first read the header, which is 256 int32 integers (4 bytes each)
        header = np.frombuffer(f.read(256*4), dtype=np.int32)
    if header[0] != 20240520:
        print("ERROR: magic number mismatch in the data .bin file!")
        print("---> HINT: Are you passing in a correct file with --input_bin?")
        print("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README")
        print("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try")
        # sys.exit rather than the `site`-provided exit(), which is not guaranteed to exist
        sys.exit(1)
    assert header[1] == 1, "unsupported version"
    ntok = header[2] # number of tokens (claimed)
    return ntok # for now just return the number of tokens

def _load_data_shard(filename):
    """Load a whole .bin shard and return its tokens as a uint16 numpy array."""
    with open(filename, "rb") as f:
        # first read the header, which is 256 int32 integers (4 bytes each)
        header = np.frombuffer(f.read(256*4), dtype=np.int32)
        assert header[0] == 20240520, "magic number mismatch in the data .bin file"
        assert header[1] == 1, "unsupported version"
        ntok = header[2] # number of tokens (claimed)
        # the rest of it are tokens, stored as uint16
        tokens = np.frombuffer(f.read(), dtype=np.uint16)
    assert len(tokens) == ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Serves length-T token windows from sharded .bin files; each rank reads its own stride."""

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(glob.glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # load and validate all data shards, count number of tokens in total
        ntok_total = 0
        for fname in self.files:
            shard_ntok = _peek_data_shard(fname)
            # every shard must hold at least one full step for all ranks (+1 for the shifted target)
            assert shard_ntok >= num_processes * T + 1
            ntok_total += int(shard_ntok)
        self.ntok_total = ntok_total

        self.reset()

    def reset(self):
        """Rewind to the start of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) on the GPU: T tokens and the same window shifted by one."""
        batch_size = self.T * self.num_processes
        buf = self.tokens[self.current_position:self.current_position+self.T+1]
        buf = torch.tensor(buf.astype(np.int32), dtype=torch.long)
        x = buf[:-1] # inputs
        y = buf[1:] # targets
        # advance current position and load next shard if necessary
        self.current_position += batch_size
        if self.current_position + batch_size >= len(self.tokens):
            self.advance()
        return x.cuda(), y.cuda()

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1530 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """On the master process, append s to the log file and, unless logonly, also print it."""
    if master_process:
        with open(logfile, "a") as f:
            if not logonly:
                print(s)
            f.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
assert args.val_tokens % (T * ddp_world_size) == 0 val_steps = args.val_tokens // (T * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files") print0('='*100, logonly=True) x, y = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = 
torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) # note that this learning rate is neither sensitive nor tuned optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.time() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.time() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the attention blocksize for the current step, in chunks of 64. 
By @fernbear.bsky.social attn_blocksize = torch.tensor(64*((step/args.num_iterations * (1792 - 64) + 64)//64), dtype=torch.int, device='cuda') # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.time() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, attn_blocksize=attn_blocksize) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.time() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.time() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.time() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. 
if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps+1): ctx = model.no_sync() if i < train_accumulation_steps else contextlib.nullcontext() with ctx: # there's no need to sync gradients every accumulation step # forward pass loss = model(x, y, attn_blocksize=attn_blocksize) # advance the dataset for the next batch x, y = train_loader.next_batch() # backward pass loss.backward() train_loss = loss.detach() for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) optimizer3.param_groups[0]['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. #dist.all_reduce(train_loss, op=dist.ReduceOp.AVG) # all-reducing the training loss would be more correct in terms of logging, but slower approx_time = training_time_ms + 1000 * (time.time() - t0) print0(f"step:{step+1}/{args.num_iterations} train_loss:{train_loss.item():.4f} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Thu Dec 5 02:38:29 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | 
|-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 75W / 700W | 3MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 30C P0 114W / 700W | 529MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 31C P0 118W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 37C P0 117W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 38C P0 122W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 29C P0 110W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 38C P0 127W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 118W / 700W | 529MiB / 
81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1100000000 across 11 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1530 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1530 train_loss:10.8258 train_time:31718ms step_avg:nanms step:2/1530 train_loss:10.0751 train_time:31831ms step_avg:nanms step:3/1530 train_loss:8.3533 train_time:31990ms step_avg:nanms step:4/1530 train_loss:7.5922 train_time:32150ms step_avg:nanms step:5/1530 train_loss:7.4689 train_time:32311ms step_avg:nanms step:6/1530 train_loss:6.9960 train_time:32470ms step_avg:nanms step:7/1530 train_loss:7.2182 train_time:32631ms step_avg:nanms step:8/1530 train_loss:6.7392 train_time:32793ms step_avg:nanms step:9/1530 train_loss:6.6227 train_time:32952ms step_avg:nanms step:10/1530 train_loss:6.5007 train_time:33113ms step_avg:nanms step:11/1530 train_loss:6.4443 train_time:114ms step_avg:nanms step:12/1530 train_loss:6.3307 train_time:276ms step_avg:nanms step:13/1530 train_loss:6.2878 train_time:436ms step_avg:145.43ms step:14/1530 train_loss:6.2192 train_time:596ms step_avg:148.98ms step:15/1530 train_loss:6.1847 train_time:758ms step_avg:151.54ms step:16/1530 train_loss:6.0960 train_time:919ms step_avg:153.15ms step:17/1530 train_loss:6.1850 train_time:1080ms step_avg:154.27ms step:18/1530 
train_loss:5.9535 train_time:1240ms step_avg:155.04ms step:19/1530 train_loss:6.0010 train_time:1401ms step_avg:155.67ms step:20/1530 train_loss:5.6853 train_time:1562ms step_avg:156.15ms step:21/1530 train_loss:5.9671 train_time:1721ms step_avg:156.46ms step:22/1530 train_loss:6.2000 train_time:1883ms step_avg:156.91ms step:23/1530 train_loss:5.8574 train_time:2043ms step_avg:157.16ms step:24/1530 train_loss:6.0250 train_time:2203ms step_avg:157.36ms step:25/1530 train_loss:5.6975 train_time:2364ms step_avg:157.60ms step:26/1530 train_loss:5.5942 train_time:2523ms step_avg:157.71ms step:27/1530 train_loss:5.7937 train_time:2684ms step_avg:157.86ms step:28/1530 train_loss:5.4106 train_time:2844ms step_avg:158.01ms step:29/1530 train_loss:5.6881 train_time:3005ms step_avg:158.16ms step:30/1530 train_loss:5.4846 train_time:3165ms step_avg:158.23ms step:31/1530 train_loss:5.4473 train_time:3324ms step_avg:158.30ms step:32/1530 train_loss:5.2847 train_time:3486ms step_avg:158.46ms step:33/1530 train_loss:5.5960 train_time:3646ms step_avg:158.53ms step:34/1530 train_loss:5.5088 train_time:3807ms step_avg:158.64ms step:35/1530 train_loss:5.6332 train_time:3968ms step_avg:158.72ms step:36/1530 train_loss:5.5565 train_time:4128ms step_avg:158.78ms step:37/1530 train_loss:5.4518 train_time:4289ms step_avg:158.85ms step:38/1530 train_loss:5.3158 train_time:4449ms step_avg:158.89ms step:39/1530 train_loss:5.3350 train_time:4609ms step_avg:158.94ms step:40/1530 train_loss:5.2476 train_time:4769ms step_avg:158.98ms step:41/1530 train_loss:5.2323 train_time:4929ms step_avg:159.01ms step:42/1530 train_loss:5.1788 train_time:5089ms step_avg:159.04ms step:43/1530 train_loss:5.2876 train_time:5250ms step_avg:159.08ms step:44/1530 train_loss:5.2320 train_time:5410ms step_avg:159.11ms step:45/1530 train_loss:5.3832 train_time:5570ms step_avg:159.14ms step:46/1530 train_loss:5.1663 train_time:5730ms step_avg:159.16ms step:47/1530 train_loss:5.0681 train_time:5890ms step_avg:159.19ms 
step:48/1530 train_loss:5.1953 train_time:6049ms step_avg:159.20ms step:49/1530 train_loss:5.1351 train_time:6209ms step_avg:159.21ms step:50/1530 train_loss:5.2604 train_time:6370ms step_avg:159.26ms step:51/1530 train_loss:5.1484 train_time:6530ms step_avg:159.26ms step:52/1530 train_loss:5.0430 train_time:6689ms step_avg:159.27ms step:53/1530 train_loss:5.1787 train_time:6850ms step_avg:159.30ms step:54/1530 train_loss:5.0100 train_time:7010ms step_avg:159.31ms step:55/1530 train_loss:5.4082 train_time:7170ms step_avg:159.32ms step:56/1530 train_loss:5.0384 train_time:7330ms step_avg:159.34ms step:57/1530 train_loss:4.9019 train_time:7490ms step_avg:159.36ms step:58/1530 train_loss:5.0446 train_time:7650ms step_avg:159.37ms step:59/1530 train_loss:5.0239 train_time:7810ms step_avg:159.38ms step:60/1530 train_loss:5.1546 train_time:7971ms step_avg:159.42ms step:61/1530 train_loss:4.8699 train_time:8131ms step_avg:159.43ms step:62/1530 train_loss:4.9875 train_time:8290ms step_avg:159.43ms step:63/1530 train_loss:4.9678 train_time:8451ms step_avg:159.45ms step:64/1530 train_loss:4.9834 train_time:8611ms step_avg:159.46ms step:65/1530 train_loss:4.7982 train_time:8772ms step_avg:159.49ms step:66/1530 train_loss:4.9230 train_time:8931ms step_avg:159.48ms step:67/1530 train_loss:4.8182 train_time:9092ms step_avg:159.50ms step:68/1530 train_loss:5.1024 train_time:9252ms step_avg:159.52ms step:69/1530 train_loss:4.7231 train_time:9412ms step_avg:159.53ms step:70/1530 train_loss:4.8517 train_time:9573ms step_avg:159.54ms step:71/1530 train_loss:4.9876 train_time:9733ms step_avg:159.55ms step:72/1530 train_loss:4.9148 train_time:9892ms step_avg:159.56ms step:73/1530 train_loss:4.8009 train_time:10052ms step_avg:159.55ms step:74/1530 train_loss:4.9354 train_time:10213ms step_avg:159.57ms step:75/1530 train_loss:4.9033 train_time:10374ms step_avg:159.60ms step:76/1530 train_loss:4.8000 train_time:10534ms step_avg:159.60ms step:77/1530 train_loss:4.9186 train_time:10693ms 
step_avg:159.60ms step:78/1530 train_loss:5.1327 train_time:10853ms step_avg:159.61ms step:79/1530 train_loss:4.8451 train_time:11014ms step_avg:159.62ms step:80/1530 train_loss:4.8587 train_time:11173ms step_avg:159.62ms step:81/1530 train_loss:4.6494 train_time:11333ms step_avg:159.62ms step:82/1530 train_loss:4.8318 train_time:11493ms step_avg:159.62ms step:83/1530 train_loss:4.7760 train_time:11653ms step_avg:159.63ms step:84/1530 train_loss:4.7638 train_time:11813ms step_avg:159.63ms step:85/1530 train_loss:4.6303 train_time:11974ms step_avg:159.65ms step:86/1530 train_loss:4.8456 train_time:12134ms step_avg:159.65ms step:87/1530 train_loss:4.7603 train_time:12293ms step_avg:159.65ms step:88/1530 train_loss:4.7534 train_time:12454ms step_avg:159.67ms step:89/1530 train_loss:4.7162 train_time:12615ms step_avg:159.69ms step:90/1530 train_loss:4.6485 train_time:12777ms step_avg:159.71ms step:91/1530 train_loss:4.6332 train_time:12937ms step_avg:159.71ms step:92/1530 train_loss:4.7953 train_time:13097ms step_avg:159.72ms step:93/1530 train_loss:4.6131 train_time:13257ms step_avg:159.73ms step:94/1530 train_loss:4.6369 train_time:13418ms step_avg:159.74ms step:95/1530 train_loss:4.6914 train_time:13580ms step_avg:159.77ms step:96/1530 train_loss:4.5907 train_time:13742ms step_avg:159.79ms step:97/1530 train_loss:4.6614 train_time:13902ms step_avg:159.79ms step:98/1530 train_loss:4.6001 train_time:14063ms step_avg:159.80ms step:99/1530 train_loss:4.6813 train_time:14222ms step_avg:159.80ms step:100/1530 train_loss:4.6808 train_time:14383ms step_avg:159.82ms step:101/1530 train_loss:4.5329 train_time:14544ms step_avg:159.82ms step:102/1530 train_loss:4.6995 train_time:14704ms step_avg:159.82ms step:103/1530 train_loss:4.5843 train_time:14864ms step_avg:159.83ms step:104/1530 train_loss:4.5498 train_time:15024ms step_avg:159.83ms step:105/1530 train_loss:4.5816 train_time:15185ms step_avg:159.84ms step:106/1530 train_loss:4.6338 train_time:15345ms step_avg:159.85ms 
step:107/1530 train_loss:4.5152 train_time:15506ms step_avg:159.85ms step:108/1530 train_loss:4.3600 train_time:15666ms step_avg:159.86ms step:109/1530 train_loss:4.4967 train_time:15825ms step_avg:159.85ms step:110/1530 train_loss:4.4972 train_time:15986ms step_avg:159.86ms step:111/1530 train_loss:4.4313 train_time:16147ms step_avg:159.87ms step:112/1530 train_loss:4.5865 train_time:16308ms step_avg:159.88ms step:113/1530 train_loss:4.4943 train_time:16468ms step_avg:159.89ms step:114/1530 train_loss:4.3673 train_time:16628ms step_avg:159.88ms step:115/1530 train_loss:4.5083 train_time:16791ms step_avg:159.91ms step:116/1530 train_loss:4.4712 train_time:16954ms step_avg:159.95ms step:117/1530 train_loss:4.3702 train_time:17119ms step_avg:159.99ms step:118/1530 train_loss:4.5869 train_time:17284ms step_avg:160.04ms step:119/1530 train_loss:4.4581 train_time:17448ms step_avg:160.07ms step:120/1530 train_loss:4.3426 train_time:17610ms step_avg:160.10ms step:121/1530 train_loss:4.2962 train_time:17774ms step_avg:160.13ms step:122/1530 train_loss:4.4407 train_time:17938ms step_avg:160.16ms step:123/1530 train_loss:4.2685 train_time:18101ms step_avg:160.19ms step:124/1530 train_loss:4.5808 train_time:18266ms step_avg:160.22ms step:125/1530 train_loss:4.4628 train_time:18429ms step_avg:160.25ms step:125/1530 val_loss:4.4028 train_time:18476ms step_avg:160.66ms step:126/1530 train_loss:4.4073 train_time:18596ms step_avg:160.31ms step:127/1530 train_loss:4.4337 train_time:18762ms step_avg:160.36ms step:128/1530 train_loss:4.3847 train_time:18926ms step_avg:160.39ms step:129/1530 train_loss:4.6815 train_time:19089ms step_avg:160.41ms step:130/1530 train_loss:4.3736 train_time:19253ms step_avg:160.44ms step:131/1530 train_loss:4.4115 train_time:19418ms step_avg:160.48ms step:132/1530 train_loss:4.3466 train_time:19582ms step_avg:160.51ms step:133/1530 train_loss:4.4368 train_time:19745ms step_avg:160.53ms step:134/1530 train_loss:4.2535 train_time:19908ms step_avg:160.55ms 
step:135/1530 train_loss:4.4373 train_time:20072ms step_avg:160.57ms step:136/1530 train_loss:4.2150 train_time:20235ms step_avg:160.60ms step:137/1530 train_loss:4.3713 train_time:20399ms step_avg:160.62ms step:138/1530 train_loss:4.2810 train_time:20562ms step_avg:160.64ms step:139/1530 train_loss:4.3697 train_time:20725ms step_avg:160.66ms step:140/1530 train_loss:4.4697 train_time:20890ms step_avg:160.69ms step:141/1530 train_loss:4.3035 train_time:21055ms step_avg:160.72ms step:142/1530 train_loss:4.2912 train_time:21219ms step_avg:160.75ms step:143/1530 train_loss:4.2548 train_time:21382ms step_avg:160.77ms step:144/1530 train_loss:4.3452 train_time:21546ms step_avg:160.79ms step:145/1530 train_loss:4.3050 train_time:21708ms step_avg:160.80ms step:146/1530 train_loss:4.1678 train_time:21873ms step_avg:160.83ms step:147/1530 train_loss:4.3199 train_time:22037ms step_avg:160.86ms step:148/1530 train_loss:4.3505 train_time:22200ms step_avg:160.87ms step:149/1530 train_loss:4.2953 train_time:22364ms step_avg:160.89ms step:150/1530 train_loss:4.4376 train_time:22528ms step_avg:160.91ms step:151/1530 train_loss:4.2671 train_time:22691ms step_avg:160.93ms step:152/1530 train_loss:4.2845 train_time:22855ms step_avg:160.95ms step:153/1530 train_loss:4.3788 train_time:23019ms step_avg:160.97ms step:154/1530 train_loss:4.3562 train_time:23182ms step_avg:160.99ms step:155/1530 train_loss:4.2652 train_time:23346ms step_avg:161.01ms step:156/1530 train_loss:4.3434 train_time:23509ms step_avg:161.02ms step:157/1530 train_loss:4.3906 train_time:23672ms step_avg:161.03ms step:158/1530 train_loss:4.2435 train_time:23838ms step_avg:161.07ms step:159/1530 train_loss:4.3036 train_time:24002ms step_avg:161.09ms step:160/1530 train_loss:4.1236 train_time:24165ms step_avg:161.10ms step:161/1530 train_loss:4.3393 train_time:24329ms step_avg:161.12ms step:162/1530 train_loss:4.3516 train_time:24493ms step_avg:161.14ms step:163/1530 train_loss:4.3340 train_time:24657ms 
step_avg:161.16ms step:164/1530 train_loss:4.1823 train_time:24821ms step_avg:161.17ms step:165/1530 train_loss:4.2778 train_time:24984ms step_avg:161.19ms step:166/1530 train_loss:4.3369 train_time:25148ms step_avg:161.21ms step:167/1530 train_loss:4.2028 train_time:25313ms step_avg:161.23ms step:168/1530 train_loss:4.2827 train_time:25477ms step_avg:161.25ms step:169/1530 train_loss:4.1532 train_time:25641ms step_avg:161.26ms step:170/1530 train_loss:4.0153 train_time:25804ms step_avg:161.28ms step:171/1530 train_loss:4.1929 train_time:25966ms step_avg:161.28ms step:172/1530 train_loss:4.1969 train_time:26129ms step_avg:161.29ms step:173/1530 train_loss:4.2599 train_time:26292ms step_avg:161.30ms step:174/1530 train_loss:4.4128 train_time:26454ms step_avg:161.31ms step:175/1530 train_loss:4.2450 train_time:26618ms step_avg:161.32ms step:176/1530 train_loss:4.0984 train_time:26780ms step_avg:161.33ms step:177/1530 train_loss:4.0695 train_time:26943ms step_avg:161.34ms step:178/1530 train_loss:4.1814 train_time:27106ms step_avg:161.34ms step:179/1530 train_loss:4.1305 train_time:27268ms step_avg:161.35ms step:180/1530 train_loss:4.1144 train_time:27432ms step_avg:161.36ms step:181/1530 train_loss:4.2939 train_time:27595ms step_avg:161.37ms step:182/1530 train_loss:4.1417 train_time:27758ms step_avg:161.39ms step:183/1530 train_loss:4.1104 train_time:27921ms step_avg:161.39ms step:184/1530 train_loss:4.1196 train_time:28084ms step_avg:161.40ms step:185/1530 train_loss:4.2085 train_time:28248ms step_avg:161.41ms step:186/1530 train_loss:4.1662 train_time:28410ms step_avg:161.42ms step:187/1530 train_loss:4.2255 train_time:28573ms step_avg:161.43ms step:188/1530 train_loss:4.1578 train_time:28876ms step_avg:162.23ms step:189/1530 train_loss:4.0972 train_time:29206ms step_avg:163.16ms step:190/1530 train_loss:4.2031 train_time:29368ms step_avg:163.16ms step:191/1530 train_loss:4.0736 train_time:29531ms step_avg:163.16ms step:192/1530 train_loss:4.0274 
train_time:29693ms step_avg:163.15ms step:193/1530 train_loss:4.2413 train_time:29857ms step_avg:163.15ms step:194/1530 train_loss:4.1677 train_time:30020ms step_avg:163.15ms step:195/1530 train_loss:4.3484 train_time:30183ms step_avg:163.15ms step:196/1530 train_loss:4.1724 train_time:30346ms step_avg:163.15ms step:197/1530 train_loss:4.0388 train_time:30509ms step_avg:163.15ms step:198/1530 train_loss:4.1727 train_time:30671ms step_avg:163.14ms step:199/1530 train_loss:4.0364 train_time:30835ms step_avg:163.15ms step:200/1530 train_loss:4.1144 train_time:30999ms step_avg:163.15ms step:201/1530 train_loss:4.0018 train_time:31161ms step_avg:163.15ms step:202/1530 train_loss:4.2490 train_time:31324ms step_avg:163.15ms step:203/1530 train_loss:4.0605 train_time:31488ms step_avg:163.15ms step:204/1530 train_loss:4.1769 train_time:31650ms step_avg:163.14ms step:205/1530 train_loss:4.2314 train_time:31813ms step_avg:163.14ms step:206/1530 train_loss:3.9479 train_time:31976ms step_avg:163.14ms step:207/1530 train_loss:4.0851 train_time:32140ms step_avg:163.14ms step:208/1530 train_loss:4.0943 train_time:32302ms step_avg:163.14ms step:209/1530 train_loss:4.2262 train_time:32466ms step_avg:163.14ms step:210/1530 train_loss:4.1748 train_time:32628ms step_avg:163.14ms step:211/1530 train_loss:4.0580 train_time:32792ms step_avg:163.14ms step:212/1530 train_loss:4.1015 train_time:32956ms step_avg:163.15ms step:213/1530 train_loss:4.0419 train_time:33120ms step_avg:163.15ms step:214/1530 train_loss:4.1051 train_time:33283ms step_avg:163.15ms step:215/1530 train_loss:3.9482 train_time:33446ms step_avg:163.15ms step:216/1530 train_loss:3.9979 train_time:33608ms step_avg:163.15ms step:217/1530 train_loss:4.0081 train_time:33772ms step_avg:163.15ms step:218/1530 train_loss:4.0764 train_time:33936ms step_avg:163.15ms step:219/1530 train_loss:4.0616 train_time:34098ms step_avg:163.15ms step:220/1530 train_loss:4.0759 train_time:34261ms step_avg:163.15ms step:221/1530 
train_loss:4.0885 train_time:34425ms step_avg:163.15ms step:222/1530 train_loss:3.9938 train_time:34587ms step_avg:163.15ms step:223/1530 train_loss:3.9811 train_time:34751ms step_avg:163.15ms step:224/1530 train_loss:4.2952 train_time:34914ms step_avg:163.15ms step:225/1530 train_loss:3.9155 train_time:35077ms step_avg:163.15ms step:226/1530 train_loss:3.9849 train_time:35241ms step_avg:163.15ms step:227/1530 train_loss:3.9658 train_time:35403ms step_avg:163.15ms step:228/1530 train_loss:4.1448 train_time:35567ms step_avg:163.15ms step:229/1530 train_loss:3.9191 train_time:35735ms step_avg:163.18ms step:230/1530 train_loss:4.0433 train_time:35901ms step_avg:163.19ms step:231/1530 train_loss:3.9032 train_time:36067ms step_avg:163.20ms step:232/1530 train_loss:3.9617 train_time:36234ms step_avg:163.21ms step:233/1530 train_loss:4.0812 train_time:36400ms step_avg:163.23ms step:234/1530 train_loss:4.0242 train_time:36565ms step_avg:163.24ms step:235/1530 train_loss:3.8920 train_time:36733ms step_avg:163.26ms step:236/1530 train_loss:4.0746 train_time:36900ms step_avg:163.27ms step:237/1530 train_loss:4.0679 train_time:37066ms step_avg:163.29ms step:238/1530 train_loss:3.9318 train_time:37233ms step_avg:163.30ms step:239/1530 train_loss:4.0768 train_time:37399ms step_avg:163.31ms step:240/1530 train_loss:4.1070 train_time:37564ms step_avg:163.32ms step:241/1530 train_loss:3.9628 train_time:37730ms step_avg:163.34ms step:242/1530 train_loss:4.1367 train_time:37897ms step_avg:163.35ms step:243/1530 train_loss:4.0049 train_time:38064ms step_avg:163.36ms step:244/1530 train_loss:4.0782 train_time:38228ms step_avg:163.37ms step:245/1530 train_loss:4.1393 train_time:38396ms step_avg:163.39ms step:246/1530 train_loss:4.0540 train_time:38561ms step_avg:163.40ms step:247/1530 train_loss:3.9969 train_time:38727ms step_avg:163.41ms step:248/1530 train_loss:4.0950 train_time:38894ms step_avg:163.42ms step:249/1530 train_loss:3.9229 train_time:39061ms step_avg:163.43ms 
step:250/1530 train_loss:3.9691 train_time:39226ms step_avg:163.44ms step:250/1530 val_loss:3.9933 train_time:39274ms step_avg:163.64ms step:251/1530 train_loss:4.0626 train_time:39393ms step_avg:163.46ms step:252/1530 train_loss:4.1523 train_time:39561ms step_avg:163.48ms step:253/1530 train_loss:3.9251 train_time:39728ms step_avg:163.49ms step:254/1530 train_loss:3.8737 train_time:39893ms step_avg:163.50ms step:255/1530 train_loss:4.0735 train_time:40059ms step_avg:163.51ms step:256/1530 train_loss:3.9821 train_time:40226ms step_avg:163.52ms step:257/1530 train_loss:3.9894 train_time:40391ms step_avg:163.53ms step:258/1530 train_loss:3.9802 train_time:40557ms step_avg:163.54ms step:259/1530 train_loss:4.0229 train_time:40725ms step_avg:163.55ms step:260/1530 train_loss:4.0582 train_time:40891ms step_avg:163.56ms step:261/1530 train_loss:4.0207 train_time:41056ms step_avg:163.57ms step:262/1530 train_loss:3.9842 train_time:41223ms step_avg:163.59ms step:263/1530 train_loss:3.8805 train_time:41389ms step_avg:163.59ms step:264/1530 train_loss:3.9823 train_time:41555ms step_avg:163.60ms step:265/1530 train_loss:3.8693 train_time:41722ms step_avg:163.62ms step:266/1530 train_loss:3.9213 train_time:41888ms step_avg:163.63ms step:267/1530 train_loss:3.9323 train_time:42055ms step_avg:163.64ms step:268/1530 train_loss:3.9555 train_time:42222ms step_avg:163.65ms step:269/1530 train_loss:3.8558 train_time:42388ms step_avg:163.66ms step:270/1530 train_loss:4.0958 train_time:42554ms step_avg:163.67ms step:271/1530 train_loss:3.9641 train_time:42721ms step_avg:163.68ms step:272/1530 train_loss:3.9211 train_time:42886ms step_avg:163.69ms step:273/1530 train_loss:3.9344 train_time:43052ms step_avg:163.70ms step:274/1530 train_loss:4.0360 train_time:43220ms step_avg:163.71ms step:275/1530 train_loss:4.0618 train_time:43385ms step_avg:163.72ms step:276/1530 train_loss:4.2166 train_time:43551ms step_avg:163.73ms step:277/1530 train_loss:4.0348 train_time:43716ms step_avg:163.73ms 
step:278/1530 train_loss:4.0863 train_time:43884ms step_avg:163.74ms step:279/1530 train_loss:3.9993 train_time:44050ms step_avg:163.75ms step:280/1530 train_loss:4.1858 train_time:44219ms step_avg:163.77ms step:281/1530 train_loss:3.9667 train_time:44386ms step_avg:163.78ms step:282/1530 train_loss:3.9388 train_time:44552ms step_avg:163.79ms step:283/1530 train_loss:3.9137 train_time:44718ms step_avg:163.80ms step:284/1530 train_loss:4.0431 train_time:44884ms step_avg:163.81ms step:285/1530 train_loss:4.0553 train_time:45048ms step_avg:163.81ms step:286/1530 train_loss:4.0792 train_time:45214ms step_avg:163.82ms step:287/1530 train_loss:3.9072 train_time:45379ms step_avg:163.82ms step:288/1530 train_loss:4.0061 train_time:45543ms step_avg:163.82ms step:289/1530 train_loss:3.8768 train_time:45708ms step_avg:163.83ms step:290/1530 train_loss:3.8477 train_time:45872ms step_avg:163.83ms step:291/1530 train_loss:3.9022 train_time:46038ms step_avg:163.83ms step:292/1530 train_loss:3.8586 train_time:46204ms step_avg:163.84ms step:293/1530 train_loss:3.9037 train_time:46368ms step_avg:163.84ms step:294/1530 train_loss:3.9327 train_time:46533ms step_avg:163.85ms step:295/1530 train_loss:3.8415 train_time:46698ms step_avg:163.85ms step:296/1530 train_loss:3.8603 train_time:46863ms step_avg:163.86ms step:297/1530 train_loss:3.8627 train_time:47028ms step_avg:163.86ms step:298/1530 train_loss:3.9732 train_time:47194ms step_avg:163.87ms step:299/1530 train_loss:3.8233 train_time:47359ms step_avg:163.87ms step:300/1530 train_loss:3.9601 train_time:47525ms step_avg:163.88ms step:301/1530 train_loss:3.9514 train_time:47691ms step_avg:163.89ms step:302/1530 train_loss:3.9283 train_time:47857ms step_avg:163.89ms step:303/1530 train_loss:3.9787 train_time:48023ms step_avg:163.90ms step:304/1530 train_loss:3.9650 train_time:48187ms step_avg:163.90ms step:305/1530 train_loss:4.4544 train_time:48353ms step_avg:163.91ms step:306/1530 train_loss:3.9350 train_time:48519ms 
step_avg:163.91ms step:307/1530 train_loss:3.8288 train_time:48683ms step_avg:163.92ms step:308/1530 train_loss:3.9673 train_time:48848ms step_avg:163.92ms step:309/1530 train_loss:3.8670 train_time:49014ms step_avg:163.93ms step:310/1530 train_loss:4.0770 train_time:49180ms step_avg:163.93ms step:311/1530 train_loss:3.9182 train_time:49346ms step_avg:163.94ms step:312/1530 train_loss:3.8543 train_time:49511ms step_avg:163.94ms step:313/1530 train_loss:3.9317 train_time:49676ms step_avg:163.95ms step:314/1530 train_loss:4.0597 train_time:49840ms step_avg:163.95ms step:315/1530 train_loss:3.9369 train_time:50006ms step_avg:163.95ms step:316/1530 train_loss:3.7902 train_time:50170ms step_avg:163.95ms step:317/1530 train_loss:3.8707 train_time:50335ms step_avg:163.96ms step:318/1530 train_loss:3.9156 train_time:50502ms step_avg:163.97ms step:319/1530 train_loss:3.8844 train_time:50667ms step_avg:163.97ms step:320/1530 train_loss:4.0096 train_time:50832ms step_avg:163.97ms step:321/1530 train_loss:3.9551 train_time:50996ms step_avg:163.97ms step:322/1530 train_loss:3.9245 train_time:51163ms step_avg:163.98ms step:323/1530 train_loss:4.0037 train_time:51329ms step_avg:163.99ms step:324/1530 train_loss:3.9396 train_time:51494ms step_avg:163.99ms step:325/1530 train_loss:4.0041 train_time:51659ms step_avg:164.00ms step:326/1530 train_loss:3.8911 train_time:51825ms step_avg:164.00ms step:327/1530 train_loss:4.3827 train_time:51991ms step_avg:164.01ms step:328/1530 train_loss:4.0734 train_time:52156ms step_avg:164.01ms step:329/1530 train_loss:3.7922 train_time:52322ms step_avg:164.02ms step:330/1530 train_loss:3.7517 train_time:52487ms step_avg:164.02ms step:331/1530 train_loss:3.9745 train_time:52652ms step_avg:164.03ms step:332/1530 train_loss:3.9107 train_time:52816ms step_avg:164.03ms step:333/1530 train_loss:3.8847 train_time:52983ms step_avg:164.03ms step:334/1530 train_loss:3.8372 train_time:53147ms step_avg:164.04ms step:335/1530 train_loss:4.0096 
train_time:53312ms step_avg:164.04ms step:336/1530 train_loss:3.9629 train_time:53477ms step_avg:164.04ms step:337/1530 train_loss:4.4157 train_time:53643ms step_avg:164.05ms step:338/1530 train_loss:3.9334 train_time:53808ms step_avg:164.05ms step:339/1530 train_loss:3.8588 train_time:53972ms step_avg:164.05ms step:340/1530 train_loss:3.9331 train_time:54137ms step_avg:164.05ms step:341/1530 train_loss:3.8482 train_time:54305ms step_avg:164.06ms step:342/1530 train_loss:3.8096 train_time:54472ms step_avg:164.07ms step:343/1530 train_loss:3.8316 train_time:54640ms step_avg:164.09ms step:344/1530 train_loss:3.9908 train_time:54809ms step_avg:164.10ms step:345/1530 train_loss:3.8075 train_time:54977ms step_avg:164.11ms step:346/1530 train_loss:3.7591 train_time:55145ms step_avg:164.12ms step:347/1530 train_loss:3.7902 train_time:55314ms step_avg:164.14ms step:348/1530 train_loss:3.8551 train_time:55481ms step_avg:164.15ms step:349/1530 train_loss:3.8290 train_time:55650ms step_avg:164.16ms step:350/1530 train_loss:3.5605 train_time:55819ms step_avg:164.17ms step:351/1530 train_loss:3.8261 train_time:55988ms step_avg:164.19ms step:352/1530 train_loss:4.1768 train_time:56156ms step_avg:164.20ms step:353/1530 train_loss:3.6522 train_time:56324ms step_avg:164.21ms step:354/1530 train_loss:3.9245 train_time:56492ms step_avg:164.22ms step:355/1530 train_loss:3.7818 train_time:56662ms step_avg:164.24ms step:356/1530 train_loss:3.8813 train_time:56830ms step_avg:164.25ms step:357/1530 train_loss:3.7481 train_time:56998ms step_avg:164.26ms step:358/1530 train_loss:3.8559 train_time:57165ms step_avg:164.27ms step:359/1530 train_loss:3.7529 train_time:57334ms step_avg:164.28ms step:360/1530 train_loss:3.4211 train_time:57504ms step_avg:164.30ms step:361/1530 train_loss:4.0119 train_time:57672ms step_avg:164.31ms step:362/1530 train_loss:3.9047 train_time:57840ms step_avg:164.32ms step:363/1530 train_loss:3.8386 train_time:58008ms step_avg:164.33ms step:364/1530 
train_loss:3.7420 train_time:58176ms step_avg:164.34ms step:365/1530 train_loss:3.9069 train_time:58344ms step_avg:164.35ms step:366/1530 train_loss:3.8581 train_time:58513ms step_avg:164.36ms step:367/1530 train_loss:3.8528 train_time:58681ms step_avg:164.37ms step:368/1530 train_loss:3.8472 train_time:58848ms step_avg:164.38ms step:369/1530 train_loss:3.7444 train_time:59017ms step_avg:164.39ms step:370/1530 train_loss:3.8805 train_time:59183ms step_avg:164.40ms step:371/1530 train_loss:3.7234 train_time:59351ms step_avg:164.41ms step:372/1530 train_loss:3.6933 train_time:59520ms step_avg:164.42ms step:373/1530 train_loss:3.9115 train_time:59687ms step_avg:164.43ms step:374/1530 train_loss:3.8259 train_time:59855ms step_avg:164.44ms step:375/1530 train_loss:3.7945 train_time:60025ms step_avg:164.45ms step:375/1530 val_loss:3.8186 train_time:60074ms step_avg:164.59ms step:376/1530 train_loss:3.8587 train_time:60196ms step_avg:164.47ms step:377/1530 train_loss:3.7887 train_time:60499ms step_avg:164.85ms step:378/1530 train_loss:3.8531 train_time:60676ms step_avg:164.88ms step:379/1530 train_loss:3.8579 train_time:60999ms step_avg:165.31ms step:380/1530 train_loss:3.9424 train_time:61175ms step_avg:165.34ms step:381/1530 train_loss:3.8355 train_time:61342ms step_avg:165.34ms step:382/1530 train_loss:3.8006 train_time:61511ms step_avg:165.35ms step:383/1530 train_loss:3.7914 train_time:61680ms step_avg:165.36ms step:384/1530 train_loss:3.8711 train_time:61847ms step_avg:165.37ms step:385/1530 train_loss:3.7919 train_time:62016ms step_avg:165.38ms step:386/1530 train_loss:3.8905 train_time:62183ms step_avg:165.38ms step:387/1530 train_loss:4.0551 train_time:62351ms step_avg:165.39ms step:388/1530 train_loss:3.7874 train_time:62517ms step_avg:165.39ms step:389/1530 train_loss:3.7908 train_time:62685ms step_avg:165.40ms step:390/1530 train_loss:3.8915 train_time:62855ms step_avg:165.41ms step:391/1530 train_loss:3.8037 train_time:63021ms step_avg:165.41ms step:392/1530 
train_loss:3.9135 train_time:63188ms step_avg:165.41ms step:393/1530 train_loss:3.7609 train_time:63357ms step_avg:165.42ms step:394/1530 train_loss:3.8812 train_time:63525ms step_avg:165.43ms step:395/1530 train_loss:3.6214 train_time:63693ms step_avg:165.44ms step:396/1530 train_loss:3.8328 train_time:63861ms step_avg:165.44ms step:397/1530 train_loss:3.8594 train_time:64028ms step_avg:165.45ms step:398/1530 train_loss:3.8844 train_time:64197ms step_avg:165.46ms step:399/1530 train_loss:3.7689 train_time:64363ms step_avg:165.46ms step:400/1530 train_loss:3.8193 train_time:64530ms step_avg:165.46ms step:401/1530 train_loss:3.9047 train_time:64698ms step_avg:165.47ms step:402/1530 train_loss:3.8381 train_time:64864ms step_avg:165.47ms step:403/1530 train_loss:3.9571 train_time:65033ms step_avg:165.48ms step:404/1530 train_loss:3.6741 train_time:65200ms step_avg:165.48ms step:405/1530 train_loss:3.7855 train_time:65367ms step_avg:165.49ms step:406/1530 train_loss:4.0913 train_time:65535ms step_avg:165.49ms step:407/1530 train_loss:3.7750 train_time:65702ms step_avg:165.50ms step:408/1530 train_loss:3.8177 train_time:65870ms step_avg:165.50ms step:409/1530 train_loss:3.8553 train_time:66038ms step_avg:165.51ms step:410/1530 train_loss:3.7563 train_time:66204ms step_avg:165.51ms step:411/1530 train_loss:3.7586 train_time:66372ms step_avg:165.52ms step:412/1530 train_loss:4.1741 train_time:66541ms step_avg:165.53ms step:413/1530 train_loss:3.6437 train_time:66709ms step_avg:165.53ms step:414/1530 train_loss:4.0074 train_time:66877ms step_avg:165.54ms step:415/1530 train_loss:3.7455 train_time:67043ms step_avg:165.54ms step:416/1530 train_loss:3.7590 train_time:67211ms step_avg:165.54ms step:417/1530 train_loss:3.9532 train_time:67380ms step_avg:165.55ms step:418/1530 train_loss:3.6819 train_time:67546ms step_avg:165.55ms step:419/1530 train_loss:3.7977 train_time:67714ms step_avg:165.56ms step:420/1530 train_loss:3.7005 train_time:67881ms step_avg:165.56ms 
step:421/1530 train_loss:3.6419 train_time:68048ms step_avg:165.57ms step:422/1530 train_loss:3.7820 train_time:68215ms step_avg:165.57ms step:423/1530 train_loss:3.8715 train_time:68381ms step_avg:165.57ms step:424/1530 train_loss:3.6098 train_time:68549ms step_avg:165.58ms step:425/1530 train_loss:3.7931 train_time:68716ms step_avg:165.58ms step:426/1530 train_loss:3.6420 train_time:68883ms step_avg:165.58ms step:427/1530 train_loss:3.8861 train_time:69050ms step_avg:165.59ms step:428/1530 train_loss:3.8066 train_time:69217ms step_avg:165.59ms step:429/1530 train_loss:3.7579 train_time:69384ms step_avg:165.59ms step:430/1530 train_loss:3.6982 train_time:69551ms step_avg:165.60ms step:431/1530 train_loss:3.6165 train_time:69718ms step_avg:165.60ms step:432/1530 train_loss:3.7546 train_time:69886ms step_avg:165.61ms step:433/1530 train_loss:3.8146 train_time:70055ms step_avg:165.61ms step:434/1530 train_loss:3.7720 train_time:70221ms step_avg:165.61ms step:435/1530 train_loss:3.8026 train_time:70388ms step_avg:165.62ms step:436/1530 train_loss:3.8271 train_time:70556ms step_avg:165.62ms step:437/1530 train_loss:3.7172 train_time:70723ms step_avg:165.63ms step:438/1530 train_loss:3.6945 train_time:70891ms step_avg:165.63ms step:439/1530 train_loss:3.7052 train_time:71060ms step_avg:165.64ms step:440/1530 train_loss:3.8854 train_time:71226ms step_avg:165.64ms step:441/1530 train_loss:3.7547 train_time:71395ms step_avg:165.65ms step:442/1530 train_loss:3.7379 train_time:71562ms step_avg:165.65ms step:443/1530 train_loss:3.6246 train_time:71729ms step_avg:165.66ms step:444/1530 train_loss:3.9159 train_time:71895ms step_avg:165.66ms step:445/1530 train_loss:3.8415 train_time:72062ms step_avg:165.66ms step:446/1530 train_loss:3.8345 train_time:72229ms step_avg:165.66ms step:447/1530 train_loss:3.7511 train_time:72396ms step_avg:165.67ms step:448/1530 train_loss:3.8520 train_time:72563ms step_avg:165.67ms step:449/1530 train_loss:3.6863 train_time:72731ms 
step_avg:165.67ms step:450/1530 train_loss:3.7211 train_time:72899ms step_avg:165.68ms step:451/1530 train_loss:3.5759 train_time:73067ms step_avg:165.68ms step:452/1530 train_loss:3.7016 train_time:73234ms step_avg:165.69ms step:453/1530 train_loss:3.6670 train_time:73400ms step_avg:165.69ms step:454/1530 train_loss:3.6337 train_time:73567ms step_avg:165.69ms step:455/1530 train_loss:3.8406 train_time:73736ms step_avg:165.70ms step:456/1530 train_loss:3.7274 train_time:73904ms step_avg:165.70ms step:457/1530 train_loss:3.7765 train_time:74075ms step_avg:165.72ms step:458/1530 train_loss:3.8251 train_time:74244ms step_avg:165.72ms step:459/1530 train_loss:3.6313 train_time:74415ms step_avg:165.74ms step:460/1530 train_loss:3.7858 train_time:74584ms step_avg:165.74ms step:461/1530 train_loss:3.6912 train_time:74757ms step_avg:165.76ms step:462/1530 train_loss:3.7280 train_time:74926ms step_avg:165.76ms step:463/1530 train_loss:3.7692 train_time:75097ms step_avg:165.78ms step:464/1530 train_loss:3.7059 train_time:75266ms step_avg:165.78ms step:465/1530 train_loss:3.7129 train_time:75435ms step_avg:165.79ms step:466/1530 train_loss:3.7966 train_time:75604ms step_avg:165.80ms step:467/1530 train_loss:3.8137 train_time:75777ms step_avg:165.81ms step:468/1530 train_loss:3.7908 train_time:75946ms step_avg:165.82ms step:469/1530 train_loss:3.6825 train_time:76115ms step_avg:165.83ms step:470/1530 train_loss:3.7683 train_time:76284ms step_avg:165.83ms step:471/1530 train_loss:3.8030 train_time:76456ms step_avg:165.85ms step:472/1530 train_loss:3.7821 train_time:76624ms step_avg:165.85ms step:473/1530 train_loss:3.7074 train_time:76794ms step_avg:165.86ms step:474/1530 train_loss:3.5886 train_time:76964ms step_avg:165.87ms step:475/1530 train_loss:4.0226 train_time:77134ms step_avg:165.88ms step:476/1530 train_loss:3.7548 train_time:77303ms step_avg:165.89ms step:477/1530 train_loss:3.5976 train_time:77473ms step_avg:165.90ms step:478/1530 train_loss:3.8195 
train_time:77643ms step_avg:165.90ms step:479/1530 train_loss:3.7710 train_time:77813ms step_avg:165.91ms step:480/1530 train_loss:3.9218 train_time:77983ms step_avg:165.92ms step:481/1530 train_loss:3.7232 train_time:78154ms step_avg:165.93ms step:482/1530 train_loss:3.5234 train_time:78323ms step_avg:165.94ms step:483/1530 train_loss:3.7986 train_time:78491ms step_avg:165.94ms step:484/1530 train_loss:3.6581 train_time:78661ms step_avg:165.95ms step:485/1530 train_loss:3.6543 train_time:78831ms step_avg:165.96ms step:486/1530 train_loss:3.5771 train_time:79002ms step_avg:165.97ms step:487/1530 train_loss:3.6795 train_time:79172ms step_avg:165.98ms step:488/1530 train_loss:3.8789 train_time:79341ms step_avg:165.99ms step:489/1530 train_loss:3.7088 train_time:79510ms step_avg:165.99ms step:490/1530 train_loss:3.5879 train_time:79679ms step_avg:166.00ms step:491/1530 train_loss:3.6141 train_time:79847ms step_avg:166.00ms step:492/1530 train_loss:3.7286 train_time:80017ms step_avg:166.01ms step:493/1530 train_loss:3.5715 train_time:80187ms step_avg:166.02ms step:494/1530 train_loss:3.6943 train_time:80357ms step_avg:166.03ms step:495/1530 train_loss:3.6629 train_time:80527ms step_avg:166.03ms step:496/1530 train_loss:3.5146 train_time:80698ms step_avg:166.04ms step:497/1530 train_loss:3.7304 train_time:80867ms step_avg:166.05ms step:498/1530 train_loss:3.7840 train_time:81038ms step_avg:166.06ms step:499/1530 train_loss:3.8177 train_time:81207ms step_avg:166.07ms step:500/1530 train_loss:3.7295 train_time:81377ms step_avg:166.08ms step:500/1530 val_loss:3.7032 train_time:81426ms step_avg:166.17ms step:501/1530 train_loss:3.8037 train_time:81548ms step_avg:166.09ms step:502/1530 train_loss:3.7467 train_time:81720ms step_avg:166.10ms step:503/1530 train_loss:3.7733 train_time:81891ms step_avg:166.11ms step:504/1530 train_loss:3.7127 train_time:82059ms step_avg:166.11ms step:505/1530 train_loss:3.8037 train_time:82228ms step_avg:166.12ms step:506/1530 train_loss:3.6415 
train_time:82400ms step_avg:166.13ms step:507/1530 train_loss:3.7625 train_time:82569ms step_avg:166.14ms step:508/1530 train_loss:3.8257 train_time:82740ms step_avg:166.15ms step:509/1530 train_loss:3.7680 train_time:82911ms step_avg:166.15ms step:510/1530 train_loss:3.5800 train_time:83080ms step_avg:166.16ms step:511/1530 train_loss:3.7719 train_time:83250ms step_avg:166.17ms step:512/1530 train_loss:3.7139 train_time:83421ms step_avg:166.18ms step:513/1530 train_loss:3.6589 train_time:83590ms step_avg:166.18ms step:514/1530 train_loss:3.8055 train_time:83760ms step_avg:166.19ms step:515/1530 train_loss:3.7321 train_time:83928ms step_avg:166.19ms step:516/1530 train_loss:4.0704 train_time:84098ms step_avg:166.20ms step:517/1530 train_loss:3.6858 train_time:84267ms step_avg:166.21ms step:518/1530 train_loss:3.7693 train_time:84435ms step_avg:166.21ms step:519/1530 train_loss:3.6491 train_time:84605ms step_avg:166.22ms step:520/1530 train_loss:3.6817 train_time:84775ms step_avg:166.23ms step:521/1530 train_loss:3.6593 train_time:84944ms step_avg:166.23ms step:522/1530 train_loss:3.6505 train_time:85114ms step_avg:166.24ms step:523/1530 train_loss:4.2935 train_time:85284ms step_avg:166.25ms step:524/1530 train_loss:3.7397 train_time:85451ms step_avg:166.25ms step:525/1530 train_loss:3.6758 train_time:85621ms step_avg:166.25ms step:526/1530 train_loss:3.6914 train_time:85790ms step_avg:166.26ms step:527/1530 train_loss:3.6500 train_time:85960ms step_avg:166.27ms step:528/1530 train_loss:3.6228 train_time:86128ms step_avg:166.27ms step:529/1530 train_loss:3.8449 train_time:86299ms step_avg:166.28ms step:530/1530 train_loss:3.6499 train_time:86468ms step_avg:166.29ms step:531/1530 train_loss:3.9150 train_time:86638ms step_avg:166.29ms step:532/1530 train_loss:3.7271 train_time:86808ms step_avg:166.30ms step:533/1530 train_loss:3.6493 train_time:86978ms step_avg:166.31ms step:534/1530 train_loss:3.6621 train_time:87146ms step_avg:166.31ms step:535/1530 
train_loss:3.6079 train_time:87317ms step_avg:166.32ms step:536/1530 train_loss:3.7459 train_time:87485ms step_avg:166.32ms step:537/1530 train_loss:3.7146 train_time:87656ms step_avg:166.33ms step:538/1530 train_loss:3.6230 train_time:87824ms step_avg:166.33ms step:539/1530 train_loss:4.1102 train_time:87997ms step_avg:166.35ms step:540/1530 train_loss:3.6686 train_time:88165ms step_avg:166.35ms step:541/1530 train_loss:3.7818 train_time:88333ms step_avg:166.35ms step:542/1530 train_loss:3.5817 train_time:88502ms step_avg:166.36ms step:543/1530 train_loss:3.5762 train_time:88670ms step_avg:166.36ms step:544/1530 train_loss:3.6309 train_time:88839ms step_avg:166.37ms step:545/1530 train_loss:3.5913 train_time:89008ms step_avg:166.37ms step:546/1530 train_loss:3.6306 train_time:89178ms step_avg:166.38ms step:547/1530 train_loss:3.6353 train_time:89346ms step_avg:166.38ms step:548/1530 train_loss:3.6047 train_time:89516ms step_avg:166.39ms step:549/1530 train_loss:3.7205 train_time:89684ms step_avg:166.39ms step:550/1530 train_loss:3.6156 train_time:89854ms step_avg:166.40ms step:551/1530 train_loss:3.6296 train_time:90022ms step_avg:166.40ms step:552/1530 train_loss:3.9330 train_time:90191ms step_avg:166.40ms step:553/1530 train_loss:3.7583 train_time:90360ms step_avg:166.41ms step:554/1530 train_loss:3.7024 train_time:90528ms step_avg:166.41ms step:555/1530 train_loss:3.6230 train_time:90698ms step_avg:166.42ms step:556/1530 train_loss:3.6956 train_time:90866ms step_avg:166.42ms step:557/1530 train_loss:3.3079 train_time:91035ms step_avg:166.43ms step:558/1530 train_loss:3.6104 train_time:91204ms step_avg:166.43ms step:559/1530 train_loss:3.6494 train_time:91373ms step_avg:166.43ms step:560/1530 train_loss:3.6811 train_time:91541ms step_avg:166.44ms step:561/1530 train_loss:3.6088 train_time:91710ms step_avg:166.44ms step:562/1530 train_loss:3.5552 train_time:91879ms step_avg:166.45ms step:563/1530 train_loss:3.7490 train_time:92048ms step_avg:166.45ms 
step:564/1530 train_loss:3.5688 train_time:92218ms step_avg:166.46ms step:565/1530 train_loss:3.6806 train_time:92386ms step_avg:166.46ms step:566/1530 train_loss:3.6056 train_time:92693ms step_avg:166.71ms step:567/1530 train_loss:3.5977 train_time:92872ms step_avg:166.74ms step:568/1530 train_loss:3.6835 train_time:93043ms step_avg:166.74ms step:569/1530 train_loss:3.6453 train_time:93367ms step_avg:167.03ms step:570/1530 train_loss:3.6884 train_time:93538ms step_avg:167.03ms step:571/1530 train_loss:3.7555 train_time:93707ms step_avg:167.04ms step:572/1530 train_loss:3.7232 train_time:93881ms step_avg:167.05ms step:573/1530 train_loss:3.7347 train_time:94055ms step_avg:167.06ms step:574/1530 train_loss:3.7789 train_time:94228ms step_avg:167.07ms step:575/1530 train_loss:3.7234 train_time:94400ms step_avg:167.08ms step:576/1530 train_loss:3.7563 train_time:94569ms step_avg:167.08ms step:577/1530 train_loss:3.6681 train_time:94741ms step_avg:167.09ms step:578/1530 train_loss:3.6644 train_time:94914ms step_avg:167.10ms step:579/1530 train_loss:3.6622 train_time:95084ms step_avg:167.11ms step:580/1530 train_loss:3.5855 train_time:95255ms step_avg:167.11ms step:581/1530 train_loss:3.6329 train_time:95426ms step_avg:167.12ms step:582/1530 train_loss:3.8431 train_time:95597ms step_avg:167.13ms step:583/1530 train_loss:3.6265 train_time:95767ms step_avg:167.13ms step:584/1530 train_loss:3.5923 train_time:95939ms step_avg:167.14ms step:585/1530 train_loss:3.7851 train_time:96109ms step_avg:167.15ms step:586/1530 train_loss:3.5148 train_time:96280ms step_avg:167.15ms step:587/1530 train_loss:3.6673 train_time:96451ms step_avg:167.16ms step:588/1530 train_loss:3.6362 train_time:96621ms step_avg:167.16ms step:589/1530 train_loss:3.9959 train_time:96792ms step_avg:167.17ms step:590/1530 train_loss:3.7778 train_time:96964ms step_avg:167.18ms step:591/1530 train_loss:3.5073 train_time:97138ms step_avg:167.19ms step:592/1530 train_loss:3.5340 train_time:97311ms 
step_avg:167.20ms step:593/1530 train_loss:3.4968 train_time:97485ms step_avg:167.21ms step:594/1530 train_loss:3.5469 train_time:97657ms step_avg:167.22ms step:595/1530 train_loss:3.9135 train_time:97829ms step_avg:167.23ms step:596/1530 train_loss:3.6445 train_time:98002ms step_avg:167.24ms step:597/1530 train_loss:3.5826 train_time:98172ms step_avg:167.24ms step:598/1530 train_loss:3.6509 train_time:98342ms step_avg:167.25ms step:599/1530 train_loss:3.4736 train_time:98514ms step_avg:167.26ms step:600/1530 train_loss:3.5879 train_time:98684ms step_avg:167.26ms step:601/1530 train_loss:3.6395 train_time:98859ms step_avg:167.27ms step:602/1530 train_loss:3.6733 train_time:99031ms step_avg:167.28ms step:603/1530 train_loss:3.7774 train_time:99203ms step_avg:167.29ms step:604/1530 train_loss:3.6046 train_time:99375ms step_avg:167.30ms step:605/1530 train_loss:3.6105 train_time:99545ms step_avg:167.30ms step:606/1530 train_loss:3.5674 train_time:99719ms step_avg:167.31ms step:607/1530 train_loss:3.8373 train_time:99890ms step_avg:167.32ms step:608/1530 train_loss:3.6295 train_time:100061ms step_avg:167.33ms step:609/1530 train_loss:3.6124 train_time:100231ms step_avg:167.33ms step:610/1530 train_loss:3.6946 train_time:100401ms step_avg:167.34ms step:611/1530 train_loss:3.5949 train_time:100573ms step_avg:167.34ms step:612/1530 train_loss:3.5656 train_time:100743ms step_avg:167.35ms step:613/1530 train_loss:3.7608 train_time:100916ms step_avg:167.36ms step:614/1530 train_loss:3.6971 train_time:101087ms step_avg:167.36ms step:615/1530 train_loss:3.6852 train_time:101257ms step_avg:167.37ms step:616/1530 train_loss:3.6246 train_time:101427ms step_avg:167.37ms step:617/1530 train_loss:3.5646 train_time:101600ms step_avg:167.38ms step:618/1530 train_loss:3.6883 train_time:101771ms step_avg:167.39ms step:619/1530 train_loss:3.5415 train_time:101941ms step_avg:167.39ms step:620/1530 train_loss:3.5800 train_time:102112ms step_avg:167.40ms step:621/1530 train_loss:3.9214 
train_time:102284ms step_avg:167.40ms step:622/1530 train_loss:3.5664 train_time:102458ms step_avg:167.41ms step:623/1530 train_loss:3.6014 train_time:102629ms step_avg:167.42ms step:624/1530 train_loss:3.6946 train_time:102801ms step_avg:167.43ms step:625/1530 train_loss:3.7073 train_time:102971ms step_avg:167.43ms step:625/1530 val_loss:3.6190 train_time:103020ms step_avg:167.51ms step:626/1530 train_loss:3.7374 train_time:103144ms step_avg:167.44ms step:627/1530 train_loss:3.7095 train_time:103317ms step_avg:167.45ms step:628/1530 train_loss:3.7520 train_time:103487ms step_avg:167.46ms step:629/1530 train_loss:3.5857 train_time:103660ms step_avg:167.46ms step:630/1530 train_loss:3.7232 train_time:103831ms step_avg:167.47ms step:631/1530 train_loss:3.7402 train_time:104001ms step_avg:167.47ms step:632/1530 train_loss:3.6425 train_time:104173ms step_avg:167.48ms step:633/1530 train_loss:3.6010 train_time:104345ms step_avg:167.49ms step:634/1530 train_loss:3.6974 train_time:104516ms step_avg:167.49ms step:635/1530 train_loss:3.9456 train_time:104686ms step_avg:167.50ms step:636/1530 train_loss:3.5487 train_time:104858ms step_avg:167.50ms step:637/1530 train_loss:3.3462 train_time:105028ms step_avg:167.51ms step:638/1530 train_loss:3.5900 train_time:105198ms step_avg:167.51ms step:639/1530 train_loss:3.6340 train_time:105367ms step_avg:167.52ms step:640/1530 train_loss:3.5651 train_time:105538ms step_avg:167.52ms step:641/1530 train_loss:3.5863 train_time:105707ms step_avg:167.52ms step:642/1530 train_loss:3.6290 train_time:105877ms step_avg:167.53ms step:643/1530 train_loss:3.5957 train_time:106047ms step_avg:167.53ms step:644/1530 train_loss:3.5543 train_time:106217ms step_avg:167.53ms step:645/1530 train_loss:3.7702 train_time:106387ms step_avg:167.54ms step:646/1530 train_loss:3.6704 train_time:106560ms step_avg:167.55ms step:647/1530 train_loss:3.6595 train_time:106729ms step_avg:167.55ms step:648/1530 train_loss:3.7061 train_time:106901ms step_avg:167.56ms 
step:649/1530 train_loss:3.7588 train_time:107073ms step_avg:167.56ms step:650/1530 train_loss:3.6144 train_time:107244ms step_avg:167.57ms step:651/1530 train_loss:3.7673 train_time:107415ms step_avg:167.57ms step:652/1530 train_loss:3.5818 train_time:107585ms step_avg:167.58ms step:653/1530 train_loss:3.6585 train_time:107756ms step_avg:167.58ms step:654/1530 train_loss:3.4276 train_time:107928ms step_avg:167.59ms step:655/1530 train_loss:3.5745 train_time:108098ms step_avg:167.59ms step:656/1530 train_loss:3.5724 train_time:108267ms step_avg:167.60ms step:657/1530 train_loss:3.4947 train_time:108439ms step_avg:167.60ms step:658/1530 train_loss:3.6827 train_time:108608ms step_avg:167.61ms step:659/1530 train_loss:3.5825 train_time:108779ms step_avg:167.61ms step:660/1530 train_loss:3.6834 train_time:108949ms step_avg:167.61ms step:661/1530 train_loss:3.7466 train_time:109121ms step_avg:167.62ms step:662/1530 train_loss:3.6683 train_time:109292ms step_avg:167.63ms step:663/1530 train_loss:3.5507 train_time:109462ms step_avg:167.63ms step:664/1530 train_loss:3.6042 train_time:109635ms step_avg:167.64ms step:665/1530 train_loss:3.4903 train_time:109805ms step_avg:167.64ms step:666/1530 train_loss:3.7776 train_time:109977ms step_avg:167.65ms step:667/1530 train_loss:3.6028 train_time:110148ms step_avg:167.65ms step:668/1530 train_loss:3.6432 train_time:110319ms step_avg:167.66ms step:669/1530 train_loss:3.4873 train_time:110491ms step_avg:167.66ms step:670/1530 train_loss:3.5980 train_time:110662ms step_avg:167.67ms step:671/1530 train_loss:3.5627 train_time:110833ms step_avg:167.67ms step:672/1530 train_loss:3.5581 train_time:111004ms step_avg:167.68ms step:673/1530 train_loss:3.8504 train_time:111175ms step_avg:167.69ms step:674/1530 train_loss:3.6180 train_time:111346ms step_avg:167.69ms step:675/1530 train_loss:3.7064 train_time:111519ms step_avg:167.70ms step:676/1530 train_loss:3.4859 train_time:111691ms step_avg:167.70ms step:677/1530 train_loss:3.5990 
train_time:111863ms step_avg:167.71ms step:678/1530 train_loss:3.5522 train_time:112035ms step_avg:167.72ms step:679/1530 train_loss:3.6765 train_time:112204ms step_avg:167.72ms step:680/1530 train_loss:3.5811 train_time:112376ms step_avg:167.72ms step:681/1530 train_loss:3.6170 train_time:112548ms step_avg:167.73ms step:682/1530 train_loss:3.6590 train_time:112723ms step_avg:167.74ms step:683/1530 train_loss:3.7324 train_time:112896ms step_avg:167.75ms step:684/1530 train_loss:3.6421 train_time:113067ms step_avg:167.76ms step:685/1530 train_loss:3.6829 train_time:113241ms step_avg:167.77ms step:686/1530 train_loss:3.6325 train_time:113414ms step_avg:167.77ms step:687/1530 train_loss:3.6625 train_time:113585ms step_avg:167.78ms step:688/1530 train_loss:3.2076 train_time:113763ms step_avg:167.79ms step:689/1530 train_loss:3.4049 train_time:113937ms step_avg:167.80ms step:690/1530 train_loss:3.5363 train_time:114109ms step_avg:167.81ms step:691/1530 train_loss:3.4025 train_time:114282ms step_avg:167.81ms step:692/1530 train_loss:3.6250 train_time:114454ms step_avg:167.82ms step:693/1530 train_loss:3.6391 train_time:114627ms step_avg:167.83ms step:694/1530 train_loss:3.5515 train_time:114800ms step_avg:167.84ms step:695/1530 train_loss:3.5300 train_time:114970ms step_avg:167.84ms step:696/1530 train_loss:3.8495 train_time:115144ms step_avg:167.85ms step:697/1530 train_loss:3.5830 train_time:115317ms step_avg:167.86ms step:698/1530 train_loss:3.6378 train_time:115489ms step_avg:167.86ms step:699/1530 train_loss:3.7611 train_time:115664ms step_avg:167.87ms step:700/1530 train_loss:3.5667 train_time:115835ms step_avg:167.88ms step:701/1530 train_loss:3.5438 train_time:116007ms step_avg:167.88ms step:702/1530 train_loss:3.5108 train_time:116179ms step_avg:167.89ms step:703/1530 train_loss:3.4937 train_time:116351ms step_avg:167.90ms step:704/1530 train_loss:3.5701 train_time:116523ms step_avg:167.90ms step:705/1530 train_loss:3.5594 train_time:116699ms step_avg:167.91ms 
step:706/1530 train_loss:3.5773 train_time:116875ms step_avg:167.92ms step:707/1530 train_loss:3.6466 train_time:117051ms step_avg:167.94ms step:708/1530 train_loss:3.6001 train_time:117223ms step_avg:167.94ms step:709/1530 train_loss:3.5826 train_time:117398ms step_avg:167.95ms step:710/1530 train_loss:3.5396 train_time:117569ms step_avg:167.96ms step:711/1530 train_loss:3.5908 train_time:117743ms step_avg:167.96ms step:712/1530 train_loss:3.6450 train_time:117919ms step_avg:167.98ms step:713/1530 train_loss:3.6478 train_time:118093ms step_avg:167.98ms step:714/1530 train_loss:3.5541 train_time:118265ms step_avg:167.99ms step:715/1530 train_loss:3.5617 train_time:118438ms step_avg:168.00ms step:716/1530 train_loss:3.5889 train_time:118608ms step_avg:168.00ms step:717/1530 train_loss:3.6977 train_time:118783ms step_avg:168.01ms step:718/1530 train_loss:3.5955 train_time:118956ms step_avg:168.02ms step:719/1530 train_loss:3.6741 train_time:119129ms step_avg:168.02ms step:720/1530 train_loss:3.8468 train_time:119304ms step_avg:168.03ms step:721/1530 train_loss:3.4603 train_time:119477ms step_avg:168.04ms step:722/1530 train_loss:3.7345 train_time:119648ms step_avg:168.05ms step:723/1530 train_loss:3.7615 train_time:119819ms step_avg:168.05ms step:724/1530 train_loss:3.5633 train_time:119992ms step_avg:168.06ms step:725/1530 train_loss:3.6500 train_time:120165ms step_avg:168.06ms step:726/1530 train_loss:3.5294 train_time:120340ms step_avg:168.07ms step:727/1530 train_loss:3.5756 train_time:120515ms step_avg:168.08ms step:728/1530 train_loss:3.7250 train_time:120687ms step_avg:168.09ms step:729/1530 train_loss:3.6692 train_time:120861ms step_avg:168.10ms step:730/1530 train_loss:3.6615 train_time:121034ms step_avg:168.10ms step:731/1530 train_loss:3.5529 train_time:121207ms step_avg:168.11ms step:732/1530 train_loss:3.5927 train_time:121379ms step_avg:168.12ms step:733/1530 train_loss:3.8255 train_time:121553ms step_avg:168.12ms step:734/1530 train_loss:3.5556 
train_time:121727ms step_avg:168.13ms step:735/1530 train_loss:3.6143 train_time:121900ms step_avg:168.14ms step:736/1530 train_loss:3.7376 train_time:122072ms step_avg:168.14ms step:737/1530 train_loss:3.6777 train_time:122244ms step_avg:168.15ms step:738/1530 train_loss:3.5932 train_time:122417ms step_avg:168.15ms step:739/1530 train_loss:3.4994 train_time:122588ms step_avg:168.16ms step:740/1530 train_loss:4.1071 train_time:122765ms step_avg:168.17ms step:741/1530 train_loss:3.4918 train_time:122937ms step_avg:168.18ms step:742/1530 train_loss:3.5499 train_time:123107ms step_avg:168.18ms step:743/1530 train_loss:3.5799 train_time:123280ms step_avg:168.19ms step:744/1530 train_loss:3.6430 train_time:123453ms step_avg:168.19ms step:745/1530 train_loss:3.5742 train_time:123627ms step_avg:168.20ms step:746/1530 train_loss:3.5896 train_time:123799ms step_avg:168.21ms step:747/1530 train_loss:3.6422 train_time:123973ms step_avg:168.21ms step:748/1530 train_loss:3.5631 train_time:124149ms step_avg:168.22ms step:749/1530 train_loss:3.5638 train_time:124320ms step_avg:168.23ms step:750/1530 train_loss:3.5947 train_time:124491ms step_avg:168.23ms step:750/1530 val_loss:3.5614 train_time:124541ms step_avg:168.30ms step:751/1530 train_loss:3.5693 train_time:124664ms step_avg:168.24ms step:752/1530 train_loss:3.6151 train_time:124836ms step_avg:168.24ms step:753/1530 train_loss:3.6120 train_time:125010ms step_avg:168.25ms step:754/1530 train_loss:3.5851 train_time:125182ms step_avg:168.25ms step:755/1530 train_loss:3.6749 train_time:125493ms step_avg:168.45ms step:756/1530 train_loss:3.4502 train_time:125678ms step_avg:168.47ms step:757/1530 train_loss:3.7183 train_time:125850ms step_avg:168.47ms step:758/1530 train_loss:3.6441 train_time:126022ms step_avg:168.48ms step:759/1530 train_loss:3.5879 train_time:126346ms step_avg:168.69ms step:760/1530 train_loss:3.7044 train_time:126516ms step_avg:168.69ms step:761/1530 train_loss:3.4018 train_time:126689ms step_avg:168.69ms 
step:762/1530 train_loss:3.5473 train_time:126861ms step_avg:168.70ms step:763/1530 train_loss:3.6554 train_time:127033ms step_avg:168.70ms step:764/1530 train_loss:3.3150 train_time:127205ms step_avg:168.71ms step:765/1530 train_loss:3.7326 train_time:127378ms step_avg:168.71ms step:766/1530 train_loss:3.5688 train_time:127552ms step_avg:168.72ms step:767/1530 train_loss:3.5662 train_time:127723ms step_avg:168.72ms step:768/1530 train_loss:3.5645 train_time:127899ms step_avg:168.73ms step:769/1530 train_loss:3.5830 train_time:128074ms step_avg:168.74ms step:770/1530 train_loss:3.6392 train_time:128245ms step_avg:168.74ms step:771/1530 train_loss:3.8840 train_time:128418ms step_avg:168.75ms step:772/1530 train_loss:3.4423 train_time:128589ms step_avg:168.75ms step:773/1530 train_loss:3.6276 train_time:128760ms step_avg:168.76ms step:774/1530 train_loss:3.6383 train_time:128934ms step_avg:168.76ms step:775/1530 train_loss:3.6028 train_time:129105ms step_avg:168.77ms step:776/1530 train_loss:3.4039 train_time:129280ms step_avg:168.77ms step:777/1530 train_loss:3.3875 train_time:129454ms step_avg:168.78ms step:778/1530 train_loss:3.4893 train_time:129626ms step_avg:168.78ms step:779/1530 train_loss:3.5794 train_time:129798ms step_avg:168.79ms step:780/1530 train_loss:3.5875 train_time:129972ms step_avg:168.79ms step:781/1530 train_loss:3.6672 train_time:130144ms step_avg:168.80ms step:782/1530 train_loss:3.5828 train_time:130316ms step_avg:168.80ms step:783/1530 train_loss:3.5642 train_time:130487ms step_avg:168.81ms step:784/1530 train_loss:3.6005 train_time:130660ms step_avg:168.81ms step:785/1530 train_loss:3.5551 train_time:130832ms step_avg:168.82ms step:786/1530 train_loss:3.4333 train_time:131004ms step_avg:168.82ms step:787/1530 train_loss:3.7380 train_time:131178ms step_avg:168.83ms step:788/1530 train_loss:3.4927 train_time:131352ms step_avg:168.83ms step:789/1530 train_loss:3.5464 train_time:131523ms step_avg:168.84ms step:790/1530 train_loss:3.6254 
train_time:131698ms step_avg:168.84ms step:791/1530 train_loss:3.7677 train_time:131874ms step_avg:168.85ms step:792/1530 train_loss:3.7583 train_time:132046ms step_avg:168.86ms step:793/1530 train_loss:3.4410 train_time:132216ms step_avg:168.86ms step:794/1530 train_loss:3.5954 train_time:132391ms step_avg:168.87ms step:795/1530 train_loss:3.6687 train_time:132565ms step_avg:168.87ms step:796/1530 train_loss:3.7405 train_time:132741ms step_avg:168.88ms step:797/1530 train_loss:3.5194 train_time:132914ms step_avg:168.89ms step:798/1530 train_loss:3.6469 train_time:133090ms step_avg:168.90ms step:799/1530 train_loss:3.5278 train_time:133266ms step_avg:168.90ms step:800/1530 train_loss:3.5250 train_time:133439ms step_avg:168.91ms step:801/1530 train_loss:3.6252 train_time:133614ms step_avg:168.92ms step:802/1530 train_loss:3.4917 train_time:133791ms step_avg:168.93ms step:803/1530 train_loss:3.4901 train_time:133965ms step_avg:168.93ms step:804/1530 train_loss:3.6217 train_time:134139ms step_avg:168.94ms step:805/1530 train_loss:3.5138 train_time:134315ms step_avg:168.95ms step:806/1530 train_loss:3.5566 train_time:134488ms step_avg:168.95ms step:807/1530 train_loss:3.6401 train_time:134661ms step_avg:168.96ms step:808/1530 train_loss:3.5386 train_time:134837ms step_avg:168.97ms step:809/1530 train_loss:3.4873 train_time:135009ms step_avg:168.97ms step:810/1530 train_loss:3.5597 train_time:135181ms step_avg:168.98ms step:811/1530 train_loss:3.5825 train_time:135355ms step_avg:168.98ms step:812/1530 train_loss:3.6021 train_time:135528ms step_avg:168.99ms step:813/1530 train_loss:3.6234 train_time:135699ms step_avg:168.99ms step:814/1530 train_loss:3.5619 train_time:135875ms step_avg:169.00ms step:815/1530 train_loss:3.5618 train_time:136047ms step_avg:169.00ms step:816/1530 train_loss:3.6826 train_time:136220ms step_avg:169.01ms step:817/1530 train_loss:3.7605 train_time:136395ms step_avg:169.02ms step:818/1530 train_loss:3.5256 train_time:136568ms step_avg:169.02ms 
step:819/1530 train_loss:3.7182 train_time:136743ms step_avg:169.03ms step:820/1530 train_loss:3.4940 train_time:136919ms step_avg:169.04ms step:821/1530 train_loss:3.5587 train_time:137092ms step_avg:169.04ms step:822/1530 train_loss:3.6938 train_time:137269ms step_avg:169.05ms step:823/1530 train_loss:3.5719 train_time:137441ms step_avg:169.05ms step:824/1530 train_loss:3.5096 train_time:137615ms step_avg:169.06ms step:825/1530 train_loss:3.6157 train_time:137791ms step_avg:169.07ms step:826/1530 train_loss:3.4757 train_time:137967ms step_avg:169.08ms step:827/1530 train_loss:3.7293 train_time:138140ms step_avg:169.08ms step:828/1530 train_loss:3.6158 train_time:138314ms step_avg:169.09ms step:829/1530 train_loss:3.6223 train_time:138491ms step_avg:169.10ms step:830/1530 train_loss:3.5311 train_time:138665ms step_avg:169.10ms step:831/1530 train_loss:3.5959 train_time:138839ms step_avg:169.11ms step:832/1530 train_loss:3.5115 train_time:139015ms step_avg:169.12ms step:833/1530 train_loss:3.6464 train_time:139192ms step_avg:169.13ms step:834/1530 train_loss:3.4663 train_time:139365ms step_avg:169.13ms step:835/1530 train_loss:3.4521 train_time:139538ms step_avg:169.14ms step:836/1530 train_loss:3.7085 train_time:139715ms step_avg:169.15ms step:837/1530 train_loss:3.3943 train_time:139889ms step_avg:169.15ms step:838/1530 train_loss:3.5873 train_time:140062ms step_avg:169.16ms step:839/1530 train_loss:3.4125 train_time:140236ms step_avg:169.16ms step:840/1530 train_loss:3.4688 train_time:140408ms step_avg:169.17ms step:841/1530 train_loss:3.5718 train_time:140582ms step_avg:169.17ms step:842/1530 train_loss:3.5725 train_time:140758ms step_avg:169.18ms step:843/1530 train_loss:3.5547 train_time:140932ms step_avg:169.19ms step:844/1530 train_loss:3.4262 train_time:141106ms step_avg:169.19ms step:845/1530 train_loss:3.6615 train_time:141281ms step_avg:169.20ms step:846/1530 train_loss:3.5130 train_time:141456ms step_avg:169.21ms step:847/1530 train_loss:3.4886 
train_time:141631ms step_avg:169.21ms step:848/1530 train_loss:3.6337 train_time:141804ms step_avg:169.22ms step:849/1530 train_loss:3.4850 train_time:141978ms step_avg:169.22ms step:850/1530 train_loss:3.4452 train_time:142152ms step_avg:169.23ms step:851/1530 train_loss:3.7261 train_time:142324ms step_avg:169.23ms step:852/1530 train_loss:3.4425 train_time:142498ms step_avg:169.24ms step:853/1530 train_loss:3.5631 train_time:142670ms step_avg:169.24ms step:854/1530 train_loss:3.6471 train_time:142844ms step_avg:169.25ms step:855/1530 train_loss:3.5042 train_time:143019ms step_avg:169.25ms step:856/1530 train_loss:3.5407 train_time:143194ms step_avg:169.26ms step:857/1530 train_loss:3.6007 train_time:143369ms step_avg:169.27ms step:858/1530 train_loss:3.4564 train_time:143543ms step_avg:169.27ms step:859/1530 train_loss:3.5520 train_time:143717ms step_avg:169.28ms step:860/1530 train_loss:3.5842 train_time:143891ms step_avg:169.28ms step:861/1530 train_loss:3.6256 train_time:144067ms step_avg:169.29ms step:862/1530 train_loss:3.6012 train_time:144244ms step_avg:169.30ms step:863/1530 train_loss:3.5645 train_time:144420ms step_avg:169.31ms step:864/1530 train_loss:3.3789 train_time:144594ms step_avg:169.31ms step:865/1530 train_loss:3.5963 train_time:144767ms step_avg:169.32ms step:866/1530 train_loss:3.8734 train_time:144943ms step_avg:169.33ms step:867/1530 train_loss:3.4534 train_time:145116ms step_avg:169.33ms step:868/1530 train_loss:3.6337 train_time:145290ms step_avg:169.34ms step:869/1530 train_loss:3.6170 train_time:145462ms step_avg:169.34ms step:870/1530 train_loss:3.4434 train_time:145637ms step_avg:169.35ms step:871/1530 train_loss:3.4092 train_time:145812ms step_avg:169.35ms step:872/1530 train_loss:3.6406 train_time:145988ms step_avg:169.36ms step:873/1530 train_loss:3.4538 train_time:146161ms step_avg:169.36ms step:874/1530 train_loss:3.2223 train_time:146338ms step_avg:169.37ms step:875/1530 train_loss:3.6290 train_time:146513ms step_avg:169.38ms 
step:875/1530 val_loss:3.5155 train_time:146563ms step_avg:169.44ms step:876/1530 train_loss:3.4304 train_time:146687ms step_avg:169.38ms step:877/1530 train_loss:3.6118 train_time:146864ms step_avg:169.39ms step:878/1530 train_loss:3.4649 train_time:147038ms step_avg:169.40ms step:879/1530 train_loss:3.6439 train_time:147210ms step_avg:169.40ms step:880/1530 train_loss:3.3027 train_time:147382ms step_avg:169.41ms step:881/1530 train_loss:3.4720 train_time:147555ms step_avg:169.41ms step:882/1530 train_loss:3.6887 train_time:147729ms step_avg:169.41ms step:883/1530 train_loss:3.8364 train_time:147904ms step_avg:169.42ms step:884/1530 train_loss:3.5660 train_time:148078ms step_avg:169.43ms step:885/1530 train_loss:3.4905 train_time:148251ms step_avg:169.43ms step:886/1530 train_loss:3.5681 train_time:148426ms step_avg:169.44ms step:887/1530 train_loss:4.0756 train_time:148601ms step_avg:169.44ms step:888/1530 train_loss:3.8293 train_time:148782ms step_avg:169.46ms step:889/1530 train_loss:3.5164 train_time:148955ms step_avg:169.46ms step:890/1530 train_loss:3.5302 train_time:149128ms step_avg:169.46ms step:891/1530 train_loss:3.3562 train_time:149302ms step_avg:169.47ms step:892/1530 train_loss:3.7110 train_time:149475ms step_avg:169.47ms step:893/1530 train_loss:3.4147 train_time:149647ms step_avg:169.48ms step:894/1530 train_loss:3.6299 train_time:149824ms step_avg:169.48ms step:895/1530 train_loss:3.6703 train_time:149998ms step_avg:169.49ms step:896/1530 train_loss:3.4960 train_time:150170ms step_avg:169.49ms step:897/1530 train_loss:3.5402 train_time:150346ms step_avg:169.50ms step:898/1530 train_loss:3.5862 train_time:150523ms step_avg:169.51ms step:899/1530 train_loss:3.4741 train_time:150695ms step_avg:169.51ms step:900/1530 train_loss:3.4243 train_time:150867ms step_avg:169.51ms step:901/1530 train_loss:3.6138 train_time:151041ms step_avg:169.52ms step:902/1530 train_loss:3.6347 train_time:151214ms step_avg:169.52ms step:903/1530 train_loss:3.5388 
train_time:151390ms step_avg:169.53ms step:904/1530 train_loss:3.4886 train_time:151564ms step_avg:169.53ms step:905/1530 train_loss:3.5041 train_time:151734ms step_avg:169.54ms step:906/1530 train_loss:3.7022 train_time:151909ms step_avg:169.54ms step:907/1530 train_loss:3.5136 train_time:152084ms step_avg:169.55ms step:908/1530 train_loss:3.5585 train_time:152256ms step_avg:169.55ms step:909/1530 train_loss:3.4538 train_time:152432ms step_avg:169.56ms step:910/1530 train_loss:3.5232 train_time:152613ms step_avg:169.57ms step:911/1530 train_loss:3.6439 train_time:152789ms step_avg:169.58ms step:912/1530 train_loss:3.5937 train_time:152967ms step_avg:169.59ms step:913/1530 train_loss:3.4577 train_time:153146ms step_avg:169.60ms step:914/1530 train_loss:3.7443 train_time:153324ms step_avg:169.61ms step:915/1530 train_loss:3.5275 train_time:153506ms step_avg:169.62ms step:916/1530 train_loss:3.6140 train_time:153680ms step_avg:169.62ms step:917/1530 train_loss:3.5964 train_time:153853ms step_avg:169.63ms step:918/1530 train_loss:4.8155 train_time:154033ms step_avg:169.64ms step:919/1530 train_loss:3.4953 train_time:154212ms step_avg:169.65ms step:920/1530 train_loss:3.5864 train_time:154387ms step_avg:169.66ms step:921/1530 train_loss:3.5477 train_time:154565ms step_avg:169.67ms step:922/1530 train_loss:3.5768 train_time:154742ms step_avg:169.67ms step:923/1530 train_loss:3.6077 train_time:154918ms step_avg:169.68ms step:924/1530 train_loss:3.6778 train_time:155094ms step_avg:169.69ms step:925/1530 train_loss:3.6452 train_time:155268ms step_avg:169.69ms step:926/1530 train_loss:3.5538 train_time:155442ms step_avg:169.70ms step:927/1530 train_loss:3.5511 train_time:155616ms step_avg:169.70ms step:928/1530 train_loss:3.7748 train_time:155792ms step_avg:169.71ms step:929/1530 train_loss:3.6067 train_time:155966ms step_avg:169.71ms step:930/1530 train_loss:3.3970 train_time:156143ms step_avg:169.72ms step:931/1530 train_loss:3.4907 train_time:156317ms step_avg:169.73ms 
step:932/1530 train_loss:3.6434 train_time:156494ms step_avg:169.73ms step:933/1530 train_loss:3.3611 train_time:156669ms step_avg:169.74ms step:934/1530 train_loss:3.5745 train_time:156846ms step_avg:169.75ms step:935/1530 train_loss:3.4357 train_time:157024ms step_avg:169.76ms step:936/1530 train_loss:3.5135 train_time:157201ms step_avg:169.76ms step:937/1530 train_loss:3.6146 train_time:157379ms step_avg:169.77ms step:938/1530 train_loss:3.5388 train_time:157552ms step_avg:169.78ms step:939/1530 train_loss:3.6680 train_time:157732ms step_avg:169.79ms step:940/1530 train_loss:3.4811 train_time:157908ms step_avg:169.79ms step:941/1530 train_loss:3.5453 train_time:158083ms step_avg:169.80ms step:942/1530 train_loss:3.3515 train_time:158260ms step_avg:169.81ms step:943/1530 train_loss:3.7122 train_time:158439ms step_avg:169.82ms step:944/1530 train_loss:3.3960 train_time:158755ms step_avg:169.97ms step:945/1530 train_loss:3.4193 train_time:158937ms step_avg:169.99ms step:946/1530 train_loss:5.0804 train_time:159118ms step_avg:170.00ms step:947/1530 train_loss:3.5960 train_time:159294ms step_avg:170.00ms step:948/1530 train_loss:3.4835 train_time:159468ms step_avg:170.01ms step:949/1530 train_loss:3.3696 train_time:159793ms step_avg:170.17ms step:950/1530 train_loss:3.4390 train_time:159968ms step_avg:170.18ms step:951/1530 train_loss:3.3992 train_time:160144ms step_avg:170.18ms step:952/1530 train_loss:3.4759 train_time:160321ms step_avg:170.19ms step:953/1530 train_loss:3.5610 train_time:160498ms step_avg:170.20ms step:954/1530 train_loss:3.4366 train_time:160676ms step_avg:170.21ms step:955/1530 train_loss:3.4714 train_time:160849ms step_avg:170.21ms step:956/1530 train_loss:3.4361 train_time:161027ms step_avg:170.22ms step:957/1530 train_loss:3.4927 train_time:161207ms step_avg:170.23ms step:958/1530 train_loss:3.4998 train_time:161386ms step_avg:170.24ms step:959/1530 train_loss:3.5045 train_time:161564ms step_avg:170.25ms step:960/1530 train_loss:3.4057 
train_time:161739ms step_avg:170.25ms step:961/1530 train_loss:3.6415 train_time:161914ms step_avg:170.26ms step:962/1530 train_loss:3.5891 train_time:162088ms step_avg:170.26ms step:963/1530 train_loss:3.6949 train_time:162266ms step_avg:170.27ms step:964/1530 train_loss:3.4237 train_time:162444ms step_avg:170.28ms step:965/1530 train_loss:3.4769 train_time:162618ms step_avg:170.28ms step:966/1530 train_loss:3.7058 train_time:162794ms step_avg:170.29ms step:967/1530 train_loss:3.5158 train_time:162968ms step_avg:170.29ms step:968/1530 train_loss:3.5149 train_time:163143ms step_avg:170.29ms step:969/1530 train_loss:3.5790 train_time:163317ms step_avg:170.30ms step:970/1530 train_loss:3.3721 train_time:163490ms step_avg:170.30ms step:971/1530 train_loss:3.5300 train_time:163665ms step_avg:170.31ms step:972/1530 train_loss:3.4820 train_time:163837ms step_avg:170.31ms step:973/1530 train_loss:3.5385 train_time:164011ms step_avg:170.31ms step:974/1530 train_loss:3.5875 train_time:164188ms step_avg:170.32ms step:975/1530 train_loss:3.4568 train_time:164364ms step_avg:170.33ms step:976/1530 train_loss:3.6682 train_time:164537ms step_avg:170.33ms step:977/1530 train_loss:3.5677 train_time:164710ms step_avg:170.33ms step:978/1530 train_loss:3.3552 train_time:164886ms step_avg:170.34ms step:979/1530 train_loss:3.6173 train_time:165062ms step_avg:170.34ms step:980/1530 train_loss:3.4122 train_time:165238ms step_avg:170.35ms step:981/1530 train_loss:3.5689 train_time:165418ms step_avg:170.36ms step:982/1530 train_loss:3.5425 train_time:165591ms step_avg:170.36ms step:983/1530 train_loss:3.5193 train_time:165766ms step_avg:170.37ms step:984/1530 train_loss:3.4925 train_time:165940ms step_avg:170.37ms step:985/1530 train_loss:3.5768 train_time:166119ms step_avg:170.38ms step:986/1530 train_loss:3.4109 train_time:166296ms step_avg:170.39ms step:987/1530 train_loss:3.4833 train_time:166468ms step_avg:170.39ms step:988/1530 train_loss:3.4542 train_time:166642ms step_avg:170.39ms 
step:989/1530 train_loss:3.4098 train_time:166815ms step_avg:170.39ms step:990/1530 train_loss:3.6575 train_time:166992ms step_avg:170.40ms step:991/1530 train_loss:3.4667 train_time:167168ms step_avg:170.41ms step:992/1530 train_loss:3.4377 train_time:167348ms step_avg:170.42ms step:993/1530 train_loss:3.4910 train_time:167527ms step_avg:170.42ms step:994/1530 train_loss:3.5923 train_time:167702ms step_avg:170.43ms step:995/1530 train_loss:3.5266 train_time:167874ms step_avg:170.43ms step:996/1530 train_loss:3.4489 train_time:168046ms step_avg:170.43ms step:997/1530 train_loss:3.7496 train_time:168222ms step_avg:170.44ms step:998/1530 train_loss:3.4341 train_time:168393ms step_avg:170.44ms step:999/1530 train_loss:3.5854 train_time:168567ms step_avg:170.44ms step:1000/1530 train_loss:3.4385 train_time:168745ms step_avg:170.45ms step:1000/1530 val_loss:3.4624 train_time:168796ms step_avg:170.50ms step:1001/1530 train_loss:3.4967 train_time:168921ms step_avg:170.46ms step:1002/1530 train_loss:3.3712 train_time:169094ms step_avg:170.46ms step:1003/1530 train_loss:3.5510 train_time:169269ms step_avg:170.46ms step:1004/1530 train_loss:3.5987 train_time:169446ms step_avg:170.47ms step:1005/1530 train_loss:3.3856 train_time:169621ms step_avg:170.47ms step:1006/1530 train_loss:3.4588 train_time:169798ms step_avg:170.48ms step:1007/1530 train_loss:3.4296 train_time:169973ms step_avg:170.48ms step:1008/1530 train_loss:3.5560 train_time:170148ms step_avg:170.49ms step:1009/1530 train_loss:3.6638 train_time:170327ms step_avg:170.50ms step:1010/1530 train_loss:3.5605 train_time:170501ms step_avg:170.50ms step:1011/1530 train_loss:3.5320 train_time:170676ms step_avg:170.51ms step:1012/1530 train_loss:3.3847 train_time:170850ms step_avg:170.51ms step:1013/1530 train_loss:3.5320 train_time:171026ms step_avg:170.51ms step:1014/1530 train_loss:3.6183 train_time:171202ms step_avg:170.52ms step:1015/1530 train_loss:3.3238 train_time:171379ms step_avg:170.53ms step:1016/1530 
train_loss:3.4025 train_time:171552ms step_avg:170.53ms step:1017/1530 train_loss:3.3937 train_time:171730ms step_avg:170.54ms step:1018/1530 train_loss:3.3894 train_time:171906ms step_avg:170.54ms step:1019/1530 train_loss:3.5124 train_time:172082ms step_avg:170.55ms step:1020/1530 train_loss:3.3728 train_time:172260ms step_avg:170.55ms step:1021/1530 train_loss:3.3484 train_time:172435ms step_avg:170.56ms step:1022/1530 train_loss:3.4734 train_time:172612ms step_avg:170.57ms step:1023/1530 train_loss:3.4995 train_time:172787ms step_avg:170.57ms step:1024/1530 train_loss:3.4712 train_time:172964ms step_avg:170.58ms step:1025/1530 train_loss:3.4708 train_time:173142ms step_avg:170.58ms step:1026/1530 train_loss:3.6160 train_time:173319ms step_avg:170.59ms step:1027/1530 train_loss:3.3170 train_time:173493ms step_avg:170.59ms step:1028/1530 train_loss:3.3971 train_time:173673ms step_avg:170.60ms step:1029/1530 train_loss:3.3051 train_time:173855ms step_avg:170.61ms step:1030/1530 train_loss:3.5366 train_time:174032ms step_avg:170.62ms step:1031/1530 train_loss:3.5084 train_time:174208ms step_avg:170.62ms step:1032/1530 train_loss:3.6896 train_time:174389ms step_avg:170.63ms step:1033/1530 train_loss:3.4834 train_time:174564ms step_avg:170.64ms step:1034/1530 train_loss:3.3939 train_time:174741ms step_avg:170.65ms step:1035/1530 train_loss:3.4418 train_time:174920ms step_avg:170.65ms step:1036/1530 train_loss:3.4790 train_time:175096ms step_avg:170.66ms step:1037/1530 train_loss:3.7827 train_time:175274ms step_avg:170.67ms step:1038/1530 train_loss:3.6065 train_time:175452ms step_avg:170.67ms step:1039/1530 train_loss:3.5064 train_time:175635ms step_avg:170.68ms step:1040/1530 train_loss:3.4111 train_time:175808ms step_avg:170.69ms step:1041/1530 train_loss:3.4892 train_time:175986ms step_avg:170.69ms step:1042/1530 train_loss:3.5179 train_time:176159ms step_avg:170.70ms step:1043/1530 train_loss:3.4476 train_time:176335ms step_avg:170.70ms step:1044/1530 
train_loss:3.4484 train_time:176512ms step_avg:170.71ms step:1045/1530 train_loss:3.5092 train_time:176689ms step_avg:170.71ms step:1046/1530 train_loss:3.4235 train_time:176865ms step_avg:170.72ms step:1047/1530 train_loss:3.6252 train_time:177043ms step_avg:170.73ms step:1048/1530 train_loss:3.4948 train_time:177221ms step_avg:170.73ms step:1049/1530 train_loss:3.4013 train_time:177396ms step_avg:170.74ms step:1050/1530 train_loss:3.3862 train_time:177574ms step_avg:170.74ms step:1051/1530 train_loss:3.4904 train_time:177750ms step_avg:170.75ms step:1052/1530 train_loss:3.3563 train_time:177929ms step_avg:170.76ms step:1053/1530 train_loss:3.6855 train_time:178106ms step_avg:170.76ms step:1054/1530 train_loss:3.5328 train_time:178284ms step_avg:170.77ms step:1055/1530 train_loss:3.3767 train_time:178459ms step_avg:170.77ms step:1056/1530 train_loss:3.4926 train_time:178634ms step_avg:170.78ms step:1057/1530 train_loss:3.5698 train_time:178811ms step_avg:170.78ms step:1058/1530 train_loss:3.3075 train_time:178988ms step_avg:170.79ms step:1059/1530 train_loss:3.3623 train_time:179169ms step_avg:170.80ms step:1060/1530 train_loss:3.4313 train_time:179345ms step_avg:170.80ms step:1061/1530 train_loss:3.4149 train_time:179521ms step_avg:170.81ms step:1062/1530 train_loss:3.3753 train_time:179696ms step_avg:170.81ms step:1063/1530 train_loss:3.4576 train_time:179871ms step_avg:170.82ms step:1064/1530 train_loss:3.3788 train_time:180045ms step_avg:170.82ms step:1065/1530 train_loss:3.3552 train_time:180224ms step_avg:170.83ms step:1066/1530 train_loss:3.4075 train_time:180401ms step_avg:170.83ms step:1067/1530 train_loss:3.2763 train_time:180580ms step_avg:170.84ms step:1068/1530 train_loss:3.4281 train_time:180756ms step_avg:170.85ms step:1069/1530 train_loss:3.2923 train_time:180937ms step_avg:170.86ms step:1070/1530 train_loss:3.5653 train_time:181112ms step_avg:170.86ms step:1071/1530 train_loss:3.5110 train_time:181292ms step_avg:170.87ms step:1072/1530 
train_loss:3.4320 train_time:181468ms step_avg:170.87ms step:1073/1530 train_loss:3.5166 train_time:181642ms step_avg:170.88ms step:1074/1530 train_loss:3.4247 train_time:181819ms step_avg:170.88ms step:1075/1530 train_loss:3.3947 train_time:181996ms step_avg:170.89ms step:1076/1530 train_loss:3.7916 train_time:182172ms step_avg:170.89ms step:1077/1530 train_loss:3.4236 train_time:182346ms step_avg:170.90ms step:1078/1530 train_loss:3.0804 train_time:182532ms step_avg:170.91ms step:1079/1530 train_loss:3.5281 train_time:182707ms step_avg:170.91ms step:1080/1530 train_loss:3.4275 train_time:182885ms step_avg:170.92ms step:1081/1530 train_loss:3.4945 train_time:183059ms step_avg:170.92ms step:1082/1530 train_loss:3.5856 train_time:183236ms step_avg:170.93ms step:1083/1530 train_loss:3.4900 train_time:183410ms step_avg:170.93ms step:1084/1530 train_loss:3.4597 train_time:183587ms step_avg:170.94ms step:1085/1530 train_loss:3.4254 train_time:183763ms step_avg:170.94ms step:1086/1530 train_loss:3.6232 train_time:183940ms step_avg:170.95ms step:1087/1530 train_loss:3.4928 train_time:184113ms step_avg:170.95ms step:1088/1530 train_loss:3.3651 train_time:184290ms step_avg:170.96ms step:1089/1530 train_loss:3.3725 train_time:184468ms step_avg:170.96ms step:1090/1530 train_loss:3.4777 train_time:184647ms step_avg:170.97ms step:1091/1530 train_loss:3.2798 train_time:184825ms step_avg:170.98ms step:1092/1530 train_loss:3.4812 train_time:185004ms step_avg:170.98ms step:1093/1530 train_loss:3.5971 train_time:185181ms step_avg:170.99ms step:1094/1530 train_loss:3.4456 train_time:185356ms step_avg:170.99ms step:1095/1530 train_loss:3.4189 train_time:185531ms step_avg:171.00ms step:1096/1530 train_loss:3.4211 train_time:185708ms step_avg:171.00ms step:1097/1530 train_loss:3.4876 train_time:185885ms step_avg:171.01ms step:1098/1530 train_loss:3.5554 train_time:186062ms step_avg:171.01ms step:1099/1530 train_loss:3.5231 train_time:186240ms step_avg:171.02ms step:1100/1530 
train_loss:3.4248 train_time:186420ms step_avg:171.03ms step:1101/1530 train_loss:3.2884 train_time:186598ms step_avg:171.03ms step:1102/1530 train_loss:3.3001 train_time:186776ms step_avg:171.04ms step:1103/1530 train_loss:3.4374 train_time:186959ms step_avg:171.05ms step:1104/1530 train_loss:3.3174 train_time:187135ms step_avg:171.06ms step:1105/1530 train_loss:4.0578 train_time:187313ms step_avg:171.06ms step:1106/1530 train_loss:3.2239 train_time:187489ms step_avg:171.07ms step:1107/1530 train_loss:3.5654 train_time:187664ms step_avg:171.07ms step:1108/1530 train_loss:3.3439 train_time:187840ms step_avg:171.08ms step:1109/1530 train_loss:3.4975 train_time:188017ms step_avg:171.08ms step:1110/1530 train_loss:3.4213 train_time:188190ms step_avg:171.08ms step:1111/1530 train_loss:3.4781 train_time:188365ms step_avg:171.09ms step:1112/1530 train_loss:3.5576 train_time:188545ms step_avg:171.09ms step:1113/1530 train_loss:3.4282 train_time:188728ms step_avg:171.10ms step:1114/1530 train_loss:3.3766 train_time:188908ms step_avg:171.11ms step:1115/1530 train_loss:3.2345 train_time:189086ms step_avg:171.12ms step:1116/1530 train_loss:3.4271 train_time:189260ms step_avg:171.12ms step:1117/1530 train_loss:3.5841 train_time:189439ms step_avg:171.13ms step:1118/1530 train_loss:3.6230 train_time:189616ms step_avg:171.13ms step:1119/1530 train_loss:3.4757 train_time:189789ms step_avg:171.14ms step:1120/1530 train_loss:3.4874 train_time:189966ms step_avg:171.14ms step:1121/1530 train_loss:3.3866 train_time:190143ms step_avg:171.15ms step:1122/1530 train_loss:3.4540 train_time:190319ms step_avg:171.15ms step:1123/1530 train_loss:3.5754 train_time:190494ms step_avg:171.15ms step:1124/1530 train_loss:3.3385 train_time:190670ms step_avg:171.16ms step:1125/1530 train_loss:3.2156 train_time:190846ms step_avg:171.16ms step:1125/1530 val_loss:3.4056 train_time:190896ms step_avg:171.21ms step:1126/1530 train_loss:3.4707 train_time:191021ms step_avg:171.17ms step:1127/1530 
train_loss:3.6714 train_time:191200ms step_avg:171.17ms step:1128/1530 train_loss:3.2238 train_time:191379ms step_avg:171.18ms step:1129/1530 train_loss:3.5552 train_time:191560ms step_avg:171.19ms step:1130/1530 train_loss:3.3797 train_time:191738ms step_avg:171.19ms step:1131/1530 train_loss:3.3948 train_time:191918ms step_avg:171.20ms step:1132/1530 train_loss:3.3594 train_time:192093ms step_avg:171.21ms step:1133/1530 train_loss:3.4826 train_time:192408ms step_avg:171.33ms step:1134/1530 train_loss:3.4426 train_time:192592ms step_avg:171.35ms step:1135/1530 train_loss:3.5183 train_time:192771ms step_avg:171.35ms step:1136/1530 train_loss:3.5583 train_time:192949ms step_avg:171.36ms step:1137/1530 train_loss:3.4540 train_time:193125ms step_avg:171.36ms step:1138/1530 train_loss:3.3495 train_time:193304ms step_avg:171.37ms step:1139/1530 train_loss:3.6529 train_time:193635ms step_avg:171.51ms step:1140/1530 train_loss:3.4518 train_time:193809ms step_avg:171.51ms step:1141/1530 train_loss:3.5914 train_time:193989ms step_avg:171.52ms step:1142/1530 train_loss:3.4368 train_time:194165ms step_avg:171.52ms step:1143/1530 train_loss:3.3592 train_time:194345ms step_avg:171.53ms step:1144/1530 train_loss:3.4421 train_time:194521ms step_avg:171.54ms step:1145/1530 train_loss:3.5819 train_time:194695ms step_avg:171.54ms step:1146/1530 train_loss:3.5481 train_time:194877ms step_avg:171.55ms step:1147/1530 train_loss:3.4756 train_time:195056ms step_avg:171.55ms step:1148/1530 train_loss:3.4907 train_time:195234ms step_avg:171.56ms step:1149/1530 train_loss:3.3193 train_time:195414ms step_avg:171.57ms step:1150/1530 train_loss:3.3750 train_time:195590ms step_avg:171.57ms step:1151/1530 train_loss:3.3139 train_time:195769ms step_avg:171.58ms step:1152/1530 train_loss:3.3931 train_time:195951ms step_avg:171.59ms step:1153/1530 train_loss:3.4180 train_time:196132ms step_avg:171.59ms step:1154/1530 train_loss:3.5128 train_time:196308ms step_avg:171.60ms step:1155/1530 
train_loss:3.3183 train_time:196490ms step_avg:171.61ms step:1156/1530 train_loss:3.5334 train_time:196674ms step_avg:171.62ms step:1157/1530 train_loss:3.4905 train_time:196852ms step_avg:171.62ms step:1158/1530 train_loss:3.2469 train_time:197029ms step_avg:171.63ms step:1159/1530 train_loss:3.3418 train_time:197205ms step_avg:171.63ms step:1160/1530 train_loss:3.3364 train_time:197379ms step_avg:171.63ms step:1161/1530 train_loss:3.0739 train_time:197560ms step_avg:171.64ms step:1162/1530 train_loss:3.4191 train_time:197737ms step_avg:171.65ms step:1163/1530 train_loss:3.3916 train_time:197916ms step_avg:171.65ms step:1164/1530 train_loss:3.2889 train_time:198094ms step_avg:171.66ms step:1165/1530 train_loss:3.2452 train_time:198270ms step_avg:171.66ms step:1166/1530 train_loss:3.3846 train_time:198451ms step_avg:171.67ms step:1167/1530 train_loss:3.4071 train_time:198626ms step_avg:171.67ms step:1168/1530 train_loss:3.7177 train_time:198801ms step_avg:171.68ms step:1169/1530 train_loss:3.3709 train_time:198978ms step_avg:171.68ms step:1170/1530 train_loss:3.3878 train_time:199155ms step_avg:171.69ms step:1171/1530 train_loss:3.3161 train_time:199331ms step_avg:171.69ms step:1172/1530 train_loss:3.4177 train_time:199505ms step_avg:171.69ms step:1173/1530 train_loss:3.5307 train_time:199684ms step_avg:171.70ms step:1174/1530 train_loss:3.3801 train_time:199869ms step_avg:171.71ms step:1175/1530 train_loss:3.3592 train_time:200049ms step_avg:171.72ms step:1176/1530 train_loss:3.4216 train_time:200229ms step_avg:171.72ms step:1177/1530 train_loss:3.4486 train_time:200413ms step_avg:171.73ms step:1178/1530 train_loss:3.4966 train_time:200589ms step_avg:171.74ms step:1179/1530 train_loss:3.3970 train_time:200763ms step_avg:171.74ms step:1180/1530 train_loss:3.3459 train_time:200950ms step_avg:171.75ms step:1181/1530 train_loss:3.3340 train_time:201128ms step_avg:171.76ms step:1182/1530 train_loss:3.3711 train_time:201305ms step_avg:171.76ms step:1183/1530 
train_loss:3.3292 train_time:201485ms step_avg:171.77ms step:1184/1530 train_loss:3.5035 train_time:201661ms step_avg:171.77ms step:1185/1530 train_loss:3.5386 train_time:201844ms step_avg:171.78ms step:1186/1530 train_loss:3.3610 train_time:202024ms step_avg:171.79ms step:1187/1530 train_loss:3.4140 train_time:202210ms step_avg:171.80ms step:1188/1530 train_loss:3.4371 train_time:202386ms step_avg:171.80ms step:1189/1530 train_loss:3.2742 train_time:202565ms step_avg:171.81ms step:1190/1530 train_loss:3.4403 train_time:202744ms step_avg:171.82ms step:1191/1530 train_loss:3.5771 train_time:202923ms step_avg:171.82ms step:1192/1530 train_loss:3.3880 train_time:203097ms step_avg:171.82ms step:1193/1530 train_loss:3.2709 train_time:203272ms step_avg:171.83ms step:1194/1530 train_loss:3.5505 train_time:203450ms step_avg:171.83ms step:1195/1530 train_loss:3.3636 train_time:203630ms step_avg:171.84ms step:1196/1530 train_loss:3.3821 train_time:203817ms step_avg:171.85ms step:1197/1530 train_loss:3.2843 train_time:203997ms step_avg:171.86ms step:1198/1530 train_loss:3.2986 train_time:204181ms step_avg:171.87ms step:1199/1530 train_loss:3.3397 train_time:204362ms step_avg:171.88ms step:1200/1530 train_loss:3.4415 train_time:204539ms step_avg:171.88ms step:1201/1530 train_loss:3.4788 train_time:204717ms step_avg:171.89ms step:1202/1530 train_loss:3.5959 train_time:204905ms step_avg:171.90ms step:1203/1530 train_loss:3.4044 train_time:205085ms step_avg:171.91ms step:1204/1530 train_loss:3.2984 train_time:205266ms step_avg:171.91ms step:1205/1530 train_loss:3.4358 train_time:205443ms step_avg:171.92ms step:1206/1530 train_loss:3.4716 train_time:205619ms step_avg:171.92ms step:1207/1530 train_loss:3.5115 train_time:205797ms step_avg:171.93ms step:1208/1530 train_loss:3.3921 train_time:205973ms step_avg:171.93ms step:1209/1530 train_loss:3.2450 train_time:206154ms step_avg:171.94ms step:1210/1530 train_loss:3.2975 train_time:206333ms step_avg:171.94ms step:1211/1530 
train_loss:3.3923 train_time:206509ms step_avg:171.95ms step:1212/1530 train_loss:3.3922 train_time:206685ms step_avg:171.95ms step:1213/1530 train_loss:3.4036 train_time:206863ms step_avg:171.96ms step:1214/1530 train_loss:3.2471 train_time:207045ms step_avg:171.96ms step:1215/1530 train_loss:3.3924 train_time:207221ms step_avg:171.97ms step:1216/1530 train_loss:3.3317 train_time:207397ms step_avg:171.97ms step:1217/1530 train_loss:3.3150 train_time:207575ms step_avg:171.98ms step:1218/1530 train_loss:3.4029 train_time:207755ms step_avg:171.98ms step:1219/1530 train_loss:3.2499 train_time:207937ms step_avg:171.99ms step:1220/1530 train_loss:3.4744 train_time:208113ms step_avg:171.99ms step:1221/1530 train_loss:3.4978 train_time:208288ms step_avg:172.00ms step:1222/1530 train_loss:3.4257 train_time:208462ms step_avg:172.00ms step:1223/1530 train_loss:3.2931 train_time:208641ms step_avg:172.00ms step:1224/1530 train_loss:3.2510 train_time:208823ms step_avg:172.01ms step:1225/1530 train_loss:3.3610 train_time:209000ms step_avg:172.02ms step:1226/1530 train_loss:3.3262 train_time:209179ms step_avg:172.02ms step:1227/1530 train_loss:3.2719 train_time:209359ms step_avg:172.03ms step:1228/1530 train_loss:3.4447 train_time:209534ms step_avg:172.03ms step:1229/1530 train_loss:3.3645 train_time:209713ms step_avg:172.04ms step:1230/1530 train_loss:3.3977 train_time:209895ms step_avg:172.05ms step:1231/1530 train_loss:3.5744 train_time:210076ms step_avg:172.05ms step:1232/1530 train_loss:3.4979 train_time:210257ms step_avg:172.06ms step:1233/1530 train_loss:3.4265 train_time:210434ms step_avg:172.06ms step:1234/1530 train_loss:3.5855 train_time:210612ms step_avg:172.07ms step:1235/1530 train_loss:3.3214 train_time:210793ms step_avg:172.08ms step:1236/1530 train_loss:3.2876 train_time:210969ms step_avg:172.08ms step:1237/1530 train_loss:3.2674 train_time:211146ms step_avg:172.08ms step:1238/1530 train_loss:3.2744 train_time:211330ms step_avg:172.09ms step:1239/1530 
train_loss:3.3324 train_time:211508ms step_avg:172.10ms step:1240/1530 train_loss:3.3796 train_time:211685ms step_avg:172.10ms step:1241/1530 train_loss:3.4245 train_time:211864ms step_avg:172.11ms step:1242/1530 train_loss:3.2976 train_time:212042ms step_avg:172.11ms step:1243/1530 train_loss:3.4031 train_time:212221ms step_avg:172.12ms step:1244/1530 train_loss:3.3996 train_time:212394ms step_avg:172.12ms step:1245/1530 train_loss:3.4086 train_time:212571ms step_avg:172.12ms step:1246/1530 train_loss:3.2405 train_time:212752ms step_avg:172.13ms step:1247/1530 train_loss:3.3667 train_time:212926ms step_avg:172.13ms step:1248/1530 train_loss:3.4251 train_time:213102ms step_avg:172.13ms step:1249/1530 train_loss:3.4254 train_time:213279ms step_avg:172.14ms step:1250/1530 train_loss:3.3053 train_time:213460ms step_avg:172.14ms step:1250/1530 val_loss:3.3525 train_time:213515ms step_avg:172.19ms step:1251/1530 train_loss:3.4827 train_time:213646ms step_avg:172.16ms step:1252/1530 train_loss:3.3553 train_time:213822ms step_avg:172.16ms step:1253/1530 train_loss:3.3062 train_time:213998ms step_avg:172.16ms step:1254/1530 train_loss:3.4160 train_time:214179ms step_avg:172.17ms step:1255/1530 train_loss:3.5178 train_time:214369ms step_avg:172.18ms step:1256/1530 train_loss:3.3052 train_time:214551ms step_avg:172.19ms step:1257/1530 train_loss:3.3758 train_time:214729ms step_avg:172.20ms step:1258/1530 train_loss:3.3642 train_time:214912ms step_avg:172.21ms step:1259/1530 train_loss:3.3255 train_time:215091ms step_avg:172.21ms step:1260/1530 train_loss:3.2076 train_time:215268ms step_avg:172.21ms step:1261/1530 train_loss:3.3019 train_time:215450ms step_avg:172.22ms step:1262/1530 train_loss:3.3222 train_time:215633ms step_avg:172.23ms step:1263/1530 train_loss:3.2409 train_time:215815ms step_avg:172.24ms step:1264/1530 train_loss:3.4396 train_time:215992ms step_avg:172.24ms step:1265/1530 train_loss:3.4189 train_time:216167ms step_avg:172.24ms step:1266/1530 
train_loss:3.4324 train_time:216347ms step_avg:172.25ms step:1267/1530 train_loss:3.3685 train_time:216524ms step_avg:172.25ms step:1268/1530 train_loss:3.4053 train_time:216705ms step_avg:172.26ms step:1269/1530 train_loss:3.2473 train_time:216886ms step_avg:172.27ms step:1270/1530 train_loss:3.1007 train_time:217062ms step_avg:172.27ms step:1271/1530 train_loss:3.3983 train_time:217241ms step_avg:172.28ms step:1272/1530 train_loss:3.3496 train_time:217417ms step_avg:172.28ms step:1273/1530 train_loss:3.3751 train_time:217598ms step_avg:172.29ms step:1274/1530 train_loss:3.3571 train_time:217780ms step_avg:172.29ms step:1275/1530 train_loss:3.4324 train_time:217956ms step_avg:172.30ms step:1276/1530 train_loss:3.4640 train_time:218131ms step_avg:172.30ms step:1277/1530 train_loss:3.4093 train_time:218309ms step_avg:172.30ms step:1278/1530 train_loss:3.4042 train_time:218484ms step_avg:172.31ms step:1279/1530 train_loss:3.2667 train_time:218666ms step_avg:172.31ms step:1280/1530 train_loss:3.3623 train_time:218853ms step_avg:172.32ms step:1281/1530 train_loss:3.4211 train_time:219030ms step_avg:172.33ms step:1282/1530 train_loss:3.4660 train_time:219204ms step_avg:172.33ms step:1283/1530 train_loss:3.3308 train_time:219383ms step_avg:172.34ms step:1284/1530 train_loss:3.3702 train_time:219562ms step_avg:172.34ms step:1285/1530 train_loss:3.3585 train_time:219741ms step_avg:172.35ms step:1286/1530 train_loss:3.3299 train_time:219918ms step_avg:172.35ms step:1287/1530 train_loss:3.4841 train_time:220095ms step_avg:172.35ms step:1288/1530 train_loss:3.2942 train_time:220276ms step_avg:172.36ms step:1289/1530 train_loss:3.3764 train_time:220461ms step_avg:172.37ms step:1290/1530 train_loss:3.4599 train_time:220646ms step_avg:172.38ms step:1291/1530 train_loss:3.3799 train_time:220825ms step_avg:172.39ms step:1292/1530 train_loss:3.4782 train_time:221006ms step_avg:172.39ms step:1293/1530 train_loss:3.5153 train_time:221187ms step_avg:172.40ms step:1294/1530 
train_loss:3.4505 train_time:221367ms step_avg:172.40ms step:1295/1530 train_loss:3.2798 train_time:221547ms step_avg:172.41ms step:1296/1530 train_loss:3.3744 train_time:221728ms step_avg:172.42ms step:1297/1530 train_loss:3.2689 train_time:221907ms step_avg:172.42ms step:1298/1530 train_loss:3.2668 train_time:222089ms step_avg:172.43ms step:1299/1530 train_loss:3.3964 train_time:222267ms step_avg:172.43ms step:1300/1530 train_loss:3.3974 train_time:222443ms step_avg:172.44ms step:1301/1530 train_loss:3.4002 train_time:222619ms step_avg:172.44ms step:1302/1530 train_loss:3.5730 train_time:222802ms step_avg:172.45ms step:1303/1530 train_loss:3.3052 train_time:222984ms step_avg:172.46ms step:1304/1530 train_loss:3.5064 train_time:223164ms step_avg:172.46ms step:1305/1530 train_loss:3.2547 train_time:223341ms step_avg:172.46ms step:1306/1530 train_loss:3.4518 train_time:223522ms step_avg:172.47ms step:1307/1530 train_loss:3.4548 train_time:223696ms step_avg:172.47ms step:1308/1530 train_loss:3.2872 train_time:223874ms step_avg:172.48ms step:1309/1530 train_loss:3.3055 train_time:224055ms step_avg:172.48ms step:1310/1530 train_loss:3.2852 train_time:224234ms step_avg:172.49ms step:1311/1530 train_loss:3.2931 train_time:224412ms step_avg:172.49ms step:1312/1530 train_loss:3.3749 train_time:224592ms step_avg:172.50ms step:1313/1530 train_loss:3.3421 train_time:224769ms step_avg:172.50ms step:1314/1530 train_loss:3.0431 train_time:224954ms step_avg:172.51ms step:1315/1530 train_loss:3.2781 train_time:225133ms step_avg:172.52ms step:1316/1530 train_loss:3.3978 train_time:225309ms step_avg:172.52ms step:1317/1530 train_loss:3.4189 train_time:225486ms step_avg:172.52ms step:1318/1530 train_loss:3.3011 train_time:225674ms step_avg:172.53ms step:1319/1530 train_loss:3.4250 train_time:225855ms step_avg:172.54ms step:1320/1530 train_loss:3.4598 train_time:226037ms step_avg:172.55ms step:1321/1530 train_loss:3.3643 train_time:226215ms step_avg:172.55ms step:1322/1530 
train_loss:3.3220 train_time:226532ms step_avg:172.66ms step:1323/1530 train_loss:3.3188 train_time:226719ms step_avg:172.67ms step:1324/1530 train_loss:3.4337 train_time:226898ms step_avg:172.68ms step:1325/1530 train_loss:3.4935 train_time:227082ms step_avg:172.69ms step:1326/1530 train_loss:3.2127 train_time:227261ms step_avg:172.69ms step:1327/1530 train_loss:3.1651 train_time:227438ms step_avg:172.69ms step:1328/1530 train_loss:3.4899 train_time:227618ms step_avg:172.70ms step:1329/1530 train_loss:3.2943 train_time:227963ms step_avg:172.83ms step:1330/1530 train_loss:3.4222 train_time:228143ms step_avg:172.84ms step:1331/1530 train_loss:3.3293 train_time:228319ms step_avg:172.84ms step:1332/1530 train_loss:3.7388 train_time:228500ms step_avg:172.84ms step:1333/1530 train_loss:3.4744 train_time:228680ms step_avg:172.85ms step:1334/1530 train_loss:3.3747 train_time:228859ms step_avg:172.85ms step:1335/1530 train_loss:3.2888 train_time:229038ms step_avg:172.86ms step:1336/1530 train_loss:3.2952 train_time:229223ms step_avg:172.87ms step:1337/1530 train_loss:3.5513 train_time:229402ms step_avg:172.87ms step:1338/1530 train_loss:3.5220 train_time:229581ms step_avg:172.88ms step:1339/1530 train_loss:3.3337 train_time:229761ms step_avg:172.88ms step:1340/1530 train_loss:3.2841 train_time:229941ms step_avg:172.89ms step:1341/1530 train_loss:3.5877 train_time:230117ms step_avg:172.89ms step:1342/1530 train_loss:3.3548 train_time:230298ms step_avg:172.90ms step:1343/1530 train_loss:3.3607 train_time:230476ms step_avg:172.90ms step:1344/1530 train_loss:3.4117 train_time:230658ms step_avg:172.91ms step:1345/1530 train_loss:3.3846 train_time:230842ms step_avg:172.92ms step:1346/1530 train_loss:3.2971 train_time:231018ms step_avg:172.92ms step:1347/1530 train_loss:3.2746 train_time:231195ms step_avg:172.92ms step:1348/1530 train_loss:3.3469 train_time:231373ms step_avg:172.92ms step:1349/1530 train_loss:3.2715 train_time:231550ms step_avg:172.93ms step:1350/1530 
train_loss:3.3876 train_time:231731ms step_avg:172.93ms step:1351/1530 train_loss:3.2421 train_time:231907ms step_avg:172.94ms step:1352/1530 train_loss:3.3052 train_time:232084ms step_avg:172.94ms step:1353/1530 train_loss:3.3973 train_time:232263ms step_avg:172.94ms step:1354/1530 train_loss:3.2629 train_time:232441ms step_avg:172.95ms step:1355/1530 train_loss:3.1863 train_time:232618ms step_avg:172.95ms step:1356/1530 train_loss:3.5058 train_time:232798ms step_avg:172.96ms step:1357/1530 train_loss:3.4195 train_time:232978ms step_avg:172.96ms step:1358/1530 train_loss:3.1843 train_time:233156ms step_avg:172.96ms step:1359/1530 train_loss:3.4350 train_time:233337ms step_avg:172.97ms step:1360/1530 train_loss:3.3489 train_time:233517ms step_avg:172.98ms step:1361/1530 train_loss:3.1251 train_time:233703ms step_avg:172.99ms step:1362/1530 train_loss:3.3935 train_time:233882ms step_avg:172.99ms step:1363/1530 train_loss:3.2794 train_time:234070ms step_avg:173.00ms step:1364/1530 train_loss:3.3031 train_time:234250ms step_avg:173.01ms step:1365/1530 train_loss:3.3144 train_time:234427ms step_avg:173.01ms step:1366/1530 train_loss:3.4230 train_time:234607ms step_avg:173.01ms step:1367/1530 train_loss:3.3949 train_time:234785ms step_avg:173.02ms step:1368/1530 train_loss:3.3479 train_time:234964ms step_avg:173.02ms step:1369/1530 train_loss:3.2729 train_time:235151ms step_avg:173.03ms step:1370/1530 train_loss:3.6030 train_time:235334ms step_avg:173.04ms step:1371/1530 train_loss:3.3122 train_time:235517ms step_avg:173.05ms step:1372/1530 train_loss:3.3670 train_time:235701ms step_avg:173.05ms step:1373/1530 train_loss:3.3646 train_time:235880ms step_avg:173.06ms step:1374/1530 train_loss:3.1481 train_time:236060ms step_avg:173.06ms step:1375/1530 train_loss:3.5343 train_time:236240ms step_avg:173.07ms step:1375/1530 val_loss:3.3099 train_time:236291ms step_avg:173.11ms step:1376/1530 train_loss:3.3447 train_time:236418ms step_avg:173.07ms step:1377/1530 
train_loss:3.4778 train_time:236598ms step_avg:173.08ms step:1378/1530 train_loss:3.4675 train_time:236775ms step_avg:173.08ms step:1379/1530 train_loss:3.1135 train_time:236957ms step_avg:173.09ms step:1380/1530 train_loss:3.3136 train_time:237136ms step_avg:173.09ms step:1381/1530 train_loss:3.6984 train_time:237320ms step_avg:173.10ms step:1382/1530 train_loss:3.2081 train_time:237498ms step_avg:173.10ms step:1383/1530 train_loss:3.3910 train_time:237680ms step_avg:173.11ms step:1384/1530 train_loss:3.4710 train_time:237864ms step_avg:173.12ms step:1385/1530 train_loss:3.4060 train_time:238039ms step_avg:173.12ms step:1386/1530 train_loss:3.3416 train_time:238219ms step_avg:173.12ms step:1387/1530 train_loss:3.1976 train_time:238398ms step_avg:173.13ms step:1388/1530 train_loss:3.3483 train_time:238574ms step_avg:173.13ms step:1389/1530 train_loss:3.3151 train_time:238757ms step_avg:173.14ms step:1390/1530 train_loss:3.5651 train_time:238934ms step_avg:173.14ms step:1391/1530 train_loss:3.2878 train_time:239112ms step_avg:173.14ms step:1392/1530 train_loss:3.2853 train_time:239292ms step_avg:173.15ms step:1393/1530 train_loss:3.2333 train_time:239471ms step_avg:173.15ms step:1394/1530 train_loss:3.4975 train_time:239647ms step_avg:173.16ms step:1395/1530 train_loss:3.3869 train_time:239829ms step_avg:173.16ms step:1396/1530 train_loss:3.4007 train_time:240006ms step_avg:173.16ms step:1397/1530 train_loss:3.3099 train_time:240182ms step_avg:173.17ms step:1398/1530 train_loss:3.2571 train_time:240358ms step_avg:173.17ms step:1399/1530 train_loss:3.3158 train_time:240538ms step_avg:173.17ms step:1400/1530 train_loss:3.3183 train_time:240722ms step_avg:173.18ms step:1401/1530 train_loss:3.3488 train_time:240898ms step_avg:173.18ms step:1402/1530 train_loss:3.3010 train_time:241078ms step_avg:173.19ms step:1403/1530 train_loss:3.4914 train_time:241263ms step_avg:173.20ms step:1404/1530 train_loss:3.2800 train_time:241439ms step_avg:173.20ms step:1405/1530 
train_loss:3.3148 train_time:241623ms step_avg:173.21ms step:1406/1530 train_loss:3.3115 train_time:241804ms step_avg:173.21ms step:1407/1530 train_loss:3.1740 train_time:241980ms step_avg:173.21ms step:1408/1530 train_loss:3.3128 train_time:242160ms step_avg:173.22ms step:1409/1530 train_loss:3.3028 train_time:242348ms step_avg:173.23ms step:1410/1530 train_loss:3.2844 train_time:242527ms step_avg:173.23ms step:1411/1530 train_loss:3.3607 train_time:242702ms step_avg:173.23ms step:1412/1530 train_loss:3.3316 train_time:242878ms step_avg:173.24ms step:1413/1530 train_loss:3.3617 train_time:243057ms step_avg:173.24ms step:1414/1530 train_loss:3.3275 train_time:243238ms step_avg:173.25ms step:1415/1530 train_loss:3.4043 train_time:243422ms step_avg:173.25ms step:1416/1530 train_loss:3.2280 train_time:243611ms step_avg:173.27ms step:1417/1530 train_loss:3.2769 train_time:243795ms step_avg:173.27ms step:1418/1530 train_loss:3.3844 train_time:243974ms step_avg:173.28ms step:1419/1530 train_loss:3.3433 train_time:244157ms step_avg:173.28ms step:1420/1530 train_loss:3.3696 train_time:244339ms step_avg:173.29ms step:1421/1530 train_loss:3.3741 train_time:244518ms step_avg:173.29ms step:1422/1530 train_loss:3.3351 train_time:244697ms step_avg:173.30ms step:1423/1530 train_loss:3.3134 train_time:244875ms step_avg:173.30ms step:1424/1530 train_loss:3.3308 train_time:245061ms step_avg:173.31ms step:1425/1530 train_loss:3.1911 train_time:245249ms step_avg:173.32ms step:1426/1530 train_loss:3.3224 train_time:245428ms step_avg:173.33ms step:1427/1530 train_loss:3.2836 train_time:245610ms step_avg:173.33ms step:1428/1530 train_loss:3.3789 train_time:245788ms step_avg:173.33ms step:1429/1530 train_loss:3.3507 train_time:245965ms step_avg:173.34ms step:1430/1530 train_loss:3.2567 train_time:246147ms step_avg:173.34ms step:1431/1530 train_loss:3.3206 train_time:246330ms step_avg:173.35ms step:1432/1530 train_loss:3.3362 train_time:246512ms step_avg:173.36ms step:1433/1530 
train_loss:3.1306 train_time:246694ms step_avg:173.36ms step:1434/1530 train_loss:3.2858 train_time:246879ms step_avg:173.37ms step:1435/1530 train_loss:3.1145 train_time:247059ms step_avg:173.37ms step:1436/1530 train_loss:3.2269 train_time:247238ms step_avg:173.38ms step:1437/1530 train_loss:3.4054 train_time:247416ms step_avg:173.38ms step:1438/1530 train_loss:3.3804 train_time:247594ms step_avg:173.38ms step:1439/1530 train_loss:3.3158 train_time:247773ms step_avg:173.39ms step:1440/1530 train_loss:3.1956 train_time:247950ms step_avg:173.39ms step:1441/1530 train_loss:3.3377 train_time:248129ms step_avg:173.40ms step:1442/1530 train_loss:3.3847 train_time:248313ms step_avg:173.40ms step:1443/1530 train_loss:3.4913 train_time:248499ms step_avg:173.41ms step:1444/1530 train_loss:3.4444 train_time:248675ms step_avg:173.41ms step:1445/1530 train_loss:3.3361 train_time:248853ms step_avg:173.42ms step:1446/1530 train_loss:3.1949 train_time:249034ms step_avg:173.42ms step:1447/1530 train_loss:3.2981 train_time:249216ms step_avg:173.43ms step:1448/1530 train_loss:3.2970 train_time:249394ms step_avg:173.43ms step:1449/1530 train_loss:3.3987 train_time:249572ms step_avg:173.43ms step:1450/1530 train_loss:3.3912 train_time:249753ms step_avg:173.44ms step:1451/1530 train_loss:3.2043 train_time:249932ms step_avg:173.44ms step:1452/1530 train_loss:3.3255 train_time:250113ms step_avg:173.45ms step:1453/1530 train_loss:3.2579 train_time:250288ms step_avg:173.45ms step:1454/1530 train_loss:3.2881 train_time:250465ms step_avg:173.45ms step:1455/1530 train_loss:3.3266 train_time:250647ms step_avg:173.46ms step:1456/1530 train_loss:3.2851 train_time:250824ms step_avg:173.46ms step:1457/1530 train_loss:3.1542 train_time:251001ms step_avg:173.46ms step:1458/1530 train_loss:3.4206 train_time:251178ms step_avg:173.47ms step:1459/1530 train_loss:3.2701 train_time:251360ms step_avg:173.47ms step:1460/1530 train_loss:3.3124 train_time:251538ms step_avg:173.47ms step:1461/1530 
train_loss:3.4315 train_time:251718ms step_avg:173.48ms step:1462/1530 train_loss:3.2606 train_time:251896ms step_avg:173.48ms step:1463/1530 train_loss:3.4685 train_time:252079ms step_avg:173.49ms step:1464/1530 train_loss:3.3572 train_time:252257ms step_avg:173.49ms step:1465/1530 train_loss:3.3572 train_time:252437ms step_avg:173.50ms step:1466/1530 train_loss:3.2801 train_time:252615ms step_avg:173.50ms step:1467/1530 train_loss:3.3946 train_time:252795ms step_avg:173.50ms step:1468/1530 train_loss:3.2863 train_time:252972ms step_avg:173.51ms step:1469/1530 train_loss:3.2756 train_time:253153ms step_avg:173.51ms step:1470/1530 train_loss:3.3270 train_time:253337ms step_avg:173.52ms step:1471/1530 train_loss:3.2609 train_time:253523ms step_avg:173.53ms step:1472/1530 train_loss:3.2471 train_time:253708ms step_avg:173.53ms step:1473/1530 train_loss:3.4386 train_time:253886ms step_avg:173.54ms step:1474/1530 train_loss:3.3103 train_time:254070ms step_avg:173.55ms step:1475/1530 train_loss:3.1510 train_time:254256ms step_avg:173.55ms step:1476/1530 train_loss:3.2675 train_time:254434ms step_avg:173.56ms step:1477/1530 train_loss:3.2431 train_time:254623ms step_avg:173.57ms step:1478/1530 train_loss:3.3101 train_time:254809ms step_avg:173.58ms step:1479/1530 train_loss:3.3974 train_time:254992ms step_avg:173.58ms step:1480/1530 train_loss:3.2690 train_time:255170ms step_avg:173.59ms step:1481/1530 train_loss:3.4489 train_time:255353ms step_avg:173.59ms step:1482/1530 train_loss:3.3665 train_time:255540ms step_avg:173.60ms step:1483/1530 train_loss:3.2757 train_time:255731ms step_avg:173.61ms step:1484/1530 train_loss:3.2657 train_time:255917ms step_avg:173.62ms step:1485/1530 train_loss:3.2788 train_time:256096ms step_avg:173.62ms step:1486/1530 train_loss:3.2265 train_time:256281ms step_avg:173.63ms step:1487/1530 train_loss:3.3444 train_time:256463ms step_avg:173.64ms step:1488/1530 train_loss:3.2419 train_time:256645ms step_avg:173.64ms step:1489/1530 
train_loss:3.3120 train_time:256826ms step_avg:173.65ms step:1490/1530 train_loss:3.2515 train_time:257007ms step_avg:173.65ms step:1491/1530 train_loss:3.1570 train_time:257188ms step_avg:173.66ms step:1492/1530 train_loss:3.2673 train_time:257368ms step_avg:173.66ms step:1493/1530 train_loss:3.4308 train_time:257547ms step_avg:173.67ms step:1494/1530 train_loss:3.2946 train_time:257728ms step_avg:173.67ms step:1495/1530 train_loss:3.0315 train_time:257912ms step_avg:173.68ms step:1496/1530 train_loss:3.3564 train_time:258093ms step_avg:173.68ms step:1497/1530 train_loss:3.3083 train_time:258276ms step_avg:173.69ms step:1498/1530 train_loss:3.3435 train_time:258460ms step_avg:173.70ms step:1499/1530 train_loss:3.3126 train_time:258648ms step_avg:173.71ms step:1500/1530 train_loss:3.2943 train_time:258840ms step_avg:173.72ms step:1500/1530 val_loss:3.2788 train_time:258896ms step_avg:173.76ms step:1501/1530 train_loss:3.0863 train_time:259033ms step_avg:173.73ms step:1502/1530 train_loss:3.3584 train_time:259222ms step_avg:173.74ms step:1503/1530 train_loss:3.2461 train_time:259402ms step_avg:173.75ms step:1504/1530 train_loss:3.2489 train_time:259584ms step_avg:173.75ms step:1505/1530 train_loss:3.2080 train_time:259763ms step_avg:173.75ms step:1506/1530 train_loss:3.2818 train_time:259944ms step_avg:173.76ms step:1507/1530 train_loss:3.1746 train_time:260140ms step_avg:173.77ms step:1508/1530 train_loss:3.4831 train_time:260324ms step_avg:173.78ms step:1509/1530 train_loss:3.2765 train_time:260500ms step_avg:173.78ms step:1510/1530 train_loss:3.2727 train_time:260681ms step_avg:173.79ms step:1511/1530 train_loss:3.4125 train_time:260997ms step_avg:173.88ms step:1512/1530 train_loss:3.4180 train_time:261188ms step_avg:173.89ms step:1513/1530 train_loss:3.2665 train_time:261370ms step_avg:173.90ms step:1514/1530 train_loss:3.0800 train_time:261553ms step_avg:173.91ms step:1515/1530 train_loss:3.2423 train_time:261734ms step_avg:173.91ms step:1516/1530 
train_loss:3.2584 train_time:261920ms step_avg:173.92ms step:1517/1530 train_loss:3.2976 train_time:262101ms step_avg:173.92ms step:1518/1530 train_loss:3.2112 train_time:262286ms step_avg:173.93ms step:1519/1530 train_loss:3.5008 train_time:262619ms step_avg:174.03ms step:1520/1530 train_loss:3.1295 train_time:262803ms step_avg:174.04ms step:1521/1530 train_loss:3.2050 train_time:262980ms step_avg:174.04ms step:1522/1530 train_loss:3.3586 train_time:263164ms step_avg:174.05ms step:1523/1530 train_loss:3.2306 train_time:263342ms step_avg:174.05ms step:1524/1530 train_loss:3.3450 train_time:263521ms step_avg:174.06ms step:1525/1530 train_loss:3.3358 train_time:263710ms step_avg:174.07ms step:1526/1530 train_loss:3.2728 train_time:263900ms step_avg:174.08ms step:1527/1530 train_loss:3.2940 train_time:264082ms step_avg:174.08ms step:1528/1530 train_loss:3.4076 train_time:264259ms step_avg:174.08ms step:1529/1530 train_loss:3.4040 train_time:264436ms step_avg:174.09ms step:1530/1530 train_loss:3.2342 train_time:264616ms step_avg:174.09ms step:1530/1530 val_loss:3.2764 train_time:264670ms step_avg:174.13ms