import os import sys with open(sys.argv[0]) as f: code = f.read() # read the code of this file ASAP, for logging import uuid import glob import time import contextlib from dataclasses import dataclass import numpy as np import torch from torch import nn import torch.nn.functional as F import torch.distributed as dist import torch._inductor.config as config from torch.nn.parallel import DistributedDataParallel as DDP # Use of FlexAttention contributed by @KoszarskyB from torch.nn.attention.flex_attention import flex_attention, create_block_mask flex_attention = torch.compile(flex_attention, dynamic=False) create_block_mask = torch.compile(create_block_mask, dynamic=False) # ----------------------------------------------------------------------------- # Muon optimizer def zeropower_via_svd(G, steps=None): U, S, V = G.svd() return U @ V.T @torch.compile def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing the slope at zero even beyond the point where the iteration no longer converges all the way to one everywhere on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. """ assert len(G.shape) == 2 a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() X /= (X.norm() + eps) # ensure top singular value <= 1 if G.size(0) > G.size(1): X = X.T for _ in range(steps): A = X @ X.T B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X if G.size(0) > G.size(1): X = X.T return X zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5) class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU. Some warnings: - This optimizer assumes that all parameters passed in are 2D. - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - We believe it is unlikely to work well for training with small batch size. - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M). Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5') backend_steps: The number of iteration steps to use in the backend, if it is iterative. """ def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, backend='newtonschulz5', backend_steps=5): defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps) super().__init__(params, defaults) def step(self): for group in self.param_groups: lr = group['lr'] momentum = group['momentum'] zeropower_backend = zeropower_backends[group['backend']] # generate weight updates in distributed fashion total_params = sum(p.numel() for p in group['params']) updates_flat = torch.zeros(total_params, device='cuda', dtype=torch.bfloat16) curr_idx = 0 for i, p in enumerate(group['params']): # luckily this will perfectly distribute a transformer with multiple of 4 layers to 8 GPUs if i % int(os.environ['WORLD_SIZE']) == int(os.environ['RANK']): g = p.grad assert g is not None state = self.state[p] if 'momentum_buffer' not in state: state['momentum_buffer'] = torch.zeros_like(g) buf = state['momentum_buffer'] buf.mul_(momentum).add_(g) g = g.add(buf, alpha=momentum) if group['nesterov'] else buf g = zeropower_backend(g, steps=group['backend_steps']) g *= max(1, g.size(0)/g.size(1))**0.5 updates_flat[curr_idx:curr_idx+p.numel()] = g.flatten() curr_idx += p.numel() # sync updates across devices. we are not memory-constrained so can do this simple deserialization dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) # deserialize and apply updates curr_idx = 0 for p in group['params']: g = updates_flat[curr_idx:curr_idx+p.numel()].view_as(p.data).type_as(p.data) p.data.add_(g, alpha=-lr) curr_idx += p.numel() # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model def norm(x): return F.rms_norm(x, (x.size(-1),)) class CastedLinear(nn.Linear): def __init__(self, in_features, out_features): super().__init__(in_features, out_features, bias=False) def forward(self, x): return F.linear(x, self.weight.to(x.dtype)) class Rotary(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim)) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None def forward(self, x): seq_len = x.shape[1] if seq_len != self.seq_len_cached: t = torch.arange(seq_len, device=x.device) freqs = torch.outer(t, self.inv_freq) self.seq_len_cached = seq_len self.cos_cached = freqs.cos() self.sin_cached = freqs.sin() cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :] # apply_rotary_emb(x, cos, sin) x1, x2 = x.chunk(2, dim=3) y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos return torch.cat((y1, y2), 3).type_as(x) class CausalSelfAttention(nn.Module): def __init__(self, dim, n_head): super().__init__() assert dim % n_head == 0 self.n_head = n_head self.c_q = CastedLinear(dim, dim) self.c_k = CastedLinear(dim, dim) self.c_v = CastedLinear(dim, dim) # value residual lambda self.lamb = nn.Parameter(torch.tensor(0.5)) # @Grad62304977 # rotary embeddings self.rotary = Rotary(dim // n_head) # dim // n_head = head_dim # output projection self.c_proj = CastedLinear(dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x, vi, block_mask): B, T = x.size(0), x.size(1) # batch size, sequence length assert B == 1, "Must use batch size = 1 for FlexAttention" q = self.c_q(x).view(B, T, self.n_head, -1) k = self.c_k(x).view(B, T, self.n_head, -1) v = self.c_v(x).view(B, T, self.n_head, -1) v = (1 - self.lamb) * v + self.lamb * vi.view_as(v) # @Grad62304977 q, k = norm(q), norm(k) # QK norm suggested by @Grad62304977 q, k = self.rotary(q), self.rotary(k) y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask) y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side y = self.c_proj(y) return y class MLP(nn.Module): def __init__(self, dim): super().__init__() self.c_fc = CastedLinear(dim, 4 * dim) self.c_proj = CastedLinear(4 * dim, dim) self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977 def forward(self, x): x = self.c_fc(x) x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 x = self.c_proj(x) return x class Block(nn.Module): def __init__(self, config): super().__init__() self.attn = CausalSelfAttention(config.n_embd, config.n_head) self.mlp = MLP(config.n_embd) self.lambdas = nn.Parameter(torch.tensor([1., 0.])) def forward(self, x, vi, x0, block_mask): x = self.lambdas[0] * x + self.lambdas[1] * x0 x = x + self.attn(norm(x), vi, block_mask) x = x + self.mlp(norm(x)) return x # ----------------------------------------------------------------------------- # The main GPT-2 model @dataclass class GPTConfig: vocab_size : int = 50304 n_layer : int = 12 n_head : int = 6 # head dim 128 suggested by @Grad62304977 n_embd : int = 768 class GPT(nn.Module): def __init__(self, config): super().__init__() # U-net design by @brendanh0gan self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder # Add learnable skip connection weights for decoder layers self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers)) self.transformer = nn.ModuleDict(dict( wte = nn.Embedding(config.vocab_size, config.n_embd), # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning vte = nn.Embedding(config.vocab_size, config.n_embd*12), h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), )) self.lm_head = CastedLinear(config.n_embd, config.vocab_size) self.lm_head.weight.data.zero_() # @Grad62304977 def forward(self, idx, target, attn_blocksize): docs = (idx == 50256).cumsum(0) def document_causal_mask(b, h, q_idx, kv_idx): causal_mask = q_idx >= kv_idx document_mask = docs[q_idx] == docs[kv_idx] window_mask = q_idx - kv_idx < attn_blocksize return causal_mask & document_mask & window_mask S = len(idx) block_mask = create_block_mask(document_causal_mask, None, None, S, S, device="cuda", _compile=True) # forward the GPT model itself x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd) x = norm(x) # @Grad62304977 x0 = x vi = self.transformer.vte(idx[None]).chunk(12, dim=-1) # Store outputs for U-Net skip connections skip_connections = [] # Encoder pass - process only the first half of the blocks for i in range(self.num_encoder_layers): x = self.transformer.h[i](x, vi[i], x0, block_mask) skip_connections.append(x) # Decoder pass - process the remaining blocks with weighted skip connections for i in range(self.num_decoder_layers): x = x + self.skip_weights[i] * skip_connections.pop() x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers+i], x0, block_mask) x = norm(x) logits = self.lm_head(x) logits = 30 * torch.tanh(logits / 30) # @Grad62304977 logits = logits.float() loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1)) return loss # ----------------------------------------------------------------------------- # Our own simple Distributed Data Loader def _peek_data_shard(filename): # only reads the header, returns header data with open(filename, "rb") as f: # first read the header, which is 256 int32 integers (4 bytes each) header = np.frombuffer(f.read(256*4), dtype=np.int32) if header[0] != 20240520: print("ERROR: magic number mismatch in the data .bin file!") print("---> HINT: Are you passing in a correct file with --input_bin?") print("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README") print("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try") exit(1) assert header[1] == 1, "unsupported version" ntok = header[2] # number of tokens (claimed) return ntok # for now just return the number of tokens def _load_data_shard(filename): with open(filename, "rb") as f: # first read the header, which is 256 int32 integers (4 bytes each) header = np.frombuffer(f.read(256*4), dtype=np.int32) assert header[0] == 20240520, "magic number mismatch in the data .bin file" assert header[1] == 1, "unsupported version" ntok = header[2] # number of tokens (claimed) # the rest of it are tokens, stored as uint16 tokens = np.frombuffer(f.read(), dtype=np.uint16) assert len(tokens) == ntok, "number of tokens read does not match header?" return tokens class DistributedDataLoader: def __init__(self, filename_pattern, T, process_rank, num_processes): self.process_rank = process_rank self.num_processes = num_processes self.T = T # glob files that match the pattern self.files = sorted(glob.glob(filename_pattern)) assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" # load and validate all data shards, count number of tokens in total ntok_total = 0 for fname in self.files: shard_ntok = _peek_data_shard(fname) assert shard_ntok >= num_processes * T + 1 ntok_total += int(shard_ntok) self.ntok_total = ntok_total self.reset() def reset(self): self.current_shard = -1 self.advance() def advance(self): # advance to next data shard self.current_shard = (self.current_shard + 1) % len(self.files) self.current_position = self.process_rank * self.T self.tokens = _load_data_shard(self.files[self.current_shard]) def next_batch(self): batch_size = self.T * self.num_processes buf = self.tokens[self.current_position:self.current_position+self.T+1] buf = torch.tensor(buf.astype(np.int32), dtype=torch.long) x = buf[:-1] # inputs y = buf[1:] # targets # advance current position and load next shard if necessary self.current_position += batch_size if self.current_position + batch_size >= len(self.tokens): self.advance() return x.cuda(), y.cuda() # ----------------------------------------------------------------------------- # int main @dataclass class Hyperparameters: # data hyperparams input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on # optimization hyperparams batch_size : int = 8 # batch size, in sequences, across all devices sequence_length : int = 64*1024 # sequence length, in tokens num_iterations : int = 1530 # number of iterations to run warmup_iters : int = 0 cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule weight_decay : float = 0 # evaluation and logging hyperparams val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end args = Hyperparameters() # set up DDP (distributed data parallel). torchrun sets this env variable assert torch.cuda.is_available() dist.init_process_group(backend='nccl') ddp_rank = int(os.environ['RANK']) ddp_local_rank = int(os.environ['LOCAL_RANK']) ddp_world_size = int(os.environ['WORLD_SIZE']) device = f'cuda:{ddp_local_rank}' torch.cuda.set_device(device) print(f"using device: {device}") master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc. # begin logging logfile = None if master_process: run_id = str(uuid.uuid4()) logdir = 'logs/%s/' % run_id os.makedirs(logdir, exist_ok=True) logfile = 'logs/%s.txt' % run_id # create the log file with open(logfile, "w") as f: # begin the log by printing this file (the Python code) f.write(code) f.write('='*100 + '\n') def print0(s, logonly=False): if master_process: with open(logfile, "a") as f: if not logonly: print(s) f.write(s+'\n') # log information about the hardware/software environment this is running on # and print the full `nvidia-smi` to file print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:") import subprocess result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print0(f'{result.stdout}', logonly=True) print0('='*100, logonly=True) # convenience variables T = args.sequence_length # calculate the number of steps to take in the val loop. assert args.val_tokens % (T * ddp_world_size) == 0 val_steps = args.val_tokens // (T * ddp_world_size) # calculate the steps of gradient accumulation required to attain the desired global batch size. assert args.batch_size % (ddp_world_size) == 0 train_accumulation_steps = args.batch_size // ddp_world_size # load tokens train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size) val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size) print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files") print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files") print0('='*100, logonly=True) x, y = train_loader.next_batch() # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977. # this originates from Karpathy's experiments. num_vocab = 50304 model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)) model = model.cuda().bfloat16() for m in model.modules(): if isinstance(m, CastedLinear): m.float() if hasattr(config, "coordinate_descent_tuning"): config.coordinate_descent_tuning = True # suggested by @Chillee model = torch.compile(model) # here we wrap model into DDP container model = DDP(model, device_ids=[ddp_local_rank]) raw_model = model.module # always contains the "raw" unwrapped model # init the optimizer(s) optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True) optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True) params = list(raw_model.transformer.h.parameters()) matrix_params = [p for p in params if p.ndim == 2] scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights] optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95) optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) # note that this learning rate is neither sensitive nor tuned optimizers = [optimizer1, optimizer2, optimizer3, optimizer4] # learning rate decay scheduler (linear warmup and cooldown) def get_lr(it): assert it <= args.num_iterations # 1) linear warmup for warmup_iters steps if it < args.warmup_iters: return (it+1) / args.warmup_iters # 2) constant lr for a while elif it < args.num_iterations - args.cooldown_iters: return 1.0 # 3) linear cooldown else: decay_ratio = (args.num_iterations - it) / args.cooldown_iters return decay_ratio schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] # Start training loop training_time_ms = 0 # start the clock torch.cuda.synchronize() t0 = time.time() # begin training for step in range(args.num_iterations + 1): last_step = (step == args.num_iterations) # This effectively ignores timing first 10 steps, which are slower for weird reasons. # Alternately, and slightly more correctly in terms of benchmarking, we could do 10 # steps with dummy data first, and then re-initialize the model and reset the loader. if step == 10: training_time_ms = 0 t0 = time.time() timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val # Set the attention blocksize for the current step, in chunks of 64. By @fernbear.bsky.social attn_blocksize = torch.tensor(64*((step/args.num_iterations * (1792 - 64) + 64)//64), dtype=torch.int, device='cuda') # once in a while evaluate the validation dataset if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.time() - t0) # run validation batches model.eval() val_loader.reset() val_loss = 0.0 for _ in range(val_steps): with torch.no_grad(): x_val, y_val = val_loader.next_batch() val_loss += model(x_val, y_val, attn_blocksize=attn_blocksize) dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) val_loss /= val_steps # log val loss to console and to logfile print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms') # start the clock again torch.cuda.synchronize() t0 = time.time() if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)): # stop the clock torch.cuda.synchronize() training_time_ms += 1000 * (time.time() - t0) # save the state of the training process log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) # start the clock again torch.cuda.synchronize() t0 = time.time() # bit confusing: we want to make sure to eval on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations # instead of just < num_iterations (one extra due to <=), only to do # the validation/sampling one last time, and then we break right here as we're done. if last_step: break # --------------- TRAINING SECTION BEGIN ----------------- model.train() for i in range(1, train_accumulation_steps+1): ctx = model.no_sync() if i < train_accumulation_steps else contextlib.nullcontext() with ctx: # there's no need to sync gradients every accumulation step # forward pass loss = model(x, y, attn_blocksize=attn_blocksize) # advance the dataset for the next batch x, y = train_loader.next_batch() # backward pass loss.backward() train_loss = loss.detach() for p in model.parameters(): p.grad /= train_accumulation_steps # momentum warmup for Muon frac = min(step/300, 1) optimizer3.param_groups[0]['momentum'] = (1 - frac) * 0.85 + frac * 0.95 # step the optimizers and schedulers for opt, sched in zip(optimizers, schedulers): opt.step() sched.step() # null the gradients model.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. #dist.all_reduce(train_loss, op=dist.ReduceOp.AVG) # all-reducing the training loss would be more correct in terms of logging, but slower approx_time = training_time_ms + 1000 * (time.time() - t0) print0(f"step:{step+1}/{args.num_iterations} train_loss:{train_loss.item():.4f} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms") if master_process: print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB") # ------------------------------------------------------------------------- # clean up nice dist.destroy_process_group() ==================================================================================================== Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 nvidia-smi: Thu Dec 5 02:51:03 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | | N/A 38C P0 75W / 700W | 3MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | | N/A 31C P0 115W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | | N/A 31C P0 119W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | | N/A 38C P0 119W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | | N/A 39C P0 123W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | | N/A 30C P0 110W / 700W | 529MiB / 81559MiB | 1% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | N/A 39C P0 128W / 700W | 529MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | N/A 30C P0 107W / 700W | 22MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+ ==================================================================================================== Training DataLoader: total number of tokens: 1100000000 across 11 files Validation DataLoader: total number of tokens: 100000000 across 1 files ==================================================================================================== step:0/1530 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1530 train_loss:10.8258 train_time:31747ms step_avg:nanms step:2/1530 train_loss:10.0728 train_time:31858ms step_avg:nanms step:3/1530 train_loss:8.3607 train_time:32016ms step_avg:nanms step:4/1530 train_loss:7.5529 train_time:32177ms step_avg:nanms step:5/1530 train_loss:7.4912 train_time:32339ms step_avg:nanms step:6/1530 train_loss:7.0229 train_time:32499ms step_avg:nanms step:7/1530 train_loss:7.2442 train_time:32659ms step_avg:nanms step:8/1530 train_loss:6.7480 train_time:32819ms step_avg:nanms step:9/1530 train_loss:6.6141 train_time:32981ms step_avg:nanms step:10/1530 train_loss:6.5220 train_time:33142ms step_avg:nanms step:11/1530 train_loss:6.4699 train_time:115ms step_avg:nanms step:12/1530 train_loss:6.3858 train_time:277ms step_avg:nanms step:13/1530 train_loss:6.2178 train_time:437ms step_avg:145.61ms step:14/1530 train_loss:6.1736 train_time:598ms step_avg:149.46ms step:15/1530 train_loss:6.1430 train_time:758ms step_avg:151.61ms step:16/1530 train_loss:6.1365 train_time:918ms step_avg:153.02ms step:17/1530 train_loss:6.1634 train_time:1079ms step_avg:154.09ms step:18/1530 train_loss:5.9464 train_time:1238ms step_avg:154.77ms step:19/1530 train_loss:5.9492 train_time:1399ms step_avg:155.46ms step:20/1530 train_loss:5.6568 train_time:1560ms step_avg:155.98ms step:21/1530 train_loss:5.9305 train_time:1720ms step_avg:156.39ms step:22/1530 train_loss:6.1579 train_time:1881ms step_avg:156.73ms step:23/1530 train_loss:5.8604 train_time:2041ms step_avg:156.99ms step:24/1530 train_loss:6.0251 train_time:2202ms step_avg:157.26ms step:25/1530 train_loss:5.6782 train_time:2362ms step_avg:157.48ms step:26/1530 train_loss:5.5989 train_time:2523ms step_avg:157.68ms step:27/1530 train_loss:5.7538 train_time:2685ms step_avg:157.95ms step:28/1530 train_loss:5.3981 train_time:2844ms step_avg:158.01ms step:29/1530 train_loss:5.6582 train_time:3004ms step_avg:158.13ms step:30/1530 train_loss:5.4727 train_time:3165ms step_avg:158.26ms step:31/1530 train_loss:5.4253 train_time:3325ms step_avg:158.34ms step:32/1530 train_loss:5.2834 train_time:3485ms step_avg:158.43ms step:33/1530 train_loss:5.5587 train_time:3645ms step_avg:158.49ms step:34/1530 train_loss:5.4766 train_time:3805ms step_avg:158.54ms step:35/1530 train_loss:5.5878 train_time:3965ms step_avg:158.60ms step:36/1530 train_loss:5.5314 train_time:4125ms step_avg:158.66ms step:37/1530 train_loss:5.4373 train_time:4286ms step_avg:158.74ms step:38/1530 train_loss:5.2925 train_time:4446ms step_avg:158.78ms step:39/1530 train_loss:5.3033 train_time:4605ms step_avg:158.79ms step:40/1530 train_loss:5.2400 train_time:4765ms step_avg:158.84ms step:41/1530 train_loss:5.2468 train_time:4925ms step_avg:158.89ms step:42/1530 train_loss:5.2109 train_time:5086ms step_avg:158.94ms step:43/1530 train_loss:5.2526 train_time:5247ms step_avg:158.99ms step:44/1530 train_loss:5.2314 train_time:5406ms step_avg:159.01ms step:45/1530 train_loss:5.4045 train_time:5567ms step_avg:159.06ms step:46/1530 train_loss:5.1621 train_time:5727ms step_avg:159.09ms step:47/1530 train_loss:5.0483 train_time:5888ms step_avg:159.14ms step:48/1530 train_loss:5.2000 train_time:6048ms step_avg:159.16ms step:49/1530 train_loss:5.1361 train_time:6207ms step_avg:159.15ms step:50/1530 train_loss:5.2453 train_time:6367ms step_avg:159.19ms step:51/1530 train_loss:5.1233 train_time:6528ms step_avg:159.22ms step:52/1530 train_loss:5.0038 train_time:6688ms step_avg:159.23ms step:53/1530 train_loss:5.1533 train_time:6849ms step_avg:159.27ms step:54/1530 train_loss:4.9891 train_time:7008ms step_avg:159.27ms step:55/1530 train_loss:5.3913 train_time:7168ms step_avg:159.29ms step:56/1530 train_loss:5.0173 train_time:7327ms step_avg:159.29ms step:57/1530 train_loss:4.8646 train_time:7488ms step_avg:159.31ms step:58/1530 train_loss:5.0349 train_time:7648ms step_avg:159.33ms step:59/1530 train_loss:5.0150 train_time:7808ms step_avg:159.34ms step:60/1530 train_loss:5.1618 train_time:7968ms step_avg:159.36ms step:61/1530 train_loss:4.8920 train_time:8128ms step_avg:159.37ms step:62/1530 train_loss:4.9726 train_time:8288ms step_avg:159.38ms step:63/1530 train_loss:4.9776 train_time:8447ms step_avg:159.39ms step:64/1530 train_loss:4.9821 train_time:8609ms step_avg:159.42ms step:65/1530 train_loss:4.7869 train_time:8769ms step_avg:159.43ms step:66/1530 train_loss:4.9298 train_time:8929ms step_avg:159.44ms step:67/1530 train_loss:4.8218 train_time:9089ms step_avg:159.45ms step:68/1530 train_loss:5.0833 train_time:9248ms step_avg:159.45ms step:69/1530 train_loss:4.7120 train_time:9408ms step_avg:159.47ms step:70/1530 train_loss:4.8591 train_time:9568ms step_avg:159.47ms step:71/1530 train_loss:4.9616 train_time:9728ms step_avg:159.48ms step:72/1530 train_loss:4.8764 train_time:9888ms step_avg:159.49ms step:73/1530 train_loss:4.7603 train_time:10047ms step_avg:159.48ms step:74/1530 train_loss:4.9156 train_time:10207ms step_avg:159.49ms step:75/1530 train_loss:4.8641 train_time:10368ms step_avg:159.50ms step:76/1530 train_loss:4.8090 train_time:10528ms step_avg:159.52ms step:77/1530 train_loss:4.9158 train_time:10687ms step_avg:159.51ms step:78/1530 train_loss:5.1169 train_time:10848ms step_avg:159.52ms step:79/1530 train_loss:4.8153 train_time:11008ms step_avg:159.53ms step:80/1530 train_loss:4.8596 train_time:11169ms step_avg:159.55ms step:81/1530 train_loss:4.6433 train_time:11328ms step_avg:159.55ms step:82/1530 train_loss:4.8172 train_time:11489ms step_avg:159.57ms step:83/1530 train_loss:4.7660 train_time:11649ms step_avg:159.57ms step:84/1530 train_loss:4.7609 train_time:11808ms step_avg:159.56ms step:85/1530 train_loss:4.6243 train_time:11969ms step_avg:159.58ms step:86/1530 train_loss:4.8430 train_time:12130ms step_avg:159.60ms step:87/1530 train_loss:4.7470 train_time:12291ms step_avg:159.62ms step:88/1530 train_loss:4.7579 train_time:12451ms step_avg:159.63ms step:89/1530 train_loss:4.6925 train_time:12611ms step_avg:159.63ms step:90/1530 train_loss:4.6423 train_time:12773ms step_avg:159.66ms step:91/1530 train_loss:4.6682 train_time:12934ms step_avg:159.68ms step:92/1530 train_loss:4.8393 train_time:13095ms step_avg:159.70ms step:93/1530 train_loss:4.6135 train_time:13256ms step_avg:159.71ms step:94/1530 train_loss:4.6279 train_time:13417ms step_avg:159.72ms step:95/1530 train_loss:4.6901 train_time:13577ms step_avg:159.73ms step:96/1530 train_loss:4.5905 train_time:13737ms step_avg:159.74ms step:97/1530 train_loss:4.6403 train_time:13898ms step_avg:159.75ms step:98/1530 train_loss:4.5846 train_time:14059ms step_avg:159.76ms step:99/1530 train_loss:4.6694 train_time:14220ms step_avg:159.78ms step:100/1530 train_loss:4.6787 train_time:14381ms step_avg:159.78ms step:101/1530 train_loss:4.5338 train_time:14540ms step_avg:159.78ms step:102/1530 train_loss:4.7115 train_time:14701ms step_avg:159.79ms step:103/1530 train_loss:4.5897 train_time:14861ms step_avg:159.80ms step:104/1530 train_loss:4.5385 train_time:15022ms step_avg:159.81ms step:105/1530 train_loss:4.5653 train_time:15183ms step_avg:159.82ms step:106/1530 train_loss:4.6253 train_time:15342ms step_avg:159.81ms step:107/1530 train_loss:4.5153 train_time:15503ms step_avg:159.83ms step:108/1530 train_loss:4.3682 train_time:15664ms step_avg:159.83ms step:109/1530 train_loss:4.4920 train_time:15825ms step_avg:159.85ms step:110/1530 train_loss:4.4823 train_time:15985ms step_avg:159.85ms step:111/1530 train_loss:4.4191 train_time:16145ms step_avg:159.85ms step:112/1530 train_loss:4.5958 train_time:16305ms step_avg:159.86ms step:113/1530 train_loss:4.5071 train_time:16465ms step_avg:159.85ms step:114/1530 train_loss:4.3622 train_time:16625ms step_avg:159.85ms step:115/1530 train_loss:4.4998 train_time:16789ms step_avg:159.89ms step:116/1530 train_loss:4.4760 train_time:16952ms step_avg:159.93ms step:117/1530 train_loss:4.3852 train_time:17116ms step_avg:159.96ms step:118/1530 train_loss:4.5949 train_time:17280ms step_avg:160.00ms step:119/1530 train_loss:4.4676 train_time:17444ms step_avg:160.03ms step:120/1530 train_loss:4.3308 train_time:17607ms step_avg:160.06ms step:121/1530 train_loss:4.3014 train_time:17772ms step_avg:160.11ms step:122/1530 train_loss:4.4554 train_time:17936ms step_avg:160.14ms step:123/1530 train_loss:4.2906 train_time:18100ms step_avg:160.18ms step:124/1530 train_loss:4.5866 train_time:18264ms step_avg:160.21ms step:125/1530 train_loss:4.4569 train_time:18428ms step_avg:160.24ms step:125/1530 val_loss:4.4039 train_time:18475ms step_avg:160.65ms step:126/1530 train_loss:4.4175 train_time:18592ms step_avg:160.28ms step:127/1530 train_loss:4.4419 train_time:18757ms step_avg:160.32ms step:128/1530 train_loss:4.3965 train_time:18923ms step_avg:160.36ms step:129/1530 train_loss:4.6936 train_time:19086ms step_avg:160.39ms step:130/1530 train_loss:4.3638 train_time:19250ms step_avg:160.41ms step:131/1530 train_loss:4.4093 train_time:19413ms step_avg:160.44ms step:132/1530 train_loss:4.3516 train_time:19577ms step_avg:160.46ms step:133/1530 train_loss:4.4456 train_time:19740ms step_avg:160.49ms step:134/1530 train_loss:4.2636 train_time:19904ms step_avg:160.52ms step:135/1530 train_loss:4.4524 train_time:20068ms step_avg:160.55ms step:136/1530 train_loss:4.2212 train_time:20231ms step_avg:160.57ms step:137/1530 train_loss:4.3846 train_time:20396ms step_avg:160.60ms step:138/1530 train_loss:4.2933 train_time:20560ms step_avg:160.63ms step:139/1530 train_loss:4.3884 train_time:20725ms step_avg:160.66ms step:140/1530 train_loss:4.4793 train_time:20888ms step_avg:160.68ms step:141/1530 train_loss:4.3190 train_time:21052ms step_avg:160.70ms step:142/1530 train_loss:4.3025 train_time:21215ms step_avg:160.72ms step:143/1530 train_loss:4.2471 train_time:21379ms step_avg:160.74ms step:144/1530 train_loss:4.3468 train_time:21544ms step_avg:160.77ms step:145/1530 train_loss:4.3125 train_time:21706ms step_avg:160.79ms step:146/1530 train_loss:4.1817 train_time:21870ms step_avg:160.81ms step:147/1530 train_loss:4.3284 train_time:22034ms step_avg:160.83ms step:148/1530 train_loss:4.3518 train_time:22198ms step_avg:160.85ms step:149/1530 train_loss:4.3039 train_time:22362ms step_avg:160.88ms step:150/1530 train_loss:4.4450 train_time:22526ms step_avg:160.90ms step:151/1530 train_loss:4.2907 train_time:22689ms step_avg:160.92ms step:152/1530 train_loss:4.2862 train_time:22853ms step_avg:160.94ms step:153/1530 train_loss:4.3861 train_time:23017ms step_avg:160.96ms step:154/1530 train_loss:4.3841 train_time:23181ms step_avg:160.98ms step:155/1530 train_loss:4.2699 train_time:23345ms step_avg:161.00ms step:156/1530 train_loss:4.3578 train_time:23508ms step_avg:161.01ms step:157/1530 train_loss:4.4113 train_time:23671ms step_avg:161.03ms step:158/1530 train_loss:4.2559 train_time:23835ms step_avg:161.05ms step:159/1530 train_loss:4.3111 train_time:23999ms step_avg:161.07ms step:160/1530 train_loss:4.1409 train_time:24164ms step_avg:161.09ms step:161/1530 train_loss:4.3637 train_time:24327ms step_avg:161.11ms step:162/1530 train_loss:4.3646 train_time:24491ms step_avg:161.12ms step:163/1530 train_loss:4.3445 train_time:24655ms step_avg:161.14ms step:164/1530 train_loss:4.1896 train_time:24819ms step_avg:161.16ms step:165/1530 train_loss:4.2944 train_time:24982ms step_avg:161.18ms step:166/1530 train_loss:4.3523 train_time:25146ms step_avg:161.20ms step:167/1530 train_loss:4.2106 train_time:25310ms step_avg:161.21ms step:168/1530 train_loss:4.2931 train_time:25474ms step_avg:161.23ms step:169/1530 train_loss:4.1643 train_time:25637ms step_avg:161.24ms step:170/1530 train_loss:4.0431 train_time:25802ms step_avg:161.26ms step:171/1530 train_loss:4.2153 train_time:25965ms step_avg:161.28ms step:172/1530 train_loss:4.2128 train_time:26129ms step_avg:161.29ms step:173/1530 train_loss:4.2651 train_time:26294ms step_avg:161.31ms step:174/1530 train_loss:4.4219 train_time:26456ms step_avg:161.32ms step:175/1530 train_loss:4.2565 train_time:26619ms step_avg:161.33ms step:176/1530 train_loss:4.0992 train_time:26782ms step_avg:161.34ms step:177/1530 train_loss:4.0728 train_time:26946ms step_avg:161.35ms step:178/1530 train_loss:4.1920 train_time:27108ms step_avg:161.35ms step:179/1530 train_loss:4.1324 train_time:27271ms step_avg:161.36ms step:180/1530 train_loss:4.1226 train_time:27434ms step_avg:161.37ms step:181/1530 train_loss:4.3000 train_time:27597ms step_avg:161.39ms step:182/1530 train_loss:4.1556 train_time:27760ms step_avg:161.40ms step:183/1530 train_loss:4.1360 train_time:27923ms step_avg:161.40ms step:184/1530 train_loss:4.1426 train_time:28086ms step_avg:161.41ms step:185/1530 train_loss:4.2252 train_time:28249ms step_avg:161.42ms step:186/1530 train_loss:4.1865 train_time:28411ms step_avg:161.43ms step:187/1530 train_loss:4.2459 train_time:28573ms step_avg:161.43ms step:188/1530 train_loss:4.1693 train_time:28868ms step_avg:162.18ms step:189/1530 train_loss:4.1100 train_time:29203ms step_avg:163.14ms step:190/1530 train_loss:4.2183 train_time:29367ms step_avg:163.15ms step:191/1530 train_loss:4.0889 train_time:29530ms step_avg:163.15ms step:192/1530 train_loss:4.0355 train_time:29692ms step_avg:163.14ms step:193/1530 train_loss:4.2625 train_time:29855ms step_avg:163.14ms step:194/1530 train_loss:4.1737 train_time:30017ms step_avg:163.14ms step:195/1530 train_loss:4.3572 train_time:30180ms step_avg:163.13ms step:196/1530 train_loss:4.1785 train_time:30345ms step_avg:163.15ms step:197/1530 train_loss:4.0488 train_time:30508ms step_avg:163.14ms step:198/1530 train_loss:4.1773 train_time:30670ms step_avg:163.14ms step:199/1530 train_loss:4.0453 train_time:30833ms step_avg:163.14ms step:200/1530 train_loss:4.1179 train_time:30996ms step_avg:163.14ms step:201/1530 train_loss:4.0136 train_time:31160ms step_avg:163.14ms step:202/1530 train_loss:4.2700 train_time:31323ms step_avg:163.14ms step:203/1530 train_loss:4.0848 train_time:31486ms step_avg:163.14ms step:204/1530 train_loss:4.1958 train_time:31649ms step_avg:163.14ms step:205/1530 train_loss:4.2503 train_time:31812ms step_avg:163.14ms step:206/1530 train_loss:3.9548 train_time:31974ms step_avg:163.13ms step:207/1530 train_loss:4.0930 train_time:32136ms step_avg:163.13ms step:208/1530 train_loss:4.1057 train_time:32299ms step_avg:163.13ms step:209/1530 train_loss:4.2496 train_time:32463ms step_avg:163.13ms step:210/1530 train_loss:4.1776 train_time:32627ms step_avg:163.13ms step:211/1530 train_loss:4.0614 train_time:32790ms step_avg:163.13ms step:212/1530 train_loss:4.1363 train_time:32952ms step_avg:163.13ms step:213/1530 train_loss:4.0595 train_time:33114ms step_avg:163.12ms step:214/1530 train_loss:4.1229 train_time:33277ms step_avg:163.12ms step:215/1530 train_loss:3.9744 train_time:33442ms step_avg:163.13ms step:216/1530 train_loss:4.0145 train_time:33605ms step_avg:163.13ms step:217/1530 train_loss:4.0261 train_time:33768ms step_avg:163.13ms step:218/1530 train_loss:4.0887 train_time:33931ms step_avg:163.13ms step:219/1530 train_loss:4.0796 train_time:34093ms step_avg:163.13ms step:220/1530 train_loss:4.0881 train_time:34257ms step_avg:163.13ms step:221/1530 train_loss:4.0969 train_time:34421ms step_avg:163.13ms step:222/1530 train_loss:4.0027 train_time:34584ms step_avg:163.13ms step:223/1530 train_loss:3.9989 train_time:34748ms step_avg:163.13ms step:224/1530 train_loss:4.2962 train_time:34911ms step_avg:163.13ms step:225/1530 train_loss:3.9263 train_time:35073ms step_avg:163.13ms step:226/1530 train_loss:3.9969 train_time:35236ms step_avg:163.13ms step:227/1530 train_loss:3.9908 train_time:35399ms step_avg:163.13ms step:228/1530 train_loss:4.1536 train_time:35565ms step_avg:163.14ms step:229/1530 train_loss:3.9377 train_time:35731ms step_avg:163.16ms step:230/1530 train_loss:4.0473 train_time:35897ms step_avg:163.17ms step:231/1530 train_loss:3.9071 train_time:36064ms step_avg:163.19ms step:232/1530 train_loss:3.9742 train_time:36230ms step_avg:163.20ms step:233/1530 train_loss:4.1020 train_time:36395ms step_avg:163.21ms step:234/1530 train_loss:4.0472 train_time:36563ms step_avg:163.23ms step:235/1530 train_loss:3.9158 train_time:36729ms step_avg:163.24ms step:236/1530 train_loss:4.0952 train_time:36894ms step_avg:163.25ms step:237/1530 train_loss:4.0886 train_time:37061ms step_avg:163.27ms step:238/1530 train_loss:3.9423 train_time:37228ms step_avg:163.28ms step:239/1530 train_loss:4.0798 train_time:37393ms step_avg:163.29ms step:240/1530 train_loss:4.1230 train_time:37560ms step_avg:163.30ms step:241/1530 train_loss:3.9798 train_time:37727ms step_avg:163.32ms step:242/1530 train_loss:4.1571 train_time:37893ms step_avg:163.33ms step:243/1530 train_loss:4.0156 train_time:38060ms step_avg:163.35ms step:244/1530 train_loss:4.0800 train_time:38226ms step_avg:163.36ms step:245/1530 train_loss:4.1435 train_time:38391ms step_avg:163.37ms step:246/1530 train_loss:4.0656 train_time:38558ms step_avg:163.38ms step:247/1530 train_loss:4.0051 train_time:38725ms step_avg:163.40ms step:248/1530 train_loss:4.1141 train_time:38891ms step_avg:163.41ms step:249/1530 train_loss:3.9376 train_time:39056ms step_avg:163.41ms step:250/1530 train_loss:3.9801 train_time:39222ms step_avg:163.43ms step:250/1530 val_loss:4.0160 train_time:39270ms step_avg:163.62ms step:251/1530 train_loss:4.0890 train_time:39389ms step_avg:163.44ms step:252/1530 train_loss:4.1650 train_time:39558ms step_avg:163.46ms step:253/1530 train_loss:3.9453 train_time:39725ms step_avg:163.48ms step:254/1530 train_loss:3.8814 train_time:39891ms step_avg:163.49ms step:255/1530 train_loss:4.0827 train_time:40057ms step_avg:163.50ms step:256/1530 train_loss:3.9994 train_time:40224ms step_avg:163.51ms step:257/1530 train_loss:3.9979 train_time:40390ms step_avg:163.52ms step:258/1530 train_loss:3.9901 train_time:40555ms step_avg:163.53ms step:259/1530 train_loss:4.0388 train_time:40722ms step_avg:163.54ms step:260/1530 train_loss:4.0630 train_time:40888ms step_avg:163.55ms step:261/1530 train_loss:4.0301 train_time:41054ms step_avg:163.56ms step:262/1530 train_loss:4.0037 train_time:41221ms step_avg:163.57ms step:263/1530 train_loss:3.9028 train_time:41386ms step_avg:163.58ms step:264/1530 train_loss:3.9982 train_time:41552ms step_avg:163.59ms step:265/1530 train_loss:3.8820 train_time:41721ms step_avg:163.61ms step:266/1530 train_loss:3.9367 train_time:41887ms step_avg:163.62ms step:267/1530 train_loss:3.9361 train_time:42052ms step_avg:163.63ms step:268/1530 train_loss:3.9650 train_time:42219ms step_avg:163.64ms step:269/1530 train_loss:3.8595 train_time:42384ms step_avg:163.65ms step:270/1530 train_loss:4.1062 train_time:42552ms step_avg:163.66ms step:271/1530 train_loss:3.9719 train_time:42718ms step_avg:163.67ms step:272/1530 train_loss:3.9295 train_time:42885ms step_avg:163.68ms step:273/1530 train_loss:3.9509 train_time:43050ms step_avg:163.69ms step:274/1530 train_loss:4.0535 train_time:43215ms step_avg:163.69ms step:275/1530 train_loss:4.0682 train_time:43383ms step_avg:163.71ms step:276/1530 train_loss:4.2398 train_time:43550ms step_avg:163.72ms step:277/1530 train_loss:4.0480 train_time:43718ms step_avg:163.74ms step:278/1530 train_loss:4.0966 train_time:43884ms step_avg:163.75ms step:279/1530 train_loss:4.0086 train_time:44049ms step_avg:163.75ms step:280/1530 train_loss:4.2110 train_time:44218ms step_avg:163.77ms step:281/1530 train_loss:3.9840 train_time:44384ms step_avg:163.78ms step:282/1530 train_loss:3.9580 train_time:44549ms step_avg:163.78ms step:283/1530 train_loss:3.9169 train_time:44717ms step_avg:163.80ms step:284/1530 train_loss:4.0546 train_time:44884ms step_avg:163.81ms step:285/1530 train_loss:4.0715 train_time:45049ms step_avg:163.81ms step:286/1530 train_loss:4.0954 train_time:45215ms step_avg:163.82ms step:287/1530 train_loss:3.9126 train_time:45380ms step_avg:163.83ms step:288/1530 train_loss:4.0102 train_time:45545ms step_avg:163.83ms step:289/1530 train_loss:3.8784 train_time:45710ms step_avg:163.83ms step:290/1530 train_loss:3.8635 train_time:45873ms step_avg:163.83ms step:291/1530 train_loss:3.9165 train_time:46039ms step_avg:163.84ms step:292/1530 train_loss:3.8740 train_time:46205ms step_avg:163.85ms step:293/1530 train_loss:3.9066 train_time:46370ms step_avg:163.85ms step:294/1530 train_loss:3.9474 train_time:46535ms step_avg:163.85ms step:295/1530 train_loss:3.8426 train_time:46701ms step_avg:163.86ms step:296/1530 train_loss:3.8700 train_time:46865ms step_avg:163.86ms step:297/1530 train_loss:3.8698 train_time:47031ms step_avg:163.87ms step:298/1530 train_loss:3.9784 train_time:47197ms step_avg:163.88ms step:299/1530 train_loss:3.8276 train_time:47362ms step_avg:163.88ms step:300/1530 train_loss:3.9823 train_time:47526ms step_avg:163.88ms step:301/1530 train_loss:3.9709 train_time:47690ms step_avg:163.88ms step:302/1530 train_loss:3.9367 train_time:47856ms step_avg:163.89ms step:303/1530 train_loss:3.9864 train_time:48022ms step_avg:163.90ms step:304/1530 train_loss:3.9786 train_time:48188ms step_avg:163.90ms step:305/1530 train_loss:4.4736 train_time:48353ms step_avg:163.91ms step:306/1530 train_loss:3.9513 train_time:48519ms step_avg:163.92ms step:307/1530 train_loss:3.8417 train_time:48684ms step_avg:163.92ms step:308/1530 train_loss:3.9880 train_time:48848ms step_avg:163.92ms step:309/1530 train_loss:3.8738 train_time:49015ms step_avg:163.93ms step:310/1530 train_loss:4.0877 train_time:49180ms step_avg:163.93ms step:311/1530 train_loss:3.9289 train_time:49345ms step_avg:163.94ms step:312/1530 train_loss:3.8740 train_time:49509ms step_avg:163.94ms step:313/1530 train_loss:3.9556 train_time:49675ms step_avg:163.95ms step:314/1530 train_loss:4.0662 train_time:49841ms step_avg:163.95ms step:315/1530 train_loss:3.9544 train_time:50006ms step_avg:163.95ms step:316/1530 train_loss:3.8041 train_time:50170ms step_avg:163.95ms step:317/1530 train_loss:3.8839 train_time:50334ms step_avg:163.96ms step:318/1530 train_loss:3.9356 train_time:50501ms step_avg:163.96ms step:319/1530 train_loss:3.9049 train_time:50665ms step_avg:163.96ms step:320/1530 train_loss:4.0219 train_time:50830ms step_avg:163.97ms step:321/1530 train_loss:3.9701 train_time:50995ms step_avg:163.97ms step:322/1530 train_loss:3.9420 train_time:51160ms step_avg:163.97ms step:323/1530 train_loss:4.0152 train_time:51326ms step_avg:163.98ms step:324/1530 train_loss:3.9498 train_time:51491ms step_avg:163.98ms step:325/1530 train_loss:4.0193 train_time:51657ms step_avg:163.99ms step:326/1530 train_loss:3.8983 train_time:51823ms step_avg:164.00ms step:327/1530 train_loss:4.3970 train_time:51988ms step_avg:164.00ms step:328/1530 train_loss:4.0828 train_time:52154ms step_avg:164.01ms step:329/1530 train_loss:3.8089 train_time:52321ms step_avg:164.01ms step:330/1530 train_loss:3.7563 train_time:52486ms step_avg:164.02ms step:331/1530 train_loss:3.9819 train_time:52652ms step_avg:164.02ms step:332/1530 train_loss:3.9217 train_time:52818ms step_avg:164.03ms step:333/1530 train_loss:3.8941 train_time:52982ms step_avg:164.03ms step:334/1530 train_loss:3.8467 train_time:53146ms step_avg:164.03ms step:335/1530 train_loss:4.0144 train_time:53312ms step_avg:164.04ms step:336/1530 train_loss:3.9641 train_time:53476ms step_avg:164.04ms step:337/1530 train_loss:4.4311 train_time:53643ms step_avg:164.05ms step:338/1530 train_loss:3.9495 train_time:53808ms step_avg:164.05ms step:339/1530 train_loss:3.8790 train_time:53973ms step_avg:164.05ms step:340/1530 train_loss:3.9379 train_time:54137ms step_avg:164.05ms step:341/1530 train_loss:3.8605 train_time:54304ms step_avg:164.06ms step:342/1530 train_loss:3.8093 train_time:54472ms step_avg:164.07ms step:343/1530 train_loss:3.8446 train_time:54641ms step_avg:164.09ms step:344/1530 train_loss:3.9983 train_time:54809ms step_avg:164.10ms step:345/1530 train_loss:3.8247 train_time:54979ms step_avg:164.12ms step:346/1530 train_loss:3.7734 train_time:55146ms step_avg:164.13ms step:347/1530 train_loss:3.8084 train_time:55315ms step_avg:164.14ms step:348/1530 train_loss:3.8651 train_time:55483ms step_avg:164.15ms step:349/1530 train_loss:3.8411 train_time:55653ms step_avg:164.17ms step:350/1530 train_loss:3.5729 train_time:55821ms step_avg:164.18ms step:351/1530 train_loss:3.8359 train_time:55989ms step_avg:164.19ms step:352/1530 train_loss:4.1869 train_time:56158ms step_avg:164.21ms step:353/1530 train_loss:3.6582 train_time:56326ms step_avg:164.22ms step:354/1530 train_loss:3.9367 train_time:56494ms step_avg:164.23ms step:355/1530 train_loss:3.7963 train_time:56663ms step_avg:164.24ms step:356/1530 train_loss:3.8884 train_time:56831ms step_avg:164.25ms step:357/1530 train_loss:3.7689 train_time:57001ms step_avg:164.27ms step:358/1530 train_loss:3.8671 train_time:57168ms step_avg:164.28ms step:359/1530 train_loss:3.7650 train_time:57337ms step_avg:164.29ms step:360/1530 train_loss:3.4359 train_time:57506ms step_avg:164.30ms step:361/1530 train_loss:4.0234 train_time:57675ms step_avg:164.32ms step:362/1530 train_loss:3.9224 train_time:57843ms step_avg:164.33ms step:363/1530 train_loss:3.8480 train_time:58010ms step_avg:164.34ms step:364/1530 train_loss:3.7531 train_time:58179ms step_avg:164.35ms step:365/1530 train_loss:3.9164 train_time:58346ms step_avg:164.36ms step:366/1530 train_loss:3.8678 train_time:58514ms step_avg:164.37ms step:367/1530 train_loss:3.8626 train_time:58683ms step_avg:164.38ms step:368/1530 train_loss:3.8575 train_time:58849ms step_avg:164.38ms step:369/1530 train_loss:3.7557 train_time:59019ms step_avg:164.40ms step:370/1530 train_loss:3.8866 train_time:59188ms step_avg:164.41ms step:371/1530 train_loss:3.7411 train_time:59356ms step_avg:164.42ms step:372/1530 train_loss:3.7018 train_time:59526ms step_avg:164.44ms step:373/1530 train_loss:3.9226 train_time:59693ms step_avg:164.44ms step:374/1530 train_loss:3.8387 train_time:59861ms step_avg:164.45ms step:375/1530 train_loss:3.8092 train_time:60029ms step_avg:164.46ms step:375/1530 val_loss:3.8305 train_time:60078ms step_avg:164.60ms step:376/1530 train_loss:3.8693 train_time:60198ms step_avg:164.48ms step:377/1530 train_loss:3.8015 train_time:60499ms step_avg:164.85ms step:378/1530 train_loss:3.8565 train_time:60676ms step_avg:164.88ms step:379/1530 train_loss:3.8785 train_time:60995ms step_avg:165.30ms step:380/1530 train_loss:3.9587 train_time:61164ms step_avg:165.31ms step:381/1530 train_loss:3.8451 train_time:61333ms step_avg:165.32ms step:382/1530 train_loss:3.8076 train_time:61501ms step_avg:165.32ms step:383/1530 train_loss:3.8054 train_time:61670ms step_avg:165.34ms step:384/1530 train_loss:3.8784 train_time:61837ms step_avg:165.34ms step:385/1530 train_loss:3.7936 train_time:62007ms step_avg:165.35ms step:386/1530 train_loss:3.8952 train_time:62174ms step_avg:165.36ms step:387/1530 train_loss:4.0634 train_time:62342ms step_avg:165.36ms step:388/1530 train_loss:3.7991 train_time:62509ms step_avg:165.37ms step:389/1530 train_loss:3.7975 train_time:62677ms step_avg:165.37ms step:390/1530 train_loss:3.9025 train_time:62846ms step_avg:165.39ms step:391/1530 train_loss:3.8188 train_time:63014ms step_avg:165.39ms step:392/1530 train_loss:3.9315 train_time:63181ms step_avg:165.40ms step:393/1530 train_loss:3.7698 train_time:63350ms step_avg:165.40ms step:394/1530 train_loss:3.8895 train_time:63517ms step_avg:165.41ms step:395/1530 train_loss:3.6399 train_time:63687ms step_avg:165.42ms step:396/1530 train_loss:3.8447 train_time:63854ms step_avg:165.43ms step:397/1530 train_loss:3.8693 train_time:64022ms step_avg:165.43ms step:398/1530 train_loss:3.8854 train_time:64190ms step_avg:165.44ms step:399/1530 train_loss:3.7760 train_time:64358ms step_avg:165.44ms step:400/1530 train_loss:3.8339 train_time:64526ms step_avg:165.45ms step:401/1530 train_loss:3.9145 train_time:64693ms step_avg:165.45ms step:402/1530 train_loss:3.8513 train_time:64860ms step_avg:165.46ms step:403/1530 train_loss:3.9663 train_time:65028ms step_avg:165.46ms step:404/1530 train_loss:3.6923 train_time:65195ms step_avg:165.47ms step:405/1530 train_loss:3.7900 train_time:65363ms step_avg:165.47ms step:406/1530 train_loss:4.0993 train_time:65530ms step_avg:165.48ms step:407/1530 train_loss:3.7835 train_time:65697ms step_avg:165.48ms step:408/1530 train_loss:3.8357 train_time:65865ms step_avg:165.49ms step:409/1530 train_loss:3.8571 train_time:66032ms step_avg:165.49ms step:410/1530 train_loss:3.7654 train_time:66198ms step_avg:165.50ms step:411/1530 train_loss:3.7744 train_time:66366ms step_avg:165.50ms step:412/1530 train_loss:4.1859 train_time:66533ms step_avg:165.50ms step:413/1530 train_loss:3.6323 train_time:66700ms step_avg:165.51ms step:414/1530 train_loss:4.0238 train_time:66868ms step_avg:165.51ms step:415/1530 train_loss:3.7673 train_time:67034ms step_avg:165.52ms step:416/1530 train_loss:3.7672 train_time:67202ms step_avg:165.52ms step:417/1530 train_loss:3.9592 train_time:67369ms step_avg:165.53ms step:418/1530 train_loss:3.6909 train_time:67537ms step_avg:165.53ms step:419/1530 train_loss:3.8093 train_time:67705ms step_avg:165.54ms step:420/1530 train_loss:3.7105 train_time:67871ms step_avg:165.54ms step:421/1530 train_loss:3.6553 train_time:68037ms step_avg:165.54ms step:422/1530 train_loss:3.7858 train_time:68205ms step_avg:165.55ms step:423/1530 train_loss:3.8751 train_time:68373ms step_avg:165.55ms step:424/1530 train_loss:3.6168 train_time:68541ms step_avg:165.56ms step:425/1530 train_loss:3.7986 train_time:68707ms step_avg:165.56ms step:426/1530 train_loss:3.6498 train_time:68876ms step_avg:165.57ms step:427/1530 train_loss:3.8959 train_time:69044ms step_avg:165.57ms step:428/1530 train_loss:3.8142 train_time:69211ms step_avg:165.58ms step:429/1530 train_loss:3.7637 train_time:69379ms step_avg:165.58ms step:430/1530 train_loss:3.7066 train_time:69546ms step_avg:165.58ms step:431/1530 train_loss:3.6336 train_time:69713ms step_avg:165.59ms step:432/1530 train_loss:3.7736 train_time:69879ms step_avg:165.59ms step:433/1530 train_loss:3.8175 train_time:70048ms step_avg:165.60ms step:434/1530 train_loss:3.7816 train_time:70214ms step_avg:165.60ms step:435/1530 train_loss:3.8120 train_time:70381ms step_avg:165.60ms step:436/1530 train_loss:3.8388 train_time:70549ms step_avg:165.61ms step:437/1530 train_loss:3.7310 train_time:70716ms step_avg:165.61ms step:438/1530 train_loss:3.7065 train_time:70882ms step_avg:165.61ms step:439/1530 train_loss:3.7180 train_time:71050ms step_avg:165.62ms step:440/1530 train_loss:3.8922 train_time:71217ms step_avg:165.62ms step:441/1530 train_loss:3.7699 train_time:71385ms step_avg:165.63ms step:442/1530 train_loss:3.7417 train_time:71553ms step_avg:165.63ms step:443/1530 train_loss:3.6304 train_time:71719ms step_avg:165.63ms step:444/1530 train_loss:3.9277 train_time:71886ms step_avg:165.63ms step:445/1530 train_loss:3.8474 train_time:72054ms step_avg:165.64ms step:446/1530 train_loss:3.8358 train_time:72221ms step_avg:165.64ms step:447/1530 train_loss:3.7544 train_time:72388ms step_avg:165.65ms step:448/1530 train_loss:3.8551 train_time:72556ms step_avg:165.65ms step:449/1530 train_loss:3.6932 train_time:72723ms step_avg:165.66ms step:450/1530 train_loss:3.7170 train_time:72890ms step_avg:165.66ms step:451/1530 train_loss:3.5908 train_time:73059ms step_avg:165.67ms step:452/1530 train_loss:3.7187 train_time:73226ms step_avg:165.67ms step:453/1530 train_loss:3.6742 train_time:73393ms step_avg:165.67ms step:454/1530 train_loss:3.6370 train_time:73560ms step_avg:165.68ms step:455/1530 train_loss:3.8510 train_time:73729ms step_avg:165.68ms step:456/1530 train_loss:3.7315 train_time:73898ms step_avg:165.69ms step:457/1530 train_loss:3.7867 train_time:74070ms step_avg:165.70ms step:458/1530 train_loss:3.8309 train_time:74239ms step_avg:165.71ms step:459/1530 train_loss:3.6373 train_time:74409ms step_avg:165.72ms step:460/1530 train_loss:3.7943 train_time:74578ms step_avg:165.73ms step:461/1530 train_loss:3.6923 train_time:74750ms step_avg:165.74ms step:462/1530 train_loss:3.7419 train_time:74920ms step_avg:165.75ms step:463/1530 train_loss:3.7848 train_time:75090ms step_avg:165.76ms step:464/1530 train_loss:3.7207 train_time:75259ms step_avg:165.77ms step:465/1530 train_loss:3.7200 train_time:75428ms step_avg:165.78ms step:466/1530 train_loss:3.8033 train_time:75596ms step_avg:165.78ms step:467/1530 train_loss:3.8302 train_time:75770ms step_avg:165.80ms step:468/1530 train_loss:3.8024 train_time:75939ms step_avg:165.81ms step:469/1530 train_loss:3.6949 train_time:76109ms step_avg:165.82ms step:470/1530 train_loss:3.7748 train_time:76279ms step_avg:165.82ms step:471/1530 train_loss:3.8178 train_time:76449ms step_avg:165.83ms step:472/1530 train_loss:3.7949 train_time:76619ms step_avg:165.84ms step:473/1530 train_loss:3.7166 train_time:76788ms step_avg:165.85ms step:474/1530 train_loss:3.6003 train_time:76958ms step_avg:165.86ms step:475/1530 train_loss:4.0250 train_time:77128ms step_avg:165.87ms step:476/1530 train_loss:3.7621 train_time:77297ms step_avg:165.87ms step:477/1530 train_loss:3.5923 train_time:77469ms step_avg:165.89ms step:478/1530 train_loss:3.8260 train_time:77638ms step_avg:165.89ms step:479/1530 train_loss:3.7749 train_time:77808ms step_avg:165.90ms step:480/1530 train_loss:3.9231 train_time:77978ms step_avg:165.91ms step:481/1530 train_loss:3.7288 train_time:78148ms step_avg:165.92ms step:482/1530 train_loss:3.5356 train_time:78318ms step_avg:165.93ms step:483/1530 train_loss:3.8021 train_time:78487ms step_avg:165.93ms step:484/1530 train_loss:3.6644 train_time:78657ms step_avg:165.94ms step:485/1530 train_loss:3.6628 train_time:78827ms step_avg:165.95ms step:486/1530 train_loss:3.5754 train_time:78996ms step_avg:165.96ms step:487/1530 train_loss:3.6884 train_time:79166ms step_avg:165.97ms step:488/1530 train_loss:3.8845 train_time:79335ms step_avg:165.97ms step:489/1530 train_loss:3.7155 train_time:79505ms step_avg:165.98ms step:490/1530 train_loss:3.6012 train_time:79674ms step_avg:165.99ms step:491/1530 train_loss:3.6181 train_time:79843ms step_avg:165.99ms step:492/1530 train_loss:3.7371 train_time:80012ms step_avg:166.00ms step:493/1530 train_loss:3.5801 train_time:80182ms step_avg:166.01ms step:494/1530 train_loss:3.7025 train_time:80352ms step_avg:166.02ms step:495/1530 train_loss:3.6656 train_time:80523ms step_avg:166.03ms step:496/1530 train_loss:3.5109 train_time:80693ms step_avg:166.04ms step:497/1530 train_loss:3.7485 train_time:80862ms step_avg:166.04ms step:498/1530 train_loss:3.7961 train_time:81031ms step_avg:166.05ms step:499/1530 train_loss:3.8219 train_time:81201ms step_avg:166.06ms step:500/1530 train_loss:3.7403 train_time:81371ms step_avg:166.06ms step:500/1530 val_loss:3.7112 train_time:81420ms step_avg:166.16ms step:501/1530 train_loss:3.8093 train_time:81541ms step_avg:166.07ms step:502/1530 train_loss:3.7564 train_time:81714ms step_avg:166.08ms step:503/1530 train_loss:3.7802 train_time:81883ms step_avg:166.09ms step:504/1530 train_loss:3.7225 train_time:82052ms step_avg:166.10ms step:505/1530 train_loss:3.8077 train_time:82221ms step_avg:166.10ms step:506/1530 train_loss:3.6556 train_time:82393ms step_avg:166.12ms step:507/1530 train_loss:3.7690 train_time:82561ms step_avg:166.12ms step:508/1530 train_loss:3.8295 train_time:82733ms step_avg:166.13ms step:509/1530 train_loss:3.7771 train_time:82902ms step_avg:166.14ms step:510/1530 train_loss:3.5884 train_time:83072ms step_avg:166.14ms step:511/1530 train_loss:3.7849 train_time:83241ms step_avg:166.15ms step:512/1530 train_loss:3.7263 train_time:83414ms step_avg:166.16ms step:513/1530 train_loss:3.6638 train_time:83581ms step_avg:166.17ms step:514/1530 train_loss:3.8730 train_time:83752ms step_avg:166.18ms step:515/1530 train_loss:3.7396 train_time:83920ms step_avg:166.18ms step:516/1530 train_loss:4.0805 train_time:84093ms step_avg:166.19ms step:517/1530 train_loss:3.6986 train_time:84261ms step_avg:166.19ms step:518/1530 train_loss:3.7742 train_time:84429ms step_avg:166.20ms step:519/1530 train_loss:3.6567 train_time:84599ms step_avg:166.21ms step:520/1530 train_loss:3.6858 train_time:84769ms step_avg:166.21ms step:521/1530 train_loss:3.6731 train_time:84938ms step_avg:166.22ms step:522/1530 train_loss:3.6656 train_time:85110ms step_avg:166.23ms step:523/1530 train_loss:4.2860 train_time:85279ms step_avg:166.24ms step:524/1530 train_loss:3.7444 train_time:85447ms step_avg:166.24ms step:525/1530 train_loss:3.6848 train_time:85615ms step_avg:166.24ms step:526/1530 train_loss:3.7047 train_time:85783ms step_avg:166.25ms step:527/1530 train_loss:3.6585 train_time:85954ms step_avg:166.25ms step:528/1530 train_loss:3.6283 train_time:86122ms step_avg:166.26ms step:529/1530 train_loss:3.8543 train_time:86294ms step_avg:166.27ms step:530/1530 train_loss:3.6584 train_time:86463ms step_avg:166.27ms step:531/1530 train_loss:3.9278 train_time:86634ms step_avg:166.28ms step:532/1530 train_loss:3.7396 train_time:86804ms step_avg:166.29ms step:533/1530 train_loss:3.6626 train_time:86974ms step_avg:166.30ms step:534/1530 train_loss:3.6770 train_time:87142ms step_avg:166.30ms step:535/1530 train_loss:3.6117 train_time:87313ms step_avg:166.31ms step:536/1530 train_loss:3.7571 train_time:87482ms step_avg:166.32ms step:537/1530 train_loss:3.7376 train_time:87652ms step_avg:166.32ms step:538/1530 train_loss:3.6321 train_time:87820ms step_avg:166.33ms step:539/1530 train_loss:4.1163 train_time:87992ms step_avg:166.34ms step:540/1530 train_loss:3.6768 train_time:88162ms step_avg:166.34ms step:541/1530 train_loss:3.7907 train_time:88330ms step_avg:166.35ms step:542/1530 train_loss:3.5905 train_time:88499ms step_avg:166.35ms step:543/1530 train_loss:3.5888 train_time:88668ms step_avg:166.36ms step:544/1530 train_loss:3.6400 train_time:88836ms step_avg:166.36ms step:545/1530 train_loss:3.5904 train_time:89005ms step_avg:166.36ms step:546/1530 train_loss:3.6291 train_time:89175ms step_avg:166.37ms step:547/1530 train_loss:3.6458 train_time:89344ms step_avg:166.38ms step:548/1530 train_loss:3.6100 train_time:89515ms step_avg:166.39ms step:549/1530 train_loss:3.7280 train_time:89683ms step_avg:166.39ms step:550/1530 train_loss:3.6190 train_time:89852ms step_avg:166.39ms step:551/1530 train_loss:3.6397 train_time:90021ms step_avg:166.40ms step:552/1530 train_loss:3.9370 train_time:90192ms step_avg:166.41ms step:553/1530 train_loss:3.7629 train_time:90361ms step_avg:166.41ms step:554/1530 train_loss:3.7150 train_time:90530ms step_avg:166.42ms step:555/1530 train_loss:3.6339 train_time:90699ms step_avg:166.42ms step:556/1530 train_loss:3.6992 train_time:90867ms step_avg:166.42ms step:557/1530 train_loss:3.3144 train_time:91036ms step_avg:166.43ms step:558/1530 train_loss:3.6155 train_time:91205ms step_avg:166.43ms step:559/1530 train_loss:3.6540 train_time:91374ms step_avg:166.44ms step:560/1530 train_loss:3.6947 train_time:91543ms step_avg:166.44ms step:561/1530 train_loss:3.6163 train_time:91711ms step_avg:166.45ms step:562/1530 train_loss:3.5566 train_time:91880ms step_avg:166.45ms step:563/1530 train_loss:3.7567 train_time:92050ms step_avg:166.46ms step:564/1530 train_loss:3.5807 train_time:92219ms step_avg:166.46ms step:565/1530 train_loss:3.6824 train_time:92389ms step_avg:166.47ms step:566/1530 train_loss:3.6195 train_time:92691ms step_avg:166.71ms step:567/1530 train_loss:3.6081 train_time:92871ms step_avg:166.73ms step:568/1530 train_loss:3.6884 train_time:93040ms step_avg:166.74ms step:569/1530 train_loss:3.6530 train_time:93358ms step_avg:167.01ms step:570/1530 train_loss:3.6943 train_time:93539ms step_avg:167.03ms step:571/1530 train_loss:3.7652 train_time:93709ms step_avg:167.04ms step:572/1530 train_loss:3.7328 train_time:93880ms step_avg:167.05ms step:573/1530 train_loss:3.7367 train_time:94055ms step_avg:167.06ms step:574/1530 train_loss:3.7841 train_time:94226ms step_avg:167.07ms step:575/1530 train_loss:3.7299 train_time:94398ms step_avg:167.08ms step:576/1530 train_loss:3.7568 train_time:94569ms step_avg:167.08ms step:577/1530 train_loss:3.6743 train_time:94740ms step_avg:167.09ms step:578/1530 train_loss:3.6781 train_time:94914ms step_avg:167.10ms step:579/1530 train_loss:3.6786 train_time:95085ms step_avg:167.11ms step:580/1530 train_loss:3.5938 train_time:95255ms step_avg:167.11ms step:581/1530 train_loss:3.6440 train_time:95427ms step_avg:167.12ms step:582/1530 train_loss:3.8585 train_time:95598ms step_avg:167.13ms step:583/1530 train_loss:3.6319 train_time:95772ms step_avg:167.14ms step:584/1530 train_loss:3.5958 train_time:95941ms step_avg:167.15ms step:585/1530 train_loss:3.7931 train_time:96114ms step_avg:167.16ms step:586/1530 train_loss:3.5222 train_time:96283ms step_avg:167.16ms step:587/1530 train_loss:3.6727 train_time:96455ms step_avg:167.17ms step:588/1530 train_loss:3.6431 train_time:96625ms step_avg:167.17ms step:589/1530 train_loss:4.0047 train_time:96798ms step_avg:167.18ms step:590/1530 train_loss:3.7803 train_time:96969ms step_avg:167.19ms step:591/1530 train_loss:3.5157 train_time:97139ms step_avg:167.19ms step:592/1530 train_loss:3.5393 train_time:97314ms step_avg:167.21ms step:593/1530 train_loss:3.5070 train_time:97485ms step_avg:167.21ms step:594/1530 train_loss:3.5573 train_time:97656ms step_avg:167.22ms step:595/1530 train_loss:3.9166 train_time:97827ms step_avg:167.23ms step:596/1530 train_loss:3.6549 train_time:98000ms step_avg:167.23ms step:597/1530 train_loss:3.5913 train_time:98171ms step_avg:167.24ms step:598/1530 train_loss:3.6639 train_time:98341ms step_avg:167.25ms step:599/1530 train_loss:3.4817 train_time:98513ms step_avg:167.26ms step:600/1530 train_loss:3.6065 train_time:98683ms step_avg:167.26ms step:601/1530 train_loss:3.6514 train_time:98856ms step_avg:167.27ms step:602/1530 train_loss:3.6760 train_time:99030ms step_avg:167.28ms step:603/1530 train_loss:3.7833 train_time:99201ms step_avg:167.29ms step:604/1530 train_loss:3.6160 train_time:99374ms step_avg:167.30ms step:605/1530 train_loss:3.6180 train_time:99546ms step_avg:167.30ms step:606/1530 train_loss:3.5796 train_time:99719ms step_avg:167.31ms step:607/1530 train_loss:3.8409 train_time:99891ms step_avg:167.32ms step:608/1530 train_loss:3.6402 train_time:100061ms step_avg:167.33ms step:609/1530 train_loss:3.6235 train_time:100233ms step_avg:167.33ms step:610/1530 train_loss:3.7085 train_time:100403ms step_avg:167.34ms step:611/1530 train_loss:3.6028 train_time:100574ms step_avg:167.35ms step:612/1530 train_loss:3.5773 train_time:100745ms step_avg:167.35ms step:613/1530 train_loss:3.7626 train_time:100916ms step_avg:167.36ms step:614/1530 train_loss:3.7075 train_time:101088ms step_avg:167.36ms step:615/1530 train_loss:3.7125 train_time:101258ms step_avg:167.37ms step:616/1530 train_loss:3.6375 train_time:101429ms step_avg:167.37ms step:617/1530 train_loss:3.5602 train_time:101603ms step_avg:167.39ms step:618/1530 train_loss:3.6914 train_time:101774ms step_avg:167.39ms step:619/1530 train_loss:3.5537 train_time:101945ms step_avg:167.40ms step:620/1530 train_loss:3.5956 train_time:102117ms step_avg:167.40ms step:621/1530 train_loss:3.9304 train_time:102289ms step_avg:167.41ms step:622/1530 train_loss:3.5763 train_time:102460ms step_avg:167.42ms step:623/1530 train_loss:3.6107 train_time:102634ms step_avg:167.43ms step:624/1530 train_loss:3.6953 train_time:102806ms step_avg:167.44ms step:625/1530 train_loss:3.7107 train_time:102975ms step_avg:167.44ms step:625/1530 val_loss:3.6283 train_time:103025ms step_avg:167.52ms step:626/1530 train_loss:3.7433 train_time:103148ms step_avg:167.45ms step:627/1530 train_loss:3.7232 train_time:103321ms step_avg:167.46ms step:628/1530 train_loss:3.7641 train_time:103492ms step_avg:167.46ms step:629/1530 train_loss:3.5991 train_time:103664ms step_avg:167.47ms step:630/1530 train_loss:3.7268 train_time:103834ms step_avg:167.47ms step:631/1530 train_loss:3.7503 train_time:104006ms step_avg:167.48ms step:632/1530 train_loss:3.6551 train_time:104178ms step_avg:167.49ms step:633/1530 train_loss:3.6101 train_time:104349ms step_avg:167.49ms step:634/1530 train_loss:3.7004 train_time:104519ms step_avg:167.50ms step:635/1530 train_loss:3.9589 train_time:104688ms step_avg:167.50ms step:636/1530 train_loss:3.5548 train_time:104860ms step_avg:167.51ms step:637/1530 train_loss:3.3554 train_time:105032ms step_avg:167.51ms step:638/1530 train_loss:3.5911 train_time:105204ms step_avg:167.52ms step:639/1530 train_loss:3.6387 train_time:105374ms step_avg:167.53ms step:640/1530 train_loss:3.5760 train_time:105547ms step_avg:167.53ms step:641/1530 train_loss:3.5940 train_time:105716ms step_avg:167.54ms step:642/1530 train_loss:3.6335 train_time:105887ms step_avg:167.54ms step:643/1530 train_loss:3.5991 train_time:106057ms step_avg:167.55ms step:644/1530 train_loss:3.5632 train_time:106228ms step_avg:167.55ms step:645/1530 train_loss:3.7785 train_time:106398ms step_avg:167.56ms step:646/1530 train_loss:3.6758 train_time:106569ms step_avg:167.56ms step:647/1530 train_loss:3.6623 train_time:106740ms step_avg:167.57ms step:648/1530 train_loss:3.7178 train_time:106912ms step_avg:167.57ms step:649/1530 train_loss:3.7689 train_time:107083ms step_avg:167.58ms step:650/1530 train_loss:3.6215 train_time:107254ms step_avg:167.58ms step:651/1530 train_loss:3.7775 train_time:107426ms step_avg:167.59ms step:652/1530 train_loss:3.5933 train_time:107596ms step_avg:167.59ms step:653/1530 train_loss:3.6598 train_time:107767ms step_avg:167.60ms step:654/1530 train_loss:3.4316 train_time:107940ms step_avg:167.61ms step:655/1530 train_loss:3.5875 train_time:108109ms step_avg:167.61ms step:656/1530 train_loss:3.5807 train_time:108279ms step_avg:167.62ms step:657/1530 train_loss:3.5038 train_time:108450ms step_avg:167.62ms step:658/1530 train_loss:3.6913 train_time:108621ms step_avg:167.63ms step:659/1530 train_loss:3.5904 train_time:108792ms step_avg:167.63ms step:660/1530 train_loss:3.6909 train_time:108963ms step_avg:167.64ms step:661/1530 train_loss:3.7556 train_time:109135ms step_avg:167.64ms step:662/1530 train_loss:3.6761 train_time:109306ms step_avg:167.65ms step:663/1530 train_loss:3.5617 train_time:109475ms step_avg:167.65ms step:664/1530 train_loss:3.6151 train_time:109648ms step_avg:167.66ms step:665/1530 train_loss:3.4972 train_time:109819ms step_avg:167.66ms step:666/1530 train_loss:3.7859 train_time:109989ms step_avg:167.67ms step:667/1530 train_loss:3.6087 train_time:110162ms step_avg:167.67ms step:668/1530 train_loss:3.6478 train_time:110333ms step_avg:167.68ms step:669/1530 train_loss:3.4890 train_time:110506ms step_avg:167.69ms step:670/1530 train_loss:3.6036 train_time:110676ms step_avg:167.69ms step:671/1530 train_loss:3.5645 train_time:110848ms step_avg:167.70ms step:672/1530 train_loss:3.5717 train_time:111020ms step_avg:167.70ms step:673/1530 train_loss:3.8518 train_time:111190ms step_avg:167.71ms step:674/1530 train_loss:3.6311 train_time:111360ms step_avg:167.71ms step:675/1530 train_loss:3.7172 train_time:111532ms step_avg:167.72ms step:676/1530 train_loss:3.4953 train_time:111703ms step_avg:167.72ms step:677/1530 train_loss:3.6060 train_time:111873ms step_avg:167.73ms step:678/1530 train_loss:3.5571 train_time:112044ms step_avg:167.73ms step:679/1530 train_loss:3.6826 train_time:112215ms step_avg:167.73ms step:680/1530 train_loss:3.5875 train_time:112386ms step_avg:167.74ms step:681/1530 train_loss:3.6238 train_time:112558ms step_avg:167.75ms step:682/1530 train_loss:3.6654 train_time:112735ms step_avg:167.76ms step:683/1530 train_loss:3.7447 train_time:112908ms step_avg:167.77ms step:684/1530 train_loss:3.6499 train_time:113078ms step_avg:167.77ms step:685/1530 train_loss:3.6910 train_time:113254ms step_avg:167.78ms step:686/1530 train_loss:3.6447 train_time:113428ms step_avg:167.79ms step:687/1530 train_loss:3.6714 train_time:113600ms step_avg:167.80ms step:688/1530 train_loss:3.2120 train_time:113775ms step_avg:167.81ms step:689/1530 train_loss:3.4054 train_time:113950ms step_avg:167.82ms step:690/1530 train_loss:3.5451 train_time:114124ms step_avg:167.83ms step:691/1530 train_loss:3.4177 train_time:114295ms step_avg:167.83ms step:692/1530 train_loss:3.6287 train_time:114467ms step_avg:167.84ms step:693/1530 train_loss:3.6530 train_time:114640ms step_avg:167.85ms step:694/1530 train_loss:3.5588 train_time:114811ms step_avg:167.85ms step:695/1530 train_loss:3.5374 train_time:114982ms step_avg:167.86ms step:696/1530 train_loss:3.8522 train_time:115155ms step_avg:167.86ms step:697/1530 train_loss:3.5881 train_time:115329ms step_avg:167.87ms step:698/1530 train_loss:3.6457 train_time:115500ms step_avg:167.88ms step:699/1530 train_loss:3.7784 train_time:115673ms step_avg:167.89ms step:700/1530 train_loss:3.5726 train_time:115846ms step_avg:167.89ms step:701/1530 train_loss:3.5472 train_time:116016ms step_avg:167.90ms step:702/1530 train_loss:3.5202 train_time:116190ms step_avg:167.90ms step:703/1530 train_loss:3.5071 train_time:116360ms step_avg:167.91ms step:704/1530 train_loss:3.5752 train_time:116533ms step_avg:167.92ms step:705/1530 train_loss:3.5676 train_time:116709ms step_avg:167.93ms step:706/1530 train_loss:3.5892 train_time:116885ms step_avg:167.94ms step:707/1530 train_loss:3.6518 train_time:117059ms step_avg:167.95ms step:708/1530 train_loss:3.6088 train_time:117233ms step_avg:167.96ms step:709/1530 train_loss:3.5858 train_time:117408ms step_avg:167.97ms step:710/1530 train_loss:3.5451 train_time:117579ms step_avg:167.97ms step:711/1530 train_loss:3.5950 train_time:117753ms step_avg:167.98ms step:712/1530 train_loss:3.6509 train_time:117930ms step_avg:167.99ms step:713/1530 train_loss:3.6627 train_time:118107ms step_avg:168.00ms step:714/1530 train_loss:3.5665 train_time:118279ms step_avg:168.01ms step:715/1530 train_loss:3.5726 train_time:118451ms step_avg:168.02ms step:716/1530 train_loss:3.5976 train_time:118623ms step_avg:168.02ms step:717/1530 train_loss:3.7115 train_time:118796ms step_avg:168.03ms step:718/1530 train_loss:3.6016 train_time:118967ms step_avg:168.03ms step:719/1530 train_loss:3.6820 train_time:119140ms step_avg:168.04ms step:720/1530 train_loss:3.8550 train_time:119314ms step_avg:168.05ms step:721/1530 train_loss:3.4712 train_time:119487ms step_avg:168.05ms step:722/1530 train_loss:3.7421 train_time:119658ms step_avg:168.06ms step:723/1530 train_loss:3.7702 train_time:119830ms step_avg:168.07ms step:724/1530 train_loss:3.5717 train_time:120004ms step_avg:168.07ms step:725/1530 train_loss:3.6559 train_time:120177ms step_avg:168.08ms step:726/1530 train_loss:3.5349 train_time:120352ms step_avg:168.09ms step:727/1530 train_loss:3.5827 train_time:120529ms step_avg:168.10ms step:728/1530 train_loss:3.7321 train_time:120701ms step_avg:168.11ms step:729/1530 train_loss:3.6766 train_time:120872ms step_avg:168.11ms step:730/1530 train_loss:3.6654 train_time:121047ms step_avg:168.12ms step:731/1530 train_loss:3.5583 train_time:121219ms step_avg:168.13ms step:732/1530 train_loss:3.5966 train_time:121390ms step_avg:168.13ms step:733/1530 train_loss:3.8346 train_time:121564ms step_avg:168.14ms step:734/1530 train_loss:3.5675 train_time:121738ms step_avg:168.15ms step:735/1530 train_loss:3.6276 train_time:121911ms step_avg:168.15ms step:736/1530 train_loss:3.7374 train_time:122083ms step_avg:168.16ms step:737/1530 train_loss:3.6806 train_time:122254ms step_avg:168.16ms step:738/1530 train_loss:3.6040 train_time:122427ms step_avg:168.17ms step:739/1530 train_loss:3.5038 train_time:122597ms step_avg:168.17ms step:740/1530 train_loss:4.1237 train_time:122774ms step_avg:168.18ms step:741/1530 train_loss:3.4919 train_time:122948ms step_avg:168.19ms step:742/1530 train_loss:3.5602 train_time:123120ms step_avg:168.20ms step:743/1530 train_loss:3.5847 train_time:123292ms step_avg:168.20ms step:744/1530 train_loss:3.6519 train_time:123466ms step_avg:168.21ms step:745/1530 train_loss:3.5916 train_time:123639ms step_avg:168.22ms step:746/1530 train_loss:3.5935 train_time:123811ms step_avg:168.22ms step:747/1530 train_loss:3.6517 train_time:123986ms step_avg:168.23ms step:748/1530 train_loss:3.5682 train_time:124161ms step_avg:168.24ms step:749/1530 train_loss:3.5679 train_time:124334ms step_avg:168.25ms step:750/1530 train_loss:3.5992 train_time:124505ms step_avg:168.25ms step:750/1530 val_loss:3.5726 train_time:124553ms step_avg:168.32ms step:751/1530 train_loss:3.5800 train_time:124678ms step_avg:168.26ms step:752/1530 train_loss:3.6206 train_time:124849ms step_avg:168.26ms step:753/1530 train_loss:3.6244 train_time:125022ms step_avg:168.27ms step:754/1530 train_loss:3.5972 train_time:125195ms step_avg:168.27ms step:755/1530 train_loss:3.6867 train_time:125501ms step_avg:168.46ms step:756/1530 train_loss:3.4629 train_time:125684ms step_avg:168.48ms step:757/1530 train_loss:3.7378 train_time:125857ms step_avg:168.48ms step:758/1530 train_loss:3.6525 train_time:126028ms step_avg:168.49ms step:759/1530 train_loss:3.5975 train_time:126343ms step_avg:168.68ms step:760/1530 train_loss:3.7108 train_time:126513ms step_avg:168.68ms step:761/1530 train_loss:3.4081 train_time:126686ms step_avg:168.69ms step:762/1530 train_loss:3.5525 train_time:126859ms step_avg:168.70ms step:763/1530 train_loss:3.6645 train_time:127032ms step_avg:168.70ms step:764/1530 train_loss:3.3231 train_time:127204ms step_avg:168.71ms step:765/1530 train_loss:3.7375 train_time:127377ms step_avg:168.71ms step:766/1530 train_loss:3.5755 train_time:127551ms step_avg:168.72ms step:767/1530 train_loss:3.5691 train_time:127723ms step_avg:168.72ms step:768/1530 train_loss:3.5757 train_time:127897ms step_avg:168.73ms step:769/1530 train_loss:3.5869 train_time:128070ms step_avg:168.73ms step:770/1530 train_loss:3.6442 train_time:128241ms step_avg:168.74ms step:771/1530 train_loss:3.8912 train_time:128415ms step_avg:168.74ms step:772/1530 train_loss:3.4555 train_time:128586ms step_avg:168.75ms step:773/1530 train_loss:3.6374 train_time:128758ms step_avg:168.75ms step:774/1530 train_loss:3.6463 train_time:128929ms step_avg:168.76ms step:775/1530 train_loss:3.6080 train_time:129100ms step_avg:168.76ms step:776/1530 train_loss:3.4120 train_time:129275ms step_avg:168.77ms step:777/1530 train_loss:3.3888 train_time:129447ms step_avg:168.77ms step:778/1530 train_loss:3.4985 train_time:129619ms step_avg:168.77ms step:779/1530 train_loss:3.5861 train_time:129792ms step_avg:168.78ms step:780/1530 train_loss:3.5914 train_time:129964ms step_avg:168.78ms step:781/1530 train_loss:3.6726 train_time:130136ms step_avg:168.79ms step:782/1530 train_loss:3.5957 train_time:130309ms step_avg:168.79ms step:783/1530 train_loss:3.5745 train_time:130480ms step_avg:168.80ms step:784/1530 train_loss:3.6183 train_time:130654ms step_avg:168.80ms step:785/1530 train_loss:3.5644 train_time:130826ms step_avg:168.81ms step:786/1530 train_loss:3.4425 train_time:130999ms step_avg:168.81ms step:787/1530 train_loss:3.7239 train_time:131170ms step_avg:168.82ms step:788/1530 train_loss:3.5039 train_time:131342ms step_avg:168.82ms step:789/1530 train_loss:3.5530 train_time:131515ms step_avg:168.83ms step:790/1530 train_loss:3.6317 train_time:131689ms step_avg:168.83ms step:791/1530 train_loss:3.7757 train_time:131865ms step_avg:168.84ms step:792/1530 train_loss:3.7653 train_time:132037ms step_avg:168.85ms step:793/1530 train_loss:3.4574 train_time:132208ms step_avg:168.85ms step:794/1530 train_loss:3.5995 train_time:132381ms step_avg:168.85ms step:795/1530 train_loss:3.6778 train_time:132557ms step_avg:168.86ms step:796/1530 train_loss:3.7554 train_time:132734ms step_avg:168.87ms step:797/1530 train_loss:3.5261 train_time:132908ms step_avg:168.88ms step:798/1530 train_loss:3.6519 train_time:133082ms step_avg:168.89ms step:799/1530 train_loss:3.5452 train_time:133260ms step_avg:168.90ms step:800/1530 train_loss:3.5340 train_time:133433ms step_avg:168.90ms step:801/1530 train_loss:3.6266 train_time:133606ms step_avg:168.91ms step:802/1530 train_loss:3.5006 train_time:133783ms step_avg:168.92ms step:803/1530 train_loss:3.4867 train_time:133957ms step_avg:168.92ms step:804/1530 train_loss:3.6226 train_time:134130ms step_avg:168.93ms step:805/1530 train_loss:3.5163 train_time:134306ms step_avg:168.94ms step:806/1530 train_loss:3.5680 train_time:134480ms step_avg:168.94ms step:807/1530 train_loss:3.6561 train_time:134654ms step_avg:168.95ms step:808/1530 train_loss:3.5438 train_time:134830ms step_avg:168.96ms step:809/1530 train_loss:3.4954 train_time:135003ms step_avg:168.96ms step:810/1530 train_loss:3.5669 train_time:135177ms step_avg:168.97ms step:811/1530 train_loss:3.5821 train_time:135351ms step_avg:168.98ms step:812/1530 train_loss:3.6060 train_time:135524ms step_avg:168.98ms step:813/1530 train_loss:3.6304 train_time:135696ms step_avg:168.99ms step:814/1530 train_loss:3.5699 train_time:135872ms step_avg:168.99ms step:815/1530 train_loss:3.5689 train_time:136045ms step_avg:169.00ms step:816/1530 train_loss:3.6878 train_time:136219ms step_avg:169.01ms step:817/1530 train_loss:3.7733 train_time:136393ms step_avg:169.01ms step:818/1530 train_loss:3.5267 train_time:136566ms step_avg:169.02ms step:819/1530 train_loss:3.7201 train_time:136740ms step_avg:169.02ms step:820/1530 train_loss:3.5003 train_time:136916ms step_avg:169.03ms step:821/1530 train_loss:3.5614 train_time:137088ms step_avg:169.04ms step:822/1530 train_loss:3.6996 train_time:137265ms step_avg:169.05ms step:823/1530 train_loss:3.5792 train_time:137439ms step_avg:169.05ms step:824/1530 train_loss:3.5203 train_time:137612ms step_avg:169.06ms step:825/1530 train_loss:3.6219 train_time:137786ms step_avg:169.06ms step:826/1530 train_loss:3.4904 train_time:137961ms step_avg:169.07ms step:827/1530 train_loss:3.7388 train_time:138136ms step_avg:169.08ms step:828/1530 train_loss:3.6288 train_time:138309ms step_avg:169.08ms step:829/1530 train_loss:3.6316 train_time:138484ms step_avg:169.09ms step:830/1530 train_loss:3.5444 train_time:138658ms step_avg:169.10ms step:831/1530 train_loss:3.6085 train_time:138830ms step_avg:169.10ms step:832/1530 train_loss:3.5188 train_time:139004ms step_avg:169.10ms step:833/1530 train_loss:3.6514 train_time:139180ms step_avg:169.11ms step:834/1530 train_loss:3.4732 train_time:139354ms step_avg:169.12ms step:835/1530 train_loss:3.4610 train_time:139525ms step_avg:169.12ms step:836/1530 train_loss:3.7217 train_time:139702ms step_avg:169.13ms step:837/1530 train_loss:3.4041 train_time:139877ms step_avg:169.14ms step:838/1530 train_loss:3.5968 train_time:140051ms step_avg:169.14ms step:839/1530 train_loss:3.4283 train_time:140224ms step_avg:169.15ms step:840/1530 train_loss:3.4746 train_time:140397ms step_avg:169.15ms step:841/1530 train_loss:3.5731 train_time:140570ms step_avg:169.16ms step:842/1530 train_loss:3.5821 train_time:140745ms step_avg:169.16ms step:843/1530 train_loss:3.5676 train_time:140917ms step_avg:169.17ms step:844/1530 train_loss:3.4364 train_time:141090ms step_avg:169.17ms step:845/1530 train_loss:3.6684 train_time:141264ms step_avg:169.18ms step:846/1530 train_loss:3.5222 train_time:141440ms step_avg:169.19ms step:847/1530 train_loss:3.5002 train_time:141616ms step_avg:169.19ms step:848/1530 train_loss:3.6472 train_time:141787ms step_avg:169.20ms step:849/1530 train_loss:3.4943 train_time:141961ms step_avg:169.20ms step:850/1530 train_loss:3.4479 train_time:142135ms step_avg:169.21ms step:851/1530 train_loss:3.7408 train_time:142309ms step_avg:169.21ms step:852/1530 train_loss:3.4417 train_time:142481ms step_avg:169.22ms step:853/1530 train_loss:3.5677 train_time:142654ms step_avg:169.22ms step:854/1530 train_loss:3.6559 train_time:142828ms step_avg:169.23ms step:855/1530 train_loss:3.5181 train_time:143002ms step_avg:169.23ms step:856/1530 train_loss:3.5524 train_time:143176ms step_avg:169.24ms step:857/1530 train_loss:3.6075 train_time:143350ms step_avg:169.24ms step:858/1530 train_loss:3.4768 train_time:143525ms step_avg:169.25ms step:859/1530 train_loss:3.5615 train_time:143699ms step_avg:169.26ms step:860/1530 train_loss:3.5824 train_time:143871ms step_avg:169.26ms step:861/1530 train_loss:3.6340 train_time:144049ms step_avg:169.27ms step:862/1530 train_loss:3.6096 train_time:144226ms step_avg:169.28ms step:863/1530 train_loss:3.5745 train_time:144402ms step_avg:169.29ms step:864/1530 train_loss:3.3862 train_time:144576ms step_avg:169.29ms step:865/1530 train_loss:3.6026 train_time:144748ms step_avg:169.30ms step:866/1530 train_loss:3.8892 train_time:144924ms step_avg:169.30ms step:867/1530 train_loss:3.4555 train_time:145097ms step_avg:169.31ms step:868/1530 train_loss:3.6477 train_time:145269ms step_avg:169.31ms step:869/1530 train_loss:3.6167 train_time:145443ms step_avg:169.32ms step:870/1530 train_loss:3.4543 train_time:145617ms step_avg:169.32ms step:871/1530 train_loss:3.3939 train_time:145793ms step_avg:169.33ms step:872/1530 train_loss:3.6535 train_time:145970ms step_avg:169.34ms step:873/1530 train_loss:3.4667 train_time:146142ms step_avg:169.34ms step:874/1530 train_loss:3.2224 train_time:146318ms step_avg:169.35ms step:875/1530 train_loss:3.6395 train_time:146492ms step_avg:169.36ms step:875/1530 val_loss:3.5243 train_time:146542ms step_avg:169.41ms step:876/1530 train_loss:3.4446 train_time:146666ms step_avg:169.36ms step:877/1530 train_loss:3.6269 train_time:146843ms step_avg:169.37ms step:878/1530 train_loss:3.4773 train_time:147017ms step_avg:169.37ms step:879/1530 train_loss:3.6541 train_time:147191ms step_avg:169.38ms step:880/1530 train_loss:3.3146 train_time:147362ms step_avg:169.38ms step:881/1530 train_loss:3.4759 train_time:147534ms step_avg:169.38ms step:882/1530 train_loss:3.6972 train_time:147707ms step_avg:169.39ms step:883/1530 train_loss:3.8468 train_time:147881ms step_avg:169.39ms step:884/1530 train_loss:3.5727 train_time:148056ms step_avg:169.40ms step:885/1530 train_loss:3.5010 train_time:148229ms step_avg:169.40ms step:886/1530 train_loss:3.5723 train_time:148401ms step_avg:169.41ms step:887/1530 train_loss:4.0859 train_time:148577ms step_avg:169.42ms step:888/1530 train_loss:3.8413 train_time:148756ms step_avg:169.43ms step:889/1530 train_loss:3.5193 train_time:148929ms step_avg:169.43ms step:890/1530 train_loss:3.5347 train_time:149101ms step_avg:169.43ms step:891/1530 train_loss:3.3617 train_time:149275ms step_avg:169.44ms step:892/1530 train_loss:3.7254 train_time:149447ms step_avg:169.44ms step:893/1530 train_loss:3.4261 train_time:149619ms step_avg:169.44ms step:894/1530 train_loss:3.6541 train_time:149797ms step_avg:169.45ms step:895/1530 train_loss:3.6770 train_time:149973ms step_avg:169.46ms step:896/1530 train_loss:3.5106 train_time:150144ms step_avg:169.46ms step:897/1530 train_loss:3.5423 train_time:150319ms step_avg:169.47ms step:898/1530 train_loss:3.5960 train_time:150497ms step_avg:169.48ms step:899/1530 train_loss:3.4837 train_time:150670ms step_avg:169.48ms step:900/1530 train_loss:3.4260 train_time:150841ms step_avg:169.48ms step:901/1530 train_loss:3.6215 train_time:151016ms step_avg:169.49ms step:902/1530 train_loss:3.6406 train_time:151189ms step_avg:169.49ms step:903/1530 train_loss:3.5433 train_time:151364ms step_avg:169.50ms step:904/1530 train_loss:3.4978 train_time:151538ms step_avg:169.51ms step:905/1530 train_loss:3.5064 train_time:151708ms step_avg:169.51ms step:906/1530 train_loss:3.7157 train_time:151883ms step_avg:169.51ms step:907/1530 train_loss:3.5162 train_time:152057ms step_avg:169.52ms step:908/1530 train_loss:3.5685 train_time:152230ms step_avg:169.52ms step:909/1530 train_loss:3.4553 train_time:152406ms step_avg:169.53ms step:910/1530 train_loss:3.5289 train_time:152587ms step_avg:169.54ms step:911/1530 train_loss:3.6510 train_time:152762ms step_avg:169.55ms step:912/1530 train_loss:3.5968 train_time:152939ms step_avg:169.56ms step:913/1530 train_loss:3.4670 train_time:153118ms step_avg:169.57ms step:914/1530 train_loss:3.7457 train_time:153297ms step_avg:169.58ms step:915/1530 train_loss:3.5359 train_time:153477ms step_avg:169.59ms step:916/1530 train_loss:3.6203 train_time:153652ms step_avg:169.59ms step:917/1530 train_loss:3.6023 train_time:153827ms step_avg:169.60ms step:918/1530 train_loss:4.8233 train_time:154004ms step_avg:169.61ms step:919/1530 train_loss:3.5076 train_time:154183ms step_avg:169.62ms step:920/1530 train_loss:3.5952 train_time:154357ms step_avg:169.62ms step:921/1530 train_loss:3.5568 train_time:154536ms step_avg:169.63ms step:922/1530 train_loss:3.5842 train_time:154714ms step_avg:169.64ms step:923/1530 train_loss:3.6159 train_time:154891ms step_avg:169.65ms step:924/1530 train_loss:3.6850 train_time:155067ms step_avg:169.66ms step:925/1530 train_loss:3.6507 train_time:155241ms step_avg:169.66ms step:926/1530 train_loss:3.5618 train_time:155415ms step_avg:169.67ms step:927/1530 train_loss:3.5600 train_time:155591ms step_avg:169.67ms step:928/1530 train_loss:3.7815 train_time:155769ms step_avg:169.68ms step:929/1530 train_loss:3.6135 train_time:155942ms step_avg:169.69ms step:930/1530 train_loss:3.4097 train_time:156117ms step_avg:169.69ms step:931/1530 train_loss:3.4984 train_time:156293ms step_avg:169.70ms step:932/1530 train_loss:3.6500 train_time:156470ms step_avg:169.71ms step:933/1530 train_loss:3.3696 train_time:156646ms step_avg:169.71ms step:934/1530 train_loss:3.5840 train_time:156823ms step_avg:169.72ms step:935/1530 train_loss:3.4379 train_time:157001ms step_avg:169.73ms step:936/1530 train_loss:3.5277 train_time:157180ms step_avg:169.74ms step:937/1530 train_loss:3.6307 train_time:157359ms step_avg:169.75ms step:938/1530 train_loss:3.5457 train_time:157534ms step_avg:169.76ms step:939/1530 train_loss:3.6750 train_time:157714ms step_avg:169.77ms step:940/1530 train_loss:3.4848 train_time:157888ms step_avg:169.77ms step:941/1530 train_loss:3.5520 train_time:158062ms step_avg:169.78ms step:942/1530 train_loss:3.3664 train_time:158239ms step_avg:169.78ms step:943/1530 train_loss:3.7150 train_time:158422ms step_avg:169.80ms step:944/1530 train_loss:3.4052 train_time:158734ms step_avg:169.95ms step:945/1530 train_loss:3.4279 train_time:158917ms step_avg:169.96ms step:946/1530 train_loss:5.0796 train_time:159099ms step_avg:169.98ms step:947/1530 train_loss:3.6001 train_time:159276ms step_avg:169.99ms step:948/1530 train_loss:3.4889 train_time:159452ms step_avg:169.99ms step:949/1530 train_loss:3.3788 train_time:159773ms step_avg:170.15ms step:950/1530 train_loss:3.4478 train_time:159950ms step_avg:170.16ms step:951/1530 train_loss:3.4115 train_time:160129ms step_avg:170.17ms step:952/1530 train_loss:3.4818 train_time:160304ms step_avg:170.17ms step:953/1530 train_loss:3.5732 train_time:160482ms step_avg:170.18ms step:954/1530 train_loss:3.4513 train_time:160661ms step_avg:170.19ms step:955/1530 train_loss:3.4805 train_time:160837ms step_avg:170.20ms step:956/1530 train_loss:3.4493 train_time:161014ms step_avg:170.21ms step:957/1530 train_loss:3.4973 train_time:161193ms step_avg:170.21ms step:958/1530 train_loss:3.5030 train_time:161370ms step_avg:170.22ms step:959/1530 train_loss:3.5084 train_time:161544ms step_avg:170.23ms step:960/1530 train_loss:3.4102 train_time:161722ms step_avg:170.23ms step:961/1530 train_loss:3.6470 train_time:161897ms step_avg:170.24ms step:962/1530 train_loss:3.5966 train_time:162072ms step_avg:170.24ms step:963/1530 train_loss:3.4073 train_time:162247ms step_avg:170.25ms step:964/1530 train_loss:3.4344 train_time:162425ms step_avg:170.26ms step:965/1530 train_loss:3.4773 train_time:162598ms step_avg:170.26ms step:966/1530 train_loss:3.7092 train_time:162773ms step_avg:170.26ms step:967/1530 train_loss:3.5241 train_time:162947ms step_avg:170.27ms step:968/1530 train_loss:3.5209 train_time:163122ms step_avg:170.27ms step:969/1530 train_loss:3.5808 train_time:163297ms step_avg:170.28ms step:970/1530 train_loss:3.3743 train_time:163471ms step_avg:170.28ms step:971/1530 train_loss:3.5343 train_time:163643ms step_avg:170.28ms step:972/1530 train_loss:3.4767 train_time:163817ms step_avg:170.29ms step:973/1530 train_loss:3.5391 train_time:163992ms step_avg:170.29ms step:974/1530 train_loss:3.5912 train_time:164166ms step_avg:170.30ms step:975/1530 train_loss:3.4687 train_time:164341ms step_avg:170.30ms step:976/1530 train_loss:3.6755 train_time:164516ms step_avg:170.31ms step:977/1530 train_loss:3.5746 train_time:164689ms step_avg:170.31ms step:978/1530 train_loss:3.3590 train_time:164864ms step_avg:170.31ms step:979/1530 train_loss:3.6283 train_time:165038ms step_avg:170.32ms step:980/1530 train_loss:3.4210 train_time:165216ms step_avg:170.33ms step:981/1530 train_loss:3.5760 train_time:165395ms step_avg:170.33ms step:982/1530 train_loss:3.5462 train_time:165570ms step_avg:170.34ms step:983/1530 train_loss:3.5202 train_time:165745ms step_avg:170.34ms step:984/1530 train_loss:3.4983 train_time:165919ms step_avg:170.35ms step:985/1530 train_loss:3.5741 train_time:166098ms step_avg:170.36ms step:986/1530 train_loss:3.4166 train_time:166274ms step_avg:170.36ms step:987/1530 train_loss:3.4897 train_time:166447ms step_avg:170.37ms step:988/1530 train_loss:3.4954 train_time:166623ms step_avg:170.37ms step:989/1530 train_loss:3.4218 train_time:166796ms step_avg:170.37ms step:990/1530 train_loss:3.6632 train_time:166974ms step_avg:170.38ms step:991/1530 train_loss:3.4687 train_time:167149ms step_avg:170.39ms step:992/1530 train_loss:3.4411 train_time:167327ms step_avg:170.39ms step:993/1530 train_loss:3.4992 train_time:167506ms step_avg:170.40ms step:994/1530 train_loss:3.5992 train_time:167679ms step_avg:170.41ms step:995/1530 train_loss:3.5328 train_time:167853ms step_avg:170.41ms step:996/1530 train_loss:3.4603 train_time:168026ms step_avg:170.41ms step:997/1530 train_loss:3.7522 train_time:168200ms step_avg:170.42ms step:998/1530 train_loss:3.4410 train_time:168374ms step_avg:170.42ms step:999/1530 train_loss:3.5912 train_time:168549ms step_avg:170.42ms step:1000/1530 train_loss:3.4413 train_time:168725ms step_avg:170.43ms step:1000/1530 val_loss:3.4703 train_time:168777ms step_avg:170.48ms step:1001/1530 train_loss:3.5024 train_time:168902ms step_avg:170.44ms step:1002/1530 train_loss:3.3758 train_time:169077ms step_avg:170.44ms step:1003/1530 train_loss:3.5530 train_time:169253ms step_avg:170.45ms step:1004/1530 train_loss:3.6055 train_time:169429ms step_avg:170.45ms step:1005/1530 train_loss:3.3920 train_time:169604ms step_avg:170.46ms step:1006/1530 train_loss:3.4645 train_time:169780ms step_avg:170.46ms step:1007/1530 train_loss:3.4367 train_time:169956ms step_avg:170.47ms step:1008/1530 train_loss:3.5596 train_time:170133ms step_avg:170.47ms step:1009/1530 train_loss:3.6673 train_time:170311ms step_avg:170.48ms step:1010/1530 train_loss:3.5683 train_time:170485ms step_avg:170.49ms step:1011/1530 train_loss:3.5387 train_time:170658ms step_avg:170.49ms step:1012/1530 train_loss:3.3956 train_time:170833ms step_avg:170.49ms step:1013/1530 train_loss:3.5370 train_time:171010ms step_avg:170.50ms step:1014/1530 train_loss:3.6271 train_time:171187ms step_avg:170.50ms step:1015/1530 train_loss:3.3352 train_time:171364ms step_avg:170.51ms step:1016/1530 train_loss:3.4072 train_time:171537ms step_avg:170.51ms step:1017/1530 train_loss:3.3998 train_time:171715ms step_avg:170.52ms step:1018/1530 train_loss:3.3980 train_time:171893ms step_avg:170.53ms step:1019/1530 train_loss:3.5214 train_time:172069ms step_avg:170.53ms step:1020/1530 train_loss:3.3817 train_time:172244ms step_avg:170.54ms step:1021/1530 train_loss:3.3595 train_time:172420ms step_avg:170.54ms step:1022/1530 train_loss:3.4839 train_time:172597ms step_avg:170.55ms step:1023/1530 train_loss:3.5094 train_time:172774ms step_avg:170.56ms step:1024/1530 train_loss:3.4800 train_time:172953ms step_avg:170.56ms step:1025/1530 train_loss:3.4790 train_time:173132ms step_avg:170.57ms step:1026/1530 train_loss:3.6204 train_time:173308ms step_avg:170.58ms step:1027/1530 train_loss:3.3259 train_time:173484ms step_avg:170.58ms step:1028/1530 train_loss:3.4005 train_time:173663ms step_avg:170.59ms step:1029/1530 train_loss:3.3094 train_time:173846ms step_avg:170.60ms step:1030/1530 train_loss:3.5428 train_time:174023ms step_avg:170.61ms step:1031/1530 train_loss:3.5113 train_time:174200ms step_avg:170.62ms step:1032/1530 train_loss:3.6984 train_time:174382ms step_avg:170.63ms step:1033/1530 train_loss:3.4921 train_time:174557ms step_avg:170.63ms step:1034/1530 train_loss:3.4068 train_time:174736ms step_avg:170.64ms step:1035/1530 train_loss:3.4496 train_time:174915ms step_avg:170.65ms step:1036/1530 train_loss:3.4853 train_time:175092ms step_avg:170.66ms step:1037/1530 train_loss:3.7845 train_time:175269ms step_avg:170.66ms step:1038/1530 train_loss:3.6198 train_time:175450ms step_avg:170.67ms step:1039/1530 train_loss:3.5140 train_time:175632ms step_avg:170.68ms step:1040/1530 train_loss:3.4179 train_time:175807ms step_avg:170.69ms step:1041/1530 train_loss:3.4910 train_time:175987ms step_avg:170.70ms step:1042/1530 train_loss:3.5201 train_time:176159ms step_avg:170.70ms step:1043/1530 train_loss:3.4464 train_time:176335ms step_avg:170.70ms step:1044/1530 train_loss:3.4594 train_time:176513ms step_avg:170.71ms step:1045/1530 train_loss:3.5225 train_time:176692ms step_avg:170.72ms step:1046/1530 train_loss:3.4272 train_time:176867ms step_avg:170.72ms step:1047/1530 train_loss:3.6339 train_time:177043ms step_avg:170.73ms step:1048/1530 train_loss:3.5003 train_time:177219ms step_avg:170.73ms step:1049/1530 train_loss:3.4055 train_time:177395ms step_avg:170.74ms step:1050/1530 train_loss:3.3936 train_time:177573ms step_avg:170.74ms step:1051/1530 train_loss:3.5012 train_time:177750ms step_avg:170.75ms step:1052/1530 train_loss:3.3634 train_time:177927ms step_avg:170.76ms step:1053/1530 train_loss:3.6885 train_time:178104ms step_avg:170.76ms step:1054/1530 train_loss:3.5413 train_time:178283ms step_avg:170.77ms step:1055/1530 train_loss:3.3840 train_time:178457ms step_avg:170.77ms step:1056/1530 train_loss:3.4983 train_time:178633ms step_avg:170.78ms step:1057/1530 train_loss:3.5769 train_time:178812ms step_avg:170.78ms step:1058/1530 train_loss:3.3051 train_time:178989ms step_avg:170.79ms step:1059/1530 train_loss:3.3708 train_time:179168ms step_avg:170.80ms step:1060/1530 train_loss:3.4440 train_time:179344ms step_avg:170.80ms step:1061/1530 train_loss:3.4223 train_time:179518ms step_avg:170.81ms step:1062/1530 train_loss:3.3816 train_time:179695ms step_avg:170.81ms step:1063/1530 train_loss:3.4638 train_time:179871ms step_avg:170.82ms step:1064/1530 train_loss:3.3849 train_time:180046ms step_avg:170.82ms step:1065/1530 train_loss:3.3646 train_time:180222ms step_avg:170.83ms step:1066/1530 train_loss:3.4153 train_time:180401ms step_avg:170.83ms step:1067/1530 train_loss:3.2899 train_time:180578ms step_avg:170.84ms step:1068/1530 train_loss:3.4380 train_time:180753ms step_avg:170.84ms step:1069/1530 train_loss:3.3011 train_time:180935ms step_avg:170.85ms step:1070/1530 train_loss:3.5714 train_time:181110ms step_avg:170.86ms step:1071/1530 train_loss:3.5135 train_time:181289ms step_avg:170.87ms step:1072/1530 train_loss:3.4399 train_time:181464ms step_avg:170.87ms step:1073/1530 train_loss:3.5226 train_time:181637ms step_avg:170.87ms step:1074/1530 train_loss:3.4287 train_time:181815ms step_avg:170.88ms step:1075/1530 train_loss:3.3984 train_time:181990ms step_avg:170.88ms step:1076/1530 train_loss:3.7976 train_time:182166ms step_avg:170.89ms step:1077/1530 train_loss:3.4358 train_time:182340ms step_avg:170.89ms step:1078/1530 train_loss:3.0786 train_time:182525ms step_avg:170.90ms step:1079/1530 train_loss:3.5328 train_time:182701ms step_avg:170.91ms step:1080/1530 train_loss:3.4290 train_time:182878ms step_avg:170.91ms step:1081/1530 train_loss:3.5017 train_time:183051ms step_avg:170.92ms step:1082/1530 train_loss:3.5940 train_time:183227ms step_avg:170.92ms step:1083/1530 train_loss:3.4951 train_time:183402ms step_avg:170.92ms step:1084/1530 train_loss:3.4629 train_time:183578ms step_avg:170.93ms step:1085/1530 train_loss:3.4344 train_time:183753ms step_avg:170.93ms step:1086/1530 train_loss:3.6301 train_time:183929ms step_avg:170.94ms step:1087/1530 train_loss:3.5084 train_time:184105ms step_avg:170.94ms step:1088/1530 train_loss:3.3729 train_time:184281ms step_avg:170.95ms step:1089/1530 train_loss:3.3773 train_time:184460ms step_avg:170.95ms step:1090/1530 train_loss:3.4840 train_time:184637ms step_avg:170.96ms step:1091/1530 train_loss:3.2912 train_time:184816ms step_avg:170.97ms step:1092/1530 train_loss:3.4889 train_time:184994ms step_avg:170.97ms step:1093/1530 train_loss:3.6121 train_time:185172ms step_avg:170.98ms step:1094/1530 train_loss:3.4562 train_time:185347ms step_avg:170.98ms step:1095/1530 train_loss:3.4238 train_time:185521ms step_avg:170.99ms step:1096/1530 train_loss:3.4284 train_time:185700ms step_avg:170.99ms step:1097/1530 train_loss:3.4916 train_time:185877ms step_avg:171.00ms step:1098/1530 train_loss:3.5611 train_time:186054ms step_avg:171.01ms step:1099/1530 train_loss:3.5288 train_time:186232ms step_avg:171.01ms step:1100/1530 train_loss:3.4270 train_time:186412ms step_avg:171.02ms step:1101/1530 train_loss:3.2917 train_time:186591ms step_avg:171.03ms step:1102/1530 train_loss:3.3115 train_time:186769ms step_avg:171.03ms step:1103/1530 train_loss:3.4449 train_time:186950ms step_avg:171.04ms step:1104/1530 train_loss:3.3216 train_time:187126ms step_avg:171.05ms step:1105/1530 train_loss:4.0648 train_time:187305ms step_avg:171.06ms step:1106/1530 train_loss:3.2318 train_time:187480ms step_avg:171.06ms step:1107/1530 train_loss:3.5681 train_time:187654ms step_avg:171.06ms step:1108/1530 train_loss:3.3486 train_time:187830ms step_avg:171.07ms step:1109/1530 train_loss:3.5051 train_time:188007ms step_avg:171.07ms step:1110/1530 train_loss:3.4297 train_time:188179ms step_avg:171.07ms step:1111/1530 train_loss:3.4908 train_time:188355ms step_avg:171.08ms step:1112/1530 train_loss:3.5597 train_time:188534ms step_avg:171.08ms step:1113/1530 train_loss:3.4323 train_time:188717ms step_avg:171.09ms step:1114/1530 train_loss:3.3721 train_time:188897ms step_avg:171.10ms step:1115/1530 train_loss:3.2394 train_time:189075ms step_avg:171.11ms step:1116/1530 train_loss:3.4329 train_time:189248ms step_avg:171.11ms step:1117/1530 train_loss:3.5918 train_time:189427ms step_avg:171.12ms step:1118/1530 train_loss:3.6258 train_time:189606ms step_avg:171.12ms step:1119/1530 train_loss:3.4813 train_time:189779ms step_avg:171.13ms step:1120/1530 train_loss:3.4972 train_time:189956ms step_avg:171.13ms step:1121/1530 train_loss:3.3939 train_time:190134ms step_avg:171.14ms step:1122/1530 train_loss:3.4578 train_time:190311ms step_avg:171.14ms step:1123/1530 train_loss:3.5808 train_time:190489ms step_avg:171.15ms step:1124/1530 train_loss:3.3417 train_time:190663ms step_avg:171.15ms step:1125/1530 train_loss:3.2269 train_time:190839ms step_avg:171.16ms step:1125/1530 val_loss:3.4119 train_time:190889ms step_avg:171.20ms step:1126/1530 train_loss:3.4813 train_time:191015ms step_avg:171.16ms step:1127/1530 train_loss:3.6763 train_time:191193ms step_avg:171.17ms step:1128/1530 train_loss:3.2292 train_time:191372ms step_avg:171.17ms step:1129/1530 train_loss:3.5554 train_time:191553ms step_avg:171.18ms step:1130/1530 train_loss:3.3800 train_time:191732ms step_avg:171.19ms step:1131/1530 train_loss:3.4047 train_time:191915ms step_avg:171.20ms step:1132/1530 train_loss:3.3731 train_time:192089ms step_avg:171.20ms step:1133/1530 train_loss:3.4939 train_time:192397ms step_avg:171.32ms step:1134/1530 train_loss:3.4493 train_time:192582ms step_avg:171.34ms step:1135/1530 train_loss:3.5224 train_time:192758ms step_avg:171.34ms step:1136/1530 train_loss:3.5670 train_time:192936ms step_avg:171.35ms step:1137/1530 train_loss:3.4589 train_time:193114ms step_avg:171.35ms step:1138/1530 train_loss:3.3553 train_time:193293ms step_avg:171.36ms step:1139/1530 train_loss:3.6594 train_time:193612ms step_avg:171.49ms step:1140/1530 train_loss:3.4606 train_time:193791ms step_avg:171.50ms step:1141/1530 train_loss:3.5940 train_time:193972ms step_avg:171.50ms step:1142/1530 train_loss:3.4461 train_time:194148ms step_avg:171.51ms step:1143/1530 train_loss:3.3670 train_time:194328ms step_avg:171.52ms step:1144/1530 train_loss:3.4439 train_time:194506ms step_avg:171.52ms step:1145/1530 train_loss:3.5903 train_time:194679ms step_avg:171.52ms step:1146/1530 train_loss:3.5572 train_time:194860ms step_avg:171.53ms step:1147/1530 train_loss:3.5204 train_time:195037ms step_avg:171.54ms step:1148/1530 train_loss:3.5039 train_time:195214ms step_avg:171.54ms step:1149/1530 train_loss:3.3274 train_time:195394ms step_avg:171.55ms step:1150/1530 train_loss:3.3744 train_time:195570ms step_avg:171.55ms step:1151/1530 train_loss:3.3177 train_time:195750ms step_avg:171.56ms step:1152/1530 train_loss:3.4035 train_time:195933ms step_avg:171.57ms step:1153/1530 train_loss:3.4319 train_time:196112ms step_avg:171.58ms step:1154/1530 train_loss:3.5175 train_time:196287ms step_avg:171.58ms step:1155/1530 train_loss:3.3232 train_time:196469ms step_avg:171.59ms step:1156/1530 train_loss:3.5375 train_time:196652ms step_avg:171.60ms step:1157/1530 train_loss:3.5003 train_time:196831ms step_avg:171.60ms step:1158/1530 train_loss:3.2484 train_time:197007ms step_avg:171.61ms step:1159/1530 train_loss:3.3516 train_time:197184ms step_avg:171.61ms step:1160/1530 train_loss:3.3374 train_time:197357ms step_avg:171.61ms step:1161/1530 train_loss:3.0853 train_time:197537ms step_avg:171.62ms step:1162/1530 train_loss:3.4213 train_time:197715ms step_avg:171.63ms step:1163/1530 train_loss:3.3912 train_time:197893ms step_avg:171.63ms step:1164/1530 train_loss:3.2904 train_time:198070ms step_avg:171.64ms step:1165/1530 train_loss:3.2527 train_time:198246ms step_avg:171.64ms step:1166/1530 train_loss:3.3954 train_time:198426ms step_avg:171.65ms step:1167/1530 train_loss:3.4146 train_time:198601ms step_avg:171.65ms step:1168/1530 train_loss:3.7263 train_time:198776ms step_avg:171.65ms step:1169/1530 train_loss:3.3811 train_time:198952ms step_avg:171.66ms step:1170/1530 train_loss:3.3937 train_time:199129ms step_avg:171.66ms step:1171/1530 train_loss:3.2893 train_time:199305ms step_avg:171.67ms step:1172/1530 train_loss:3.4252 train_time:199480ms step_avg:171.67ms step:1173/1530 train_loss:3.5418 train_time:199658ms step_avg:171.68ms step:1174/1530 train_loss:3.3826 train_time:199844ms step_avg:171.69ms step:1175/1530 train_loss:3.3670 train_time:200024ms step_avg:171.69ms step:1176/1530 train_loss:3.4281 train_time:200204ms step_avg:171.70ms step:1177/1530 train_loss:3.4545 train_time:200387ms step_avg:171.71ms step:1178/1530 train_loss:3.4995 train_time:200564ms step_avg:171.72ms step:1179/1530 train_loss:3.4035 train_time:200739ms step_avg:171.72ms step:1180/1530 train_loss:3.3609 train_time:200925ms step_avg:171.73ms step:1181/1530 train_loss:3.3401 train_time:201103ms step_avg:171.74ms step:1182/1530 train_loss:3.3773 train_time:201281ms step_avg:171.74ms step:1183/1530 train_loss:3.3378 train_time:201459ms step_avg:171.75ms step:1184/1530 train_loss:3.5114 train_time:201635ms step_avg:171.75ms step:1185/1530 train_loss:3.5468 train_time:201817ms step_avg:171.76ms step:1186/1530 train_loss:3.3700 train_time:201996ms step_avg:171.77ms step:1187/1530 train_loss:3.4201 train_time:202182ms step_avg:171.78ms step:1188/1530 train_loss:3.4419 train_time:202357ms step_avg:171.78ms step:1189/1530 train_loss:3.2779 train_time:202537ms step_avg:171.79ms step:1190/1530 train_loss:3.4441 train_time:202716ms step_avg:171.79ms step:1191/1530 train_loss:3.5870 train_time:202895ms step_avg:171.80ms step:1192/1530 train_loss:3.3915 train_time:203070ms step_avg:171.80ms step:1193/1530 train_loss:3.2741 train_time:203245ms step_avg:171.80ms step:1194/1530 train_loss:3.5555 train_time:203422ms step_avg:171.81ms step:1195/1530 train_loss:3.3731 train_time:203604ms step_avg:171.82ms step:1196/1530 train_loss:3.3911 train_time:203789ms step_avg:171.83ms step:1197/1530 train_loss:3.2982 train_time:203968ms step_avg:171.83ms step:1198/1530 train_loss:3.3047 train_time:204153ms step_avg:171.85ms step:1199/1530 train_loss:3.3457 train_time:204333ms step_avg:171.85ms step:1200/1530 train_loss:3.4476 train_time:204511ms step_avg:171.86ms step:1201/1530 train_loss:3.4854 train_time:204689ms step_avg:171.86ms step:1202/1530 train_loss:3.5833 train_time:204878ms step_avg:171.88ms step:1203/1530 train_loss:3.4080 train_time:205058ms step_avg:171.88ms step:1204/1530 train_loss:3.3095 train_time:205239ms step_avg:171.89ms step:1205/1530 train_loss:3.4353 train_time:205417ms step_avg:171.90ms step:1206/1530 train_loss:3.4787 train_time:205593ms step_avg:171.90ms step:1207/1530 train_loss:3.5174 train_time:205770ms step_avg:171.90ms step:1208/1530 train_loss:3.4016 train_time:205946ms step_avg:171.91ms step:1209/1530 train_loss:3.2483 train_time:206126ms step_avg:171.92ms step:1210/1530 train_loss:3.3081 train_time:206306ms step_avg:171.92ms step:1211/1530 train_loss:3.3973 train_time:206482ms step_avg:171.93ms step:1212/1530 train_loss:3.3951 train_time:206657ms step_avg:171.93ms step:1213/1530 train_loss:3.4120 train_time:206836ms step_avg:171.93ms step:1214/1530 train_loss:3.2503 train_time:207016ms step_avg:171.94ms step:1215/1530 train_loss:3.4011 train_time:207192ms step_avg:171.94ms step:1216/1530 train_loss:3.3344 train_time:207368ms step_avg:171.95ms step:1217/1530 train_loss:3.3224 train_time:207545ms step_avg:171.95ms step:1218/1530 train_loss:3.4072 train_time:207724ms step_avg:171.96ms step:1219/1530 train_loss:3.2554 train_time:207909ms step_avg:171.97ms step:1220/1530 train_loss:3.4749 train_time:208084ms step_avg:171.97ms step:1221/1530 train_loss:3.5066 train_time:208259ms step_avg:171.97ms step:1222/1530 train_loss:3.4376 train_time:208433ms step_avg:171.97ms step:1223/1530 train_loss:3.2999 train_time:208612ms step_avg:171.98ms step:1224/1530 train_loss:3.2570 train_time:208794ms step_avg:171.99ms step:1225/1530 train_loss:3.3674 train_time:208971ms step_avg:171.99ms step:1226/1530 train_loss:3.3374 train_time:209151ms step_avg:172.00ms step:1227/1530 train_loss:3.2793 train_time:209330ms step_avg:172.01ms step:1228/1530 train_loss:3.4455 train_time:209506ms step_avg:172.01ms step:1229/1530 train_loss:3.3691 train_time:209685ms step_avg:172.01ms step:1230/1530 train_loss:3.3967 train_time:209868ms step_avg:172.02ms step:1231/1530 train_loss:3.5808 train_time:210048ms step_avg:172.03ms step:1232/1530 train_loss:3.5019 train_time:210229ms step_avg:172.04ms step:1233/1530 train_loss:3.4288 train_time:210407ms step_avg:172.04ms step:1234/1530 train_loss:3.5877 train_time:210585ms step_avg:172.05ms step:1235/1530 train_loss:3.3286 train_time:210765ms step_avg:172.05ms step:1236/1530 train_loss:3.2894 train_time:210942ms step_avg:172.06ms step:1237/1530 train_loss:3.2738 train_time:211118ms step_avg:172.06ms step:1238/1530 train_loss:3.2858 train_time:211301ms step_avg:172.07ms step:1239/1530 train_loss:3.3327 train_time:211478ms step_avg:172.07ms step:1240/1530 train_loss:3.3842 train_time:211655ms step_avg:172.08ms step:1241/1530 train_loss:3.4297 train_time:211833ms step_avg:172.08ms step:1242/1530 train_loss:3.3029 train_time:212011ms step_avg:172.09ms step:1243/1530 train_loss:3.4097 train_time:212191ms step_avg:172.09ms step:1244/1530 train_loss:3.4071 train_time:212365ms step_avg:172.09ms step:1245/1530 train_loss:3.4203 train_time:212543ms step_avg:172.10ms step:1246/1530 train_loss:3.2443 train_time:212723ms step_avg:172.11ms step:1247/1530 train_loss:3.3762 train_time:212898ms step_avg:172.11ms step:1248/1530 train_loss:3.4259 train_time:213074ms step_avg:172.11ms step:1249/1530 train_loss:3.4288 train_time:213252ms step_avg:172.12ms step:1250/1530 train_loss:3.3046 train_time:213431ms step_avg:172.12ms step:1250/1530 val_loss:3.3577 train_time:213486ms step_avg:172.17ms step:1251/1530 train_loss:3.4911 train_time:213615ms step_avg:172.13ms step:1252/1530 train_loss:3.3644 train_time:213790ms step_avg:172.13ms step:1253/1530 train_loss:3.3099 train_time:213966ms step_avg:172.14ms step:1254/1530 train_loss:3.4185 train_time:214148ms step_avg:172.14ms step:1255/1530 train_loss:3.5210 train_time:214340ms step_avg:172.16ms step:1256/1530 train_loss:3.3069 train_time:214523ms step_avg:172.17ms step:1257/1530 train_loss:3.3755 train_time:214701ms step_avg:172.17ms step:1258/1530 train_loss:3.3688 train_time:214885ms step_avg:172.18ms step:1259/1530 train_loss:3.3309 train_time:215063ms step_avg:172.19ms step:1260/1530 train_loss:3.2148 train_time:215241ms step_avg:172.19ms step:1261/1530 train_loss:3.3079 train_time:215423ms step_avg:172.20ms step:1262/1530 train_loss:3.3296 train_time:215605ms step_avg:172.21ms step:1263/1530 train_loss:3.2429 train_time:215788ms step_avg:172.22ms step:1264/1530 train_loss:3.4456 train_time:215965ms step_avg:172.22ms step:1265/1530 train_loss:3.4281 train_time:216141ms step_avg:172.22ms step:1266/1530 train_loss:3.4416 train_time:216321ms step_avg:172.23ms step:1267/1530 train_loss:3.3749 train_time:216500ms step_avg:172.24ms step:1268/1530 train_loss:3.4103 train_time:216681ms step_avg:172.24ms step:1269/1530 train_loss:3.2571 train_time:216864ms step_avg:172.25ms step:1270/1530 train_loss:3.1108 train_time:217042ms step_avg:172.26ms step:1271/1530 train_loss:3.4067 train_time:217221ms step_avg:172.26ms step:1272/1530 train_loss:3.3501 train_time:217395ms step_avg:172.26ms step:1273/1530 train_loss:3.3777 train_time:217575ms step_avg:172.27ms step:1274/1530 train_loss:3.3630 train_time:217756ms step_avg:172.28ms step:1275/1530 train_loss:3.4381 train_time:217930ms step_avg:172.28ms step:1276/1530 train_loss:3.4737 train_time:218104ms step_avg:172.28ms step:1277/1530 train_loss:3.4121 train_time:218284ms step_avg:172.28ms step:1278/1530 train_loss:3.4069 train_time:218460ms step_avg:172.29ms step:1279/1530 train_loss:3.2690 train_time:218641ms step_avg:172.29ms step:1280/1530 train_loss:3.3695 train_time:218825ms step_avg:172.30ms step:1281/1530 train_loss:3.4256 train_time:219004ms step_avg:172.31ms step:1282/1530 train_loss:3.4686 train_time:219179ms step_avg:172.31ms step:1283/1530 train_loss:3.3355 train_time:219358ms step_avg:172.32ms step:1284/1530 train_loss:3.3735 train_time:219536ms step_avg:172.32ms step:1285/1530 train_loss:3.3627 train_time:219714ms step_avg:172.32ms step:1286/1530 train_loss:3.3390 train_time:219891ms step_avg:172.33ms step:1287/1530 train_loss:3.4888 train_time:220068ms step_avg:172.33ms step:1288/1530 train_loss:3.2983 train_time:220248ms step_avg:172.34ms step:1289/1530 train_loss:3.3860 train_time:220434ms step_avg:172.35ms step:1290/1530 train_loss:3.4631 train_time:220616ms step_avg:172.36ms step:1291/1530 train_loss:3.3862 train_time:220795ms step_avg:172.36ms step:1292/1530 train_loss:3.4816 train_time:220976ms step_avg:172.37ms step:1293/1530 train_loss:3.5158 train_time:221156ms step_avg:172.37ms step:1294/1530 train_loss:3.4637 train_time:221336ms step_avg:172.38ms step:1295/1530 train_loss:3.2836 train_time:221516ms step_avg:172.39ms step:1296/1530 train_loss:3.3769 train_time:221696ms step_avg:172.39ms step:1297/1530 train_loss:3.2748 train_time:221874ms step_avg:172.40ms step:1298/1530 train_loss:3.2762 train_time:222056ms step_avg:172.40ms step:1299/1530 train_loss:3.3990 train_time:222235ms step_avg:172.41ms step:1300/1530 train_loss:3.4097 train_time:222411ms step_avg:172.41ms step:1301/1530 train_loss:3.4102 train_time:222587ms step_avg:172.41ms step:1302/1530 train_loss:3.5787 train_time:222769ms step_avg:172.42ms step:1303/1530 train_loss:3.3105 train_time:222951ms step_avg:172.43ms step:1304/1530 train_loss:3.5162 train_time:223131ms step_avg:172.44ms step:1305/1530 train_loss:3.2599 train_time:223307ms step_avg:172.44ms step:1306/1530 train_loss:3.4575 train_time:223489ms step_avg:172.45ms step:1307/1530 train_loss:3.4597 train_time:223664ms step_avg:172.45ms step:1308/1530 train_loss:3.2928 train_time:223844ms step_avg:172.45ms step:1309/1530 train_loss:3.3110 train_time:224024ms step_avg:172.46ms step:1310/1530 train_loss:3.2883 train_time:224203ms step_avg:172.46ms step:1311/1530 train_loss:3.3013 train_time:224381ms step_avg:172.47ms step:1312/1530 train_loss:3.3752 train_time:224561ms step_avg:172.47ms step:1313/1530 train_loss:3.3466 train_time:224737ms step_avg:172.48ms step:1314/1530 train_loss:3.0485 train_time:224922ms step_avg:172.49ms step:1315/1530 train_loss:3.2766 train_time:225100ms step_avg:172.49ms step:1316/1530 train_loss:3.4010 train_time:225276ms step_avg:172.49ms step:1317/1530 train_loss:3.4201 train_time:225454ms step_avg:172.50ms step:1318/1530 train_loss:3.3043 train_time:225639ms step_avg:172.51ms step:1319/1530 train_loss:3.4290 train_time:225821ms step_avg:172.51ms step:1320/1530 train_loss:3.4674 train_time:226003ms step_avg:172.52ms step:1321/1530 train_loss:3.3710 train_time:226182ms step_avg:172.53ms step:1322/1530 train_loss:3.3297 train_time:226492ms step_avg:172.63ms step:1323/1530 train_loss:3.3221 train_time:226681ms step_avg:172.64ms step:1324/1530 train_loss:3.4445 train_time:226862ms step_avg:172.65ms step:1325/1530 train_loss:3.4951 train_time:227047ms step_avg:172.66ms step:1326/1530 train_loss:3.2158 train_time:227228ms step_avg:172.67ms step:1327/1530 train_loss:3.1685 train_time:227404ms step_avg:172.67ms step:1328/1530 train_loss:3.4968 train_time:227585ms step_avg:172.67ms step:1329/1530 train_loss:3.3011 train_time:227915ms step_avg:172.79ms step:1330/1530 train_loss:3.4323 train_time:228096ms step_avg:172.80ms step:1331/1530 train_loss:3.3345 train_time:228272ms step_avg:172.80ms step:1332/1530 train_loss:3.7445 train_time:228452ms step_avg:172.81ms step:1333/1530 train_loss:3.4849 train_time:228634ms step_avg:172.81ms step:1334/1530 train_loss:3.3709 train_time:228811ms step_avg:172.82ms step:1335/1530 train_loss:3.2929 train_time:228991ms step_avg:172.82ms step:1336/1530 train_loss:3.2998 train_time:229175ms step_avg:172.83ms step:1337/1530 train_loss:3.5529 train_time:229354ms step_avg:172.84ms step:1338/1530 train_loss:3.5266 train_time:229532ms step_avg:172.84ms step:1339/1530 train_loss:3.3418 train_time:229711ms step_avg:172.85ms step:1340/1530 train_loss:3.2898 train_time:229890ms step_avg:172.85ms step:1341/1530 train_loss:3.5971 train_time:230066ms step_avg:172.85ms step:1342/1530 train_loss:3.3614 train_time:230246ms step_avg:172.86ms step:1343/1530 train_loss:3.3657 train_time:230425ms step_avg:172.86ms step:1344/1530 train_loss:3.4229 train_time:230607ms step_avg:172.87ms step:1345/1530 train_loss:3.3935 train_time:230789ms step_avg:172.88ms step:1346/1530 train_loss:3.3027 train_time:230965ms step_avg:172.88ms step:1347/1530 train_loss:3.2815 train_time:231143ms step_avg:172.88ms step:1348/1530 train_loss:3.3525 train_time:231322ms step_avg:172.89ms step:1349/1530 train_loss:3.2803 train_time:231499ms step_avg:172.89ms step:1350/1530 train_loss:3.3922 train_time:231678ms step_avg:172.89ms step:1351/1530 train_loss:3.2445 train_time:231855ms step_avg:172.90ms step:1352/1530 train_loss:3.3134 train_time:232032ms step_avg:172.90ms step:1353/1530 train_loss:3.4073 train_time:232211ms step_avg:172.90ms step:1354/1530 train_loss:3.2628 train_time:232389ms step_avg:172.91ms step:1355/1530 train_loss:3.1902 train_time:232565ms step_avg:172.91ms step:1356/1530 train_loss:3.5132 train_time:232747ms step_avg:172.92ms step:1357/1530 train_loss:3.4266 train_time:232928ms step_avg:172.92ms step:1358/1530 train_loss:3.1862 train_time:233107ms step_avg:172.93ms step:1359/1530 train_loss:3.4442 train_time:233287ms step_avg:172.93ms step:1360/1530 train_loss:3.3547 train_time:233467ms step_avg:172.94ms step:1361/1530 train_loss:3.1293 train_time:233655ms step_avg:172.95ms step:1362/1530 train_loss:3.3985 train_time:233836ms step_avg:172.96ms step:1363/1530 train_loss:3.2868 train_time:234024ms step_avg:172.97ms step:1364/1530 train_loss:3.3039 train_time:234205ms step_avg:172.97ms step:1365/1530 train_loss:3.3177 train_time:234384ms step_avg:172.98ms step:1366/1530 train_loss:3.4271 train_time:234565ms step_avg:172.98ms step:1367/1530 train_loss:3.4047 train_time:234745ms step_avg:172.99ms step:1368/1530 train_loss:3.3497 train_time:234925ms step_avg:172.99ms step:1369/1530 train_loss:3.2850 train_time:235113ms step_avg:173.00ms step:1370/1530 train_loss:3.6064 train_time:235294ms step_avg:173.01ms step:1371/1530 train_loss:3.3164 train_time:235474ms step_avg:173.02ms step:1372/1530 train_loss:3.3744 train_time:235658ms step_avg:173.02ms step:1373/1530 train_loss:3.3719 train_time:235838ms step_avg:173.03ms step:1374/1530 train_loss:3.1586 train_time:236019ms step_avg:173.03ms step:1375/1530 train_loss:3.5410 train_time:236200ms step_avg:173.04ms step:1375/1530 val_loss:3.3153 train_time:236251ms step_avg:173.08ms step:1376/1530 train_loss:3.3507 train_time:236378ms step_avg:173.04ms step:1377/1530 train_loss:3.4849 train_time:236558ms step_avg:173.05ms step:1378/1530 train_loss:3.4788 train_time:236735ms step_avg:173.05ms step:1379/1530 train_loss:3.1158 train_time:236916ms step_avg:173.06ms step:1380/1530 train_loss:3.3190 train_time:237096ms step_avg:173.06ms step:1381/1530 train_loss:3.7069 train_time:237281ms step_avg:173.07ms step:1382/1530 train_loss:3.2186 train_time:237460ms step_avg:173.08ms step:1383/1530 train_loss:3.3961 train_time:237640ms step_avg:173.08ms step:1384/1530 train_loss:3.4751 train_time:237825ms step_avg:173.09ms step:1385/1530 train_loss:3.4070 train_time:237999ms step_avg:173.09ms step:1386/1530 train_loss:3.3410 train_time:238178ms step_avg:173.09ms step:1387/1530 train_loss:3.2063 train_time:238358ms step_avg:173.10ms step:1388/1530 train_loss:3.3443 train_time:238534ms step_avg:173.10ms step:1389/1530 train_loss:3.3191 train_time:238717ms step_avg:173.11ms step:1390/1530 train_loss:3.5706 train_time:238893ms step_avg:173.11ms step:1391/1530 train_loss:3.2912 train_time:239073ms step_avg:173.12ms step:1392/1530 train_loss:3.2902 train_time:239252ms step_avg:173.12ms step:1393/1530 train_loss:3.2412 train_time:239431ms step_avg:173.12ms step:1394/1530 train_loss:3.5045 train_time:239610ms step_avg:173.13ms step:1395/1530 train_loss:3.3992 train_time:239789ms step_avg:173.13ms step:1396/1530 train_loss:3.4098 train_time:239966ms step_avg:173.14ms step:1397/1530 train_loss:3.3129 train_time:240143ms step_avg:173.14ms step:1398/1530 train_loss:3.2618 train_time:240318ms step_avg:173.14ms step:1399/1530 train_loss:3.3153 train_time:240498ms step_avg:173.14ms step:1400/1530 train_loss:3.3250 train_time:240682ms step_avg:173.15ms step:1401/1530 train_loss:3.3543 train_time:240856ms step_avg:173.15ms step:1402/1530 train_loss:3.3026 train_time:241036ms step_avg:173.16ms step:1403/1530 train_loss:3.5015 train_time:241220ms step_avg:173.17ms step:1404/1530 train_loss:3.2886 train_time:241397ms step_avg:173.17ms step:1405/1530 train_loss:3.3145 train_time:241578ms step_avg:173.17ms step:1406/1530 train_loss:3.3217 train_time:241757ms step_avg:173.18ms step:1407/1530 train_loss:3.1805 train_time:241934ms step_avg:173.18ms step:1408/1530 train_loss:3.3115 train_time:242115ms step_avg:173.19ms step:1409/1530 train_loss:3.3044 train_time:242302ms step_avg:173.20ms step:1410/1530 train_loss:3.2936 train_time:242480ms step_avg:173.20ms step:1411/1530 train_loss:3.3668 train_time:242656ms step_avg:173.20ms step:1412/1530 train_loss:3.3329 train_time:242834ms step_avg:173.21ms step:1413/1530 train_loss:3.3682 train_time:243014ms step_avg:173.21ms step:1414/1530 train_loss:3.3330 train_time:243195ms step_avg:173.22ms step:1415/1530 train_loss:3.4148 train_time:243380ms step_avg:173.22ms step:1416/1530 train_loss:3.2336 train_time:243569ms step_avg:173.24ms step:1417/1530 train_loss:3.2845 train_time:243751ms step_avg:173.24ms step:1418/1530 train_loss:3.4006 train_time:243933ms step_avg:173.25ms step:1419/1530 train_loss:3.3483 train_time:244116ms step_avg:173.25ms step:1420/1530 train_loss:3.3718 train_time:244297ms step_avg:173.26ms step:1421/1530 train_loss:3.3802 train_time:244477ms step_avg:173.27ms step:1422/1530 train_loss:3.3405 train_time:244654ms step_avg:173.27ms step:1423/1530 train_loss:3.3226 train_time:244833ms step_avg:173.27ms step:1424/1530 train_loss:3.3379 train_time:245017ms step_avg:173.28ms step:1425/1530 train_loss:3.1988 train_time:245203ms step_avg:173.29ms step:1426/1530 train_loss:3.3267 train_time:245382ms step_avg:173.29ms step:1427/1530 train_loss:3.2911 train_time:245564ms step_avg:173.30ms step:1428/1530 train_loss:3.3824 train_time:245742ms step_avg:173.30ms step:1429/1530 train_loss:3.3599 train_time:245919ms step_avg:173.30ms step:1430/1530 train_loss:3.2618 train_time:246100ms step_avg:173.31ms step:1431/1530 train_loss:3.3233 train_time:246282ms step_avg:173.32ms step:1432/1530 train_loss:3.3422 train_time:246462ms step_avg:173.32ms step:1433/1530 train_loss:3.1324 train_time:246646ms step_avg:173.33ms step:1434/1530 train_loss:3.2896 train_time:246830ms step_avg:173.34ms step:1435/1530 train_loss:3.1216 train_time:247011ms step_avg:173.34ms step:1436/1530 train_loss:3.2354 train_time:247190ms step_avg:173.35ms step:1437/1530 train_loss:3.4134 train_time:247366ms step_avg:173.35ms step:1438/1530 train_loss:3.3857 train_time:247542ms step_avg:173.35ms step:1439/1530 train_loss:3.3199 train_time:247722ms step_avg:173.35ms step:1440/1530 train_loss:3.1945 train_time:247899ms step_avg:173.36ms step:1441/1530 train_loss:3.3429 train_time:248076ms step_avg:173.36ms step:1442/1530 train_loss:3.3902 train_time:248257ms step_avg:173.36ms step:1443/1530 train_loss:3.4937 train_time:248446ms step_avg:173.37ms step:1444/1530 train_loss:3.4534 train_time:248623ms step_avg:173.38ms step:1445/1530 train_loss:3.3394 train_time:248802ms step_avg:173.38ms step:1446/1530 train_loss:3.2021 train_time:248982ms step_avg:173.39ms step:1447/1530 train_loss:3.2986 train_time:249163ms step_avg:173.39ms step:1448/1530 train_loss:3.2967 train_time:249341ms step_avg:173.39ms step:1449/1530 train_loss:3.4000 train_time:249521ms step_avg:173.40ms step:1450/1530 train_loss:3.3891 train_time:249703ms step_avg:173.40ms step:1451/1530 train_loss:3.2063 train_time:249881ms step_avg:173.41ms step:1452/1530 train_loss:3.3372 train_time:250060ms step_avg:173.41ms step:1453/1530 train_loss:3.2635 train_time:250235ms step_avg:173.41ms step:1454/1530 train_loss:3.2966 train_time:250414ms step_avg:173.42ms step:1455/1530 train_loss:3.3357 train_time:250596ms step_avg:173.42ms step:1456/1530 train_loss:3.2855 train_time:250772ms step_avg:173.42ms step:1457/1530 train_loss:3.1558 train_time:250948ms step_avg:173.43ms step:1458/1530 train_loss:3.4237 train_time:251126ms step_avg:173.43ms step:1459/1530 train_loss:3.2728 train_time:251308ms step_avg:173.44ms step:1460/1530 train_loss:3.3184 train_time:251487ms step_avg:173.44ms step:1461/1530 train_loss:3.4331 train_time:251667ms step_avg:173.44ms step:1462/1530 train_loss:3.2667 train_time:251845ms step_avg:173.45ms step:1463/1530 train_loss:3.4696 train_time:252031ms step_avg:173.46ms step:1464/1530 train_loss:3.3623 train_time:252210ms step_avg:173.46ms step:1465/1530 train_loss:3.3571 train_time:252391ms step_avg:173.46ms step:1466/1530 train_loss:3.2893 train_time:252568ms step_avg:173.47ms step:1467/1530 train_loss:3.3958 train_time:252748ms step_avg:173.47ms step:1468/1530 train_loss:3.2885 train_time:252925ms step_avg:173.47ms step:1469/1530 train_loss:3.2764 train_time:253104ms step_avg:173.48ms step:1470/1530 train_loss:3.3358 train_time:253289ms step_avg:173.49ms step:1471/1530 train_loss:3.2653 train_time:253474ms step_avg:173.49ms step:1472/1530 train_loss:3.2588 train_time:253658ms step_avg:173.50ms step:1473/1530 train_loss:3.4465 train_time:253835ms step_avg:173.50ms step:1474/1530 train_loss:3.3166 train_time:254018ms step_avg:173.51ms step:1475/1530 train_loss:3.1482 train_time:254206ms step_avg:173.52ms step:1476/1530 train_loss:3.2696 train_time:254385ms step_avg:173.52ms step:1477/1530 train_loss:3.2381 train_time:254571ms step_avg:173.53ms step:1478/1530 train_loss:3.3113 train_time:254755ms step_avg:173.54ms step:1479/1530 train_loss:3.4007 train_time:254937ms step_avg:173.54ms step:1480/1530 train_loss:3.2711 train_time:255116ms step_avg:173.55ms step:1481/1530 train_loss:3.4550 train_time:255299ms step_avg:173.55ms step:1482/1530 train_loss:3.3711 train_time:255486ms step_avg:173.56ms step:1483/1530 train_loss:3.2852 train_time:255676ms step_avg:173.58ms step:1484/1530 train_loss:3.2694 train_time:255864ms step_avg:173.58ms step:1485/1530 train_loss:3.2814 train_time:256044ms step_avg:173.59ms step:1486/1530 train_loss:3.2325 train_time:256229ms step_avg:173.60ms step:1487/1530 train_loss:3.3431 train_time:256411ms step_avg:173.60ms step:1488/1530 train_loss:3.2446 train_time:256594ms step_avg:173.61ms step:1489/1530 train_loss:3.3175 train_time:256774ms step_avg:173.61ms step:1490/1530 train_loss:3.2547 train_time:256953ms step_avg:173.62ms step:1491/1530 train_loss:3.1627 train_time:257133ms step_avg:173.62ms step:1492/1530 train_loss:3.2724 train_time:257314ms step_avg:173.63ms step:1493/1530 train_loss:3.4357 train_time:257491ms step_avg:173.63ms step:1494/1530 train_loss:3.2978 train_time:257669ms step_avg:173.63ms step:1495/1530 train_loss:3.0313 train_time:257853ms step_avg:173.64ms step:1496/1530 train_loss:3.3656 train_time:258037ms step_avg:173.65ms step:1497/1530 train_loss:3.3150 train_time:258220ms step_avg:173.65ms step:1498/1530 train_loss:3.3482 train_time:258405ms step_avg:173.66ms step:1499/1530 train_loss:3.3146 train_time:258593ms step_avg:173.67ms step:1500/1530 train_loss:3.2985 train_time:258785ms step_avg:173.68ms step:1500/1530 val_loss:3.2835 train_time:258840ms step_avg:173.72ms step:1501/1530 train_loss:3.0943 train_time:258974ms step_avg:173.69ms step:1502/1530 train_loss:3.3598 train_time:259168ms step_avg:173.71ms step:1503/1530 train_loss:3.2469 train_time:259347ms step_avg:173.71ms step:1504/1530 train_loss:3.2519 train_time:259528ms step_avg:173.71ms step:1505/1530 train_loss:3.2124 train_time:259707ms step_avg:173.72ms step:1506/1530 train_loss:3.2858 train_time:259891ms step_avg:173.72ms step:1507/1530 train_loss:3.1824 train_time:260085ms step_avg:173.74ms step:1508/1530 train_loss:3.4881 train_time:260268ms step_avg:173.74ms step:1509/1530 train_loss:3.2845 train_time:260445ms step_avg:173.75ms step:1510/1530 train_loss:3.2708 train_time:260626ms step_avg:173.75ms step:1511/1530 train_loss:3.4200 train_time:260937ms step_avg:173.84ms step:1512/1530 train_loss:3.4232 train_time:261126ms step_avg:173.85ms step:1513/1530 train_loss:3.2730 train_time:261310ms step_avg:173.86ms step:1514/1530 train_loss:3.0925 train_time:261492ms step_avg:173.86ms step:1515/1530 train_loss:3.2469 train_time:261672ms step_avg:173.87ms step:1516/1530 train_loss:3.2626 train_time:261858ms step_avg:173.88ms step:1517/1530 train_loss:3.3022 train_time:262039ms step_avg:173.88ms step:1518/1530 train_loss:3.2103 train_time:262223ms step_avg:173.89ms step:1519/1530 train_loss:3.5106 train_time:262546ms step_avg:173.99ms step:1520/1530 train_loss:3.1326 train_time:262731ms step_avg:173.99ms step:1521/1530 train_loss:3.2102 train_time:262910ms step_avg:174.00ms step:1522/1530 train_loss:3.3590 train_time:263095ms step_avg:174.00ms step:1523/1530 train_loss:3.2338 train_time:263273ms step_avg:174.01ms step:1524/1530 train_loss:3.3494 train_time:263453ms step_avg:174.01ms step:1525/1530 train_loss:3.3462 train_time:263639ms step_avg:174.02ms step:1526/1530 train_loss:3.2838 train_time:263831ms step_avg:174.03ms step:1527/1530 train_loss:3.2984 train_time:264011ms step_avg:174.03ms step:1528/1530 train_loss:3.4135 train_time:264189ms step_avg:174.04ms step:1529/1530 train_loss:3.4125 train_time:264369ms step_avg:174.04ms step:1530/1530 train_loss:3.2398 train_time:264546ms step_avg:174.04ms step:1530/1530 val_loss:3.2810 train_time:264601ms step_avg:174.08ms