import os
import sys
# Snapshot this script's own source text before anything else runs; it is later written to the
# log file and stored in checkpoints.
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import glob
import time
import contextlib
from dataclasses import dataclass

import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import flex_attention, create_block_mask
# Rebind both FlexAttention entry points to torch.compile'd versions, so every later call in this
# file goes through the compiled wrappers. dynamic=False asks the compiler not to generate
# dynamic-shape kernels.
flex_attention = torch.compile(flex_attention, dynamic=False)
create_block_mask = torch.compile(create_block_mask, dynamic=False)

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G exactly via a singular value decomposition.

    Given the reduced SVD G = U S V^H, returns U V^H, i.e. G with every singular value replaced
    by one.

    Arguments:
        G: The matrix to orthogonalize.
        steps: Unused; accepted only so that all backends share the same call signature.
    """
    # torch.linalg.svd supersedes the deprecated Tensor.svd(). It returns V^H directly, so no
    # explicit transpose is needed; full_matrices=False matches the old reduced-SVD default.
    U, _, Vh = torch.linalg.svd(G, full_matrices=False)
    return U @ Vh

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: The 2D matrix to orthogonalize. It is never modified.
        steps: The number of Newton-Schulz iterations to run.
        eps: Added to the norm of G before dividing, to avoid division by zero.

    Returns:
        A new bfloat16 tensor with the same shape as G.
    """
    assert G.ndim == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # Divide out-of-place: G.bfloat16() returns G itself when G is already bfloat16, so an in-place
    # `X /= ...` would silently rescale the caller's tensor.
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    # Iterate on the wide orientation so that A = X @ X.T is the smaller of the two Gram matrices.
    transposed = G.size(0) > G.size(1)
    if transposed:
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if transposed:
        X = X.T
    return X

# Registry mapping each Muon `backend` name to the function that orthogonalizes an update.
zeropower_backends = {
    'svd': zeropower_via_svd,
    'newtonschulz5': zeropower_via_newtonschulz5,
}

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).
    - step() reads the WORLD_SIZE and RANK environment variables and calls dist.all_reduce, so
    it must be called by every rank of an initialized process group.

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        super().__init__(params, defaults)

    def step(self):
        """
        Apply one update to every parameter.

        The parameters of each group are split round-robin across ranks; each rank computes the
        orthogonalized update only for its own share, and a sum all-reduce over a flat buffer
        (zero wherever a rank owns nothing) gives every rank the complete set of updates.
        """
        # The process topology is fixed for the whole step, so parse it once rather than once per
        # parameter.
        world_size = int(os.environ['WORLD_SIZE'])
        rank = int(os.environ['RANK'])

        for group in self.param_groups:

            lr = group['lr']
            momentum = group['momentum']
            zeropower_backend = zeropower_backends[group['backend']]

            # generate weight updates in distributed fashion
            total_params = sum(p.numel() for p in group['params'])
            updates_flat = torch.zeros(total_params, device='cuda', dtype=torch.bfloat16)
            curr_idx = 0
            for i, p in enumerate(group['params']):
                # luckily this will perfectly distribute a transformer with multiple of 4 layers to 8 GPUs
                if i % world_size == rank:
                    g = p.grad
                    assert g is not None
                    state = self.state[p]
                    if 'momentum_buffer' not in state:
                        state['momentum_buffer'] = torch.zeros_like(g)
                    buf = state['momentum_buffer']
                    buf.mul_(momentum).add_(g)
                    g = g.add(buf, alpha=momentum) if group['nesterov'] else buf
                    g = zeropower_backend(g, steps=group['backend_steps'])
                    # scale up the update for matrices with more rows than columns
                    g *= max(1, g.size(0)/g.size(1))**0.5
                    updates_flat[curr_idx:curr_idx+p.numel()] = g.flatten()
                # the offset advances for every parameter so that all ranks agree on the layout
                curr_idx += p.numel()

            # sync updates across devices. we are not memory-constrained so can do this simple deserialization
            dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM)

            # deserialize and apply updates
            curr_idx = 0
            for p in group['params']:
                g = updates_flat[curr_idx:curr_idx+p.numel()].view_as(p.data).type_as(p.data)
                p.data.add_(g, alpha=-lr)
                curr_idx += p.numel()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype on every forward call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features=in_features, out_features=out_features, bias=False)

    def forward(self, x):
        # the stored parameter keeps its own dtype; only this per-call copy is cast
        weight = self.weight.to(x.dtype)
        return F.linear(x, weight)

class Rotary(torch.nn.Module):
    """Rotary position embedding whose cos/sin tables are cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        exponents = torch.arange(0, dim, 2) / dim
        self.register_buffer('inv_freq', (1 / base) ** exponents)
        # the tables are built lazily by the first forward call
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # positions are read from dim 1 of x and the two rotated halves are split along dim 3
        n_pos = x.shape[1]
        if self.seq_len_cached != n_pos:
            # rebuild the tables only when the sequence length changes
            positions = torch.arange(n_pos, device=x.device)
            angles = torch.outer(positions, self.inv_freq)
            self.seq_len_cached = n_pos
            self.cos_cached = angles.cos()
            self.sin_cached = angles.sin()
        # add singleton dims so the tables broadcast over dims 0 and 2 of x
        cos = self.cos_cached[None, :, None, :]
        sin = self.sin_cached[None, :, None, :]
        first_half, second_half = x.chunk(2, dim=3)
        rotated_first = first_half * cos + second_half * sin
        rotated_second = first_half * (-sin) + second_half * cos
        return torch.cat((rotated_first, rotated_second), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with QK-norm, rotary embeddings and a learned value mix."""

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        # query / key / value projections
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda: interpolation weight between the layer's own values and `vi`
        self.lamb = nn.Parameter(torch.tensor(0.5)) # @Grad62304977
        # rotary embeddings act per head, so they are sized by head_dim = dim // n_head
        self.rotary = Rotary(dim // n_head)
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.shape[:2] # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        per_head = (B, T, self.n_head, -1)
        q = self.c_q(x).view(per_head)
        k = self.c_k(x).view(per_head)
        v = self.c_v(x).view(per_head)
        # blend this layer's values with the externally supplied ones
        v = (1 - self.lamb) * v + self.lamb * vi.view_as(v) # @Grad62304977
        # QK norm suggested by @Grad62304977, followed by the rotary position encoding
        q = self.rotary(norm(q))
        k = self.rotary(norm(k))
        # flex_attention is given the head dim before the sequence dim, hence the transposes
        q_h, k_h, v_h = (t.transpose(1, 2) for t in (q, k, v))
        attended = flex_attention(q_h, k_h, v_h, block_mask=block_mask)
        # re-assemble all head outputs side by side
        merged = attended.transpose(1, 2).contiguous().view_as(x)
        return self.c_proj(merged)

class MLP(nn.Module):
    """Feed-forward block: expand to 4x width, apply squared ReLU, project back."""

    def __init__(self, dim):
        super().__init__()
        hidden = 4 * dim
        self.c_fc   = CastedLinear(dim, hidden)
        self.c_proj = CastedLinear(hidden, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        hidden = self.c_fc(x)
        # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        activated = F.relu(hidden).square()
        return self.c_proj(activated)

class Block(nn.Module):
    """One transformer layer: a learned mix with x0, then attention and MLP residual branches."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head)
        self.mlp = MLP(config.n_embd)
        # mixing weights for (x, x0); the initial value [1, 0] passes x through unchanged
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        mixed = self.lambdas[0] * x + self.lambdas[1] * x0
        after_attn = mixed + self.attn(norm(mixed), vi, block_mask)
        return after_attn + self.mlp(norm(after_attn))

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Architecture hyperparameters consumed by GPT and Block."""
    vocab_size: int = 50304  # rows of the embedding tables / width of the output layer
    n_layer: int = 12        # number of transformer blocks
    n_head: int = 6          # head dim 128 suggested by @Grad62304977
    n_embd: int = 768        # model width

class GPT(nn.Module):
    """
    GPT-2 style language model with U-net skip connections and per-layer token value embeddings.

    The first n_layer // 2 blocks act as an encoder whose outputs are stored; the remaining
    blocks act as a decoder, each of which first adds one stored output (most recent first)
    scaled by a learned weight. n_layer is expected to be even: with an odd value the decoder
    has one more block than there are stored outputs.
    """

    def __init__(self, config):
        super().__init__()

        self.n_layer = config.n_layer
        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
            # one n_embd-wide slice per block, so the width follows n_layer instead of a fixed 12
            vte = nn.Embedding(config.vocab_size, config.n_embd*config.n_layer),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx, target, attn_blocksize):
        """
        Compute the cross-entropy loss of predicting `target` from `idx`.

        Arguments:
            idx: 1D tensor of input token ids; a batch dimension of size 1 is added internally.
            target: Tensor of target token ids; it is flattened before the loss is computed.
            attn_blocksize: Exclusive upper bound on the query-key distance that is attended to.
        """

        # Token id 50256 is treated as the document delimiter: the running count of delimiters
        # gives each position a document id.
        docs = (idx == 50256).cumsum(0)
        def document_causal_mask(b, h, q_idx, kv_idx):
            # a query attends to a key only if the key is not in the future, lies in the same
            # document, and is fewer than attn_blocksize positions back
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < attn_blocksize
            return causal_mask & document_mask & window_mask

        S = len(idx)
        block_mask = create_block_mask(document_causal_mask, None, None, S, S, device="cuda", _compile=True)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        # split the wide value embedding into one slice per block
        vi = self.transformer.vte(idx[None]).chunk(self.n_layer, dim=-1)

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.transformer.h[i](x, vi[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            x = self.transformer.h[self.num_encoder_layers + i](x, vi[self.num_encoder_layers+i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        # soft-cap the logits to the open interval (-30, 30)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(filename):
    """
    Validate the header of a data shard and return the token count it claims.

    Only the header (256 int32 values, 4 bytes each) is read. The process exits with status 1
    if the header is incomplete or its magic number is wrong.

    Arguments:
        filename: Path of the .bin shard to inspect.

    Returns:
        The number of tokens recorded in the header, as a Python int.

    Raises:
        AssertionError: If the header's version field is not 1.
    """
    header_nbytes = 256 * 4
    with open(filename, "rb") as f:
        raw = f.read(header_nbytes)
    if len(raw) < header_nbytes:
        # a short read would otherwise surface as an unrelated IndexError/ValueError below
        print("ERROR: data .bin file is too short to contain a header!")
        print("---> HINT: Are you passing in a correct file with --input_bin?")
        sys.exit(1)
    header = np.frombuffer(raw, dtype=np.int32)
    if header[0] != 20240520:
        print("ERROR: magic number mismatch in the data .bin file!")
        print("---> HINT: Are you passing in a correct file with --input_bin?")
        print("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README")
        print("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try")
        # sys.exit rather than the builtin exit(), which is only installed by the site module
        sys.exit(1)
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(filename):
    """
    Read a whole data shard and return its tokens as a uint16 numpy array.

    The file starts with a header of 256 int32 values; everything after it is read as tokens.
    Raises AssertionError if the magic number or version is wrong, or if the number of tokens
    read differs from the count recorded in the header.
    """
    header_nbytes = 256 * 4
    with open(filename, "rb") as shard:
        header = np.frombuffer(shard.read(header_nbytes), dtype=np.int32)
        assert header[0] == 20240520, "magic number mismatch in the data .bin file"
        assert header[1] == 1, "unsupported version"
        claimed_ntok = header[2]
        # the remainder of the file is the token stream, stored as uint16
        payload = shard.read()
    tokens = np.frombuffer(payload, dtype=np.uint16)
    assert len(tokens) == claimed_ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Serves (input, target) token sequences of length T from a set of shard files.

    Within a shard, each process starts at offset process_rank * T and moves forward by
    T * num_processes tokens per batch. Shards are visited in sorted filename order, wrapping
    around after the last one.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(glob.glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # validate every shard's header and add up the claimed token counts
        shard_sizes = []
        for path in self.files:
            claimed = _peek_data_shard(path)
            # each shard must hold at least one batch for every process plus one extra token
            assert claimed >= num_processes * T + 1
            shard_sizes.append(int(claimed))
        self.ntok_total = sum(shard_sizes)

        self.reset()

    def reset(self):
        """Go back to the start of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        next_shard = (self.current_shard + 1) % len(self.files)
        self.current_shard = next_shard
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[next_shard])

    def next_batch(self):
        """Return the next (inputs, targets) pair as CUDA long tensors of length T."""
        stride = self.T * self.num_processes
        start = self.current_position
        # T + 1 tokens are needed: the targets are the inputs shifted by one position
        window = self.tokens[start:start+self.T+1]
        window = torch.tensor(window.astype(np.int32), dtype=torch.long)
        inputs, targets = window[:-1], window[1:]
        # advance current position and load next shard if necessary
        self.current_position = start + stride
        if self.current_position + stride >= len(self.tokens):
            self.advance()
        return inputs.cuda(), targets.cuda()

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Run configuration: data locations, optimization schedule, and evaluation cadence."""
    # --- data ---
    input_bin: str = 'data/fineweb10B/fineweb_train_*.bin'  # input .bin to train on
    input_val_bin: str = 'data/fineweb10B/fineweb_val_*.bin'  # input .bin to eval validation loss on
    # --- optimization ---
    batch_size: int = 8  # batch size, in sequences, across all devices
    sequence_length: int = 64*1024  # sequence length, in tokens
    num_iterations: int = 1530  # number of iterations to run
    warmup_iters: int = 0  # number of iterations of linear warmup
    cooldown_iters: int = 600  # number of iterations of linear cooldown at the end of the run
    weight_decay: float = 0
    # --- evaluation and logging ---
    val_loss_every: int = 125  # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens: int = 10485760  # how many tokens of validation data? keep this fixed for consistent comparisons
    save_every: int = 0  # every how many steps to save the checkpoint? 0 for only at the end


# the single configuration instance used by the rest of the script
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
# RANK, LOCAL_RANK and WORLD_SIZE are all read from the environment; the script fails with a
# KeyError if any of them is missing.
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
# bind this process to the GPU selected by its local rank
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.

# begin logging
# Only the master process creates a run: a random run id, a directory logs/<run_id>/ (used for
# checkpoints) and a text log logs/<run_id>.txt. On every other rank logfile stays None.
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """Append s to the run's log file, echoing it to stdout unless logonly is set.

    Does nothing on processes other than the master process.
    """
    if not master_process:
        return
    with open(logfile, "a") as log:
        if not logonly:
            print(s)
        log.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
# nvidia-smi is run on every rank, but only its stdout is logged, and only to the log file;
# stderr is captured and discarded.
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
# (each val step consumes T tokens on every rank, so val_tokens must divide evenly)
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
# (each rank processes one sequence per forward pass)
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# prefetch the first training batch; the training loop fetches the next one after each forward pass
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
# cast the whole model to bfloat16, then put the CastedLinear layers back in float32
# (CastedLinear casts its weight to the input dtype on each forward call)
model = model.cuda().bfloat16()
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
# `config` here is torch._inductor.config (imported at the top of the file), not GPTConfig
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# Four optimizers, each covering a disjoint set of parameters:
#   optimizer1 - Adam on the token and value embedding tables
#   optimizer2 - Adam on the output head
#   optimizer3 - Muon on the 2D parameters inside the transformer blocks
#   optimizer4 - Adam on the <2D parameters of the blocks plus the U-net skip weights
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True) # note that this learning rate is neither sensitive nor tuned
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the learning-rate multiplier for iteration `it` (trapezoidal schedule).

    The multiplier ramps up linearly for args.warmup_iters iterations, stays at 1.0, and then
    decays linearly over the last args.cooldown_iters iterations, reaching 0.0 at
    args.num_iterations. With no cooldown configured it stays at 1.0 to the end.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    if it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    if args.cooldown_iters <= 0:
        # Only reachable at it == num_iterations when there is no cooldown phase; the formula
        # below would divide by zero there.
        return 1.0
    return (args.num_iterations - it) / args.cooldown_iters
# one LambdaLR per optimizer, all sharing get_lr as the multiplier applied to each base lr
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# Start training loop
training_time_ms = 0
# start the clock
# (synchronize first so that pending GPU work is not counted as training time)
torch.cuda.synchronize()
t0 = time.time()
# begin training
# Main loop. Each pass optionally runs validation and checkpointing (both excluded from the
# training clock) and then performs one optimizer step. The loop runs num_iterations + 1 times
# so that a final validation/checkpoint happens after the last update.
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.time()
    # timed_steps is NaN early on, so the step_avg values printed below show as "nan" until step 12
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the attention blocksize for the current step, in chunks of 64. By @fernbear.bsky.social
    # (grows linearly with step from 64 at step 0 to 1792 at step num_iterations, rounded down to a multiple of 64)
    attn_blocksize = torch.tensor(64*((step/args.num_iterations * (1792 - 64) + 64)//64), dtype=torch.int, device='cuda')

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.time() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        # NOTE(review): val_loss starts as a Python float and only becomes a tensor once a model
        # output is added to it, so the all_reduce below relies on val_steps >= 1.
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, attn_blocksize=attn_blocksize)
        # average over ranks, then over the validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.time()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.time() - t0)
        # save the state of the training process
        # (step counter, this script's source, model weights and every optimizer's state)
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.time()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for i in range(1, train_accumulation_steps+1):
        # gradients are synchronized across ranks only on the last accumulation step
        ctx = model.no_sync() if i < train_accumulation_steps else contextlib.nullcontext()
        with ctx: # there's no need to sync gradients every accumulation step
            # forward pass
            loss = model(x, y, attn_blocksize=attn_blocksize)
            # advance the dataset for the next batch
            x, y = train_loader.next_batch()
            # backward pass
            loss.backward()
        # only the loss of the most recent micro-batch is kept for logging
        train_loss = loss.detach()
    # turn the accumulated gradient sum into a mean over the accumulation steps
    for p in model.parameters():
        p.grad /= train_accumulation_steps
    # momentum warmup for Muon
    # (linear ramp from 0.85 at step 0 to 0.95 at step 300, constant afterwards)
    frac = min(step/300, 1)
    optimizer3.param_groups[0]['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.

    #dist.all_reduce(train_loss, op=dist.ReduceOp.AVG) # all-reducing the training loss would be more correct in terms of logging, but slower
    # approximate because the clock is read without a CUDA synchronize here
    approx_time = training_time_ms + 1000 * (time.time() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_loss:{train_loss.item():.4f} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

# report the peak CUDA memory allocated by the master process (printed to stdout only, not to the log file)
if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
====================================================================================================
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
nvidia-smi:
Thu Dec  5 04:50:39 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:19:00.0 Off |                    0 |
| N/A   39C    P0              75W / 700W |      3MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  | 00000000:3B:00.0 Off |                    0 |
| N/A   31C    P0              99W / 700W |     22MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   2  NVIDIA H100 80GB HBM3          On  | 00000000:4C:00.0 Off |                    0 |
| N/A   31C    P0              85W / 700W |     22MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   3  NVIDIA H100 80GB HBM3          On  | 00000000:5D:00.0 Off |                    0 |
| N/A   38C    P0             119W / 700W |     43MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   4  NVIDIA H100 80GB HBM3          On  | 00000000:9B:00.0 Off |                    0 |
| N/A   39C    P0             123W / 700W |    529MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   5  NVIDIA H100 80GB HBM3          On  | 00000000:BB:00.0 Off |                    0 |
| N/A   30C    P0             109W / 700W |     39MiB / 81559MiB |      1%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   6  NVIDIA H100 80GB HBM3          On  | 00000000:CB:00.0 Off |                    0 |
| N/A   39C    P0             127W / 700W |    529MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   7  NVIDIA H100 80GB HBM3          On  | 00000000:DB:00.0 Off |                    0 |
| N/A   30C    P0             119W / 700W |    529MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
+---------------------------------------------------------------------------------------+

====================================================================================================
Training DataLoader: total number of tokens: 1100000000 across 11 files
Validation DataLoader: total number of tokens: 100000000 across 1 files
====================================================================================================
step:0/1530 val_loss:10.8258 train_time:0ms step_avg:nanms
step:1/1530 train_loss:10.8258 train_time:32007ms step_avg:nanms
step:2/1530 train_loss:10.0699 train_time:32118ms step_avg:nanms
step:3/1530 train_loss:8.3585 train_time:32278ms step_avg:nanms
step:4/1530 train_loss:7.6048 train_time:32440ms step_avg:nanms
step:5/1530 train_loss:7.4530 train_time:32601ms step_avg:nanms
step:6/1530 train_loss:6.9770 train_time:32762ms step_avg:nanms
step:7/1530 train_loss:7.1810 train_time:32922ms step_avg:nanms
step:8/1530 train_loss:6.7403 train_time:33083ms step_avg:nanms
step:9/1530 train_loss:6.6333 train_time:33244ms step_avg:nanms
step:10/1530 train_loss:6.5202 train_time:33405ms step_avg:nanms
step:11/1530 train_loss:6.5162 train_time:115ms step_avg:nanms
step:12/1530 train_loss:6.3397 train_time:275ms step_avg:nanms
step:13/1530 train_loss:6.2449 train_time:435ms step_avg:145.03ms
step:14/1530 train_loss:6.1789 train_time:596ms step_avg:148.92ms
step:15/1530 train_loss:6.1748 train_time:756ms step_avg:151.13ms
step:16/1530 train_loss:6.0944 train_time:916ms step_avg:152.64ms
step:17/1530 train_loss:6.1580 train_time:1077ms step_avg:153.82ms
step:18/1530 train_loss:5.9519 train_time:1236ms step_avg:154.52ms
step:19/1530 train_loss:6.0066 train_time:1397ms step_avg:155.20ms
step:20/1530 train_loss:5.6697 train_time:1557ms step_avg:155.72ms
step:21/1530 train_loss:5.9755 train_time:1717ms step_avg:156.13ms
step:22/1530 train_loss:6.1991 train_time:1879ms step_avg:156.55ms
step:23/1530 train_loss:5.8444 train_time:2039ms step_avg:156.87ms
step:24/1530 train_loss:6.0191 train_time:2200ms step_avg:157.12ms
step:25/1530 train_loss:5.7055 train_time:2359ms step_avg:157.28ms
step:26/1530 train_loss:5.6038 train_time:2520ms step_avg:157.50ms
step:27/1530 train_loss:5.7844 train_time:2681ms step_avg:157.71ms
step:28/1530 train_loss:5.4030 train_time:2841ms step_avg:157.82ms
step:29/1530 train_loss:5.6830 train_time:3002ms step_avg:157.98ms
step:30/1530 train_loss:5.4696 train_time:3162ms step_avg:158.10ms
step:31/1530 train_loss:5.4413 train_time:3324ms step_avg:158.27ms
step:32/1530 train_loss:5.2839 train_time:3483ms step_avg:158.33ms
step:33/1530 train_loss:5.5819 train_time:3644ms step_avg:158.44ms
step:34/1530 train_loss:5.4937 train_time:3804ms step_avg:158.52ms
step:35/1530 train_loss:5.6433 train_time:3964ms step_avg:158.56ms
step:36/1530 train_loss:5.5649 train_time:4125ms step_avg:158.67ms
step:37/1530 train_loss:5.4634 train_time:4287ms step_avg:158.76ms
step:38/1530 train_loss:5.3056 train_time:4448ms step_avg:158.86ms
step:39/1530 train_loss:5.3271 train_time:4609ms step_avg:158.93ms
step:40/1530 train_loss:5.2365 train_time:4770ms step_avg:159.01ms
step:41/1530 train_loss:5.2206 train_time:4931ms step_avg:159.05ms
step:42/1530 train_loss:5.1593 train_time:5091ms step_avg:159.11ms
step:43/1530 train_loss:5.2675 train_time:5252ms step_avg:159.16ms
step:44/1530 train_loss:5.2282 train_time:5413ms step_avg:159.20ms
step:45/1530 train_loss:5.3880 train_time:5574ms step_avg:159.26ms
step:46/1530 train_loss:5.1766 train_time:5733ms step_avg:159.24ms
step:47/1530 train_loss:5.0679 train_time:5894ms step_avg:159.30ms
step:48/1530 train_loss:5.2081 train_time:6055ms step_avg:159.34ms
step:49/1530 train_loss:5.1519 train_time:6215ms step_avg:159.37ms
step:50/1530 train_loss:5.2506 train_time:6376ms step_avg:159.41ms
step:51/1530 train_loss:5.1388 train_time:6536ms step_avg:159.41ms
step:52/1530 train_loss:5.0217 train_time:6696ms step_avg:159.43ms
step:53/1530 train_loss:5.1656 train_time:6857ms step_avg:159.46ms
step:54/1530 train_loss:5.0105 train_time:7017ms step_avg:159.48ms
step:55/1530 train_loss:5.4168 train_time:7177ms step_avg:159.48ms
step:56/1530 train_loss:5.0089 train_time:7338ms step_avg:159.52ms
step:57/1530 train_loss:4.8833 train_time:7498ms step_avg:159.53ms
step:58/1530 train_loss:5.0423 train_time:7659ms step_avg:159.56ms
step:59/1530 train_loss:5.0164 train_time:7819ms step_avg:159.57ms
step:60/1530 train_loss:5.1328 train_time:7980ms step_avg:159.59ms
step:61/1530 train_loss:4.8393 train_time:8140ms step_avg:159.60ms
step:62/1530 train_loss:4.9634 train_time:8299ms step_avg:159.61ms
step:63/1530 train_loss:4.9640 train_time:8461ms step_avg:159.63ms
step:64/1530 train_loss:4.9806 train_time:8621ms step_avg:159.65ms
step:65/1530 train_loss:4.7761 train_time:8782ms step_avg:159.67ms
step:66/1530 train_loss:4.8996 train_time:8942ms step_avg:159.67ms
step:67/1530 train_loss:4.8256 train_time:9105ms step_avg:159.73ms
step:68/1530 train_loss:5.1034 train_time:9266ms step_avg:159.76ms
step:69/1530 train_loss:4.7059 train_time:9425ms step_avg:159.75ms
step:70/1530 train_loss:4.8626 train_time:9587ms step_avg:159.78ms
step:71/1530 train_loss:4.9704 train_time:9748ms step_avg:159.80ms
step:72/1530 train_loss:4.8786 train_time:9908ms step_avg:159.81ms
step:73/1530 train_loss:4.7416 train_time:10070ms step_avg:159.83ms
step:74/1530 train_loss:4.8936 train_time:10230ms step_avg:159.84ms
step:75/1530 train_loss:4.8276 train_time:10391ms step_avg:159.86ms
step:76/1530 train_loss:4.7822 train_time:10551ms step_avg:159.87ms
step:77/1530 train_loss:4.9019 train_time:10712ms step_avg:159.88ms
step:78/1530 train_loss:5.1104 train_time:10872ms step_avg:159.89ms
step:79/1530 train_loss:4.8005 train_time:11032ms step_avg:159.88ms
step:80/1530 train_loss:4.8517 train_time:11193ms step_avg:159.90ms
step:81/1530 train_loss:4.6364 train_time:11354ms step_avg:159.92ms
step:82/1530 train_loss:4.8107 train_time:11515ms step_avg:159.93ms
step:83/1530 train_loss:4.7747 train_time:11676ms step_avg:159.94ms
step:84/1530 train_loss:4.7556 train_time:11835ms step_avg:159.94ms
step:85/1530 train_loss:4.6195 train_time:11996ms step_avg:159.95ms
step:86/1530 train_loss:4.8198 train_time:12156ms step_avg:159.95ms
step:87/1530 train_loss:4.7266 train_time:12316ms step_avg:159.95ms
step:88/1530 train_loss:4.7131 train_time:12477ms step_avg:159.96ms
step:89/1530 train_loss:4.6766 train_time:12637ms step_avg:159.96ms
step:90/1530 train_loss:4.6120 train_time:12797ms step_avg:159.97ms
step:91/1530 train_loss:4.6074 train_time:12958ms step_avg:159.98ms
step:92/1530 train_loss:4.7602 train_time:13119ms step_avg:159.98ms
step:93/1530 train_loss:4.5920 train_time:13279ms step_avg:159.98ms
step:94/1530 train_loss:4.6315 train_time:13439ms step_avg:159.99ms
step:95/1530 train_loss:4.6536 train_time:13600ms step_avg:160.00ms
step:96/1530 train_loss:4.5735 train_time:13760ms step_avg:160.01ms
step:97/1530 train_loss:4.6112 train_time:13921ms step_avg:160.01ms
step:98/1530 train_loss:4.5543 train_time:14082ms step_avg:160.02ms
step:99/1530 train_loss:4.6254 train_time:14242ms step_avg:160.02ms
step:100/1530 train_loss:4.6507 train_time:14403ms step_avg:160.03ms
step:101/1530 train_loss:4.5112 train_time:14564ms step_avg:160.04ms
step:102/1530 train_loss:4.6849 train_time:14724ms step_avg:160.05ms
step:103/1530 train_loss:4.5643 train_time:14884ms step_avg:160.04ms
step:104/1530 train_loss:4.5281 train_time:15047ms step_avg:160.07ms
step:105/1530 train_loss:4.5354 train_time:15207ms step_avg:160.08ms
step:106/1530 train_loss:4.5793 train_time:15369ms step_avg:160.09ms
step:107/1530 train_loss:4.4975 train_time:15528ms step_avg:160.09ms
step:108/1530 train_loss:4.3505 train_time:15689ms step_avg:160.10ms
step:109/1530 train_loss:4.4880 train_time:15850ms step_avg:160.10ms
step:110/1530 train_loss:4.4750 train_time:16010ms step_avg:160.10ms
step:111/1530 train_loss:4.4194 train_time:16171ms step_avg:160.10ms
step:112/1530 train_loss:4.5816 train_time:16330ms step_avg:160.10ms
step:113/1530 train_loss:4.4731 train_time:16492ms step_avg:160.11ms
step:114/1530 train_loss:4.3517 train_time:16653ms step_avg:160.12ms
step:115/1530 train_loss:4.4910 train_time:16815ms step_avg:160.15ms
step:116/1530 train_loss:4.4505 train_time:16980ms step_avg:160.19ms
step:117/1530 train_loss:4.3540 train_time:17144ms step_avg:160.23ms
step:118/1530 train_loss:4.5758 train_time:17309ms step_avg:160.27ms
step:119/1530 train_loss:4.4335 train_time:17473ms step_avg:160.31ms
step:120/1530 train_loss:4.3138 train_time:17637ms step_avg:160.34ms
step:121/1530 train_loss:4.2911 train_time:17801ms step_avg:160.37ms
step:122/1530 train_loss:4.4450 train_time:17964ms step_avg:160.40ms
step:123/1530 train_loss:4.2716 train_time:18130ms step_avg:160.44ms
step:124/1530 train_loss:4.5665 train_time:18294ms step_avg:160.48ms
step:125/1530 train_loss:4.4376 train_time:18457ms step_avg:160.50ms
step:125/1530 val_loss:4.3899 train_time:18505ms step_avg:160.91ms
step:126/1530 train_loss:4.4000 train_time:18625ms step_avg:160.56ms
step:127/1530 train_loss:4.4220 train_time:18790ms step_avg:160.60ms
step:128/1530 train_loss:4.3629 train_time:18955ms step_avg:160.64ms
step:129/1530 train_loss:4.6634 train_time:19120ms step_avg:160.67ms
step:130/1530 train_loss:4.3375 train_time:19285ms step_avg:160.70ms
step:131/1530 train_loss:4.3788 train_time:19448ms step_avg:160.73ms
step:132/1530 train_loss:4.3269 train_time:19612ms step_avg:160.75ms
step:133/1530 train_loss:4.4499 train_time:19777ms step_avg:160.79ms
step:134/1530 train_loss:4.2680 train_time:19942ms step_avg:160.82ms
step:135/1530 train_loss:4.4454 train_time:20105ms step_avg:160.84ms
step:136/1530 train_loss:4.2074 train_time:20269ms step_avg:160.87ms
step:137/1530 train_loss:4.3628 train_time:20433ms step_avg:160.89ms
step:138/1530 train_loss:4.2760 train_time:20598ms step_avg:160.92ms
step:139/1530 train_loss:4.3803 train_time:20763ms step_avg:160.96ms
step:140/1530 train_loss:4.4581 train_time:20928ms step_avg:160.98ms
step:141/1530 train_loss:4.3090 train_time:21092ms step_avg:161.00ms
step:142/1530 train_loss:4.2991 train_time:21257ms step_avg:161.03ms
step:143/1530 train_loss:4.2502 train_time:21422ms step_avg:161.06ms
step:144/1530 train_loss:4.3409 train_time:21586ms step_avg:161.09ms
step:145/1530 train_loss:4.2923 train_time:21750ms step_avg:161.11ms
step:146/1530 train_loss:4.1528 train_time:21914ms step_avg:161.13ms
step:147/1530 train_loss:4.3054 train_time:22078ms step_avg:161.15ms
step:148/1530 train_loss:4.3523 train_time:22243ms step_avg:161.18ms
step:149/1530 train_loss:4.2879 train_time:22406ms step_avg:161.19ms
step:150/1530 train_loss:4.4281 train_time:22570ms step_avg:161.22ms
step:151/1530 train_loss:4.2587 train_time:22735ms step_avg:161.24ms
step:152/1530 train_loss:4.2682 train_time:22901ms step_avg:161.27ms
step:153/1530 train_loss:4.3681 train_time:23067ms step_avg:161.31ms
step:154/1530 train_loss:4.3757 train_time:23229ms step_avg:161.31ms
step:155/1530 train_loss:4.2744 train_time:23393ms step_avg:161.33ms
step:156/1530 train_loss:4.3322 train_time:23559ms step_avg:161.36ms
step:157/1530 train_loss:4.3885 train_time:23723ms step_avg:161.38ms
step:158/1530 train_loss:4.2338 train_time:23887ms step_avg:161.40ms
step:159/1530 train_loss:4.2991 train_time:24051ms step_avg:161.42ms
step:160/1530 train_loss:4.1194 train_time:24215ms step_avg:161.43ms
step:161/1530 train_loss:4.3412 train_time:24379ms step_avg:161.45ms
step:162/1530 train_loss:4.3463 train_time:24544ms step_avg:161.48ms
step:163/1530 train_loss:4.3352 train_time:24708ms step_avg:161.49ms
step:164/1530 train_loss:4.1739 train_time:24872ms step_avg:161.51ms
step:165/1530 train_loss:4.2730 train_time:25037ms step_avg:161.53ms
step:166/1530 train_loss:4.3270 train_time:25202ms step_avg:161.55ms
step:167/1530 train_loss:4.2004 train_time:25366ms step_avg:161.57ms
step:168/1530 train_loss:4.2778 train_time:25529ms step_avg:161.58ms
step:169/1530 train_loss:4.1564 train_time:25693ms step_avg:161.59ms
step:170/1530 train_loss:4.0267 train_time:25859ms step_avg:161.62ms
step:171/1530 train_loss:4.2018 train_time:26023ms step_avg:161.63ms
step:172/1530 train_loss:4.2090 train_time:26187ms step_avg:161.65ms
step:173/1530 train_loss:4.2572 train_time:26351ms step_avg:161.66ms
step:174/1530 train_loss:4.4103 train_time:26514ms step_avg:161.67ms
step:175/1530 train_loss:4.2419 train_time:26677ms step_avg:161.68ms
step:176/1530 train_loss:4.0890 train_time:26840ms step_avg:161.69ms
step:177/1530 train_loss:4.0615 train_time:27003ms step_avg:161.69ms
step:178/1530 train_loss:4.1781 train_time:27166ms step_avg:161.70ms
step:179/1530 train_loss:4.1194 train_time:27329ms step_avg:161.71ms
step:180/1530 train_loss:4.1056 train_time:27491ms step_avg:161.71ms
step:181/1530 train_loss:4.2898 train_time:27655ms step_avg:161.72ms
step:182/1530 train_loss:4.1578 train_time:27819ms step_avg:161.74ms
step:183/1530 train_loss:4.1226 train_time:27983ms step_avg:161.75ms
step:184/1530 train_loss:4.1201 train_time:28146ms step_avg:161.76ms
step:185/1530 train_loss:4.1974 train_time:28309ms step_avg:161.76ms
step:186/1530 train_loss:4.1706 train_time:28472ms step_avg:161.77ms
step:187/1530 train_loss:4.2334 train_time:28634ms step_avg:161.78ms
step:188/1530 train_loss:4.1594 train_time:28936ms step_avg:162.56ms
step:189/1530 train_loss:4.1004 train_time:29264ms step_avg:163.49ms
step:190/1530 train_loss:4.2014 train_time:29429ms step_avg:163.49ms
step:191/1530 train_loss:4.0735 train_time:29591ms step_avg:163.49ms
step:192/1530 train_loss:4.0229 train_time:29754ms step_avg:163.48ms
step:193/1530 train_loss:4.2538 train_time:29917ms step_avg:163.48ms
step:194/1530 train_loss:4.1713 train_time:30080ms step_avg:163.48ms
step:195/1530 train_loss:4.3538 train_time:30244ms step_avg:163.48ms
step:196/1530 train_loss:4.1710 train_time:30405ms step_avg:163.47ms
step:197/1530 train_loss:4.0378 train_time:30569ms step_avg:163.47ms
step:198/1530 train_loss:4.1707 train_time:30732ms step_avg:163.47ms
step:199/1530 train_loss:4.0198 train_time:30895ms step_avg:163.47ms
step:200/1530 train_loss:4.1018 train_time:31059ms step_avg:163.47ms
step:201/1530 train_loss:3.9920 train_time:31222ms step_avg:163.47ms
step:202/1530 train_loss:4.2481 train_time:31385ms step_avg:163.46ms
step:203/1530 train_loss:4.0649 train_time:31547ms step_avg:163.45ms
step:204/1530 train_loss:4.1842 train_time:31709ms step_avg:163.45ms
step:205/1530 train_loss:4.2338 train_time:31872ms step_avg:163.44ms
step:206/1530 train_loss:3.9370 train_time:32035ms step_avg:163.45ms
step:207/1530 train_loss:4.0763 train_time:32199ms step_avg:163.45ms
step:208/1530 train_loss:4.0919 train_time:32360ms step_avg:163.44ms
step:209/1530 train_loss:4.2296 train_time:32524ms step_avg:163.44ms
step:210/1530 train_loss:4.1701 train_time:32688ms step_avg:163.44ms
step:211/1530 train_loss:4.0638 train_time:32851ms step_avg:163.44ms
step:212/1530 train_loss:4.1235 train_time:33015ms step_avg:163.44ms
step:213/1530 train_loss:4.0471 train_time:33178ms step_avg:163.44ms
step:214/1530 train_loss:4.1075 train_time:33340ms step_avg:163.43ms
step:215/1530 train_loss:3.9511 train_time:33504ms step_avg:163.43ms
step:216/1530 train_loss:3.9962 train_time:33667ms step_avg:163.43ms
step:217/1530 train_loss:4.0146 train_time:33830ms step_avg:163.43ms
step:218/1530 train_loss:4.0751 train_time:33992ms step_avg:163.42ms
step:219/1530 train_loss:4.0631 train_time:34155ms step_avg:163.42ms
step:220/1530 train_loss:4.0775 train_time:34317ms step_avg:163.42ms
step:221/1530 train_loss:4.0839 train_time:34480ms step_avg:163.41ms
step:222/1530 train_loss:3.9853 train_time:34644ms step_avg:163.42ms
step:223/1530 train_loss:3.9744 train_time:34806ms step_avg:163.41ms
step:224/1530 train_loss:4.2899 train_time:34970ms step_avg:163.41ms
step:225/1530 train_loss:3.9195 train_time:35132ms step_avg:163.41ms
step:226/1530 train_loss:3.9880 train_time:35295ms step_avg:163.40ms
step:227/1530 train_loss:3.9763 train_time:35459ms step_avg:163.41ms
step:228/1530 train_loss:4.1418 train_time:35624ms step_avg:163.41ms
step:229/1530 train_loss:3.9193 train_time:35790ms step_avg:163.42ms
step:230/1530 train_loss:4.0368 train_time:35955ms step_avg:163.43ms
step:231/1530 train_loss:3.8969 train_time:36122ms step_avg:163.45ms
step:232/1530 train_loss:3.9621 train_time:36287ms step_avg:163.46ms
step:233/1530 train_loss:4.0845 train_time:36453ms step_avg:163.47ms
step:234/1530 train_loss:4.0274 train_time:36619ms step_avg:163.48ms
step:235/1530 train_loss:3.8968 train_time:36787ms step_avg:163.50ms
step:236/1530 train_loss:4.0779 train_time:36952ms step_avg:163.51ms
step:237/1530 train_loss:4.0809 train_time:37117ms step_avg:163.51ms
step:238/1530 train_loss:3.9345 train_time:37284ms step_avg:163.53ms
step:239/1530 train_loss:4.0727 train_time:37451ms step_avg:163.54ms
step:240/1530 train_loss:4.1079 train_time:37617ms step_avg:163.55ms
step:241/1530 train_loss:3.9613 train_time:37783ms step_avg:163.56ms
step:242/1530 train_loss:4.1350 train_time:37949ms step_avg:163.57ms
step:243/1530 train_loss:3.9992 train_time:38115ms step_avg:163.58ms
step:244/1530 train_loss:4.0699 train_time:38282ms step_avg:163.60ms
step:245/1530 train_loss:4.1417 train_time:38448ms step_avg:163.61ms
step:246/1530 train_loss:4.0536 train_time:38614ms step_avg:163.62ms
step:247/1530 train_loss:3.9962 train_time:38780ms step_avg:163.63ms
step:248/1530 train_loss:4.0922 train_time:38946ms step_avg:163.64ms
step:249/1530 train_loss:3.9175 train_time:39112ms step_avg:163.65ms
step:250/1530 train_loss:3.9660 train_time:39277ms step_avg:163.65ms
step:250/1530 val_loss:3.9936 train_time:39325ms step_avg:163.85ms
step:251/1530 train_loss:4.0660 train_time:39445ms step_avg:163.67ms
step:252/1530 train_loss:4.1558 train_time:39614ms step_avg:163.70ms
step:253/1530 train_loss:3.9246 train_time:39780ms step_avg:163.71ms
step:254/1530 train_loss:3.8708 train_time:39948ms step_avg:163.72ms
step:255/1530 train_loss:4.0777 train_time:40116ms step_avg:163.74ms
step:256/1530 train_loss:3.9833 train_time:40282ms step_avg:163.75ms
step:257/1530 train_loss:3.9844 train_time:40448ms step_avg:163.76ms
step:258/1530 train_loss:3.9776 train_time:40616ms step_avg:163.77ms
step:259/1530 train_loss:4.0256 train_time:40782ms step_avg:163.78ms
step:260/1530 train_loss:4.0518 train_time:40948ms step_avg:163.79ms
step:261/1530 train_loss:4.0146 train_time:41117ms step_avg:163.81ms
step:262/1530 train_loss:3.9845 train_time:41282ms step_avg:163.82ms
step:263/1530 train_loss:3.8904 train_time:41449ms step_avg:163.83ms
step:264/1530 train_loss:3.9772 train_time:41614ms step_avg:163.84ms
step:265/1530 train_loss:3.8584 train_time:41781ms step_avg:163.85ms
step:266/1530 train_loss:3.9171 train_time:41947ms step_avg:163.86ms
step:267/1530 train_loss:3.9239 train_time:42115ms step_avg:163.87ms
step:268/1530 train_loss:3.9550 train_time:42281ms step_avg:163.88ms
step:269/1530 train_loss:3.8367 train_time:42446ms step_avg:163.88ms
step:270/1530 train_loss:4.0932 train_time:42615ms step_avg:163.90ms
step:271/1530 train_loss:3.9616 train_time:42781ms step_avg:163.91ms
step:272/1530 train_loss:3.9162 train_time:42946ms step_avg:163.91ms
step:273/1530 train_loss:3.9335 train_time:43114ms step_avg:163.93ms
step:274/1530 train_loss:4.0428 train_time:43280ms step_avg:163.94ms
step:275/1530 train_loss:4.0599 train_time:43447ms step_avg:163.95ms
step:276/1530 train_loss:4.2295 train_time:43616ms step_avg:163.97ms
step:277/1530 train_loss:4.0345 train_time:43781ms step_avg:163.98ms
step:278/1530 train_loss:4.0739 train_time:43948ms step_avg:163.98ms
step:279/1530 train_loss:3.9908 train_time:44115ms step_avg:164.00ms
step:280/1530 train_loss:4.1764 train_time:44282ms step_avg:164.01ms
step:281/1530 train_loss:3.9686 train_time:44449ms step_avg:164.02ms
step:282/1530 train_loss:3.9368 train_time:44616ms step_avg:164.03ms
step:283/1530 train_loss:3.9028 train_time:44782ms step_avg:164.04ms
step:284/1530 train_loss:4.0392 train_time:44948ms step_avg:164.05ms
step:285/1530 train_loss:4.0659 train_time:45115ms step_avg:164.05ms
step:286/1530 train_loss:4.0823 train_time:45280ms step_avg:164.06ms
step:287/1530 train_loss:3.8993 train_time:45445ms step_avg:164.06ms
step:288/1530 train_loss:4.0104 train_time:45610ms step_avg:164.06ms
step:289/1530 train_loss:3.8638 train_time:45775ms step_avg:164.07ms
step:290/1530 train_loss:3.8533 train_time:45940ms step_avg:164.07ms
step:291/1530 train_loss:3.8936 train_time:46107ms step_avg:164.08ms
step:292/1530 train_loss:3.8568 train_time:46272ms step_avg:164.09ms
step:293/1530 train_loss:3.8962 train_time:46437ms step_avg:164.09ms
step:294/1530 train_loss:3.9261 train_time:46602ms step_avg:164.09ms
step:295/1530 train_loss:3.8311 train_time:46768ms step_avg:164.10ms
step:296/1530 train_loss:3.8577 train_time:46934ms step_avg:164.11ms
step:297/1530 train_loss:3.8630 train_time:47099ms step_avg:164.11ms
step:298/1530 train_loss:3.9704 train_time:47264ms step_avg:164.11ms
step:299/1530 train_loss:3.8164 train_time:47428ms step_avg:164.11ms
step:300/1530 train_loss:3.9579 train_time:47594ms step_avg:164.12ms
step:301/1530 train_loss:3.9616 train_time:47759ms step_avg:164.12ms
step:302/1530 train_loss:3.9287 train_time:47925ms step_avg:164.13ms
step:303/1530 train_loss:3.9679 train_time:48091ms step_avg:164.13ms
step:304/1530 train_loss:3.9616 train_time:48255ms step_avg:164.13ms
step:305/1530 train_loss:4.4492 train_time:48422ms step_avg:164.14ms
step:306/1530 train_loss:3.9369 train_time:48588ms step_avg:164.15ms
step:307/1530 train_loss:3.8308 train_time:48753ms step_avg:164.15ms
step:308/1530 train_loss:3.9709 train_time:48918ms step_avg:164.15ms
step:309/1530 train_loss:3.8607 train_time:49083ms step_avg:164.16ms
step:310/1530 train_loss:4.0756 train_time:49247ms step_avg:164.16ms
step:311/1530 train_loss:3.9243 train_time:49415ms step_avg:164.17ms
step:312/1530 train_loss:3.8544 train_time:49580ms step_avg:164.17ms
step:313/1530 train_loss:3.9299 train_time:49746ms step_avg:164.18ms
step:314/1530 train_loss:4.0575 train_time:49913ms step_avg:164.19ms
step:315/1530 train_loss:3.9365 train_time:50078ms step_avg:164.19ms
step:316/1530 train_loss:3.7869 train_time:50243ms step_avg:164.19ms
step:317/1530 train_loss:3.8667 train_time:50409ms step_avg:164.20ms
step:318/1530 train_loss:3.9168 train_time:50574ms step_avg:164.20ms
step:319/1530 train_loss:3.8874 train_time:50739ms step_avg:164.21ms
step:320/1530 train_loss:4.0008 train_time:50905ms step_avg:164.21ms
step:321/1530 train_loss:3.9530 train_time:51071ms step_avg:164.22ms
step:322/1530 train_loss:3.9265 train_time:51237ms step_avg:164.22ms
step:323/1530 train_loss:3.9994 train_time:51402ms step_avg:164.22ms
step:324/1530 train_loss:3.9478 train_time:51566ms step_avg:164.22ms
step:325/1530 train_loss:4.0084 train_time:51733ms step_avg:164.23ms
step:326/1530 train_loss:3.8903 train_time:51898ms step_avg:164.23ms
step:327/1530 train_loss:4.3960 train_time:52062ms step_avg:164.23ms
step:328/1530 train_loss:4.0692 train_time:52229ms step_avg:164.24ms
step:329/1530 train_loss:3.7935 train_time:52394ms step_avg:164.24ms
step:330/1530 train_loss:3.7376 train_time:52558ms step_avg:164.24ms
step:331/1530 train_loss:3.9734 train_time:52723ms step_avg:164.25ms
step:332/1530 train_loss:3.9159 train_time:52889ms step_avg:164.25ms
step:333/1530 train_loss:3.8788 train_time:53054ms step_avg:164.26ms
step:334/1530 train_loss:3.8341 train_time:53219ms step_avg:164.26ms
step:335/1530 train_loss:4.0043 train_time:53384ms step_avg:164.26ms
step:336/1530 train_loss:3.9549 train_time:53550ms step_avg:164.26ms
step:337/1530 train_loss:4.4195 train_time:53717ms step_avg:164.27ms
step:338/1530 train_loss:3.9276 train_time:53881ms step_avg:164.27ms
step:339/1530 train_loss:3.8563 train_time:54047ms step_avg:164.28ms
step:340/1530 train_loss:3.9282 train_time:54213ms step_avg:164.28ms
step:341/1530 train_loss:3.8489 train_time:54380ms step_avg:164.29ms
step:342/1530 train_loss:3.7998 train_time:54548ms step_avg:164.30ms
step:343/1530 train_loss:3.8323 train_time:54717ms step_avg:164.32ms
step:344/1530 train_loss:3.9840 train_time:54885ms step_avg:164.33ms
step:345/1530 train_loss:3.8117 train_time:55055ms step_avg:164.34ms
step:346/1530 train_loss:3.7678 train_time:55223ms step_avg:164.35ms
step:347/1530 train_loss:3.7915 train_time:55392ms step_avg:164.37ms
step:348/1530 train_loss:3.8549 train_time:55560ms step_avg:164.38ms
step:349/1530 train_loss:3.8236 train_time:55728ms step_avg:164.39ms
step:350/1530 train_loss:3.5661 train_time:55898ms step_avg:164.41ms
step:351/1530 train_loss:3.8187 train_time:56065ms step_avg:164.41ms
step:352/1530 train_loss:4.1727 train_time:56234ms step_avg:164.43ms
step:353/1530 train_loss:3.6547 train_time:56401ms step_avg:164.43ms
step:354/1530 train_loss:3.9192 train_time:56569ms step_avg:164.44ms
step:355/1530 train_loss:3.7791 train_time:56738ms step_avg:164.46ms
step:356/1530 train_loss:3.8783 train_time:56906ms step_avg:164.47ms
step:357/1530 train_loss:3.7602 train_time:57073ms step_avg:164.48ms
step:358/1530 train_loss:3.8540 train_time:57242ms step_avg:164.49ms
step:359/1530 train_loss:3.7804 train_time:57413ms step_avg:164.51ms
step:360/1530 train_loss:3.4254 train_time:57582ms step_avg:164.52ms
step:361/1530 train_loss:4.0172 train_time:57752ms step_avg:164.53ms
step:362/1530 train_loss:3.9136 train_time:57920ms step_avg:164.55ms
step:363/1530 train_loss:3.8330 train_time:58088ms step_avg:164.56ms
step:364/1530 train_loss:3.7338 train_time:58257ms step_avg:164.57ms
step:365/1530 train_loss:3.9116 train_time:58424ms step_avg:164.58ms
step:366/1530 train_loss:3.8559 train_time:58594ms step_avg:164.59ms
step:367/1530 train_loss:3.8525 train_time:58761ms step_avg:164.60ms
step:368/1530 train_loss:3.8471 train_time:58929ms step_avg:164.61ms
step:369/1530 train_loss:3.7434 train_time:59096ms step_avg:164.61ms
step:370/1530 train_loss:3.8765 train_time:59264ms step_avg:164.62ms
step:371/1530 train_loss:3.7222 train_time:59431ms step_avg:164.63ms
step:372/1530 train_loss:3.6877 train_time:59600ms step_avg:164.64ms
step:373/1530 train_loss:3.9134 train_time:59767ms step_avg:164.65ms
step:374/1530 train_loss:3.8237 train_time:59934ms step_avg:164.66ms
step:375/1530 train_loss:3.7989 train_time:60102ms step_avg:164.66ms
step:375/1530 val_loss:3.8223 train_time:60150ms step_avg:164.80ms
step:376/1530 train_loss:3.8605 train_time:60271ms step_avg:164.67ms
step:377/1530 train_loss:3.7869 train_time:60570ms step_avg:165.04ms
step:378/1530 train_loss:3.8479 train_time:60747ms step_avg:165.07ms
step:379/1530 train_loss:3.8667 train_time:61066ms step_avg:165.49ms
step:380/1530 train_loss:3.9520 train_time:61236ms step_avg:165.50ms
step:381/1530 train_loss:3.8271 train_time:61403ms step_avg:165.51ms
step:382/1530 train_loss:3.8026 train_time:61573ms step_avg:165.52ms
step:383/1530 train_loss:3.7953 train_time:61741ms step_avg:165.53ms
step:384/1530 train_loss:3.8695 train_time:61908ms step_avg:165.53ms
step:385/1530 train_loss:3.7949 train_time:62079ms step_avg:165.54ms
step:386/1530 train_loss:3.8822 train_time:62245ms step_avg:165.55ms
step:387/1530 train_loss:4.0519 train_time:62413ms step_avg:165.55ms
step:388/1530 train_loss:3.7915 train_time:62580ms step_avg:165.56ms
step:389/1530 train_loss:3.7873 train_time:62748ms step_avg:165.56ms
step:390/1530 train_loss:3.8870 train_time:62918ms step_avg:165.57ms
step:391/1530 train_loss:3.8052 train_time:63084ms step_avg:165.58ms
step:392/1530 train_loss:3.9152 train_time:63251ms step_avg:165.58ms
step:393/1530 train_loss:3.7542 train_time:63420ms step_avg:165.59ms
step:394/1530 train_loss:3.8743 train_time:63587ms step_avg:165.59ms
step:395/1530 train_loss:3.6270 train_time:63756ms step_avg:165.60ms
step:396/1530 train_loss:3.8342 train_time:63923ms step_avg:165.60ms
step:397/1530 train_loss:3.8598 train_time:64091ms step_avg:165.61ms
step:398/1530 train_loss:3.8765 train_time:64259ms step_avg:165.62ms
step:399/1530 train_loss:3.7682 train_time:64426ms step_avg:165.62ms
step:400/1530 train_loss:3.8289 train_time:64594ms step_avg:165.63ms
step:401/1530 train_loss:3.9110 train_time:64761ms step_avg:165.63ms
step:402/1530 train_loss:3.8370 train_time:64929ms step_avg:165.63ms
step:403/1530 train_loss:3.9507 train_time:65097ms step_avg:165.64ms
step:404/1530 train_loss:3.6738 train_time:65263ms step_avg:165.64ms
step:405/1530 train_loss:3.7843 train_time:65432ms step_avg:165.65ms
step:406/1530 train_loss:4.0870 train_time:65599ms step_avg:165.65ms
step:407/1530 train_loss:3.7693 train_time:65768ms step_avg:165.66ms
step:408/1530 train_loss:3.8157 train_time:65936ms step_avg:165.67ms
step:409/1530 train_loss:3.8505 train_time:66103ms step_avg:165.67ms
step:410/1530 train_loss:3.7564 train_time:66270ms step_avg:165.67ms
step:411/1530 train_loss:3.7571 train_time:66437ms step_avg:165.68ms
step:412/1530 train_loss:4.1777 train_time:66605ms step_avg:165.68ms
step:413/1530 train_loss:3.6226 train_time:66772ms step_avg:165.69ms
step:414/1530 train_loss:4.0044 train_time:66940ms step_avg:165.69ms
step:415/1530 train_loss:3.7431 train_time:67107ms step_avg:165.70ms
step:416/1530 train_loss:3.7576 train_time:67274ms step_avg:165.70ms
step:417/1530 train_loss:3.9496 train_time:67442ms step_avg:165.71ms
step:418/1530 train_loss:3.6833 train_time:67609ms step_avg:165.71ms
step:419/1530 train_loss:3.7965 train_time:67777ms step_avg:165.71ms
step:420/1530 train_loss:3.6947 train_time:67943ms step_avg:165.71ms
step:421/1530 train_loss:3.6390 train_time:68110ms step_avg:165.72ms
step:422/1530 train_loss:3.7751 train_time:68278ms step_avg:165.72ms
step:423/1530 train_loss:3.8639 train_time:68445ms step_avg:165.73ms
step:424/1530 train_loss:3.6087 train_time:68612ms step_avg:165.73ms
step:425/1530 train_loss:3.7904 train_time:68779ms step_avg:165.73ms
step:426/1530 train_loss:3.6500 train_time:68947ms step_avg:165.74ms
step:427/1530 train_loss:3.8885 train_time:69114ms step_avg:165.74ms
step:428/1530 train_loss:3.8045 train_time:69281ms step_avg:165.74ms
step:429/1530 train_loss:3.7523 train_time:69449ms step_avg:165.75ms
step:430/1530 train_loss:3.6953 train_time:69618ms step_avg:165.76ms
step:431/1530 train_loss:3.6210 train_time:69786ms step_avg:165.76ms
step:432/1530 train_loss:3.7571 train_time:69953ms step_avg:165.77ms
step:433/1530 train_loss:3.8086 train_time:70120ms step_avg:165.77ms
step:434/1530 train_loss:3.7649 train_time:70286ms step_avg:165.77ms
step:435/1530 train_loss:3.7984 train_time:70453ms step_avg:165.77ms
step:436/1530 train_loss:3.8295 train_time:70620ms step_avg:165.77ms
step:437/1530 train_loss:3.7210 train_time:70787ms step_avg:165.78ms
step:438/1530 train_loss:3.6961 train_time:70953ms step_avg:165.78ms
step:439/1530 train_loss:3.7065 train_time:71121ms step_avg:165.78ms
step:440/1530 train_loss:3.8842 train_time:71289ms step_avg:165.79ms
step:441/1530 train_loss:3.7534 train_time:71456ms step_avg:165.79ms
step:442/1530 train_loss:3.7393 train_time:71623ms step_avg:165.79ms
step:443/1530 train_loss:3.6211 train_time:71789ms step_avg:165.80ms
step:444/1530 train_loss:3.9272 train_time:71957ms step_avg:165.80ms
step:445/1530 train_loss:3.8390 train_time:72123ms step_avg:165.80ms
step:446/1530 train_loss:3.8258 train_time:72291ms step_avg:165.80ms
step:447/1530 train_loss:3.7458 train_time:72458ms step_avg:165.81ms
step:448/1530 train_loss:3.8410 train_time:72625ms step_avg:165.81ms
step:449/1530 train_loss:3.6824 train_time:72792ms step_avg:165.81ms
step:450/1530 train_loss:3.7133 train_time:72960ms step_avg:165.82ms
step:451/1530 train_loss:3.5783 train_time:73127ms step_avg:165.82ms
step:452/1530 train_loss:3.6969 train_time:73295ms step_avg:165.82ms
step:453/1530 train_loss:3.6656 train_time:73462ms step_avg:165.83ms
step:454/1530 train_loss:3.6358 train_time:73629ms step_avg:165.83ms
step:455/1530 train_loss:3.8336 train_time:73798ms step_avg:165.84ms
step:456/1530 train_loss:3.7209 train_time:73967ms step_avg:165.85ms
step:457/1530 train_loss:3.7760 train_time:74140ms step_avg:165.86ms
step:458/1530 train_loss:3.8195 train_time:74310ms step_avg:165.87ms
step:459/1530 train_loss:3.6311 train_time:74481ms step_avg:165.88ms
step:460/1530 train_loss:3.7884 train_time:74650ms step_avg:165.89ms
step:461/1530 train_loss:3.6884 train_time:74821ms step_avg:165.90ms
step:462/1530 train_loss:3.7246 train_time:74991ms step_avg:165.91ms
step:463/1530 train_loss:3.7648 train_time:75163ms step_avg:165.92ms
step:464/1530 train_loss:3.7094 train_time:75330ms step_avg:165.93ms
step:465/1530 train_loss:3.7130 train_time:75501ms step_avg:165.94ms
step:466/1530 train_loss:3.7865 train_time:75671ms step_avg:165.94ms
step:467/1530 train_loss:3.8164 train_time:75842ms step_avg:165.96ms
step:468/1530 train_loss:3.7859 train_time:76012ms step_avg:165.96ms
step:469/1530 train_loss:3.6849 train_time:76182ms step_avg:165.97ms
step:470/1530 train_loss:3.7587 train_time:76351ms step_avg:165.98ms
step:471/1530 train_loss:3.8046 train_time:76521ms step_avg:165.99ms
step:472/1530 train_loss:3.7787 train_time:76691ms step_avg:166.00ms
step:473/1530 train_loss:3.7111 train_time:76861ms step_avg:166.01ms
step:474/1530 train_loss:3.5919 train_time:77031ms step_avg:166.01ms
step:475/1530 train_loss:4.0137 train_time:77200ms step_avg:166.02ms
step:476/1530 train_loss:3.7548 train_time:77373ms step_avg:166.04ms
step:477/1530 train_loss:3.5897 train_time:77544ms step_avg:166.05ms
step:478/1530 train_loss:3.8191 train_time:77711ms step_avg:166.05ms
step:479/1530 train_loss:3.7620 train_time:77882ms step_avg:166.06ms
step:480/1530 train_loss:3.9150 train_time:78051ms step_avg:166.07ms
step:481/1530 train_loss:3.7156 train_time:78222ms step_avg:166.08ms
step:482/1530 train_loss:3.5270 train_time:78390ms step_avg:166.08ms
step:483/1530 train_loss:3.8016 train_time:78560ms step_avg:166.09ms
step:484/1530 train_loss:3.6519 train_time:78730ms step_avg:166.10ms
step:485/1530 train_loss:3.6477 train_time:78900ms step_avg:166.10ms
step:486/1530 train_loss:3.5651 train_time:79070ms step_avg:166.11ms
step:487/1530 train_loss:3.6780 train_time:79240ms step_avg:166.12ms
step:488/1530 train_loss:3.8730 train_time:79409ms step_avg:166.13ms
step:489/1530 train_loss:3.7077 train_time:79579ms step_avg:166.14ms
step:490/1530 train_loss:3.5881 train_time:79748ms step_avg:166.14ms
step:491/1530 train_loss:3.6073 train_time:79918ms step_avg:166.15ms
step:492/1530 train_loss:3.7278 train_time:80086ms step_avg:166.15ms
step:493/1530 train_loss:3.5689 train_time:80258ms step_avg:166.17ms
step:494/1530 train_loss:3.6970 train_time:80427ms step_avg:166.17ms
step:495/1530 train_loss:3.6559 train_time:80599ms step_avg:166.18ms
step:496/1530 train_loss:3.5030 train_time:80770ms step_avg:166.19ms
step:497/1530 train_loss:3.7252 train_time:80939ms step_avg:166.20ms
step:498/1530 train_loss:3.7760 train_time:81108ms step_avg:166.20ms
step:499/1530 train_loss:3.8133 train_time:81278ms step_avg:166.21ms
step:500/1530 train_loss:3.7251 train_time:81449ms step_avg:166.22ms
step:500/1530 val_loss:3.6983 train_time:81498ms step_avg:166.32ms
step:501/1530 train_loss:3.7970 train_time:81622ms step_avg:166.24ms
step:502/1530 train_loss:3.7408 train_time:81794ms step_avg:166.25ms
step:503/1530 train_loss:3.7694 train_time:81964ms step_avg:166.25ms
step:504/1530 train_loss:3.7166 train_time:82132ms step_avg:166.26ms
step:505/1530 train_loss:3.8034 train_time:82302ms step_avg:166.27ms
step:506/1530 train_loss:3.6412 train_time:82472ms step_avg:166.27ms
step:507/1530 train_loss:3.7577 train_time:82641ms step_avg:166.28ms
step:508/1530 train_loss:3.8187 train_time:82813ms step_avg:166.29ms
step:509/1530 train_loss:3.7726 train_time:82982ms step_avg:166.30ms
step:510/1530 train_loss:3.5721 train_time:83152ms step_avg:166.30ms
step:511/1530 train_loss:3.7636 train_time:83321ms step_avg:166.31ms
step:512/1530 train_loss:3.7130 train_time:83494ms step_avg:166.32ms
step:513/1530 train_loss:3.6563 train_time:83662ms step_avg:166.33ms
step:514/1530 train_loss:3.8037 train_time:83831ms step_avg:166.33ms
step:515/1530 train_loss:3.7312 train_time:84000ms step_avg:166.34ms
step:516/1530 train_loss:4.0663 train_time:84169ms step_avg:166.34ms
step:517/1530 train_loss:3.6883 train_time:84339ms step_avg:166.35ms
step:518/1530 train_loss:3.7620 train_time:84507ms step_avg:166.35ms
step:519/1530 train_loss:3.6526 train_time:84677ms step_avg:166.36ms
step:520/1530 train_loss:3.6789 train_time:84846ms step_avg:166.37ms
step:521/1530 train_loss:3.6552 train_time:85016ms step_avg:166.37ms
step:522/1530 train_loss:3.6502 train_time:85185ms step_avg:166.38ms
step:523/1530 train_loss:4.2823 train_time:85354ms step_avg:166.38ms
step:524/1530 train_loss:3.7359 train_time:85522ms step_avg:166.39ms
step:525/1530 train_loss:3.6784 train_time:85691ms step_avg:166.39ms
step:526/1530 train_loss:3.6940 train_time:85860ms step_avg:166.40ms
step:527/1530 train_loss:3.6567 train_time:86029ms step_avg:166.40ms
step:528/1530 train_loss:3.6249 train_time:86197ms step_avg:166.40ms
step:529/1530 train_loss:3.8451 train_time:86366ms step_avg:166.41ms
step:530/1530 train_loss:3.6469 train_time:86536ms step_avg:166.42ms
step:531/1530 train_loss:3.9156 train_time:86706ms step_avg:166.42ms
step:532/1530 train_loss:3.7268 train_time:86875ms step_avg:166.43ms
step:533/1530 train_loss:3.6482 train_time:87045ms step_avg:166.43ms
step:534/1530 train_loss:3.6636 train_time:87214ms step_avg:166.44ms
step:535/1530 train_loss:3.5993 train_time:87382ms step_avg:166.44ms
step:536/1530 train_loss:3.7433 train_time:87552ms step_avg:166.45ms
step:537/1530 train_loss:3.7211 train_time:87721ms step_avg:166.45ms
step:538/1530 train_loss:3.6200 train_time:87890ms step_avg:166.46ms
step:539/1530 train_loss:4.1086 train_time:88062ms step_avg:166.47ms
step:540/1530 train_loss:3.6712 train_time:88231ms step_avg:166.47ms
step:541/1530 train_loss:3.7769 train_time:88399ms step_avg:166.48ms
step:542/1530 train_loss:3.5777 train_time:88568ms step_avg:166.48ms
step:543/1530 train_loss:3.5835 train_time:88738ms step_avg:166.49ms
step:544/1530 train_loss:3.6364 train_time:88906ms step_avg:166.49ms
step:545/1530 train_loss:3.5877 train_time:89077ms step_avg:166.50ms
step:546/1530 train_loss:3.6182 train_time:89246ms step_avg:166.50ms
step:547/1530 train_loss:3.6363 train_time:89414ms step_avg:166.51ms
step:548/1530 train_loss:3.6036 train_time:89584ms step_avg:166.51ms
step:549/1530 train_loss:3.7162 train_time:89751ms step_avg:166.51ms
step:550/1530 train_loss:3.6091 train_time:89921ms step_avg:166.52ms
step:551/1530 train_loss:3.6279 train_time:90088ms step_avg:166.52ms
step:552/1530 train_loss:3.9293 train_time:90259ms step_avg:166.53ms
step:553/1530 train_loss:3.7488 train_time:90428ms step_avg:166.53ms
step:554/1530 train_loss:3.7123 train_time:90596ms step_avg:166.54ms
step:555/1530 train_loss:3.6236 train_time:90765ms step_avg:166.54ms
step:556/1530 train_loss:3.6959 train_time:90933ms step_avg:166.54ms
step:557/1530 train_loss:3.3115 train_time:91104ms step_avg:166.55ms
step:558/1530 train_loss:3.6128 train_time:91273ms step_avg:166.56ms
step:559/1530 train_loss:3.6426 train_time:91442ms step_avg:166.56ms
step:560/1530 train_loss:3.6849 train_time:91612ms step_avg:166.57ms
step:561/1530 train_loss:3.6104 train_time:91781ms step_avg:166.57ms
step:562/1530 train_loss:3.5493 train_time:91950ms step_avg:166.58ms
step:563/1530 train_loss:3.7529 train_time:92119ms step_avg:166.58ms
step:564/1530 train_loss:3.5676 train_time:92288ms step_avg:166.58ms
step:565/1530 train_loss:3.6763 train_time:92458ms step_avg:166.59ms
step:566/1530 train_loss:3.6155 train_time:92759ms step_avg:166.83ms
step:567/1530 train_loss:3.6001 train_time:92938ms step_avg:166.86ms
step:568/1530 train_loss:3.6838 train_time:93108ms step_avg:166.86ms
step:569/1530 train_loss:3.6436 train_time:93430ms step_avg:167.14ms
step:570/1530 train_loss:3.6775 train_time:93601ms step_avg:167.15ms
step:571/1530 train_loss:3.7558 train_time:93771ms step_avg:167.15ms
step:572/1530 train_loss:3.7214 train_time:93942ms step_avg:167.16ms
step:573/1530 train_loss:3.7316 train_time:94115ms step_avg:167.17ms
step:574/1530 train_loss:3.7785 train_time:94287ms step_avg:167.18ms
step:575/1530 train_loss:3.7236 train_time:94460ms step_avg:167.19ms
step:576/1530 train_loss:3.7567 train_time:94630ms step_avg:167.19ms
step:577/1530 train_loss:3.6612 train_time:94803ms step_avg:167.20ms
step:578/1530 train_loss:3.6668 train_time:94976ms step_avg:167.21ms
step:579/1530 train_loss:3.6651 train_time:95147ms step_avg:167.22ms
step:580/1530 train_loss:3.5808 train_time:95318ms step_avg:167.22ms
step:581/1530 train_loss:3.6304 train_time:95489ms step_avg:167.23ms
step:582/1530 train_loss:3.8357 train_time:95661ms step_avg:167.24ms
step:583/1530 train_loss:3.6192 train_time:95832ms step_avg:167.25ms
step:584/1530 train_loss:3.5828 train_time:96003ms step_avg:167.25ms
step:585/1530 train_loss:3.7833 train_time:96174ms step_avg:167.26ms
step:586/1530 train_loss:3.5144 train_time:96346ms step_avg:167.27ms
step:587/1530 train_loss:3.6634 train_time:96518ms step_avg:167.28ms
step:588/1530 train_loss:3.6407 train_time:96687ms step_avg:167.28ms
step:589/1530 train_loss:3.9878 train_time:96860ms step_avg:167.29ms
step:590/1530 train_loss:3.7718 train_time:97029ms step_avg:167.29ms
step:591/1530 train_loss:3.4987 train_time:97202ms step_avg:167.30ms
step:592/1530 train_loss:3.5287 train_time:97374ms step_avg:167.31ms
step:593/1530 train_loss:3.4957 train_time:97546ms step_avg:167.32ms
step:594/1530 train_loss:3.5413 train_time:97718ms step_avg:167.33ms
step:595/1530 train_loss:3.9165 train_time:97890ms step_avg:167.33ms
step:596/1530 train_loss:3.6467 train_time:98064ms step_avg:167.34ms
step:597/1530 train_loss:3.5778 train_time:98234ms step_avg:167.35ms
step:598/1530 train_loss:3.6510 train_time:98404ms step_avg:167.35ms
step:599/1530 train_loss:3.4696 train_time:98577ms step_avg:167.36ms
step:600/1530 train_loss:3.5913 train_time:98747ms step_avg:167.37ms
step:601/1530 train_loss:3.6456 train_time:98920ms step_avg:167.38ms
step:602/1530 train_loss:3.6680 train_time:99093ms step_avg:167.39ms
step:603/1530 train_loss:3.7749 train_time:99264ms step_avg:167.39ms
step:604/1530 train_loss:3.6057 train_time:99436ms step_avg:167.40ms
step:605/1530 train_loss:3.6064 train_time:99606ms step_avg:167.41ms
step:606/1530 train_loss:3.5663 train_time:99781ms step_avg:167.42ms
step:607/1530 train_loss:3.8307 train_time:99953ms step_avg:167.43ms
step:608/1530 train_loss:3.6296 train_time:100124ms step_avg:167.43ms
step:609/1530 train_loss:3.6130 train_time:100294ms step_avg:167.44ms
step:610/1530 train_loss:3.6953 train_time:100464ms step_avg:167.44ms
step:611/1530 train_loss:3.5934 train_time:100636ms step_avg:167.45ms
step:612/1530 train_loss:3.5604 train_time:100808ms step_avg:167.45ms
step:613/1530 train_loss:3.7554 train_time:100980ms step_avg:167.46ms
step:614/1530 train_loss:3.6937 train_time:101151ms step_avg:167.47ms
step:615/1530 train_loss:3.6866 train_time:101321ms step_avg:167.47ms
step:616/1530 train_loss:3.6263 train_time:101491ms step_avg:167.48ms
step:617/1530 train_loss:3.5516 train_time:101664ms step_avg:167.49ms
step:618/1530 train_loss:3.6845 train_time:101834ms step_avg:167.49ms
step:619/1530 train_loss:3.5397 train_time:102005ms step_avg:167.50ms
step:620/1530 train_loss:3.5862 train_time:102176ms step_avg:167.50ms
step:621/1530 train_loss:3.9190 train_time:102348ms step_avg:167.51ms
step:622/1530 train_loss:3.5669 train_time:102520ms step_avg:167.52ms
step:623/1530 train_loss:3.6018 train_time:102692ms step_avg:167.52ms
step:624/1530 train_loss:3.6855 train_time:102863ms step_avg:167.53ms
step:625/1530 train_loss:3.6904 train_time:103033ms step_avg:167.53ms
step:625/1530 val_loss:3.6157 train_time:103082ms step_avg:167.61ms
step:626/1530 train_loss:3.7276 train_time:103204ms step_avg:167.54ms
step:627/1530 train_loss:3.7105 train_time:103376ms step_avg:167.55ms
step:628/1530 train_loss:3.7522 train_time:103545ms step_avg:167.55ms
step:629/1530 train_loss:3.5883 train_time:103718ms step_avg:167.56ms
step:630/1530 train_loss:3.7202 train_time:103888ms step_avg:167.56ms
step:631/1530 train_loss:3.7360 train_time:104058ms step_avg:167.57ms
step:632/1530 train_loss:3.6363 train_time:104231ms step_avg:167.57ms
step:633/1530 train_loss:3.5960 train_time:104403ms step_avg:167.58ms
step:634/1530 train_loss:3.6953 train_time:104574ms step_avg:167.59ms
step:635/1530 train_loss:3.9426 train_time:104743ms step_avg:167.59ms
step:636/1530 train_loss:3.5443 train_time:104915ms step_avg:167.60ms
step:637/1530 train_loss:3.3534 train_time:105088ms step_avg:167.60ms
step:638/1530 train_loss:3.5823 train_time:105257ms step_avg:167.61ms
step:639/1530 train_loss:3.6260 train_time:105426ms step_avg:167.61ms
step:640/1530 train_loss:3.5586 train_time:105598ms step_avg:167.62ms
step:641/1530 train_loss:3.5777 train_time:105769ms step_avg:167.62ms
step:642/1530 train_loss:3.6249 train_time:105938ms step_avg:167.62ms
step:643/1530 train_loss:3.5838 train_time:106109ms step_avg:167.63ms
step:644/1530 train_loss:3.5561 train_time:106279ms step_avg:167.63ms
step:645/1530 train_loss:3.7751 train_time:106451ms step_avg:167.64ms
step:646/1530 train_loss:3.6621 train_time:106621ms step_avg:167.64ms
step:647/1530 train_loss:3.6561 train_time:106792ms step_avg:167.65ms
step:648/1530 train_loss:3.7048 train_time:106961ms step_avg:167.65ms
step:649/1530 train_loss:3.7607 train_time:107132ms step_avg:167.66ms
step:650/1530 train_loss:3.6144 train_time:107302ms step_avg:167.66ms
step:651/1530 train_loss:3.7589 train_time:107474ms step_avg:167.67ms
step:652/1530 train_loss:3.5803 train_time:107645ms step_avg:167.67ms
step:653/1530 train_loss:3.6519 train_time:107815ms step_avg:167.67ms
step:654/1530 train_loss:3.4223 train_time:107986ms step_avg:167.68ms
step:655/1530 train_loss:3.5719 train_time:108155ms step_avg:167.68ms
step:656/1530 train_loss:3.5708 train_time:108326ms step_avg:167.69ms
step:657/1530 train_loss:3.4963 train_time:108498ms step_avg:167.69ms
step:658/1530 train_loss:3.6828 train_time:108668ms step_avg:167.70ms
step:659/1530 train_loss:3.5802 train_time:108838ms step_avg:167.70ms
step:660/1530 train_loss:3.6724 train_time:109009ms step_avg:167.71ms
step:661/1530 train_loss:3.7409 train_time:109180ms step_avg:167.71ms
step:662/1530 train_loss:3.6645 train_time:109350ms step_avg:167.72ms
step:663/1530 train_loss:3.5502 train_time:109520ms step_avg:167.72ms
step:664/1530 train_loss:3.6101 train_time:109693ms step_avg:167.73ms
step:665/1530 train_loss:3.4846 train_time:109863ms step_avg:167.73ms
step:666/1530 train_loss:3.7754 train_time:110033ms step_avg:167.73ms
step:667/1530 train_loss:3.5961 train_time:110205ms step_avg:167.74ms
step:668/1530 train_loss:3.6428 train_time:110375ms step_avg:167.74ms
step:669/1530 train_loss:3.4809 train_time:110548ms step_avg:167.75ms
step:670/1530 train_loss:3.5912 train_time:110718ms step_avg:167.75ms
step:671/1530 train_loss:3.5562 train_time:110887ms step_avg:167.76ms
step:672/1530 train_loss:3.5633 train_time:111058ms step_avg:167.76ms
step:673/1530 train_loss:3.8444 train_time:111229ms step_avg:167.77ms
step:674/1530 train_loss:3.6155 train_time:111400ms step_avg:167.77ms
step:675/1530 train_loss:3.7010 train_time:111572ms step_avg:167.78ms
step:676/1530 train_loss:3.4844 train_time:111743ms step_avg:167.78ms
step:677/1530 train_loss:3.5908 train_time:111914ms step_avg:167.79ms
step:678/1530 train_loss:3.5464 train_time:112086ms step_avg:167.79ms
step:679/1530 train_loss:3.6695 train_time:112259ms step_avg:167.80ms
step:680/1530 train_loss:3.5793 train_time:112428ms step_avg:167.80ms
step:681/1530 train_loss:3.6101 train_time:112600ms step_avg:167.81ms
step:682/1530 train_loss:3.6577 train_time:112777ms step_avg:167.82ms
step:683/1530 train_loss:3.7283 train_time:112952ms step_avg:167.83ms
step:684/1530 train_loss:3.6383 train_time:113122ms step_avg:167.84ms
step:685/1530 train_loss:3.6795 train_time:113298ms step_avg:167.85ms
step:686/1530 train_loss:3.6267 train_time:113471ms step_avg:167.86ms
step:687/1530 train_loss:3.6611 train_time:113643ms step_avg:167.86ms
step:688/1530 train_loss:3.1892 train_time:113819ms step_avg:167.88ms
step:689/1530 train_loss:3.4019 train_time:113993ms step_avg:167.88ms
step:690/1530 train_loss:3.5370 train_time:114168ms step_avg:167.89ms
step:691/1530 train_loss:3.4021 train_time:114340ms step_avg:167.90ms
step:692/1530 train_loss:3.6187 train_time:114514ms step_avg:167.91ms
step:693/1530 train_loss:3.6423 train_time:114687ms step_avg:167.92ms
step:694/1530 train_loss:3.5481 train_time:114859ms step_avg:167.92ms
step:695/1530 train_loss:3.5317 train_time:115031ms step_avg:167.93ms
step:696/1530 train_loss:3.8437 train_time:115203ms step_avg:167.93ms
step:697/1530 train_loss:3.5783 train_time:115375ms step_avg:167.94ms
step:698/1530 train_loss:3.6418 train_time:115548ms step_avg:167.95ms
step:699/1530 train_loss:3.7611 train_time:115721ms step_avg:167.96ms
step:700/1530 train_loss:3.5583 train_time:115894ms step_avg:167.96ms
step:701/1530 train_loss:3.5366 train_time:116065ms step_avg:167.97ms
step:702/1530 train_loss:3.5029 train_time:116238ms step_avg:167.97ms
step:703/1530 train_loss:3.4893 train_time:116411ms step_avg:167.98ms
step:704/1530 train_loss:3.5657 train_time:116583ms step_avg:167.99ms
step:705/1530 train_loss:3.5504 train_time:116759ms step_avg:168.00ms
step:706/1530 train_loss:3.5763 train_time:116936ms step_avg:168.01ms
step:707/1530 train_loss:3.6411 train_time:117111ms step_avg:168.02ms
step:708/1530 train_loss:3.6006 train_time:117283ms step_avg:168.03ms
step:709/1530 train_loss:3.5783 train_time:117456ms step_avg:168.03ms
step:710/1530 train_loss:3.5351 train_time:117628ms step_avg:168.04ms
step:711/1530 train_loss:3.5866 train_time:117802ms step_avg:168.05ms
step:712/1530 train_loss:3.6396 train_time:117978ms step_avg:168.06ms
step:713/1530 train_loss:3.6440 train_time:118155ms step_avg:168.07ms
step:714/1530 train_loss:3.5511 train_time:118327ms step_avg:168.08ms
step:715/1530 train_loss:3.5667 train_time:118500ms step_avg:168.08ms
step:716/1530 train_loss:3.5796 train_time:118672ms step_avg:168.09ms
step:717/1530 train_loss:3.6988 train_time:118845ms step_avg:168.10ms
step:718/1530 train_loss:3.5861 train_time:119016ms step_avg:168.10ms
step:719/1530 train_loss:3.6726 train_time:119191ms step_avg:168.11ms
step:720/1530 train_loss:3.8471 train_time:119364ms step_avg:168.12ms
step:721/1530 train_loss:3.4638 train_time:119537ms step_avg:168.12ms
step:722/1530 train_loss:3.7337 train_time:119709ms step_avg:168.13ms
step:723/1530 train_loss:3.7627 train_time:119881ms step_avg:168.14ms
step:724/1530 train_loss:3.5641 train_time:120054ms step_avg:168.14ms
step:725/1530 train_loss:3.6457 train_time:120227ms step_avg:168.15ms
step:726/1530 train_loss:3.5244 train_time:120401ms step_avg:168.16ms
step:727/1530 train_loss:3.5735 train_time:120578ms step_avg:168.17ms
step:728/1530 train_loss:3.7195 train_time:120751ms step_avg:168.18ms
step:729/1530 train_loss:3.6641 train_time:120923ms step_avg:168.18ms
step:730/1530 train_loss:3.6497 train_time:121097ms step_avg:168.19ms
step:731/1530 train_loss:3.5455 train_time:121270ms step_avg:168.20ms
step:732/1530 train_loss:3.5845 train_time:121441ms step_avg:168.20ms
step:733/1530 train_loss:3.8213 train_time:121616ms step_avg:168.21ms
step:734/1530 train_loss:3.5565 train_time:121791ms step_avg:168.22ms
step:735/1530 train_loss:3.6131 train_time:121964ms step_avg:168.23ms
step:736/1530 train_loss:3.7241 train_time:122138ms step_avg:168.23ms
step:737/1530 train_loss:3.6721 train_time:122311ms step_avg:168.24ms
step:738/1530 train_loss:3.5976 train_time:122481ms step_avg:168.24ms
step:739/1530 train_loss:3.4999 train_time:122651ms step_avg:168.24ms
step:740/1530 train_loss:4.1081 train_time:122829ms step_avg:168.26ms
step:741/1530 train_loss:3.4890 train_time:123002ms step_avg:168.26ms
step:742/1530 train_loss:3.5521 train_time:123175ms step_avg:168.27ms
step:743/1530 train_loss:3.5745 train_time:123348ms step_avg:168.28ms
step:744/1530 train_loss:3.6392 train_time:123522ms step_avg:168.29ms
step:745/1530 train_loss:3.5798 train_time:123697ms step_avg:168.30ms
step:746/1530 train_loss:3.5892 train_time:123869ms step_avg:168.30ms
step:747/1530 train_loss:3.6433 train_time:124042ms step_avg:168.31ms
step:748/1530 train_loss:3.5609 train_time:124219ms step_avg:168.32ms
step:749/1530 train_loss:3.5590 train_time:124393ms step_avg:168.33ms
step:750/1530 train_loss:3.5910 train_time:124564ms step_avg:168.33ms
step:750/1530 val_loss:3.5590 train_time:124613ms step_avg:168.40ms
step:751/1530 train_loss:3.5645 train_time:124736ms step_avg:168.34ms
step:752/1530 train_loss:3.6078 train_time:124910ms step_avg:168.34ms
step:753/1530 train_loss:3.6087 train_time:125084ms step_avg:168.35ms
step:754/1530 train_loss:3.5830 train_time:125257ms step_avg:168.36ms
step:755/1530 train_loss:3.6753 train_time:125562ms step_avg:168.54ms
step:756/1530 train_loss:3.4503 train_time:125745ms step_avg:168.56ms
step:757/1530 train_loss:3.7156 train_time:125918ms step_avg:168.56ms
step:758/1530 train_loss:3.6416 train_time:126089ms step_avg:168.57ms
step:759/1530 train_loss:3.5817 train_time:126412ms step_avg:168.77ms
step:760/1530 train_loss:3.6962 train_time:126583ms step_avg:168.78ms
step:761/1530 train_loss:3.3988 train_time:126754ms step_avg:168.78ms
step:762/1530 train_loss:3.5399 train_time:126928ms step_avg:168.79ms
step:763/1530 train_loss:3.6487 train_time:127100ms step_avg:168.79ms
step:764/1530 train_loss:3.3132 train_time:127272ms step_avg:168.80ms
step:765/1530 train_loss:3.7287 train_time:127445ms step_avg:168.80ms
step:766/1530 train_loss:3.5591 train_time:127618ms step_avg:168.81ms
step:767/1530 train_loss:3.5615 train_time:127790ms step_avg:168.81ms
step:768/1530 train_loss:3.5648 train_time:127962ms step_avg:168.82ms
step:769/1530 train_loss:3.5801 train_time:128134ms step_avg:168.82ms
step:770/1530 train_loss:3.6362 train_time:128309ms step_avg:168.83ms
step:771/1530 train_loss:3.8804 train_time:128481ms step_avg:168.83ms
step:772/1530 train_loss:3.4478 train_time:128653ms step_avg:168.84ms
step:773/1530 train_loss:3.6261 train_time:128825ms step_avg:168.84ms
step:774/1530 train_loss:3.6339 train_time:128996ms step_avg:168.84ms
step:775/1530 train_loss:3.6007 train_time:129168ms step_avg:168.85ms
step:776/1530 train_loss:3.3889 train_time:129340ms step_avg:168.85ms
step:777/1530 train_loss:3.3879 train_time:129514ms step_avg:168.86ms
step:778/1530 train_loss:3.4834 train_time:129685ms step_avg:168.86ms
step:779/1530 train_loss:3.5769 train_time:129857ms step_avg:168.87ms
step:780/1530 train_loss:3.5807 train_time:130030ms step_avg:168.87ms
step:781/1530 train_loss:3.6648 train_time:130202ms step_avg:168.87ms
step:782/1530 train_loss:3.5798 train_time:130375ms step_avg:168.88ms
step:783/1530 train_loss:3.5629 train_time:130548ms step_avg:168.88ms
step:784/1530 train_loss:3.5993 train_time:130719ms step_avg:168.89ms
step:785/1530 train_loss:3.5538 train_time:130890ms step_avg:168.89ms
step:786/1530 train_loss:3.4322 train_time:131063ms step_avg:168.90ms
step:787/1530 train_loss:3.7328 train_time:131236ms step_avg:168.90ms
step:788/1530 train_loss:3.4950 train_time:131411ms step_avg:168.91ms
step:789/1530 train_loss:3.5413 train_time:131583ms step_avg:168.91ms
step:790/1530 train_loss:3.6232 train_time:131756ms step_avg:168.92ms
step:791/1530 train_loss:3.7652 train_time:131933ms step_avg:168.93ms
step:792/1530 train_loss:3.7491 train_time:132107ms step_avg:168.93ms
step:793/1530 train_loss:3.4385 train_time:132278ms step_avg:168.94ms
step:794/1530 train_loss:3.5911 train_time:132451ms step_avg:168.94ms
step:795/1530 train_loss:3.6641 train_time:132625ms step_avg:168.95ms
step:796/1530 train_loss:3.7513 train_time:132801ms step_avg:168.96ms
step:797/1530 train_loss:3.5189 train_time:132975ms step_avg:168.96ms
step:798/1530 train_loss:3.6409 train_time:133150ms step_avg:168.97ms
step:799/1530 train_loss:3.5305 train_time:133326ms step_avg:168.98ms
step:800/1530 train_loss:3.5149 train_time:133500ms step_avg:168.99ms
step:801/1530 train_loss:3.6241 train_time:133674ms step_avg:168.99ms
step:802/1530 train_loss:3.4906 train_time:133851ms step_avg:169.00ms
step:803/1530 train_loss:3.4831 train_time:134024ms step_avg:169.01ms
step:804/1530 train_loss:3.6192 train_time:134198ms step_avg:169.02ms
step:805/1530 train_loss:3.5110 train_time:134375ms step_avg:169.02ms
step:806/1530 train_loss:3.5553 train_time:134549ms step_avg:169.03ms
step:807/1530 train_loss:3.6321 train_time:134722ms step_avg:169.04ms
step:808/1530 train_loss:3.5327 train_time:134898ms step_avg:169.05ms
step:809/1530 train_loss:3.4844 train_time:135071ms step_avg:169.05ms
step:810/1530 train_loss:3.5519 train_time:135243ms step_avg:169.05ms
step:811/1530 train_loss:3.5735 train_time:135416ms step_avg:169.06ms
step:812/1530 train_loss:3.5937 train_time:135591ms step_avg:169.07ms
step:813/1530 train_loss:3.6187 train_time:135761ms step_avg:169.07ms
step:814/1530 train_loss:3.5621 train_time:135936ms step_avg:169.07ms
step:815/1530 train_loss:3.5567 train_time:136111ms step_avg:169.08ms
step:816/1530 train_loss:3.6779 train_time:136287ms step_avg:169.09ms
step:817/1530 train_loss:3.7601 train_time:136459ms step_avg:169.09ms
step:818/1530 train_loss:3.5200 train_time:136630ms step_avg:169.10ms
step:819/1530 train_loss:3.7168 train_time:136806ms step_avg:169.10ms
step:820/1530 train_loss:3.4869 train_time:136980ms step_avg:169.11ms
step:821/1530 train_loss:3.5557 train_time:137153ms step_avg:169.12ms
step:822/1530 train_loss:3.6915 train_time:137329ms step_avg:169.12ms
step:823/1530 train_loss:3.5706 train_time:137503ms step_avg:169.13ms
step:824/1530 train_loss:3.5098 train_time:137676ms step_avg:169.14ms
step:825/1530 train_loss:3.6118 train_time:137852ms step_avg:169.14ms
step:826/1530 train_loss:3.4761 train_time:138027ms step_avg:169.15ms
step:827/1530 train_loss:3.7291 train_time:138202ms step_avg:169.16ms
step:828/1530 train_loss:3.6131 train_time:138376ms step_avg:169.16ms
step:829/1530 train_loss:3.6225 train_time:138552ms step_avg:169.17ms
step:830/1530 train_loss:3.5312 train_time:138727ms step_avg:169.18ms
step:831/1530 train_loss:3.5904 train_time:138900ms step_avg:169.18ms
step:832/1530 train_loss:3.5076 train_time:139075ms step_avg:169.19ms
step:833/1530 train_loss:3.6382 train_time:139251ms step_avg:169.20ms
step:834/1530 train_loss:3.4710 train_time:139425ms step_avg:169.20ms
step:835/1530 train_loss:3.4475 train_time:139599ms step_avg:169.21ms
step:836/1530 train_loss:3.7090 train_time:139775ms step_avg:169.22ms
step:837/1530 train_loss:3.3903 train_time:139950ms step_avg:169.23ms
step:838/1530 train_loss:3.5830 train_time:140122ms step_avg:169.23ms
step:839/1530 train_loss:3.4129 train_time:140298ms step_avg:169.24ms
step:840/1530 train_loss:3.4624 train_time:140471ms step_avg:169.24ms
step:841/1530 train_loss:3.5679 train_time:140645ms step_avg:169.25ms
step:842/1530 train_loss:3.5779 train_time:140819ms step_avg:169.25ms
step:843/1530 train_loss:3.5564 train_time:140992ms step_avg:169.26ms
step:844/1530 train_loss:3.4225 train_time:141164ms step_avg:169.26ms
step:845/1530 train_loss:3.6566 train_time:141337ms step_avg:169.27ms
step:846/1530 train_loss:3.5121 train_time:141513ms step_avg:169.27ms
step:847/1530 train_loss:3.4844 train_time:141688ms step_avg:169.28ms
step:848/1530 train_loss:3.6352 train_time:141860ms step_avg:169.28ms
step:849/1530 train_loss:3.4818 train_time:142035ms step_avg:169.29ms
step:850/1530 train_loss:3.4351 train_time:142210ms step_avg:169.30ms
step:851/1530 train_loss:3.7311 train_time:142383ms step_avg:169.30ms
step:852/1530 train_loss:3.4333 train_time:142555ms step_avg:169.31ms
step:853/1530 train_loss:3.5570 train_time:142729ms step_avg:169.31ms
step:854/1530 train_loss:3.6483 train_time:142904ms step_avg:169.32ms
step:855/1530 train_loss:3.5103 train_time:143077ms step_avg:169.32ms
step:856/1530 train_loss:3.5378 train_time:143252ms step_avg:169.33ms
step:857/1530 train_loss:3.5996 train_time:143427ms step_avg:169.34ms
step:858/1530 train_loss:3.4601 train_time:143601ms step_avg:169.34ms
step:859/1530 train_loss:3.5562 train_time:143775ms step_avg:169.35ms
step:860/1530 train_loss:3.5772 train_time:143947ms step_avg:169.35ms
step:861/1530 train_loss:3.6229 train_time:144124ms step_avg:169.36ms
step:862/1530 train_loss:3.5976 train_time:144301ms step_avg:169.37ms
step:863/1530 train_loss:3.5648 train_time:144477ms step_avg:169.37ms
step:864/1530 train_loss:3.3739 train_time:144651ms step_avg:169.38ms
step:865/1530 train_loss:3.5969 train_time:144823ms step_avg:169.38ms
step:866/1530 train_loss:3.8659 train_time:144999ms step_avg:169.39ms
step:867/1530 train_loss:3.4498 train_time:145172ms step_avg:169.40ms
step:868/1530 train_loss:3.6352 train_time:145343ms step_avg:169.40ms
step:869/1530 train_loss:3.6100 train_time:145516ms step_avg:169.40ms
step:870/1530 train_loss:3.4431 train_time:145692ms step_avg:169.41ms
step:871/1530 train_loss:3.3892 train_time:145866ms step_avg:169.41ms
step:872/1530 train_loss:3.6412 train_time:146042ms step_avg:169.42ms
step:873/1530 train_loss:3.4527 train_time:146215ms step_avg:169.43ms
step:874/1530 train_loss:3.2175 train_time:146395ms step_avg:169.44ms
step:875/1530 train_loss:3.6233 train_time:146569ms step_avg:169.44ms
step:875/1530 val_loss:3.5126 train_time:146618ms step_avg:169.50ms
step:876/1530 train_loss:3.4329 train_time:146743ms step_avg:169.45ms
step:877/1530 train_loss:3.6133 train_time:146919ms step_avg:169.46ms
step:878/1530 train_loss:3.4606 train_time:147092ms step_avg:169.46ms
step:879/1530 train_loss:3.6406 train_time:147266ms step_avg:169.47ms
step:880/1530 train_loss:3.3031 train_time:147438ms step_avg:169.47ms
step:881/1530 train_loss:3.4734 train_time:147610ms step_avg:169.47ms
step:882/1530 train_loss:3.6885 train_time:147783ms step_avg:169.48ms
step:883/1530 train_loss:3.8305 train_time:147956ms step_avg:169.48ms
step:884/1530 train_loss:3.5602 train_time:148132ms step_avg:169.49ms
step:885/1530 train_loss:3.4886 train_time:148306ms step_avg:169.49ms
step:886/1530 train_loss:3.5675 train_time:148480ms step_avg:169.50ms
step:887/1530 train_loss:4.0781 train_time:148654ms step_avg:169.50ms
step:888/1530 train_loss:3.8342 train_time:148834ms step_avg:169.51ms
step:889/1530 train_loss:3.5146 train_time:149008ms step_avg:169.52ms
step:890/1530 train_loss:3.5293 train_time:149179ms step_avg:169.52ms
step:891/1530 train_loss:3.3532 train_time:149354ms step_avg:169.53ms
step:892/1530 train_loss:3.7150 train_time:149527ms step_avg:169.53ms
step:893/1530 train_loss:3.4128 train_time:149700ms step_avg:169.54ms
step:894/1530 train_loss:3.6331 train_time:149875ms step_avg:169.54ms
step:895/1530 train_loss:3.6720 train_time:150050ms step_avg:169.55ms
step:896/1530 train_loss:3.4865 train_time:150225ms step_avg:169.55ms
step:897/1530 train_loss:3.5354 train_time:150399ms step_avg:169.56ms
step:898/1530 train_loss:3.5795 train_time:150575ms step_avg:169.57ms
step:899/1530 train_loss:3.4723 train_time:150748ms step_avg:169.57ms
step:900/1530 train_loss:3.4174 train_time:150922ms step_avg:169.58ms
step:901/1530 train_loss:3.6123 train_time:151095ms step_avg:169.58ms
step:902/1530 train_loss:3.6264 train_time:151268ms step_avg:169.58ms
step:903/1530 train_loss:3.5379 train_time:151443ms step_avg:169.59ms
step:904/1530 train_loss:3.4873 train_time:151616ms step_avg:169.59ms
step:905/1530 train_loss:3.4966 train_time:151787ms step_avg:169.59ms
step:906/1530 train_loss:3.7026 train_time:151961ms step_avg:169.60ms
step:907/1530 train_loss:3.5087 train_time:152135ms step_avg:169.60ms
step:908/1530 train_loss:3.5551 train_time:152308ms step_avg:169.61ms
step:909/1530 train_loss:3.4454 train_time:152483ms step_avg:169.61ms
step:910/1530 train_loss:3.5249 train_time:152663ms step_avg:169.63ms
step:911/1530 train_loss:3.6391 train_time:152839ms step_avg:169.63ms
step:912/1530 train_loss:3.5856 train_time:153017ms step_avg:169.64ms
step:913/1530 train_loss:3.4546 train_time:153195ms step_avg:169.65ms
step:914/1530 train_loss:3.7338 train_time:153374ms step_avg:169.66ms
step:915/1530 train_loss:3.5272 train_time:153554ms step_avg:169.67ms
step:916/1530 train_loss:3.6114 train_time:153731ms step_avg:169.68ms
step:917/1530 train_loss:3.5997 train_time:153906ms step_avg:169.69ms
step:918/1530 train_loss:4.8132 train_time:154085ms step_avg:169.70ms
step:919/1530 train_loss:3.4888 train_time:154264ms step_avg:169.71ms
step:920/1530 train_loss:3.5777 train_time:154438ms step_avg:169.71ms
step:921/1530 train_loss:3.5486 train_time:154616ms step_avg:169.72ms
step:922/1530 train_loss:3.5784 train_time:154794ms step_avg:169.73ms
step:923/1530 train_loss:3.6006 train_time:154970ms step_avg:169.74ms
step:924/1530 train_loss:3.6743 train_time:155146ms step_avg:169.74ms
step:925/1530 train_loss:3.6400 train_time:155322ms step_avg:169.75ms
step:926/1530 train_loss:3.5500 train_time:155494ms step_avg:169.75ms
step:927/1530 train_loss:3.5527 train_time:155670ms step_avg:169.76ms
step:928/1530 train_loss:3.7768 train_time:155848ms step_avg:169.77ms
step:929/1530 train_loss:3.6070 train_time:156023ms step_avg:169.77ms
step:930/1530 train_loss:3.3962 train_time:156198ms step_avg:169.78ms
step:931/1530 train_loss:3.4887 train_time:156373ms step_avg:169.79ms
step:932/1530 train_loss:3.6441 train_time:156551ms step_avg:169.79ms
step:933/1530 train_loss:3.3648 train_time:156727ms step_avg:169.80ms
step:934/1530 train_loss:3.5794 train_time:156904ms step_avg:169.81ms
step:935/1530 train_loss:3.4354 train_time:157082ms step_avg:169.82ms
step:936/1530 train_loss:3.5116 train_time:157260ms step_avg:169.83ms
step:937/1530 train_loss:3.6123 train_time:157439ms step_avg:169.84ms
step:938/1530 train_loss:3.5336 train_time:157614ms step_avg:169.84ms
step:939/1530 train_loss:3.6644 train_time:157794ms step_avg:169.85ms
step:940/1530 train_loss:3.4749 train_time:157969ms step_avg:169.86ms
step:941/1530 train_loss:3.5427 train_time:158142ms step_avg:169.86ms
step:942/1530 train_loss:3.3510 train_time:158319ms step_avg:169.87ms
step:943/1530 train_loss:3.7015 train_time:158501ms step_avg:169.88ms
step:944/1530 train_loss:3.3941 train_time:158814ms step_avg:170.04ms
step:945/1530 train_loss:3.4190 train_time:158998ms step_avg:170.05ms
step:946/1530 train_loss:5.0705 train_time:159179ms step_avg:170.06ms
step:947/1530 train_loss:3.5910 train_time:159355ms step_avg:170.07ms
step:948/1530 train_loss:3.4724 train_time:159532ms step_avg:170.08ms
step:949/1530 train_loss:3.3685 train_time:159854ms step_avg:170.24ms
step:950/1530 train_loss:3.4355 train_time:160030ms step_avg:170.24ms
step:951/1530 train_loss:3.4075 train_time:160210ms step_avg:170.25ms
step:952/1530 train_loss:3.4693 train_time:160384ms step_avg:170.26ms
step:953/1530 train_loss:3.5552 train_time:160561ms step_avg:170.27ms
step:954/1530 train_loss:3.4389 train_time:160740ms step_avg:170.28ms
step:955/1530 train_loss:3.4695 train_time:160915ms step_avg:170.28ms
step:956/1530 train_loss:3.4389 train_time:161089ms step_avg:170.28ms
step:957/1530 train_loss:3.4894 train_time:161269ms step_avg:170.29ms
step:958/1530 train_loss:3.5032 train_time:161448ms step_avg:170.30ms
step:959/1530 train_loss:3.5068 train_time:161625ms step_avg:170.31ms
step:960/1530 train_loss:3.4071 train_time:161802ms step_avg:170.32ms
step:961/1530 train_loss:3.6392 train_time:161977ms step_avg:170.32ms
step:962/1530 train_loss:3.5820 train_time:162151ms step_avg:170.33ms
step:963/1530 train_loss:3.5891 train_time:162330ms step_avg:170.34ms
step:964/1530 train_loss:3.4225 train_time:162509ms step_avg:170.35ms
step:965/1530 train_loss:3.4701 train_time:162683ms step_avg:170.35ms
step:966/1530 train_loss:3.7049 train_time:162859ms step_avg:170.35ms
step:967/1530 train_loss:3.5128 train_time:163034ms step_avg:170.36ms
step:968/1530 train_loss:3.5115 train_time:163210ms step_avg:170.36ms
step:969/1530 train_loss:3.5832 train_time:163385ms step_avg:170.37ms
step:970/1530 train_loss:3.3661 train_time:163558ms step_avg:170.37ms
step:971/1530 train_loss:3.5244 train_time:163732ms step_avg:170.38ms
step:972/1530 train_loss:3.4631 train_time:163907ms step_avg:170.38ms
step:973/1530 train_loss:3.5360 train_time:164080ms step_avg:170.38ms
step:974/1530 train_loss:3.5851 train_time:164256ms step_avg:170.39ms
step:975/1530 train_loss:3.4594 train_time:164432ms step_avg:170.40ms
step:976/1530 train_loss:3.6671 train_time:164607ms step_avg:170.40ms
step:977/1530 train_loss:3.5639 train_time:164782ms step_avg:170.40ms
step:978/1530 train_loss:3.3494 train_time:164955ms step_avg:170.41ms
step:979/1530 train_loss:3.6146 train_time:165131ms step_avg:170.41ms
step:980/1530 train_loss:3.4114 train_time:165307ms step_avg:170.42ms
step:981/1530 train_loss:3.5724 train_time:165484ms step_avg:170.43ms
step:982/1530 train_loss:3.5328 train_time:165657ms step_avg:170.43ms
step:983/1530 train_loss:3.5043 train_time:165833ms step_avg:170.43ms
step:984/1530 train_loss:3.4917 train_time:166009ms step_avg:170.44ms
step:985/1530 train_loss:3.5680 train_time:166186ms step_avg:170.45ms
step:986/1530 train_loss:3.4033 train_time:166362ms step_avg:170.45ms
step:987/1530 train_loss:3.4810 train_time:166536ms step_avg:170.46ms
step:988/1530 train_loss:3.4528 train_time:166711ms step_avg:170.46ms
step:989/1530 train_loss:3.4139 train_time:166884ms step_avg:170.46ms
step:990/1530 train_loss:3.6555 train_time:167060ms step_avg:170.47ms
step:991/1530 train_loss:3.4626 train_time:167235ms step_avg:170.47ms
step:992/1530 train_loss:3.4385 train_time:167416ms step_avg:170.48ms
step:993/1530 train_loss:3.4967 train_time:167595ms step_avg:170.49ms
step:994/1530 train_loss:3.5911 train_time:167769ms step_avg:170.50ms
step:995/1530 train_loss:3.5264 train_time:167940ms step_avg:170.50ms
step:996/1530 train_loss:3.4525 train_time:168115ms step_avg:170.50ms
step:997/1530 train_loss:3.7470 train_time:168290ms step_avg:170.51ms
step:998/1530 train_loss:3.4351 train_time:168462ms step_avg:170.51ms
step:999/1530 train_loss:3.5815 train_time:168635ms step_avg:170.51ms
step:1000/1530 train_loss:3.4361 train_time:168812ms step_avg:170.52ms
step:1000/1530 val_loss:3.4621 train_time:168863ms step_avg:170.57ms
step:1001/1530 train_loss:3.4929 train_time:168987ms step_avg:170.52ms
step:1002/1530 train_loss:3.3696 train_time:169162ms step_avg:170.53ms
step:1003/1530 train_loss:3.5487 train_time:169339ms step_avg:170.53ms
step:1004/1530 train_loss:3.5973 train_time:169515ms step_avg:170.54ms
step:1005/1530 train_loss:3.3880 train_time:169689ms step_avg:170.54ms
step:1006/1530 train_loss:3.4673 train_time:169865ms step_avg:170.55ms
step:1007/1530 train_loss:3.4344 train_time:170041ms step_avg:170.55ms
step:1008/1530 train_loss:3.5557 train_time:170216ms step_avg:170.56ms
step:1009/1530 train_loss:3.6606 train_time:170395ms step_avg:170.57ms
step:1010/1530 train_loss:3.5594 train_time:170568ms step_avg:170.57ms
step:1011/1530 train_loss:3.5274 train_time:170743ms step_avg:170.57ms
step:1012/1530 train_loss:3.3824 train_time:170917ms step_avg:170.58ms
step:1013/1530 train_loss:3.5306 train_time:171092ms step_avg:170.58ms
step:1014/1530 train_loss:3.6109 train_time:171267ms step_avg:170.59ms
step:1015/1530 train_loss:3.3269 train_time:171445ms step_avg:170.59ms
step:1016/1530 train_loss:3.3923 train_time:171619ms step_avg:170.60ms
step:1017/1530 train_loss:3.3911 train_time:171796ms step_avg:170.60ms
step:1018/1530 train_loss:3.3926 train_time:171971ms step_avg:170.61ms
step:1019/1530 train_loss:3.5128 train_time:172146ms step_avg:170.61ms
step:1020/1530 train_loss:3.3725 train_time:172323ms step_avg:170.62ms
step:1021/1530 train_loss:3.3497 train_time:172498ms step_avg:170.62ms
step:1022/1530 train_loss:3.4712 train_time:172673ms step_avg:170.63ms
step:1023/1530 train_loss:3.5013 train_time:172849ms step_avg:170.63ms
step:1024/1530 train_loss:3.4689 train_time:173026ms step_avg:170.64ms
step:1025/1530 train_loss:3.4719 train_time:173204ms step_avg:170.64ms
step:1026/1530 train_loss:3.6110 train_time:173379ms step_avg:170.65ms
step:1027/1530 train_loss:3.3138 train_time:173556ms step_avg:170.65ms
step:1028/1530 train_loss:3.3943 train_time:173736ms step_avg:170.66ms
step:1029/1530 train_loss:3.3012 train_time:173915ms step_avg:170.67ms
step:1030/1530 train_loss:3.5323 train_time:174091ms step_avg:170.68ms
step:1031/1530 train_loss:3.5051 train_time:174266ms step_avg:170.68ms
step:1032/1530 train_loss:3.6845 train_time:174447ms step_avg:170.69ms
step:1033/1530 train_loss:3.4842 train_time:174623ms step_avg:170.70ms
step:1034/1530 train_loss:3.3877 train_time:174799ms step_avg:170.70ms
step:1035/1530 train_loss:3.4385 train_time:174977ms step_avg:170.71ms
step:1036/1530 train_loss:3.4785 train_time:175154ms step_avg:170.71ms
step:1037/1530 train_loss:3.7774 train_time:175330ms step_avg:170.72ms
step:1038/1530 train_loss:3.6087 train_time:175509ms step_avg:170.73ms
step:1039/1530 train_loss:3.5036 train_time:175690ms step_avg:170.74ms
step:1040/1530 train_loss:3.4075 train_time:175865ms step_avg:170.74ms
step:1041/1530 train_loss:3.4805 train_time:176044ms step_avg:170.75ms
step:1042/1530 train_loss:3.5137 train_time:176217ms step_avg:170.75ms
step:1043/1530 train_loss:3.4393 train_time:176391ms step_avg:170.76ms
step:1044/1530 train_loss:3.4536 train_time:176568ms step_avg:170.76ms
step:1045/1530 train_loss:3.5107 train_time:176746ms step_avg:170.77ms
step:1046/1530 train_loss:3.4220 train_time:176923ms step_avg:170.78ms
step:1047/1530 train_loss:3.6273 train_time:177101ms step_avg:170.78ms
step:1048/1530 train_loss:3.4897 train_time:177277ms step_avg:170.79ms
step:1049/1530 train_loss:3.3995 train_time:177453ms step_avg:170.79ms
step:1050/1530 train_loss:3.3855 train_time:177630ms step_avg:170.80ms
step:1051/1530 train_loss:3.4909 train_time:177808ms step_avg:170.81ms
step:1052/1530 train_loss:3.3563 train_time:177986ms step_avg:170.81ms
step:1053/1530 train_loss:3.6858 train_time:178165ms step_avg:170.82ms
step:1054/1530 train_loss:3.5333 train_time:178345ms step_avg:170.83ms
step:1055/1530 train_loss:3.3795 train_time:178520ms step_avg:170.83ms
step:1056/1530 train_loss:3.4933 train_time:178695ms step_avg:170.84ms
step:1057/1530 train_loss:3.5770 train_time:178872ms step_avg:170.84ms
step:1058/1530 train_loss:3.2980 train_time:179050ms step_avg:170.85ms
step:1059/1530 train_loss:3.3663 train_time:179232ms step_avg:170.86ms
step:1060/1530 train_loss:3.4327 train_time:179407ms step_avg:170.86ms
step:1061/1530 train_loss:3.4096 train_time:179582ms step_avg:170.87ms
step:1062/1530 train_loss:3.3746 train_time:179758ms step_avg:170.87ms
step:1063/1530 train_loss:3.4571 train_time:179934ms step_avg:170.88ms
step:1064/1530 train_loss:3.3742 train_time:180107ms step_avg:170.88ms
step:1065/1530 train_loss:3.3565 train_time:180285ms step_avg:170.89ms
step:1066/1530 train_loss:3.4116 train_time:180462ms step_avg:170.89ms
step:1067/1530 train_loss:3.2823 train_time:180642ms step_avg:170.90ms
step:1068/1530 train_loss:3.4282 train_time:180819ms step_avg:170.91ms
step:1069/1530 train_loss:3.2912 train_time:181000ms step_avg:170.92ms
step:1070/1530 train_loss:3.5627 train_time:181175ms step_avg:170.92ms
step:1071/1530 train_loss:3.5047 train_time:181353ms step_avg:170.93ms
step:1072/1530 train_loss:3.4315 train_time:181529ms step_avg:170.93ms
step:1073/1530 train_loss:3.5175 train_time:181704ms step_avg:170.94ms
step:1074/1530 train_loss:3.4270 train_time:181880ms step_avg:170.94ms
step:1075/1530 train_loss:3.3982 train_time:182057ms step_avg:170.95ms
step:1076/1530 train_loss:3.7890 train_time:182234ms step_avg:170.95ms
step:1077/1530 train_loss:3.4297 train_time:182408ms step_avg:170.95ms
step:1078/1530 train_loss:3.0802 train_time:182592ms step_avg:170.97ms
step:1079/1530 train_loss:3.5270 train_time:182769ms step_avg:170.97ms
step:1080/1530 train_loss:3.4227 train_time:182946ms step_avg:170.98ms
step:1081/1530 train_loss:3.4968 train_time:183121ms step_avg:170.98ms
step:1082/1530 train_loss:3.5816 train_time:183298ms step_avg:170.99ms
step:1083/1530 train_loss:3.4908 train_time:183473ms step_avg:170.99ms
step:1084/1530 train_loss:3.4564 train_time:183648ms step_avg:170.99ms
step:1085/1530 train_loss:3.4293 train_time:183824ms step_avg:171.00ms
step:1086/1530 train_loss:3.6190 train_time:184000ms step_avg:171.00ms
step:1087/1530 train_loss:3.5016 train_time:184175ms step_avg:171.01ms
step:1088/1530 train_loss:3.3665 train_time:184352ms step_avg:171.01ms
step:1089/1530 train_loss:3.3700 train_time:184532ms step_avg:171.02ms
step:1090/1530 train_loss:3.4743 train_time:184710ms step_avg:171.03ms
step:1091/1530 train_loss:3.2810 train_time:184887ms step_avg:171.03ms
step:1092/1530 train_loss:3.4776 train_time:185064ms step_avg:171.04ms
step:1093/1530 train_loss:3.5989 train_time:185242ms step_avg:171.05ms
step:1094/1530 train_loss:3.4411 train_time:185417ms step_avg:171.05ms
step:1095/1530 train_loss:3.4101 train_time:185592ms step_avg:171.05ms
step:1096/1530 train_loss:3.4219 train_time:185767ms step_avg:171.06ms
step:1097/1530 train_loss:3.4886 train_time:185946ms step_avg:171.06ms
step:1098/1530 train_loss:3.5581 train_time:186125ms step_avg:171.07ms
step:1099/1530 train_loss:3.5223 train_time:186304ms step_avg:171.08ms
step:1100/1530 train_loss:3.4214 train_time:186483ms step_avg:171.09ms
step:1101/1530 train_loss:3.2833 train_time:186662ms step_avg:171.09ms
step:1102/1530 train_loss:3.3052 train_time:186841ms step_avg:171.10ms
step:1103/1530 train_loss:3.4369 train_time:187022ms step_avg:171.11ms
step:1104/1530 train_loss:3.3131 train_time:187197ms step_avg:171.11ms
step:1105/1530 train_loss:4.0536 train_time:187372ms step_avg:171.12ms
step:1106/1530 train_loss:3.2130 train_time:187548ms step_avg:171.12ms
step:1107/1530 train_loss:3.5610 train_time:187724ms step_avg:171.13ms
step:1108/1530 train_loss:3.3425 train_time:187899ms step_avg:171.13ms
step:1109/1530 train_loss:3.4977 train_time:188072ms step_avg:171.13ms
step:1110/1530 train_loss:3.4173 train_time:188247ms step_avg:171.13ms
step:1111/1530 train_loss:3.4800 train_time:188423ms step_avg:171.14ms
step:1112/1530 train_loss:3.5543 train_time:188603ms step_avg:171.15ms
step:1113/1530 train_loss:3.4244 train_time:188786ms step_avg:171.16ms
step:1114/1530 train_loss:3.3636 train_time:188969ms step_avg:171.17ms
step:1115/1530 train_loss:3.2374 train_time:189150ms step_avg:171.18ms
step:1116/1530 train_loss:3.4213 train_time:189324ms step_avg:171.18ms
step:1117/1530 train_loss:3.5857 train_time:189503ms step_avg:171.19ms
step:1118/1530 train_loss:3.6201 train_time:189680ms step_avg:171.19ms
step:1119/1530 train_loss:3.4721 train_time:189856ms step_avg:171.20ms
step:1120/1530 train_loss:3.4842 train_time:190031ms step_avg:171.20ms
step:1121/1530 train_loss:3.3867 train_time:190209ms step_avg:171.21ms
step:1122/1530 train_loss:3.4517 train_time:190383ms step_avg:171.21ms
step:1123/1530 train_loss:3.5721 train_time:190560ms step_avg:171.21ms
step:1124/1530 train_loss:3.3327 train_time:190738ms step_avg:171.22ms
step:1125/1530 train_loss:3.2195 train_time:190914ms step_avg:171.22ms
step:1125/1530 val_loss:3.4031 train_time:190965ms step_avg:171.27ms
step:1126/1530 train_loss:3.4729 train_time:191091ms step_avg:171.23ms
step:1127/1530 train_loss:3.6690 train_time:191269ms step_avg:171.23ms
step:1128/1530 train_loss:3.2259 train_time:191446ms step_avg:171.24ms
step:1129/1530 train_loss:3.5477 train_time:191625ms step_avg:171.25ms
step:1130/1530 train_loss:3.3680 train_time:191803ms step_avg:171.25ms
step:1131/1530 train_loss:3.3960 train_time:191986ms step_avg:171.26ms
step:1132/1530 train_loss:3.3614 train_time:192159ms step_avg:171.26ms
step:1133/1530 train_loss:3.4820 train_time:192470ms step_avg:171.39ms
step:1134/1530 train_loss:3.4374 train_time:192652ms step_avg:171.40ms
step:1135/1530 train_loss:3.5168 train_time:192829ms step_avg:171.40ms
step:1136/1530 train_loss:3.5577 train_time:193007ms step_avg:171.41ms
step:1137/1530 train_loss:3.4489 train_time:193184ms step_avg:171.41ms
step:1138/1530 train_loss:3.3464 train_time:193362ms step_avg:171.42ms
step:1139/1530 train_loss:3.6444 train_time:193688ms step_avg:171.56ms
step:1140/1530 train_loss:3.4466 train_time:193868ms step_avg:171.57ms
step:1141/1530 train_loss:3.5898 train_time:194051ms step_avg:171.57ms
step:1142/1530 train_loss:3.4433 train_time:194229ms step_avg:171.58ms
step:1143/1530 train_loss:3.3600 train_time:194408ms step_avg:171.59ms
step:1144/1530 train_loss:3.4429 train_time:194585ms step_avg:171.59ms
step:1145/1530 train_loss:3.5870 train_time:194758ms step_avg:171.59ms
step:1146/1530 train_loss:3.5525 train_time:194938ms step_avg:171.60ms
step:1147/1530 train_loss:3.4747 train_time:195116ms step_avg:171.61ms
step:1148/1530 train_loss:3.4933 train_time:195295ms step_avg:171.61ms
step:1149/1530 train_loss:3.3218 train_time:195475ms step_avg:171.62ms
step:1150/1530 train_loss:3.3723 train_time:195653ms step_avg:171.63ms
step:1151/1530 train_loss:3.3161 train_time:195832ms step_avg:171.63ms
step:1152/1530 train_loss:3.3907 train_time:196012ms step_avg:171.64ms
step:1153/1530 train_loss:3.4244 train_time:196191ms step_avg:171.65ms
step:1154/1530 train_loss:3.5143 train_time:196367ms step_avg:171.65ms
step:1155/1530 train_loss:3.3139 train_time:196550ms step_avg:171.66ms
step:1156/1530 train_loss:3.5292 train_time:196732ms step_avg:171.67ms
step:1157/1530 train_loss:3.4879 train_time:196910ms step_avg:171.67ms
step:1158/1530 train_loss:3.2490 train_time:197086ms step_avg:171.68ms
step:1159/1530 train_loss:3.3398 train_time:197263ms step_avg:171.68ms
step:1160/1530 train_loss:3.3324 train_time:197439ms step_avg:171.69ms
step:1161/1530 train_loss:3.0787 train_time:197617ms step_avg:171.69ms
step:1162/1530 train_loss:3.4173 train_time:197794ms step_avg:171.70ms
step:1163/1530 train_loss:3.3857 train_time:197972ms step_avg:171.70ms
step:1164/1530 train_loss:3.2837 train_time:198150ms step_avg:171.71ms
step:1165/1530 train_loss:3.2413 train_time:198325ms step_avg:171.71ms
step:1166/1530 train_loss:3.3830 train_time:198506ms step_avg:171.72ms
step:1167/1530 train_loss:3.4074 train_time:198682ms step_avg:171.72ms
step:1168/1530 train_loss:3.7151 train_time:198856ms step_avg:171.72ms
step:1169/1530 train_loss:3.3705 train_time:199034ms step_avg:171.73ms
step:1170/1530 train_loss:3.3859 train_time:199211ms step_avg:171.73ms
step:1171/1530 train_loss:3.2971 train_time:199386ms step_avg:171.74ms
step:1172/1530 train_loss:3.4153 train_time:199561ms step_avg:171.74ms
step:1173/1530 train_loss:3.5341 train_time:199741ms step_avg:171.75ms
step:1174/1530 train_loss:3.3767 train_time:199925ms step_avg:171.76ms
step:1175/1530 train_loss:3.3556 train_time:200105ms step_avg:171.76ms
step:1176/1530 train_loss:3.4199 train_time:200285ms step_avg:171.77ms
step:1177/1530 train_loss:3.4455 train_time:200466ms step_avg:171.78ms
step:1178/1530 train_loss:3.4959 train_time:200641ms step_avg:171.78ms
step:1179/1530 train_loss:3.3957 train_time:200816ms step_avg:171.78ms
step:1180/1530 train_loss:3.3507 train_time:201004ms step_avg:171.80ms
step:1181/1530 train_loss:3.3324 train_time:201182ms step_avg:171.80ms
step:1182/1530 train_loss:3.3711 train_time:201358ms step_avg:171.81ms
step:1183/1530 train_loss:3.3276 train_time:201536ms step_avg:171.81ms
step:1184/1530 train_loss:3.5054 train_time:201713ms step_avg:171.82ms
step:1185/1530 train_loss:3.5388 train_time:201894ms step_avg:171.82ms
step:1186/1530 train_loss:3.3607 train_time:202074ms step_avg:171.83ms
step:1187/1530 train_loss:3.4096 train_time:202260ms step_avg:171.84ms
step:1188/1530 train_loss:3.4346 train_time:202438ms step_avg:171.85ms
step:1189/1530 train_loss:3.2701 train_time:202619ms step_avg:171.86ms
step:1190/1530 train_loss:3.4395 train_time:202797ms step_avg:171.86ms
step:1191/1530 train_loss:3.5792 train_time:202977ms step_avg:171.87ms
step:1192/1530 train_loss:3.3879 train_time:203151ms step_avg:171.87ms
step:1193/1530 train_loss:3.2659 train_time:203327ms step_avg:171.87ms
step:1194/1530 train_loss:3.5523 train_time:203504ms step_avg:171.88ms
step:1195/1530 train_loss:3.3637 train_time:203684ms step_avg:171.89ms
step:1196/1530 train_loss:3.3850 train_time:203869ms step_avg:171.90ms
step:1197/1530 train_loss:3.2909 train_time:204049ms step_avg:171.90ms
step:1198/1530 train_loss:3.2975 train_time:204233ms step_avg:171.91ms
step:1199/1530 train_loss:3.3382 train_time:204413ms step_avg:171.92ms
step:1200/1530 train_loss:3.4441 train_time:204591ms step_avg:171.92ms
step:1201/1530 train_loss:3.4796 train_time:204769ms step_avg:171.93ms
step:1202/1530 train_loss:3.6158 train_time:204958ms step_avg:171.94ms
step:1203/1530 train_loss:3.4011 train_time:205138ms step_avg:171.95ms
step:1204/1530 train_loss:3.3023 train_time:205318ms step_avg:171.96ms
step:1205/1530 train_loss:3.4341 train_time:205494ms step_avg:171.96ms
step:1206/1530 train_loss:3.4746 train_time:205669ms step_avg:171.96ms
step:1207/1530 train_loss:3.5118 train_time:205847ms step_avg:171.97ms
step:1208/1530 train_loss:3.3878 train_time:206023ms step_avg:171.97ms
step:1209/1530 train_loss:3.2399 train_time:206202ms step_avg:171.98ms
step:1210/1530 train_loss:3.3052 train_time:206381ms step_avg:171.98ms
step:1211/1530 train_loss:3.3930 train_time:206558ms step_avg:171.99ms
step:1212/1530 train_loss:3.3845 train_time:206736ms step_avg:171.99ms
step:1213/1530 train_loss:3.4046 train_time:206914ms step_avg:172.00ms
step:1214/1530 train_loss:3.2489 train_time:207097ms step_avg:172.01ms
step:1215/1530 train_loss:3.3901 train_time:207275ms step_avg:172.01ms
step:1216/1530 train_loss:3.3251 train_time:207453ms step_avg:172.02ms
step:1217/1530 train_loss:3.3203 train_time:207631ms step_avg:172.02ms
step:1218/1530 train_loss:3.4006 train_time:207811ms step_avg:172.03ms
step:1219/1530 train_loss:3.2526 train_time:207994ms step_avg:172.04ms
step:1220/1530 train_loss:3.4642 train_time:208171ms step_avg:172.04ms
step:1221/1530 train_loss:3.4962 train_time:208348ms step_avg:172.05ms
step:1222/1530 train_loss:3.4239 train_time:208524ms step_avg:172.05ms
step:1223/1530 train_loss:3.2915 train_time:208701ms step_avg:172.05ms
step:1224/1530 train_loss:3.2489 train_time:208884ms step_avg:172.06ms
step:1225/1530 train_loss:3.3597 train_time:209061ms step_avg:172.07ms
step:1226/1530 train_loss:3.3269 train_time:209240ms step_avg:172.07ms
step:1227/1530 train_loss:3.2696 train_time:209421ms step_avg:172.08ms
step:1228/1530 train_loss:3.4416 train_time:209596ms step_avg:172.08ms
step:1229/1530 train_loss:3.3634 train_time:209776ms step_avg:172.09ms
step:1230/1530 train_loss:3.3935 train_time:209959ms step_avg:172.10ms
step:1231/1530 train_loss:3.5718 train_time:210140ms step_avg:172.10ms
step:1232/1530 train_loss:3.4929 train_time:210319ms step_avg:172.11ms
step:1233/1530 train_loss:3.4257 train_time:210498ms step_avg:172.12ms
step:1234/1530 train_loss:3.5803 train_time:210676ms step_avg:172.12ms
step:1235/1530 train_loss:3.3177 train_time:210855ms step_avg:172.13ms
step:1236/1530 train_loss:3.2876 train_time:211033ms step_avg:172.13ms
step:1237/1530 train_loss:3.2699 train_time:211210ms step_avg:172.14ms
step:1238/1530 train_loss:3.2770 train_time:211394ms step_avg:172.14ms
step:1239/1530 train_loss:3.3270 train_time:211572ms step_avg:172.15ms
step:1240/1530 train_loss:3.3808 train_time:211751ms step_avg:172.15ms
step:1241/1530 train_loss:3.4190 train_time:211930ms step_avg:172.16ms
step:1242/1530 train_loss:3.2909 train_time:212108ms step_avg:172.17ms
step:1243/1530 train_loss:3.4003 train_time:212286ms step_avg:172.17ms
step:1244/1530 train_loss:3.3988 train_time:212460ms step_avg:172.17ms
step:1245/1530 train_loss:3.4047 train_time:212636ms step_avg:172.17ms
step:1246/1530 train_loss:3.2430 train_time:212813ms step_avg:172.18ms
step:1247/1530 train_loss:3.3677 train_time:212989ms step_avg:172.18ms
step:1248/1530 train_loss:3.4257 train_time:213167ms step_avg:172.19ms
step:1249/1530 train_loss:3.4228 train_time:213348ms step_avg:172.19ms
step:1250/1530 train_loss:3.2995 train_time:213527ms step_avg:172.20ms
step:1250/1530 val_loss:3.3504 train_time:213579ms step_avg:172.24ms
step:1251/1530 train_loss:3.4802 train_time:213711ms step_avg:172.21ms
step:1252/1530 train_loss:3.3538 train_time:213886ms step_avg:172.21ms
step:1253/1530 train_loss:3.3092 train_time:214064ms step_avg:172.22ms
step:1254/1530 train_loss:3.4134 train_time:214246ms step_avg:172.22ms
step:1255/1530 train_loss:3.5141 train_time:214437ms step_avg:172.24ms
step:1256/1530 train_loss:3.3016 train_time:214619ms step_avg:172.25ms
step:1257/1530 train_loss:3.3703 train_time:214798ms step_avg:172.25ms
step:1258/1530 train_loss:3.3633 train_time:214981ms step_avg:172.26ms
step:1259/1530 train_loss:3.3217 train_time:215160ms step_avg:172.27ms
step:1260/1530 train_loss:3.2051 train_time:215339ms step_avg:172.27ms
step:1261/1530 train_loss:3.3019 train_time:215519ms step_avg:172.28ms
step:1262/1530 train_loss:3.3204 train_time:215701ms step_avg:172.28ms
step:1263/1530 train_loss:3.2371 train_time:215880ms step_avg:172.29ms
step:1264/1530 train_loss:3.4325 train_time:216055ms step_avg:172.29ms
step:1265/1530 train_loss:3.4204 train_time:216230ms step_avg:172.29ms
step:1266/1530 train_loss:3.4368 train_time:216409ms step_avg:172.30ms
step:1267/1530 train_loss:3.3637 train_time:216592ms step_avg:172.31ms
step:1268/1530 train_loss:3.4023 train_time:216772ms step_avg:172.31ms
step:1269/1530 train_loss:3.2499 train_time:216956ms step_avg:172.32ms
step:1270/1530 train_loss:3.1010 train_time:217135ms step_avg:172.33ms
step:1271/1530 train_loss:3.3977 train_time:217313ms step_avg:172.33ms
step:1272/1530 train_loss:3.3456 train_time:217488ms step_avg:172.34ms
step:1273/1530 train_loss:3.3703 train_time:217668ms step_avg:172.34ms
step:1274/1530 train_loss:3.3579 train_time:217849ms step_avg:172.35ms
step:1275/1530 train_loss:3.4309 train_time:218025ms step_avg:172.35ms
step:1276/1530 train_loss:3.4711 train_time:218200ms step_avg:172.35ms
step:1277/1530 train_loss:3.4072 train_time:218378ms step_avg:172.36ms
step:1278/1530 train_loss:3.4060 train_time:218552ms step_avg:172.36ms
step:1279/1530 train_loss:3.2585 train_time:218735ms step_avg:172.37ms
step:1280/1530 train_loss:3.3617 train_time:218920ms step_avg:172.38ms
step:1281/1530 train_loss:3.4250 train_time:219097ms step_avg:172.38ms
step:1282/1530 train_loss:3.4646 train_time:219272ms step_avg:172.38ms
step:1283/1530 train_loss:3.3328 train_time:219450ms step_avg:172.39ms
step:1284/1530 train_loss:3.3613 train_time:219628ms step_avg:172.39ms
step:1285/1530 train_loss:3.3564 train_time:219808ms step_avg:172.40ms
step:1286/1530 train_loss:3.3301 train_time:219986ms step_avg:172.40ms
step:1287/1530 train_loss:3.4845 train_time:220165ms step_avg:172.41ms
step:1288/1530 train_loss:3.2866 train_time:220347ms step_avg:172.42ms
step:1289/1530 train_loss:3.3749 train_time:220534ms step_avg:172.43ms
step:1290/1530 train_loss:3.4528 train_time:220720ms step_avg:172.44ms
step:1291/1530 train_loss:3.3764 train_time:220899ms step_avg:172.44ms
step:1292/1530 train_loss:3.4746 train_time:221080ms step_avg:172.45ms
step:1293/1530 train_loss:3.5125 train_time:221260ms step_avg:172.46ms
step:1294/1530 train_loss:3.4561 train_time:221441ms step_avg:172.46ms
step:1295/1530 train_loss:3.2779 train_time:221621ms step_avg:172.47ms
step:1296/1530 train_loss:3.3719 train_time:221801ms step_avg:172.47ms
step:1297/1530 train_loss:3.2697 train_time:221981ms step_avg:172.48ms
step:1298/1530 train_loss:3.2666 train_time:222162ms step_avg:172.49ms
step:1299/1530 train_loss:3.3928 train_time:222341ms step_avg:172.49ms
step:1300/1530 train_loss:3.3969 train_time:222518ms step_avg:172.49ms
step:1301/1530 train_loss:3.4002 train_time:222694ms step_avg:172.50ms
step:1302/1530 train_loss:3.5743 train_time:222876ms step_avg:172.50ms
step:1303/1530 train_loss:3.3057 train_time:223058ms step_avg:172.51ms
step:1304/1530 train_loss:3.5072 train_time:223240ms step_avg:172.52ms
step:1305/1530 train_loss:3.2515 train_time:223417ms step_avg:172.52ms
step:1306/1530 train_loss:3.4460 train_time:223599ms step_avg:172.53ms
step:1307/1530 train_loss:3.4562 train_time:223774ms step_avg:172.53ms
step:1308/1530 train_loss:3.2826 train_time:223953ms step_avg:172.54ms
step:1309/1530 train_loss:3.3080 train_time:224131ms step_avg:172.54ms
step:1310/1530 train_loss:3.2854 train_time:224308ms step_avg:172.54ms
step:1311/1530 train_loss:3.2926 train_time:224485ms step_avg:172.55ms
step:1312/1530 train_loss:3.3690 train_time:224664ms step_avg:172.55ms
step:1313/1530 train_loss:3.3399 train_time:224840ms step_avg:172.56ms
step:1314/1530 train_loss:3.0403 train_time:225024ms step_avg:172.56ms
step:1315/1530 train_loss:3.2734 train_time:225201ms step_avg:172.57ms
step:1316/1530 train_loss:3.3947 train_time:225377ms step_avg:172.57ms
step:1317/1530 train_loss:3.4148 train_time:225554ms step_avg:172.57ms
step:1318/1530 train_loss:3.2996 train_time:225741ms step_avg:172.58ms
step:1319/1530 train_loss:3.4206 train_time:225922ms step_avg:172.59ms
step:1320/1530 train_loss:3.4590 train_time:226104ms step_avg:172.60ms
step:1321/1530 train_loss:3.3573 train_time:226282ms step_avg:172.60ms
step:1322/1530 train_loss:3.3163 train_time:226593ms step_avg:172.71ms
step:1323/1530 train_loss:3.3155 train_time:226781ms step_avg:172.72ms
step:1324/1530 train_loss:3.4326 train_time:226963ms step_avg:172.73ms
step:1325/1530 train_loss:3.4857 train_time:227148ms step_avg:172.74ms
step:1326/1530 train_loss:3.2095 train_time:227328ms step_avg:172.74ms
step:1327/1530 train_loss:3.1642 train_time:227504ms step_avg:172.74ms
step:1328/1530 train_loss:3.4949 train_time:227684ms step_avg:172.75ms
step:1329/1530 train_loss:3.2948 train_time:228020ms step_avg:172.87ms
step:1330/1530 train_loss:3.4230 train_time:228202ms step_avg:172.88ms
step:1331/1530 train_loss:3.3317 train_time:228377ms step_avg:172.88ms
step:1332/1530 train_loss:3.7374 train_time:228559ms step_avg:172.89ms
step:1333/1530 train_loss:3.4773 train_time:228738ms step_avg:172.89ms
step:1334/1530 train_loss:3.3626 train_time:228917ms step_avg:172.90ms
step:1335/1530 train_loss:3.2908 train_time:229095ms step_avg:172.90ms
step:1336/1530 train_loss:3.2942 train_time:229279ms step_avg:172.91ms
step:1337/1530 train_loss:3.5464 train_time:229460ms step_avg:172.92ms
step:1338/1530 train_loss:3.5196 train_time:229640ms step_avg:172.92ms
step:1339/1530 train_loss:3.3340 train_time:229819ms step_avg:172.93ms
step:1340/1530 train_loss:3.2836 train_time:229997ms step_avg:172.93ms
step:1341/1530 train_loss:3.5880 train_time:230173ms step_avg:172.93ms
step:1342/1530 train_loss:3.3526 train_time:230352ms step_avg:172.94ms
step:1343/1530 train_loss:3.3602 train_time:230531ms step_avg:172.94ms
step:1344/1530 train_loss:3.4148 train_time:230711ms step_avg:172.95ms
step:1345/1530 train_loss:3.3813 train_time:230892ms step_avg:172.95ms
step:1346/1530 train_loss:3.2977 train_time:231068ms step_avg:172.96ms
step:1347/1530 train_loss:3.2777 train_time:231245ms step_avg:172.96ms
step:1348/1530 train_loss:3.3479 train_time:231423ms step_avg:172.96ms
step:1349/1530 train_loss:3.2714 train_time:231600ms step_avg:172.96ms
step:1350/1530 train_loss:3.3834 train_time:231780ms step_avg:172.97ms
step:1351/1530 train_loss:3.2423 train_time:231956ms step_avg:172.97ms
step:1352/1530 train_loss:3.3044 train_time:232135ms step_avg:172.98ms
step:1353/1530 train_loss:3.3962 train_time:232314ms step_avg:172.98ms
step:1354/1530 train_loss:3.2551 train_time:232490ms step_avg:172.98ms
step:1355/1530 train_loss:3.1897 train_time:232667ms step_avg:172.99ms
step:1356/1530 train_loss:3.5113 train_time:232847ms step_avg:172.99ms
step:1357/1530 train_loss:3.4212 train_time:233028ms step_avg:173.00ms
step:1358/1530 train_loss:3.1823 train_time:233205ms step_avg:173.00ms
step:1359/1530 train_loss:3.4383 train_time:233384ms step_avg:173.01ms
step:1360/1530 train_loss:3.3474 train_time:233563ms step_avg:173.01ms
step:1361/1530 train_loss:3.1277 train_time:233749ms step_avg:173.02ms
step:1362/1530 train_loss:3.3877 train_time:233931ms step_avg:173.03ms
step:1363/1530 train_loss:3.2833 train_time:234119ms step_avg:173.04ms
step:1364/1530 train_loss:3.3027 train_time:234296ms step_avg:173.04ms
step:1365/1530 train_loss:3.3103 train_time:234473ms step_avg:173.04ms
step:1366/1530 train_loss:3.4218 train_time:234656ms step_avg:173.05ms
step:1367/1530 train_loss:3.3921 train_time:234835ms step_avg:173.05ms
step:1368/1530 train_loss:3.3422 train_time:235014ms step_avg:173.06ms
step:1369/1530 train_loss:3.2750 train_time:235203ms step_avg:173.07ms
step:1370/1530 train_loss:3.6059 train_time:235381ms step_avg:173.07ms
step:1371/1530 train_loss:3.3101 train_time:235563ms step_avg:173.08ms
step:1372/1530 train_loss:3.3667 train_time:235745ms step_avg:173.09ms
step:1373/1530 train_loss:3.3649 train_time:235925ms step_avg:173.09ms
step:1374/1530 train_loss:3.1491 train_time:236106ms step_avg:173.10ms
step:1375/1530 train_loss:3.5320 train_time:236286ms step_avg:173.10ms
step:1375/1530 val_loss:3.3087 train_time:236336ms step_avg:173.14ms
step:1376/1530 train_loss:3.3426 train_time:236465ms step_avg:173.11ms
step:1377/1530 train_loss:3.4776 train_time:236643ms step_avg:173.11ms
step:1378/1530 train_loss:3.4639 train_time:236821ms step_avg:173.11ms
step:1379/1530 train_loss:3.1100 train_time:237003ms step_avg:173.12ms
step:1380/1530 train_loss:3.3123 train_time:237184ms step_avg:173.13ms
step:1381/1530 train_loss:3.7010 train_time:237369ms step_avg:173.14ms
step:1382/1530 train_loss:3.2116 train_time:237549ms step_avg:173.14ms
step:1383/1530 train_loss:3.3918 train_time:237731ms step_avg:173.15ms
step:1384/1530 train_loss:3.4749 train_time:237913ms step_avg:173.15ms
step:1385/1530 train_loss:3.4042 train_time:238088ms step_avg:173.15ms
step:1386/1530 train_loss:3.3340 train_time:238266ms step_avg:173.16ms
step:1387/1530 train_loss:3.1973 train_time:238445ms step_avg:173.16ms
step:1388/1530 train_loss:3.3464 train_time:238621ms step_avg:173.16ms
step:1389/1530 train_loss:3.3121 train_time:238806ms step_avg:173.17ms
step:1390/1530 train_loss:3.5639 train_time:238982ms step_avg:173.18ms
step:1391/1530 train_loss:3.2856 train_time:239160ms step_avg:173.18ms
step:1392/1530 train_loss:3.2825 train_time:239338ms step_avg:173.18ms
step:1393/1530 train_loss:3.2376 train_time:239517ms step_avg:173.19ms
step:1394/1530 train_loss:3.4958 train_time:239695ms step_avg:173.19ms
step:1395/1530 train_loss:3.3891 train_time:239873ms step_avg:173.19ms
step:1396/1530 train_loss:3.4017 train_time:240051ms step_avg:173.20ms
step:1397/1530 train_loss:3.3064 train_time:240228ms step_avg:173.20ms
step:1398/1530 train_loss:3.2541 train_time:240403ms step_avg:173.20ms
step:1399/1530 train_loss:3.3112 train_time:240581ms step_avg:173.20ms
step:1400/1530 train_loss:3.3179 train_time:240765ms step_avg:173.21ms
step:1401/1530 train_loss:3.3469 train_time:240941ms step_avg:173.21ms
step:1402/1530 train_loss:3.2949 train_time:241120ms step_avg:173.22ms
step:1403/1530 train_loss:3.4940 train_time:241306ms step_avg:173.23ms
step:1404/1530 train_loss:3.2793 train_time:241484ms step_avg:173.23ms
step:1405/1530 train_loss:3.3139 train_time:241665ms step_avg:173.24ms
step:1406/1530 train_loss:3.3090 train_time:241844ms step_avg:173.24ms
step:1407/1530 train_loss:3.1689 train_time:242021ms step_avg:173.24ms
step:1408/1530 train_loss:3.3128 train_time:242200ms step_avg:173.25ms
step:1409/1530 train_loss:3.3013 train_time:242388ms step_avg:173.26ms
step:1410/1530 train_loss:3.2865 train_time:242565ms step_avg:173.26ms
step:1411/1530 train_loss:3.3631 train_time:242741ms step_avg:173.26ms
step:1412/1530 train_loss:3.3337 train_time:242919ms step_avg:173.27ms
step:1413/1530 train_loss:3.3579 train_time:243097ms step_avg:173.27ms
step:1414/1530 train_loss:3.3256 train_time:243277ms step_avg:173.27ms
step:1415/1530 train_loss:3.4030 train_time:243461ms step_avg:173.28ms
step:1416/1530 train_loss:3.2303 train_time:243652ms step_avg:173.29ms
step:1417/1530 train_loss:3.2790 train_time:243834ms step_avg:173.30ms
step:1418/1530 train_loss:3.3869 train_time:244014ms step_avg:173.31ms
step:1419/1530 train_loss:3.3406 train_time:244196ms step_avg:173.31ms
step:1420/1530 train_loss:3.3667 train_time:244376ms step_avg:173.32ms
step:1421/1530 train_loss:3.3653 train_time:244556ms step_avg:173.32ms
step:1422/1530 train_loss:3.3296 train_time:244734ms step_avg:173.32ms
step:1423/1530 train_loss:3.3119 train_time:244914ms step_avg:173.33ms
step:1424/1530 train_loss:3.3310 train_time:245098ms step_avg:173.34ms
step:1425/1530 train_loss:3.1909 train_time:245283ms step_avg:173.35ms
step:1426/1530 train_loss:3.3217 train_time:245462ms step_avg:173.35ms
step:1427/1530 train_loss:3.2808 train_time:245644ms step_avg:173.35ms
step:1428/1530 train_loss:3.3758 train_time:245821ms step_avg:173.36ms
step:1429/1530 train_loss:3.3503 train_time:245999ms step_avg:173.36ms
step:1430/1530 train_loss:3.2597 train_time:246180ms step_avg:173.37ms
step:1431/1530 train_loss:3.3180 train_time:246363ms step_avg:173.37ms
step:1432/1530 train_loss:3.3401 train_time:246544ms step_avg:173.38ms
step:1433/1530 train_loss:3.1268 train_time:246728ms step_avg:173.39ms
step:1434/1530 train_loss:3.2839 train_time:246911ms step_avg:173.39ms
step:1435/1530 train_loss:3.1124 train_time:247092ms step_avg:173.40ms
step:1436/1530 train_loss:3.2258 train_time:247273ms step_avg:173.40ms
step:1437/1530 train_loss:3.4027 train_time:247451ms step_avg:173.41ms
step:1438/1530 train_loss:3.3778 train_time:247630ms step_avg:173.41ms
step:1439/1530 train_loss:3.3118 train_time:247813ms step_avg:173.42ms
step:1440/1530 train_loss:3.1915 train_time:247987ms step_avg:173.42ms
step:1441/1530 train_loss:3.3358 train_time:248166ms step_avg:173.42ms
step:1442/1530 train_loss:3.3818 train_time:248349ms step_avg:173.43ms
step:1443/1530 train_loss:3.4871 train_time:248536ms step_avg:173.44ms
step:1444/1530 train_loss:3.4454 train_time:248713ms step_avg:173.44ms
step:1445/1530 train_loss:3.3358 train_time:248892ms step_avg:173.44ms
step:1446/1530 train_loss:3.1952 train_time:249073ms step_avg:173.45ms
step:1447/1530 train_loss:3.2947 train_time:249254ms step_avg:173.45ms
step:1448/1530 train_loss:3.2956 train_time:249433ms step_avg:173.46ms
step:1449/1530 train_loss:3.3905 train_time:249612ms step_avg:173.46ms
step:1450/1530 train_loss:3.3851 train_time:249794ms step_avg:173.47ms
step:1451/1530 train_loss:3.2017 train_time:249974ms step_avg:173.47ms
step:1452/1530 train_loss:3.3245 train_time:250154ms step_avg:173.48ms
step:1453/1530 train_loss:3.2613 train_time:250329ms step_avg:173.48ms
step:1454/1530 train_loss:3.2889 train_time:250508ms step_avg:173.48ms
step:1455/1530 train_loss:3.3222 train_time:250690ms step_avg:173.49ms
step:1456/1530 train_loss:3.2799 train_time:250867ms step_avg:173.49ms
step:1457/1530 train_loss:3.1498 train_time:251044ms step_avg:173.49ms
step:1458/1530 train_loss:3.4201 train_time:251222ms step_avg:173.50ms
step:1459/1530 train_loss:3.2651 train_time:251404ms step_avg:173.50ms
step:1460/1530 train_loss:3.3136 train_time:251585ms step_avg:173.51ms
step:1461/1530 train_loss:3.4295 train_time:251765ms step_avg:173.51ms
step:1462/1530 train_loss:3.2576 train_time:251940ms step_avg:173.51ms
step:1463/1530 train_loss:3.4635 train_time:252124ms step_avg:173.52ms
step:1464/1530 train_loss:3.3553 train_time:252302ms step_avg:173.52ms
step:1465/1530 train_loss:3.3557 train_time:252480ms step_avg:173.53ms
step:1466/1530 train_loss:3.2829 train_time:252658ms step_avg:173.53ms
step:1467/1530 train_loss:3.3933 train_time:252837ms step_avg:173.53ms
step:1468/1530 train_loss:3.2843 train_time:253015ms step_avg:173.54ms
step:1469/1530 train_loss:3.2712 train_time:253195ms step_avg:173.54ms
step:1470/1530 train_loss:3.3266 train_time:253378ms step_avg:173.55ms
step:1471/1530 train_loss:3.2564 train_time:253565ms step_avg:173.56ms
step:1472/1530 train_loss:3.2427 train_time:253748ms step_avg:173.56ms
step:1473/1530 train_loss:3.4380 train_time:253925ms step_avg:173.56ms
step:1474/1530 train_loss:3.3093 train_time:254109ms step_avg:173.57ms
step:1475/1530 train_loss:3.1468 train_time:254295ms step_avg:173.58ms
step:1476/1530 train_loss:3.2668 train_time:254473ms step_avg:173.58ms
step:1477/1530 train_loss:3.2373 train_time:254660ms step_avg:173.59ms
step:1478/1530 train_loss:3.3040 train_time:254844ms step_avg:173.60ms
step:1479/1530 train_loss:3.3928 train_time:255025ms step_avg:173.60ms
step:1480/1530 train_loss:3.2673 train_time:255203ms step_avg:173.61ms
step:1481/1530 train_loss:3.4489 train_time:255385ms step_avg:173.61ms
step:1482/1530 train_loss:3.3614 train_time:255572ms step_avg:173.62ms
step:1483/1530 train_loss:3.2767 train_time:255762ms step_avg:173.63ms
step:1484/1530 train_loss:3.2607 train_time:255949ms step_avg:173.64ms
step:1485/1530 train_loss:3.2781 train_time:256129ms step_avg:173.65ms
step:1486/1530 train_loss:3.2250 train_time:256314ms step_avg:173.65ms
step:1487/1530 train_loss:3.3374 train_time:256496ms step_avg:173.66ms
step:1488/1530 train_loss:3.2455 train_time:256679ms step_avg:173.67ms
step:1489/1530 train_loss:3.3129 train_time:256859ms step_avg:173.67ms
step:1490/1530 train_loss:3.2509 train_time:257039ms step_avg:173.67ms
step:1491/1530 train_loss:3.1570 train_time:257219ms step_avg:173.68ms
step:1492/1530 train_loss:3.2722 train_time:257399ms step_avg:173.68ms
step:1493/1530 train_loss:3.4314 train_time:257578ms step_avg:173.69ms
step:1494/1530 train_loss:3.2897 train_time:257758ms step_avg:173.69ms
step:1495/1530 train_loss:3.0271 train_time:257943ms step_avg:173.70ms
step:1496/1530 train_loss:3.3555 train_time:258126ms step_avg:173.71ms
step:1497/1530 train_loss:3.3083 train_time:258311ms step_avg:173.71ms
step:1498/1530 train_loss:3.3432 train_time:258496ms step_avg:173.72ms
step:1499/1530 train_loss:3.3081 train_time:258685ms step_avg:173.73ms
step:1500/1530 train_loss:3.2944 train_time:258877ms step_avg:173.74ms
step:1500/1530 val_loss:3.2768 train_time:258933ms step_avg:173.78ms
step:1501/1530 train_loss:3.0853 train_time:259069ms step_avg:173.75ms
step:1502/1530 train_loss:3.3580 train_time:259260ms step_avg:173.77ms
step:1503/1530 train_loss:3.2429 train_time:259439ms step_avg:173.77ms
step:1504/1530 train_loss:3.2432 train_time:259620ms step_avg:173.77ms
step:1505/1530 train_loss:3.2102 train_time:259800ms step_avg:173.78ms
step:1506/1530 train_loss:3.2773 train_time:259981ms step_avg:173.78ms
step:1507/1530 train_loss:3.1756 train_time:260178ms step_avg:173.80ms
step:1508/1530 train_loss:3.4763 train_time:260360ms step_avg:173.81ms
step:1509/1530 train_loss:3.2819 train_time:260538ms step_avg:173.81ms
step:1510/1530 train_loss:3.2701 train_time:260718ms step_avg:173.81ms
step:1511/1530 train_loss:3.4171 train_time:261028ms step_avg:173.90ms
step:1512/1530 train_loss:3.4171 train_time:261215ms step_avg:173.91ms
step:1513/1530 train_loss:3.2624 train_time:261399ms step_avg:173.92ms
step:1514/1530 train_loss:3.0858 train_time:261581ms step_avg:173.92ms
step:1515/1530 train_loss:3.2414 train_time:261761ms step_avg:173.93ms
step:1516/1530 train_loss:3.2538 train_time:261946ms step_avg:173.94ms
step:1517/1530 train_loss:3.3004 train_time:262127ms step_avg:173.94ms
step:1518/1530 train_loss:3.2066 train_time:262310ms step_avg:173.95ms
step:1519/1530 train_loss:3.4996 train_time:262636ms step_avg:174.05ms
step:1520/1530 train_loss:3.1267 train_time:262823ms step_avg:174.05ms
step:1521/1530 train_loss:3.2014 train_time:262999ms step_avg:174.06ms
step:1522/1530 train_loss:3.3553 train_time:263184ms step_avg:174.06ms
step:1523/1530 train_loss:3.2244 train_time:263362ms step_avg:174.07ms
step:1524/1530 train_loss:3.3462 train_time:263542ms step_avg:174.07ms
step:1525/1530 train_loss:3.3361 train_time:263730ms step_avg:174.08ms
step:1526/1530 train_loss:3.2723 train_time:263920ms step_avg:174.09ms
step:1527/1530 train_loss:3.2867 train_time:264102ms step_avg:174.09ms
step:1528/1530 train_loss:3.4051 train_time:264282ms step_avg:174.10ms
step:1529/1530 train_loss:3.4040 train_time:264459ms step_avg:174.10ms
step:1530/1530 train_loss:3.2366 train_time:264638ms step_avg:174.10ms
step:1530/1530 val_loss:3.2744 train_time:264692ms step_avg:174.14ms