import os
import sys
# Snapshot this script's own source text before anything else runs; the master
# process later writes `code` at the top of the run's log file.
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention

# -----------------------------------------------------------------------------
# Muon optimizer

def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G exactly through its SVD: for G = U S V^T, return U V^T.

    Arguments:
        G: The real 2D matrix to orthogonalize.
        steps: Unused. Accepted so that every entry of `zeropower_backends` can be called
            with the same keyword arguments.
    """
    # torch.linalg.svd supersedes the deprecated Tensor.svd. It hands back V^T (Vh) directly,
    # and full_matrices=False yields the same reduced factors Tensor.svd produced by default.
    U, _, Vh = torch.linalg.svd(G, full_matrices=False)
    return U @ Vh

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: The 2D matrix to orthogonalize. It is never modified.
        steps: Number of quintic iterations to run.
        eps: Added to the norm before dividing so that an all-zero G does not divide by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750,  2.0315)
    X = G.bfloat16()
    # Divide out of place: Tensor.bfloat16() returns the tensor itself when G is already
    # bfloat16, so an in-place `X /= ...` would silently rescale the caller's tensor.
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    # Iterate on the wide orientation so that A = X @ X.T is the smaller Gram matrix.
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # Undo the transpose so the result has G's shape again.
    if G.size(0) > G.size(1):
        X = X.T
    return X

# Registry mapping each Muon `backend` name to the function that orthogonalizes an update.
zeropower_backends = {
    "svd": zeropower_via_svd,
    "newtonschulz5": zeropower_via_newtonschulz5,
}

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.

    Distributed layout (as implemented below):
    - WORLD_SIZE and RANK are read from the environment when the optimizer is constructed.
    - Parameters are bucketed by element count, one param group per distinct size. Each group
      owns WORLD_SIZE flat bfloat16 buffers of that size on the "cuda" device.
    - In `step`, each rank orthogonalizes one parameter out of every run of WORLD_SIZE
      consecutive parameters, and the flattened results are exchanged with an asynchronous
      all_gather so every rank can apply all updates.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        # World layout comes from the launcher's environment variables.
        self.num_process = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ["RANK"])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        params: "list[torch.Tensor]" = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One group per distinct numel, so all parameters in a group fit the same flat buffers.
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                "params": [p for p in params if p.numel() == size],
                # all_gather destination: one flat slot per rank
                "update_buffer": [
                    torch.empty(size, device="cuda", dtype=torch.bfloat16)
                    for _ in range(self.num_process)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter. Takes no closure and returns None.

        Every parameter handled by this rank must have a gradient, and each group's
        parameter count must be a multiple of WORLD_SIZE (both are asserted).
        """
        for group in self.param_groups:
            lr: float = group["lr"]
            momentum: float = group["momentum"]
            nesterov: bool = group["nesterov"]
            zeropower_backend = zeropower_backends[group["backend"]]
            backend_steps: int = group["backend_steps"]
            update_buffers: "list[torch.Tensor]" = group["update_buffer"]
            # generate weight updates in distributed fashion
            params: "list[torch.Tensor]" = group["params"]
            assert len(params) % self.num_process == 0
            # State of the all_gather currently in flight. update_prev() reads these two names
            # at call time, i.e. it sees whatever the loop below last assigned.
            handle = None
            params_world = None
            def update_prev():
                """Wait for the in-flight all_gather (if any) and apply its gathered updates."""
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    # step size is lr * sqrt(max(1, rows / cols))
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            # Walk the group in chunks of num_process parameters; this rank owns one per chunk.
            for base_i in range(len(params))[::self.num_process]:
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p] 
                if "momentum_buffer" not in state:
                    state["momentum_buffer"] = torch.zeros_like(g)
                buf: torch.Tensor = state["momentum_buffer"]
                # buf <- momentum * buf + (1 - momentum) * g
                buf.lerp_(g, 1 - momentum)
                # The Nesterov branch overwrites p.grad in place with the blended gradient.
                g = g.lerp_(buf, momentum) if nesterov else buf
                # NOTE(review): the gather buffers are bfloat16. newtonschulz5 returns bfloat16,
                # but the 'svd' backend does no cast, so its output dtype presumably follows the
                # gradient - confirm all_gather accepts that before using backend='svd' here.
                g = zeropower_backend(g, steps=backend_steps).flatten()
                # The previous chunk's gather was running while this chunk's update was being
                # computed; drain it now because update_buffers is about to be overwritten.
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.num_process]
            # apply the final chunk of this group
            update_prev()


# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype on every forward call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features=in_features, out_features=out_features, bias=False)

    def forward(self, x):
        # The stored weight keeps its own dtype; only this per-call copy follows x.
        weight = self.weight.to(x.dtype)
        return F.linear(x, weight)

class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied along axis 1 of a 4D input.

    The last axis is split into two halves which are rotated against each other by
    position-dependent angles. The cos/sin tables are cached and rebuilt only when the
    length of axis 1 changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        exponents = torch.arange(0, dim, 2) / dim
        self.register_buffer('inv_freq', (1 / base) ** exponents)
        # lazily built tables, keyed only on sequence length
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def _rebuild_tables(self, seq_len, device):
        # angle[t, j] = t * inv_freq[j]
        positions = torch.arange(seq_len, device=device)
        angles = torch.outer(positions, self.inv_freq)
        self.seq_len_cached = seq_len
        self.cos_cached = angles.cos()
        self.sin_cached = angles.sin()

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            self._rebuild_tables(seq_len, x.device)
        # broadcast the tables over axes 0 and 2 of x
        cos = self.cos_cached[None, :, None, :]
        sin = self.sin_cached[None, :, None, :]
        first_half, second_half = x.chunk(2, dim=3)
        rotated_first = first_half * cos + second_half * sin
        rotated_second = first_half * (-sin) + second_half * cos
        return torch.cat((rotated_first, rotated_second), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention evaluated with FlexAttention under a caller-supplied block mask.

    Queries and keys are RMS-normalized and rotary-embedded. Values are a learned blend of
    this layer's projection and an externally supplied value tensor `vi`.
    """

    def __init__(self, dim, n_head):
        super().__init__()
        assert dim % n_head == 0
        head_dim = dim // n_head
        self.n_head = n_head
        # query / key / value projections
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # blend weights for (own value, supplied value) - value residual by @Grad62304977
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        # rotary embeddings operate per head
        self.rotary = Rotary(head_dim)
        # output projection, zero init suggested by @Grad62304977
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_()

    def forward(self, x: torch.Tensor, vi: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        batch, seq_len = x.size(0), x.size(1)
        assert batch == 1, "Must use batch size = 1 for FlexAttention"
        per_head = (batch, seq_len, self.n_head, -1)
        q: torch.Tensor = self.c_q(x).view(per_head)
        k: torch.Tensor = self.c_k(x).view(per_head)
        v: torch.Tensor = self.c_v(x).view(per_head)
        # mix in the supplied value embedding (@Grad62304977)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v)
        # QK norm (suggested by @Grad62304977) followed by the rotary rotation
        q = self.rotary(norm(q))
        k = self.rotary(norm(k))
        # flex_attention is called with the head axis moved in front of the sequence axis
        attended = flex_attention(
            q.transpose(1, 2),
            k.transpose(1, 2),
            v.transpose(1, 2),
            block_mask=block_mask,
        )
        # put the heads back side by side, matching x's layout
        merged = attended.transpose(1, 2).contiguous().view_as(x)
        return self.c_proj(merged)

class MLP(nn.Module):
    """Feed-forward sub-layer: dim -> 4*dim -> squared ReLU -> dim, with a zero-initialized output projection."""

    def __init__(self, dim: int):
        super().__init__()
        hidden_dim = 4 * dim
        self.c_fc = CastedLinear(dim, hidden_dim)
        self.c_proj = CastedLinear(hidden_dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # ReLU^2 activation: https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU;
        # suggested by @SKYLINEZ007 and @Grad62304977
        hidden = F.relu(self.c_fc(x)).square()
        return self.c_proj(hidden)

class Block(nn.Module):
    """
    One transformer layer: a learned blend of the running stream with the initial embedding,
    followed by residual attention and a residual MLP, each fed an RMS-normalized input.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.attn = CausalSelfAttention(width, config.n_head)
        self.mlp = MLP(width)
        # weights for (x, x0); initialised so the block starts out ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: torch.Tensor, vi: torch.Tensor, x0: torch.Tensor, block_mask: BlockMask) -> torch.Tensor:
        weight_x, weight_x0 = self.lambdas[0], self.lambdas[1]
        x = weight_x * x + weight_x0 * x0
        x = x + self.attn(norm(x), vi, block_mask)
        return x + self.mlp(norm(x))

# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Architecture settings consumed by `GPT` and `Block`."""
    # number of rows in the embedding tables and of output logits
    vocab_size: int = 50304
    # total number of transformer blocks
    n_layer: int = 12
    # attention heads; 768 / 6 gives head dim 128, suggested by @Grad62304977
    n_head: int = 6
    # model width
    n_embd: int = 768
    # logits are squashed into (-softcap, +softcap) with tanh before the loss
    lm_head_softcap: int = 30

class GPT(nn.Module):
    """
    GPT-style language model with a U-net layer layout.

    The first half of the blocks ("encoder") store their outputs; the second half ("decoder")
    add them back in reverse order through learned skip weights. Attention is restricted to a
    causal sliding window that never crosses a document boundary. `forward` returns the mean
    cross-entropy loss directly.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_layer = config.n_layer
        self.lm_head_softcap = config.lm_head_softcap

        # U-net design by @brendanh0gan: floor(n_layer / 2) encoder blocks, the rest are decoder
        # blocks, and each decoder block gets one learnable skip-connection weight.
        self.num_encoder_layers = config.n_layer // 2
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        token_embedding = nn.Embedding(config.vocab_size, config.n_embd)
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual
        # learning; one n_embd-wide slice per encoder layer (U-net reuse by @leloykun)
        value_embedding = nn.Embedding(config.vocab_size, config.n_embd*self.num_encoder_layers)
        blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        self.transformer = nn.ModuleDict({
            "wte": token_embedding,
            "vte": value_embedding,
            "h": blocks,
        })
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx: torch.Tensor, target: torch.Tensor, sliding_window: torch.Tensor) -> torch.Tensor:
        BLOCK_SIZE = 128
        assert idx.ndim == 1
        # Running document index: increases by one at every token equal to 50256.
        docs = (idx == 50256).cumsum(0)
        # First and last document index inside each BLOCK_SIZE-token block.
        docs_per_block = docs.reshape(-1, BLOCK_SIZE)
        docs_low = docs_per_block[:, 0].contiguous()
        docs_high = docs_per_block[:, -1].contiguous()

        def document_sliding_window_causal(b, h, q_idx, kv_idx):
            # token-level rule: causal, same document, and within the sliding window
            is_causal = q_idx >= kv_idx
            same_document = docs[q_idx] == docs[kv_idx]
            inside_window = q_idx - kv_idx < sliding_window
            return is_causal & same_document & inside_window

        # Block-level version of the same three conditions. It selects which key/value blocks
        # each query block visits; the mask_mod above refines the selection per token.
        kv_block = torch.arange(len(idx) // BLOCK_SIZE, dtype=torch.int32, device="cuda")
        q_block = kv_block[:, None]
        block_causal = q_block >= kv_block
        # two blocks may share a document only if their document-index ranges overlap
        block_same_document = (docs_low[q_block] <= docs_high[kv_block]) & (docs_low[kv_block] <= docs_high[q_block])
        block_inside_window = q_block - kv_block < ((sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE)
        dense_mask = (block_causal & block_same_document & block_inside_window).to(torch.int32)
        kv_num_blocks = dense_mask.sum(dim=-1).to(torch.int32)
        # a stable descending sort lists the selected kv blocks first, in increasing order
        kv_indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True).to(torch.int32)
        block_mask = BlockMask.from_kv_blocks(
            kv_num_blocks[None, None, :].contiguous(),
            kv_indices[None, None, :].contiguous(),
            BLOCK_SIZE=BLOCK_SIZE,
            mask_mod=document_sliding_window_causal,
        )

        # forward the GPT model itself
        batched_idx = idx[None]
        x = norm(self.transformer.wte(batched_idx)) # normalized token embeddings, @Grad62304977
        x0 = x
        vi = self.transformer.vte(batched_idx).chunk(self.num_encoder_layers, dim=-1)

        layers = self.transformer.h
        # Encoder pass: run the first half of the blocks and remember each output.
        skip_connections = []
        for depth in range(self.num_encoder_layers):
            x = layers[depth](x, vi[depth], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass: add the stored outputs back, most recent first, and reuse the value
        # embeddings in mirrored order (U-net structure on value embeddings by @leloykun).
        for depth in range(self.num_decoder_layers):
            x = x + self.skip_weights[depth] * skip_connections.pop()
            x = layers[self.num_encoder_layers + depth](x, vi[self.num_encoder_layers-1-depth], x0, block_mask)

        logits = self.lm_head(norm(x))
        # tanh soft-capping of the logits, @Grad62304977
        logits = self.lm_head_softcap * torch.tanh(logits / self.lm_head_softcap)
        logits = logits.float()
        return F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(file: Path, ntok: int):
    """
    Read `ntok` uint16 tokens from a shard into a pinned host tensor and return it.

    The 256 * 4 byte header is skipped. Raises AssertionError when the file yields a
    different number of bytes than 2 * ntok.
    """
    header_nbytes = 256 * 4
    with file.open("rb") as shard:
        tokens = torch.empty(ntok, dtype=torch.uint16, pin_memory=True)
        shard.seek(header_nbytes)
        # fill the tensor's storage directly through its numpy view
        nbytes = shard.readinto(tokens.numpy())
    assert nbytes == 2 * ntok, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """
    Streams (input, target) token windows of length T from a set of shard files.

    Within a shard, rank r starts at offset r * T and moves forward by T * num_processes per
    batch, so the ranks read interleaved stripes. Shards are visited in sorted order and the
    loader wraps around after the last one.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # shard discovery is relative to the current working directory
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # validate every header up front; each shard must hold at least one full global batch
        self.ntoks = list(map(_peek_data_shard, self.files))
        assert min(self.ntoks) >= num_processes * T + 1
        self.ntok_total = sum(self.ntoks)

        self.reset()

    def reset(self):
        # -1 so that advance() lands on shard 0
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        shard = (self.current_shard + 1) % len(self.files)
        self.current_shard = shard
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[shard], self.ntoks[shard])

    def next_batch(self):
        start = self.current_position
        stride = self.T * self.num_processes
        # T + 1 tokens: inputs are the first T, targets are the same window shifted by one
        window = self.tokens[start:start + self.T + 1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        x = window[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # inputs
        y = window[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # targets
        # move to this rank's next stripe and switch shards when the following batch would not fit
        self.current_position = start + stride
        if self.current_position + stride + 1 >= len(self.tokens):
            self.advance()
        return x, y

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """Run configuration: data locations, optimization schedule, evaluation and checkpoint cadence."""
    # --- data ---
    # glob pattern of the training shards
    input_bin: str = 'data/fineweb10B/fineweb_train_*.bin'
    # glob pattern of the validation shards
    input_val_bin: str = 'data/fineweb10B/fineweb_val_*.bin'
    # --- optimization ---
    # global batch size in sequences, summed over all devices
    batch_size: int = 8
    # tokens per sequence
    sequence_length: int = 64 * 1024
    # number of training iterations
    num_iterations: int = 1480
    # length of the linear warmup at the start of the schedule
    warmup_iters: int = 0
    # length of the linear cooldown at the end of the schedule
    cooldown_iters: int = 600
    weight_decay: float = 0
    # --- evaluation and logging ---
    # validate every this many steps; 0 means only at the end
    val_loss_every: int = 125
    # validation tokens per evaluation; keep fixed so runs stay comparable
    val_tokens: int = 10485760
    # checkpoint every this many steps; 0 means only at the end
    save_every: int = 0


args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets this env variable
# RANK, LOCAL_RANK and WORLD_SIZE must all be present in the environment. The script has no
# single-process fallback and aborts here when CUDA is unavailable.
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
# Bind this process to the GPU matching its local rank; later device="cuda" allocations
# then land on that GPU.
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.

# begin logging
# Only the master process owns a log file; on every other rank `logfile` stays None.
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    logfile = 'logs/%s.txt' % run_id
    # open() does not create missing parent directories, so make sure `logs/` exists first;
    # otherwise a fresh checkout dies with FileNotFoundError before training starts.
    os.makedirs(os.path.dirname(logfile), exist_ok=True)
    # create the log file
    with open(logfile, "w") as f:
        # begin the log by printing this file (the Python code)
        f.write(code)
        f.write('='*100 + '\n')
def print0(s, logonly=False):
    """
    Append `s` plus a newline to the run's log file, on the master process only.

    Unless `logonly` is true the message is also printed to stdout. Every other rank
    returns without doing anything.
    """
    if not master_process:
        return
    with open(logfile, "a") as log:
        if not logonly:
            print(s)
        log.write(s+'\n')
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
# Every rank runs nvidia-smi, but print0 only records the output on the master process.
# stderr is captured and never written anywhere.
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
T = args.sequence_length
# calculate the number of steps to take in the val loop.
# Each validation step consumes T tokens on every rank, so val_tokens must divide evenly.
assert args.val_tokens % (T * ddp_world_size) == 0
val_steps = args.val_tokens // (T * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# The training loop below does exactly one forward/backward per step, so accumulation is
# not supported: batch_size has to equal the world size.
assert train_accumulation_steps == 1

# load tokens
# Constructing a loader validates every shard header and loads the first shard.
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# Fetch the first training batch before the loop; each later batch is requested right
# after the backward pass of the step before it.
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768))
# Cast the whole model to bfloat16 first, then switch every CastedLinear back with .float();
# CastedLinear casts its weight to the activation dtype on each forward call.
model = model.cuda().bfloat16()
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
# `config` here is torch._inductor.config (imported at the top of the file), not GPTConfig.
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
# NOTE: model.module is the torch.compile wrapper that was handed to DDP.
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# Four optimizers over disjoint parameter sets:
#   optimizer1 - Adam on the token embedding and value embedding tables
#   optimizer2 - Adam on the output head
#   optimizer3 - Muon on every 2D parameter inside the transformer blocks
#   optimizer4 - Adam on the remaining (<2D) block parameters plus the U-net skip weights
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight, raw_model.transformer.vte.weight], lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.transformer.h.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the learning-rate multiplier for scheduler step `it` (trapezoidal schedule).

    The multiplier ramps up linearly over the first `args.warmup_iters` steps, stays at 1.0,
    and then decays linearly to 0 over the last `args.cooldown_iters` steps. `it` may equal
    `args.num_iterations`, because the scheduler is stepped once more after the final update.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    elif args.cooldown_iters > 0:
        return (args.num_iterations - it) / args.cooldown_iters
    # No cooldown phase configured: only it == num_iterations reaches this point, where the
    # ratio above would be 0 / 0. Keep the constant multiplier instead of crashing on the
    # final scheduler step.
    else:
        return 1.0
# One LambdaLR per optimizer, all sharing get_lr as the multiplier on that optimizer's base lr.
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# The attention window lives in a GPU tensor that the training loop updates in place with
# copy_; sw_size_prev mirrors its value on the host so the copy only happens on a change.
sliding_window_size = torch.tensor(64, dtype=torch.int32, device="cuda")
sw_size_prev = 64
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
# Main loop. Runs num_iterations training steps plus one extra pass (step == num_iterations)
# that only performs the final validation and checkpoint bookkeeping before breaking out.
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    # NaN makes the step_avg fields print as "nan" until enough timed steps have elapsed.
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Set the sliding window size for the current step, in chunks of 64. By @fernbear.bsky.social
    # The window grows linearly from 64 to 1792 tokens over the run, rounded down to a multiple of 64.
    sw_size =  64 * int((64 + (1792 - 64) * step / args.num_iterations) // 64)
    if sw_size != sw_size_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_size_prev = sw_size

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        # restart from the first validation shard so every evaluation sees the same tokens
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, sliding_window=sliding_window_size)
        # average the summed loss across ranks, then over the validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process
        # NOTE: the torch.save call below is commented out, so `log` is assembled but never
        # written to disk.
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        # torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    loss = model(x, y, sliding_window=sliding_window_size)
    loss.backward()
    del loss
    # advance the dataset for the next batch
    x, y = train_loader.next_batch()
    # momentum warmup for Muon
    # linear ramp from 0.85 to 0.95 over the first 300 steps
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    # approx_time is read from the host clock without a CUDA synchronize.
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

# Report the peak CUDA memory allocated by the master process, printed to stdout only
# (not written to the log file).
if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
# Every rank tears down the process group created by dist.init_process_group above.
dist.destroy_process_group()
====================================================================================================
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
nvidia-smi:
Sun Dec  8 08:27:50 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.6     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:65:02.0 Off |                    0 |
| N/A   36C    P0              74W / 700W |      7MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  | 00000000:67:02.0 Off |                    0 |
| N/A   46C    P0             131W / 700W |    533MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   2  NVIDIA H100 80GB HBM3          On  | 00000000:69:02.0 Off |                    0 |
| N/A   46C    P0             123W / 700W |    533MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   3  NVIDIA H100 80GB HBM3          On  | 00000000:6B:02.0 Off |                    0 |
| N/A   39C    P0             118W / 700W |    533MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   4  NVIDIA H100 80GB HBM3          On  | 00000000:6F:02.0 Off |                    0 |
| N/A   39C    P0             117W / 700W |    533MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   5  NVIDIA H100 80GB HBM3          On  | 00000000:71:02.0 Off |                    0 |
| N/A   45C    P0             122W / 700W |    533MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   6  NVIDIA H100 80GB HBM3          On  | 00000000:73:02.0 Off |                    0 |
| N/A   46C    P0             127W / 700W |    533MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   7  NVIDIA H100 80GB HBM3          On  | 00000000:75:02.0 Off |                    0 |
| N/A   38C    P0             124W / 700W |    533MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
+---------------------------------------------------------------------------------------+

====================================================================================================
Training DataLoader: total number of tokens: 3200000000 across 32 files
Validation DataLoader: total number of tokens: 100000000 across 1 files
====================================================================================================
step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms
step:1/1480 train_time:23265ms step_avg:nanms
step:2/1480 train_time:23444ms step_avg:nanms
step:3/1480 train_time:23583ms step_avg:nanms
step:4/1480 train_time:23725ms step_avg:nanms
step:5/1480 train_time:23867ms step_avg:nanms
step:6/1480 train_time:24009ms step_avg:nanms
step:7/1480 train_time:24150ms step_avg:nanms
step:8/1480 train_time:24294ms step_avg:nanms
step:9/1480 train_time:24437ms step_avg:nanms
step:10/1480 train_time:24581ms step_avg:nanms
step:11/1480 train_time:143ms step_avg:nanms
step:12/1480 train_time:287ms step_avg:nanms
step:13/1480 train_time:430ms step_avg:143.19ms
step:14/1480 train_time:571ms step_avg:142.73ms
step:15/1480 train_time:713ms step_avg:142.62ms
step:16/1480 train_time:856ms step_avg:142.64ms
step:17/1480 train_time:1000ms step_avg:142.82ms
step:18/1480 train_time:1144ms step_avg:142.99ms
step:19/1480 train_time:1287ms step_avg:142.96ms
step:20/1480 train_time:1430ms step_avg:142.96ms
step:21/1480 train_time:1570ms step_avg:142.74ms
step:22/1480 train_time:1712ms step_avg:142.70ms
step:23/1480 train_time:1854ms step_avg:142.63ms
step:24/1480 train_time:1998ms step_avg:142.68ms
step:25/1480 train_time:2140ms step_avg:142.68ms
step:26/1480 train_time:2284ms step_avg:142.77ms
step:27/1480 train_time:2428ms step_avg:142.83ms
step:28/1480 train_time:2570ms step_avg:142.76ms
step:29/1480 train_time:2711ms step_avg:142.68ms
step:30/1480 train_time:2853ms step_avg:142.63ms
step:31/1480 train_time:2996ms step_avg:142.65ms
step:32/1480 train_time:3139ms step_avg:142.68ms
step:33/1480 train_time:3284ms step_avg:142.76ms
step:34/1480 train_time:3427ms step_avg:142.78ms
step:35/1480 train_time:3569ms step_avg:142.76ms
step:36/1480 train_time:3711ms step_avg:142.73ms
step:37/1480 train_time:3853ms step_avg:142.69ms
step:38/1480 train_time:3995ms step_avg:142.69ms
step:39/1480 train_time:4139ms step_avg:142.73ms
step:40/1480 train_time:4284ms step_avg:142.81ms
step:41/1480 train_time:4429ms step_avg:142.86ms
step:42/1480 train_time:4570ms step_avg:142.80ms
step:43/1480 train_time:4711ms step_avg:142.77ms
step:44/1480 train_time:4852ms step_avg:142.72ms
step:45/1480 train_time:4994ms step_avg:142.67ms
step:46/1480 train_time:5137ms step_avg:142.69ms
step:47/1480 train_time:5282ms step_avg:142.75ms
step:48/1480 train_time:5426ms step_avg:142.80ms
step:49/1480 train_time:5570ms step_avg:142.83ms
step:50/1480 train_time:5713ms step_avg:142.81ms
step:51/1480 train_time:5854ms step_avg:142.78ms
step:52/1480 train_time:5995ms step_avg:142.75ms
step:53/1480 train_time:6138ms step_avg:142.73ms
step:54/1480 train_time:6281ms step_avg:142.76ms
step:55/1480 train_time:6425ms step_avg:142.78ms
step:56/1480 train_time:6569ms step_avg:142.80ms
step:57/1480 train_time:6711ms step_avg:142.79ms
step:58/1480 train_time:6852ms step_avg:142.75ms
step:59/1480 train_time:6993ms step_avg:142.71ms
step:60/1480 train_time:7134ms step_avg:142.68ms
step:61/1480 train_time:7276ms step_avg:142.67ms
step:62/1480 train_time:7419ms step_avg:142.67ms
step:63/1480 train_time:7561ms step_avg:142.66ms
step:64/1480 train_time:7704ms step_avg:142.66ms
step:65/1480 train_time:7847ms step_avg:142.67ms
step:66/1480 train_time:7988ms step_avg:142.65ms
step:67/1480 train_time:8130ms step_avg:142.64ms
step:68/1480 train_time:8273ms step_avg:142.64ms
step:69/1480 train_time:8414ms step_avg:142.61ms
step:70/1480 train_time:8556ms step_avg:142.60ms
step:71/1480 train_time:8699ms step_avg:142.61ms
step:72/1480 train_time:8843ms step_avg:142.62ms
step:73/1480 train_time:8985ms step_avg:142.62ms
step:74/1480 train_time:9128ms step_avg:142.63ms
step:75/1480 train_time:9270ms step_avg:142.62ms
step:76/1480 train_time:9412ms step_avg:142.61ms
step:77/1480 train_time:9554ms step_avg:142.59ms
step:78/1480 train_time:9695ms step_avg:142.58ms
step:79/1480 train_time:9838ms step_avg:142.58ms
step:80/1480 train_time:9981ms step_avg:142.58ms
step:81/1480 train_time:10124ms step_avg:142.60ms
step:82/1480 train_time:10267ms step_avg:142.60ms
step:83/1480 train_time:10411ms step_avg:142.61ms
step:84/1480 train_time:10552ms step_avg:142.60ms
step:85/1480 train_time:10694ms step_avg:142.59ms
step:86/1480 train_time:10835ms step_avg:142.57ms
step:87/1480 train_time:10978ms step_avg:142.58ms
step:88/1480 train_time:11122ms step_avg:142.60ms
step:89/1480 train_time:11264ms step_avg:142.58ms
step:90/1480 train_time:11407ms step_avg:142.59ms
step:91/1480 train_time:11549ms step_avg:142.58ms
step:92/1480 train_time:11691ms step_avg:142.57ms
step:93/1480 train_time:11833ms step_avg:142.57ms
step:94/1480 train_time:11975ms step_avg:142.56ms
step:95/1480 train_time:12119ms step_avg:142.57ms
step:96/1480 train_time:12260ms step_avg:142.56ms
step:97/1480 train_time:12404ms step_avg:142.58ms
step:98/1480 train_time:12548ms step_avg:142.59ms
step:99/1480 train_time:12691ms step_avg:142.59ms
step:100/1480 train_time:12834ms step_avg:142.60ms
step:101/1480 train_time:12974ms step_avg:142.58ms
step:102/1480 train_time:13116ms step_avg:142.57ms
step:103/1480 train_time:13258ms step_avg:142.55ms
step:104/1480 train_time:13400ms step_avg:142.55ms
step:105/1480 train_time:13543ms step_avg:142.56ms
step:106/1480 train_time:13687ms step_avg:142.57ms
step:107/1480 train_time:13830ms step_avg:142.58ms
step:108/1480 train_time:13972ms step_avg:142.57ms
step:109/1480 train_time:14113ms step_avg:142.56ms
step:110/1480 train_time:14254ms step_avg:142.54ms
step:111/1480 train_time:14398ms step_avg:142.55ms
step:112/1480 train_time:14545ms step_avg:142.60ms
step:113/1480 train_time:14692ms step_avg:142.64ms
step:114/1480 train_time:14840ms step_avg:142.69ms
step:115/1480 train_time:14988ms step_avg:142.74ms
step:116/1480 train_time:15134ms step_avg:142.78ms
step:117/1480 train_time:15280ms step_avg:142.81ms
step:118/1480 train_time:15428ms step_avg:142.85ms
step:119/1480 train_time:15575ms step_avg:142.89ms
step:120/1480 train_time:15723ms step_avg:142.93ms
step:121/1480 train_time:15870ms step_avg:142.97ms
step:122/1480 train_time:16015ms step_avg:142.99ms
step:123/1480 train_time:16161ms step_avg:143.02ms
step:124/1480 train_time:16310ms step_avg:143.07ms
step:125/1480 train_time:16455ms step_avg:143.09ms
step:125/1480 val_loss:4.4010 train_time:16513ms step_avg:143.59ms
step:126/1480 train_time:16609ms step_avg:143.18ms
step:127/1480 train_time:16756ms step_avg:143.22ms
step:128/1480 train_time:16903ms step_avg:143.24ms
step:129/1480 train_time:17049ms step_avg:143.27ms
step:130/1480 train_time:17193ms step_avg:143.28ms
step:131/1480 train_time:17340ms step_avg:143.31ms
step:132/1480 train_time:17487ms step_avg:143.34ms
step:133/1480 train_time:17635ms step_avg:143.37ms
step:134/1480 train_time:17784ms step_avg:143.42ms
step:135/1480 train_time:17930ms step_avg:143.44ms
step:136/1480 train_time:18075ms step_avg:143.45ms
step:137/1480 train_time:18222ms step_avg:143.48ms
step:138/1480 train_time:18369ms step_avg:143.51ms
step:139/1480 train_time:18515ms step_avg:143.53ms
step:140/1480 train_time:18664ms step_avg:143.57ms
step:141/1480 train_time:18810ms step_avg:143.59ms
step:142/1480 train_time:18957ms step_avg:143.62ms
step:143/1480 train_time:19105ms step_avg:143.64ms
step:144/1480 train_time:19250ms step_avg:143.66ms
step:145/1480 train_time:19395ms step_avg:143.67ms
step:146/1480 train_time:19544ms step_avg:143.70ms
step:147/1480 train_time:19691ms step_avg:143.73ms
step:148/1480 train_time:19838ms step_avg:143.75ms
step:149/1480 train_time:19986ms step_avg:143.78ms
step:150/1480 train_time:20132ms step_avg:143.80ms
step:151/1480 train_time:20280ms step_avg:143.83ms
step:152/1480 train_time:20427ms step_avg:143.85ms
step:153/1480 train_time:20573ms step_avg:143.86ms
step:154/1480 train_time:20720ms step_avg:143.89ms
step:155/1480 train_time:20867ms step_avg:143.91ms
step:156/1480 train_time:21013ms step_avg:143.93ms
step:157/1480 train_time:21160ms step_avg:143.94ms
step:158/1480 train_time:21307ms step_avg:143.97ms
step:159/1480 train_time:21453ms step_avg:143.98ms
step:160/1480 train_time:21599ms step_avg:143.99ms
step:161/1480 train_time:21747ms step_avg:144.02ms
step:162/1480 train_time:21892ms step_avg:144.03ms
step:163/1480 train_time:22039ms step_avg:144.05ms
step:164/1480 train_time:22187ms step_avg:144.07ms
step:165/1480 train_time:22333ms step_avg:144.09ms
step:166/1480 train_time:22482ms step_avg:144.11ms
step:167/1480 train_time:22628ms step_avg:144.13ms
step:168/1480 train_time:22774ms step_avg:144.14ms
step:169/1480 train_time:22922ms step_avg:144.16ms
step:170/1480 train_time:23069ms step_avg:144.18ms
step:171/1480 train_time:23214ms step_avg:144.18ms
step:172/1480 train_time:23362ms step_avg:144.21ms
step:173/1480 train_time:23509ms step_avg:144.22ms
step:174/1480 train_time:23655ms step_avg:144.24ms
step:175/1480 train_time:23803ms step_avg:144.26ms
step:176/1480 train_time:23950ms step_avg:144.28ms
step:177/1480 train_time:24096ms step_avg:144.29ms
step:178/1480 train_time:24244ms step_avg:144.31ms
step:179/1480 train_time:24391ms step_avg:144.32ms
step:180/1480 train_time:24537ms step_avg:144.33ms
step:181/1480 train_time:24684ms step_avg:144.35ms
step:182/1480 train_time:24831ms step_avg:144.36ms
step:183/1480 train_time:24978ms step_avg:144.38ms
step:184/1480 train_time:25125ms step_avg:144.40ms
step:185/1480 train_time:25272ms step_avg:144.41ms
step:186/1480 train_time:25421ms step_avg:144.44ms
step:187/1480 train_time:25568ms step_avg:144.45ms
step:188/1480 train_time:25714ms step_avg:144.46ms
step:189/1480 train_time:25861ms step_avg:144.47ms
step:190/1480 train_time:26008ms step_avg:144.49ms
step:191/1480 train_time:26152ms step_avg:144.49ms
step:192/1480 train_time:26299ms step_avg:144.50ms
step:193/1480 train_time:26447ms step_avg:144.52ms
step:194/1480 train_time:26592ms step_avg:144.52ms
step:195/1480 train_time:26739ms step_avg:144.54ms
step:196/1480 train_time:26887ms step_avg:144.55ms
step:197/1480 train_time:27032ms step_avg:144.56ms
step:198/1480 train_time:27179ms step_avg:144.57ms
step:199/1480 train_time:27326ms step_avg:144.58ms
step:200/1480 train_time:27473ms step_avg:144.60ms
step:201/1480 train_time:27622ms step_avg:144.62ms
step:202/1480 train_time:27770ms step_avg:144.64ms
step:203/1480 train_time:27916ms step_avg:144.64ms
step:204/1480 train_time:28064ms step_avg:144.66ms
step:205/1480 train_time:28210ms step_avg:144.67ms
step:206/1480 train_time:28356ms step_avg:144.67ms
step:207/1480 train_time:28503ms step_avg:144.69ms
step:208/1480 train_time:28650ms step_avg:144.70ms
step:209/1480 train_time:28796ms step_avg:144.70ms
step:210/1480 train_time:28943ms step_avg:144.72ms
step:211/1480 train_time:29091ms step_avg:144.73ms
step:212/1480 train_time:29236ms step_avg:144.73ms
step:213/1480 train_time:29385ms step_avg:144.75ms
step:214/1480 train_time:29531ms step_avg:144.76ms
step:215/1480 train_time:29679ms step_avg:144.77ms
step:216/1480 train_time:29825ms step_avg:144.78ms
step:217/1480 train_time:29972ms step_avg:144.79ms
step:218/1480 train_time:30121ms step_avg:144.81ms
step:219/1480 train_time:30268ms step_avg:144.82ms
step:220/1480 train_time:30414ms step_avg:144.83ms
step:221/1480 train_time:30564ms step_avg:144.86ms
step:222/1480 train_time:30715ms step_avg:144.88ms
step:223/1480 train_time:30865ms step_avg:144.91ms
step:224/1480 train_time:31016ms step_avg:144.93ms
step:225/1480 train_time:31166ms step_avg:144.96ms
step:226/1480 train_time:31315ms step_avg:144.98ms
step:227/1480 train_time:31466ms step_avg:145.01ms
step:228/1480 train_time:31615ms step_avg:145.02ms
step:229/1480 train_time:31766ms step_avg:145.05ms
step:230/1480 train_time:31915ms step_avg:145.07ms
step:231/1480 train_time:32067ms step_avg:145.10ms
step:232/1480 train_time:32216ms step_avg:145.12ms
step:233/1480 train_time:32367ms step_avg:145.14ms
step:234/1480 train_time:32517ms step_avg:145.16ms
step:235/1480 train_time:32669ms step_avg:145.19ms
step:236/1480 train_time:32819ms step_avg:145.22ms
step:237/1480 train_time:32970ms step_avg:145.24ms
step:238/1480 train_time:33120ms step_avg:145.26ms
step:239/1480 train_time:33271ms step_avg:145.29ms
step:240/1480 train_time:33422ms step_avg:145.31ms
step:241/1480 train_time:33573ms step_avg:145.34ms
step:242/1480 train_time:33724ms step_avg:145.36ms
step:243/1480 train_time:33873ms step_avg:145.38ms
step:244/1480 train_time:34023ms step_avg:145.40ms
step:245/1480 train_time:34174ms step_avg:145.42ms
step:246/1480 train_time:34324ms step_avg:145.44ms
step:247/1480 train_time:34474ms step_avg:145.46ms
step:248/1480 train_time:34624ms step_avg:145.48ms
step:249/1480 train_time:34774ms step_avg:145.50ms
step:250/1480 train_time:34924ms step_avg:145.52ms
step:250/1480 val_loss:3.9879 train_time:34983ms step_avg:145.76ms
step:251/1480 train_time:35080ms step_avg:145.56ms
step:252/1480 train_time:35232ms step_avg:145.59ms
step:253/1480 train_time:35383ms step_avg:145.61ms
step:254/1480 train_time:35532ms step_avg:145.62ms
step:255/1480 train_time:35683ms step_avg:145.64ms
step:256/1480 train_time:35832ms step_avg:145.66ms
step:257/1480 train_time:35983ms step_avg:145.68ms
step:258/1480 train_time:36134ms step_avg:145.70ms
step:259/1480 train_time:36286ms step_avg:145.73ms
step:260/1480 train_time:36436ms step_avg:145.74ms
step:261/1480 train_time:36587ms step_avg:145.76ms
step:262/1480 train_time:36736ms step_avg:145.78ms
step:263/1480 train_time:36888ms step_avg:145.80ms
step:264/1480 train_time:37039ms step_avg:145.82ms
step:265/1480 train_time:37189ms step_avg:145.84ms
step:266/1480 train_time:37341ms step_avg:145.86ms
step:267/1480 train_time:37491ms step_avg:145.88ms
step:268/1480 train_time:37643ms step_avg:145.90ms
step:269/1480 train_time:37793ms step_avg:145.92ms
step:270/1480 train_time:37943ms step_avg:145.93ms
step:271/1480 train_time:38091ms step_avg:145.94ms
step:272/1480 train_time:38241ms step_avg:145.96ms
step:273/1480 train_time:38393ms step_avg:145.98ms
step:274/1480 train_time:38542ms step_avg:145.99ms
step:275/1480 train_time:38693ms step_avg:146.01ms
step:276/1480 train_time:38843ms step_avg:146.03ms
step:277/1480 train_time:38992ms step_avg:146.04ms
step:278/1480 train_time:39142ms step_avg:146.05ms
step:279/1480 train_time:39293ms step_avg:146.07ms
step:280/1480 train_time:39443ms step_avg:146.09ms
step:281/1480 train_time:39593ms step_avg:146.10ms
step:282/1480 train_time:39744ms step_avg:146.12ms
step:283/1480 train_time:39894ms step_avg:146.13ms
step:284/1480 train_time:40046ms step_avg:146.15ms
step:285/1480 train_time:40196ms step_avg:146.17ms
step:286/1480 train_time:40346ms step_avg:146.18ms
step:287/1480 train_time:40498ms step_avg:146.20ms
step:288/1480 train_time:40648ms step_avg:146.22ms
step:289/1480 train_time:40797ms step_avg:146.23ms
step:290/1480 train_time:40948ms step_avg:146.24ms
step:291/1480 train_time:41098ms step_avg:146.26ms
step:292/1480 train_time:41247ms step_avg:146.27ms
step:293/1480 train_time:41398ms step_avg:146.28ms
step:294/1480 train_time:41549ms step_avg:146.30ms
step:295/1480 train_time:41699ms step_avg:146.31ms
step:296/1480 train_time:41850ms step_avg:146.33ms
step:297/1480 train_time:42001ms step_avg:146.34ms
step:298/1480 train_time:42152ms step_avg:146.36ms
step:299/1480 train_time:42303ms step_avg:146.38ms
step:300/1480 train_time:42454ms step_avg:146.39ms
step:301/1480 train_time:42605ms step_avg:146.41ms
step:302/1480 train_time:42755ms step_avg:146.42ms
step:303/1480 train_time:42906ms step_avg:146.44ms
step:304/1480 train_time:43055ms step_avg:146.45ms
step:305/1480 train_time:43206ms step_avg:146.46ms
step:306/1480 train_time:43357ms step_avg:146.48ms
step:307/1480 train_time:43508ms step_avg:146.49ms
step:308/1480 train_time:43658ms step_avg:146.50ms
step:309/1480 train_time:43809ms step_avg:146.52ms
step:310/1480 train_time:43959ms step_avg:146.53ms
step:311/1480 train_time:44110ms step_avg:146.54ms
step:312/1480 train_time:44261ms step_avg:146.56ms
step:313/1480 train_time:44411ms step_avg:146.57ms
step:314/1480 train_time:44561ms step_avg:146.58ms
step:315/1480 train_time:44711ms step_avg:146.59ms
step:316/1480 train_time:44862ms step_avg:146.61ms
step:317/1480 train_time:45012ms step_avg:146.62ms
step:318/1480 train_time:45163ms step_avg:146.63ms
step:319/1480 train_time:45313ms step_avg:146.64ms
step:320/1480 train_time:45465ms step_avg:146.66ms
step:321/1480 train_time:45614ms step_avg:146.67ms
step:322/1480 train_time:45766ms step_avg:146.68ms
step:323/1480 train_time:45915ms step_avg:146.69ms
step:324/1480 train_time:46066ms step_avg:146.71ms
step:325/1480 train_time:46216ms step_avg:146.72ms
step:326/1480 train_time:46367ms step_avg:146.73ms
step:327/1480 train_time:46518ms step_avg:146.74ms
step:328/1480 train_time:46668ms step_avg:146.75ms
step:329/1480 train_time:46817ms step_avg:146.76ms
step:330/1480 train_time:46971ms step_avg:146.78ms
step:331/1480 train_time:47124ms step_avg:146.80ms
step:332/1480 train_time:47277ms step_avg:146.82ms
step:333/1480 train_time:47431ms step_avg:146.85ms
step:334/1480 train_time:47584ms step_avg:146.87ms
step:335/1480 train_time:47737ms step_avg:146.88ms
step:336/1480 train_time:47890ms step_avg:146.90ms
step:337/1480 train_time:48045ms step_avg:146.93ms
step:338/1480 train_time:48200ms step_avg:146.95ms
step:339/1480 train_time:48354ms step_avg:146.97ms
step:340/1480 train_time:48507ms step_avg:146.99ms
step:341/1480 train_time:48661ms step_avg:147.01ms
step:342/1480 train_time:48814ms step_avg:147.03ms
step:343/1480 train_time:48969ms step_avg:147.05ms
step:344/1480 train_time:49123ms step_avg:147.07ms
step:345/1480 train_time:49279ms step_avg:147.10ms
step:346/1480 train_time:49433ms step_avg:147.12ms
step:347/1480 train_time:49587ms step_avg:147.14ms
step:348/1480 train_time:49743ms step_avg:147.17ms
step:349/1480 train_time:49895ms step_avg:147.18ms
step:350/1480 train_time:50048ms step_avg:147.20ms
step:351/1480 train_time:50203ms step_avg:147.22ms
step:352/1480 train_time:50358ms step_avg:147.24ms
step:353/1480 train_time:50511ms step_avg:147.26ms
step:354/1480 train_time:50665ms step_avg:147.28ms
step:355/1480 train_time:50819ms step_avg:147.30ms
step:356/1480 train_time:50973ms step_avg:147.32ms
step:357/1480 train_time:51127ms step_avg:147.34ms
step:358/1480 train_time:51280ms step_avg:147.36ms
step:359/1480 train_time:51435ms step_avg:147.38ms
step:360/1480 train_time:51591ms step_avg:147.40ms
step:361/1480 train_time:51746ms step_avg:147.42ms
step:362/1480 train_time:51901ms step_avg:147.45ms
step:363/1480 train_time:52054ms step_avg:147.46ms
step:364/1480 train_time:52207ms step_avg:147.48ms
step:365/1480 train_time:52363ms step_avg:147.50ms
step:366/1480 train_time:52517ms step_avg:147.52ms
step:367/1480 train_time:52671ms step_avg:147.54ms
step:368/1480 train_time:52824ms step_avg:147.55ms
step:369/1480 train_time:52979ms step_avg:147.57ms
step:370/1480 train_time:53132ms step_avg:147.59ms
step:371/1480 train_time:53285ms step_avg:147.60ms
step:372/1480 train_time:53439ms step_avg:147.62ms
step:373/1480 train_time:53593ms step_avg:147.64ms
step:374/1480 train_time:53745ms step_avg:147.65ms
step:375/1480 train_time:53898ms step_avg:147.67ms
step:375/1480 val_loss:3.8026 train_time:53958ms step_avg:147.83ms
step:376/1480 train_time:54057ms step_avg:147.70ms
step:377/1480 train_time:54212ms step_avg:147.72ms
step:378/1480 train_time:54365ms step_avg:147.73ms
step:379/1480 train_time:54517ms step_avg:147.74ms
step:380/1480 train_time:54669ms step_avg:147.75ms
step:381/1480 train_time:54820ms step_avg:147.76ms
step:382/1480 train_time:54974ms step_avg:147.78ms
step:383/1480 train_time:55128ms step_avg:147.80ms
step:384/1480 train_time:55284ms step_avg:147.82ms
step:385/1480 train_time:55437ms step_avg:147.83ms
step:386/1480 train_time:55591ms step_avg:147.85ms
step:387/1480 train_time:55743ms step_avg:147.86ms
step:388/1480 train_time:55897ms step_avg:147.88ms
step:389/1480 train_time:56050ms step_avg:147.89ms
step:390/1480 train_time:56204ms step_avg:147.91ms
step:391/1480 train_time:56358ms step_avg:147.92ms
step:392/1480 train_time:56511ms step_avg:147.94ms
step:393/1480 train_time:56665ms step_avg:147.95ms
step:394/1480 train_time:56819ms step_avg:147.97ms
step:395/1480 train_time:56972ms step_avg:147.98ms
step:396/1480 train_time:57125ms step_avg:147.99ms
step:397/1480 train_time:57278ms step_avg:148.01ms
step:398/1480 train_time:57432ms step_avg:148.02ms
step:399/1480 train_time:57585ms step_avg:148.03ms
step:400/1480 train_time:57739ms step_avg:148.05ms
step:401/1480 train_time:57893ms step_avg:148.06ms
step:402/1480 train_time:58047ms step_avg:148.08ms
step:403/1480 train_time:58202ms step_avg:148.10ms
step:404/1480 train_time:58356ms step_avg:148.11ms
step:405/1480 train_time:58509ms step_avg:148.12ms
step:406/1480 train_time:58663ms step_avg:148.14ms
step:407/1480 train_time:58817ms step_avg:148.15ms
step:408/1480 train_time:58972ms step_avg:148.17ms
step:409/1480 train_time:59126ms step_avg:148.19ms
step:410/1480 train_time:59281ms step_avg:148.20ms
step:411/1480 train_time:59435ms step_avg:148.22ms
step:412/1480 train_time:59588ms step_avg:148.23ms
step:413/1480 train_time:59742ms step_avg:148.24ms
step:414/1480 train_time:59896ms step_avg:148.26ms
step:415/1480 train_time:60049ms step_avg:148.27ms
step:416/1480 train_time:60203ms step_avg:148.28ms
step:417/1480 train_time:60358ms step_avg:148.30ms
step:418/1480 train_time:60511ms step_avg:148.31ms
step:419/1480 train_time:60665ms step_avg:148.32ms
step:420/1480 train_time:60818ms step_avg:148.34ms
step:421/1480 train_time:60972ms step_avg:148.35ms
step:422/1480 train_time:61126ms step_avg:148.36ms
step:423/1480 train_time:61280ms step_avg:148.38ms
step:424/1480 train_time:61433ms step_avg:148.39ms
step:425/1480 train_time:61587ms step_avg:148.40ms
step:426/1480 train_time:61742ms step_avg:148.42ms
step:427/1480 train_time:61896ms step_avg:148.43ms
step:428/1480 train_time:62049ms step_avg:148.44ms
step:429/1480 train_time:62203ms step_avg:148.46ms
step:430/1480 train_time:62356ms step_avg:148.47ms
step:431/1480 train_time:62510ms step_avg:148.48ms
step:432/1480 train_time:62664ms step_avg:148.49ms
step:433/1480 train_time:62818ms step_avg:148.51ms
step:434/1480 train_time:62971ms step_avg:148.52ms
step:435/1480 train_time:63125ms step_avg:148.53ms
step:436/1480 train_time:63280ms step_avg:148.54ms
step:437/1480 train_time:63434ms step_avg:148.56ms
step:438/1480 train_time:63589ms step_avg:148.57ms
step:439/1480 train_time:63743ms step_avg:148.58ms
step:440/1480 train_time:63898ms step_avg:148.60ms
step:441/1480 train_time:64054ms step_avg:148.62ms
step:442/1480 train_time:64211ms step_avg:148.64ms
step:443/1480 train_time:64368ms step_avg:148.65ms
step:444/1480 train_time:64524ms step_avg:148.67ms
step:445/1480 train_time:64681ms step_avg:148.69ms
step:446/1480 train_time:64837ms step_avg:148.71ms
step:447/1480 train_time:64993ms step_avg:148.72ms
step:448/1480 train_time:65148ms step_avg:148.74ms
step:449/1480 train_time:65307ms step_avg:148.76ms
step:450/1480 train_time:65465ms step_avg:148.78ms
step:451/1480 train_time:65623ms step_avg:148.80ms
step:452/1480 train_time:65779ms step_avg:148.82ms
step:453/1480 train_time:65935ms step_avg:148.84ms
step:454/1480 train_time:66091ms step_avg:148.85ms
step:455/1480 train_time:66248ms step_avg:148.87ms
step:456/1480 train_time:66405ms step_avg:148.89ms
step:457/1480 train_time:66561ms step_avg:148.91ms
step:458/1480 train_time:66716ms step_avg:148.92ms
step:459/1480 train_time:66874ms step_avg:148.94ms
step:460/1480 train_time:67030ms step_avg:148.96ms
step:461/1480 train_time:67190ms step_avg:148.98ms
step:462/1480 train_time:67347ms step_avg:149.00ms
step:463/1480 train_time:67505ms step_avg:149.02ms
step:464/1480 train_time:67661ms step_avg:149.03ms
step:465/1480 train_time:67816ms step_avg:149.05ms
step:466/1480 train_time:67974ms step_avg:149.07ms
step:467/1480 train_time:68132ms step_avg:149.09ms
step:468/1480 train_time:68289ms step_avg:149.10ms
step:469/1480 train_time:68445ms step_avg:149.12ms
step:470/1480 train_time:68602ms step_avg:149.13ms
step:471/1480 train_time:68758ms step_avg:149.15ms
step:472/1480 train_time:68914ms step_avg:149.16ms
step:473/1480 train_time:69071ms step_avg:149.18ms
step:474/1480 train_time:69227ms step_avg:149.20ms
step:475/1480 train_time:69384ms step_avg:149.21ms
step:476/1480 train_time:69541ms step_avg:149.23ms
step:477/1480 train_time:69700ms step_avg:149.25ms
step:478/1480 train_time:69856ms step_avg:149.27ms
step:479/1480 train_time:70012ms step_avg:149.28ms
step:480/1480 train_time:70170ms step_avg:149.30ms
step:481/1480 train_time:70326ms step_avg:149.31ms
step:482/1480 train_time:70484ms step_avg:149.33ms
step:483/1480 train_time:70641ms step_avg:149.35ms
step:484/1480 train_time:70800ms step_avg:149.37ms
step:485/1480 train_time:70957ms step_avg:149.38ms
step:486/1480 train_time:71114ms step_avg:149.40ms
step:487/1480 train_time:71271ms step_avg:149.42ms
step:488/1480 train_time:71427ms step_avg:149.43ms
step:489/1480 train_time:71584ms step_avg:149.44ms
step:490/1480 train_time:71740ms step_avg:149.46ms
step:491/1480 train_time:71897ms step_avg:149.47ms
step:492/1480 train_time:72054ms step_avg:149.49ms
step:493/1480 train_time:72212ms step_avg:149.51ms
step:494/1480 train_time:72369ms step_avg:149.52ms
step:495/1480 train_time:72527ms step_avg:149.54ms
step:496/1480 train_time:72685ms step_avg:149.56ms
step:497/1480 train_time:72844ms step_avg:149.58ms
step:498/1480 train_time:73002ms step_avg:149.59ms
step:499/1480 train_time:73159ms step_avg:149.61ms
step:500/1480 train_time:73317ms step_avg:149.63ms
step:500/1480 val_loss:3.6821 train_time:73380ms step_avg:149.76ms
step:501/1480 train_time:73478ms step_avg:149.65ms
step:502/1480 train_time:73636ms step_avg:149.67ms
step:503/1480 train_time:73792ms step_avg:149.68ms
step:504/1480 train_time:73948ms step_avg:149.69ms
step:505/1480 train_time:74104ms step_avg:149.70ms
step:506/1480 train_time:74260ms step_avg:149.72ms
step:507/1480 train_time:74415ms step_avg:149.73ms
step:508/1480 train_time:74573ms step_avg:149.75ms
step:509/1480 train_time:74730ms step_avg:149.76ms
step:510/1480 train_time:74887ms step_avg:149.77ms
step:511/1480 train_time:75044ms step_avg:149.79ms
step:512/1480 train_time:75201ms step_avg:149.80ms
step:513/1480 train_time:75358ms step_avg:149.82ms
step:514/1480 train_time:75516ms step_avg:149.83ms
step:515/1480 train_time:75673ms step_avg:149.85ms
step:516/1480 train_time:75831ms step_avg:149.86ms
step:517/1480 train_time:75989ms step_avg:149.88ms
step:518/1480 train_time:76146ms step_avg:149.89ms
step:519/1480 train_time:76304ms step_avg:149.91ms
step:520/1480 train_time:76462ms step_avg:149.92ms
step:521/1480 train_time:76618ms step_avg:149.94ms
step:522/1480 train_time:76774ms step_avg:149.95ms
step:523/1480 train_time:76930ms step_avg:149.96ms
step:524/1480 train_time:77088ms step_avg:149.98ms
step:525/1480 train_time:77246ms step_avg:149.99ms
step:526/1480 train_time:77405ms step_avg:150.01ms
step:527/1480 train_time:77562ms step_avg:150.02ms
step:528/1480 train_time:77718ms step_avg:150.04ms
step:529/1480 train_time:77877ms step_avg:150.05ms
step:530/1480 train_time:78033ms step_avg:150.06ms
step:531/1480 train_time:78191ms step_avg:150.08ms
step:532/1480 train_time:78348ms step_avg:150.09ms
step:533/1480 train_time:78506ms step_avg:150.11ms
step:534/1480 train_time:78663ms step_avg:150.12ms
step:535/1480 train_time:78820ms step_avg:150.13ms
step:536/1480 train_time:78978ms step_avg:150.15ms
step:537/1480 train_time:79135ms step_avg:150.16ms
step:538/1480 train_time:79292ms step_avg:150.17ms
step:539/1480 train_time:79451ms step_avg:150.19ms
step:540/1480 train_time:79609ms step_avg:150.20ms
step:541/1480 train_time:79765ms step_avg:150.22ms
step:542/1480 train_time:79921ms step_avg:150.23ms
step:543/1480 train_time:80076ms step_avg:150.24ms
step:544/1480 train_time:80231ms step_avg:150.25ms
step:545/1480 train_time:80389ms step_avg:150.26ms
step:546/1480 train_time:80546ms step_avg:150.27ms
step:547/1480 train_time:80704ms step_avg:150.29ms
step:548/1480 train_time:80863ms step_avg:150.30ms
step:549/1480 train_time:81020ms step_avg:150.32ms
step:550/1480 train_time:81178ms step_avg:150.33ms
step:551/1480 train_time:81335ms step_avg:150.34ms
step:552/1480 train_time:81494ms step_avg:150.36ms
step:553/1480 train_time:81653ms step_avg:150.37ms
step:554/1480 train_time:81812ms step_avg:150.39ms
step:555/1480 train_time:81972ms step_avg:150.41ms
step:556/1480 train_time:82131ms step_avg:150.42ms
step:557/1480 train_time:82292ms step_avg:150.44ms
step:558/1480 train_time:82452ms step_avg:150.46ms
step:559/1480 train_time:82611ms step_avg:150.47ms
step:560/1480 train_time:82770ms step_avg:150.49ms
step:561/1480 train_time:82930ms step_avg:150.51ms
step:562/1480 train_time:83090ms step_avg:150.52ms
step:563/1480 train_time:83250ms step_avg:150.54ms
step:564/1480 train_time:83410ms step_avg:150.56ms
step:565/1480 train_time:83570ms step_avg:150.58ms
step:566/1480 train_time:83731ms step_avg:150.60ms
step:567/1480 train_time:83891ms step_avg:150.61ms
step:568/1480 train_time:84050ms step_avg:150.63ms
step:569/1480 train_time:84209ms step_avg:150.64ms
step:570/1480 train_time:84370ms step_avg:150.66ms
step:571/1480 train_time:84530ms step_avg:150.68ms
step:572/1480 train_time:84691ms step_avg:150.70ms
step:573/1480 train_time:84852ms step_avg:150.71ms
step:574/1480 train_time:85011ms step_avg:150.73ms
step:575/1480 train_time:85172ms step_avg:150.75ms
step:576/1480 train_time:85330ms step_avg:150.76ms
step:577/1480 train_time:85491ms step_avg:150.78ms
step:578/1480 train_time:85650ms step_avg:150.79ms
step:579/1480 train_time:85810ms step_avg:150.81ms
step:580/1480 train_time:85970ms step_avg:150.82ms
step:581/1480 train_time:86130ms step_avg:150.84ms
step:582/1480 train_time:86290ms step_avg:150.86ms
step:583/1480 train_time:86450ms step_avg:150.87ms
step:584/1480 train_time:86610ms step_avg:150.89ms
step:585/1480 train_time:86770ms step_avg:150.90ms
step:586/1480 train_time:86930ms step_avg:150.92ms
step:587/1480 train_time:87091ms step_avg:150.94ms
step:588/1480 train_time:87250ms step_avg:150.95ms
step:589/1480 train_time:87409ms step_avg:150.97ms
step:590/1480 train_time:87570ms step_avg:150.98ms
step:591/1480 train_time:87730ms step_avg:151.00ms
step:592/1480 train_time:87890ms step_avg:151.01ms
step:593/1480 train_time:88052ms step_avg:151.03ms
step:594/1480 train_time:88212ms step_avg:151.05ms
step:595/1480 train_time:88373ms step_avg:151.07ms
step:596/1480 train_time:88534ms step_avg:151.08ms
step:597/1480 train_time:88693ms step_avg:151.10ms
step:598/1480 train_time:88852ms step_avg:151.11ms
step:599/1480 train_time:89010ms step_avg:151.12ms
step:600/1480 train_time:89170ms step_avg:151.14ms
step:601/1480 train_time:89329ms step_avg:151.15ms
step:602/1480 train_time:89490ms step_avg:151.16ms
step:603/1480 train_time:89651ms step_avg:151.18ms
step:604/1480 train_time:89810ms step_avg:151.20ms
step:605/1480 train_time:89970ms step_avg:151.21ms
step:606/1480 train_time:90132ms step_avg:151.23ms
step:607/1480 train_time:90294ms step_avg:151.25ms
step:608/1480 train_time:90454ms step_avg:151.26ms
step:609/1480 train_time:90612ms step_avg:151.27ms
step:610/1480 train_time:90771ms step_avg:151.28ms
step:611/1480 train_time:90931ms step_avg:151.30ms
step:612/1480 train_time:91092ms step_avg:151.32ms
step:613/1480 train_time:91252ms step_avg:151.33ms
step:614/1480 train_time:91411ms step_avg:151.34ms
step:615/1480 train_time:91571ms step_avg:151.36ms
step:616/1480 train_time:91730ms step_avg:151.37ms
step:617/1480 train_time:91889ms step_avg:151.38ms
step:618/1480 train_time:92049ms step_avg:151.40ms
step:619/1480 train_time:92208ms step_avg:151.41ms
step:620/1480 train_time:92369ms step_avg:151.42ms
step:621/1480 train_time:92531ms step_avg:151.44ms
step:622/1480 train_time:92691ms step_avg:151.46ms
step:623/1480 train_time:92851ms step_avg:151.47ms
step:624/1480 train_time:93011ms step_avg:151.48ms
step:625/1480 train_time:93170ms step_avg:151.50ms
step:625/1480 val_loss:3.6023 train_time:93233ms step_avg:151.60ms
step:626/1480 train_time:93331ms step_avg:151.51ms
step:627/1480 train_time:93490ms step_avg:151.52ms
step:628/1480 train_time:93648ms step_avg:151.53ms
step:629/1480 train_time:93807ms step_avg:151.55ms
step:630/1480 train_time:93967ms step_avg:151.56ms
step:631/1480 train_time:94125ms step_avg:151.57ms
step:632/1480 train_time:94284ms step_avg:151.58ms
step:633/1480 train_time:94444ms step_avg:151.60ms
step:634/1480 train_time:94605ms step_avg:151.61ms
step:635/1480 train_time:94765ms step_avg:151.62ms
step:636/1480 train_time:94924ms step_avg:151.64ms
step:637/1480 train_time:95084ms step_avg:151.65ms
step:638/1480 train_time:95245ms step_avg:151.66ms
step:639/1480 train_time:95404ms step_avg:151.68ms
step:640/1480 train_time:95565ms step_avg:151.69ms
step:641/1480 train_time:95725ms step_avg:151.70ms
step:642/1480 train_time:95884ms step_avg:151.72ms
step:643/1480 train_time:96044ms step_avg:151.73ms
step:644/1480 train_time:96203ms step_avg:151.74ms
step:645/1480 train_time:96361ms step_avg:151.75ms
step:646/1480 train_time:96520ms step_avg:151.76ms
step:647/1480 train_time:96678ms step_avg:151.77ms
step:648/1480 train_time:96840ms step_avg:151.79ms
step:649/1480 train_time:97001ms step_avg:151.80ms
step:650/1480 train_time:97163ms step_avg:151.82ms
step:651/1480 train_time:97323ms step_avg:151.83ms
step:652/1480 train_time:97484ms step_avg:151.84ms
step:653/1480 train_time:97644ms step_avg:151.86ms
step:654/1480 train_time:97804ms step_avg:151.87ms
step:655/1480 train_time:97965ms step_avg:151.88ms
step:656/1480 train_time:98125ms step_avg:151.90ms
step:657/1480 train_time:98284ms step_avg:151.91ms
step:658/1480 train_time:98445ms step_avg:151.92ms
step:659/1480 train_time:98608ms step_avg:151.94ms
step:660/1480 train_time:98770ms step_avg:151.95ms
step:661/1480 train_time:98932ms step_avg:151.97ms
step:662/1480 train_time:99091ms step_avg:151.98ms
step:663/1480 train_time:99251ms step_avg:151.99ms
step:664/1480 train_time:99413ms step_avg:152.01ms
step:665/1480 train_time:99575ms step_avg:152.02ms
step:666/1480 train_time:99734ms step_avg:152.03ms
step:667/1480 train_time:99895ms step_avg:152.05ms
step:668/1480 train_time:100058ms step_avg:152.06ms
step:669/1480 train_time:100221ms step_avg:152.08ms
step:670/1480 train_time:100382ms step_avg:152.09ms
step:671/1480 train_time:100542ms step_avg:152.11ms
step:672/1480 train_time:100705ms step_avg:152.12ms
step:673/1480 train_time:100868ms step_avg:152.14ms
step:674/1480 train_time:101030ms step_avg:152.15ms
step:675/1480 train_time:101192ms step_avg:152.17ms
step:676/1480 train_time:101354ms step_avg:152.18ms
step:677/1480 train_time:101514ms step_avg:152.20ms
step:678/1480 train_time:101674ms step_avg:152.21ms
step:679/1480 train_time:101836ms step_avg:152.22ms
step:680/1480 train_time:101997ms step_avg:152.23ms
step:681/1480 train_time:102159ms step_avg:152.25ms
step:682/1480 train_time:102321ms step_avg:152.26ms
step:683/1480 train_time:102483ms step_avg:152.28ms
step:684/1480 train_time:102645ms step_avg:152.29ms
step:685/1480 train_time:102808ms step_avg:152.31ms
step:686/1480 train_time:102969ms step_avg:152.32ms
step:687/1480 train_time:103129ms step_avg:152.33ms
step:688/1480 train_time:103293ms step_avg:152.35ms
step:689/1480 train_time:103455ms step_avg:152.36ms
step:690/1480 train_time:103620ms step_avg:152.38ms
step:691/1480 train_time:103782ms step_avg:152.40ms
step:692/1480 train_time:103944ms step_avg:152.41ms
step:693/1480 train_time:104107ms step_avg:152.43ms
step:694/1480 train_time:104269ms step_avg:152.44ms
step:695/1480 train_time:104430ms step_avg:152.45ms
step:696/1480 train_time:104590ms step_avg:152.46ms
step:697/1480 train_time:104753ms step_avg:152.48ms
step:698/1480 train_time:104912ms step_avg:152.49ms
step:699/1480 train_time:105074ms step_avg:152.50ms
step:700/1480 train_time:105235ms step_avg:152.52ms
step:701/1480 train_time:105394ms step_avg:152.52ms
step:702/1480 train_time:105555ms step_avg:152.54ms
step:703/1480 train_time:105716ms step_avg:152.55ms
step:704/1480 train_time:105876ms step_avg:152.56ms
step:705/1480 train_time:106041ms step_avg:152.58ms
step:706/1480 train_time:106204ms step_avg:152.59ms
step:707/1480 train_time:106366ms step_avg:152.60ms
step:708/1480 train_time:106527ms step_avg:152.62ms
step:709/1480 train_time:106689ms step_avg:152.63ms
step:710/1480 train_time:106849ms step_avg:152.64ms
step:711/1480 train_time:107010ms step_avg:152.65ms
step:712/1480 train_time:107176ms step_avg:152.67ms
step:713/1480 train_time:107339ms step_avg:152.69ms
step:714/1480 train_time:107500ms step_avg:152.70ms
step:715/1480 train_time:107661ms step_avg:152.71ms
step:716/1480 train_time:107826ms step_avg:152.73ms
step:717/1480 train_time:107986ms step_avg:152.74ms
step:718/1480 train_time:108148ms step_avg:152.75ms
step:719/1480 train_time:108307ms step_avg:152.76ms
step:720/1480 train_time:108470ms step_avg:152.77ms
step:721/1480 train_time:108631ms step_avg:152.79ms
step:722/1480 train_time:108792ms step_avg:152.80ms
step:723/1480 train_time:108952ms step_avg:152.81ms
step:724/1480 train_time:109114ms step_avg:152.82ms
step:725/1480 train_time:109278ms step_avg:152.84ms
step:726/1480 train_time:109443ms step_avg:152.85ms
step:727/1480 train_time:109606ms step_avg:152.87ms
step:728/1480 train_time:109767ms step_avg:152.88ms
step:729/1480 train_time:109928ms step_avg:152.89ms
step:730/1480 train_time:110091ms step_avg:152.90ms
step:731/1480 train_time:110252ms step_avg:152.92ms
step:732/1480 train_time:110412ms step_avg:152.93ms
step:733/1480 train_time:110573ms step_avg:152.94ms
step:734/1480 train_time:110735ms step_avg:152.95ms
step:735/1480 train_time:110895ms step_avg:152.96ms
step:736/1480 train_time:111057ms step_avg:152.97ms
step:737/1480 train_time:111217ms step_avg:152.98ms
step:738/1480 train_time:111379ms step_avg:152.99ms
step:739/1480 train_time:111540ms step_avg:153.00ms
step:740/1480 train_time:111705ms step_avg:153.02ms
step:741/1480 train_time:111870ms step_avg:153.04ms
step:742/1480 train_time:112031ms step_avg:153.05ms
step:743/1480 train_time:112191ms step_avg:153.06ms
step:744/1480 train_time:112355ms step_avg:153.07ms
step:745/1480 train_time:112519ms step_avg:153.09ms
step:746/1480 train_time:112679ms step_avg:153.10ms
step:747/1480 train_time:112843ms step_avg:153.11ms
step:748/1480 train_time:113008ms step_avg:153.13ms
step:749/1480 train_time:113171ms step_avg:153.14ms
step:750/1480 train_time:113330ms step_avg:153.15ms
step:750/1480 val_loss:3.5458 train_time:113393ms step_avg:153.23ms
step:751/1480 train_time:113493ms step_avg:153.16ms
step:752/1480 train_time:113654ms step_avg:153.17ms
step:753/1480 train_time:113814ms step_avg:153.18ms
step:754/1480 train_time:113974ms step_avg:153.19ms
step:755/1480 train_time:114135ms step_avg:153.20ms
step:756/1480 train_time:114295ms step_avg:153.21ms
step:757/1480 train_time:114459ms step_avg:153.22ms
step:758/1480 train_time:114620ms step_avg:153.24ms
step:759/1480 train_time:114782ms step_avg:153.25ms
step:760/1480 train_time:114944ms step_avg:153.26ms
step:761/1480 train_time:115108ms step_avg:153.27ms
step:762/1480 train_time:115269ms step_avg:153.28ms
step:763/1480 train_time:115431ms step_avg:153.29ms
step:764/1480 train_time:115593ms step_avg:153.31ms
step:765/1480 train_time:115755ms step_avg:153.32ms
step:766/1480 train_time:115918ms step_avg:153.33ms
step:767/1480 train_time:116079ms step_avg:153.34ms
step:768/1480 train_time:116241ms step_avg:153.35ms
step:769/1480 train_time:116406ms step_avg:153.37ms
step:770/1480 train_time:116568ms step_avg:153.38ms
step:771/1480 train_time:116733ms step_avg:153.39ms
step:772/1480 train_time:116895ms step_avg:153.41ms
step:773/1480 train_time:117056ms step_avg:153.42ms
step:774/1480 train_time:117219ms step_avg:153.43ms
step:775/1480 train_time:117381ms step_avg:153.44ms
step:776/1480 train_time:117547ms step_avg:153.46ms
step:777/1480 train_time:117713ms step_avg:153.47ms
step:778/1480 train_time:117876ms step_avg:153.48ms
step:779/1480 train_time:118039ms step_avg:153.50ms
step:780/1480 train_time:118204ms step_avg:153.51ms
step:781/1480 train_time:118368ms step_avg:153.52ms
step:782/1480 train_time:118531ms step_avg:153.54ms
step:783/1480 train_time:118693ms step_avg:153.55ms
step:784/1480 train_time:118855ms step_avg:153.56ms
step:785/1480 train_time:119016ms step_avg:153.57ms
step:786/1480 train_time:119182ms step_avg:153.59ms
step:787/1480 train_time:119347ms step_avg:153.60ms
step:788/1480 train_time:119510ms step_avg:153.61ms
step:789/1480 train_time:119671ms step_avg:153.62ms
step:790/1480 train_time:119836ms step_avg:153.64ms
step:791/1480 train_time:120003ms step_avg:153.65ms
step:792/1480 train_time:120168ms step_avg:153.67ms
step:793/1480 train_time:120331ms step_avg:153.68ms
step:794/1480 train_time:120494ms step_avg:153.69ms
step:795/1480 train_time:120659ms step_avg:153.71ms
step:796/1480 train_time:120826ms step_avg:153.72ms
step:797/1480 train_time:120991ms step_avg:153.74ms
step:798/1480 train_time:121154ms step_avg:153.75ms
step:799/1480 train_time:121323ms step_avg:153.77ms
step:800/1480 train_time:121487ms step_avg:153.78ms
step:801/1480 train_time:121650ms step_avg:153.79ms
step:802/1480 train_time:121815ms step_avg:153.81ms
step:803/1480 train_time:121977ms step_avg:153.82ms
step:804/1480 train_time:122138ms step_avg:153.83ms
step:805/1480 train_time:122304ms step_avg:153.84ms
step:806/1480 train_time:122466ms step_avg:153.85ms
step:807/1480 train_time:122628ms step_avg:153.86ms
step:808/1480 train_time:122792ms step_avg:153.87ms
step:809/1480 train_time:122954ms step_avg:153.88ms
step:810/1480 train_time:123115ms step_avg:153.89ms
step:811/1480 train_time:123276ms step_avg:153.90ms
step:812/1480 train_time:123441ms step_avg:153.92ms
step:813/1480 train_time:123604ms step_avg:153.93ms
step:814/1480 train_time:123767ms step_avg:153.94ms
step:815/1480 train_time:123929ms step_avg:153.95ms
step:816/1480 train_time:124094ms step_avg:153.96ms
step:817/1480 train_time:124256ms step_avg:153.97ms
step:818/1480 train_time:124417ms step_avg:153.98ms
step:819/1480 train_time:124580ms step_avg:153.99ms
step:820/1480 train_time:124744ms step_avg:154.01ms
step:821/1480 train_time:124907ms step_avg:154.02ms
step:822/1480 train_time:125070ms step_avg:154.03ms
step:823/1480 train_time:125232ms step_avg:154.04ms
step:824/1480 train_time:125395ms step_avg:154.05ms
step:825/1480 train_time:125558ms step_avg:154.06ms
step:826/1480 train_time:125726ms step_avg:154.08ms
step:827/1480 train_time:125890ms step_avg:154.09ms
step:828/1480 train_time:126052ms step_avg:154.10ms
step:829/1480 train_time:126215ms step_avg:154.11ms
step:830/1480 train_time:126378ms step_avg:154.12ms
step:831/1480 train_time:126545ms step_avg:154.14ms
step:832/1480 train_time:126709ms step_avg:154.15ms
step:833/1480 train_time:126872ms step_avg:154.16ms
step:834/1480 train_time:127036ms step_avg:154.17ms
step:835/1480 train_time:127201ms step_avg:154.18ms
step:836/1480 train_time:127365ms step_avg:154.19ms
step:837/1480 train_time:127529ms step_avg:154.21ms
step:838/1480 train_time:127692ms step_avg:154.22ms
step:839/1480 train_time:127853ms step_avg:154.23ms
step:840/1480 train_time:128014ms step_avg:154.23ms
step:841/1480 train_time:128174ms step_avg:154.24ms
step:842/1480 train_time:128337ms step_avg:154.25ms
step:843/1480 train_time:128501ms step_avg:154.26ms
step:844/1480 train_time:128663ms step_avg:154.27ms
step:845/1480 train_time:128829ms step_avg:154.29ms
step:846/1480 train_time:128993ms step_avg:154.30ms
step:847/1480 train_time:129156ms step_avg:154.31ms
step:848/1480 train_time:129317ms step_avg:154.32ms
step:849/1480 train_time:129480ms step_avg:154.33ms
step:850/1480 train_time:129643ms step_avg:154.34ms
step:851/1480 train_time:129809ms step_avg:154.35ms
step:852/1480 train_time:129971ms step_avg:154.36ms
step:853/1480 train_time:130133ms step_avg:154.37ms
step:854/1480 train_time:130297ms step_avg:154.38ms
step:855/1480 train_time:130460ms step_avg:154.39ms
step:856/1480 train_time:130627ms step_avg:154.41ms
step:857/1480 train_time:130791ms step_avg:154.42ms
step:858/1480 train_time:130956ms step_avg:154.43ms
step:859/1480 train_time:131118ms step_avg:154.44ms
step:860/1480 train_time:131281ms step_avg:154.45ms
step:861/1480 train_time:131449ms step_avg:154.46ms
step:862/1480 train_time:131617ms step_avg:154.48ms
step:863/1480 train_time:131786ms step_avg:154.50ms
step:864/1480 train_time:131950ms step_avg:154.51ms
step:865/1480 train_time:132110ms step_avg:154.52ms
step:866/1480 train_time:132276ms step_avg:154.53ms
step:867/1480 train_time:132439ms step_avg:154.54ms
step:868/1480 train_time:132601ms step_avg:154.55ms
step:869/1480 train_time:132764ms step_avg:154.56ms
step:870/1480 train_time:132929ms step_avg:154.57ms
step:871/1480 train_time:133092ms step_avg:154.58ms
step:872/1480 train_time:133254ms step_avg:154.59ms
step:873/1480 train_time:133415ms step_avg:154.59ms
step:874/1480 train_time:133581ms step_avg:154.61ms
step:875/1480 train_time:133746ms step_avg:154.62ms
step:875/1480 val_loss:3.5034 train_time:133812ms step_avg:154.70ms
step:876/1480 train_time:133913ms step_avg:154.63ms
step:877/1480 train_time:134082ms step_avg:154.65ms
step:878/1480 train_time:134246ms step_avg:154.66ms
step:879/1480 train_time:134408ms step_avg:154.67ms
step:880/1480 train_time:134571ms step_avg:154.68ms
step:881/1480 train_time:134733ms step_avg:154.69ms
step:882/1480 train_time:134897ms step_avg:154.70ms
step:883/1480 train_time:135064ms step_avg:154.71ms
step:884/1480 train_time:135230ms step_avg:154.73ms
step:885/1480 train_time:135394ms step_avg:154.74ms
step:886/1480 train_time:135562ms step_avg:154.75ms
step:887/1480 train_time:135728ms step_avg:154.76ms
step:888/1480 train_time:135901ms step_avg:154.78ms
step:889/1480 train_time:136069ms step_avg:154.80ms
step:890/1480 train_time:136230ms step_avg:154.81ms
step:891/1480 train_time:136395ms step_avg:154.82ms
step:892/1480 train_time:136563ms step_avg:154.83ms
step:893/1480 train_time:136727ms step_avg:154.84ms
step:894/1480 train_time:136893ms step_avg:154.86ms
step:895/1480 train_time:137061ms step_avg:154.87ms
step:896/1480 train_time:137226ms step_avg:154.88ms
step:897/1480 train_time:137392ms step_avg:154.89ms
step:898/1480 train_time:137560ms step_avg:154.91ms
step:899/1480 train_time:137725ms step_avg:154.92ms
step:900/1480 train_time:137888ms step_avg:154.93ms
step:901/1480 train_time:138053ms step_avg:154.94ms
step:902/1480 train_time:138217ms step_avg:154.95ms
step:903/1480 train_time:138392ms step_avg:154.97ms
step:904/1480 train_time:138558ms step_avg:154.99ms
step:905/1480 train_time:138720ms step_avg:154.99ms
step:906/1480 train_time:138889ms step_avg:155.01ms
step:907/1480 train_time:139057ms step_avg:155.02ms
step:908/1480 train_time:139219ms step_avg:155.03ms
step:909/1480 train_time:139384ms step_avg:155.04ms
step:910/1480 train_time:139553ms step_avg:155.06ms
step:911/1480 train_time:139718ms step_avg:155.07ms
step:912/1480 train_time:139887ms step_avg:155.09ms
step:913/1480 train_time:140054ms step_avg:155.10ms
step:914/1480 train_time:140221ms step_avg:155.11ms
step:915/1480 train_time:140391ms step_avg:155.13ms
step:916/1480 train_time:140556ms step_avg:155.14ms
step:917/1480 train_time:140719ms step_avg:155.15ms
step:918/1480 train_time:140888ms step_avg:155.16ms
step:919/1480 train_time:141059ms step_avg:155.18ms
step:920/1480 train_time:141225ms step_avg:155.19ms
step:921/1480 train_time:141391ms step_avg:155.20ms
step:922/1480 train_time:141560ms step_avg:155.22ms
step:923/1480 train_time:141722ms step_avg:155.23ms
step:924/1480 train_time:141887ms step_avg:155.24ms
step:925/1480 train_time:142051ms step_avg:155.25ms
step:926/1480 train_time:142212ms step_avg:155.25ms
step:927/1480 train_time:142377ms step_avg:155.26ms
step:928/1480 train_time:142543ms step_avg:155.28ms
step:929/1480 train_time:142707ms step_avg:155.28ms
step:930/1480 train_time:142871ms step_avg:155.29ms
step:931/1480 train_time:143035ms step_avg:155.30ms
step:932/1480 train_time:143202ms step_avg:155.32ms
step:933/1480 train_time:143369ms step_avg:155.33ms
step:934/1480 train_time:143535ms step_avg:155.34ms
step:935/1480 train_time:143707ms step_avg:155.36ms
step:936/1480 train_time:143874ms step_avg:155.37ms
step:937/1480 train_time:144044ms step_avg:155.39ms
step:938/1480 train_time:144206ms step_avg:155.39ms
step:939/1480 train_time:144374ms step_avg:155.41ms
step:940/1480 train_time:144542ms step_avg:155.42ms
step:941/1480 train_time:144707ms step_avg:155.43ms
step:942/1480 train_time:144872ms step_avg:155.44ms
step:943/1480 train_time:145044ms step_avg:155.46ms
step:944/1480 train_time:145217ms step_avg:155.48ms
step:945/1480 train_time:145382ms step_avg:155.49ms
step:946/1480 train_time:145552ms step_avg:155.50ms
step:947/1480 train_time:145721ms step_avg:155.52ms
step:948/1480 train_time:145887ms step_avg:155.53ms
step:949/1480 train_time:146052ms step_avg:155.54ms
step:950/1480 train_time:146215ms step_avg:155.55ms
step:951/1480 train_time:146384ms step_avg:155.56ms
step:952/1480 train_time:146550ms step_avg:155.57ms
step:953/1480 train_time:146716ms step_avg:155.58ms
step:954/1480 train_time:146885ms step_avg:155.60ms
step:955/1480 train_time:147049ms step_avg:155.61ms
step:956/1480 train_time:147213ms step_avg:155.62ms
step:957/1480 train_time:147381ms step_avg:155.63ms
step:958/1480 train_time:147550ms step_avg:155.64ms
step:959/1480 train_time:147715ms step_avg:155.65ms
step:960/1480 train_time:147883ms step_avg:155.67ms
step:961/1480 train_time:148048ms step_avg:155.68ms
step:962/1480 train_time:148212ms step_avg:155.68ms
step:963/1480 train_time:148379ms step_avg:155.70ms
step:964/1480 train_time:148548ms step_avg:155.71ms
step:965/1480 train_time:148711ms step_avg:155.72ms
step:966/1480 train_time:148877ms step_avg:155.73ms
step:967/1480 train_time:149041ms step_avg:155.74ms
step:968/1480 train_time:149206ms step_avg:155.75ms
step:969/1480 train_time:149372ms step_avg:155.76ms
step:970/1480 train_time:149534ms step_avg:155.76ms
step:971/1480 train_time:149699ms step_avg:155.77ms
step:972/1480 train_time:149865ms step_avg:155.78ms
step:973/1480 train_time:150029ms step_avg:155.79ms
step:974/1480 train_time:150199ms step_avg:155.81ms
step:975/1480 train_time:150367ms step_avg:155.82ms
step:976/1480 train_time:150531ms step_avg:155.83ms
step:977/1480 train_time:150695ms step_avg:155.84ms
step:978/1480 train_time:150864ms step_avg:155.85ms
step:979/1480 train_time:151030ms step_avg:155.86ms
step:980/1480 train_time:151195ms step_avg:155.87ms
step:981/1480 train_time:151365ms step_avg:155.89ms
step:982/1480 train_time:151527ms step_avg:155.89ms
step:983/1480 train_time:151691ms step_avg:155.90ms
step:984/1480 train_time:151854ms step_avg:155.91ms
step:985/1480 train_time:152022ms step_avg:155.92ms
step:986/1480 train_time:152188ms step_avg:155.93ms
step:987/1480 train_time:152352ms step_avg:155.94ms
step:988/1480 train_time:152520ms step_avg:155.95ms
step:989/1480 train_time:152688ms step_avg:155.96ms
step:990/1480 train_time:152856ms step_avg:155.98ms
step:991/1480 train_time:153024ms step_avg:155.99ms
step:992/1480 train_time:153197ms step_avg:156.00ms
step:993/1480 train_time:153372ms step_avg:156.02ms
step:994/1480 train_time:153538ms step_avg:156.03ms
step:995/1480 train_time:153703ms step_avg:156.04ms
step:996/1480 train_time:153865ms step_avg:156.05ms
step:997/1480 train_time:154028ms step_avg:156.06ms
step:998/1480 train_time:154191ms step_avg:156.06ms
step:999/1480 train_time:154359ms step_avg:156.08ms
step:1000/1480 train_time:154526ms step_avg:156.09ms
step:1000/1480 val_loss:3.4381 train_time:154594ms step_avg:156.16ms
step:1001/1480 train_time:154697ms step_avg:156.10ms
step:1002/1480 train_time:154862ms step_avg:156.11ms
step:1003/1480 train_time:155032ms step_avg:156.12ms
step:1004/1480 train_time:155201ms step_avg:156.14ms
step:1005/1480 train_time:155368ms step_avg:156.15ms
step:1006/1480 train_time:155535ms step_avg:156.16ms
step:1007/1480 train_time:155703ms step_avg:156.17ms
step:1008/1480 train_time:155869ms step_avg:156.18ms
step:1009/1480 train_time:156042ms step_avg:156.20ms
step:1010/1480 train_time:156209ms step_avg:156.21ms
step:1011/1480 train_time:156374ms step_avg:156.22ms
step:1012/1480 train_time:156540ms step_avg:156.23ms
step:1013/1480 train_time:156710ms step_avg:156.24ms
step:1014/1480 train_time:156878ms step_avg:156.25ms
step:1015/1480 train_time:157048ms step_avg:156.27ms
step:1016/1480 train_time:157215ms step_avg:156.28ms
step:1017/1480 train_time:157387ms step_avg:156.29ms
step:1018/1480 train_time:157555ms step_avg:156.30ms
step:1019/1480 train_time:157724ms step_avg:156.32ms
step:1020/1480 train_time:157893ms step_avg:156.33ms
step:1021/1480 train_time:158059ms step_avg:156.34ms
step:1022/1480 train_time:158228ms step_avg:156.35ms
step:1023/1480 train_time:158395ms step_avg:156.36ms
step:1024/1480 train_time:158563ms step_avg:156.37ms
step:1025/1480 train_time:158733ms step_avg:156.39ms
step:1026/1480 train_time:158899ms step_avg:156.40ms
step:1027/1480 train_time:159066ms step_avg:156.41ms
step:1028/1480 train_time:159238ms step_avg:156.42ms
step:1029/1480 train_time:159412ms step_avg:156.44ms
step:1030/1480 train_time:159581ms step_avg:156.45ms
step:1031/1480 train_time:159746ms step_avg:156.46ms
step:1032/1480 train_time:159918ms step_avg:156.48ms
step:1033/1480 train_time:160085ms step_avg:156.49ms
step:1034/1480 train_time:160252ms step_avg:156.50ms
step:1035/1480 train_time:160422ms step_avg:156.51ms
step:1036/1480 train_time:160586ms step_avg:156.52ms
step:1037/1480 train_time:160753ms step_avg:156.53ms
step:1038/1480 train_time:160922ms step_avg:156.54ms
step:1039/1480 train_time:161091ms step_avg:156.55ms
step:1040/1480 train_time:161256ms step_avg:156.56ms
step:1041/1480 train_time:161425ms step_avg:156.57ms
step:1042/1480 train_time:161588ms step_avg:156.58ms
step:1043/1480 train_time:161753ms step_avg:156.59ms
step:1044/1480 train_time:161921ms step_avg:156.60ms
step:1045/1480 train_time:162090ms step_avg:156.61ms
step:1046/1480 train_time:162258ms step_avg:156.62ms
step:1047/1480 train_time:162424ms step_avg:156.63ms
step:1048/1480 train_time:162590ms step_avg:156.64ms
step:1049/1480 train_time:162755ms step_avg:156.65ms
step:1050/1480 train_time:162926ms step_avg:156.66ms
step:1051/1480 train_time:163095ms step_avg:156.67ms
step:1052/1480 train_time:163263ms step_avg:156.68ms
step:1053/1480 train_time:163429ms step_avg:156.69ms
step:1054/1480 train_time:163597ms step_avg:156.70ms
step:1055/1480 train_time:163764ms step_avg:156.71ms
step:1056/1480 train_time:163928ms step_avg:156.72ms
step:1057/1480 train_time:164094ms step_avg:156.73ms
step:1058/1480 train_time:164264ms step_avg:156.74ms
step:1059/1480 train_time:164435ms step_avg:156.75ms
step:1060/1480 train_time:164604ms step_avg:156.77ms
step:1061/1480 train_time:164767ms step_avg:156.77ms
step:1062/1480 train_time:164933ms step_avg:156.78ms
step:1063/1480 train_time:165100ms step_avg:156.79ms
step:1064/1480 train_time:165264ms step_avg:156.80ms
step:1065/1480 train_time:165431ms step_avg:156.81ms
step:1066/1480 train_time:165599ms step_avg:156.82ms
step:1067/1480 train_time:165768ms step_avg:156.83ms
step:1068/1480 train_time:165934ms step_avg:156.84ms
step:1069/1480 train_time:166106ms step_avg:156.85ms
step:1070/1480 train_time:166271ms step_avg:156.86ms
step:1071/1480 train_time:166444ms step_avg:156.87ms
step:1072/1480 train_time:166609ms step_avg:156.88ms
step:1073/1480 train_time:166773ms step_avg:156.89ms
step:1074/1480 train_time:166940ms step_avg:156.90ms
step:1075/1480 train_time:167111ms step_avg:156.91ms
step:1076/1480 train_time:167279ms step_avg:156.92ms
step:1077/1480 train_time:167447ms step_avg:156.93ms
step:1078/1480 train_time:167623ms step_avg:156.95ms
step:1079/1480 train_time:167795ms step_avg:156.96ms
step:1080/1480 train_time:167965ms step_avg:156.98ms
step:1081/1480 train_time:168131ms step_avg:156.99ms
step:1082/1480 train_time:168299ms step_avg:157.00ms
step:1083/1480 train_time:168465ms step_avg:157.00ms
step:1084/1480 train_time:168633ms step_avg:157.01ms
step:1085/1480 train_time:168803ms step_avg:157.03ms
step:1086/1480 train_time:168970ms step_avg:157.04ms
step:1087/1480 train_time:169136ms step_avg:157.04ms
step:1088/1480 train_time:169306ms step_avg:157.06ms
step:1089/1480 train_time:169479ms step_avg:157.07ms
step:1090/1480 train_time:169649ms step_avg:157.08ms
step:1091/1480 train_time:169817ms step_avg:157.09ms
step:1092/1480 train_time:169985ms step_avg:157.10ms
step:1093/1480 train_time:170151ms step_avg:157.11ms
step:1094/1480 train_time:170318ms step_avg:157.12ms
step:1095/1480 train_time:170483ms step_avg:157.13ms
step:1096/1480 train_time:170651ms step_avg:157.14ms
step:1097/1480 train_time:170820ms step_avg:157.15ms
step:1098/1480 train_time:170989ms step_avg:157.16ms
step:1099/1480 train_time:171160ms step_avg:157.17ms
step:1100/1480 train_time:171332ms step_avg:157.19ms
step:1101/1480 train_time:171503ms step_avg:157.20ms
step:1102/1480 train_time:171672ms step_avg:157.21ms
step:1103/1480 train_time:171849ms step_avg:157.23ms
step:1104/1480 train_time:172017ms step_avg:157.24ms
step:1105/1480 train_time:172187ms step_avg:157.25ms
step:1106/1480 train_time:172356ms step_avg:157.26ms
step:1107/1480 train_time:172525ms step_avg:157.27ms
step:1108/1480 train_time:172689ms step_avg:157.28ms
step:1109/1480 train_time:172854ms step_avg:157.28ms
step:1110/1480 train_time:173022ms step_avg:157.29ms
step:1111/1480 train_time:173187ms step_avg:157.30ms
step:1112/1480 train_time:173357ms step_avg:157.31ms
step:1113/1480 train_time:173538ms step_avg:157.33ms
step:1114/1480 train_time:173711ms step_avg:157.35ms
step:1115/1480 train_time:173885ms step_avg:157.36ms
step:1116/1480 train_time:174051ms step_avg:157.37ms
step:1117/1480 train_time:174225ms step_avg:157.38ms
step:1118/1480 train_time:174398ms step_avg:157.40ms
step:1119/1480 train_time:174564ms step_avg:157.41ms
step:1120/1480 train_time:174731ms step_avg:157.42ms
step:1121/1480 train_time:174903ms step_avg:157.43ms
step:1122/1480 train_time:175068ms step_avg:157.44ms
step:1123/1480 train_time:175234ms step_avg:157.44ms
step:1124/1480 train_time:175403ms step_avg:157.45ms
step:1125/1480 train_time:175571ms step_avg:157.46ms
step:1125/1480 val_loss:3.3839 train_time:175639ms step_avg:157.52ms
step:1126/1480 train_time:175740ms step_avg:157.47ms
step:1127/1480 train_time:175912ms step_avg:157.49ms
step:1128/1480 train_time:176084ms step_avg:157.50ms
step:1129/1480 train_time:176257ms step_avg:157.51ms
step:1130/1480 train_time:176426ms step_avg:157.52ms
step:1131/1480 train_time:176604ms step_avg:157.54ms
step:1132/1480 train_time:176770ms step_avg:157.55ms
step:1133/1480 train_time:176943ms step_avg:157.56ms
step:1134/1480 train_time:177114ms step_avg:157.57ms
step:1135/1480 train_time:177281ms step_avg:157.58ms
step:1136/1480 train_time:177452ms step_avg:157.60ms
step:1137/1480 train_time:177622ms step_avg:157.61ms
step:1138/1480 train_time:177793ms step_avg:157.62ms
step:1139/1480 train_time:177961ms step_avg:157.63ms
step:1140/1480 train_time:178129ms step_avg:157.64ms
step:1141/1480 train_time:178300ms step_avg:157.65ms
step:1142/1480 train_time:178470ms step_avg:157.66ms
step:1143/1480 train_time:178640ms step_avg:157.67ms
step:1144/1480 train_time:178810ms step_avg:157.68ms
step:1145/1480 train_time:178976ms step_avg:157.69ms
step:1146/1480 train_time:179147ms step_avg:157.70ms
step:1147/1480 train_time:179316ms step_avg:157.71ms
step:1148/1480 train_time:179484ms step_avg:157.72ms
step:1149/1480 train_time:179656ms step_avg:157.73ms
step:1150/1480 train_time:179824ms step_avg:157.74ms
step:1151/1480 train_time:179996ms step_avg:157.75ms
step:1152/1480 train_time:180166ms step_avg:157.76ms
step:1153/1480 train_time:180340ms step_avg:157.78ms
step:1154/1480 train_time:180507ms step_avg:157.79ms
step:1155/1480 train_time:180679ms step_avg:157.80ms
step:1156/1480 train_time:180858ms step_avg:157.82ms
step:1157/1480 train_time:181028ms step_avg:157.83ms
step:1158/1480 train_time:181195ms step_avg:157.84ms
step:1159/1480 train_time:181362ms step_avg:157.84ms
step:1160/1480 train_time:181529ms step_avg:157.85ms
step:1161/1480 train_time:181700ms step_avg:157.86ms
step:1162/1480 train_time:181869ms step_avg:157.87ms
step:1163/1480 train_time:182039ms step_avg:157.88ms
step:1164/1480 train_time:182208ms step_avg:157.89ms
step:1165/1480 train_time:182374ms step_avg:157.90ms
step:1166/1480 train_time:182543ms step_avg:157.91ms
step:1167/1480 train_time:182712ms step_avg:157.92ms
step:1168/1480 train_time:182880ms step_avg:157.93ms
step:1169/1480 train_time:183050ms step_avg:157.94ms
step:1170/1480 train_time:183218ms step_avg:157.95ms
step:1171/1480 train_time:183385ms step_avg:157.95ms
step:1172/1480 train_time:183553ms step_avg:157.96ms
step:1173/1480 train_time:183723ms step_avg:157.97ms
step:1174/1480 train_time:183905ms step_avg:157.99ms
step:1175/1480 train_time:184077ms step_avg:158.01ms
step:1176/1480 train_time:184248ms step_avg:158.02ms
step:1177/1480 train_time:184424ms step_avg:158.03ms
step:1178/1480 train_time:184591ms step_avg:158.04ms
step:1179/1480 train_time:184757ms step_avg:158.05ms
step:1180/1480 train_time:184938ms step_avg:158.07ms
step:1181/1480 train_time:185108ms step_avg:158.08ms
step:1182/1480 train_time:185277ms step_avg:158.09ms
step:1183/1480 train_time:185448ms step_avg:158.10ms
step:1184/1480 train_time:185617ms step_avg:158.11ms
step:1185/1480 train_time:185789ms step_avg:158.12ms
step:1186/1480 train_time:185961ms step_avg:158.13ms
step:1187/1480 train_time:186142ms step_avg:158.15ms
step:1188/1480 train_time:186309ms step_avg:158.16ms
step:1189/1480 train_time:186481ms step_avg:158.17ms
step:1190/1480 train_time:186650ms step_avg:158.18ms
step:1191/1480 train_time:186822ms step_avg:158.19ms
step:1192/1480 train_time:186989ms step_avg:158.20ms
step:1193/1480 train_time:187155ms step_avg:158.20ms
step:1194/1480 train_time:187323ms step_avg:158.21ms
step:1195/1480 train_time:187496ms step_avg:158.22ms
step:1196/1480 train_time:187678ms step_avg:158.24ms
step:1197/1480 train_time:187849ms step_avg:158.26ms
step:1198/1480 train_time:188031ms step_avg:158.28ms
step:1199/1480 train_time:188202ms step_avg:158.29ms
step:1200/1480 train_time:188370ms step_avg:158.29ms
step:1201/1480 train_time:188538ms step_avg:158.30ms
step:1202/1480 train_time:188720ms step_avg:158.32ms
step:1203/1480 train_time:188897ms step_avg:158.34ms
step:1204/1480 train_time:189071ms step_avg:158.35ms
step:1205/1480 train_time:189239ms step_avg:158.36ms
step:1206/1480 train_time:189408ms step_avg:158.37ms
step:1207/1480 train_time:189578ms step_avg:158.38ms
step:1208/1480 train_time:189744ms step_avg:158.38ms
step:1209/1480 train_time:189917ms step_avg:158.40ms
step:1210/1480 train_time:190094ms step_avg:158.41ms
step:1211/1480 train_time:190270ms step_avg:158.43ms
step:1212/1480 train_time:190443ms step_avg:158.44ms
step:1213/1480 train_time:190618ms step_avg:158.45ms
step:1214/1480 train_time:190797ms step_avg:158.47ms
step:1215/1480 train_time:190970ms step_avg:158.48ms
step:1216/1480 train_time:191140ms step_avg:158.49ms
step:1217/1480 train_time:191314ms step_avg:158.50ms
step:1218/1480 train_time:191483ms step_avg:158.51ms
step:1219/1480 train_time:191663ms step_avg:158.53ms
step:1220/1480 train_time:191832ms step_avg:158.54ms
step:1221/1480 train_time:192000ms step_avg:158.55ms
step:1222/1480 train_time:192168ms step_avg:158.55ms
step:1223/1480 train_time:192339ms step_avg:158.56ms
step:1224/1480 train_time:192517ms step_avg:158.58ms
step:1225/1480 train_time:192689ms step_avg:158.59ms
step:1226/1480 train_time:192863ms step_avg:158.60ms
step:1227/1480 train_time:193037ms step_avg:158.62ms
step:1228/1480 train_time:193207ms step_avg:158.63ms
step:1229/1480 train_time:193381ms step_avg:158.64ms
step:1230/1480 train_time:193561ms step_avg:158.66ms
step:1231/1480 train_time:193736ms step_avg:158.67ms
step:1232/1480 train_time:193911ms step_avg:158.68ms
step:1233/1480 train_time:194080ms step_avg:158.69ms
step:1234/1480 train_time:194251ms step_avg:158.70ms
step:1235/1480 train_time:194425ms step_avg:158.71ms
step:1236/1480 train_time:194593ms step_avg:158.72ms
step:1237/1480 train_time:194764ms step_avg:158.73ms
step:1238/1480 train_time:194950ms step_avg:158.75ms
step:1239/1480 train_time:195121ms step_avg:158.76ms
step:1240/1480 train_time:195291ms step_avg:158.77ms
step:1241/1480 train_time:195462ms step_avg:158.78ms
step:1242/1480 train_time:195631ms step_avg:158.79ms
step:1243/1480 train_time:195805ms step_avg:158.80ms
step:1244/1480 train_time:195972ms step_avg:158.81ms
step:1245/1480 train_time:196141ms step_avg:158.82ms
step:1246/1480 train_time:196310ms step_avg:158.83ms
step:1247/1480 train_time:196478ms step_avg:158.83ms
step:1248/1480 train_time:196648ms step_avg:158.84ms
step:1249/1480 train_time:196817ms step_avg:158.85ms
step:1250/1480 train_time:196984ms step_avg:158.86ms
step:1250/1480 val_loss:3.3341 train_time:197056ms step_avg:158.92ms
step:1251/1480 train_time:197166ms step_avg:158.88ms
step:1252/1480 train_time:197336ms step_avg:158.89ms
step:1253/1480 train_time:197505ms step_avg:158.89ms
step:1254/1480 train_time:197677ms step_avg:158.90ms
step:1255/1480 train_time:197862ms step_avg:158.93ms
step:1256/1480 train_time:198037ms step_avg:158.94ms
step:1257/1480 train_time:198207ms step_avg:158.95ms
step:1258/1480 train_time:198383ms step_avg:158.96ms
step:1259/1480 train_time:198556ms step_avg:158.97ms
step:1260/1480 train_time:198723ms step_avg:158.98ms
step:1261/1480 train_time:198898ms step_avg:158.99ms
step:1262/1480 train_time:199073ms step_avg:159.00ms
step:1263/1480 train_time:199247ms step_avg:159.02ms
step:1264/1480 train_time:199414ms step_avg:159.02ms
step:1265/1480 train_time:199581ms step_avg:159.03ms
step:1266/1480 train_time:199751ms step_avg:159.04ms
step:1267/1480 train_time:199921ms step_avg:159.05ms
step:1268/1480 train_time:200092ms step_avg:159.06ms
step:1269/1480 train_time:200268ms step_avg:159.07ms
step:1270/1480 train_time:200438ms step_avg:159.08ms
step:1271/1480 train_time:200607ms step_avg:159.09ms
step:1272/1480 train_time:200774ms step_avg:159.09ms
step:1273/1480 train_time:200945ms step_avg:159.10ms
step:1274/1480 train_time:201116ms step_avg:159.11ms
step:1275/1480 train_time:201284ms step_avg:159.12ms
step:1276/1480 train_time:201448ms step_avg:159.12ms
step:1277/1480 train_time:201621ms step_avg:159.13ms
step:1278/1480 train_time:201790ms step_avg:159.14ms
step:1279/1480 train_time:201964ms step_avg:159.15ms
step:1280/1480 train_time:202144ms step_avg:159.17ms
step:1281/1480 train_time:202312ms step_avg:159.18ms
step:1282/1480 train_time:202479ms step_avg:159.18ms
step:1283/1480 train_time:202648ms step_avg:159.19ms
step:1284/1480 train_time:202819ms step_avg:159.20ms
step:1285/1480 train_time:202988ms step_avg:159.21ms
step:1286/1480 train_time:203160ms step_avg:159.22ms
step:1287/1480 train_time:203331ms step_avg:159.23ms
step:1288/1480 train_time:203502ms step_avg:159.24ms
step:1289/1480 train_time:203684ms step_avg:159.25ms
step:1290/1480 train_time:203864ms step_avg:159.27ms
step:1291/1480 train_time:204039ms step_avg:159.28ms
step:1292/1480 train_time:204214ms step_avg:159.29ms
step:1293/1480 train_time:204391ms step_avg:159.31ms
step:1294/1480 train_time:204563ms step_avg:159.32ms
step:1295/1480 train_time:204735ms step_avg:159.33ms
step:1296/1480 train_time:204908ms step_avg:159.34ms
step:1297/1480 train_time:205079ms step_avg:159.35ms
step:1298/1480 train_time:205249ms step_avg:159.35ms
step:1299/1480 train_time:205419ms step_avg:159.36ms
step:1300/1480 train_time:205586ms step_avg:159.37ms
step:1301/1480 train_time:205756ms step_avg:159.38ms
step:1302/1480 train_time:205930ms step_avg:159.39ms
step:1303/1480 train_time:206108ms step_avg:159.40ms
step:1304/1480 train_time:206282ms step_avg:159.41ms
step:1305/1480 train_time:206451ms step_avg:159.42ms
step:1306/1480 train_time:206624ms step_avg:159.43ms
step:1307/1480 train_time:206793ms step_avg:159.44ms
step:1308/1480 train_time:206962ms step_avg:159.45ms
step:1309/1480 train_time:207135ms step_avg:159.46ms
step:1310/1480 train_time:207303ms step_avg:159.46ms
step:1311/1480 train_time:207472ms step_avg:159.47ms
step:1312/1480 train_time:207645ms step_avg:159.48ms
step:1313/1480 train_time:207813ms step_avg:159.49ms
step:1314/1480 train_time:207987ms step_avg:159.50ms
step:1315/1480 train_time:208158ms step_avg:159.51ms
step:1316/1480 train_time:208324ms step_avg:159.51ms
step:1317/1480 train_time:208495ms step_avg:159.52ms
step:1318/1480 train_time:208675ms step_avg:159.54ms
step:1319/1480 train_time:208851ms step_avg:159.55ms
step:1320/1480 train_time:209027ms step_avg:159.56ms
step:1321/1480 train_time:209199ms step_avg:159.57ms
step:1322/1480 train_time:209380ms step_avg:159.59ms
step:1323/1480 train_time:209552ms step_avg:159.60ms
step:1324/1480 train_time:209726ms step_avg:159.61ms
step:1325/1480 train_time:209907ms step_avg:159.63ms
step:1326/1480 train_time:210084ms step_avg:159.64ms
step:1327/1480 train_time:210253ms step_avg:159.65ms
step:1328/1480 train_time:210424ms step_avg:159.65ms
step:1329/1480 train_time:210620ms step_avg:159.68ms
step:1330/1480 train_time:210801ms step_avg:159.70ms
step:1331/1480 train_time:210972ms step_avg:159.71ms
step:1332/1480 train_time:211146ms step_avg:159.72ms
step:1333/1480 train_time:211320ms step_avg:159.73ms
step:1334/1480 train_time:211491ms step_avg:159.74ms
step:1335/1480 train_time:211661ms step_avg:159.74ms
step:1336/1480 train_time:211844ms step_avg:159.76ms
step:1337/1480 train_time:212019ms step_avg:159.77ms
step:1338/1480 train_time:212191ms step_avg:159.78ms
step:1339/1480 train_time:212365ms step_avg:159.79ms
step:1340/1480 train_time:212538ms step_avg:159.80ms
step:1341/1480 train_time:212706ms step_avg:159.81ms
step:1342/1480 train_time:212881ms step_avg:159.82ms
step:1343/1480 train_time:213051ms step_avg:159.83ms
step:1344/1480 train_time:213223ms step_avg:159.84ms
step:1345/1480 train_time:213400ms step_avg:159.85ms
step:1346/1480 train_time:213569ms step_avg:159.86ms
step:1347/1480 train_time:213740ms step_avg:159.87ms
step:1348/1480 train_time:213909ms step_avg:159.87ms
step:1349/1480 train_time:214079ms step_avg:159.88ms
step:1350/1480 train_time:214254ms step_avg:159.89ms
step:1351/1480 train_time:214424ms step_avg:159.90ms
step:1352/1480 train_time:214595ms step_avg:159.91ms
step:1353/1480 train_time:214771ms step_avg:159.92ms
step:1354/1480 train_time:214942ms step_avg:159.93ms
step:1355/1480 train_time:215111ms step_avg:159.93ms
step:1356/1480 train_time:215285ms step_avg:159.94ms
step:1357/1480 train_time:215457ms step_avg:159.95ms
step:1358/1480 train_time:215628ms step_avg:159.96ms
step:1359/1480 train_time:215800ms step_avg:159.97ms
step:1360/1480 train_time:215977ms step_avg:159.98ms
step:1361/1480 train_time:216153ms step_avg:160.00ms
step:1362/1480 train_time:216328ms step_avg:160.01ms
step:1363/1480 train_time:216507ms step_avg:160.02ms
step:1364/1480 train_time:216677ms step_avg:160.03ms
step:1365/1480 train_time:216843ms step_avg:160.03ms
step:1366/1480 train_time:217015ms step_avg:160.04ms
step:1367/1480 train_time:217186ms step_avg:160.05ms
step:1368/1480 train_time:217359ms step_avg:160.06ms
step:1369/1480 train_time:217540ms step_avg:160.07ms
step:1370/1480 train_time:217719ms step_avg:160.09ms
step:1371/1480 train_time:217890ms step_avg:160.10ms
step:1372/1480 train_time:218068ms step_avg:160.11ms
step:1373/1480 train_time:218240ms step_avg:160.12ms
step:1374/1480 train_time:218416ms step_avg:160.13ms
step:1375/1480 train_time:218586ms step_avg:160.14ms
step:1375/1480 val_loss:3.2956 train_time:218654ms step_avg:160.19ms
step:1376/1480 train_time:218758ms step_avg:160.14ms
step:1377/1480 train_time:218931ms step_avg:160.15ms
step:1378/1480 train_time:219099ms step_avg:160.16ms
step:1379/1480 train_time:219276ms step_avg:160.17ms
step:1380/1480 train_time:219451ms step_avg:160.18ms
step:1381/1480 train_time:219634ms step_avg:160.20ms
step:1382/1480 train_time:219805ms step_avg:160.21ms
step:1383/1480 train_time:219978ms step_avg:160.22ms
step:1384/1480 train_time:220156ms step_avg:160.23ms
step:1385/1480 train_time:220322ms step_avg:160.23ms
step:1386/1480 train_time:220493ms step_avg:160.24ms
step:1387/1480 train_time:220663ms step_avg:160.25ms
step:1388/1480 train_time:220832ms step_avg:160.26ms
step:1389/1480 train_time:221004ms step_avg:160.26ms
step:1390/1480 train_time:221172ms step_avg:160.27ms
step:1391/1480 train_time:221341ms step_avg:160.28ms
step:1392/1480 train_time:221514ms step_avg:160.29ms
step:1393/1480 train_time:221684ms step_avg:160.29ms
step:1394/1480 train_time:221856ms step_avg:160.30ms
step:1395/1480 train_time:222025ms step_avg:160.31ms
step:1396/1480 train_time:222193ms step_avg:160.31ms
step:1397/1480 train_time:222360ms step_avg:160.32ms
step:1398/1480 train_time:222527ms step_avg:160.32ms
step:1399/1480 train_time:222697ms step_avg:160.33ms
step:1400/1480 train_time:222875ms step_avg:160.34ms
step:1401/1480 train_time:223040ms step_avg:160.35ms
step:1402/1480 train_time:223212ms step_avg:160.35ms
step:1403/1480 train_time:223389ms step_avg:160.37ms
step:1404/1480 train_time:223561ms step_avg:160.37ms
step:1405/1480 train_time:223736ms step_avg:160.38ms
step:1406/1480 train_time:223912ms step_avg:160.40ms
step:1407/1480 train_time:224080ms step_avg:160.40ms
step:1408/1480 train_time:224248ms step_avg:160.41ms
step:1409/1480 train_time:224431ms step_avg:160.42ms
step:1410/1480 train_time:224599ms step_avg:160.43ms
step:1411/1480 train_time:224768ms step_avg:160.43ms
step:1412/1480 train_time:224937ms step_avg:160.44ms
step:1413/1480 train_time:225108ms step_avg:160.45ms
step:1414/1480 train_time:225280ms step_avg:160.46ms
step:1415/1480 train_time:225456ms step_avg:160.47ms
step:1416/1480 train_time:225641ms step_avg:160.48ms
step:1417/1480 train_time:225816ms step_avg:160.49ms
step:1418/1480 train_time:225987ms step_avg:160.50ms
step:1419/1480 train_time:226160ms step_avg:160.51ms
step:1420/1480 train_time:226334ms step_avg:160.52ms
step:1421/1480 train_time:226508ms step_avg:160.53ms
step:1422/1480 train_time:226680ms step_avg:160.54ms
step:1423/1480 train_time:226849ms step_avg:160.54ms
step:1424/1480 train_time:227026ms step_avg:160.56ms
step:1425/1480 train_time:227205ms step_avg:160.57ms
step:1426/1480 train_time:227377ms step_avg:160.58ms
step:1427/1480 train_time:227552ms step_avg:160.59ms
step:1428/1480 train_time:227722ms step_avg:160.59ms
step:1429/1480 train_time:227892ms step_avg:160.60ms
step:1430/1480 train_time:228065ms step_avg:160.61ms
step:1431/1480 train_time:228239ms step_avg:160.62ms
step:1432/1480 train_time:228417ms step_avg:160.63ms
step:1433/1480 train_time:228598ms step_avg:160.65ms
step:1434/1480 train_time:228778ms step_avg:160.66ms
step:1435/1480 train_time:228955ms step_avg:160.67ms
step:1436/1480 train_time:229129ms step_avg:160.68ms
step:1437/1480 train_time:229298ms step_avg:160.69ms
step:1438/1480 train_time:229466ms step_avg:160.69ms
step:1439/1480 train_time:229638ms step_avg:160.70ms
step:1440/1480 train_time:229808ms step_avg:160.70ms
step:1441/1480 train_time:229978ms step_avg:160.71ms
step:1442/1480 train_time:230156ms step_avg:160.72ms
step:1443/1480 train_time:230343ms step_avg:160.74ms
step:1444/1480 train_time:230514ms step_avg:160.75ms
step:1445/1480 train_time:230687ms step_avg:160.76ms
step:1446/1480 train_time:230862ms step_avg:160.77ms
step:1447/1480 train_time:231039ms step_avg:160.78ms
step:1448/1480 train_time:231210ms step_avg:160.79ms
step:1449/1480 train_time:231385ms step_avg:160.80ms
step:1450/1480 train_time:231558ms step_avg:160.80ms
step:1451/1480 train_time:231728ms step_avg:160.81ms
step:1452/1480 train_time:231902ms step_avg:160.82ms
step:1453/1480 train_time:232071ms step_avg:160.83ms
step:1454/1480 train_time:232242ms step_avg:160.83ms
step:1455/1480 train_time:232420ms step_avg:160.84ms
step:1456/1480 train_time:232593ms step_avg:160.85ms
step:1457/1480 train_time:232764ms step_avg:160.86ms
step:1458/1480 train_time:232935ms step_avg:160.87ms
step:1459/1480 train_time:233111ms step_avg:160.88ms
step:1460/1480 train_time:233283ms step_avg:160.88ms
step:1461/1480 train_time:233458ms step_avg:160.89ms
step:1462/1480 train_time:233629ms step_avg:160.90ms
step:1463/1480 train_time:233805ms step_avg:160.91ms
step:1464/1480 train_time:233980ms step_avg:160.92ms
step:1465/1480 train_time:234153ms step_avg:160.93ms
step:1466/1480 train_time:234323ms step_avg:160.94ms
step:1467/1480 train_time:234498ms step_avg:160.95ms
step:1468/1480 train_time:234668ms step_avg:160.95ms
step:1469/1480 train_time:234841ms step_avg:160.96ms
step:1470/1480 train_time:235022ms step_avg:160.97ms
step:1471/1480 train_time:235209ms step_avg:160.99ms
step:1472/1480 train_time:235388ms step_avg:161.00ms
step:1473/1480 train_time:235560ms step_avg:161.01ms
step:1474/1480 train_time:235737ms step_avg:161.02ms
step:1475/1480 train_time:235916ms step_avg:161.03ms
step:1476/1480 train_time:236088ms step_avg:161.04ms
step:1477/1480 train_time:236271ms step_avg:161.06ms
step:1478/1480 train_time:236454ms step_avg:161.07ms
step:1479/1480 train_time:236627ms step_avg:161.08ms
step:1480/1480 train_time:236800ms step_avg:161.09ms
step:1480/1480 val_loss:3.2764 train_time:236871ms step_avg:161.14ms