import os
import sys
# Read this script's own source immediately at startup so it can be embedded
# verbatim in the run log and in saved checkpoints (for reproducibility).
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import contextlib
from dataclasses import dataclass
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import BlockMask, flex_attention #KoszarskyB

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Args:
        G: 2D tensor to orthogonalize (cast to bfloat16 internally).
        steps: number of quintic Newton-Schulz iterations to run.
        eps: small constant guarding the initial division by the norm.
    Returns:
        A bfloat16 tensor of the same shape as G, approximately orthogonal
        (singular values pushed towards ~Uniform(0.5, 1.5) rather than exactly 1).
    """
    assert len(G.shape) == 2
    # quintic iteration coefficients, chosen to maximize the slope at zero
    a, b, c = (3.4445, -4.7750,  2.0315)
    X = G.bfloat16()
    X /= (X.norm() + eps) # ensure top singular value <= 1
    if G.size(0) > G.size(1):
        X = X.T # iterate on the wide orientation so X @ X.T is the smaller Gram matrix
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X # X <- a*X + b*(X X^T)X + c*(X X^T)^2 X
    if G.size(0) > G.size(1):
        X = X.T
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        # distributed layout comes from the torchrun environment variables
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        sizes = {p.numel() for p in params}
        # bucket parameters by element count so each bucket can share a set of
        # flat bf16 all-gather buffers (one buffer per rank)
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)

    def step(self):
        """Perform one optimization step.

        Work is sharded across ranks: within each group, this rank computes the
        momentum + Newton-Schulz update for every world_size-th parameter
        (offset by self.rank), and the per-rank results are exchanged with an
        async all_gather so the next parameter's compute overlaps communication.
        """
        for group in self.param_groups:

            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            handle = None       # in-flight async all_gather work handle
            params_world = None # the window of params the in-flight gather belongs to
            def update_prev():
                # wait for the previous window's all_gather, then apply each
                # rank's orthogonalized update to its corresponding parameter
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        # scale so the update magnitude is invariant to the matrix aspect ratio
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                p = params[base_i + self.rank] # this rank's parameter within the window
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum) # EMA of gradients (SGD momentum)
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                update_prev() # overlap: finish the previous window's comm before launching the next
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev() # drain the final in-flight gather

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    """Bias-free Linear layer whose weight is cast to the activation dtype at call time."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        # cast the (possibly fp32) weight to match x, rather than the reverse
        w = self.weight.to(x.dtype)
        return F.linear(x, w)

class Rotary(torch.nn.Module):
    """Rotary positional embedding (RoPE) with cos/sin tables cached per sequence length."""

    def __init__(self, dim, base=10000):
        super().__init__()
        exponents = torch.arange(0, dim, 2) / dim
        self.register_buffer('inv_freq', (1 / base) ** exponents)
        # lazily built tables, rebuilt whenever the incoming sequence length changes
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        # x layout: (batch, seq, heads, head_dim); rotation pairs the two halves of head_dim
        T = x.shape[1]
        if T != self.seq_len_cached:
            positions = torch.arange(T, device=x.device)
            angles = torch.outer(positions, self.inv_freq)
            self.seq_len_cached = T
            self.cos_cached = angles.cos()
            self.sin_cached = angles.sin()
        cos = self.cos_cached[None, :, None, :]
        sin = self.sin_cached[None, :, None, :]
        x1, x2 = x.chunk(2, dim=3)
        # standard 2D rotation of the paired feature halves
        y1 = x1 * cos + x2 * sin
        y2 = x2 * cos - x1 * sin
        return torch.cat((y1, y2), 3).type_as(x)

class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with QK-norm, rotary embeddings, and a
    learned blend of this block's values with an external value embedding `vi`."""

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # mixing weights between local values and the external value embedding
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # rotary operates on each head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        head_shape = (B, T, self.num_heads, -1)
        q = self.c_q(x).view(*head_shape)
        k = self.c_k(x).view(*head_shape)
        v = self.c_v(x).view(*head_shape)
        # value-residual mix @KoszarskyB & @Grad62304977
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v)
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
                           block_mask=block_mask, enable_gqa=True)
        # re-assemble all head outputs side by side
        y = y.transpose(1, 2).contiguous().view_as(x)
        return self.c_proj(y)

class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU nonlinearity."""

    def __init__(self, dim):
        super().__init__()
        hidden = 4 * dim
        self.c_fc   = CastedLinear(dim, hidden)
        self.c_proj = CastedLinear(hidden, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        # ReLU^2 (https://arxiv.org/abs/2109.08668v2); ~1-2% better than GELU;
        # suggested by @SKYLINEZ007 and @Grad62304977
        h = F.relu(self.c_fc(x)).square()
        return self.c_proj(h)

class Block(nn.Module):
    """Transformer block: learned residual mix with the initial embedding x0,
    followed by pre-norm attention and MLP sub-blocks."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # lambdas[0] scales the running stream, lambdas[1] mixes in x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        mixed = self.lambdas[0] * x + self.lambdas[1] * x0
        mixed = mixed + self.attn(norm(mixed), vi, block_mask)
        return mixed + self.mlp(norm(mixed))

class ValueEmbedding(nn.Module):
    """Six token-value embedding tables, mirrored into a 12-entry U-net list.

    The forward pass returns one value-embedding tensor per transformer layer:
    the 6 tables in order for the encoder half, then the same 6 reversed for
    the decoder half, so encoder layer i and decoder layer (11 - i) share a table.
    """
    def __init__(self, config: "GPTConfig"):
        super().__init__()
        # (removed a stray no-op `self.__setattr__` expression that was left here)
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(6)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        """Embed `inputs` with every table; return the 6 results followed by
        the same 6 tensors in reverse order (12 entries total)."""
        ve = [emb(inputs) for emb in self.embed]
        # the reversed iterator is created over the first 6 entries, so the
        # extension appends exactly those 6 in reverse
        ve += reversed(ve)
        return ve


# -----------------------------------------------------------------------------
# The main GPT-2 model

@dataclass
class GPTConfig:
    """Model hyperparameters for the GPT below."""
    vocab_size : int = 50304 # GPT-2's 50257 tokens padded up to a multiple of 128
    num_layers : int = 12
    num_heads : int = 6 # head dim 128 suggested by @Grad62304977
    model_dim : int = 768

class GPT(nn.Module):
    """GPT-2-style decoder-only transformer with a U-net layer layout:
    outputs of the first half of the blocks ('encoder') are re-injected,
    with learned weights, into the mirrored second half ('decoder').
    Operates on a single flattened token sequence (batch size 1) using
    FlexAttention with a document-aware sliding-window block mask."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.num_layers = config.num_layers

        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(
        self,
        inputs: torch.Tensor,
        targets: torch.Tensor,
        sliding_window_num_blocks: torch.Tensor,
    ):
        """Return the mean next-token cross-entropy loss.

        inputs/targets: flat 1D token sequences (the batch dim is added internally).
        sliding_window_num_blocks: scalar int32 tensor giving the attention window
        size in 128-token blocks (grown over training by the caller).
        """
        BLOCK_SIZE = 128
        assert inputs.ndim == 1
        # token 50256 (GPT-2 <|endoftext|>) delimits documents; docs[i] is the
        # document index of token i
        docs = (inputs == 50256).cumsum(0)
        # lowest / highest document id occurring within each 128-token block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            # attend only backwards, and only within the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: torch.Tensor):
            # convert a dense block-level mask into the (counts, ordered indices)
            # format expected by BlockMask.from_kv_blocks
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        def create_doc_swc_block_mask(sliding_window_num_blocks: torch.Tensor):
            # NOTE: hard-coded for 512 blocks of 128 tokens = 64K-token sequences
            kv_idx = block_idx = torch.arange(512, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            # nonzero = blocks needing per-token masking; full = fully attendable blocks
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm  = causal_full_bm & window_full_bm & document_full_bm
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm ^ full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        # forward the GPT model itself
        x = self.embed(inputs[None]) # token embeddings of shape (b, t, model_dim)
        x = norm(x) # @Grad62304977
        x0 = x
        ve = self.value_embeds(inputs)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977 (soft-cap the logits at +/-30)
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _peek_data_shard(file: Path):
    # only reads the header, returns header data
    # header is 256 int32
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    return int(header[2]) # number of tokens (claimed)

def _load_data_shard(path: Path, num_tokens):
    """Load a shard's uint16 token payload (the 256-int32 header is skipped)
    into CUDA-pinned host memory for fast async transfer to the GPU."""
    tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
    with path.open("rb", buffering=0) as f:
        f.seek(256 * 4) # skip the header
        nbytes = f.readinto(tokens.numpy())
    assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
    return tokens

class DistributedDataLoader:
    """Serves per-rank (inputs, targets) slices from a set of .bin token shards.

    Every process loads the same shard but starts at an offset of
    process_rank * seq_len, and all ranks advance in lockstep by
    num_processes * seq_len tokens per batch.
    """
    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len

        # find all shards matching the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # validate every shard header up front and record per-shard token counts
        self.files_num_tokens = [_peek_data_shard(file) for file in self.files]
        # each shard must hold at least one full global batch plus a target token
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)

        self.reset()

    def reset(self):
        """Rewind to the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        """Move to the next shard (wrapping around) and seek to this rank's offset."""
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[self.current_shard],
                                       self.files_num_tokens[self.current_shard])

    def next_batch(self):
        """Return (inputs, targets) CUDA tensors for this rank and advance the cursor."""
        stride = self.seq_len * self.num_processes
        start = self.current_position
        buf = self.tokens[start : start + self.seq_len + 1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True)
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True)
        # advance current position and load next shard if necessary
        self.current_position += stride
        if self.current_position + stride + 1 >= len(self.tokens):
            self.advance()
        return inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    """All run hyperparameters; the defaults are the speedrun configuration."""
    # data hyperparams
    input_bin : str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    input_val_bin : str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization hyperparams
    batch_size : int = 8 # batch size, in sequences, across all devices
    sequence_length : int = 64*1024 # sequence length, in tokens
    num_iterations : int = 1480 # number of iterations to run
    warmup_iters : int = 0
    cooldown_iters : int = 600 # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay : float = 0
    # evaluation and logging hyperparams
    val_loss_every : int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens : int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every : int = 0 # every how many steps to save the checkpoint? 0 for only at the end
args = Hyperparameters()

# set up DDP (distributed data parallel). torchrun sets these env variables
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device(f"cuda:{ddp_local_rank}")
torch.cuda.set_device(device)
print(f"using device: {device}")
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    logdir = Path("logs") / f"{run_id}"
    # parents=True: also create the top-level logs/ directory if it is missing,
    # instead of crashing with FileNotFoundError on a fresh checkout
    logdir.mkdir(parents=True, exist_ok=True)
    logfile = Path("logs") / f"{run_id}.txt"
    print(logfile.stem)
    # create the log file
    with logfile.open("w") as f:
        # begin the log by printing this file (the Python code)
        print(code, file=f)
        print("=" * 100, file=f)
def print0(s, logonly=False):
    """On the master process, print `s` and append it to the run logfile;
    logonly=True writes to the logfile without echoing to stdout.
    Non-master ranks do nothing."""
    if master_process:
        with logfile.open("a") as f:
            if not logonly:
                print(s)
            print(s, file=f)
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f"Running python {sys.version}")
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# calculate the number of steps to take in the val loop.
assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0
val_steps = args.val_tokens // (args.sequence_length * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
inputs_train, targets_train = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, num_layers=12, num_heads=6, model_dim=768))
model = model.cuda().bfloat16()
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float() # keep CastedLinear weights in fp32; they are cast to the activation dtype at use time
config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s)
# embeddings and lm_head use Adam; 2D block matrices use Muon; {0,1}-D params
# (lambdas, skip_weights) use a separate Adam -- see the Muon docstring warnings
embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()]
optimizer1 = torch.optim.Adam(embed_params, lr=0.6, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, betas=(0.8, 0.95), fused=True)
params = list(raw_model.blocks.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Trapezoidal LR multiplier: linear warmup, flat middle, linear cooldown."""
    assert it <= args.num_iterations
    # 1) linear warmup for the first warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) linear cooldown over the final cooldown_iters steps
    if it >= args.num_iterations - args.cooldown_iters:
        return (args.num_iterations - it) / args.cooldown_iters
    # 3) constant lr in between
    return 1.0
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# device-side sliding window size in 128-token blocks; updated in-place so the
# compiled model sees the same tensor object every step
sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device="cuda")
sw_num_blocks_prev = 1
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Linearly increase the sliding window size over training in chunks of 64 from 64 -> 1792. By @fernbear.bsky.social
    frac_done = step / args.num_iterations # training progress
    sw_num_blocks = int(((1 - frac_done) * 64 + frac_done * 1792 + 64) // 128)
    if sw_num_blocks != sw_num_blocks_prev:
        sliding_window_num_blocks.copy_(sw_num_blocks, non_blocking=True)
        sw_num_blocks_prev = sw_num_blocks

    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                inputs_val, targets_val = val_loader.next_batch()
                val_loss += model(inputs_val, targets_val, sliding_window_num_blocks)
        # average the per-rank sums across all processes
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # save the state of the training process (includes this script's source)
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for i in range(1, train_accumulation_steps + 1):
        with contextlib.ExitStack() as stack:
            if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step
                stack.enter_context(model.no_sync())
            if step >= 5:
                # after the first few steps, skip Dynamo guard re-evaluation for speed
                stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True))
            model(inputs_train, targets_train, sliding_window_num_blocks).backward()
            inputs_train, targets_train = train_loader.next_batch() # prefetch the next batch
    if train_accumulation_steps != 1:
        # average (rather than sum) the accumulated gradients
        for p in model.parameters():
            p.grad /= train_accumulation_steps
    # momentum warmup for Muon
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()

====================================================================================================
Running python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
nvidia-smi:
Wed Dec 11 07:48:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:19:00.0 Off |                    0 |
| N/A   38C    P0             125W / 700W |   7084MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  | 00000000:3B:00.0 Off |                    0 |
| N/A   30C    P0             115W / 700W |   3451MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   2  NVIDIA H100 80GB HBM3          On  | 00000000:4C:00.0 Off |                    0 |
| N/A   28C    P0             111W / 700W |   3451MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   3  NVIDIA H100 80GB HBM3          On  | 00000000:5D:00.0 Off |                    0 |
| N/A   36C    P0             114W / 700W |   3451MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   4  NVIDIA H100 80GB HBM3          On  | 00000000:9B:00.0 Off |                    0 |
| N/A   38C    P0             119W / 700W |   3451MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   5  NVIDIA H100 80GB HBM3          On  | 00000000:BB:00.0 Off |                    0 |
| N/A   30C    P0             117W / 700W |   3451MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   6  NVIDIA H100 80GB HBM3          On  | 00000000:CB:00.0 Off |                    0 |
| N/A   35C    P0             119W / 700W |   3451MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   7  NVIDIA H100 80GB HBM3          On  | 00000000:DB:00.0 Off |                    0 |
| N/A   30C    P0             118W / 700W |   3211MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
+---------------------------------------------------------------------------------------+

====================================================================================================
Training DataLoader: total number of tokens: 1000000000 across 10 files
Validation DataLoader: total number of tokens: 100000000 across 1 files
====================================================================================================
step:0/1480 val_loss:10.8258 train_time:0ms step_avg:nanms
step:1/1480 train_time:28943ms step_avg:nanms
step:2/1480 train_time:29521ms step_avg:nanms
step:3/1480 train_time:29645ms step_avg:nanms
step:4/1480 train_time:29785ms step_avg:nanms
step:5/1480 train_time:29926ms step_avg:nanms
step:6/1480 train_time:30069ms step_avg:nanms
step:7/1480 train_time:30210ms step_avg:nanms
step:8/1480 train_time:30352ms step_avg:nanms
step:9/1480 train_time:30501ms step_avg:nanms
step:10/1480 train_time:30641ms step_avg:nanms
step:11/1480 train_time:143ms step_avg:nanms
step:12/1480 train_time:280ms step_avg:nanms
step:13/1480 train_time:423ms step_avg:140.95ms
step:14/1480 train_time:567ms step_avg:141.66ms
step:15/1480 train_time:710ms step_avg:142.01ms
step:16/1480 train_time:852ms step_avg:142.02ms
step:17/1480 train_time:994ms step_avg:141.95ms
step:18/1480 train_time:1136ms step_avg:142.04ms
step:19/1480 train_time:1280ms step_avg:142.17ms
step:20/1480 train_time:1421ms step_avg:142.10ms
step:21/1480 train_time:1566ms step_avg:142.37ms
step:22/1480 train_time:1710ms step_avg:142.49ms
step:23/1480 train_time:1852ms step_avg:142.44ms
step:24/1480 train_time:1994ms step_avg:142.41ms
step:25/1480 train_time:2136ms step_avg:142.41ms
step:26/1480 train_time:2278ms step_avg:142.39ms
step:27/1480 train_time:2422ms step_avg:142.46ms
step:28/1480 train_time:2567ms step_avg:142.60ms
step:29/1480 train_time:2710ms step_avg:142.63ms
step:30/1480 train_time:3227ms step_avg:161.33ms
step:31/1480 train_time:3332ms step_avg:158.68ms
step:32/1480 train_time:3474ms step_avg:157.90ms
step:33/1480 train_time:3615ms step_avg:157.17ms
step:34/1480 train_time:3756ms step_avg:156.50ms
step:35/1480 train_time:3898ms step_avg:155.92ms
step:36/1480 train_time:4040ms step_avg:155.38ms
step:37/1480 train_time:4184ms step_avg:154.96ms
step:38/1480 train_time:4331ms step_avg:154.67ms
step:39/1480 train_time:4474ms step_avg:154.26ms
step:40/1480 train_time:4616ms step_avg:153.86ms
step:41/1480 train_time:4757ms step_avg:153.46ms
step:42/1480 train_time:4900ms step_avg:153.13ms
step:43/1480 train_time:5042ms step_avg:152.79ms
step:44/1480 train_time:5185ms step_avg:152.49ms
step:45/1480 train_time:5329ms step_avg:152.26ms
step:46/1480 train_time:5473ms step_avg:152.02ms
step:47/1480 train_time:5615ms step_avg:151.75ms
step:48/1480 train_time:5757ms step_avg:151.49ms
step:49/1480 train_time:5900ms step_avg:151.28ms
step:50/1480 train_time:6042ms step_avg:151.06ms
step:51/1480 train_time:6187ms step_avg:150.90ms
step:52/1480 train_time:6331ms step_avg:150.73ms
step:53/1480 train_time:6474ms step_avg:150.56ms
step:54/1480 train_time:6615ms step_avg:150.35ms
step:55/1480 train_time:6757ms step_avg:150.15ms
step:56/1480 train_time:6899ms step_avg:149.98ms
step:57/1480 train_time:7042ms step_avg:149.83ms
step:58/1480 train_time:7184ms step_avg:149.67ms
step:59/1480 train_time:7328ms step_avg:149.55ms
step:60/1480 train_time:7472ms step_avg:149.44ms
step:61/1480 train_time:7615ms step_avg:149.31ms
step:62/1480 train_time:7755ms step_avg:149.14ms
step:63/1480 train_time:7899ms step_avg:149.03ms
step:64/1480 train_time:8041ms step_avg:148.91ms
step:65/1480 train_time:8561ms step_avg:155.66ms
step:66/1480 train_time:8665ms step_avg:154.73ms
step:67/1480 train_time:8808ms step_avg:154.52ms
step:68/1480 train_time:8950ms step_avg:154.31ms
step:69/1480 train_time:9092ms step_avg:154.10ms
step:70/1480 train_time:9234ms step_avg:153.90ms
step:71/1480 train_time:9376ms step_avg:153.71ms
step:72/1480 train_time:9518ms step_avg:153.52ms
step:73/1480 train_time:9661ms step_avg:153.34ms
step:74/1480 train_time:9804ms step_avg:153.19ms
step:75/1480 train_time:9948ms step_avg:153.05ms
step:76/1480 train_time:10090ms step_avg:152.88ms
step:77/1480 train_time:10232ms step_avg:152.72ms
step:78/1480 train_time:10374ms step_avg:152.56ms
step:79/1480 train_time:10515ms step_avg:152.39ms
step:80/1480 train_time:10657ms step_avg:152.24ms
step:81/1480 train_time:10800ms step_avg:152.11ms
step:82/1480 train_time:10943ms step_avg:151.99ms
step:83/1480 train_time:11088ms step_avg:151.88ms
step:84/1480 train_time:11230ms step_avg:151.75ms
step:85/1480 train_time:11371ms step_avg:151.62ms
step:86/1480 train_time:11516ms step_avg:151.52ms
step:87/1480 train_time:11657ms step_avg:151.38ms
step:88/1480 train_time:11799ms step_avg:151.27ms
step:89/1480 train_time:11941ms step_avg:151.15ms
step:90/1480 train_time:12085ms step_avg:151.06ms
step:91/1480 train_time:12229ms step_avg:150.97ms
step:92/1480 train_time:12372ms step_avg:150.88ms
step:93/1480 train_time:12516ms step_avg:150.79ms
step:94/1480 train_time:12657ms step_avg:150.68ms
step:95/1480 train_time:12801ms step_avg:150.60ms
step:96/1480 train_time:12946ms step_avg:150.53ms
step:97/1480 train_time:13091ms step_avg:150.47ms
step:98/1480 train_time:13233ms step_avg:150.38ms
step:99/1480 train_time:13375ms step_avg:150.28ms
step:100/1480 train_time:13517ms step_avg:150.19ms
step:101/1480 train_time:13661ms step_avg:150.12ms
step:102/1480 train_time:13802ms step_avg:150.02ms
step:103/1480 train_time:13944ms step_avg:149.94ms
step:104/1480 train_time:14088ms step_avg:149.88ms
step:105/1480 train_time:14231ms step_avg:149.80ms
step:106/1480 train_time:14373ms step_avg:149.72ms
step:107/1480 train_time:14516ms step_avg:149.65ms
step:108/1480 train_time:14658ms step_avg:149.57ms
step:109/1480 train_time:14801ms step_avg:149.51ms
step:110/1480 train_time:14944ms step_avg:149.44ms
step:111/1480 train_time:15090ms step_avg:149.40ms
step:112/1480 train_time:15236ms step_avg:149.37ms
step:113/1480 train_time:15382ms step_avg:149.34ms
step:114/1480 train_time:15528ms step_avg:149.31ms
step:115/1480 train_time:15674ms step_avg:149.28ms
step:116/1480 train_time:15819ms step_avg:149.23ms
step:117/1480 train_time:15964ms step_avg:149.20ms
step:118/1480 train_time:16111ms step_avg:149.18ms
step:119/1480 train_time:16256ms step_avg:149.14ms
step:120/1480 train_time:16402ms step_avg:149.11ms
step:121/1480 train_time:16548ms step_avg:149.08ms
step:122/1480 train_time:16694ms step_avg:149.06ms
step:123/1480 train_time:16841ms step_avg:149.04ms
step:124/1480 train_time:16987ms step_avg:149.01ms
step:125/1480 train_time:17133ms step_avg:148.98ms
step:125/1480 val_loss:4.4188 train_time:17197ms step_avg:149.54ms
step:126/1480 train_time:17288ms step_avg:149.03ms
step:127/1480 train_time:17435ms step_avg:149.01ms
step:128/1480 train_time:17581ms step_avg:148.99ms
step:129/1480 train_time:17727ms step_avg:148.97ms
step:130/1480 train_time:17872ms step_avg:148.93ms
step:131/1480 train_time:18017ms step_avg:148.90ms
step:132/1480 train_time:18162ms step_avg:148.87ms
step:133/1480 train_time:18308ms step_avg:148.84ms
step:134/1480 train_time:18454ms step_avg:148.82ms
step:135/1480 train_time:18601ms step_avg:148.81ms
step:136/1480 train_time:18749ms step_avg:148.80ms
step:137/1480 train_time:18894ms step_avg:148.77ms
step:138/1480 train_time:19039ms step_avg:148.75ms
step:139/1480 train_time:19186ms step_avg:148.73ms
step:140/1480 train_time:19332ms step_avg:148.70ms
step:141/1480 train_time:19476ms step_avg:148.67ms
step:142/1480 train_time:19625ms step_avg:148.67ms
step:143/1480 train_time:19772ms step_avg:148.66ms
step:144/1480 train_time:19917ms step_avg:148.64ms
step:145/1480 train_time:20064ms step_avg:148.62ms
step:146/1480 train_time:20210ms step_avg:148.60ms
step:147/1480 train_time:20355ms step_avg:148.58ms
step:148/1480 train_time:20502ms step_avg:148.56ms
step:149/1480 train_time:20648ms step_avg:148.55ms
step:150/1480 train_time:20794ms step_avg:148.53ms
step:151/1480 train_time:20941ms step_avg:148.52ms
step:152/1480 train_time:21088ms step_avg:148.51ms
step:153/1480 train_time:21233ms step_avg:148.48ms
step:154/1480 train_time:21379ms step_avg:148.46ms
step:155/1480 train_time:21525ms step_avg:148.45ms
step:156/1480 train_time:21670ms step_avg:148.43ms
step:157/1480 train_time:21815ms step_avg:148.40ms
step:158/1480 train_time:21961ms step_avg:148.39ms
step:159/1480 train_time:22107ms step_avg:148.37ms
step:160/1480 train_time:22252ms step_avg:148.35ms
step:161/1480 train_time:22398ms step_avg:148.33ms
step:162/1480 train_time:22545ms step_avg:148.32ms
step:163/1480 train_time:22691ms step_avg:148.31ms
step:164/1480 train_time:22836ms step_avg:148.29ms
step:165/1480 train_time:22981ms step_avg:148.27ms
step:166/1480 train_time:23128ms step_avg:148.25ms
step:167/1480 train_time:23272ms step_avg:148.23ms
step:168/1480 train_time:23417ms step_avg:148.21ms
step:169/1480 train_time:23564ms step_avg:148.20ms
step:170/1480 train_time:23709ms step_avg:148.18ms
step:171/1480 train_time:23856ms step_avg:148.17ms
step:172/1480 train_time:24001ms step_avg:148.15ms
step:173/1480 train_time:24147ms step_avg:148.14ms
step:174/1480 train_time:24291ms step_avg:148.12ms
step:175/1480 train_time:24437ms step_avg:148.10ms
step:176/1480 train_time:24584ms step_avg:148.09ms
step:177/1480 train_time:24729ms step_avg:148.08ms
step:178/1480 train_time:24873ms step_avg:148.06ms
step:179/1480 train_time:25020ms step_avg:148.05ms
step:180/1480 train_time:25167ms step_avg:148.04ms
step:181/1480 train_time:25311ms step_avg:148.02ms
step:182/1480 train_time:25457ms step_avg:148.01ms
step:183/1480 train_time:25603ms step_avg:147.99ms
step:184/1480 train_time:25750ms step_avg:147.99ms
step:185/1480 train_time:25894ms step_avg:147.97ms
step:186/1480 train_time:26041ms step_avg:147.96ms
step:187/1480 train_time:26187ms step_avg:147.95ms
step:188/1480 train_time:26332ms step_avg:147.93ms
step:189/1480 train_time:26494ms step_avg:148.01ms
step:190/1480 train_time:26623ms step_avg:147.90ms
step:191/1480 train_time:26770ms step_avg:147.90ms
step:192/1480 train_time:26915ms step_avg:147.88ms
step:193/1480 train_time:27060ms step_avg:147.87ms
step:194/1480 train_time:27206ms step_avg:147.86ms
step:195/1480 train_time:27351ms step_avg:147.84ms
step:196/1480 train_time:27497ms step_avg:147.83ms
step:197/1480 train_time:27642ms step_avg:147.82ms
step:198/1480 train_time:27789ms step_avg:147.81ms
step:199/1480 train_time:27933ms step_avg:147.79ms
step:200/1480 train_time:28079ms step_avg:147.78ms
step:201/1480 train_time:28225ms step_avg:147.78ms
step:202/1480 train_time:28371ms step_avg:147.77ms
step:203/1480 train_time:28516ms step_avg:147.75ms
step:204/1480 train_time:28662ms step_avg:147.74ms
step:205/1480 train_time:28808ms step_avg:147.73ms
step:206/1480 train_time:28954ms step_avg:147.72ms
step:207/1480 train_time:29099ms step_avg:147.71ms
step:208/1480 train_time:29247ms step_avg:147.71ms
step:209/1480 train_time:29392ms step_avg:147.70ms
step:210/1480 train_time:29537ms step_avg:147.69ms
step:211/1480 train_time:29684ms step_avg:147.68ms
step:212/1480 train_time:29829ms step_avg:147.67ms
step:213/1480 train_time:29974ms step_avg:147.65ms
step:214/1480 train_time:30119ms step_avg:147.64ms
step:215/1480 train_time:30266ms step_avg:147.64ms
step:216/1480 train_time:30411ms step_avg:147.63ms
step:217/1480 train_time:30556ms step_avg:147.62ms
step:218/1480 train_time:30703ms step_avg:147.61ms
step:219/1480 train_time:30849ms step_avg:147.60ms
step:220/1480 train_time:30994ms step_avg:147.59ms
step:221/1480 train_time:31531ms step_avg:149.44ms
step:222/1480 train_time:31638ms step_avg:149.24ms
step:223/1480 train_time:31788ms step_avg:149.24ms
step:224/1480 train_time:31934ms step_avg:149.23ms
step:225/1480 train_time:32083ms step_avg:149.22ms
step:226/1480 train_time:32231ms step_avg:149.22ms
step:227/1480 train_time:32378ms step_avg:149.21ms
step:228/1480 train_time:32528ms step_avg:149.21ms
step:229/1480 train_time:32675ms step_avg:149.20ms
step:230/1480 train_time:32825ms step_avg:149.20ms
step:231/1480 train_time:32973ms step_avg:149.20ms
step:232/1480 train_time:33121ms step_avg:149.20ms
step:233/1480 train_time:33270ms step_avg:149.19ms
step:234/1480 train_time:33417ms step_avg:149.18ms
step:235/1480 train_time:33566ms step_avg:149.18ms
step:236/1480 train_time:33714ms step_avg:149.17ms
step:237/1480 train_time:33862ms step_avg:149.17ms
step:238/1480 train_time:34010ms step_avg:149.17ms
step:239/1480 train_time:34160ms step_avg:149.17ms
step:240/1480 train_time:34309ms step_avg:149.17ms
step:241/1480 train_time:34457ms step_avg:149.16ms
step:242/1480 train_time:34605ms step_avg:149.16ms
step:243/1480 train_time:34753ms step_avg:149.15ms
step:244/1480 train_time:34902ms step_avg:149.16ms
step:245/1480 train_time:35051ms step_avg:149.15ms
step:246/1480 train_time:35200ms step_avg:149.15ms
step:247/1480 train_time:35349ms step_avg:149.15ms
step:248/1480 train_time:35496ms step_avg:149.14ms
step:249/1480 train_time:35645ms step_avg:149.14ms
step:250/1480 train_time:35793ms step_avg:149.14ms
step:250/1480 val_loss:4.0012 train_time:35859ms step_avg:149.41ms
step:251/1480 train_time:35954ms step_avg:149.19ms
step:252/1480 train_time:36097ms step_avg:149.16ms
step:253/1480 train_time:36246ms step_avg:149.16ms
step:254/1480 train_time:36393ms step_avg:149.15ms
step:255/1480 train_time:36541ms step_avg:149.15ms
step:256/1480 train_time:36688ms step_avg:149.14ms
step:257/1480 train_time:36836ms step_avg:149.13ms
step:258/1480 train_time:36986ms step_avg:149.14ms
step:259/1480 train_time:37134ms step_avg:149.13ms
step:260/1480 train_time:37284ms step_avg:149.13ms
step:261/1480 train_time:37431ms step_avg:149.13ms
step:262/1480 train_time:37580ms step_avg:149.13ms
step:263/1480 train_time:37728ms step_avg:149.12ms
step:264/1480 train_time:37877ms step_avg:149.12ms
step:265/1480 train_time:38026ms step_avg:149.12ms
step:266/1480 train_time:38174ms step_avg:149.12ms
step:267/1480 train_time:38322ms step_avg:149.11ms
step:268/1480 train_time:38469ms step_avg:149.11ms
step:269/1480 train_time:38618ms step_avg:149.10ms
step:270/1480 train_time:38767ms step_avg:149.10ms
step:271/1480 train_time:38914ms step_avg:149.09ms
step:272/1480 train_time:39063ms step_avg:149.09ms
step:273/1480 train_time:39210ms step_avg:149.09ms
step:274/1480 train_time:39359ms step_avg:149.09ms
step:275/1480 train_time:39507ms step_avg:149.08ms
step:276/1480 train_time:39655ms step_avg:149.08ms
step:277/1480 train_time:39805ms step_avg:149.08ms
step:278/1480 train_time:39953ms step_avg:149.08ms
step:279/1480 train_time:40102ms step_avg:149.08ms
step:280/1480 train_time:40251ms step_avg:149.08ms
step:281/1480 train_time:40401ms step_avg:149.08ms
step:282/1480 train_time:40549ms step_avg:149.08ms
step:283/1480 train_time:40698ms step_avg:149.08ms
step:284/1480 train_time:40847ms step_avg:149.08ms
step:285/1480 train_time:40996ms step_avg:149.07ms
step:286/1480 train_time:41143ms step_avg:149.07ms
step:287/1480 train_time:41292ms step_avg:149.07ms
step:288/1480 train_time:41440ms step_avg:149.07ms
step:289/1480 train_time:41589ms step_avg:149.06ms
step:290/1480 train_time:41737ms step_avg:149.06ms
step:291/1480 train_time:41886ms step_avg:149.06ms
step:292/1480 train_time:42034ms step_avg:149.06ms
step:293/1480 train_time:42184ms step_avg:149.06ms
step:294/1480 train_time:42331ms step_avg:149.05ms
step:295/1480 train_time:42481ms step_avg:149.05ms
step:296/1480 train_time:42629ms step_avg:149.05ms
step:297/1480 train_time:42778ms step_avg:149.05ms
step:298/1480 train_time:42927ms step_avg:149.05ms
step:299/1480 train_time:43074ms step_avg:149.05ms
step:300/1480 train_time:43224ms step_avg:149.05ms
step:301/1480 train_time:43371ms step_avg:149.04ms
step:302/1480 train_time:43520ms step_avg:149.04ms
step:303/1480 train_time:43668ms step_avg:149.04ms
step:304/1480 train_time:43816ms step_avg:149.03ms
step:305/1480 train_time:43965ms step_avg:149.03ms
step:306/1480 train_time:44112ms step_avg:149.03ms
step:307/1480 train_time:44261ms step_avg:149.03ms
step:308/1480 train_time:44409ms step_avg:149.03ms
step:309/1480 train_time:44558ms step_avg:149.02ms
step:310/1480 train_time:44707ms step_avg:149.02ms
step:311/1480 train_time:44855ms step_avg:149.02ms
step:312/1480 train_time:45005ms step_avg:149.02ms
step:313/1480 train_time:45152ms step_avg:149.02ms
step:314/1480 train_time:45301ms step_avg:149.02ms
step:315/1480 train_time:45449ms step_avg:149.01ms
step:316/1480 train_time:45598ms step_avg:149.01ms
step:317/1480 train_time:45746ms step_avg:149.01ms
step:318/1480 train_time:45895ms step_avg:149.01ms
step:319/1480 train_time:46044ms step_avg:149.01ms
step:320/1480 train_time:46192ms step_avg:149.01ms
step:321/1480 train_time:46341ms step_avg:149.01ms
step:322/1480 train_time:46490ms step_avg:149.01ms
step:323/1480 train_time:46639ms step_avg:149.01ms
step:324/1480 train_time:46788ms step_avg:149.01ms
step:325/1480 train_time:46937ms step_avg:149.01ms
step:326/1480 train_time:47086ms step_avg:149.01ms
step:327/1480 train_time:47234ms step_avg:149.00ms
step:328/1480 train_time:47383ms step_avg:149.00ms
step:329/1480 train_time:47531ms step_avg:149.00ms
step:330/1480 train_time:47681ms step_avg:149.00ms
step:331/1480 train_time:47831ms step_avg:149.01ms
step:332/1480 train_time:47983ms step_avg:149.02ms
step:333/1480 train_time:48133ms step_avg:149.02ms
step:334/1480 train_time:48285ms step_avg:149.03ms
step:335/1480 train_time:48435ms step_avg:149.03ms
step:336/1480 train_time:48586ms step_avg:149.04ms
step:337/1480 train_time:48737ms step_avg:149.04ms
step:338/1480 train_time:48888ms step_avg:149.05ms
step:339/1480 train_time:49038ms step_avg:149.05ms
step:340/1480 train_time:49189ms step_avg:149.06ms
step:341/1480 train_time:49340ms step_avg:149.06ms
step:342/1480 train_time:49490ms step_avg:149.07ms
step:343/1480 train_time:49641ms step_avg:149.07ms
step:344/1480 train_time:49792ms step_avg:149.08ms
step:345/1480 train_time:49943ms step_avg:149.08ms
step:346/1480 train_time:50093ms step_avg:149.09ms
step:347/1480 train_time:50245ms step_avg:149.09ms
step:348/1480 train_time:50395ms step_avg:149.10ms
step:349/1480 train_time:50546ms step_avg:149.10ms
step:350/1480 train_time:50698ms step_avg:149.11ms
step:351/1480 train_time:50849ms step_avg:149.12ms
step:352/1480 train_time:51000ms step_avg:149.12ms
step:353/1480 train_time:51151ms step_avg:149.13ms
step:354/1480 train_time:51302ms step_avg:149.13ms
step:355/1480 train_time:51452ms step_avg:149.14ms
step:356/1480 train_time:51603ms step_avg:149.14ms
step:357/1480 train_time:51754ms step_avg:149.15ms
step:358/1480 train_time:51906ms step_avg:149.16ms
step:359/1480 train_time:52057ms step_avg:149.16ms
step:360/1480 train_time:52208ms step_avg:149.17ms
step:361/1480 train_time:52359ms step_avg:149.17ms
step:362/1480 train_time:52510ms step_avg:149.18ms
step:363/1480 train_time:52661ms step_avg:149.18ms
step:364/1480 train_time:52811ms step_avg:149.18ms
step:365/1480 train_time:52963ms step_avg:149.19ms
step:366/1480 train_time:53113ms step_avg:149.19ms
step:367/1480 train_time:53264ms step_avg:149.20ms
step:368/1480 train_time:53414ms step_avg:149.20ms
step:369/1480 train_time:53565ms step_avg:149.21ms
step:370/1480 train_time:53715ms step_avg:149.21ms
step:371/1480 train_time:53866ms step_avg:149.21ms
step:372/1480 train_time:54016ms step_avg:149.22ms
step:373/1480 train_time:54168ms step_avg:149.22ms
step:374/1480 train_time:54318ms step_avg:149.23ms
step:375/1480 train_time:54469ms step_avg:149.23ms
step:375/1480 val_loss:3.8111 train_time:54536ms step_avg:149.41ms
step:376/1480 train_time:54626ms step_avg:149.25ms
step:377/1480 train_time:54777ms step_avg:149.25ms
step:378/1480 train_time:54928ms step_avg:149.26ms
step:379/1480 train_time:55101ms step_avg:149.33ms
step:380/1480 train_time:55228ms step_avg:149.26ms
step:381/1480 train_time:55378ms step_avg:149.27ms
step:382/1480 train_time:55528ms step_avg:149.27ms
step:383/1480 train_time:55679ms step_avg:149.27ms
step:384/1480 train_time:55830ms step_avg:149.28ms
step:385/1480 train_time:55982ms step_avg:149.28ms
step:386/1480 train_time:56131ms step_avg:149.29ms
step:387/1480 train_time:56283ms step_avg:149.29ms
step:388/1480 train_time:56433ms step_avg:149.29ms
step:389/1480 train_time:56584ms step_avg:149.30ms
step:390/1480 train_time:56734ms step_avg:149.30ms
step:391/1480 train_time:56886ms step_avg:149.31ms
step:392/1480 train_time:57037ms step_avg:149.31ms
step:393/1480 train_time:57187ms step_avg:149.31ms
step:394/1480 train_time:57338ms step_avg:149.32ms
step:395/1480 train_time:57488ms step_avg:149.32ms
step:396/1480 train_time:57640ms step_avg:149.33ms
step:397/1480 train_time:57789ms step_avg:149.33ms
step:398/1480 train_time:57941ms step_avg:149.33ms
step:399/1480 train_time:58091ms step_avg:149.33ms
step:400/1480 train_time:58243ms step_avg:149.34ms
step:401/1480 train_time:58394ms step_avg:149.35ms
step:402/1480 train_time:58545ms step_avg:149.35ms
step:403/1480 train_time:58697ms step_avg:149.36ms
step:404/1480 train_time:58848ms step_avg:149.36ms
step:405/1480 train_time:58999ms step_avg:149.37ms
step:406/1480 train_time:59150ms step_avg:149.37ms
step:407/1480 train_time:59302ms step_avg:149.38ms
step:408/1480 train_time:59453ms step_avg:149.38ms
step:409/1480 train_time:59604ms step_avg:149.38ms
step:410/1480 train_time:59754ms step_avg:149.39ms
step:411/1480 train_time:59905ms step_avg:149.39ms
step:412/1480 train_time:60057ms step_avg:149.39ms
step:413/1480 train_time:60207ms step_avg:149.40ms
step:414/1480 train_time:60358ms step_avg:149.40ms
step:415/1480 train_time:60509ms step_avg:149.40ms
step:416/1480 train_time:60660ms step_avg:149.41ms
step:417/1480 train_time:60810ms step_avg:149.41ms
step:418/1480 train_time:60961ms step_avg:149.41ms
step:419/1480 train_time:61113ms step_avg:149.42ms
step:420/1480 train_time:61264ms step_avg:149.42ms
step:421/1480 train_time:61416ms step_avg:149.43ms
step:422/1480 train_time:61567ms step_avg:149.43ms
step:423/1480 train_time:61719ms step_avg:149.44ms
step:424/1480 train_time:61869ms step_avg:149.44ms
step:425/1480 train_time:62020ms step_avg:149.45ms
step:426/1480 train_time:62171ms step_avg:149.45ms
step:427/1480 train_time:62322ms step_avg:149.45ms
step:428/1480 train_time:62472ms step_avg:149.46ms
step:429/1480 train_time:62625ms step_avg:149.46ms
step:430/1480 train_time:62776ms step_avg:149.47ms
step:431/1480 train_time:62927ms step_avg:149.47ms
step:432/1480 train_time:63078ms step_avg:149.47ms
step:433/1480 train_time:63227ms step_avg:149.47ms
step:434/1480 train_time:63378ms step_avg:149.48ms
step:435/1480 train_time:63529ms step_avg:149.48ms
step:436/1480 train_time:63681ms step_avg:149.49ms
step:437/1480 train_time:63831ms step_avg:149.49ms
step:438/1480 train_time:63982ms step_avg:149.49ms
step:439/1480 train_time:64131ms step_avg:149.49ms
step:440/1480 train_time:64284ms step_avg:149.50ms
step:441/1480 train_time:64436ms step_avg:149.50ms
step:442/1480 train_time:64588ms step_avg:149.51ms
step:443/1480 train_time:64742ms step_avg:149.52ms
step:444/1480 train_time:64895ms step_avg:149.53ms
step:445/1480 train_time:65048ms step_avg:149.53ms
step:446/1480 train_time:65200ms step_avg:149.54ms
step:447/1480 train_time:65352ms step_avg:149.55ms
step:448/1480 train_time:65505ms step_avg:149.56ms
step:449/1480 train_time:65659ms step_avg:149.56ms
step:450/1480 train_time:65811ms step_avg:149.57ms
step:451/1480 train_time:65963ms step_avg:149.58ms
step:452/1480 train_time:66117ms step_avg:149.59ms
step:453/1480 train_time:66270ms step_avg:149.59ms
step:454/1480 train_time:66424ms step_avg:149.60ms
step:455/1480 train_time:66577ms step_avg:149.61ms
step:456/1480 train_time:66729ms step_avg:149.62ms
step:457/1480 train_time:66883ms step_avg:149.63ms
step:458/1480 train_time:67035ms step_avg:149.63ms
step:459/1480 train_time:67188ms step_avg:149.64ms
step:460/1480 train_time:67341ms step_avg:149.65ms
step:461/1480 train_time:67493ms step_avg:149.65ms
step:462/1480 train_time:67645ms step_avg:149.66ms
step:463/1480 train_time:67798ms step_avg:149.66ms
step:464/1480 train_time:67951ms step_avg:149.67ms
step:465/1480 train_time:68104ms step_avg:149.68ms
step:466/1480 train_time:68257ms step_avg:149.69ms
step:467/1480 train_time:68411ms step_avg:149.70ms
step:468/1480 train_time:68563ms step_avg:149.70ms
step:469/1480 train_time:68717ms step_avg:149.71ms
step:470/1480 train_time:68870ms step_avg:149.72ms
step:471/1480 train_time:69023ms step_avg:149.72ms
step:472/1480 train_time:69176ms step_avg:149.73ms
step:473/1480 train_time:69328ms step_avg:149.74ms
step:474/1480 train_time:69482ms step_avg:149.75ms
step:475/1480 train_time:69634ms step_avg:149.75ms
step:476/1480 train_time:69787ms step_avg:149.76ms
step:477/1480 train_time:69941ms step_avg:149.77ms
step:478/1480 train_time:70093ms step_avg:149.77ms
step:479/1480 train_time:70246ms step_avg:149.78ms
step:480/1480 train_time:70399ms step_avg:149.79ms
step:481/1480 train_time:70552ms step_avg:149.79ms
step:482/1480 train_time:70704ms step_avg:149.80ms
step:483/1480 train_time:70857ms step_avg:149.80ms
step:484/1480 train_time:71010ms step_avg:149.81ms
step:485/1480 train_time:71164ms step_avg:149.82ms
step:486/1480 train_time:71317ms step_avg:149.83ms
step:487/1480 train_time:71471ms step_avg:149.83ms
step:488/1480 train_time:71624ms step_avg:149.84ms
step:489/1480 train_time:71777ms step_avg:149.85ms
step:490/1480 train_time:71929ms step_avg:149.85ms
step:491/1480 train_time:72083ms step_avg:149.86ms
step:492/1480 train_time:72234ms step_avg:149.86ms
step:493/1480 train_time:72387ms step_avg:149.87ms
step:494/1480 train_time:72540ms step_avg:149.88ms
step:495/1480 train_time:72693ms step_avg:149.88ms
step:496/1480 train_time:72847ms step_avg:149.89ms
step:497/1480 train_time:73001ms step_avg:149.90ms
step:498/1480 train_time:73153ms step_avg:149.90ms
step:499/1480 train_time:73306ms step_avg:149.91ms
step:500/1480 train_time:73459ms step_avg:149.92ms
step:500/1480 val_loss:3.6889 train_time:73527ms step_avg:150.06ms
step:501/1480 train_time:73618ms step_avg:149.94ms
step:502/1480 train_time:73770ms step_avg:149.94ms
step:503/1480 train_time:73923ms step_avg:149.95ms
step:504/1480 train_time:74075ms step_avg:149.95ms
step:505/1480 train_time:74228ms step_avg:149.96ms
step:506/1480 train_time:74380ms step_avg:149.96ms
step:507/1480 train_time:74534ms step_avg:149.97ms
step:508/1480 train_time:74687ms step_avg:149.97ms
step:509/1480 train_time:74841ms step_avg:149.98ms
step:510/1480 train_time:74994ms step_avg:149.99ms
step:511/1480 train_time:75146ms step_avg:149.99ms
step:512/1480 train_time:75300ms step_avg:150.00ms
step:513/1480 train_time:75454ms step_avg:150.01ms
step:514/1480 train_time:75608ms step_avg:150.02ms
step:515/1480 train_time:75762ms step_avg:150.02ms
step:516/1480 train_time:75916ms step_avg:150.03ms
step:517/1480 train_time:76068ms step_avg:150.04ms
step:518/1480 train_time:76220ms step_avg:150.04ms
step:519/1480 train_time:76373ms step_avg:150.04ms
step:520/1480 train_time:76528ms step_avg:150.05ms
step:521/1480 train_time:76681ms step_avg:150.06ms
step:522/1480 train_time:76834ms step_avg:150.07ms
step:523/1480 train_time:76988ms step_avg:150.07ms
step:524/1480 train_time:77140ms step_avg:150.08ms
step:525/1480 train_time:77293ms step_avg:150.08ms
step:526/1480 train_time:77446ms step_avg:150.09ms
step:527/1480 train_time:77598ms step_avg:150.09ms
step:528/1480 train_time:77751ms step_avg:150.10ms
step:529/1480 train_time:77903ms step_avg:150.10ms
step:530/1480 train_time:78058ms step_avg:150.11ms
step:531/1480 train_time:78212ms step_avg:150.12ms
step:532/1480 train_time:78363ms step_avg:150.12ms
step:533/1480 train_time:78516ms step_avg:150.13ms
step:534/1480 train_time:78668ms step_avg:150.13ms
step:535/1480 train_time:78821ms step_avg:150.13ms
step:536/1480 train_time:78973ms step_avg:150.14ms
step:537/1480 train_time:79128ms step_avg:150.15ms
step:538/1480 train_time:79282ms step_avg:150.15ms
step:539/1480 train_time:79435ms step_avg:150.16ms
step:540/1480 train_time:79588ms step_avg:150.17ms
step:541/1480 train_time:79740ms step_avg:150.17ms
step:542/1480 train_time:79893ms step_avg:150.17ms
step:543/1480 train_time:80045ms step_avg:150.18ms
step:544/1480 train_time:80197ms step_avg:150.18ms
step:545/1480 train_time:80351ms step_avg:150.19ms
step:546/1480 train_time:80504ms step_avg:150.19ms
step:547/1480 train_time:80657ms step_avg:150.20ms
step:548/1480 train_time:80810ms step_avg:150.20ms
step:549/1480 train_time:80962ms step_avg:150.21ms
step:550/1480 train_time:81116ms step_avg:150.22ms
step:551/1480 train_time:81271ms step_avg:150.22ms
step:552/1480 train_time:81426ms step_avg:150.23ms
step:553/1480 train_time:81582ms step_avg:150.24ms
step:554/1480 train_time:81737ms step_avg:150.25ms
step:555/1480 train_time:81891ms step_avg:150.26ms
step:556/1480 train_time:82045ms step_avg:150.27ms
step:557/1480 train_time:82200ms step_avg:150.27ms
step:558/1480 train_time:82356ms step_avg:150.28ms
step:559/1480 train_time:82510ms step_avg:150.29ms
step:560/1480 train_time:82664ms step_avg:150.30ms
step:561/1480 train_time:82820ms step_avg:150.31ms
step:562/1480 train_time:82973ms step_avg:150.31ms
step:563/1480 train_time:83128ms step_avg:150.32ms
step:564/1480 train_time:83284ms step_avg:150.33ms
step:565/1480 train_time:83438ms step_avg:150.34ms
step:566/1480 train_time:83594ms step_avg:150.35ms
step:567/1480 train_time:83748ms step_avg:150.36ms
step:568/1480 train_time:83902ms step_avg:150.36ms
step:569/1480 train_time:84078ms step_avg:150.41ms
step:570/1480 train_time:84212ms step_avg:150.38ms
step:571/1480 train_time:84366ms step_avg:150.39ms
step:572/1480 train_time:84521ms step_avg:150.39ms
step:573/1480 train_time:84675ms step_avg:150.40ms
step:574/1480 train_time:84831ms step_avg:150.41ms
step:575/1480 train_time:84986ms step_avg:150.42ms
step:576/1480 train_time:85142ms step_avg:150.43ms
step:577/1480 train_time:85296ms step_avg:150.43ms
step:578/1480 train_time:85450ms step_avg:150.44ms
step:579/1480 train_time:85604ms step_avg:150.45ms
step:580/1480 train_time:85758ms step_avg:150.45ms
step:581/1480 train_time:85913ms step_avg:150.46ms
step:582/1480 train_time:86068ms step_avg:150.47ms
step:583/1480 train_time:86222ms step_avg:150.47ms
step:584/1480 train_time:86376ms step_avg:150.48ms
step:585/1480 train_time:86531ms step_avg:150.49ms
step:586/1480 train_time:86685ms step_avg:150.50ms
step:587/1480 train_time:86839ms step_avg:150.50ms
step:588/1480 train_time:86994ms step_avg:150.51ms
step:589/1480 train_time:87149ms step_avg:150.52ms
step:590/1480 train_time:87304ms step_avg:150.52ms
step:591/1480 train_time:87459ms step_avg:150.53ms
step:592/1480 train_time:87614ms step_avg:150.54ms
step:593/1480 train_time:87769ms step_avg:150.55ms
step:594/1480 train_time:87924ms step_avg:150.56ms
step:595/1480 train_time:88079ms step_avg:150.56ms
step:596/1480 train_time:88235ms step_avg:150.57ms
step:597/1480 train_time:88390ms step_avg:150.58ms
step:598/1480 train_time:88544ms step_avg:150.59ms
step:599/1480 train_time:88699ms step_avg:150.59ms
step:600/1480 train_time:88855ms step_avg:150.60ms
step:601/1480 train_time:89011ms step_avg:150.61ms
step:602/1480 train_time:89165ms step_avg:150.62ms
step:603/1480 train_time:89319ms step_avg:150.62ms
step:604/1480 train_time:89473ms step_avg:150.63ms
step:605/1480 train_time:89629ms step_avg:150.64ms
step:606/1480 train_time:89784ms step_avg:150.64ms
step:607/1480 train_time:89939ms step_avg:150.65ms
step:608/1480 train_time:90094ms step_avg:150.66ms
step:609/1480 train_time:90249ms step_avg:150.67ms
step:610/1480 train_time:90403ms step_avg:150.67ms
step:611/1480 train_time:90557ms step_avg:150.68ms
step:612/1480 train_time:90712ms step_avg:150.68ms
step:613/1480 train_time:90867ms step_avg:150.69ms
step:614/1480 train_time:91024ms step_avg:150.70ms
step:615/1480 train_time:91178ms step_avg:150.71ms
step:616/1480 train_time:91333ms step_avg:150.71ms
step:617/1480 train_time:91488ms step_avg:150.72ms
step:618/1480 train_time:91643ms step_avg:150.73ms
step:619/1480 train_time:91797ms step_avg:150.73ms
step:620/1480 train_time:91952ms step_avg:150.74ms
step:621/1480 train_time:92109ms step_avg:150.75ms
step:622/1480 train_time:92264ms step_avg:150.76ms
step:623/1480 train_time:92418ms step_avg:150.76ms
step:624/1480 train_time:92573ms step_avg:150.77ms
step:625/1480 train_time:92728ms step_avg:150.78ms
step:625/1480 val_loss:3.6079 train_time:92800ms step_avg:150.90ms
step:626/1480 train_time:92892ms step_avg:150.80ms
step:627/1480 train_time:93045ms step_avg:150.80ms
step:628/1480 train_time:93200ms step_avg:150.81ms
step:629/1480 train_time:93354ms step_avg:150.81ms
step:630/1480 train_time:93508ms step_avg:150.82ms
step:631/1480 train_time:93663ms step_avg:150.83ms
step:632/1480 train_time:93817ms step_avg:150.83ms
step:633/1480 train_time:93972ms step_avg:150.84ms
step:634/1480 train_time:94127ms step_avg:150.84ms
step:635/1480 train_time:94281ms step_avg:150.85ms
step:636/1480 train_time:94435ms step_avg:150.85ms
step:637/1480 train_time:94591ms step_avg:150.86ms
step:638/1480 train_time:94745ms step_avg:150.87ms
step:639/1480 train_time:94899ms step_avg:150.87ms
step:640/1480 train_time:95054ms step_avg:150.88ms
step:641/1480 train_time:95208ms step_avg:150.88ms
step:642/1480 train_time:95364ms step_avg:150.89ms
step:643/1480 train_time:95518ms step_avg:150.90ms
step:644/1480 train_time:95673ms step_avg:150.90ms
step:645/1480 train_time:95828ms step_avg:150.91ms
step:646/1480 train_time:95983ms step_avg:150.92ms
step:647/1480 train_time:96138ms step_avg:150.92ms
step:648/1480 train_time:96293ms step_avg:150.93ms
step:649/1480 train_time:96447ms step_avg:150.93ms
step:650/1480 train_time:96603ms step_avg:150.94ms
step:651/1480 train_time:96758ms step_avg:150.95ms
step:652/1480 train_time:96913ms step_avg:150.95ms
step:653/1480 train_time:97067ms step_avg:150.96ms
step:654/1480 train_time:97222ms step_avg:150.97ms
step:655/1480 train_time:97376ms step_avg:150.97ms
step:656/1480 train_time:97531ms step_avg:150.98ms
step:657/1480 train_time:97686ms step_avg:150.98ms
step:658/1480 train_time:97840ms step_avg:150.99ms
step:659/1480 train_time:97996ms step_avg:151.00ms
step:660/1480 train_time:98153ms step_avg:151.00ms
step:661/1480 train_time:98310ms step_avg:151.01ms
step:662/1480 train_time:98467ms step_avg:151.02ms
step:663/1480 train_time:98621ms step_avg:151.03ms
step:664/1480 train_time:98778ms step_avg:151.04ms
step:665/1480 train_time:98935ms step_avg:151.05ms
step:666/1480 train_time:99090ms step_avg:151.05ms
step:667/1480 train_time:99247ms step_avg:151.06ms
step:668/1480 train_time:99404ms step_avg:151.07ms
step:669/1480 train_time:99562ms step_avg:151.08ms
step:670/1480 train_time:99718ms step_avg:151.09ms
step:671/1480 train_time:99873ms step_avg:151.09ms
step:672/1480 train_time:100030ms step_avg:151.10ms
step:673/1480 train_time:100187ms step_avg:151.11ms
step:674/1480 train_time:100344ms step_avg:151.12ms
step:675/1480 train_time:100500ms step_avg:151.13ms
step:676/1480 train_time:100657ms step_avg:151.14ms
step:677/1480 train_time:100814ms step_avg:151.15ms
step:678/1480 train_time:100969ms step_avg:151.15ms
step:679/1480 train_time:101126ms step_avg:151.16ms
step:680/1480 train_time:101283ms step_avg:151.17ms
step:681/1480 train_time:101438ms step_avg:151.17ms
step:682/1480 train_time:101596ms step_avg:151.18ms
step:683/1480 train_time:101752ms step_avg:151.19ms
step:684/1480 train_time:101908ms step_avg:151.20ms
step:685/1480 train_time:102065ms step_avg:151.21ms
step:686/1480 train_time:102222ms step_avg:151.22ms
step:687/1480 train_time:102378ms step_avg:151.22ms
step:688/1480 train_time:102535ms step_avg:151.23ms
step:689/1480 train_time:102692ms step_avg:151.24ms
step:690/1480 train_time:102851ms step_avg:151.25ms
step:691/1480 train_time:103008ms step_avg:151.26ms
step:692/1480 train_time:103165ms step_avg:151.27ms
step:693/1480 train_time:103321ms step_avg:151.27ms
step:694/1480 train_time:103477ms step_avg:151.28ms
step:695/1480 train_time:103632ms step_avg:151.29ms
step:696/1480 train_time:103789ms step_avg:151.30ms
step:697/1480 train_time:103944ms step_avg:151.30ms
step:698/1480 train_time:104100ms step_avg:151.31ms
step:699/1480 train_time:104256ms step_avg:151.31ms
step:700/1480 train_time:104413ms step_avg:151.32ms
step:701/1480 train_time:104568ms step_avg:151.33ms
step:702/1480 train_time:104725ms step_avg:151.34ms
step:703/1480 train_time:104883ms step_avg:151.35ms
step:704/1480 train_time:105038ms step_avg:151.35ms
step:705/1480 train_time:105196ms step_avg:151.36ms
step:706/1480 train_time:105354ms step_avg:151.37ms
step:707/1480 train_time:105512ms step_avg:151.38ms
step:708/1480 train_time:105667ms step_avg:151.39ms
step:709/1480 train_time:105823ms step_avg:151.39ms
step:710/1480 train_time:105978ms step_avg:151.40ms
step:711/1480 train_time:106135ms step_avg:151.40ms
step:712/1480 train_time:106293ms step_avg:151.41ms
step:713/1480 train_time:106451ms step_avg:151.42ms
step:714/1480 train_time:106607ms step_avg:151.43ms
step:715/1480 train_time:106763ms step_avg:151.44ms
step:716/1480 train_time:106919ms step_avg:151.44ms
step:717/1480 train_time:107075ms step_avg:151.45ms
step:718/1480 train_time:107232ms step_avg:151.46ms
step:719/1480 train_time:107387ms step_avg:151.46ms
step:720/1480 train_time:107544ms step_avg:151.47ms
step:721/1480 train_time:107700ms step_avg:151.48ms
step:722/1480 train_time:107856ms step_avg:151.48ms
step:723/1480 train_time:108013ms step_avg:151.49ms
step:724/1480 train_time:108169ms step_avg:151.50ms
step:725/1480 train_time:108326ms step_avg:151.50ms
step:726/1480 train_time:108482ms step_avg:151.51ms
step:727/1480 train_time:108639ms step_avg:151.52ms
step:728/1480 train_time:108795ms step_avg:151.53ms
step:729/1480 train_time:108951ms step_avg:151.53ms
step:730/1480 train_time:109109ms step_avg:151.54ms
step:731/1480 train_time:109266ms step_avg:151.55ms
step:732/1480 train_time:109422ms step_avg:151.55ms
step:733/1480 train_time:109578ms step_avg:151.56ms
step:734/1480 train_time:109734ms step_avg:151.57ms
step:735/1480 train_time:109891ms step_avg:151.57ms
step:736/1480 train_time:110048ms step_avg:151.58ms
step:737/1480 train_time:110204ms step_avg:151.59ms
step:738/1480 train_time:110359ms step_avg:151.59ms
step:739/1480 train_time:110516ms step_avg:151.60ms
step:740/1480 train_time:110675ms step_avg:151.61ms
step:741/1480 train_time:110833ms step_avg:151.62ms
step:742/1480 train_time:110989ms step_avg:151.62ms
step:743/1480 train_time:111145ms step_avg:151.63ms
step:744/1480 train_time:111301ms step_avg:151.64ms
step:745/1480 train_time:111459ms step_avg:151.64ms
step:746/1480 train_time:111615ms step_avg:151.65ms
step:747/1480 train_time:111771ms step_avg:151.66ms
step:748/1480 train_time:111931ms step_avg:151.67ms
step:749/1480 train_time:112087ms step_avg:151.67ms
step:750/1480 train_time:112242ms step_avg:151.68ms
step:750/1480 val_loss:3.5528 train_time:112314ms step_avg:151.78ms
step:751/1480 train_time:112405ms step_avg:151.69ms
step:752/1480 train_time:112560ms step_avg:151.70ms
step:753/1480 train_time:112717ms step_avg:151.71ms
step:754/1480 train_time:112873ms step_avg:151.71ms
step:755/1480 train_time:113028ms step_avg:151.72ms
step:756/1480 train_time:113183ms step_avg:151.72ms
step:757/1480 train_time:113342ms step_avg:151.73ms
step:758/1480 train_time:113499ms step_avg:151.74ms
step:759/1480 train_time:113673ms step_avg:151.77ms
step:760/1480 train_time:113815ms step_avg:151.75ms
step:761/1480 train_time:113970ms step_avg:151.76ms
step:762/1480 train_time:114126ms step_avg:151.76ms
step:763/1480 train_time:114282ms step_avg:151.77ms
step:764/1480 train_time:114440ms step_avg:151.78ms
step:765/1480 train_time:114597ms step_avg:151.78ms
step:766/1480 train_time:114755ms step_avg:151.79ms
step:767/1480 train_time:114912ms step_avg:151.80ms
step:768/1480 train_time:115069ms step_avg:151.81ms
step:769/1480 train_time:115225ms step_avg:151.81ms
step:770/1480 train_time:115383ms step_avg:151.82ms
step:771/1480 train_time:115542ms step_avg:151.83ms
step:772/1480 train_time:115699ms step_avg:151.84ms
step:773/1480 train_time:115856ms step_avg:151.84ms
step:774/1480 train_time:116014ms step_avg:151.85ms
step:775/1480 train_time:116173ms step_avg:151.86ms
step:776/1480 train_time:116331ms step_avg:151.87ms
step:777/1480 train_time:116492ms step_avg:151.88ms
step:778/1480 train_time:116653ms step_avg:151.89ms
step:779/1480 train_time:116810ms step_avg:151.90ms
step:780/1480 train_time:116968ms step_avg:151.91ms
step:781/1480 train_time:117125ms step_avg:151.91ms
step:782/1480 train_time:117283ms step_avg:151.92ms
step:783/1480 train_time:117440ms step_avg:151.93ms
step:784/1480 train_time:117601ms step_avg:151.94ms
step:785/1480 train_time:117758ms step_avg:151.95ms
step:786/1480 train_time:117916ms step_avg:151.95ms
step:787/1480 train_time:118074ms step_avg:151.96ms
step:788/1480 train_time:118233ms step_avg:151.97ms
step:789/1480 train_time:118390ms step_avg:151.98ms
step:790/1480 train_time:118547ms step_avg:151.98ms
step:791/1480 train_time:118707ms step_avg:151.99ms
step:792/1480 train_time:118864ms step_avg:152.00ms
step:793/1480 train_time:119021ms step_avg:152.01ms
step:794/1480 train_time:119180ms step_avg:152.01ms
step:795/1480 train_time:119338ms step_avg:152.02ms
step:796/1480 train_time:119497ms step_avg:152.03ms
step:797/1480 train_time:119655ms step_avg:152.04ms
step:798/1480 train_time:119815ms step_avg:152.05ms
step:799/1480 train_time:119978ms step_avg:152.06ms
step:800/1480 train_time:120136ms step_avg:152.07ms
step:801/1480 train_time:120294ms step_avg:152.08ms
step:802/1480 train_time:120452ms step_avg:152.09ms
step:803/1480 train_time:120611ms step_avg:152.09ms
step:804/1480 train_time:120768ms step_avg:152.10ms
step:805/1480 train_time:120926ms step_avg:152.11ms
step:806/1480 train_time:121083ms step_avg:152.11ms
step:807/1480 train_time:121240ms step_avg:152.12ms
step:808/1480 train_time:121399ms step_avg:152.13ms
step:809/1480 train_time:121556ms step_avg:152.13ms
step:810/1480 train_time:121713ms step_avg:152.14ms
step:811/1480 train_time:121871ms step_avg:152.15ms
step:812/1480 train_time:122029ms step_avg:152.16ms
step:813/1480 train_time:122186ms step_avg:152.16ms
step:814/1480 train_time:122343ms step_avg:152.17ms
step:815/1480 train_time:122501ms step_avg:152.17ms
step:816/1480 train_time:122660ms step_avg:152.18ms
step:817/1480 train_time:122818ms step_avg:152.19ms
step:818/1480 train_time:122975ms step_avg:152.20ms
step:819/1480 train_time:123133ms step_avg:152.20ms
step:820/1480 train_time:123293ms step_avg:152.21ms
step:821/1480 train_time:123451ms step_avg:152.22ms
step:822/1480 train_time:123608ms step_avg:152.23ms
step:823/1480 train_time:123767ms step_avg:152.23ms
step:824/1480 train_time:123923ms step_avg:152.24ms
step:825/1480 train_time:124082ms step_avg:152.25ms
step:826/1480 train_time:124242ms step_avg:152.26ms
step:827/1480 train_time:124400ms step_avg:152.26ms
step:828/1480 train_time:124558ms step_avg:152.27ms
step:829/1480 train_time:124717ms step_avg:152.28ms
step:830/1480 train_time:124876ms step_avg:152.29ms
step:831/1480 train_time:125034ms step_avg:152.29ms
step:832/1480 train_time:125193ms step_avg:152.30ms
step:833/1480 train_time:125351ms step_avg:152.31ms
step:834/1480 train_time:125512ms step_avg:152.32ms
step:835/1480 train_time:125669ms step_avg:152.33ms
step:836/1480 train_time:125828ms step_avg:152.33ms
step:837/1480 train_time:125984ms step_avg:152.34ms
step:838/1480 train_time:126143ms step_avg:152.35ms
step:839/1480 train_time:126300ms step_avg:152.35ms
step:840/1480 train_time:126457ms step_avg:152.36ms
step:841/1480 train_time:126615ms step_avg:152.36ms
step:842/1480 train_time:126774ms step_avg:152.37ms
step:843/1480 train_time:126931ms step_avg:152.38ms
step:844/1480 train_time:127087ms step_avg:152.38ms
step:845/1480 train_time:127243ms step_avg:152.39ms
step:846/1480 train_time:127404ms step_avg:152.40ms
step:847/1480 train_time:127562ms step_avg:152.40ms
step:848/1480 train_time:127720ms step_avg:152.41ms
step:849/1480 train_time:127879ms step_avg:152.42ms
step:850/1480 train_time:128037ms step_avg:152.43ms
step:851/1480 train_time:128196ms step_avg:152.43ms
step:852/1480 train_time:128354ms step_avg:152.44ms
step:853/1480 train_time:128512ms step_avg:152.45ms
step:854/1480 train_time:128669ms step_avg:152.45ms
step:855/1480 train_time:128825ms step_avg:152.46ms
step:856/1480 train_time:128983ms step_avg:152.46ms
step:857/1480 train_time:129141ms step_avg:152.47ms
step:858/1480 train_time:129300ms step_avg:152.48ms
step:859/1480 train_time:129458ms step_avg:152.48ms
step:860/1480 train_time:129616ms step_avg:152.49ms
step:861/1480 train_time:129775ms step_avg:152.50ms
step:862/1480 train_time:129936ms step_avg:152.51ms
step:863/1480 train_time:130096ms step_avg:152.52ms
step:864/1480 train_time:130256ms step_avg:152.52ms
step:865/1480 train_time:130413ms step_avg:152.53ms
step:866/1480 train_time:130572ms step_avg:152.54ms
step:867/1480 train_time:130731ms step_avg:152.54ms
step:868/1480 train_time:130888ms step_avg:152.55ms
step:869/1480 train_time:131044ms step_avg:152.55ms
step:870/1480 train_time:131203ms step_avg:152.56ms
step:871/1480 train_time:131360ms step_avg:152.57ms
step:872/1480 train_time:131519ms step_avg:152.57ms
step:873/1480 train_time:131677ms step_avg:152.58ms
step:874/1480 train_time:131837ms step_avg:152.59ms
step:875/1480 train_time:131996ms step_avg:152.60ms
step:875/1480 val_loss:3.5071 train_time:132068ms step_avg:152.68ms
step:876/1480 train_time:132160ms step_avg:152.61ms
step:877/1480 train_time:132316ms step_avg:152.61ms
step:878/1480 train_time:132474ms step_avg:152.62ms
step:879/1480 train_time:132633ms step_avg:152.63ms
step:880/1480 train_time:132792ms step_avg:152.63ms
step:881/1480 train_time:132950ms step_avg:152.64ms
step:882/1480 train_time:133110ms step_avg:152.65ms
step:883/1480 train_time:133272ms step_avg:152.66ms
step:884/1480 train_time:133434ms step_avg:152.67ms
step:885/1480 train_time:133594ms step_avg:152.68ms
step:886/1480 train_time:133757ms step_avg:152.69ms
step:887/1480 train_time:133916ms step_avg:152.70ms
step:888/1480 train_time:134081ms step_avg:152.71ms
step:889/1480 train_time:134241ms step_avg:152.72ms
step:890/1480 train_time:134400ms step_avg:152.73ms
step:891/1480 train_time:134558ms step_avg:152.73ms
step:892/1480 train_time:134717ms step_avg:152.74ms
step:893/1480 train_time:134876ms step_avg:152.75ms
step:894/1480 train_time:135036ms step_avg:152.76ms
step:895/1480 train_time:135199ms step_avg:152.77ms
step:896/1480 train_time:135357ms step_avg:152.77ms
step:897/1480 train_time:135517ms step_avg:152.78ms
step:898/1480 train_time:135677ms step_avg:152.79ms
step:899/1480 train_time:135836ms step_avg:152.80ms
step:900/1480 train_time:135995ms step_avg:152.80ms
step:901/1480 train_time:136156ms step_avg:152.81ms
step:902/1480 train_time:136312ms step_avg:152.82ms
step:903/1480 train_time:136475ms step_avg:152.83ms
step:904/1480 train_time:136635ms step_avg:152.84ms
step:905/1480 train_time:136793ms step_avg:152.84ms
step:906/1480 train_time:136953ms step_avg:152.85ms
step:907/1480 train_time:137117ms step_avg:152.86ms
step:908/1480 train_time:137275ms step_avg:152.87ms
step:909/1480 train_time:137436ms step_avg:152.88ms
step:910/1480 train_time:137601ms step_avg:152.89ms
step:911/1480 train_time:137759ms step_avg:152.90ms
step:912/1480 train_time:137918ms step_avg:152.90ms
step:913/1480 train_time:138078ms step_avg:152.91ms
step:914/1480 train_time:138238ms step_avg:152.92ms
step:915/1480 train_time:138401ms step_avg:152.93ms
step:916/1480 train_time:138562ms step_avg:152.94ms
step:917/1480 train_time:138720ms step_avg:152.94ms
step:918/1480 train_time:138882ms step_avg:152.95ms
step:919/1480 train_time:139043ms step_avg:152.96ms
step:920/1480 train_time:139202ms step_avg:152.97ms
step:921/1480 train_time:139361ms step_avg:152.98ms
step:922/1480 train_time:139524ms step_avg:152.99ms
step:923/1480 train_time:139683ms step_avg:152.99ms
step:924/1480 train_time:139842ms step_avg:153.00ms
step:925/1480 train_time:140002ms step_avg:153.01ms
step:926/1480 train_time:140159ms step_avg:153.01ms
step:927/1480 train_time:140317ms step_avg:153.02ms
step:928/1480 train_time:140476ms step_avg:153.02ms
step:929/1480 train_time:140635ms step_avg:153.03ms
step:930/1480 train_time:140796ms step_avg:153.04ms
step:931/1480 train_time:140955ms step_avg:153.05ms
step:932/1480 train_time:141114ms step_avg:153.05ms
step:933/1480 train_time:141274ms step_avg:153.06ms
step:934/1480 train_time:141433ms step_avg:153.07ms
step:935/1480 train_time:141593ms step_avg:153.07ms
step:936/1480 train_time:141753ms step_avg:153.08ms
step:937/1480 train_time:141915ms step_avg:153.09ms
step:938/1480 train_time:142073ms step_avg:153.10ms
step:939/1480 train_time:142235ms step_avg:153.11ms
step:940/1480 train_time:142397ms step_avg:153.12ms
step:941/1480 train_time:142555ms step_avg:153.12ms
step:942/1480 train_time:142714ms step_avg:153.13ms
step:943/1480 train_time:142876ms step_avg:153.14ms
step:944/1480 train_time:143038ms step_avg:153.15ms
step:945/1480 train_time:143197ms step_avg:153.15ms
step:946/1480 train_time:143360ms step_avg:153.16ms
step:947/1480 train_time:143520ms step_avg:153.17ms
step:948/1480 train_time:143679ms step_avg:153.18ms
step:949/1480 train_time:143855ms step_avg:153.20ms
step:950/1480 train_time:143997ms step_avg:153.19ms
step:951/1480 train_time:144159ms step_avg:153.20ms
step:952/1480 train_time:144318ms step_avg:153.20ms
step:953/1480 train_time:144478ms step_avg:153.21ms
step:954/1480 train_time:144637ms step_avg:153.22ms
step:955/1480 train_time:144797ms step_avg:153.22ms
step:956/1480 train_time:144956ms step_avg:153.23ms
step:957/1480 train_time:145117ms step_avg:153.24ms
step:958/1480 train_time:145281ms step_avg:153.25ms
step:959/1480 train_time:145439ms step_avg:153.25ms
step:960/1480 train_time:145599ms step_avg:153.26ms
step:961/1480 train_time:145760ms step_avg:153.27ms
step:962/1480 train_time:145917ms step_avg:153.27ms
step:963/1480 train_time:146077ms step_avg:153.28ms
step:964/1480 train_time:146239ms step_avg:153.29ms
step:965/1480 train_time:146398ms step_avg:153.30ms
step:966/1480 train_time:146556ms step_avg:153.30ms
step:967/1480 train_time:146714ms step_avg:153.31ms
step:968/1480 train_time:146873ms step_avg:153.31ms
step:969/1480 train_time:147035ms step_avg:153.32ms
step:970/1480 train_time:147193ms step_avg:153.33ms
step:971/1480 train_time:147354ms step_avg:153.33ms
step:972/1480 train_time:147512ms step_avg:153.34ms
step:973/1480 train_time:147670ms step_avg:153.34ms
step:974/1480 train_time:147830ms step_avg:153.35ms
step:975/1480 train_time:147991ms step_avg:153.36ms
step:976/1480 train_time:148153ms step_avg:153.37ms
step:977/1480 train_time:148312ms step_avg:153.37ms
step:978/1480 train_time:148472ms step_avg:153.38ms
step:979/1480 train_time:148633ms step_avg:153.39ms
step:980/1480 train_time:148793ms step_avg:153.39ms
step:981/1480 train_time:148953ms step_avg:153.40ms
step:982/1480 train_time:149112ms step_avg:153.41ms
step:983/1480 train_time:149272ms step_avg:153.41ms
step:984/1480 train_time:149432ms step_avg:153.42ms
step:985/1480 train_time:149593ms step_avg:153.43ms
step:986/1480 train_time:149754ms step_avg:153.44ms
step:987/1480 train_time:149913ms step_avg:153.44ms
step:988/1480 train_time:150072ms step_avg:153.45ms
step:989/1480 train_time:150231ms step_avg:153.45ms
step:990/1480 train_time:150395ms step_avg:153.46ms
step:991/1480 train_time:150557ms step_avg:153.47ms
step:992/1480 train_time:150721ms step_avg:153.48ms
step:993/1480 train_time:150888ms step_avg:153.50ms
step:994/1480 train_time:151048ms step_avg:153.50ms
step:995/1480 train_time:151207ms step_avg:153.51ms
step:996/1480 train_time:151365ms step_avg:153.51ms
step:997/1480 train_time:151523ms step_avg:153.52ms
step:998/1480 train_time:151682ms step_avg:153.52ms
step:999/1480 train_time:151843ms step_avg:153.53ms
step:1000/1480 train_time:152005ms step_avg:153.54ms
step:1000/1480 val_loss:3.4423 train_time:152078ms step_avg:153.61ms
step:1001/1480 train_time:152173ms step_avg:153.55ms
step:1002/1480 train_time:152327ms step_avg:153.56ms
step:1003/1480 train_time:152489ms step_avg:153.56ms
step:1004/1480 train_time:152651ms step_avg:153.57ms
step:1005/1480 train_time:152811ms step_avg:153.58ms
step:1006/1480 train_time:152972ms step_avg:153.59ms
step:1007/1480 train_time:153131ms step_avg:153.59ms
step:1008/1480 train_time:153290ms step_avg:153.60ms
step:1009/1480 train_time:153457ms step_avg:153.61ms
step:1010/1480 train_time:153618ms step_avg:153.62ms
step:1011/1480 train_time:153777ms step_avg:153.62ms
step:1012/1480 train_time:153936ms step_avg:153.63ms
step:1013/1480 train_time:154096ms step_avg:153.63ms
step:1014/1480 train_time:154256ms step_avg:153.64ms
step:1015/1480 train_time:154420ms step_avg:153.65ms
step:1016/1480 train_time:154580ms step_avg:153.66ms
step:1017/1480 train_time:154742ms step_avg:153.67ms
step:1018/1480 train_time:154903ms step_avg:153.67ms
step:1019/1480 train_time:155065ms step_avg:153.68ms
step:1020/1480 train_time:155226ms step_avg:153.69ms
step:1021/1480 train_time:155385ms step_avg:153.69ms
step:1022/1480 train_time:155545ms step_avg:153.70ms
step:1023/1480 train_time:155705ms step_avg:153.71ms
step:1024/1480 train_time:155866ms step_avg:153.71ms
step:1025/1480 train_time:156029ms step_avg:153.72ms
step:1026/1480 train_time:156188ms step_avg:153.73ms
step:1027/1480 train_time:156346ms step_avg:153.73ms
step:1028/1480 train_time:156507ms step_avg:153.74ms
step:1029/1480 train_time:156670ms step_avg:153.75ms
step:1030/1480 train_time:156831ms step_avg:153.76ms
step:1031/1480 train_time:156990ms step_avg:153.76ms
step:1032/1480 train_time:157156ms step_avg:153.77ms
step:1033/1480 train_time:157316ms step_avg:153.78ms
step:1034/1480 train_time:157475ms step_avg:153.78ms
step:1035/1480 train_time:157635ms step_avg:153.79ms
step:1036/1480 train_time:157795ms step_avg:153.80ms
step:1037/1480 train_time:157955ms step_avg:153.80ms
step:1038/1480 train_time:158115ms step_avg:153.81ms
step:1039/1480 train_time:158276ms step_avg:153.82ms
step:1040/1480 train_time:158435ms step_avg:153.82ms
step:1041/1480 train_time:158596ms step_avg:153.83ms
step:1042/1480 train_time:158756ms step_avg:153.83ms
step:1043/1480 train_time:158915ms step_avg:153.84ms
step:1044/1480 train_time:159074ms step_avg:153.84ms
step:1045/1480 train_time:159234ms step_avg:153.85ms
step:1046/1480 train_time:159396ms step_avg:153.86ms
step:1047/1480 train_time:159556ms step_avg:153.86ms
step:1048/1480 train_time:159717ms step_avg:153.87ms
step:1049/1480 train_time:159878ms step_avg:153.88ms
step:1050/1480 train_time:160040ms step_avg:153.88ms
step:1051/1480 train_time:160201ms step_avg:153.89ms
step:1052/1480 train_time:160363ms step_avg:153.90ms
step:1053/1480 train_time:160524ms step_avg:153.91ms
step:1054/1480 train_time:160685ms step_avg:153.91ms
step:1055/1480 train_time:160845ms step_avg:153.92ms
step:1056/1480 train_time:161004ms step_avg:153.92ms
step:1057/1480 train_time:161165ms step_avg:153.93ms
step:1058/1480 train_time:161327ms step_avg:153.94ms
step:1059/1480 train_time:161489ms step_avg:153.95ms
step:1060/1480 train_time:161650ms step_avg:153.95ms
step:1061/1480 train_time:161808ms step_avg:153.96ms
step:1062/1480 train_time:161969ms step_avg:153.96ms
step:1063/1480 train_time:162127ms step_avg:153.97ms
step:1064/1480 train_time:162286ms step_avg:153.97ms
step:1065/1480 train_time:162447ms step_avg:153.98ms
step:1066/1480 train_time:162607ms step_avg:153.98ms
step:1067/1480 train_time:162769ms step_avg:153.99ms
step:1068/1480 train_time:162928ms step_avg:154.00ms
step:1069/1480 train_time:163092ms step_avg:154.01ms
step:1070/1480 train_time:163251ms step_avg:154.01ms
step:1071/1480 train_time:163415ms step_avg:154.02ms
step:1072/1480 train_time:163574ms step_avg:154.02ms
step:1073/1480 train_time:163733ms step_avg:154.03ms
step:1074/1480 train_time:163894ms step_avg:154.04ms
step:1075/1480 train_time:164056ms step_avg:154.04ms
step:1076/1480 train_time:164216ms step_avg:154.05ms
step:1077/1480 train_time:164375ms step_avg:154.05ms
step:1078/1480 train_time:164541ms step_avg:154.07ms
step:1079/1480 train_time:164706ms step_avg:154.07ms
step:1080/1480 train_time:164867ms step_avg:154.08ms
step:1081/1480 train_time:165026ms step_avg:154.09ms
step:1082/1480 train_time:165186ms step_avg:154.09ms
step:1083/1480 train_time:165346ms step_avg:154.10ms
step:1084/1480 train_time:165506ms step_avg:154.10ms
step:1085/1480 train_time:165668ms step_avg:154.11ms
step:1086/1480 train_time:165829ms step_avg:154.12ms
step:1087/1480 train_time:165988ms step_avg:154.12ms
step:1088/1480 train_time:166149ms step_avg:154.13ms
step:1089/1480 train_time:166313ms step_avg:154.14ms
step:1090/1480 train_time:166477ms step_avg:154.15ms
step:1091/1480 train_time:166638ms step_avg:154.15ms
step:1092/1480 train_time:166798ms step_avg:154.16ms
step:1093/1480 train_time:166959ms step_avg:154.16ms
step:1094/1480 train_time:167121ms step_avg:154.17ms
step:1095/1480 train_time:167281ms step_avg:154.18ms
step:1096/1480 train_time:167444ms step_avg:154.18ms
step:1097/1480 train_time:167607ms step_avg:154.19ms
step:1098/1480 train_time:167769ms step_avg:154.20ms
step:1099/1480 train_time:167931ms step_avg:154.21ms
step:1100/1480 train_time:168096ms step_avg:154.22ms
step:1101/1480 train_time:168260ms step_avg:154.23ms
step:1102/1480 train_time:168422ms step_avg:154.23ms
step:1103/1480 train_time:168586ms step_avg:154.24ms
step:1104/1480 train_time:168747ms step_avg:154.25ms
step:1105/1480 train_time:168909ms step_avg:154.25ms
step:1106/1480 train_time:169070ms step_avg:154.26ms
step:1107/1480 train_time:169231ms step_avg:154.27ms
step:1108/1480 train_time:169391ms step_avg:154.27ms
step:1109/1480 train_time:169551ms step_avg:154.28ms
step:1110/1480 train_time:169711ms step_avg:154.28ms
step:1111/1480 train_time:169871ms step_avg:154.29ms
step:1112/1480 train_time:170033ms step_avg:154.29ms
step:1113/1480 train_time:170199ms step_avg:154.31ms
step:1114/1480 train_time:170364ms step_avg:154.32ms
step:1115/1480 train_time:170525ms step_avg:154.32ms
step:1116/1480 train_time:170685ms step_avg:154.33ms
step:1117/1480 train_time:170848ms step_avg:154.33ms
step:1118/1480 train_time:171012ms step_avg:154.34ms
step:1119/1480 train_time:171173ms step_avg:154.35ms
step:1120/1480 train_time:171334ms step_avg:154.36ms
step:1121/1480 train_time:171496ms step_avg:154.36ms
step:1122/1480 train_time:171656ms step_avg:154.37ms
step:1123/1480 train_time:171816ms step_avg:154.37ms
step:1124/1480 train_time:171980ms step_avg:154.38ms
step:1125/1480 train_time:172143ms step_avg:154.39ms
step:1125/1480 val_loss:3.3873 train_time:172217ms step_avg:154.46ms
step:1126/1480 train_time:172312ms step_avg:154.40ms
step:1127/1480 train_time:172471ms step_avg:154.41ms
step:1128/1480 train_time:172631ms step_avg:154.41ms
step:1129/1480 train_time:172795ms step_avg:154.42ms
step:1130/1480 train_time:172956ms step_avg:154.43ms
step:1131/1480 train_time:173124ms step_avg:154.44ms
step:1132/1480 train_time:173284ms step_avg:154.44ms
step:1133/1480 train_time:173446ms step_avg:154.45ms
step:1134/1480 train_time:173608ms step_avg:154.46ms
step:1135/1480 train_time:173767ms step_avg:154.46ms
step:1136/1480 train_time:173931ms step_avg:154.47ms
step:1137/1480 train_time:174094ms step_avg:154.48ms
step:1138/1480 train_time:174260ms step_avg:154.49ms
step:1139/1480 train_time:174434ms step_avg:154.50ms
step:1140/1480 train_time:174582ms step_avg:154.50ms
step:1141/1480 train_time:174747ms step_avg:154.51ms
step:1142/1480 train_time:174908ms step_avg:154.51ms
step:1143/1480 train_time:175071ms step_avg:154.52ms
step:1144/1480 train_time:175232ms step_avg:154.53ms
step:1145/1480 train_time:175391ms step_avg:154.53ms
step:1146/1480 train_time:175555ms step_avg:154.54ms
step:1147/1480 train_time:175717ms step_avg:154.54ms
step:1148/1480 train_time:175880ms step_avg:154.55ms
step:1149/1480 train_time:176043ms step_avg:154.56ms
step:1150/1480 train_time:176203ms step_avg:154.56ms
step:1151/1480 train_time:176366ms step_avg:154.57ms
step:1152/1480 train_time:176529ms step_avg:154.58ms
step:1153/1480 train_time:176693ms step_avg:154.59ms
step:1154/1480 train_time:176854ms step_avg:154.59ms
step:1155/1480 train_time:177017ms step_avg:154.60ms
step:1156/1480 train_time:177183ms step_avg:154.61ms
step:1157/1480 train_time:177346ms step_avg:154.62ms
step:1158/1480 train_time:177506ms step_avg:154.62ms
step:1159/1480 train_time:177668ms step_avg:154.63ms
step:1160/1480 train_time:177827ms step_avg:154.63ms
step:1161/1480 train_time:177989ms step_avg:154.64ms
step:1162/1480 train_time:178151ms step_avg:154.65ms
step:1163/1480 train_time:178314ms step_avg:154.65ms
step:1164/1480 train_time:178477ms step_avg:154.66ms
step:1165/1480 train_time:178637ms step_avg:154.66ms
step:1166/1480 train_time:178799ms step_avg:154.67ms
step:1167/1480 train_time:178961ms step_avg:154.68ms
step:1168/1480 train_time:179123ms step_avg:154.68ms
step:1169/1480 train_time:179284ms step_avg:154.69ms
step:1170/1480 train_time:179445ms step_avg:154.69ms
step:1171/1480 train_time:179606ms step_avg:154.70ms
step:1172/1480 train_time:179768ms step_avg:154.71ms
step:1173/1480 train_time:179932ms step_avg:154.71ms
step:1174/1480 train_time:180104ms step_avg:154.73ms
step:1175/1480 train_time:180265ms step_avg:154.73ms
step:1176/1480 train_time:180428ms step_avg:154.74ms
step:1177/1480 train_time:180595ms step_avg:154.75ms
step:1178/1480 train_time:180758ms step_avg:154.76ms
step:1179/1480 train_time:180917ms step_avg:154.76ms
step:1180/1480 train_time:181086ms step_avg:154.77ms
step:1181/1480 train_time:181248ms step_avg:154.78ms
step:1182/1480 train_time:181408ms step_avg:154.78ms
step:1183/1480 train_time:181570ms step_avg:154.79ms
step:1184/1480 train_time:181732ms step_avg:154.80ms
step:1185/1480 train_time:181896ms step_avg:154.81ms
step:1186/1480 train_time:182061ms step_avg:154.81ms
step:1187/1480 train_time:182233ms step_avg:154.83ms
step:1188/1480 train_time:182393ms step_avg:154.83ms
step:1189/1480 train_time:182555ms step_avg:154.84ms
step:1190/1480 train_time:182718ms step_avg:154.85ms
step:1191/1480 train_time:182882ms step_avg:154.85ms
step:1192/1480 train_time:183043ms step_avg:154.86ms
step:1193/1480 train_time:183203ms step_avg:154.86ms
step:1194/1480 train_time:183364ms step_avg:154.87ms
step:1195/1480 train_time:183526ms step_avg:154.87ms
step:1196/1480 train_time:183698ms step_avg:154.89ms
step:1197/1480 train_time:183859ms step_avg:154.89ms
step:1198/1480 train_time:184026ms step_avg:154.90ms
step:1199/1480 train_time:184188ms step_avg:154.91ms
step:1200/1480 train_time:184350ms step_avg:154.92ms
step:1201/1480 train_time:184510ms step_avg:154.92ms
step:1202/1480 train_time:184679ms step_avg:154.93ms
step:1203/1480 train_time:184845ms step_avg:154.94ms
step:1204/1480 train_time:185009ms step_avg:154.95ms
step:1205/1480 train_time:185170ms step_avg:154.95ms
step:1206/1480 train_time:185331ms step_avg:154.96ms
step:1207/1480 train_time:185492ms step_avg:154.96ms
step:1208/1480 train_time:185653ms step_avg:154.97ms
step:1209/1480 train_time:185819ms step_avg:154.98ms
step:1210/1480 train_time:185984ms step_avg:154.99ms
step:1211/1480 train_time:186148ms step_avg:154.99ms
step:1212/1480 train_time:186311ms step_avg:155.00ms
step:1213/1480 train_time:186476ms step_avg:155.01ms
step:1214/1480 train_time:186643ms step_avg:155.02ms
step:1215/1480 train_time:186806ms step_avg:155.03ms
step:1216/1480 train_time:186966ms step_avg:155.03ms
step:1217/1480 train_time:187128ms step_avg:155.04ms
step:1218/1480 train_time:187290ms step_avg:155.04ms
step:1219/1480 train_time:187458ms step_avg:155.05ms
step:1220/1480 train_time:187621ms step_avg:155.06ms
step:1221/1480 train_time:187783ms step_avg:155.06ms
step:1222/1480 train_time:187943ms step_avg:155.07ms
step:1223/1480 train_time:188106ms step_avg:155.07ms
step:1224/1480 train_time:188271ms step_avg:155.08ms
step:1225/1480 train_time:188435ms step_avg:155.09ms
step:1226/1480 train_time:188601ms step_avg:155.10ms
step:1227/1480 train_time:188765ms step_avg:155.11ms
step:1228/1480 train_time:188927ms step_avg:155.11ms
step:1229/1480 train_time:189091ms step_avg:155.12ms
step:1230/1480 train_time:189262ms step_avg:155.13ms
step:1231/1480 train_time:189427ms step_avg:155.14ms
step:1232/1480 train_time:189594ms step_avg:155.15ms
step:1233/1480 train_time:189755ms step_avg:155.16ms
step:1234/1480 train_time:189918ms step_avg:155.16ms
step:1235/1480 train_time:190084ms step_avg:155.17ms
step:1236/1480 train_time:190245ms step_avg:155.17ms
step:1237/1480 train_time:190406ms step_avg:155.18ms
step:1238/1480 train_time:190578ms step_avg:155.19ms
step:1239/1480 train_time:190740ms step_avg:155.20ms
step:1240/1480 train_time:190906ms step_avg:155.21ms
step:1241/1480 train_time:191070ms step_avg:155.21ms
step:1242/1480 train_time:191230ms step_avg:155.22ms
step:1243/1480 train_time:191394ms step_avg:155.23ms
step:1244/1480 train_time:191556ms step_avg:155.23ms
step:1245/1480 train_time:191720ms step_avg:155.24ms
step:1246/1480 train_time:191882ms step_avg:155.24ms
step:1247/1480 train_time:192045ms step_avg:155.25ms
step:1248/1480 train_time:192206ms step_avg:155.26ms
step:1249/1480 train_time:192367ms step_avg:155.26ms
step:1250/1480 train_time:192527ms step_avg:155.26ms
step:1250/1480 val_loss:3.3368 train_time:192603ms step_avg:155.32ms
step:1251/1480 train_time:192697ms step_avg:155.28ms
step:1252/1480 train_time:192860ms step_avg:155.28ms
step:1253/1480 train_time:193019ms step_avg:155.29ms
step:1254/1480 train_time:193180ms step_avg:155.29ms
step:1255/1480 train_time:193351ms step_avg:155.30ms
step:1256/1480 train_time:193516ms step_avg:155.31ms
step:1257/1480 train_time:193678ms step_avg:155.31ms
step:1258/1480 train_time:193844ms step_avg:155.32ms
step:1259/1480 train_time:194007ms step_avg:155.33ms
step:1260/1480 train_time:194168ms step_avg:155.33ms
step:1261/1480 train_time:194331ms step_avg:155.34ms
step:1262/1480 train_time:194496ms step_avg:155.35ms
step:1263/1480 train_time:194660ms step_avg:155.36ms
step:1264/1480 train_time:194819ms step_avg:155.36ms
step:1265/1480 train_time:194979ms step_avg:155.36ms
step:1266/1480 train_time:195141ms step_avg:155.37ms
step:1267/1480 train_time:195301ms step_avg:155.37ms
step:1268/1480 train_time:195464ms step_avg:155.38ms
step:1269/1480 train_time:195631ms step_avg:155.39ms
step:1270/1480 train_time:195794ms step_avg:155.39ms
step:1271/1480 train_time:195957ms step_avg:155.40ms
step:1272/1480 train_time:196118ms step_avg:155.40ms
step:1273/1480 train_time:196282ms step_avg:155.41ms
step:1274/1480 train_time:196446ms step_avg:155.42ms
step:1275/1480 train_time:196608ms step_avg:155.42ms
step:1276/1480 train_time:196769ms step_avg:155.43ms
step:1277/1480 train_time:196933ms step_avg:155.43ms
step:1278/1480 train_time:197095ms step_avg:155.44ms
step:1279/1480 train_time:197256ms step_avg:155.44ms
step:1280/1480 train_time:197422ms step_avg:155.45ms
step:1281/1480 train_time:197583ms step_avg:155.45ms
step:1282/1480 train_time:197742ms step_avg:155.46ms
step:1283/1480 train_time:197905ms step_avg:155.46ms
step:1284/1480 train_time:198069ms step_avg:155.47ms
step:1285/1480 train_time:198232ms step_avg:155.48ms
step:1286/1480 train_time:198393ms step_avg:155.48ms
step:1287/1480 train_time:198556ms step_avg:155.49ms
step:1288/1480 train_time:198717ms step_avg:155.49ms
step:1289/1480 train_time:198885ms step_avg:155.50ms
step:1290/1480 train_time:199056ms step_avg:155.51ms
step:1291/1480 train_time:199219ms step_avg:155.52ms
step:1292/1480 train_time:199382ms step_avg:155.52ms
step:1293/1480 train_time:199549ms step_avg:155.53ms
step:1294/1480 train_time:199714ms step_avg:155.54ms
step:1295/1480 train_time:199878ms step_avg:155.55ms
step:1296/1480 train_time:200040ms step_avg:155.55ms
step:1297/1480 train_time:200204ms step_avg:155.56ms
step:1298/1480 train_time:200368ms step_avg:155.57ms
step:1299/1480 train_time:200531ms step_avg:155.57ms
step:1300/1480 train_time:200692ms step_avg:155.58ms
step:1301/1480 train_time:200854ms step_avg:155.58ms
step:1302/1480 train_time:201017ms step_avg:155.59ms
step:1303/1480 train_time:201183ms step_avg:155.59ms
step:1304/1480 train_time:201349ms step_avg:155.60ms
step:1305/1480 train_time:201512ms step_avg:155.61ms
step:1306/1480 train_time:201676ms step_avg:155.61ms
step:1307/1480 train_time:201837ms step_avg:155.62ms
step:1308/1480 train_time:201999ms step_avg:155.62ms
step:1309/1480 train_time:202165ms step_avg:155.63ms
step:1310/1480 train_time:202326ms step_avg:155.64ms
step:1311/1480 train_time:202487ms step_avg:155.64ms
step:1312/1480 train_time:202654ms step_avg:155.65ms
step:1313/1480 train_time:202817ms step_avg:155.65ms
step:1314/1480 train_time:202981ms step_avg:155.66ms
step:1315/1480 train_time:203143ms step_avg:155.67ms
step:1316/1480 train_time:203303ms step_avg:155.67ms
step:1317/1480 train_time:203465ms step_avg:155.67ms
step:1318/1480 train_time:203632ms step_avg:155.68ms
step:1319/1480 train_time:203799ms step_avg:155.69ms
step:1320/1480 train_time:203967ms step_avg:155.70ms
step:1321/1480 train_time:204132ms step_avg:155.71ms
step:1322/1480 train_time:204303ms step_avg:155.72ms
step:1323/1480 train_time:204468ms step_avg:155.73ms
step:1324/1480 train_time:204632ms step_avg:155.73ms
step:1325/1480 train_time:204800ms step_avg:155.74ms
step:1326/1480 train_time:204965ms step_avg:155.75ms
step:1327/1480 train_time:205128ms step_avg:155.75ms
step:1328/1480 train_time:205290ms step_avg:155.76ms
step:1329/1480 train_time:205483ms step_avg:155.79ms
step:1330/1480 train_time:205641ms step_avg:155.79ms
step:1331/1480 train_time:205802ms step_avg:155.79ms
step:1332/1480 train_time:205966ms step_avg:155.80ms
step:1333/1480 train_time:206132ms step_avg:155.81ms
step:1334/1480 train_time:206295ms step_avg:155.81ms
step:1335/1480 train_time:206456ms step_avg:155.82ms
step:1336/1480 train_time:206624ms step_avg:155.82ms
step:1337/1480 train_time:206792ms step_avg:155.83ms
step:1338/1480 train_time:206956ms step_avg:155.84ms
step:1339/1480 train_time:207119ms step_avg:155.85ms
step:1340/1480 train_time:207283ms step_avg:155.85ms
step:1341/1480 train_time:207445ms step_avg:155.86ms
step:1342/1480 train_time:207611ms step_avg:155.86ms
step:1343/1480 train_time:207774ms step_avg:155.87ms
step:1344/1480 train_time:207936ms step_avg:155.87ms
step:1345/1480 train_time:208105ms step_avg:155.88ms
step:1346/1480 train_time:208266ms step_avg:155.89ms
step:1347/1480 train_time:208430ms step_avg:155.89ms
step:1348/1480 train_time:208592ms step_avg:155.90ms
step:1349/1480 train_time:208754ms step_avg:155.90ms
step:1350/1480 train_time:208920ms step_avg:155.91ms
step:1351/1480 train_time:209083ms step_avg:155.92ms
step:1352/1480 train_time:209245ms step_avg:155.92ms
step:1353/1480 train_time:209413ms step_avg:155.93ms
step:1354/1480 train_time:209576ms step_avg:155.93ms
step:1355/1480 train_time:209739ms step_avg:155.94ms
step:1356/1480 train_time:209903ms step_avg:155.95ms
step:1357/1480 train_time:210069ms step_avg:155.95ms
step:1358/1480 train_time:210233ms step_avg:155.96ms
step:1359/1480 train_time:210396ms step_avg:155.96ms
step:1360/1480 train_time:210561ms step_avg:155.97ms
step:1361/1480 train_time:210726ms step_avg:155.98ms
step:1362/1480 train_time:210892ms step_avg:155.99ms
step:1363/1480 train_time:211060ms step_avg:155.99ms
step:1364/1480 train_time:211221ms step_avg:156.00ms
step:1365/1480 train_time:211382ms step_avg:156.00ms
step:1366/1480 train_time:211546ms step_avg:156.01ms
step:1367/1480 train_time:211710ms step_avg:156.01ms
step:1368/1480 train_time:211876ms step_avg:156.02ms
step:1369/1480 train_time:212044ms step_avg:156.03ms
step:1370/1480 train_time:212211ms step_avg:156.04ms
step:1371/1480 train_time:212375ms step_avg:156.04ms
step:1372/1480 train_time:212541ms step_avg:156.05ms
step:1373/1480 train_time:212702ms step_avg:156.05ms
step:1374/1480 train_time:212870ms step_avg:156.06ms
step:1375/1480 train_time:213033ms step_avg:156.07ms
step:1375/1480 val_loss:3.2984 train_time:213108ms step_avg:156.12ms
step:1376/1480 train_time:213202ms step_avg:156.08ms
step:1377/1480 train_time:213366ms step_avg:156.08ms
step:1378/1480 train_time:213528ms step_avg:156.09ms
step:1379/1480 train_time:213692ms step_avg:156.09ms
step:1380/1480 train_time:213856ms step_avg:156.10ms
step:1381/1480 train_time:214026ms step_avg:156.11ms
step:1382/1480 train_time:214190ms step_avg:156.12ms
step:1383/1480 train_time:214353ms step_avg:156.12ms
step:1384/1480 train_time:214520ms step_avg:156.13ms
step:1385/1480 train_time:214680ms step_avg:156.13ms
step:1386/1480 train_time:214844ms step_avg:156.14ms
step:1387/1480 train_time:215010ms step_avg:156.14ms
step:1388/1480 train_time:215170ms step_avg:156.15ms
step:1389/1480 train_time:215336ms step_avg:156.15ms
step:1390/1480 train_time:215498ms step_avg:156.16ms
step:1391/1480 train_time:215659ms step_avg:156.16ms
step:1392/1480 train_time:215823ms step_avg:156.17ms
step:1393/1480 train_time:215986ms step_avg:156.17ms
step:1394/1480 train_time:216151ms step_avg:156.18ms
step:1395/1480 train_time:216312ms step_avg:156.18ms
step:1396/1480 train_time:216474ms step_avg:156.19ms
step:1397/1480 train_time:216634ms step_avg:156.19ms
step:1398/1480 train_time:216795ms step_avg:156.19ms
step:1399/1480 train_time:216956ms step_avg:156.20ms
step:1400/1480 train_time:217126ms step_avg:156.21ms
step:1401/1480 train_time:217287ms step_avg:156.21ms
step:1402/1480 train_time:217449ms step_avg:156.21ms
step:1403/1480 train_time:217615ms step_avg:156.22ms
step:1404/1480 train_time:217778ms step_avg:156.23ms
step:1405/1480 train_time:217947ms step_avg:156.23ms
step:1406/1480 train_time:218111ms step_avg:156.24ms
step:1407/1480 train_time:218273ms step_avg:156.24ms
step:1408/1480 train_time:218434ms step_avg:156.25ms
step:1409/1480 train_time:218607ms step_avg:156.26ms
step:1410/1480 train_time:218769ms step_avg:156.26ms
step:1411/1480 train_time:218930ms step_avg:156.27ms
step:1412/1480 train_time:219092ms step_avg:156.27ms
step:1413/1480 train_time:219255ms step_avg:156.28ms
step:1414/1480 train_time:219419ms step_avg:156.28ms
step:1415/1480 train_time:219584ms step_avg:156.29ms
step:1416/1480 train_time:219758ms step_avg:156.30ms
step:1417/1480 train_time:219925ms step_avg:156.31ms
step:1418/1480 train_time:220088ms step_avg:156.31ms
step:1419/1480 train_time:220252ms step_avg:156.32ms
step:1420/1480 train_time:220417ms step_avg:156.32ms
step:1421/1480 train_time:220582ms step_avg:156.33ms
step:1422/1480 train_time:220747ms step_avg:156.34ms
step:1423/1480 train_time:220909ms step_avg:156.34ms
step:1424/1480 train_time:221074ms step_avg:156.35ms
step:1425/1480 train_time:221245ms step_avg:156.36ms
step:1426/1480 train_time:221409ms step_avg:156.36ms
step:1427/1480 train_time:221574ms step_avg:156.37ms
step:1428/1480 train_time:221735ms step_avg:156.37ms
step:1429/1480 train_time:221897ms step_avg:156.38ms
step:1430/1480 train_time:222062ms step_avg:156.38ms
step:1431/1480 train_time:222228ms step_avg:156.39ms
step:1432/1480 train_time:222397ms step_avg:156.40ms
step:1433/1480 train_time:222565ms step_avg:156.41ms
step:1434/1480 train_time:222734ms step_avg:156.41ms
step:1435/1480 train_time:222899ms step_avg:156.42ms
step:1436/1480 train_time:223064ms step_avg:156.43ms
step:1437/1480 train_time:223227ms step_avg:156.43ms
step:1438/1480 train_time:223389ms step_avg:156.43ms
step:1439/1480 train_time:223555ms step_avg:156.44ms
step:1440/1480 train_time:223718ms step_avg:156.45ms
step:1441/1480 train_time:223882ms step_avg:156.45ms
step:1442/1480 train_time:224048ms step_avg:156.46ms
step:1443/1480 train_time:224222ms step_avg:156.47ms
step:1444/1480 train_time:224387ms step_avg:156.48ms
step:1445/1480 train_time:224549ms step_avg:156.48ms
step:1446/1480 train_time:224715ms step_avg:156.49ms
step:1447/1480 train_time:224884ms step_avg:156.50ms
step:1448/1480 train_time:225047ms step_avg:156.50ms
step:1449/1480 train_time:225209ms step_avg:156.50ms
step:1450/1480 train_time:225373ms step_avg:156.51ms
step:1451/1480 train_time:225537ms step_avg:156.51ms
step:1452/1480 train_time:225703ms step_avg:156.52ms
step:1453/1480 train_time:225866ms step_avg:156.53ms
step:1454/1480 train_time:226029ms step_avg:156.53ms
step:1455/1480 train_time:226196ms step_avg:156.54ms
step:1456/1480 train_time:226361ms step_avg:156.54ms
step:1457/1480 train_time:226524ms step_avg:156.55ms
step:1458/1480 train_time:226688ms step_avg:156.55ms
step:1459/1480 train_time:226853ms step_avg:156.56ms
step:1460/1480 train_time:227017ms step_avg:156.56ms
step:1461/1480 train_time:227182ms step_avg:156.57ms
step:1462/1480 train_time:227348ms step_avg:156.58ms
step:1463/1480 train_time:227512ms step_avg:156.58ms
step:1464/1480 train_time:227678ms step_avg:156.59ms
step:1465/1480 train_time:227842ms step_avg:156.59ms
step:1466/1480 train_time:228006ms step_avg:156.60ms
step:1467/1480 train_time:228170ms step_avg:156.60ms
step:1468/1480 train_time:228333ms step_avg:156.61ms
step:1469/1480 train_time:228496ms step_avg:156.61ms
step:1470/1480 train_time:228664ms step_avg:156.62ms
step:1471/1480 train_time:228834ms step_avg:156.63ms
step:1472/1480 train_time:229004ms step_avg:156.64ms
step:1473/1480 train_time:229168ms step_avg:156.64ms
step:1474/1480 train_time:229335ms step_avg:156.65ms
step:1475/1480 train_time:229505ms step_avg:156.66ms
step:1476/1480 train_time:229668ms step_avg:156.66ms
step:1477/1480 train_time:229834ms step_avg:156.67ms
step:1478/1480 train_time:230005ms step_avg:156.68ms
step:1479/1480 train_time:230169ms step_avg:156.68ms
step:1480/1480 train_time:230332ms step_avg:156.69ms
step:1480/1480 val_loss:3.2793 train_time:230409ms step_avg:156.74ms
peak memory consumption: 34239 MiB