{
  "run_name": "gpu2_b2048_edge_finetune",
  "training_seconds": 561.47,
  "best_step": 500,
  "best_eval_exact_accuracy": 0.993,
  "final_exact_accuracy": 0.9929,
  "final_exact_correct": 19858,
  "final_exact_total": 20000,
  "final_exact_by_operation": {
    "+": {
      "correct": 4022,
      "total": 4022,
      "accuracy": 1.0
    },
    "-": {
      "correct": 3965,
      "total": 3965,
      "accuracy": 1.0
    },
    "*": {
      "correct": 4084,
      "total": 4084,
      "accuracy": 1.0
    },
    "//": {
      "correct": 4007,
      "total": 4028,
      "accuracy": 0.9947864945382324
    },
    "%": {
      "correct": 3780,
      "total": 3901,
      "accuracy": 0.968982312227634
    }
  },
  "model_path": "runs/gpu2_b2048_edge_finetune/arithmetic_transformer_params.msgpack",
  "best_model_path": "runs/gpu2_b2048_edge_finetune/best_params.msgpack",
  "runtime": {
    "timestamp": "2026-05-22 20:47:17 KST",
    "python": "3.13.13",
    "platform": "Linux-5.4.0-216-generic-x86_64-with-glibc2.31",
    "jax": "0.10.1",
    "backend": "gpu",
    "devices": [
      "cuda:0"
    ],
    "cuda_visible_devices": "2",
    "nvidia_smi": "Fri May 22 20:47:17 2026       \n+-----------------------------------------------------------------------------------------+\n| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |\n|-----------------------------------------+------------------------+----------------------+\n| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n|                                         |                        |               MIG M. |\n|=========================================+========================+======================|\n|   0  NVIDIA GeForce RTX 3090        Off |   00000000:1B:00.0 Off |                  N/A |\n| 54%   32C    P8             25W /  350W |   22932MiB /  24576MiB |      0%      Default |\n|                                         |                        |                  N/A |\n+-----------------------------------------+------------------------+----------------------+\n|   1  NVIDIA GeForce RTX 3090        Off |   00000000:1C:00.0 Off |                  N/A |\n| 43%   30C    P8             25W /  350W |   22430MiB /  24576MiB |      0%      Default |\n|                                         |                        |                  N/A |\n+-----------------------------------------+------------------------+----------------------+\n|   2  NVIDIA GeForce RTX 3090        Off |   00000000:1D:00.0 Off |                  N/A |\n| 54%   37C    P2             38W /  350W |     272MiB /  24576MiB |      0%      Default |\n|                                         |                        |                  N/A |\n+-----------------------------------------+------------------------+----------------------+\n|   3  NVIDIA GeForce RTX 3090        Off |   00000000:1E:00.0 Off |                  N/A |\n| 53%   33C    P8             26W /  350W |       1MiB /  24576MiB |      0%      Default |\n|                                         |                        |                  N/A |\n+-----------------------------------------+------------------------+----------------------+\n|   4  NVIDIA GeForce RTX 3090        Off |   00000000:3D:00.0 Off |                  N/A |\n| 51%   30C    P8             27W /  350W |       1MiB /  24576MiB |      0%      Default |\n|                                         |                        |                  N/A |\n+-----------------------------------------+------------------------+----------------------+\n|   5  NVIDIA GeForce RTX 3090        Off |   00000000:3F:00.0 Off |                  N/A |\n| 53%   32C    P8             23W /  350W |       1MiB /  24576MiB |      0%      Default |\n|                                         |                        |                  N/A |\n+-----------------------------------------+------------------------+----------------------+\n|   6  NVIDIA GeForce RTX 3090        Off |   00000000:40:00.0 Off |                  N/A |\n| 53%   31C    P8             26W /  350W |       1MiB /  24576MiB |      0%      Default |\n|                                         |                        |                  N/A |\n+-----------------------------------------+------------------------+----------------------+\n|   7  NVIDIA GeForce RTX 3090        Off |   00000000:41:00.0 Off |                  N/A |\n| 52%   31C    P8             24W /  350W |       1MiB /  24576MiB |      0%      Default |\n|                                         |                        |                  N/A |\n+-----------------------------------------+------------------------+----------------------+\n                                                                                         \n+-----------------------------------------------------------------------------------------+\n| Processes:                                                                              |\n|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |\n|        ID   ID                                                               Usage      |\n|=========================================================================================|\n|    0   N/A  N/A           16627      C   VLLM::EngineCore                      22922MiB |\n|    1   N/A  N/A           16558      C   VLLM::EngineCore                      22420MiB |\n|    2   N/A  N/A         1956597      C   .miniforge/bin/python                   262MiB |\n+-----------------------------------------------------------------------------------------+"
  },
  "dataset": {
    "operations": [
      "+",
      "-",
      "*",
      "//",
      "%"
    ],
    "total_examples": 4998000,
    "unique_valid_examples": 4998000,
    "train_examples": 4898000,
    "validation_examples": 100000,
    "train_examples_by_operation": {
      "+": 980141,
      "-": 980000,
      "*": 979815,
      "//": 978986,
      "%": 979058
    },
    "validation_examples_by_operation": {
      "+": 19859,
      "-": 20000,
      "*": 20185,
      "//": 20014,
      "%": 19942
    },
    "sequence_length": 28,
    "prompt_length": 9,
    "answer_length": 19,
    "vocab_size": 17,
    "vocab": "0123456789 +-*%/=",
    "dataset_build_seconds": 57.91,
    "dataset_memory_mb": 559.78,
    "division_modulo_zero_policy": "b=0 is omitted for // and %"
  },
  "model": {
    "vocab_size": 17,
    "max_seq_len": 28,
    "d_model": 384,
    "n_heads": 6,
    "n_layers": 6,
    "mlp_dim": 1536,
    "parameter_count": 10650624,
    "parameter_count_millions": 10.651
  },
  "hyperparameters": {
    "run_name": "gpu2_b2048_edge_finetune",
    "output_dir": "runs",
    "batch_size": 2048,
    "eval_batch_size": 256,
    "max_steps": 1500,
    "min_steps": 500,
    "eval_every": 250,
    "eval_exact_n": 1000,
    "final_exact_n": 20000,
    "target_exact_accuracy": 0.995,
    "learning_rate": 0.0002,
    "end_lr_ratio": 0.1,
    "weight_decay": 0.0001,
    "warmup_steps": 50,
    "grad_clip": 1.0,
    "multiplication_focus_steps": 2500,
    "multiplication_focus_prob": 0.7,
    "op_probs": [
      0.05,
      0.1,
      0.2,
      0.3,
      0.35
    ],
    "seed": 1234,
    "init_seed": 0,
    "d_model": 384,
    "n_heads": 6,
    "n_layers": 6,
    "mlp_dim": 1536,
    "val_size": 100000,
    "save_every_eval": false,
    "require_gpu": true,
    "init_params": "runs/gpu3_b2048_divmod5000/arithmetic_transformer_params.msgpack",
    "edge_case_prob": 0.25
  },
  "sample_predictions": [
    {
      "problem": "0+0",
      "model_text": "  0+   0=+000000000000000000",
      "prediction": "0",
      "expected": "0",
      "correct": true
    },
    {
      "problem": "7+35",
      "model_text": "  7+  35=+200400000000000000",
      "prediction": "42",
      "expected": "42",
      "correct": true
    },
    {
      "problem": "123-45",
      "model_text": "123-  45=+800700000000000000",
      "prediction": "78",
      "expected": "78",
      "correct": true
    },
    {
      "problem": "45-123",
      "model_text": " 45- 123=-800700000000000000",
      "prediction": "-78",
      "expected": "-78",
      "correct": true
    },
    {
      "problem": "12*89",
      "model_text": " 12*  89=+801602001100000000",
      "prediction": "1068",
      "expected": "1068",
      "correct": true
    },
    {
      "problem": "123*45",
      "model_text": "123*  45=+501302501500000000",
      "prediction": "5535",
      "expected": "5535",
      "correct": true
    },
    {
      "problem": "5//4",
      "model_text": "  5//  4=+100000000000000000",
      "prediction": "1",
      "expected": "1",
      "correct": true
    },
    {
      "problem": "999//1",
      "model_text": "999//  1=+900900900000000000",
      "prediction": "999",
      "expected": "999",
      "correct": true
    },
    {
      "problem": "5%4",
      "model_text": "  5%   4=+100000000000000000",
      "prediction": "1",
      "expected": "1",
      "correct": true
    },
    {
      "problem": "998%999",
      "model_text": "998% 999=+800900900000000000",
      "prediction": "998",
      "expected": "998",
      "correct": true
    },
    {
      "problem": "501*499",
      "model_text": "501* 499=+900900904904402200",
      "prediction": "249999",
      "expected": "249999",
      "correct": true
    },
    {
      "problem": "999*999",
      "model_text": "999* 999=+108017026818909900",
      "prediction": "998001",
      "expected": "998001",
      "correct": true
    }
  ],
  "history": [
    {
      "event": "eval",
      "step": 1,
      "elapsed_seconds": 277.63,
      "train_loss": 0.002636338584125042,
      "train_token_accuracy": 0.9991262555122375,
      "val_loss": 0.001127233263105154,
      "val_token_accuracy": 0.9994603395462036,
      "exact_accuracy": 0.99,
      "exact_correct": 990,
      "exact_total": 1000,
      "exact_by_operation": {
        "+": {
          "correct": 194,
          "total": 194,
          "accuracy": 1.0
        },
        "-": {
          "correct": 198,
          "total": 198,
          "accuracy": 1.0
        },
        "*": {
          "correct": 205,
          "total": 205,
          "accuracy": 1.0
        },
        "//": {
          "correct": 198,
          "total": 200,
          "accuracy": 0.99
        },
        "%": {
          "correct": 195,
          "total": 203,
          "accuracy": 0.9605911330049262
        }
      },
      "learning_rate": 4.0000013541430235e-06,
      "op_sampling": {
        "+": 0.05,
        "-": 0.1,
        "*": 0.2,
        "//": 0.3,
        "%": 0.35
      }
    },
    {
      "event": "eval",
      "step": 250,
      "elapsed_seconds": 324.4,
      "train_loss": 0.004352588206529617,
      "train_token_accuracy": 0.9986379742622375,
      "val_loss": 0.002333001233637333,
      "val_token_accuracy": 0.9990748167037964,
      "exact_accuracy": 0.983,
      "exact_correct": 983,
      "exact_total": 1000,
      "exact_by_operation": {
        "+": {
          "correct": 174,
          "total": 174,
          "accuracy": 1.0
        },
        "-": {
          "correct": 197,
          "total": 197,
          "accuracy": 1.0
        },
        "*": {
          "correct": 181,
          "total": 183,
          "accuracy": 0.9890710382513661
        },
        "//": {
          "correct": 222,
          "total": 226,
          "accuracy": 0.9823008849557522
        },
        "%": {
          "correct": 209,
          "total": 220,
          "accuracy": 0.95
        }
      },
      "learning_rate": 0.00019168177095707506,
      "op_sampling": {
        "+": 0.05,
        "-": 0.1,
        "*": 0.2,
        "//": 0.3,
        "%": 0.35
      }
    },
    {
      "event": "eval",
      "step": 500,
      "elapsed_seconds": 371.57,
      "train_loss": 0.0026333255227655172,
      "train_token_accuracy": 0.9988692402839661,
      "val_loss": 0.0016257410170510411,
      "val_token_accuracy": 0.9993318319320679,
      "exact_accuracy": 0.993,
      "exact_correct": 993,
      "exact_total": 1000,
      "exact_by_operation": {
        "+": {
          "correct": 203,
          "total": 203,
          "accuracy": 1.0
        },
        "-": {
          "correct": 210,
          "total": 210,
          "accuracy": 1.0
        },
        "*": {
          "correct": 205,
          "total": 205,
          "accuracy": 1.0
        },
        "//": {
          "correct": 188,
          "total": 191,
          "accuracy": 0.9842931937172775
        },
        "%": {
          "correct": 187,
          "total": 191,
          "accuracy": 0.9790575916230366
        }
      },
      "learning_rate": 0.00016050682461354882,
      "op_sampling": {
        "+": 0.05,
        "-": 0.1,
        "*": 0.2,
        "//": 0.3,
        "%": 0.35
      }
    },
    {
      "event": "eval",
      "step": 750,
      "elapsed_seconds": 419.21,
      "train_loss": 0.0022381972521543503,
      "train_token_accuracy": 0.9991776347160339,
      "val_loss": 0.0018789999885484576,
      "val_token_accuracy": 0.9992290139198303,
      "exact_accuracy": 0.99,
      "exact_correct": 990,
      "exact_total": 1000,
      "exact_by_operation": {
        "+": {
          "correct": 202,
          "total": 202,
          "accuracy": 1.0
        },
        "-": {
          "correct": 198,
          "total": 198,
          "accuracy": 1.0
        },
        "*": {
          "correct": 193,
          "total": 193,
          "accuracy": 1.0
        },
        "//": {
          "correct": 206,
          "total": 209,
          "accuracy": 0.9856459330143541
        },
        "%": {
          "correct": 191,
          "total": 198,
          "accuracy": 0.9646464646464646
        }
      },
      "learning_rate": 0.00011487249139463529,
      "op_sampling": {
        "+": 0.05,
        "-": 0.1,
        "*": 0.2,
        "//": 0.3,
        "%": 0.35
      }
    },
    {
      "event": "eval",
      "step": 1000,
      "elapsed_seconds": 466.53,
      "train_loss": 0.0014942348934710026,
      "train_token_accuracy": 0.9993061423301697,
      "val_loss": 0.0012976628495380282,
      "val_token_accuracy": 0.9994603395462036,
      "exact_accuracy": 0.993,
      "exact_correct": 993,
      "exact_total": 1000,
      "exact_by_operation": {
        "+": {
          "correct": 214,
          "total": 214,
          "accuracy": 1.0
        },
        "-": {
          "correct": 195,
          "total": 195,
          "accuracy": 1.0
        },
        "*": {
          "correct": 176,
          "total": 176,
          "accuracy": 1.0
        },
        "//": {
          "correct": 209,
          "total": 210,
          "accuracy": 0.9952380952380953
        },
        "%": {
          "correct": 199,
          "total": 205,
          "accuracy": 0.9707317073170731
        }
      },
      "learning_rate": 6.784322613384575e-05,
      "op_sampling": {
        "+": 0.05,
        "-": 0.1,
        "*": 0.2,
        "//": 0.3,
        "%": 0.35
      }
    },
    {
      "event": "eval",
      "step": 1250,
      "elapsed_seconds": 513.89,
      "train_loss": 0.0016579929506406188,
      "train_token_accuracy": 0.9992804527282715,
      "val_loss": 0.0017736267764121294,
      "val_token_accuracy": 0.9992547631263733,
      "exact_accuracy": 0.989,
      "exact_correct": 989,
      "exact_total": 1000,
      "exact_by_operation": {
        "+": {
          "correct": 199,
          "total": 199,
          "accuracy": 1.0
        },
        "-": {
          "correct": 210,
          "total": 210,
          "accuracy": 1.0
        },
        "*": {
          "correct": 194,
          "total": 194,
          "accuracy": 1.0
        },
        "//": {
          "correct": 197,
          "total": 198,
          "accuracy": 0.9949494949494949
        },
        "%": {
          "correct": 189,
          "total": 199,
          "accuracy": 0.949748743718593
        }
      },
      "learning_rate": 3.288284642621875e-05,
      "op_sampling": {
        "+": 0.05,
        "-": 0.1,
        "*": 0.2,
        "//": 0.3,
        "%": 0.35
      }
    },
    {
      "event": "eval",
      "step": 1500,
      "elapsed_seconds": 561.47,
      "train_loss": 0.0014991366770118475,
      "train_token_accuracy": 0.9993318319320679,
      "val_loss": 0.0011124522425234318,
      "val_token_accuracy": 0.9995630979537964,
      "exact_accuracy": 0.993,
      "exact_correct": 993,
      "exact_total": 1000,
      "exact_by_operation": {
        "+": {
          "correct": 191,
          "total": 191,
          "accuracy": 1.0
        },
        "-": {
          "correct": 208,
          "total": 209,
          "accuracy": 0.9952153110047847
        },
        "*": {
          "correct": 192,
          "total": 192,
          "accuracy": 1.0
        },
        "//": {
          "correct": 222,
          "total": 224,
          "accuracy": 0.9910714285714286
        },
        "%": {
          "correct": 180,
          "total": 184,
          "accuracy": 0.9782608695652174
        }
      },
      "learning_rate": 1.9999999494757503e-05,
      "op_sampling": {
        "+": 0.05,
        "-": 0.1,
        "*": 0.2,
        "//": 0.3,
        "%": 0.35
      }
    }
  ]
}
