shank committed on
Commit Β·
b92ad01
1
Parent(s): 85f14d3
Add Gradio training monitor and fix subprocess python path
Browse files- app.py: Gradio Space that streams GRPO training log live
- requirements.txt: training deps (unsloth, trl, wandb, gradio, etc.)
- README.md: sdk switched to gradio for training Space
- calibrate.py: fix python → sys.executable for portability
- train_grpo.py: fix python subprocess + add shutil.which fallback
Made-with: Cursor
- README.md +6 -6
- app.py +106 -0
- calibrate.py +2 -1
- requirements.txt +9 -10
- training/train_grpo.py +129 -42
README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
---
|
| 2 |
-
title: AgentDebugger-
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
pinned: true
|
| 9 |
license: mit
|
| 10 |
---
|
|
|
|
| 1 |
---
|
| 2 |
+
title: AgentDebugger-Training π§
|
| 3 |
+
emoji: π§
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
app_file: app.py
|
| 8 |
pinned: true
|
| 9 |
license: mit
|
| 10 |
---
|
app.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AgentDebuggerEnv — Training Monitor
|
| 3 |
+
Gradio UI that boots GRPO training in a background process and streams live status.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import subprocess
|
| 7 |
+
import threading
|
| 8 |
+
import gradio as gr
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
+
import sys
|
| 12 |
+
import time
|
| 13 |
+
|
| 14 |
+
# ββ Start training in background βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 15 |
+
training_log: list[str] = []
|
| 16 |
+
training_proc: subprocess.Popen | None = None
|
| 17 |
+
training_started_at: float = time.time()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _stream_training() -> None:
    """Launch GRPO training as a subprocess and stream its output live.

    Runs ``training/train_grpo.py`` with the same interpreter as this app
    (``sys.executable``) and appends each output line to the shared
    ``training_log`` buffer, capped at 300 lines, so ``check_status()``
    can render a live tail in the UI.  Intended to run in a daemon thread.
    """
    global training_proc
    script = os.path.join(os.path.dirname(__file__), "training", "train_grpo.py")
    try:
        training_proc = subprocess.Popen(
            [sys.executable, script],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # interleave stderr into the same stream
            text=True,
            bufsize=1,  # line-buffered: lines arrive as the trainer prints them
        )
    except OSError as exc:
        # Surface launch failures in the UI log instead of dying silently
        # inside the background thread.
        training_log.append(f"FAILED to start training subprocess: {exc}")
        return

    assert training_proc.stdout is not None  # guaranteed by stdout=PIPE
    for raw_line in training_proc.stdout:
        training_log.append(raw_line.rstrip())
        if len(training_log) > 300:
            # Trim in one slice-delete instead of repeated O(n) pop(0) calls.
            del training_log[: len(training_log) - 300]
    training_proc.wait()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# Start streaming immediately at import time; daemon=True lets the Space
# process exit without waiting on the log-reader thread.
training_thread = threading.Thread(target=_stream_training, daemon=True)
training_thread.start()
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ββ Status checker βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
+
def check_status() -> str:
    """Build a human-readable status report for the training run.

    Combines elapsed time, subprocess state, baseline metrics (if the
    trainer has written ``baseline_results.json``), the checkpoint
    inventory, and the last 40 lines of the live training log into one
    text blob for the Gradio status box.

    Returns:
        The multi-line status report as a single string.
    """
    import re  # local import: only needed for checkpoint-step parsing

    lines: list[str] = []
    elapsed = int(time.time() - training_started_at)
    lines.append(f"Elapsed: {elapsed // 60}m {elapsed % 60}s")

    if training_proc is None:
        lines.append("Status: starting up (give it ~2 minutes)...")
    else:
        code = training_proc.poll()
        if code is None:
            lines.append("Status: TRAINING RUNNING")
        else:
            lines.append(f"Status: {'COMPLETED' if code == 0 else f'EXITED (code {code})'}")

    if os.path.exists("baseline_results.json"):
        try:
            with open("baseline_results.json") as f:
                baseline = json.load(f)
            lines.append(f"\nBaseline solve rate : {baseline['solve_rate']:.1%}")
            lines.append(f"Baseline avg reward : {baseline['avg_reward']:.3f}")
        except Exception:
            # Best-effort: the file may be mid-write or malformed; skip metrics.
            pass

    if os.path.exists("checkpoints"):
        def _step(name: str) -> tuple[int, str]:
            # "checkpoint-90" must sort before "checkpoint-100": plain
            # lexicographic sort reports the wrong "latest" checkpoint, so
            # key on the trailing step number when present.
            m = re.search(r"(\d+)$", name)
            return (int(m.group(1)) if m else -1, name)

        ckpts = sorted(
            (d for d in os.listdir("checkpoints") if os.path.isdir(f"checkpoints/{d}")),
            key=_step,
        )
        if ckpts:
            lines.append(f"\nLatest checkpoint : {ckpts[-1]}")
            lines.append(f"Total checkpoints : {len(ckpts)}")

    if os.path.exists("final_model"):
        lines.append("\nFinal model saved — training complete!")

    lines.append("\n" + "─" * 50)
    lines.append("Recent log (last 40 lines):")
    lines.extend(training_log[-40:] if training_log else ["(no output yet)"])

    return "\n".join(lines)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ββ Gradio UI ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 84 |
+
# ── Gradio UI ──────────────────────────────────────────────────────────────────
with gr.Blocks(title="AgentDebuggerEnv Training Monitor") as demo:
    gr.Markdown(
        """
        # AgentDebuggerEnv — GRPO Training Monitor
        Training **Qwen2.5-Coder-7B-Instruct** on structured hypothesis-driven debugging.
        - Algorithm: GRPO (same as DeepSeek-R1)
        - Dataset: 90 hand-validated bugs across 3 difficulty tiers
        - Curriculum: Tier 1 (steps 0-300) → Tier 1+2 (300-600) → All tiers (600+)
        """
    )
    # Read-only text area tall enough to show the whole status report.
    status_box = gr.Textbox(
        label="Training Status",
        lines=50,
        max_lines=50,
        interactive=False,
    )
    refresh_btn = gr.Button("Refresh Status")
    refresh_btn.click(fn=check_status, outputs=status_box)

    # Poll automatically every 30s so the Space updates without user clicks.
    demo.load(fn=check_status, outputs=status_box, every=30)

# Guard the launch so importing this module (e.g. by a Spaces runner that
# serves `demo` itself) does not start a second server at import time.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
calibrate.py
CHANGED
|
@@ -2,6 +2,7 @@ import json
|
|
| 2 |
import subprocess
|
| 3 |
import tempfile
|
| 4 |
import os
|
|
|
|
| 5 |
|
| 6 |
def test_passes(code, func, inp, expected):
|
| 7 |
if isinstance(inp, (list, tuple)):
|
|
@@ -23,7 +24,7 @@ except Exception as e:
|
|
| 23 |
f.write(script)
|
| 24 |
fname = f.name
|
| 25 |
r = subprocess.run(
|
| 26 |
-
[
|
| 27 |
capture_output=True, text=True, timeout=5
|
| 28 |
)
|
| 29 |
os.unlink(fname)
|
|
|
|
| 2 |
import subprocess
|
| 3 |
import tempfile
|
| 4 |
import os
|
| 5 |
+
import sys
|
| 6 |
|
| 7 |
def test_passes(code, func, inp, expected):
|
| 8 |
if isinstance(inp, (list, tuple)):
|
|
|
|
| 24 |
f.write(script)
|
| 25 |
fname = f.name
|
| 26 |
r = subprocess.run(
|
| 27 |
+
[sys.executable, fname],
|
| 28 |
capture_output=True, text=True, timeout=5
|
| 29 |
)
|
| 30 |
os.unlink(fname)
|
requirements.txt
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
openenv-core>=0.2.0
|
|
|
|
| 1 |
+
gradio>=4.0
|
| 2 |
+
pydantic>=2.0
|
| 3 |
+
wandb
|
| 4 |
+
datasets
|
| 5 |
+
transformers>=4.40
|
| 6 |
+
accelerate>=0.30
|
| 7 |
+
trl>=0.12
|
| 8 |
+
torch>=2.1
|
| 9 |
+
unsloth
|
|
|
training/train_grpo.py
CHANGED
|
@@ -5,7 +5,10 @@ Algorithm: GRPO (Group Relative Policy Optimization) via HuggingFace TRL
|
|
| 5 |
GPU: HuggingFace ZeroGPU H200 (free) or paid HF Spaces A10G
|
| 6 |
|
| 7 |
Usage:
|
| 8 |
-
#
|
|
|
|
|
|
|
|
|
|
| 9 |
python training/train_grpo.py --test
|
| 10 |
|
| 11 |
# Full training run:
|
|
@@ -22,11 +25,13 @@ import argparse
|
|
| 22 |
import random
|
| 23 |
import subprocess
|
| 24 |
import tempfile
|
| 25 |
-
import
|
| 26 |
|
| 27 |
# ββ Parse args ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
parser = argparse.ArgumentParser()
|
| 29 |
-
parser.add_argument("--test", action="store_true", help="Run 10 steps for testing")
|
|
|
|
|
|
|
| 30 |
parser.add_argument("--resume", type=str, default=None, help="Path to checkpoint")
|
| 31 |
parser.add_argument("--max_steps", type=int, default=1000)
|
| 32 |
args = parser.parse_args()
|
|
@@ -36,12 +41,14 @@ args = parser.parse_args()
|
|
| 36 |
if os.environ.get("COLAB_RELEASE_TAG") or os.environ.get("SPACE_ID"):
|
| 37 |
os.system("pip install -q unsloth trl wandb datasets")
|
| 38 |
|
| 39 |
-
# ββ
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
from
|
| 44 |
-
from
|
|
|
|
|
|
|
| 45 |
|
| 46 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 47 |
from server.reward_calculator import DebugRewardCalculator
|
|
@@ -54,7 +61,7 @@ MAX_STEPS = 10 if args.test else args.max_steps
|
|
| 54 |
CHECKPOINT_DIR = "./checkpoints"
|
| 55 |
|
| 56 |
# W&B β optional but strongly recommended for judging
|
| 57 |
-
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")
|
| 58 |
if WANDB_API_KEY:
|
| 59 |
wandb.init(
|
| 60 |
project="AgentDebuggerEnv",
|
|
@@ -114,6 +121,118 @@ def bug_to_prompt(bug: dict) -> str:
|
|
| 114 |
f"<|im_start|>assistant\n"
|
| 115 |
)
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
# ββ Load model ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 118 |
print(f"Loading {MODEL_NAME}...")
|
| 119 |
model, tokenizer = FastLanguageModel.from_pretrained(
|
|
@@ -178,38 +297,6 @@ def reward_fn(completions: list[str], prompts: list[str], **kwargs) -> list[floa
|
|
| 178 |
|
| 179 |
return rewards
|
| 180 |
|
| 181 |
-
def _run_fix(proposed_code: str, bug: dict) -> dict:
|
| 182 |
-
"""Safely run proposed fix with subprocess timeout."""
|
| 183 |
-
test_cases = bug.get("test_cases", [])
|
| 184 |
-
func_name = bug.get("function_name", "")
|
| 185 |
-
if not proposed_code or not test_cases or not func_name:
|
| 186 |
-
return {"passed": 0, "failed": 0, "total": len(test_cases), "newly_broken": 0}
|
| 187 |
-
|
| 188 |
-
passed = 0
|
| 189 |
-
for test in test_cases:
|
| 190 |
-
inp = test["input"]
|
| 191 |
-
args_str = ", ".join(repr(x) for x in inp) if isinstance(inp, (list, tuple)) else repr(inp)
|
| 192 |
-
script = (
|
| 193 |
-
f"{proposed_code}\n"
|
| 194 |
-
f"try:\n"
|
| 195 |
-
f" r={func_name}({args_str})\n"
|
| 196 |
-
f" print('PASS' if r=={repr(test['expected_output'])} else 'FAIL')\n"
|
| 197 |
-
f"except Exception as e:\n"
|
| 198 |
-
f" print(f'ERROR: {{e}}')\n"
|
| 199 |
-
)
|
| 200 |
-
try:
|
| 201 |
-
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
|
| 202 |
-
f.write(script)
|
| 203 |
-
fname = f.name
|
| 204 |
-
r = subprocess.run(["python", fname], capture_output=True, text=True, timeout=5)
|
| 205 |
-
os.unlink(fname)
|
| 206 |
-
if "PASS" in r.stdout:
|
| 207 |
-
passed += 1
|
| 208 |
-
except Exception:
|
| 209 |
-
pass
|
| 210 |
-
|
| 211 |
-
return {"passed": passed, "failed": len(test_cases) - passed, "total": len(test_cases), "newly_broken": 0}
|
| 212 |
-
|
| 213 |
# ββ Baseline evaluation (run BEFORE training) βββββββββββββββββββββββββββββββββ
|
| 214 |
def run_baseline(n: int = 20) -> dict:
|
| 215 |
print("\nRunning baseline evaluation on UNTRAINED model...")
|
|
|
|
| 5 |
GPU: HuggingFace ZeroGPU H200 (free) or paid HF Spaces A10G
|
| 6 |
|
| 7 |
Usage:
|
| 8 |
+
# Local reward sanity-check (no GPU, no model loading):
|
| 9 |
+
python training/train_grpo.py --test-local
|
| 10 |
+
|
| 11 |
+
# Test run (Colab/GPU, 10 steps):
|
| 12 |
python training/train_grpo.py --test
|
| 13 |
|
| 14 |
# Full training run:
|
|
|
|
| 25 |
import random
|
| 26 |
import subprocess
|
| 27 |
import tempfile
|
| 28 |
+
import shutil
|
| 29 |
|
| 30 |
# ββ Parse args ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 31 |
parser = argparse.ArgumentParser()
|
| 32 |
+
parser.add_argument("--test", action="store_true", help="Run 10 steps for testing (Colab/GPU)")
|
| 33 |
+
parser.add_argument("--test-local", action="store_true", dest="test_local",
|
| 34 |
+
help="Sanity-check reward function locally without any model or GPU")
|
| 35 |
parser.add_argument("--resume", type=str, default=None, help="Path to checkpoint")
|
| 36 |
parser.add_argument("--max_steps", type=int, default=1000)
|
| 37 |
args = parser.parse_args()
|
|
|
|
| 41 |
if os.environ.get("COLAB_RELEASE_TAG") or os.environ.get("SPACE_ID"):
|
| 42 |
os.system("pip install -q unsloth trl wandb datasets")
|
| 43 |
|
| 44 |
+
# ββ GPU/training imports (skipped in --test-local mode) βββββββββββββββββββββββ
|
| 45 |
+
if not args.test_local:
|
| 46 |
+
import torch
|
| 47 |
+
import wandb
|
| 48 |
+
from datasets import Dataset
|
| 49 |
+
from unsloth import FastLanguageModel
|
| 50 |
+
from trl import GRPOTrainer, GRPOConfig
|
| 51 |
+
from transformers import TrainerCallback
|
| 52 |
|
| 53 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 54 |
from server.reward_calculator import DebugRewardCalculator
|
|
|
|
| 61 |
CHECKPOINT_DIR = "./checkpoints"
|
| 62 |
|
| 63 |
# W&B β optional but strongly recommended for judging
|
| 64 |
+
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "") if not args.test_local else ""
|
| 65 |
if WANDB_API_KEY:
|
| 66 |
wandb.init(
|
| 67 |
project="AgentDebuggerEnv",
|
|
|
|
| 121 |
f"<|im_start|>assistant\n"
|
| 122 |
)
|
| 123 |
|
| 124 |
+
def _run_fix(proposed_code: str, bug: dict) -> dict:
|
| 125 |
+
"""Safely run proposed fix with subprocess timeout."""
|
| 126 |
+
test_cases = bug.get("test_cases", [])
|
| 127 |
+
func_name = bug.get("function_name", "")
|
| 128 |
+
if not proposed_code or not test_cases or not func_name:
|
| 129 |
+
return {"passed": 0, "failed": 0, "total": len(test_cases), "newly_broken": 0}
|
| 130 |
+
|
| 131 |
+
passed = 0
|
| 132 |
+
for test in test_cases:
|
| 133 |
+
inp = test["input"]
|
| 134 |
+
args_str = ", ".join(repr(x) for x in inp) if isinstance(inp, (list, tuple)) else repr(inp)
|
| 135 |
+
script = (
|
| 136 |
+
f"{proposed_code}\n"
|
| 137 |
+
f"try:\n"
|
| 138 |
+
f" r={func_name}({args_str})\n"
|
| 139 |
+
f" print('PASS' if r=={repr(test['expected_output'])} else 'FAIL')\n"
|
| 140 |
+
f"except Exception as e:\n"
|
| 141 |
+
f" print(f'ERROR: {{e}}')\n"
|
| 142 |
+
)
|
| 143 |
+
try:
|
| 144 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
|
| 145 |
+
f.write(script)
|
| 146 |
+
fname = f.name
|
| 147 |
+
python = shutil.which("python3") or shutil.which("python") or sys.executable
|
| 148 |
+
r = subprocess.run([python, fname], capture_output=True, text=True, timeout=5)
|
| 149 |
+
os.unlink(fname)
|
| 150 |
+
if "PASS" in r.stdout:
|
| 151 |
+
passed += 1
|
| 152 |
+
except Exception:
|
| 153 |
+
pass
|
| 154 |
+
|
| 155 |
+
return {"passed": passed, "failed": len(test_cases) - passed, "total": len(test_cases), "newly_broken": 0}
|
| 156 |
+
|
| 157 |
+
# ββ Mock completions for --test-local βββββββββββββββββββββββββββββββββββββββββ
|
| 158 |
+
MOCK_GOOD = """
|
| 159 |
+
OBSERVATION: The loop condition on line 4 uses <= instead of
|
| 160 |
+
HYPOTHESIS: This causes an off-by-one error because Python lists are
|
| 161 |
+
0-indexed, so the last valid index is len(arr)-1 not len(arr)
|
| 162 |
+
CONFIDENCE: high
|
| 163 |
+
ACTION: propose_fix
|
| 164 |
+
DETAIL: def binary_search(arr, target):
|
| 165 |
+
left, right = 0, len(arr) - 1
|
| 166 |
+
while left < right:
|
| 167 |
+
mid = (left + right) // 2
|
| 168 |
+
if arr[mid] == target:
|
| 169 |
+
return mid
|
| 170 |
+
elif arr[mid] < target:
|
| 171 |
+
left = mid + 1
|
| 172 |
+
else:
|
| 173 |
+
right = mid - 1
|
| 174 |
+
return -1
|
| 175 |
+
"""
|
| 176 |
+
|
| 177 |
+
MOCK_BAD = """
|
| 178 |
+
I think there might be a bug somewhere in the code.
|
| 179 |
+
Let me try fixing it.
|
| 180 |
+
"""
|
| 181 |
+
|
| 182 |
+
# ββ --test-local: reward sanity-check without any model βββββββββββββββββββββββ
|
| 183 |
+
if args.test_local:
|
| 184 |
+
print("=" * 60)
|
| 185 |
+
print("LOCAL TEST MODE β no model loaded, testing reward function only")
|
| 186 |
+
print("=" * 60)
|
| 187 |
+
|
| 188 |
+
bugs = load_bugs(1)
|
| 189 |
+
if not bugs:
|
| 190 |
+
print("ERROR: No bugs found in data/bugs_tier1.jsonl. Run data/generate_bugs.py first.")
|
| 191 |
+
sys.exit(1)
|
| 192 |
+
|
| 193 |
+
bug = bugs[0]
|
| 194 |
+
print(f"\nUsing bug: {bug.get('function_name', '?')} β {bug.get('bug_type', '?')}\n")
|
| 195 |
+
|
| 196 |
+
calculator_local = DebugRewardCalculator()
|
| 197 |
+
|
| 198 |
+
def _score(label: str, completion: str) -> float:
|
| 199 |
+
try:
|
| 200 |
+
agent_output = parse_agent_output(completion)
|
| 201 |
+
test_results = {"passed": 0, "failed": 0, "total": 0, "newly_broken": 0}
|
| 202 |
+
if agent_output.action == "propose_fix":
|
| 203 |
+
test_results = _run_fix(agent_output.detail, bug)
|
| 204 |
+
breakdown = calculator_local.compute_turn_reward(
|
| 205 |
+
agent_output=agent_output,
|
| 206 |
+
ground_truth={
|
| 207 |
+
"bug_function": bug.get("bug_location", {}).get("function", ""),
|
| 208 |
+
"bug_line": bug.get("bug_location", {}).get("line_start", -1),
|
| 209 |
+
"bug_type": bug.get("bug_type", ""),
|
| 210 |
+
"canonical_fix_code": bug.get("original_code", ""),
|
| 211 |
+
},
|
| 212 |
+
test_results=test_results,
|
| 213 |
+
turn_number=0,
|
| 214 |
+
)
|
| 215 |
+
print(f"--- {label} reward breakdown ---")
|
| 216 |
+
for field, value in breakdown.__dict__.items():
|
| 217 |
+
print(f" {field}: {value}")
|
| 218 |
+
print(f" TOTAL: {breakdown.total}\n")
|
| 219 |
+
return breakdown.total
|
| 220 |
+
except Exception as e:
|
| 221 |
+
print(f"Reward error for {label}: {e}")
|
| 222 |
+
return -0.3
|
| 223 |
+
|
| 224 |
+
good_score = _score("MOCK_GOOD", MOCK_GOOD)
|
| 225 |
+
bad_score = _score("MOCK_BAD", MOCK_BAD)
|
| 226 |
+
|
| 227 |
+
print(f"MOCK_GOOD score: {good_score:.4f}")
|
| 228 |
+
print(f"MOCK_BAD score: {bad_score:.4f}")
|
| 229 |
+
|
| 230 |
+
assert good_score > bad_score, (
|
| 231 |
+
f"ASSERTION FAILED: MOCK_GOOD ({good_score:.4f}) should be > MOCK_BAD ({bad_score:.4f})"
|
| 232 |
+
)
|
| 233 |
+
print("\nLOCAL TEST PASSED")
|
| 234 |
+
sys.exit(0)
|
| 235 |
+
|
| 236 |
# ββ Load model ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 237 |
print(f"Loading {MODEL_NAME}...")
|
| 238 |
model, tokenizer = FastLanguageModel.from_pretrained(
|
|
|
|
| 297 |
|
| 298 |
return rewards
|
| 299 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
# ββ Baseline evaluation (run BEFORE training) βββββββββββββββββββββββββββββββββ
|
| 301 |
def run_baseline(n: int = 20) -> dict:
|
| 302 |
print("\nRunning baseline evaluation on UNTRAINED model...")
|